In [147]:
#Import necessary package
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import datetime as dt

In [148]:
#Configure parameter
mall = 'Tmtplaza'
shoplisturl = 'https://www.tmtp.com.hk/en/Shop'
fnblisturl = 'https://www.tmtp.com.hk/en/Dining'
shopdetailbasicurl = 'https://www.tmtp.com.hk/en/Shop/'
shopdetailbasictcurl = 'https://www.tmtp.com.hk/tc/Shop/'

In [149]:
#Get shop category data and export into csv
def getShopCategory():
    #Create empty DataFrame for shop category
    shopcategory = pd.DataFrame()

    for type, url in zip(['Shopping','Dining'],[shoplisturl,fnblisturl]):
        #Get shop category
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')

        for search in soup.find_all('div', attrs = {'data-type':'category'}):
            for key in search.find_all('button', class_ = 'searchKey'):
                try:
                    shop_category_id = key.get('data-key')
                except:
                    shop_category_id = np.nan

                try:
                    shop_category_name = key.text
                except:
                    shop_category_name = np.nan

                shopcategory = shopcategory.append(
                    {
                        'type':type,
                        'shop_category_id':shop_category_id,
                        'shop_category_name':shop_category_name
                        }, ignore_index=True
                        )
        shopcategory['update_date'] = dt.date.today()
        shopcategory['mall'] = mall
        shopcategory.drop(shopcategory[shopcategory.shop_category_id == '0'].index, inplace = True)
        shopcategory = shopcategory.loc[:, ['mall','type','shop_category_id','shop_category_name','update_date']]
    return shopcategory

In [150]:
#Get shop master data and export into csv
def getShopMaster():
    shopcategory = getShopCategory()
    #Create empty DataFrame for shop master
    shoplist = pd.DataFrame()
    shopdetail = pd.DataFrame()

    #Create floor mapping
    shop_floor_id_mapping = {'1':'Phase1 GF','2':'Phase1 UGF','3':'Phase1 1F','4':'Phase1 2F','5':'Phase1 3F','6':'Phase2 GF','7':'Phase2 UGF','8':'Phase2 1F','9':'Phase2 2F'}
    
    for type, url in zip(['Shopping','Dining'],[shoplisturl,fnblisturl]):
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')
    
        for shopcube in soup.find_all(class_ = 'shopCube'):
            try:
                if type == 'Shopping':
                    shopdetaillink = shopcube.get('href')
                    shopdetaillinkid = shopdetaillink.find('/Shop/')
                    shop_id = shopdetaillink[shopdetaillinkid+6:]
                elif type == 'Dining':
                    shopdetaillink = shopcube.get('href')
                    shopdetaillinkid = shopdetaillink.find('/Dining/')
                    shop_id = shopdetaillink[shopdetaillinkid+8:]
            except:
                shop_id = np.nan
    
            try:
                shop_floor = shopcube.get('data-floor')
            except:
                shop_floor = np.nan
    
            try:
                shop_category_id = shopcube.get('data-category')
            except:
                shop_category_id = np.nan
    
            try:
                shop_category_name = shopcategory.loc[shopcategory['shop_category_id'] == shop_category_id, 'shop_category_name'].values[0]
            except:
                shop_category_name = np.nan
    
            shoplist = shoplist.append(
                            {
                                'type':type,
                                'shop_id':shop_id,
                                'shop_floor':shop_floor,
                                'shop_category_id':shop_category_id,
                                'shop_category_name':shop_category_name
                                }, ignore_index=True
                                )
    
    for shop_id in shoplist['shop_id']:
        shopdetailurl = shopdetailbasicurl + shop_id
        page = requests.get(shopdetailurl)
        soup = BeautifulSoup(page.content, 'html.parser')
    
        for detail in soup.find_all('div', class_ = 'col-md-9'):
            try:
                shop_name = detail.find(class_ = 'font-weight-bold').text
            except:
                shop_name = np.nan
            
            try:
                shop_number = detail.find(src = re.compile('icon_location')).find_next_sibling('div').text.split(',')[0]
            except:
                shop_number = np.nan
            
            try:
                opening_hours = detail.find(src = re.compile('icon_time')).find_next_sibling('div').text
                opening_hours = opening_hours.replace('\n','').replace('\r','').replace('<br>','')
            except:
                opening_hours = np.nan
            
            try:
                phone = detail.find(src = re.compile('icon_tel')).find_next_sibling('div').text
                phone = phone.replace(' ','').replace('\n','').replace('\r','').replace('<br>','')
            except:
                phone = np.nan
            
            try:
                item = detail.find(class_ = 'flex-wrap')
                taglist = ';'.join([tag.text.strip().replace(';','') for tag in item.findChildren('button')])
                taglist = taglist.replace('\u200b','').replace('\n','').replace('\r','').replace('<br>','')
            except:
                taglist = np.nan
        
        shopdetailtcurl = shopdetailbasictcurl + shop_id
        page = requests.get(shopdetailtcurl)
        soup = BeautifulSoup(page.content, 'html.parser')
    
        for detail in soup.find_all('div', class_ = 'col-md-9'):
            try:
                shop_name_zh = detail.find(class_ = 'font-weight-bold').text
            except:
                shop_name_zh = np.nan
    
        shopdetail = shopdetail.append(
            {
                'shop_id':shop_id,
                'shop_name_en':shop_name,
                'shop_name_tc':shop_name_zh,
                'shop_number':shop_number,
                'phone':phone,
                'opening_hours':opening_hours,
                'tag':taglist
                }, ignore_index=True
                )
    #Merge shop list and shop detail into shop master
    shopmaster = pd.merge(shoplist, shopdetail, on = 'shop_id')
    shopmaster['update_date'] = dt.date.today()
    shopmaster['mall'] = mall
    shopmaster['shop_floor'] = shopmaster['shop_floor'].map(shop_floor_id_mapping)
    shopmaster['loyalty_offer'] = np.nan
    shopmaster['voucher_acceptance'] = np.nan
    shopmaster = shopmaster.loc[:, ['mall','type','shop_id','shop_name_en','shop_name_tc','shop_number','shop_floor','phone','opening_hours','loyalty_offer','voucher_acceptance','shop_category_id','shop_category_name','tag','update_date']]
    return shopmaster

In [151]:
#Export data to csv
shopcategory = getShopCategory()
shopcategory.to_csv('{}_shopcategory_{}.csv'.format(mall,dt.date.strftime(dt.date.today(),'%Y%m%d')), index = False)

shopmaster = getShopMaster()
shopmaster.to_csv('{}_shopmaster_{}.csv'.format(mall,dt.date.strftime(dt.date.today(),'%Y%m%d')), index = False)

In [152]:
print(shopmaster)

         mall      type                           shop_id  \
0    Tmtplaza  Shopping                                14   
1    Tmtplaza  Shopping                                61   
2    Tmtplaza  Shopping                               289   
3    Tmtplaza  Shopping                                88   
4    Tmtplaza  Shopping                                85   
..        ...       ...                               ...   
309  Tmtplaza    Dining  ://www.tmtp.com.hk/en/Dining/227   
310  Tmtplaza    Dining  ://www.tmtp.com.hk/en/Dining/546   
311  Tmtplaza    Dining  ://www.tmtp.com.hk/en/Dining/134   
312  Tmtplaza    Dining  ://www.tmtp.com.hk/en/Dining/330   
313  Tmtplaza    Dining  ://www.tmtp.com.hk/en/Dining/547   

              shop_name_en           shop_name_tc shop_number shop_floor  \
0       7-Eleven (Phase 1)           7-11便利店 (一期)        G055  Phase1 GF   
1       7-Eleven (Phase 2)           7-11便利店 (二期)           7  Phase2 GF   
2              :CHOCOOLATE            :

In [91]:
shopdetail = pd.DataFrame()

for type, url, urltc in zip(['Shopping','Dining'],[shopdetailbasicurl,shopdetailbasictcurl],[fnbdetailbasicurl,fnbdetailbasictcurl]):
#Get shop detail
url = shopdetailbasicurl
urltc = shopdetailbasictcurl

for shop_id in ['14']:#shoplist['shop_id']:
    shopdetailurl = url + shop_id
    page = requests.get(shopdetailurl)
    soup = BeautifulSoup(page.content, 'html.parser')

    for detail in soup.find_all('div', class_ = 'col-md-9'):
        try:
            shop_name = detail.find(class_ = 'font-weight-bold').text
        except:
            shop_name = np.nan
        
        try:
            shop_number = detail.find(src = re.compile('icon_location')).find_next_sibling('div').text
        except:
            shop_number = np.nan
        
        try:
            opening_hours = detail.find(src = re.compile('icon_time')).find_next_sibling('div').text
            opening_hours = opening_hours.replace('\n','').replace('\r','').replace('<br>','')
        except:
            opening_hours = np.nan
        
        try:
            phone = detail.find(src = re.compile('icon_tel')).find_next_sibling('div').text
            phone = phone.replace(' ','')
        except:
            phone = np.nan
        
        try:
            item = detail.find(class_ = 'flex-wrap')
            taglist = ';'.join([tag.text.strip().replace(';','') for tag in item.findChildren('button')])
        except:
            taglist = np.nan
    
    shopdetailtcurl = urltc + shop_id
    page = requests.get(shopdetailurl)
    soup = BeautifulSoup(page.content, 'html.parser')

    for detail in soup.find_all('div', class_ = 'col-md-9'):
        try:
            shop_name_zh = detail.find(class_ = 'font-weight-bold').text
        except:
            shop_name_zh = np.nan

    shopdetail = shopdetail.append(
        {
            'shop_id':shop_id,
            'shop_name_en':shop_name,
            'shop_name_tc':shop_name_zh,
            'shop_number':shop_number,
            'phone':phone,
            'opening_hours':opening_hours,
            'tag':taglist
            }, ignore_index=True
            )
print(shopdetail)

  opening_hours         phone shop_id        shop_name_en        shop_name_tc  \
0      24 hours  \n24517973\n      14  7-Eleven (Phase 1)  7-Eleven (Phase 1)   

                   shop_number  \
0  G055, G/F, tmtplaza Phase 1   

                                                tag  
0  Convenience Stores​;Daily Necessities;Drink;Food  


In [4]:
#Get shop master data and export into csv
def getShopMaster():
    shopcategory = getShopCategory()
    #Create empty DataFrame for shop master
    shoplist = pd.DataFrame()
    shopdetail = pd.DataFrame()
    
    url = shoplisturl
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    
    for shop in soup.find_all(class_ = 'shopEntry'):
        for shopbrand in shop.find(class_ = 'shopEntryBrand'):
            try:
                shopbrandlink = shopbrand.get('href')
                shopbrandlinkid = shopbrandlink.find('?id=')
                shop_id = shopbrandlink[shopbrandlinkid+4:]
            except:
                shop_id = np.nan
            
            try:
                shop_name = shopbrand.text
            except:
                shop_name = np.nan
        
        try:
            shop_number = shop.find(class_ = 'shopEntryLocation').text
        except:
            shop_number = np.nan
        
        try:
            shop_floor = shop.find(class_ = 'shopEntryFloor').text.replace(' ','')
        except:
            shop_floor = np.nan
        
        try:
            shop_category_name = shop.find(class_ = 'shopEntryCategory').text
        except:
            shop_category_name = np.nan
    
        try:
            shop_category_id = shopcategory.loc[shopcategory['shop_category_name'] == shop_category_name, 'shop_category_id'].values[0]
        except:
            shop_category_id = np.nan
    
        try:
            if shop.find(class_ = 'shopEntryCard').find_all(src = re.compile('ico_the_one_card')):
                loyalty_offer = 'The ONE Card'
            else:
                loyalty_offer = np.nan
        except:
            loyalty_offer = np.nan
    
        try:
            if shop.find(class_ = 'shopEntryCard').find_all(src = re.compile('ico_cash_coupon')):
                voucher_acceptance = '1'
            else:
                voucher_acceptance = np.nan
        except:
            voucher_acceptance = np.nan
        
        shoplist = shoplist.append(
                    {
                        'shop_id':shop_id,
                        'shop_name_en': shop_name,
                        'shop_number':shop_number,
                        'shop_floor':shop_floor,
                        'shop_category_id':shop_category_id,
                        'shop_category_name':shop_category_name,
                        'loyalty_offer':loyalty_offer,
                        'voucher_acceptance':voucher_acceptance
                        }, ignore_index=True
                        )
    #Get shop detail
    for shop_id in shoplist['shop_id']:
        shopdetailurl = shopdetailbasictcurl + shop_id
        page = requests.get(shopdetailurl)
        soup = BeautifulSoup(page.content, 'html.parser')
    
        for shopdetailinner in soup.find_all(class_ = 'shopDetailsInner'):
            try:
                shop_name_zh = shopdetailinner.find('span').text
            except:
                shop_name_zh = np.nan
        
        shopdetailurl = shopdetailbasicurl + shop_id
        page = requests.get(shopdetailurl)
        soup = BeautifulSoup(page.content, 'html.parser')
    
        for shopdetailcontent in soup.find_all(class_ = 'shopDetailsContent'):
            try:
                phone = shopdetailcontent.find('th', text = 'Telephone').find_next_sibling('td').find_next_sibling('td').text
                phone = phone.replace(' ','').replace('\n','').replace('\r','').replace('<br>','')
            except:
                phone = np.nan
            
            try:
                opening_hours = shopdetailcontent.find('th', text = 'Opening Time').find_next_sibling('td').find_next_sibling('td').text
                opening_hours = opening_hours = opening_hours.replace('\n','').replace('\r','').replace('<br>','')
            except:
                opening_hours = np.nan
                
        shopdetail = shopdetail.append(
                    {
                        'shop_id':shop_id,
                        'shop_name_tc':shop_name_zh,
                        'phone': phone,
                        'opening_hours': opening_hours
                        }, ignore_index=True
                        )
    
    #Merge shop list and shop detail into shop master
    shopmaster = pd.merge(shoplist, shopdetail, on = 'shop_id')
    shopmaster['update_date'] = dt.date.today()
    shopmaster['mall'] = mall
    shopmaster['type'] = shopmaster['shop_category_name'].apply(lambda x: 'Dining' if any(keyword in x.lower() for keyword in ['food','dining']) else 'Shopping')
    shopmaster['fnb_zone'] = np.nan
    shopmaster = shopmaster.loc[:, ['mall','type','shop_id','shop_name_en','shop_name_tc','fnb_zone','shop_number','shop_floor','phone','opening_hours','loyalty_offer','voucher_acceptance','shop_category_id','shop_category_name','update_date']]
    return shopmaster