## Website
https://www.hktvmall.com/hktv/zh/homenfamily

## Environment and Imports
No need to run the cell below if you have already installed the packages

In [1]:
!pip install pyppeteer
!pip install pandas



In [2]:
from pyppeteer import launch
import asyncio
import nest_asyncio
import re
import pandas as pd
import urllib.parse
from bs4 import BeautifulSoup as bs

In [3]:
nest_asyncio.apply()
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 500)
pd.set_option('display.max_colwidth', 500)

In [4]:
def run_asyncio(func, args=None):
    return asyncio.get_event_loop().run_until_complete(func(args))

## Getting Pages

In [5]:
TABLET = 0
LAPTOP = 1
PHONE = 2
urls = [
    "https://www.hktvmall.com/hktv/zh/search?page=0&q=%3Arelevance%3Acategory%3AAA32301500001%3Azone%3Ahomenfamily%3Astreet%3Amain%3A",
    "https://www.hktvmall.com/hktv/zh/search?page=0&q=%3Arelevance%3Acategory%3AAA32300500001%3Azone%3Ahomenfamily%3Astreet%3Amain%3A",
    "https://www.hktvmall.com/hktv/zh/search?page=0&q=%3Arelevance%3Acategory%3AAA32201010001%3Azone%3Ahomenfamily%3Astreet%3Amain%3A"
]

In [6]:
# https://www.hktvmall.com/hktv/zh/search?categoryTag=AA32301500001 tablet
# https://www.hktvmall.com/hktv/zh/search?categoryTag=AA32300500001 laptop
# https://www.hktvmall.com/hktv/zh/search?categoryTag=AA32201010001 smartphone
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 500)
pd.set_option('display.max_colwidth', 50)

In [32]:
async def get_item_page(url: str):
    '''
    get the web page
    params:
        url: str, url of the web page
    return: 
        page: pyppeteer.page.Page, the web page
    '''
    browser = await launch()
    page = await browser.newPage()
    await page.setJavaScriptEnabled(enabled=True)
    await page.goto(url)
    soup = bs((await page.content()), "lxml")
    await browser.close()
    return soup

In [34]:
category_pages = []
category_pages_py = []
for url in urls:
    page = run_asyncio(get_item_page, url)
    print(page.find("title").text)
    # store pyppeteer object
    category_pages.append(page)

平板電腦 | 香港電視 HKTVmall 網上購物 
手提電腦 | 香港電視 HKTVmall 網上購物 
智能電話 | 香港電視 HKTVmall 網上購物 


In [36]:
def get_page_num(page):
    '''
    get number of page of the current category
    params:
        page: BeautifulSoup, the bs object of the webpage
    return:
        str
    '''
    page_string = re.findall(r"/共\d+頁", str(page))
    return page_string

In [37]:
# if any traceback occurs, run the block that calls get_item_page() again repeatedly
print(str(get_page_num(category_pages[0])) + " pages in tablets")
print(str(get_page_num(category_pages[1])) + " pages in laptops")
print(str(get_page_num(category_pages[2])) + " pages in smart phones")

['/共2頁', '/共2頁'] pages in tablets
['/共7頁', '/共7頁'] pages in laptops
['/共14頁', '/共14頁'] pages in smart phones


## Extracting Information

In [38]:
def handle_price(price_str):
    price_str = price_str.replace("$", '').replace(",", '')
    return float(price_str)

In [67]:
def extract_and_store(soup_of_page):
    '''
    extract relevant information for the product
    params:
        soup_of_page: BeautifulSoup object of the page
    return:
        products: DataFrame in pandas that stores relevant information
    '''
    
    BASE_URL = "https://www.hktvmall.com/hktv/zh/"
    brand_product_name_list = soup_of_page.find_all("div", class_="brand-product-name")
    # https://stackoverflow.com/questions/11205386/python-beautifulsoup-get-an-attribute-value-based-on-the-name-attribute
    product_id = soup_of_page.find_all("div", {"class":"product-brief"})
    index_list = [pid.attrs["data-id"] for pid in product_id]
    price_list = soup_of_page.find_all("div", {"class": "price"})
    seller_list = soup_of_page.select(".product-brief > a")

    ###### attributes to be placed in product csv file ######
    products = {
        "product_index": index_list,
        "product_type": [],
        "product_brand": [],
        "product_name": [],
        "product_price": [],
        "average_rate": [],
        "no_of_rates": [],
        "seller": [],
        "website": []
    }

    ###### add attributes ######
    for price in price_list:
        products["product_price"].append(handle_price(price.text))

    for seller in seller_list:
        seller = seller.attrs["href"]
        products["website"].append(urllib.parse.unquote(BASE_URL + seller))
        seller = seller.split("/")[1]    
        products["seller"].append(urllib.parse.unquote(seller))
    

    for i, bpn in enumerate(brand_product_name_list):
        s = bpn.text.split(" - ", 1)
        products["product_type"].append("tablet")
        products["product_brand"].append(s[0])
        products["product_name"].append(s[1])
    
    products["average_rate"] = []
    products["no_of_rates"] = []
    for i in range(len(products["website"])):
        print("Getting page " + str(i + 1) + "...")
        print("URL: " + products["website"][i])
        soup = run_asyncio(get_item_page, products["website"][i])
        avg_rating = soup.find("span", {"class": "averageRating"})
        comment_num = soup.find("span", {"class": "comment"})
        products["average_rate"].append(avg_rating.text)
    
        comment_num = int(re.findall(r"\d+", comment_num.text)[0])
        print(str(comment_num) + " comments retrieved from this page with an average rate of " + str(avg_rating.text))
        products["no_of_rates"].append(comment_num)
    
    
    
#     print("product_index: " + str(len(products["product_index"])))
#     print("product_type: " + str(len(products["product_type"])))
#     print("product_brand: " + str(len(products["product_brand"])))
#     print("product_name: " + str(len(products["product_name"])))
#     print("product_price: " + str(len(products["product_price"])))
#     print("average_rate: " + str(len(products["average_rate"])))
#     print("no_of_rates: " + str(len(products["no_of_rates"])))
#     print("seller: " + str(len(products["seller"])))
#     print("website: " + str(len(products["website"])))
    products_df = pd.DataFrame(products)
    return products_df

In [66]:
p = extract_and_store(category_pages[0])
p

Getting page 1...
URL: https://www.hktvmall.com/hktv/zh/main/輝煌數碼有限公司/s/H6588001/電子電器/電子電器/手機及平板電腦/平板電腦/Tab-A-8-2019-2GB32GB-WiFi-平板電腦-T290-銀色-香港行貨/p/H6588001_S_Samsung_TabA_8_2019_WiFi
2 comments retrieved from this page with an average rate of 5.0
Getting page 2...
URL: https://www.hktvmall.com/hktv/zh/main/環球貿易/s/H6800001/電子電器/電子電器/電腦及週邊設備/平板電腦/安博科技-UPAD-PRO-4G送套/p/H6800001_S_BGT0026
1 comments retrieved from this page with an average rate of 5.0
Getting page 3...
URL: https://www.hktvmall.com/hktv/zh/main/Citylink/s/H5300001/電子電器/電子電器/手機及平板電腦/平板電腦/小米平板-4-80-LTE一年保養464GB-金色/p/H5300001_S_FXMM1806D9E464GBGD4
0 comments retrieved from this page with an average rate of 0.0
Getting page 4...
URL: https://www.hktvmall.com/hktv/zh/main/Citylink/s/H5300001/電子電器/電子電器/手機及平板電腦/平板電腦/Galaxy-Tab-A-101-WiFi-T510-2GB32GB-黑色平行進口/p/H5300001_S_FSAT510232GBBK
0 comments retrieved from this page with an average rate of 0.0
Getting page 5...
URL: https://www.hktvmall.com/hktv/zh/main/輝煌數碼有限公司/s/H6588001/

0 comments retrieved from this page with an average rate of 0.0
Getting page 36...
URL: https://www.hktvmall.com/hktv/zh/main/Vertex-恆進/s/S1076001/電子電器/電子電器/手機及平板電腦/手提電話/智能電話/Lenovo-K5-Play-智能手機/p/S1076001_S_193268748026
0 comments retrieved from this page with an average rate of 0.0
Getting page 37...
URL: https://www.hktvmall.com/hktv/zh/main/德誠影業公司/s/H6391001/電子電器/電子電器/電腦及週邊設備/平板電腦/Samsung-Staedtler-Noris-Digital-Samsung-SPen-特別版/p/H6391001_S_Noris
1 comments retrieved from this page with an average rate of 5.0
Getting page 38...
URL: https://www.hktvmall.com/hktv/zh/main/LRT/s/H0183001/電子電器/電子電器/電腦及週邊設備/平板電腦/Aegir-智能筆-綠色原裝行貨/p/H0183001_S_Aegir-Teal
0 comments retrieved from this page with an average rate of 0.0
Getting page 39...
URL: https://www.hktvmall.com/hktv/zh/main/Citylink/s/H5300001/電子電器/電子電器/手機及平板電腦/平板電腦/小米平板-4-80-WiFi-64GB-金色平行進口/p/H5300001_S_FXMM1806D9W64GBGD
0 comments retrieved from this page with an average rate of 0.0
Getting page 40...
URL: https://www.hktvmall.com

Unnamed: 0,product_index,product_type,product_brand,product_name,product_price,average_rate,no_of_rates,seller,website
0,H6588001_S_Samsung_TabA_8_2019_WiFi,tablet,三星,"Tab A 8"" 2019 2GB+32GB Wi-Fi 平板電腦 T290 銀色 香港行貨",949.0,5.0,2,輝煌數碼有限公司,https://www.hktvmall.com/hktv/zh/main/輝煌數碼有限公司...
1,H6800001_S_BGT0026,tablet,安博科技,安博科技 UPAD PRO 4G（送套）,1480.0,5.0,1,環球貿易,https://www.hktvmall.com/hktv/zh/main/環球貿易/s/H...
2,H5300001_S_FXMM1806D9E464GBGD4,tablet,小米,"小米平板 4 8.0"" LTE（一年保養）4+64GB 金色",1799.0,0.0,0,Citylink,https://www.hktvmall.com/hktv/zh/main/Citylink...
3,H5300001_S_FSAT510232GBBK,tablet,三星,"Galaxy Tab A 10.1 "" WiFi T510 2GB+32GB 黑色（平行進口）",1799.0,0.0,0,Citylink,https://www.hktvmall.com/hktv/zh/main/Citylink...
4,H6588001_S_Samsung_TabS6_WiFi_128_GY,tablet,三星,Tab S6 6GB+128GB Wi-Fi 平板電腦 T860 灰色 香港行貨,4999.0,0.0,0,輝煌數碼有限公司,https://www.hktvmall.com/hktv/zh/main/輝煌數碼有限公司...
5,H0972006_S_F7,tablet,Teclast,超薄 notebook - F7 (6+128GB),2222.0,3.4,17,ASK-Gadgets,https://www.hktvmall.com/hktv/zh/main/ASK-Gadg...
6,H5300001_S_FSAT5152-32GBBK,tablet,三星,"Galaxy Tab A 10.1 "" LTE T515 2GB+32GB 黑色（平行進口）",2099.0,0.0,0,Citylink,https://www.hktvmall.com/hktv/zh/main/Citylink...
7,H6588001_S_Huawei_M5_Lite_8_LTE,tablet,Huawei,M5 Lite 8 LTE 3GB+32GB 平板電腦 深空灰色 香港行貨,1599.0,4.0,1,輝煌數碼有限公司,https://www.hktvmall.com/hktv/zh/main/輝煌數碼有限公司...
8,H0762001_S_Galaxy-Tab-S5e-LTE-T725-800670050,tablet,三星,Galaxy Tab S5e (LTE) (T725),4148.0,0.0,0,Mob-hk,https://www.hktvmall.com/hktv/zh/main/Mob-hk/s...
9,H5912001_S_T0046,tablet,Huawei,MediaPad M5 lite 平板 4+64GB（香港行貨）,2548.0,0.0,0,UNIVERSAL,https://www.hktvmall.com/hktv/zh/main/UNIVERSA...


In [76]:
p.to_csv("Product_HKTVMall.csv", sep=',', na_rep='N/A', encoding="utf_8_sig")

In [39]:
###### getting plain html ######
BASE_URL = "https://www.hktvmall.com/hktv/zh/"
brand_product_name_list = category_pages[0].find_all("div", class_="brand-product-name")
# https://stackoverflow.com/questions/11205386/python-beautifulsoup-get-an-attribute-value-based-on-the-name-attribute
product_id = category_pages[0].find_all("div", {"class":"product-brief"})
index_list = [pid.attrs["data-id"] for pid in product_id]
price_list = category_pages[0].find_all("div", {"class": "price"})
seller_list = category_pages[0].select(".product-brief > a")

###### attributes to be put in product csv file ######
products = {
    "product_index": index_list,
    "product_type": [],
    "product_brand": [],
    "product_name": [],
    "product_price": [],
    # "rating": [],
    "product_seller": [],
    "website": []
}

###### add attributes ######
for price in price_list:
    products["product_price"].append(handle_price(price.text))

for seller in seller_list:
    seller = seller.attrs["href"]
    products["website"].append(urllib.parse.unquote(BASE_URL + seller))
    seller = seller.split("/")[1]    
    products["product_seller"].append(urllib.parse.unquote(seller))
    

for i, bpn in enumerate(brand_product_name_list):
    s = bpn.text.split(" - ", 1)
    products["product_type"].append("tablet")
    products["product_brand"].append(s[0])
    products["product_name"].append(s[1])

###### create and render dataframe ######
# products_df = pd.DataFrame(products)
# products_df

Unnamed: 0,product_index,product_type,product_brand,product_name,product_price,product_seller,website
0,H6588001_S_Samsung_TabA_8_2019_WiFi,tablet,三星,"Tab A 8"" 2019 2GB+32GB Wi-Fi 平板電腦 T290 銀色 香港行貨",949.0,輝煌數碼有限公司,https://www.hktvmall.com/hktv/zh/main/輝煌數碼有限公司...
1,H6800001_S_BGT0026,tablet,安博科技,安博科技 UPAD PRO 4G（送套）,1480.0,環球貿易,https://www.hktvmall.com/hktv/zh/main/環球貿易/s/H...
2,H5300001_S_FXMM1806D9E464GBGD4,tablet,小米,"小米平板 4 8.0"" LTE（一年保養）4+64GB 金色",1799.0,Citylink,https://www.hktvmall.com/hktv/zh/main/Citylink...
3,H5300001_S_FSAT510232GBBK,tablet,三星,"Galaxy Tab A 10.1 "" WiFi T510 2GB+32GB 黑色（平行進口）",1799.0,Citylink,https://www.hktvmall.com/hktv/zh/main/Citylink...
4,H6588001_S_Samsung_TabS6_WiFi_128_GY,tablet,三星,Tab S6 6GB+128GB Wi-Fi 平板電腦 T860 灰色 香港行貨,4999.0,輝煌數碼有限公司,https://www.hktvmall.com/hktv/zh/main/輝煌數碼有限公司...
5,H0972006_S_F7,tablet,Teclast,超薄 notebook - F7 (6+128GB),2222.0,ASK-Gadgets,https://www.hktvmall.com/hktv/zh/main/ASK-Gadg...
6,H5300001_S_FSAT5152-32GBBK,tablet,三星,"Galaxy Tab A 10.1 "" LTE T515 2GB+32GB 黑色（平行進口）",2099.0,Citylink,https://www.hktvmall.com/hktv/zh/main/Citylink...
7,H6588001_S_Huawei_M5_Lite_8_LTE,tablet,Huawei,M5 Lite 8 LTE 3GB+32GB 平板電腦 深空灰色 香港行貨,1599.0,輝煌數碼有限公司,https://www.hktvmall.com/hktv/zh/main/輝煌數碼有限公司...
8,H0762001_S_Galaxy-Tab-S5e-LTE-T725-800670050,tablet,三星,Galaxy Tab S5e (LTE) (T725),4148.0,Mob-hk,https://www.hktvmall.com/hktv/zh/main/Mob-hk/s...
9,H5912001_S_T0046,tablet,Huawei,MediaPad M5 lite 平板 4+64GB（香港行貨）,2548.0,UNIVERSAL,https://www.hktvmall.com/hktv/zh/main/UNIVERSA...


In [48]:
products["rating"] = []
products["comment_num"] = []
for i in range(len(products["website"])):
    print("Getting page " + str(i + 1) + "...")
    print("url: " + products["website"][i])
    soup = run_asyncio(get_item_page, products["website"][i])
    avg_rating = soup.find("span", {"class": "averageRating"})
    comment_num = soup.find("span", {"class": "comment"})
    products["rating"].append(x.text)
    
    print(comment_num.text)
    comment_num = int(re.findall(r"\d+", comment_num.text)[0])
    products["comment_num"].append(comment_num)
    print(x.text)
    

Getting page 1...
url: https://www.hktvmall.com/hktv/zh/main/輝煌數碼有限公司/s/H6588001/電子電器/電子電器/手機及平板電腦/平板電腦/Tab-A-8-2019-2GB32GB-WiFi-平板電腦-T290-銀色-香港行貨/p/H6588001_S_Samsung_TabA_8_2019_WiFi

                    (2 則用家評論)
                
0.0
Getting page 2...
url: https://www.hktvmall.com/hktv/zh/main/環球貿易/s/H6800001/電子電器/電子電器/電腦及週邊設備/平板電腦/安博科技-UPAD-PRO-4G送套/p/H6800001_S_BGT0026

                    (1 則用家評論)
                
0.0
Getting page 3...
url: https://www.hktvmall.com/hktv/zh/main/Citylink/s/H5300001/電子電器/電子電器/手機及平板電腦/平板電腦/小米平板-4-80-LTE一年保養464GB-金色/p/H5300001_S_FXMM1806D9E464GBGD4

                    (0 則用家評論)
                
0.0
Getting page 4...
url: https://www.hktvmall.com/hktv/zh/main/Citylink/s/H5300001/電子電器/電子電器/手機及平板電腦/平板電腦/Galaxy-Tab-A-101-WiFi-T510-2GB32GB-黑色平行進口/p/H5300001_S_FSAT510232GBBK

                    (0 則用家評論)
                
0.0
Getting page 5...
url: https://www.hktvmall.com/hktv/zh/main/輝煌數碼有限公司/s/H6588001/電子電器/電子電器/手機及平板電腦/平板電腦/Tab-S6-6GB128GB-WiFi-平板電腦


                    (1 則用家評論)
                
0.0
Getting page 38...
url: https://www.hktvmall.com/hktv/zh/main/LRT/s/H0183001/電子電器/電子電器/電腦及週邊設備/平板電腦/Aegir-智能筆-綠色原裝行貨/p/H0183001_S_Aegir-Teal

                    (0 則用家評論)
                
0.0
Getting page 39...
url: https://www.hktvmall.com/hktv/zh/main/Citylink/s/H5300001/電子電器/電子電器/手機及平板電腦/平板電腦/小米平板-4-80-WiFi-64GB-金色平行進口/p/H5300001_S_FXMM1806D9W64GBGD

                    (0 則用家評論)
                
0.0
Getting page 40...
url: https://www.hktvmall.com/hktv/zh/main/COOLMALL/s/H5688002/電子電器/電子電器/電腦及週邊設備/平板電腦/Zagg-Universal-FLEX-Keyboard-with-Stand/p/H5688002_S_DG049300147

                    (0 則用家評論)
                
0.0
Getting page 41...
url: https://www.hktvmall.com/hktv/zh/main/DISTEXPRESS/s/H0710001/電子電器/電子電器/電腦及週邊設備/平板電腦/多功能底座-iPad-平板-手機-橙色1個/p/H0710001_S_NS-ORG

                    (0 則用家評論)
                
0.0
Getting page 42...
url: https://www.hktvmall.com/hktv/zh/main/Beauty-Balance-International-Group-Limited/s/S1235001/

In [49]:
products_df = pd.DataFrame(products)
products_df

Unnamed: 0,product_index,product_type,product_brand,product_name,product_price,product_seller,website,rating,comment_num
0,H6588001_S_Samsung_TabA_8_2019_WiFi,tablet,三星,"Tab A 8"" 2019 2GB+32GB Wi-Fi 平板電腦 T290 銀色 香港行貨",949.0,輝煌數碼有限公司,https://www.hktvmall.com/hktv/zh/main/輝煌數碼有限公司...,0.0,2
1,H6800001_S_BGT0026,tablet,安博科技,安博科技 UPAD PRO 4G（送套）,1480.0,環球貿易,https://www.hktvmall.com/hktv/zh/main/環球貿易/s/H...,0.0,1
2,H5300001_S_FXMM1806D9E464GBGD4,tablet,小米,"小米平板 4 8.0"" LTE（一年保養）4+64GB 金色",1799.0,Citylink,https://www.hktvmall.com/hktv/zh/main/Citylink...,0.0,0
3,H5300001_S_FSAT510232GBBK,tablet,三星,"Galaxy Tab A 10.1 "" WiFi T510 2GB+32GB 黑色（平行進口）",1799.0,Citylink,https://www.hktvmall.com/hktv/zh/main/Citylink...,0.0,0
4,H6588001_S_Samsung_TabS6_WiFi_128_GY,tablet,三星,Tab S6 6GB+128GB Wi-Fi 平板電腦 T860 灰色 香港行貨,4999.0,輝煌數碼有限公司,https://www.hktvmall.com/hktv/zh/main/輝煌數碼有限公司...,0.0,0
5,H0972006_S_F7,tablet,Teclast,超薄 notebook - F7 (6+128GB),2222.0,ASK-Gadgets,https://www.hktvmall.com/hktv/zh/main/ASK-Gadg...,0.0,17
6,H5300001_S_FSAT5152-32GBBK,tablet,三星,"Galaxy Tab A 10.1 "" LTE T515 2GB+32GB 黑色（平行進口）",2099.0,Citylink,https://www.hktvmall.com/hktv/zh/main/Citylink...,0.0,0
7,H6588001_S_Huawei_M5_Lite_8_LTE,tablet,Huawei,M5 Lite 8 LTE 3GB+32GB 平板電腦 深空灰色 香港行貨,1599.0,輝煌數碼有限公司,https://www.hktvmall.com/hktv/zh/main/輝煌數碼有限公司...,0.0,1
8,H0762001_S_Galaxy-Tab-S5e-LTE-T725-800670050,tablet,三星,Galaxy Tab S5e (LTE) (T725),4148.0,Mob-hk,https://www.hktvmall.com/hktv/zh/main/Mob-hk/s...,0.0,0
9,H5912001_S_T0046,tablet,Huawei,MediaPad M5 lite 平板 4+64GB（香港行貨）,2548.0,UNIVERSAL,https://www.hktvmall.com/hktv/zh/main/UNIVERSA...,0.0,0
