## Website
https://www.hktvmall.com/hktv/zh/homenfamily

## Environment and Imports
No need to run the cell below if you have already installed the packages

In [1]:
!pip install pyppeteer
!pip install pandas



In [2]:
import asyncio
import os
import re
import urllib.parse

import nest_asyncio
import pandas as pd
from bs4 import BeautifulSoup as bs
from pyppeteer import launch

In [3]:
# Patch asyncio so run_until_complete can be used from inside the notebook,
# which presumably already has an event loop running.
nest_asyncio.apply()

# DataFrame display: no column limit, 500-character lines and cell values.
pd.options.display.max_columns = None
pd.options.display.width = 500
pd.options.display.max_colwidth = 500

In [4]:
def run_asyncio(func, args=None):
    '''
    run a one-argument coroutine function to completion
    params:
        func: coroutine function that accepts a single positional argument
        args: the value handed to func (None when omitted)
    return:
        the value produced by the coroutine
    '''
    loop = asyncio.get_event_loop()
    coroutine = func(args)
    return loop.run_until_complete(coroutine)

## Getting Pages

In [5]:
# Position of each product category inside `urls`.
TABLET, LAPTOP, PHONE = range(3)

# Search URL (page=0) for each category, listed in the same order as the
# indices above; only the category code differs between the three.
urls = [
    "https://www.hktvmall.com/hktv/zh/search?page=0&q=%3Arelevance%3Acategory%3A"
    + category_code
    + "%3Azone%3Ahomenfamily%3Astreet%3Amain%3A"
    for category_code in ("AA32301500001", "AA32300500001", "AA32201010001")
]

In [6]:
# https://www.hktvmall.com/hktv/zh/search?categoryTag=AA32301500001 tablet
# https://www.hktvmall.com/hktv/zh/search?categoryTag=AA32300500001 laptop
# https://www.hktvmall.com/hktv/zh/search?categoryTag=AA32201010001 smartphone
# Same display settings as the configuration cell near the top, except that
# max_colwidth is set to 50 here, which overrides the earlier value of 500.
pd.options.display.max_columns = None
pd.options.display.width = 500
pd.options.display.max_colwidth = 50

In [7]:
async def get_item_page(url: str):
    '''
    open the url in a pyppeteer browser and parse the resulting HTML
    params:
        url: str, url of the web page
    return:
        soup: BeautifulSoup, the page content parsed with the lxml parser
    raises:
        whatever pyppeteer raises while opening or loading the page; the
        browser is closed before the error propagates
    '''
    browser = await launch()
    try:
        page = await browser.newPage()
        await page.setJavaScriptEnabled(enabled=True)
        await page.goto(url)
        html = await page.content()
    finally:
        # Callers retry on failure, so the browser must be shut down even when
        # loading fails; otherwise every failed attempt leaves one running.
        await browser.close()
    return bs(html, "lxml")

In [8]:
def get_page_num(page):
    '''
    find the "total number of pages" marker(s) in the page source
    params:
        page: BeautifulSoup, the bs object of the webpage (converted with str())
    return:
        list of str, every substring of the form "/共<digits>頁";
        an empty list when no such marker is present
    '''
    page_source = str(page)
    total_pages_marker = re.compile(r"/共\d+頁")
    return total_pages_marker.findall(page_source)

In [9]:
def correct_page(page):
    '''
    tell whether the page source contains at least one page-count marker
    params:
        page: BeautifulSoup, the bs object of the webpage
    return:
        bool, True when get_page_num finds one or more markers
    '''
    markers = get_page_num(page)
    return bool(markers)

## Extracting Information

In [10]:
def handle_price(price_str):
    '''
    convert a displayed price such as "$1,234.50" into a number
    params:
        price_str: str, price text that may contain "$" and "," characters
    return:
        float, the price with every "$" and "," removed
    '''
    # Drop every dollar sign and thousands separator in a single pass.
    without_symbols = price_str.translate(str.maketrans("", "", "$,"))
    return float(without_symbols)

In [11]:
def store_customer_review(url, soup, customers):
    '''
    store the review of customers
    params:
        url: str, url of the page; its last "/"-separated part is used as the product index
        soup: BeautifulSoup, soup of the page
        customers: Dictionary, customer review to be placed in DataFrame;
            one value is appended to each of its five lists per review
    notes:
        a missing username / date / comment element is stored as None instead
        of raising, and a comment without a non-breaking space is stored whole
        (stripped), so the five lists always grow by the same amount
    '''
    ind = url.split("/")[-1]
    review_list = soup.find_all("div", {"class": "product-review-wrapper"})
    pattern = re.compile("(?<=\\xa0).*") # e.g. "評論 :\xa0Excellent"

    def _text_or_none(node, tag, css_class):
        # Text of the first matching child, or None when there is no such child.
        found = node.find(tag, {"class": css_class})
        return None if found is None else found.text

    # Build every row before touching `customers`, so an unexpected error
    # cannot leave its columns with different lengths.
    rows = []
    for review in review_list:
        name = _text_or_none(review, "span", "review-username")
        date = _text_or_none(review, "span", "review-date")
        rate = len(review.find_all("span", {"class": "star"}))
        content = _text_or_none(review, "div", "review-title")
        if content is not None:
            content = str(content)
            match = pattern.search(content)
            # Keep only the text after the label when the separator is present.
            content = match.group(0) if match else content.strip()
        rows.append([ind, name, date, rate, content])

    columns = ("product_index", "username", "rate_date", "user_rate", "buyer_comment")
    for row in rows:
        for column, value in zip(columns, row):
            customers[column].append(value)
        print(row)
        

In [12]:
def extract_and_store(soup_of_page, p_type, max_retries=None):
    '''
    extract relevant information for every product listed on a search page,
    then visit each product page for its rating and customer reviews
    params:
        soup_of_page: BeautifulSoup object of the search-result page
        p_type: str, type of product, tablet, smartphone, or laptop
        max_retries: int or None, how many times a failing product page is
            retried; None (default) retries until it succeeds. When the limit
            is reached the product's rating fields are stored as None.
    return:
        (products_df, customers_df): two pandas DataFrames, one row per
        product and one row per customer review respectively
    raises:
        ValueError from pandas when the page yields a different number of
        ids, prices, links and names (the columns would have unequal lengths)
    '''

    BASE_URL = "https://www.hktvmall.com/hktv/zh/"
    brand_product_name_list = soup_of_page.find_all("div", class_="brand-product-name")
    # https://stackoverflow.com/questions/11205386/python-beautifulsoup-get-an-attribute-value-based-on-the-name-attribute
    product_id = soup_of_page.find_all("div", {"class": "product-brief"})
    index_list = [pid.attrs["data-id"] for pid in product_id]
    price_list = soup_of_page.find_all("div", {"class": "price"})
    seller_list = soup_of_page.select(".product-brief > a")

    ###### attributes to be placed in product csv file ######
    products = {
        "product_index": index_list,
        "product_type": [],
        "product_brand": [],
        "product_name": [],
        "product_price": [],
        "average_rate": [],
        "no_of_rates": [],
        "seller": [],
        "website": []
    }

    customers = {
        "product_index": [],
        "username": [],
        "rate_date": [],
        "user_rate": [],
        "buyer_comment": []
    }

    ###### add attributes ######
    for price in price_list:
        products["product_price"].append(handle_price(price.text))

    for seller in seller_list:
        href = seller.attrs["href"]
        products["website"].append(urllib.parse.unquote(BASE_URL + href))
        # The second path component of the link is used as the seller name.
        products["seller"].append(urllib.parse.unquote(href.split("/")[1]))

    for bpn in brand_product_name_list:
        # Titles look like "<brand> - <name>"; without the separator the whole
        # title is kept as the name and the brand is left empty.
        brand, separator, name = bpn.text.partition(" - ")
        if not separator:
            brand, name = None, bpn.text
        products["product_type"].append(p_type)
        products["product_brand"].append(brand)
        products["product_name"].append(name)

    ###### visit every product page for rating and reviews ######
    for i, website in enumerate(products["website"]):
        attempts = 0
        while True:
            attempts += 1
            # Remember how long each review column is so that a failed attempt
            # can be undone and a retry never stores the same review twice.
            checkpoint = {column: len(values) for column, values in customers.items()}
            try:
                print("Getting page " + str(i + 1) + "...")
                print("URL: " + website)
                soup = run_asyncio(get_item_page, website)
                avg_rating = soup.find("span", {"class": "averageRating"}).text
                comment_text = soup.find("span", {"class": "comment"}).text
                comment_num = int(re.findall(r"\d+", comment_text)[0])
                print(str(comment_num) + " comments retrieved from this page with an average rate of " + str(avg_rating))

                if comment_num > 0:
                    store_customer_review(website, soup, customers)

            # Exception (not a bare except) so the notebook can still be interrupted.
            except Exception as err:
                for column, length in checkpoint.items():
                    del customers[column][length:]
                if max_retries is not None and attempts > max_retries:
                    print("Giving up on this page after " + str(attempts) + " attempts: " + repr(err))
                    avg_rating, comment_num = None, None
                else:
                    print("Error when getting page: " + repr(err) + " ... running again")
                    continue

            # Append exactly once per product, only after the attempt is settled,
            # so both columns stay aligned with the other product columns.
            products["average_rate"].append(avg_rating)
            products["no_of_rates"].append(comment_num)
            break

    products_df = pd.DataFrame(products)
    customers_df = pd.DataFrame(customers)
    return products_df, customers_df

## Run

In [14]:
# Search-result page to scrape (smartphone category, page=9 in the URL) and
# the folder the two CSV files are written to.
OUTPUT_DIR = "./data/HKTVMall"
url = "https://www.hktvmall.com/hktv/zh/search?page=9&q=%3Arelevance%3Acategory%3AAA32201010001%3Azone%3Ahomenfamily%3Astreet%3Amain%3A"

# Create the output folder before scraping: otherwise a missing directory
# makes to_csv fail only after every product page has been fetched.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Reload the search page until its source contains the page-count marker.
soup = run_asyncio(get_item_page, url)
while not correct_page(soup):
    print("Problem with page source... aborted, running again")
    soup = run_asyncio(get_item_page, url)

p, c = extract_and_store(soup, "smartphone")
p.to_csv(OUTPUT_DIR + "/Products_HKTVMall_smartphone10.csv", sep=',', na_rep='N/A', encoding="utf_8_sig")
c.to_csv(OUTPUT_DIR + "/Customers_HKTVMall_smartphone10.csv", sep=',', na_rep='N/A', encoding="utf_8_sig")

Getting page 1...
URL: https://www.hktvmall.com/hktv/zh/main/SRSWorks-Limited/s/H5302003/電子電器/電子電器/手機及平板電腦/手機周邊配件/保護貼/Huawei-P30-全屏高清玻璃保護貼-黑/p/H5302003_S_L38GL26018
0 comments retrieved from this page with an average rate of 0.0
Getting page 2...
URL: https://www.hktvmall.com/hktv/zh/main/Praesto-Lifestyle/s/H5892002/電子電器/電子電器/手機及平板電腦/手機周邊配件/手機外殼/iPhone-Xs-Presidio-Sport-Black-Gunmetal-Grey/p/H5892002_S_117133-6683
0 comments retrieved from this page with an average rate of 0.0
Getting page 3...
URL: https://www.hktvmall.com/hktv/zh/main/ASK-Gadgets/s/H0972006/電子電器/電子電器/視聽娛樂/其他/其他影音器材-&-配件/AM160-Wireless-MultiRoom-Stereo-Amplifier/p/H0972006_S_AM160
0 comments retrieved from this page with an average rate of 0.0
Getting page 4...
URL: https://www.hktvmall.com/hktv/zh/main/Praesto-Lifestyle/s/H5892002/電子電器/電子電器/手機及平板電腦/手機周邊配件/手機外殼/Presidio-Clear-Print-iPhone-8-iPhone-7-iPhone-6s-iPhone-6-Case/p/H5892002_S_848709047205
0 comments retrieved from this page with an average rate of 0.0
Getti

0 comments retrieved from this page with an average rate of 0.0
Getting page 37...
URL: https://www.hktvmall.com/hktv/zh/main/煒信電信電子科技公司/s/H7007001/電子電器/電子電器/手機及平板電腦/手提電話/智能電話/香港行貨棱鏡白GALAXY-S10-G9750-8GB-RAM-128GB-ROM/p/H7007001_S_SAMG9750H01WH01
0 comments retrieved from this page with an average rate of 0.0
Getting page 38...
URL: https://www.hktvmall.com/hktv/zh/main/凱訊數碼有限公司/s/H7019001/電子電器/電子電器/手機及平板電腦/手提電話/智能電話/4GLTE-智能手機-Enjoy-MAX-黑色/p/H7019001_S_Enjoy_MAX_Black
0 comments retrieved from this page with an average rate of 0.0
Getting page 39...
URL: https://www.hktvmall.com/hktv/zh/main/Xero/s/H5982001/電子電器/電子電器/手機及平板電腦/手提電話/智能電話/Sony-Xperia-1-J9110-6gb-128gb-White-平行進口/p/H5982001_S_sonyj9110white
0 comments retrieved from this page with an average rate of 0.0
Getting page 40...
URL: https://www.hktvmall.com/hktv/zh/main/HKTVmall-Outlet/s/H1137001/電子電器/電子電器/手機及平板電腦/手機周邊配件/智能手錶-配置/贈送黑色錶帶優惠裝-智能手錶手環多色錶帶健康監察關愛家人來電提示顯示WhatsApp微信WeChat-Facebook信息-陳列品-紅色/p/S2069001_S_q9balckred
0 commen