## Website
https://www.hktvmall.com/hktv/zh/homenfamily

## Environment and Imports
No need to run the cell below if you have already installed the packages

In [116]:
!pip install pyppeteer
!pip install pandas



In [115]:
from pyppeteer import launch
import asyncio
import nest_asyncio
import re
import pandas as pd
from bs4 import BeautifulSoup as bs

In [3]:
nest_asyncio.apply()

In [4]:
def run_asyncio(func, args=None):
    return asyncio.get_event_loop().run_until_complete(func(args))

## Getting Pages

In [76]:
TABLET = 0
LAPTOP = 1
PHONE = 2
urls = [
    "https://www.hktvmall.com/hktv/zh/search?page=0&q=%3Arelevance%3Acategory%3AAA32301500001%3Azone%3Ahomenfamily%3Astreet%3Amain%3A",
    "https://www.hktvmall.com/hktv/zh/search?page=0&q=%3Arelevance%3Acategory%3AAA32300500001%3Azone%3Ahomenfamily%3Astreet%3Amain%3A",
    "https://www.hktvmall.com/hktv/zh/search?page=0&q=%3Arelevance%3Acategory%3AAA32201010001%3Azone%3Ahomenfamily%3Astreet%3Amain%3A"
]

In [77]:
# https://www.hktvmall.com/hktv/zh/search?categoryTag=AA32301500001 tablet
# https://www.hktvmall.com/hktv/zh/search?categoryTag=AA32300500001 laptop
# https://www.hktvmall.com/hktv/zh/search?categoryTag=AA32201010001 smartphone


In [78]:
async def get_item_page(url: str):
    '''
    get the web page
    params:
        url: str, url of the web page
    return: 
        page: pyppeteer.page.Page, the web page
    '''
    browser = await launch()
    page = await browser.newPage()
    await page.setJavaScriptEnabled(enabled=True)
    await page.goto(url)
    return page

In [102]:
category_pages = []
category_pages_py = []
for url in urls:
    page = run_asyncio(get_item_page, url)
    print(asyncio.run(page.title()))
    # store beautiful soup object
    category_pages.append(bs(asyncio.get_event_loop().run_until_complete(page.content()), "lxml"))
    # store pyppeteer object
    category_pages_py.append(page)

平板電腦 | 香港電視 HKTVmall 網上購物
手提電腦 | 香港電視 HKTVmall 網上購物
智能電話 | 香港電視 HKTVmall 網上購物


In [103]:
def get_page_num(page):
    '''
    get number of page of the current category
    params:
        page: BeautifulSoup, the bs object of the webpage
    return:
        str
    '''
    page_string = re.findall(r"/共\d+頁", str(page))
    return page_string

In [104]:
print(str(get_page_num(category_pages[0])) + " pages in tablets")
print(str(get_page_num(category_pages[1])) + " pages in laptops")
print(str(get_page_num(category_pages[2])) + " pages in smart phones")

['/共2頁', '/共2頁'] pages in tablets
['/共7頁', '/共7頁'] pages in laptops
['/共14頁', '/共14頁'] pages in smart phones


## Extracting Information

In [148]:
def handle_price(price_str):
    price_str = price_str.replace("$", '').replace(",", '')
    return float(price_str)

In [169]:
###### getting plain html ######
brand_product_name_list = category_pages[0].find_all("div", class_="brand-product-name")
# https://stackoverflow.com/questions/11205386/python-beautifulsoup-get-an-attribute-value-based-on-the-name-attribute
product_id = category_pages[0].find_all("div", {"class":"product-brief"})
index_list = [pid.attrs["data-id"] for pid in product_id]
price_list = category_pages[0].find_all("div", {"class": "price"})
seller_list = category_pages[0].select(".product-brief > a")

###### attributes in product csv ######
products = {
    "product_index": index_list,
    "product_type": [],
    "product_brand": [],
    "product_name": [],
    "product_price": [],
    "product_seller": []
}

###### add attributes ######
for price in price_list:
    products["product_price"].append(handle_price(price.text))

for seller in seller_list:
    products["product_seller"].append(seller.text)
print(len(seller_list))

for i, bpn in enumerate(brand_product_name_list):
    s = bpn.text.split(" - ", 1)
    products["product_type"].append("tablet")
    products["product_brand"].append(s[0])
    products["product_name"].append(s[1])

###### create and render dataframe ######
# products_df = pd.DataFrame(products)
# products_df

59


In [174]:
seller_list = category_pages[0].select(".product-brief > a")
for seller in seller_list:
    seller = seller.attrs["href"]
    seller = seller.split("/")[]

main/%E7%92%B0%E7%90%83%E8%B2%BF%E6%98%93/s/H6800001/%E9%9B%BB%E5%AD%90%E9%9B%BB%E5%99%A8/%E9%9B%BB%E5%AD%90%E9%9B%BB%E5%99%A8/%E9%9B%BB%E8%85%A6%E5%8F%8A%E9%80%B1%E9%82%8A%E8%A8%AD%E5%82%99/%E5%B9%B3%E6%9D%BF%E9%9B%BB%E8%85%A6/%E5%AE%89%E5%8D%9A%E7%A7%91%E6%8A%80-UPAD-PRO-4G%E9%80%81%E5%A5%97/p/H6800001_S_BGT0026
main/Citylink/s/H5300001/%E9%9B%BB%E5%AD%90%E9%9B%BB%E5%99%A8/%E9%9B%BB%E5%AD%90%E9%9B%BB%E5%99%A8/%E6%89%8B%E6%A9%9F%E5%8F%8A%E5%B9%B3%E6%9D%BF%E9%9B%BB%E8%85%A6/%E5%B9%B3%E6%9D%BF%E9%9B%BB%E8%85%A6/Galaxy-Tab-A-101-WiFi-T510-2GB32GB-%E9%87%91%E8%89%B2%E5%B9%B3%E8%A1%8C%E9%80%B2%E5%8F%A3/p/H5300001_S_FSAT510232GBGD
main/Citylink/s/H5300001/%E9%9B%BB%E5%AD%90%E9%9B%BB%E5%99%A8/%E9%9B%BB%E5%AD%90%E9%9B%BB%E5%99%A8/%E6%89%8B%E6%A9%9F%E5%8F%8A%E5%B9%B3%E6%9D%BF%E9%9B%BB%E8%85%A6/%E5%B9%B3%E6%9D%BF%E9%9B%BB%E8%85%A6/%E5%B0%8F%E7%B1%B3%E5%B9%B3%E6%9D%BF-4-80-LTE%E4%B8%80%E5%B9%B4%E4%BF%9D%E9%A4%8A464GB-%E9%87%91%E8%89%B2/p/H5300001_S_FXMM1806D9E464GBGD4
main/%E8%BC%9D%E7%85%8C%E6%