# Website
### https://www.hksuning.com/

***

# Installation
### If you are running this notebook for the first time, run the following cell; otherwise, skip it.

In [None]:
!pip install pyppeteer
!pip install beautifulsoup4
!pip install nest_asyncio
!pip install asyncio
!pip install lxml

# Importing

In [1]:
# coding=utf-8

from pyppeteer import launch
import asyncio
import nest_asyncio
from bs4 import BeautifulSoup as bs
import re
from dataclasses import dataclass
import xml.etree.ElementTree as et
import time
import pandas as pd

In [2]:
# Patch asyncio so that run_until_complete() may be called while an event loop
# is already running (the cells below call it repeatedly, sometimes nested).
nest_asyncio.apply()

# Websites to crawl

In [15]:
# Category listing pages to crawl; `urls` keeps the order tablet, laptop, phone.
_category_urls = {
    "tablet": "https://search.hksuning.com/search/list?ci=503692",
    "laptop": "https://search.hksuning.com/search/list?ci=503694",
    "phone": "https://search.hksuning.com/search/list?ci=503505",
}
urls = list(_category_urls.values())

# Open a category page and return it together with its browser

In [18]:
async def get_item(categroy: str, timeout: int = 30000):
    """Launch a browser, open ``categroy`` in a new tab and return both handles.

    Args:
        categroy: URL to load. The misspelled parameter name is kept so that
            existing keyword callers continue to work.
        timeout: Maximum navigation time in milliseconds, forwarded to
            ``page.goto``. The default matches pyppeteer's own default.

    Returns:
        A ``(page, browser)`` tuple. The caller is responsible for closing
        ``browser``.

    Raises:
        Any exception raised while creating the tab or navigating (for example
        a navigation timeout). The browser is closed before it propagates.
    """
    browser = await launch()
    try:
        page = await browser.newPage()
        await page.setViewport(viewport={'width': 3840, 'height': 2160})
        await page.goto(categroy, {'timeout': timeout})
    except Exception:
        # Without this the launched browser process would be left running
        # whenever navigation fails, since the caller never receives a handle.
        await browser.close()
        raise
    return page, browser

# Start crawling

## Get the number of result pages for each category

In [5]:
# Handles of every category page opened below, and of the browser owning each one.
list_pages, list_browsers = [], []

In [6]:
async def get_page_num(page) -> int:
    """Return the total number of result pages announced by the pager.

    The count is read from the "共N頁" text in the page source.

    Args:
        page: pyppeteer page showing a category listing.

    Returns:
        The page count as an integer (all digits are read, so 10+ pages work).

    Raises:
        IndexError: if the page source contains no "共N頁" counter.
    """
    content = bs(await page.content(), 'lxml')
    # Capture every digit; optional whitespace is tolerated around the number.
    match = re.search(r"共\s*(\d+)\s*頁", content.prettify())
    if match is None:
        raise IndexError("page counter '共N頁' not found in the page source")
    return int(match.group(1))

In [7]:
# Open every category URL in its own browser and remember both handles.
for url in urls:
    page, browser = asyncio.get_event_loop().run_until_complete(get_item(url))
    list_pages.append(page)
    list_browsers.append(browser)

In [85]:
# Open the tablet listing (urls[0]) in a separate browser and read its page count.
page_tablet, browser_tablet = asyncio.get_event_loop().run_until_complete(get_item(urls[0]))

page_num_tablet = asyncio.get_event_loop().run_until_complete(
        get_page_num(page_tablet)
    )
print(f"There are total {page_num_tablet} pages about tablet")

There are total 2 pages about tablet


In [10]:
# Open the laptop listing (urls[1]) in a separate browser and read its page count.
page_laptop, browser_laptop = asyncio.get_event_loop().run_until_complete(get_item(urls[1]))

page_num_laptop = asyncio.get_event_loop().run_until_complete(
        get_page_num(page_laptop)
    )
print(f"There are total {page_num_laptop} pages about laptop")

There are total 2 pages about laptop


## Debug: the mobile-phone category fails with a navigation timeout

In [17]:
# Open the mobile-phone listing (urls[2]) in a separate browser and read its page count.
# NOTE(review): the recorded run of this cell ended with a navigation timeout.
page_phone, browser_phone = asyncio.get_event_loop().run_until_complete(get_item(urls[2]))

page_num_phone = asyncio.get_event_loop().run_until_complete(
        get_page_num(page_phone)
    )
print(f"There are total {page_num_phone} pages about mobile phone")

TimeoutError: Navigation Timeout Exceeded: 30000 ms exceeded.

## Next Page

### Is there a next-page button?

In [19]:
async def have_button(page) -> bool:
    """Report whether the listing still offers a usable "next page" button.

    The button is considered absent when the page source contains an element
    named ``ssdln_<digits>_bottom_pgup02-3`` styled ``display: none;``.
    The digits are matched generically rather than hard-coded to 503692
    (the ``ci`` value of the tablet URL), so listings with a different number
    are handled too -- presumably that number is the category id.

    Args:
        page: pyppeteer page showing a category listing.

    Returns:
        False when the hidden next-page marker is present, True otherwise.
    """
    source = bs(await page.content(), 'lxml').prettify()
    hidden_next = re.search(
        r'name="ssdln_\d+_bottom_pgup02-3" style="display: none;"', source
    )
    return hidden_next is None

# Quick check against the tablet listing; the result is shown as the cell output.
asyncio.get_event_loop().run_until_complete(have_button(page_tablet))

True

### Go back to the previous page

In [91]:
async def click_last_page(page) -> bool:
    """Click the pager's "previous page" link on a best-effort basis.

    Despite the name, the selector targets ``a.prev``, i.e. the page before
    the current one, not the final page.

    Args:
        page: pyppeteer page showing a category listing.

    Returns:
        True if the click succeeded, False if it raised an ordinary error
        (for example because the link is missing).
    """
    last_btn_selector = "#bottom_pager > div > a.prev"
    try:
        await page.click(last_btn_selector)
        return True
    except Exception:
        # Only ordinary errors are swallowed; task cancellation and
        # KeyboardInterrupt/SystemExit must still propagate.
        return False
        
# Try the previous-page click on the tablet listing; the result is shown as the cell output.
asyncio.get_event_loop().run_until_complete(click_last_page(page_tablet))

True

### Click next page button

In [21]:
async def click_next_page(page) -> bool:
    """Click the pager's "next page" link if ``have_button`` reports one.

    Args:
        page: pyppeteer page showing a category listing.

    Returns:
        True if the next-page link was clicked, False if there is no next page.
    """
    next_btn_selector = "#bottom_pager > div > a.next"

    # Await the coroutine directly: calling run_until_complete() from inside a
    # running coroutine re-enters the event loop and only works when patched.
    if not await have_button(page):
        return False

    await page.click(next_btn_selector)
    return True

## Data storage

### Data Class

In [152]:
@dataclass
class Product:
    """Schema of one product row; fields appear in the same order as ``column_product``.

    NOTE(review): the visible code never instantiates this class -- rows are
    appended to ``all_product`` as plain lists.
    """
    # Key of the product in dict_urls (annotated int, although resolve_data
    # stores the key unchanged).
    product_index: int
    # English category label taken from product_type_dict.
    product_type: str
    # Page title with the "自營" / "香港倉" markers removed.
    product_name: str
    # First space-separated token of product_name. The spelling "prodect" is
    # also used as the CSV column name, so it is part of the data format.
    prodect_brand: str
    # Price built from the page's integer and decimal spans.
    price: float
    # Score shown on the page divided by 20 (full score is 5).
    avg_rating: float
    # Number of comments recorded for the product.
    total_comment: int
    # Seller text from the product page's side panel.
    seller: str
    # Product page URL looked up in dict_urls.
    website: str

@dataclass
class Customer:
    """Schema of one review row; fields appear in the same order as ``column_customer``.

    NOTE(review): the visible code never instantiates this class -- rows are
    appended to ``all_customer`` as plain lists.
    """
    # Key of the reviewed product (same value as Product.product_index).
    product_index: int
    # Reviewer name as shown on the page.
    username: str
    # Reviewer's rating; resolve_customer currently stores an empty string here.
    user_rate: float
    # Review timestamp text, matched as "d-d-d d:d:d" digit groups.
    rate_date: str
    # Review text.
    buyer_comment: str

        
# Maps the category label found on a product page to the English label
# stored in the product_type column.
product_type_dict = dict([
    ("平板電腦", "Tablet"),
    ("手機", "Mobile Phone"),
    ("手提電腦", "Laptop"),
])

# Display settings for printed DataFrames: show every column, allow wide
# lines and cap the width of each cell.
for _option_name, _option_value in (
    ('display.max_columns', None),
    ('display.width', 500),
    ('display.max_colwidth', 50),
):
    pd.set_option(_option_name, _option_value)

# Row accumulators filled while scraping, plus the column headers used when
# the rows are turned into DataFrames.
all_product, all_customer = [], []
column_product = [
    'product_index',
    'product_type',
    'product_name',
    'prodect_brand',
    'price',
    'avg_rating',
    'total_comment',
    'seller',
    'website',
]
column_customer = [
    'product_index',
    'username',
    'user_rate',
    'rate_date',
    'buyer_comment',
]

### Resolve Data

In [146]:
async def resolve_data(page, key):
    """Scrape the product page shown in ``page`` and append one row to ``all_product``.

    Args:
        page: pyppeteer page that has already navigated to the product page.
        key: Product id; stored as ``product_index`` and used to look the
            product URL up in ``dict_urls``.

    Side effects:
        Clicks the comment tab, appends one row (ordered like
        ``column_product``) to ``all_product`` and, when the page reports at
        least one comment, lets ``resolve_customer`` append the visible
        reviews to ``all_customer``.

    Raises:
        pyppeteer's timeout error if the review panel never appears;
        IndexError, AttributeError or ValueError if an expected element is
        missing or the price/score text is not numeric.
    """
    review_header_selector = '#rv-main > div > div.rv-main-target > div > div:nth-child(1) > div > div.rv-target-topic.clearfix > div.topic-right > div.topic-main.l > div.topic-title.clearfix'

    # Click the comment tab
    await page.click('#commentNum')

    # Wait for the review panel header. waitForSelector gives up after its
    # timeout instead of re-querying in a tight loop with no upper bound.
    await page.waitForSelector(review_header_selector)

    source = bs(await page.content(), 'lxml')

    _product_index = key

    type_html = source.select('#pcFourth > div:nth-child(1) > div > ul > li:nth-child(5) > span > a')
    type_label = type_html[0].get_text().strip()
    # Unknown categories keep their original label instead of raising KeyError.
    _product_type = product_type_dict.get(type_label, type_label)

    title = source.h1.get_text()
    _product_name = title.replace("自營", "").replace("香港倉", "").strip()

    _product_brand = _product_name.split(" ")[0]

    text_price = f"{source.find('span', class_='integer').get_text()}.{source.find('span', class_='decimal').get_text()}".strip()
    # Drop thousands separators so that e.g. "1,299.00" parses.
    _price = float(text_price.replace(",", ""))

    tmp_rate = source.select('#appraise > div.rv-wrap > div.rv-container.db > div.rv-rate.rv-bars > div.rv-rate-wrap.clearfix.rv-rate-empty > div.rv-rate-item.rv-rate-score.l > div > p.score > span')
    _avg_rating = float(tmp_rate[0].get_text()) / 20  # Full score is 5

    # Read every digit of the comment counter; a missing counter means 0.
    comment_nodes = source.select('#commentNum > a > span')
    count_match = re.search(r'\d+', comment_nodes[0].get_text()) if comment_nodes else None
    _total_comment = int(count_match.group()) if count_match else 0

    tmp_seller = source.select('#pcFourth > div.wrapper.mt15 > div.procon-side > div.si-intro > div.si-intro-list > dl > dd')[0]
    # get_text() already drops the surrounding <dd> tags.
    _seller = tmp_seller.get_text().strip()

    _website = dict_urls.get(key)

    if _total_comment > 0:
        # Parse the page again so the review list is read from a fresh snapshot.
        source = bs(await page.content(), 'lxml')

        # TODO: walk through the remaining review pages as well.
        # The count reported by the page is kept as total_comment; the return
        # value of resolve_customer only describes the reviews it visited.
        resolve_customer(source, key)

    one_product = [_product_index, _product_type, _product_name, _product_brand, _price, _avg_rating, _total_comment, _seller, _website]
    all_product.append(one_product)


In [150]:
def resolve_customer(source, key) -> int:
    """Append one row per review found in ``source`` to ``all_customer``.

    Args:
        source: Parsed product page (BeautifulSoup document).
        key: Product id stored in the first column of every row.

    Returns:
        The number of review rows appended (0 when the page shows none).
    """
    _product_index = key

    _user_regex = re.compile(r'<span>(\S+)</span>')
    _date_regex = re.compile(r'<span>(\d+-\d+-\d+ \d+:\d+:\d+)</span>')

    all_username = source.find_all('div', {'class': 'username'})
    all_date = source.find_all('div', {'class': 'date l'})
    all_comment = source.find_all('p', {'class': 'body-content'})

    added = 0
    # zip() stops at the shortest list, so a review that lacks one of the
    # three elements cannot cause an IndexError.
    for user_tag, date_tag, comment_tag in zip(all_username, all_date, all_comment):
        user_match = _user_regex.search(str(user_tag))
        _username = user_match.group(1) if user_match else user_tag.get_text().strip()

        # Take the whole text so comments containing spaces are kept intact.
        _buyer_comment = comment_tag.get_text().strip()

        _user_rate = ''  # per-review rating is not extracted yet

        date_match = _date_regex.search(str(date_tag))
        _rate_date = date_match.group(1) if date_match else ''

        all_customer.append([_product_index, _username, _user_rate, _rate_date, _buyer_comment])
        added += 1

    return added

In [97]:
# Take the first (product id, URL) pair from dict_urls and open it for a trial run.
u = list(dict_urls.items())[0]
page_tmp, browser_tmp = asyncio.get_event_loop().run_until_complete(get_item(u[1]))

In [153]:
# Scrape that single product, then write the collected rows to the CSV files.
asyncio.get_event_loop().run_until_complete(resolve_data(page_tmp, u[0]))
save_csv()

product_index username user_rate            rate_date buyer_comment
0    10964145615    9***0            2019-08-15 13:22:14   很方便，正品正貨還有折
1    10964145615    9***7            2019-08-24 08:06:18          發貨迅速
2    10964145615    7***2            2019-10-09 18:26:13   买家没有填写评价内容！
3    10964145615    7***7            2019-10-09 16:15:23   买家没有填写评价内容！
4    10964145615    7***1            2019-10-09 15:51:05   买家没有填写评价内容！
5    10964145615    7***1            2019-10-09 14:55:41   买家没有填写评价内容！
6    10964145615    6***7            2019-10-08 11:45:27   买家没有填写评价内容！
7    10964145615    7***2            2019-10-07 20:46:22   买家没有填写评价内容！
8    10964145615    7***2            2019-10-07 18:11:29   买家没有填写评价内容！
9    10964145615    7***3            2019-10-06 19:37:04   买家没有填写评价内容！
10   10964145615    7***2            2019-10-06 17:56:22   买家没有填写评价内容！
11   10964145615    7***7            2019-10-05 14:31:39   买家没有填写评价内容！


### Store Data

In [148]:
def save_csv():
    """Write the collected rows to Products.csv and Customers.csv.

    Both files are built from the module-level accumulators and share the
    same CSV settings. The customer table is printed after it is written.
    """
    csv_options = dict(sep=',', na_rep='N/A', encoding="utf_8_sig")

    products = pd.DataFrame(all_product, columns=column_product)
    products.to_csv('Products.csv', **csv_options)

    customers = pd.DataFrame(all_customer, columns=column_customer)
    customers.to_csv('Customers.csv', **csv_options)
    print(customers)


## URLs

In [92]:
# Product id -> product page URL; filled in by get_url().
dict_urls = dict()

### Get all item URLs

In [93]:
async def get_url(page):
    """Collect the product URLs of ``page`` and of every following result page.

    Every product id found in the page source is stored in ``dict_urls`` as
    ``id -> https://product.hksuning.com/0000000000/<id>.html``. The pager is
    then advanced with ``click_next_page`` until it reports no further page.

    Args:
        page: pyppeteer page showing a category listing. It is left on the
            last result page when the function returns.
    """
    # Dots are escaped and at least one digit is required, so only real
    # product links match.
    item_regex = re.compile(r'(?<=//product\.hksuning\.com/0000000000/)(\d+)\.html')

    async def _harvest():
        # Record every product link of the page currently displayed.
        source = bs(await page.content(), 'lxml').prettify()
        for item_id in item_regex.findall(source):
            dict_urls[item_id] = f"https://product.hksuning.com/0000000000/{item_id}.html"

    await _harvest()

    # Iterate instead of recursing once per page; click_next_page() returns
    # False when there is no next page.
    while await click_next_page(page):
        print("next")
        # Give the next page a moment to render without blocking the event
        # loop (time.sleep would freeze every other coroutine).
        await asyncio.sleep(1)

        # Get URLs
        await _harvest()
        print("Got")


# Collect product URLs starting from the tablet listing, then report how many are stored.
asyncio.get_event_loop().run_until_complete(get_url(page_tablet))
print(len(dict_urls))

next
80
Got
80


### URLs Traversal

In [28]:
async def traversal(page):
    """Scrape every product URL stored in ``dict_urls``.

    Each product is opened in its own browser, parsed with ``resolve_data``
    and followed by ``save_csv()`` so the CSV files are refreshed after every
    product.

    Args:
        page: Not used; kept so existing callers that pass a page keep working.

    Raises:
        Whatever ``get_item`` or ``resolve_data`` raise. The browser opened
        for the failing product is closed before the error propagates.
    """
    for key, value in dict_urls.items():
        item_page, browser = await get_item(value)
        try:
            await resolve_data(item_page, key)
            save_csv()
        finally:
            # Close the per-product browser even when scraping fails, so a
            # long crawl does not pile up orphaned browser processes.
            await browser.close()
        
# Scrape every product URL currently stored in dict_urls.
asyncio.get_event_loop().run_until_complete(traversal(page_tablet))

## Crawl data

In [32]:
async def get_data(page):
    """Crawl one category: collect its product URLs, then scrape each product.

    Args:
        page: pyppeteer page showing the first result page of a category.
    """
    # Await the coroutines directly rather than re-entering the event loop
    # with run_until_complete() from inside a running coroutine.

    # Get URLs
    await get_url(page)
    await traversal(page)
    
    

# Full run for the tablet listing; the number of stored URLs is shown as the cell output.
asyncio.get_event_loop().run_until_complete(get_data(page_tablet))
len(dict_urls)

## Main

In [79]:
async def main():
    """Crawl every category page in ``list_pages``, then close all category browsers.

    The browsers in ``list_browsers`` are closed even if a crawl fails.
    """
    try:
        for page in list_pages:
            await get_data(page)
    finally:
        # Close all browsers
        for b in list_browsers:
            await b.close()
        
# asyncio.get_event_loop().run_until_complete(main())