# Website
### https://www.hksuning.com/

***

# Installation
### If you are running this notebook for the first time, please run the following cell; otherwise, skip it.

In [8]:
!pip install pyppeteer
!pip install beautifulsoup4
!pip install nest_asyncio
!pip install asyncio
!pip install lxml

# Importing

In [1]:
# coding=utf-8

from pyppeteer import launch
import asyncio
import nest_asyncio
from bs4 import BeautifulSoup as bs
import re
from dataclasses import dataclass
import time
import pandas as pd
import requests
import pysnooper

# Data

## Data Set Up

In [2]:
# nest_asyncio patches asyncio to permit nested event-loop runs; several cells
# below call run_until_complete() from inside coroutines and depend on this.
nest_asyncio.apply()

@dataclass
class Product:
    """Schema of one product row; the field names and order match `column_product`.

    This class is not instantiated in the visible cells: `resolve_data` stores
    each product as a plain list in this same field order.
    """
    # NOTE(review): resolve_data stores the item-id key here, which is a
    # regex-captured string rather than an int -- confirm the intended type.
    product_index: int
    # English label looked up in product_type_dict.
    product_type: str
    # Page title with the "自營" / "香港倉" markers removed.
    product_name: str
    # First space-separated token of product_name. The misspelling is kept
    # because the same name is used as the CSV column header.
    prodect_brand: str
    price: float
    # resolve_data uses -1.0 when a product has no comments.
    avg_rating: float
    total_comment: int
    seller: str
    # Product page URL taken from item_urls.
    website: str

@dataclass
class Customer:
    """Schema of one review row; the field names and order match `column_customer`.

    This class is not instantiated in the visible cells: `resolve_customer`
    stores each review as a plain list in this same field order.
    """
    # Item id of the reviewed product (same key as used for the product rows).
    product_index: int
    username: str
    # resolve_customer stores the parsed "qualityStar" value as an int here.
    user_rate: float
    # Timestamp text captured from "publishTime" (digits in Y-M-D H:M:S layout).
    rate_date: str
    buyer_comment: str


        
# Maps the category text shown on a product page to the English label stored
# in the product table (looked up in resolve_data).
product_type_dict = {
    "平板電腦": "Tablet",
    "手機": "Mobile Phone",
    "手提電腦": "Laptop"
}

# Display options used when the DataFrames are printed in save_csv.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 500)
pd.set_option('display.max_colwidth', 50)


# Column headers for the two CSV files; the order must match the row lists
# built in resolve_data / resolve_customer.
# NOTE(review): 'prodect_brand' is misspelled but is written to the CSV header,
# so renaming it would change the output schema.
column_product = ['product_index', 'product_type', 'product_name', 'prodect_brand', 'price', 'avg_rating', 'total_comment', 'seller', 'website']
column_customer = ['product_index', 'username', 'user_rate', 'rate_date', 'buyer_comment']

# list_browsers collects every browser opened by get_pages; list_pages is
# currently unused (its only append is commented out).
list_pages = []
list_browsers = []

## Data storage

In [3]:
# One list per product, in column_product order (filled by resolve_data).
all_product = []

# One list per review, in column_customer order (filled by resolve_customer).
all_customer = []

# Item id -> product page URL (filled by get_url).
item_urls = {}

## Websites to crawl

In [4]:
# Category name -> listing URL. get_pages addresses the entries by position,
# so the insertion order here is significant.
all_urls = {
        "tablet": "https://search.hksuning.com/search/list?ci=503692", # url_tablet
        "laptop": "https://search.hksuning.com/search/list?ci=503694", # url_laptop
        "phone": "https://search.hksuning.com/search/list?ci=503505" # url_phone
}

# Return web page object of each category

In [5]:
async def get_item(categroy: str):
    """Launch a non-headless browser and open ``categroy`` in a new page.

    Args:
        categroy: URL to navigate to. The misspelled parameter name is kept so
            that existing keyword callers keep working.

    Returns:
        A ``(page, browser)`` tuple. The caller is responsible for closing
        ``browser``.

    Raises:
        Any exception raised while creating the page or navigating. In that
        case the freshly launched browser is closed first so it does not leak.
    """
    print("Setting up broswer for web page...")

    browser = await launch({
            "headless": False,
        })
    try:
        page = await browser.newPage()

        await page.setViewport({'width': 1920, 'height': 1080})
        # 10000 * 30 = 300000 ms navigation timeout.
        await page.goto(categroy, {'timeout': 10000*30})
    except BaseException:
        # Clean up on any failure (including cancellation), then re-raise.
        await browser.close()
        raise
    return page, browser

# Start crawling

## Get page of each item

In [6]:
def get_pages(num):
    """Open the ``num``-th category listing and report how many pages it has.

    Args:
        num: Position of the category inside ``all_urls`` (insertion order).

    Returns:
        ``(page, browser, page_num)``. The browser is also appended to
        ``list_browsers``.
    """
    category, category_url = list(all_urls.items())[num]
    loop = asyncio.get_event_loop()

    page, browser = loop.run_until_complete(get_item(category_url))
    page_num = loop.run_until_complete(get_page_num(page))

    print(f"There are total {page_num} pages about {category}")

    # Keep a handle on every opened browser.
    list_browsers.append(browser)

    return page, browser, page_num

## Return the number of pages of one category

In [7]:
async def get_page_num(page) -> int:
    """Read the total page count ("共N頁") from a listing page.

    Args:
        page: A page object whose ``content()`` coroutine returns the HTML.

    Returns:
        The page count as an int. Multi-digit counts are supported.

    Raises:
        ValueError: If no "共N頁" text is found in the page source.
    """
    content = bs(await page.content(), 'lxml')
    # \d+ so that listings with 10 or more pages are parsed correctly.
    match = re.search(r"共\s*(\d+)\s*頁", content.prettify())
    if match is None:
        raise ValueError("Could not find the page count (共N頁) in the page source")
    return int(match.group(1))

##  There is no need to run the 3 cells below!

In [7]:
# Optional scratch cell: open the tablet listing on its own and print its page count.
# The URL is read from all_urls (the previously used name `u` was never defined).
page_tablet, browser_tablet = asyncio.get_event_loop().run_until_complete(get_item(all_urls["tablet"]))

page_num_tablet = asyncio.get_event_loop().run_until_complete(
        get_page_num(page_tablet)
    )
print(f"There are total {page_num_tablet} pages about tablet")

In [25]:
# Optional scratch cell: open the laptop listing on its own and print its page count.
# The URL is read from all_urls (the previously used name `u` was never defined).
page_laptop, browser_laptop = asyncio.get_event_loop().run_until_complete(get_item(all_urls["laptop"]))

page_num_laptop = asyncio.get_event_loop().run_until_complete(
        get_page_num(page_laptop)
    )
print(f"There are total {page_num_laptop} pages about laptop")

# asyncio.get_event_loop().run_until_complete(get_url(page_laptop))

In [38]:
# Optional scratch cell: open the phone listing, print its page count and collect its item URLs.
# The URL is read from all_urls (the previously used name `u` was never defined).
page_phone, browser_phone = asyncio.get_event_loop().run_until_complete(get_item(all_urls["phone"]))

page_num_phone = asyncio.get_event_loop().run_until_complete(
        get_page_num(page_phone)
    )
print(f"There are total {page_num_phone} pages about mobile phone")

# NOTE: get_url is defined in a later cell; that cell must have been run first.
asyncio.get_event_loop().run_until_complete(get_url(page_phone))
# await browser_phone.close()

### Have next page button?

In [8]:
async def have_button(page) -> bool:
    """Tell whether the listing page shows a usable "next page" button.

    Args:
        page: A page object whose ``content()`` coroutine returns the HTML.

    Returns:
        True when an ``<a class="next" ...>`` element followed by ``<b>`` is
        present and does not carry ``style="display: none;"``. False when the
        button is hidden or not present at all.
    """
    source = bs(await page.content(), 'lxml').prettify()
    # Non-greedy: stop at the first "><b>" after the anchor. A greedy match
    # would extend to the last "<b>" in the document and could pick up an
    # unrelated 'display: none' further down the page.
    next_button_regex = re.compile(r'<a class="next"[\s\S]*?>\s*<b>')

    match = next_button_regex.search(source)
    if match is None:
        # No next button in the markup: nothing to click.
        return False

    return 'style="display: none;"' not in match.group(0)

# asyncio.get_event_loop().run_until_complete(have_button(page_laptop))

### Last page

In [9]:
async def click_last_page(page) ->bool:
    """Click the pager's ``a.prev`` link (i.e. go back to the previous page).

    Args:
        page: A page object exposing an awaitable ``click(selector)``.

    Returns:
        True if the click succeeded, False if ``click`` raised an ordinary
        exception. Cancellation and interpreter-exit signals are not swallowed.
    """
    last_btn_selector = "#bottom_pager > div > a.prev"
    try:
        await page.click(last_btn_selector)
    except Exception:
        # Best effort: a missing or unclickable button is reported as False.
        return False
    return True
        
# asyncio.get_event_loop().run_until_complete(click_last_page(page_tablet))

### Click next page button

In [10]:
async def click_next_page(page) -> bool:
    """Click the pager's "next" link when one is available.

    Args:
        page: A page object exposing ``content()`` and ``click(selector)``.

    Returns:
        True if a next button was found and clicked; False if there is no
        next button or the click raised an ordinary exception.
    """
    next_btn_selector = "#bottom_pager > div > a.next"

    # Await the check directly instead of re-entering the event loop.
    if not await have_button(page):
        return False

    try:
        await page.click(next_btn_selector)
    except Exception:
        # Best effort: an unclickable button is reported as False.
        return False
    return True

## Resolve Data

In [11]:
async def resolve_data(page, key):
    """Scrape one product page and append the result to ``all_product``.

    The row is appended in ``column_product`` order. When the page reports at
    least one review, the review tab is opened and ``resolve_customer`` is
    called to collect the reviews and the average rating.

    Args:
        page: An already loaded product page.
        key: Item id of the product; also used to look up the URL in
            ``item_urls``.

    Raises:
        KeyError: If the category text on the page is not in
            ``product_type_dict``.
        IndexError / AttributeError / ValueError: If an expected element
            (category, title, price, seller) is missing or malformed.
    """
    print(f"Getting product {key}...")

    source = bs(await page.content(), 'lxml')

    # Non-blocking pause so the event loop keeps servicing the browser connection.
    await asyncio.sleep(2)

    _product_index = key

    type_html = source.select('#pcFourth > div:nth-child(1) > div > ul > li:nth-child(5) > span > a')
    _product_type = product_type_dict[type_html[0].get_text()]

    title = source.h1.get_text()
    _product_name = title.replace("自營", "").replace("香港倉", "").strip()

    # The brand is taken to be the first space-separated token of the name.
    _product_brand = _product_name.split(" ")[0]

    integer_part = source.find('span', class_='integer').get_text()
    decimal_part = source.find('span', class_='decimal').get_text()
    text_price = f"{integer_part}.{decimal_part}".strip()
    # Drop thousands separators, which float() would reject.
    _price = float(text_price.replace(",", ""))

    tmp_seller = source.select('#pcFourth > div.wrapper.mt15 > div.procon-side > div.si-intro > div.si-intro-list > dl > dd')[0]
    _seller = tmp_seller.get_text().replace("<dd>", "").replace("</dd>", "").strip()

    _website = item_urls.get(key)

    comment_regex = re.compile(r'<a>\s*評價\s*<span>\s*(\D*\d*\D*|\S)\s*</span>')
    comment_str = re.findall(comment_regex, source.prettify())

    # Defaults used when the product has no reviews.
    _avg_rating = -1.0
    _total_comment = 0

    # Compare by value (not identity) and tolerate a page without the review tab.
    if comment_str and comment_str[0] != '':

        # Click the comment tab
        await page.click('#commentNum')

        await asyncio.sleep(1)

        # Poll until the rating bars are rendered, yielding between checks.
        while not await page.querySelector('#appraise > div.rv-wrap > div.rv-container.db > div.rv-rate.rv-bars'):
            await asyncio.sleep(0.1)

        _total_comment, _avg_rating = resolve_customer(key)

    print(f"Got {_total_comment} comments of product {key}")

    one_product = [_product_index, _product_type, _product_name, _product_brand, _price, _avg_rating, _total_comment, _seller, _website]
    all_product.append(one_product)


In [12]:
def resolve_customer(key) -> tuple:
    """Download every review of one product and append them to ``all_customer``.

    Review pages are requested one by one until the response's ``returnMsg``
    no longer contains "成功" (or is missing altogether).

    Args:
        key: Item id of the product; interpolated into the review URL.

    Returns:
        ``(num_comments, rating)`` where ``num_comments`` is the number of
        reviews appended by this call and ``rating`` is the mean of their
        ``qualityStar`` values, or ``-1.0`` when no review was found.

    Raises:
        requests.RequestException: If a request fails or times out.
        IndexError: If a page yields fewer dates, comments or ratings than
            user names.
    """
    print(f"Getting comments of product {key}...")

    # Compiled once; the patterns do not change between pages.
    user_regex = re.compile(r'"nickName":"(\S+)","levelId"')
    date_regex = re.compile(r'"publishTime":"(\d+-\d+-\d+ \d+:\d+:\d+)","publishTimeStr"')
    comment_regex = re.compile(r'"content":"(#?[\S+，?]*\s?\S+#?)","publishTime"')
    rate_regex = re.compile(r'"qualityStar":(\d+),"bestFlag"')
    return_msg_regex = re.compile(r'"returnMsg":"(\w*)","reCloudDrill"')

    exist_comment = len(all_customer)

    _product_index = key

    # Accumulated over ALL pages so the average covers every review.
    total_rate = 0

    p = 1
    while True:
        time.sleep(1)

        r_url = f'https://product.hksuning.com/proxy/review/hk/ajax/review_lists/general-0000000{key}-0000000000-total-{p}-default-10-----reviewList.htm?callback=reviewList'

        # A timeout keeps a stalled connection from hanging the crawl forever.
        page_source = requests.get(r_url, timeout=30).text

        all_username = re.findall(user_regex, page_source)
        all_date = re.findall(date_regex, page_source)
        all_comment = re.findall(comment_regex, page_source)
        all_rate = re.findall(rate_regex, page_source)

        for n in range(len(all_username)):
            _username = all_username[n]
            _rate_date = all_date[n]
            _buyer_comment = all_comment[n]
            _user_rate = int(all_rate[n])

            total_rate += _user_rate

            one_customer = [_product_index, _username, _user_rate, _rate_date, _buyer_comment]
            all_customer.append(one_customer)

        rtn_msg = re.findall(return_msg_regex, page_source)

        # A missing returnMsg is treated like a non-success reply: stop paging.
        if not rtn_msg or "成功" not in rtn_msg[0]:
            num_comments = len(all_customer) - exist_comment
            rating = total_rate / num_comments if num_comments else -1.0
            return num_comments, rating

        p += 1

        time.sleep(1)
 


## There is no need to run the 2 cells below

In [131]:
# Scratch cell: open a single, hard-coded product page for manual inspection.
# The browser stays open until the close_b() cell below is run.
uu = "https://product.hksuning.com/0000000000/10674060101.html"
page_tmp, browser_tmp = asyncio.get_event_loop().run_until_complete(get_item(uu))

In [132]:
async def close_b():
    """Close the scratch browser ``browser_tmp`` opened by the previous cell."""
    # await page_tmp.close()
    await browser_tmp.close()

# Requires browser_tmp to exist, i.e. the previous scratch cell must have been run.
asyncio.get_event_loop().run_until_complete(close_b())

## Save Data

In [13]:
def save_csv(output_dir="Web Crawling/data/Suning"):
    """Write the collected products and reviews to CSV files and print them.

    Args:
        output_dir: Directory that receives ``Products_Suning.csv`` and
            ``Customers_Suning.csv``. It is created if it does not exist.
            The default is the location this notebook has always used.
    """
    from pathlib import Path

    target_dir = Path(output_dir)
    # Without this, to_csv fails when the folder is missing.
    target_dir.mkdir(parents=True, exist_ok=True)

    df_product = pd.DataFrame(all_product, columns=column_product)
    # utf_8_sig writes a BOM at the start of the file.
    df_product.to_csv(target_dir / "Products_Suning.csv", sep=',', na_rep='N/A', encoding="utf_8_sig")
    print(df_product)

    df_customer = pd.DataFrame(all_customer, columns=column_customer)
    df_customer.to_csv(target_dir / "Customers_Suning.csv", sep=',', na_rep='N/A', encoding="utf_8_sig")
    print(df_customer)

# save_csv()

## URLs

### Get all item url

In [14]:
async def get_url(page):
    """Collect the product URLs of a listing, following its "next" button.

    Every product id found on the current page is stored in ``item_urls`` as
    ``id -> product URL``. The function then clicks "next" and repeats until
    there is no next button or the click fails.

    Args:
        page: An already loaded listing page.
    """
    # Dots are escaped and at least one digit is required, so only real
    # product links are captured.
    item_regex = re.compile(r'(?<=//product\.hksuning\.com/0000000000/)(\d+)\.html')

    while True:
        # Give the page time to render before reading its content.
        await asyncio.sleep(2)
        source = bs(await page.content(), 'lxml').prettify()

        # dict.fromkeys removes duplicates while keeping first-seen order.
        for item_id in dict.fromkeys(item_regex.findall(source)):
            item_urls[item_id] = f"https://product.hksuning.com/0000000000/{item_id}.html"

        # click_next_page returns False both when there is no next button and
        # when the click fails; stopping in either case avoids an endless loop.
        if not await click_next_page(page):
            break

        await asyncio.sleep(2)


# asyncio.get_event_loop().run_until_complete(get_url(page_tablet))


### URLs Traversal

In [15]:
def delete_item(required: str, products=None, customers=None):
    """Remove every row that contains ``required`` from the collected data.

    A row matches when ``required`` is equal to one of its fields. The lists
    are filtered in place, so all matching rows are removed, including rows
    that sit next to each other.

    Args:
        required: Value to look for, e.g. a product index.
        products: Product rows to filter; defaults to ``all_product``.
        customers: Review rows to filter; defaults to ``all_customer``.
    """
    product_rows = all_product if products is None else products
    customer_rows = all_customer if customers is None else customers

    # Slice assignment mutates the existing list objects, so other references
    # to them (the module-level lists) see the change.
    product_rows[:] = [row for row in product_rows if required not in row]
    customer_rows[:] = [row for row in customer_rows if required not in row]

# delete_item('10674060101')

In [16]:
async def interrupt_traversal(index: int):
    """Resume the product crawl from position ``index`` of ``item_urls``.

    Every item from ``index`` up to and including the last one is opened,
    scraped with ``resolve_data`` and its browser closed again.

    Args:
        index: Non-negative position (in insertion order) of the first item
            to process.
    """
    # Snapshot once instead of rebuilding the list on every iteration.
    pending = list(item_urls.items())[index:]

    for key, url in pending:
        page, browser = await get_item(url)
        try:
            await resolve_data(page, key)
            await asyncio.sleep(1)
        finally:
            # Close the browser even when scraping fails, so it does not leak.
            await browser.close()

        await asyncio.sleep(3)

# asyncio.get_event_loop().run_until_complete(interrupt_traversal(282))

In [17]:
async def traversal():
    """Open and scrape every product listed in ``item_urls``.

    Each product gets its own browser, which is closed again before moving on
    to the next one.
    """
    # Iterate over a snapshot so the dict may safely change during the crawl.
    for key, value in list(item_urls.items()):
        page, browser = await get_item(value)
        try:
            await resolve_data(page, key)
            await asyncio.sleep(1)
        finally:
            # Close the browser even when scraping fails, so it does not leak.
            await browser.close()

        await asyncio.sleep(3)

# asyncio.get_event_loop().run_until_complete(traversal())

## Main

In [18]:
async def main():
    """Run the whole crawl: collect URLs per category, scrape products, save CSVs."""
    for index in range(len(all_urls)):
        # get_pages is synchronous and drives the event loop itself.
        page, browser, _page_num = get_pages(index)
        try:
            await get_url(page)
            await asyncio.sleep(0.5)
        finally:
            # Close the listing browser even when URL collection fails.
            await browser.close()

        print(f"Got total {len(item_urls)} URLs")

        await asyncio.sleep(2)

    await traversal()

    save_csv()


In [19]:
# Empty the result lists in place so a re-run does not append duplicate rows.
# item_urls is not cleared here.
all_product.clear()
all_customer.clear()

## Run this cell for running main program

In [20]:
# Entry point: runs the full crawl and writes the CSV files via save_csv().
asyncio.get_event_loop().run_until_complete(main())