# Website
### https://www.hksuning.com/

***

# Installation
### If you are running this notebook for the first time, please the follow one sell, otherwise, please ignore.

In [None]:
!pip install pyppeteer
!pip install beautifulsoup4
!pip install nest_asyncio
!pip install asyncio
!pip install lxml

# Importing

In [70]:
# coding=utf-8

from pyppeteer import launch
import asyncio
import nest_asyncio
from bs4 import BeautifulSoup as bs
import re
from dataclasses import dataclass
import xml.etree.ElementTree as et
import time
import pandas as pd
import requests
from multiprocessing.dummy import Pool as ThreadPool

In [44]:
nest_asyncio.apply()

# Websites that are needed to be crawling

In [45]:
urls = [
        "https://search.hksuning.com/search/list?ci=503692", # url_tablet
        "https://search.hksuning.com/search/list?ci=503694", # url_laptop
        "https://search.hksuning.com/search/list?ci=503505" # url_phone
]

# Return all web pages of each category

In [46]:
async def get_item(categroy: str):
    # browser = await launch()
    browser = await launch({"headless": False})
    page = await browser.newPage()
    await page.setViewport(viewport={'width': 1920, 'height': 1080})
    await page.goto(categroy)
    return page, browser

# Start crawling

## Get pages of each item page

In [47]:
list_pages = []
list_browsers = []

In [48]:
async def get_page_num(page) -> int:
    content = bs(await page.content(), 'lxml')
    page_num = re.findall(re.compile(r"共\d頁"), content.prettify())[0][1]
    return int(page_num)

In [51]:
def get_all_pages():
    for url in urls:
        page, browser = asyncio.get_event_loop().run_until_complete(get_item(url))
        list_pages.append(page)
        list_browsers.append(browser)

In [7]:
page_tablet, browser_tablet = asyncio.get_event_loop().run_until_complete(get_item(urls[0]))

page_num_tablet = asyncio.get_event_loop().run_until_complete(
        get_page_num(page_tablet)
    )
print(f"There are total {page_num_tablet} pages about tablet")

There are total 2 pages about tablet


In [8]:
page_laptop, browser_laptop = asyncio.get_event_loop().run_until_complete(get_item(urls[1]))

page_num_laptop = asyncio.get_event_loop().run_until_complete(
        get_page_num(page_laptop)
    )
print(f"There are total {page_num_laptop} pages about laptop")

There are total 2 pages about laptop


In [53]:
page_phone, browser_phone = asyncio.get_event_loop().run_until_complete(get_item(urls[2]))

page_num_phone = asyncio.get_event_loop().run_until_complete(
        get_page_num(page_phone)
    )
print(f"There are total {page_num_phone} pages about mobile phone")

There are total 5 pages about mobile phone


## Next Page

### Have next page button?

In [9]:
async def have_button(page) -> bool:

    source = bs(await page.content(), 'lxml').prettify()
    if 'name="ssdln_503692_bottom_pgup02-3" style="display: none;"' in source:
        return False
    else:
        return True

asyncio.get_event_loop().run_until_complete(have_button(page_tablet))

True

### Last page

In [12]:
async def click_last_page(page) ->bool:
    last_btn_selector = "#bottom_pager > div > a.prev"
    try:
        await page.click(last_btn_selector)
        return True
    except:
        return False
        
# asyncio.get_event_loop().run_until_complete(click_last_page(page_tablet))

### Click next page button

In [13]:
async def click_next_page(page) -> bool:
    next_btn_selector = "#bottom_pager > div > a.next"
    is_next = asyncio.get_event_loop().run_until_complete(have_button(page))

    if is_next:
        await page.click(next_btn_selector)
        return True
    else:
        # print("No next page!")
        return False

## Data storage

### Data Class

In [25]:
@dataclass
class Product:
    product_index: int
    product_type: str
    product_name: str
    prodect_brand: str
    price: float
    avg_rating: float
    total_comment: int
    seller: str
    website: str

@dataclass
class Customer:
    product_index: int
    username: str
    user_rate: float
    rate_date: str
    buyer_comment: str


        
product_type_dict = {
    "平板電腦": "Tablet",
    "手機": "Mobile Phone",
    "手提電腦": "Laptop"
}

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 500)
pd.set_option('display.max_colwidth', 50)

all_product = []
all_customer = []
column_product = ['product_index', 'product_type', 'product_name', 'prodect_brand', 'price', 'avg_rating', 'total_comment', 'seller', 'website']
column_customer = ['product_index', 'username', 'user_rate', 'rate_date', 'buyer_comment']

### Resolve Data

In [15]:
async def resolve_data(page, key):

    print(f"Getting product {key}...")

    # Click the comment tab
    await page.click('#commentNum')

    while not await page.querySelector('#rv-main > div > div.rv-main-target > div > div:nth-child(1) > div > div.rv-target-topic.clearfix > div.topic-right > div.topic-main.l > div.topic-title.clearfix') :
        pass

    time.sleep(1)

    source = bs(await page.content(), 'lxml')

    _product_index = key
    
    type_html = source.select('#pcFourth > div:nth-child(1) > div > ul > li:nth-child(5) > span > a')
    _product_type = product_type_dict[type_html[0].get_text()]
    
    title = source.h1.get_text()
    _product_name = title.replace("自營", "").replace("香港倉", "").strip()
    
    _product_brand = _product_name.split(" ")[0]
    
    text_price = f"{source.find('span', class_='integer').get_text()}.{source.find('span', class_='decimal').get_text()}".strip()
    _price = float(text_price)
    
    tmp_rate = source.select('#appraise > div.rv-wrap > div.rv-container.db > div.rv-rate.rv-bars > div.rv-rate-wrap.clearfix.rv-rate-empty > div.rv-rate-item.rv-rate-score.l > div > p.score > span')
    _avg_rating = float(tmp_rate[0].get_text()) / 20  # Full score is 5
    
    try:
        _tmp_total_comment = int(source.select('#commentNum > a > span')[0].get_text()[1])  # To be modifying
    except IndexError:
        _tmp_total_comment = 0
    
    tmp_seller = source.select('#pcFourth > div.wrapper.mt15 > div.procon-side > div.si-intro > div.si-intro-list > dl > dd')[0]
    _seller = tmp_seller.get_text().replace("<dd>", "").replace("</dd>", "").strip()
    
    _website = dict_urls.get(key)

    # print(_total_comment)

    await page.evaluate("""{window.scrollBy(0, document.body.scrollHeight);}""")

    if _tmp_total_comment > 0:

        resolve_customer(key)

        _total_comment = len(all_customer)

    one_product = [_product_index, _product_type, _product_name, _product_brand, _price, _avg_rating, _total_comment, _seller, _website]
    all_product.append(one_product)

    


In [24]:
def resolve_customer(key):
    _product_index = key
    
    have_more_comment = True

    p = 1
    while have_more_comment:
        time.sleep(0.5)
        
        r_url = f'https://product.hksuning.com/proxy/review/hk/ajax/review_lists/general-0000000{key}-0000000000-total-{p}-default-10-----reviewList.htm?callback=reviewList'

        page_source = requests.get(r_url).text

        user_regex = re.compile(r'"nickName":"(\S+)","levelId"')
        date_regex = re.compile(r'"publishTime":"(\d+-\d+-\d+ \d+:\d+:\d+)","publishTimeStr"')
        comment_regex = re.compile(r'"content":"(\S+)","publishTime"')
        rate_regex = re.compile(r'"qualityStar":(\d+),"bestFlag"')

        all_username = re.findall(user_regex, page_source)
        all_date = re.findall(date_regex, page_source)
        all_comment = re.findall(comment_regex, page_source)
        all_rate = re.findall(rate_regex, page_source)

        num_user = len(all_username)

        for n in range(num_user):
            _username = all_username[n]
            _rate_date = all_rate[n]
            _buyer_comment = all_comment[n]
            _user_rate = all_date[n]

            one_customer = [_product_index, _username, _user_rate, _rate_date, _buyer_comment]
            all_customer.append(one_customer) 

        p = p + 1

        rtn_msg = re.findall(re.compile(r'"returnMsg":"(\w+)","reCloudDrill"'), page_source)[0]

        if "无评价数据" in rtn_msg:
            have_more_comment = False
            print("End")
            return
        
        print(f"Got {key} comments page {p}")
      
        # print(_rate_date)
 


In [20]:
u = list(dict_urls.items())[0]
page_tmp, browser_tmp = asyncio.get_event_loop().run_until_complete(get_item(u[1]))
print(u)

('10964124069', 'https://product.hksuning.com/0000000000/10964124069.html')


In [23]:
asyncio.get_event_loop().run_until_complete(resolve_data(page_tmp, u[0]))
save_csv()

Getting product 10964124069...
Got comments page 2
Got comments page 3
Got comments page 4
Got comments page 5
Got comments page 6
Got comments page 7
Got comments page 8
Got comments page 9
Got comments page 10
Got comments page 11
Got comments page 12
Got comments page 13
Got comments page 14
Got comments page 15
Got comments page 16
Got comments page 17
Got comments page 18
Got comments page 19
Got comments page 20
Got comments page 21
Got comments page 22
Got comments page 23
End
  product_index product_type                                       product_name prodect_brand   price  avg_rating  total_comment seller                                            website
0   10964124069       Tablet  APPLE IPAD AIR 10.5 64GB WIFI GOLD MUUL2ZP/A 平板電腦         APPLE  3699.0         5.0            214   香港蘇寧  https://product.hksuning.com/0000000000/109641...
    product_index username            user_rate rate_date   buyer_comment
0     10964124069    6***8  2019-06-20 17:25:27         5      

### Store Data

In [17]:
def save_csv():
    df_product = pd.DataFrame(all_product, columns=column_product)
    df_product.to_csv("Products_Suning.csv", sep=',', na_rep='N/A', encoding="utf_8_sig")
    print(df_product)

    df_customer = pd.DataFrame(all_customer, columns=column_customer)
    df_customer.to_csv("Customers_Suning.csv", sep=',', na_rep='N/A', encoding="utf_8_sig")
    print(df_customer)

# save_csv()

## URLs

In [18]:
dict_urls = {}

### Get all item url

In [19]:
async def get_url(page):   

    time.sleep(2)
    source = bs(await page.content(), 'lxml').prettify()

    item_id = re.findall(
            re.compile(r'(?<=//product.hksuning.com/0000000000/)(\d*).html'),
            source
        )
    item_id = list(dict.fromkeys(item_id))
    tmp_dict = {}
    
    for i in item_id:
        url_patten = f"https://product.hksuning.com/0000000000/{i}.html"
        tmp_dict[i] = url_patten
        dict_urls.update(tmp_dict)
        
    while asyncio.get_event_loop().run_until_complete(have_button(page)):        
        # Click next page button
        asyncio.get_event_loop().run_until_complete(click_next_page(page))

        print("next")
        # await page.screenshot({'path': 'example.png'})
        time.sleep(2)
        
        # Get URLs
        asyncio.get_event_loop().run_until_complete(get_url(page))

        print("Got")


asyncio.get_event_loop().run_until_complete(get_url(page_tablet))
print(len(dict_urls))

next
Got
80


### URLs Traversal

In [38]:
async def traversal(page):
    source = bs(await page.content(), 'lxml').prettify()
    
    for key, value in dict_urls.items():
        page, browser = asyncio.get_event_loop().run_until_complete(get_item(value))        
        asyncio.get_event_loop().run_until_complete(resolve_data(page, key))
        
        await browser.close()
        
# asyncio.get_event_loop().run_until_complete(traversal(page_tablet))

## Crawl data

In [37]:
async def get_data(page):
    
    # Get URLs
    asyncio.get_event_loop().run_until_complete(get_url(page))
    asyncio.get_event_loop().run_until_complete(traversal(page))
    
    

# asyncio.get_event_loop().run_until_complete(get_data(page_tablet))
# len(dict_urls)

In [69]:
def run_get_data(page):
    asyncio.get_event_loop().run_until_complete(get_data(page))

## Main

In [71]:
def rtn_pages():
    return [p for p in list_pages]

In [73]:
async def main():
    get_all_pages()

    q = multiprocessing.Manager().Queue()
    pool = multiprocessing.Pool()

    for page in list_pages:
        q.put(page)
    
    for i in range(len(list_pages)):
        pool.apply_async(run_get_data, args=(q, ))
        
asyncio.get_event_loop().run_until_complete(main())

AttributeError: Can't pickle local object 'FrameManager.__init__.<locals>.<lambda>'

In [None]:
# Close all browsers
for b in list_browsers:
    await b.close()