這邊會分成三部分:
* A網站爬蟲
* B網站爬蟲
* 顯示資料的程式
A網站的部分會是momo購物網, B網站是PChome 24h購物網, 而要比價的商品是iPhone 7 Plus 128G. 在搜集一定量資料後, 可以透過圖表觀察價格的變化.

### momo購物網爬蟲
這邊爬的是momo購物網的行動版網頁, 要注意的是送出請求時必須在標頭附上User-Agent才可以.

In [None]:
import urllib.parse
import requests
import time
import json
import os
from bs4 import BeautifulSoup


# Store identifier; written into the saved record and used in the output file name.
STORE = 'momo'
# Base URL of the momo mobile site; product links are built on top of it.
MOMO_MOBILE_URL = 'http://m.momoshop.com.tw/'
# Search page template; '%s' is replaced by the URL-encoded query.
MOMO_QUERY_URL = MOMO_MOBILE_URL + 'mosearch/%s.html'
# Browser-like User-Agent sent with every request.
USER_AGENT_VALUE = (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/58.0.3029.110 Safari/537.36'
)


def get_web_content(query, timeout=10):
    """Download the momo mobile search page for ``query`` and parse it.

    Args:
        query: Search keywords; they are URL-encoded before being placed
            into the search URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        A ``BeautifulSoup`` document on success, or an empty list when the
        request raised a ``requests`` exception or the response is falsy
        (``requests`` treats 4xx/5xx responses as falsy).
    """
    encoded_query = urllib.parse.quote(query)
    query_url = MOMO_QUERY_URL % encoded_query
    headers = {'User-Agent': USER_AGENT_VALUE}
    try:
        resp = requests.get(query_url, headers=headers, timeout=timeout)
    except requests.exceptions.RequestException as e:
        # Network failures are reported and turned into "no result" so the
        # caller's falsy check handles them like an HTTP error.
        print(e)
        return []
    if not resp:
        return []
    resp.encoding = 'UTF-8'
    return BeautifulSoup(resp.text, 'html.parser')


def search_momo(query):
    """Search momo for ``query`` and return the products found on the result page.

    Args:
        query: Search keywords passed to ``get_web_content``.

    Returns:
        A list of dicts with the keys ``name``, ``price`` (int), ``url`` and
        ``img_url``. The list is empty when the page could not be fetched or
        contains no product list, so callers can always iterate over it or
        take its length.
    """
    dom = get_web_content(query)
    if not dom:
        return []

    # The product list lives in <... id="itemizedStyle"><ul><li>...; a page
    # without that structure simply has no results to report.
    container = dom.find(id='itemizedStyle')
    if container is None or container.ul is None:
        return []

    items = []
    for element in container.ul.find_all('li'):
        name_node = element.find('p', 'prdName')
        price_node = element.find('b', 'price')
        link_node = element.find('a')
        # Skip rows that are not complete product entries.
        if name_node is None or price_node is None or link_node is None:
            continue
        href = link_node.get('href')
        if not href:
            continue
        try:
            # Prices use thousands separators; an empty or non-numeric price
            # raises ValueError and the row is skipped.
            item_price = int(price_node.text.replace(',', ''))
        except ValueError:
            continue

        img_node = link_node.img
        items.append({
            'name': name_node.text,
            'price': item_price,
            # urljoin avoids a doubled slash when href starts with '/'.
            'url': urllib.parse.urljoin(MOMO_MOBILE_URL, href),
            'img_url': img_node.get('src', '') if img_node is not None else '',
        })
    return items


def save_search_result(data):
    """Write ``data`` as UTF-8 JSON to ``json/<date>-<STORE>.json``.

    Args:
        data: Dict to serialise; ``data['date']`` is used as the file name
            prefix.

    The ``json`` directory is created when it does not exist yet, so the
    first run does not fail with ``FileNotFoundError``.
    """
    output_dir = 'json'
    os.makedirs(output_dir, exist_ok=True)
    file_path = os.path.join(output_dir, data['date'] + '-%s.json' % STORE)
    with open(file_path, 'w', encoding='UTF-8') as file:
        # ensure_ascii=False keeps non-ASCII product names readable in the file.
        json.dump(data, file, indent=2, ensure_ascii=False)


def main():
    """Search momo for the target phone, print every hit and save the result."""
    keyword = 'iPhone 7 Plus 128G'
    found = search_momo(keyword)
    date_label = time.strftime('%m-%d')  # month-day, also the file name prefix

    print('Search item \'%s\' from %s...' % (keyword, STORE))
    print('Search %d records on %s' % (len(found), date_label))
    for entry in found:
        print(entry)

    save_search_result({
        'date': date_label,
        'store': STORE,
        'items': found
    })


if __name__ == '__main__':
    main()

### PChome 24h API爬蟲
PChome 24h有提供API, 所以就不用爬網頁了.

In [None]:
import html
import urllib.parse
import time
import json
import requests
import os
from requests.adapters import HTTPAdapter


# Store identifier; written into the saved record and used in the output file name.
STORE = 'pchome'
# Per-request timeout in seconds.
SESSION_TIMEOUT = 3
# Number of retries configured on the HTTP adapter.
SESSION_MAX_RETRIES = 3
# Search API template: encoded query, minimum price, maximum price.
PCHOME_API_ENDPOINT = (
    'http://ecshweb.pchome.com.tw/search/v3.3/all/results'
    '?q=%s&sort=rnk&price=%s-%s'
)
# Prefixes joined with the product id / picture path returned by the API.
PCHOME_PRODUCT_URL_PREFIX = 'http://24h.pchome.com.tw/prod/'
PCHOME_IMG_URL_PREFIX = 'http://ec1img.pchome.com.tw/'


def get_web_content(query_url):
    """Send a GET request to ``query_url`` with retries and a timeout.

    Args:
        query_url: Absolute URL to request.

    Returns:
        The ``requests.Response`` (whatever its status code), or ``None``
        when the request raised a ``requests`` exception; the exception is
        printed in that case.
    """
    # The context manager closes the session's pooled connections on exit.
    # The request is not streamed, so the body is already downloaded and the
    # returned response stays usable afterwards.
    with requests.Session() as session:
        retrying_adapter = HTTPAdapter(max_retries=SESSION_MAX_RETRIES)
        # Mount on the scheme prefixes so retries also apply to redirects.
        for prefix in ('http://', 'https://'):
            session.mount(prefix, retrying_adapter)
        try:
            # The timeout unit is second.
            return session.get(query_url, timeout=SESSION_TIMEOUT)
        except requests.exceptions.RequestException as e:
            print(e)
            return None


def collect_items(raw_data):
    """Convert one decoded API response into a list of product dicts.

    Args:
        raw_data: Decoded JSON object of a search response; its ``prods``
            entry holds the raw product records.

    Returns:
        A list of dicts with the keys ``name``, ``price`` (int),
        ``describe``, ``img_url`` and ``url``. Records that miss a field or
        carry an unusable value are skipped. A missing or null ``prods``
        entry yields an empty list.
    """
    extracted_items = []
    # 'prods' can be null (the search code checks for that), so treat a
    # null or absent list as "no products" instead of iterating over None.
    for raw_item in raw_data.get('prods') or []:
        try:
            item = {
                'name': html.unescape(raw_item['name']),
                'price': int(raw_item['price']),
                'describe': raw_item['describe'],
                'img_url': PCHOME_IMG_URL_PREFIX + raw_item['picB'],
                'url': PCHOME_PRODUCT_URL_PREFIX + raw_item['Id'],
            }
        except (KeyError, TypeError, ValueError):
            # Best effort: drop malformed records, but let unrelated errors
            # (e.g. programming mistakes) surface instead of hiding them.
            continue
        extracted_items.append(item)
    return extracted_items


def _fetch_json(url):
    """Return the decoded JSON object served at ``url``, or ``None`` if unavailable."""
    resp = get_web_content(url)
    if not resp:
        return None
    resp.encoding = 'UTF-8'
    try:
        data = resp.json()
    except ValueError as e:
        # The body was not valid JSON (e.g. an HTML error page).
        print(e)
        return None
    return data if isinstance(data, dict) else None


def search_pchome(query, min_price, max_price):
    """Search PChome 24h for ``query`` within a price range, across all pages.

    Args:
        query: Search keywords; they are URL-encoded before use.
        min_price: Lower bound of the price filter.
        max_price: Upper bound of the price filter.

    Returns:
        A list of product dicts as produced by ``collect_items``. The list
        is empty when the first request fails, is not valid JSON, or
        reports no products. Later pages that fail are skipped.
    """
    encoded_query = urllib.parse.quote(query)
    query_url = PCHOME_API_ENDPOINT % (encoded_query, str(min_price), str(max_price))

    first_page = _fetch_json(query_url)
    if first_page is None or first_page.get('prods') is None:
        return []

    # The un-paged response is used as page 1 (as in the single-page case),
    # so only pages 2..N need another request.
    items = collect_items(first_page)
    total_page_count = int(first_page.get('totalPage') or 1)
    for page in range(2, total_page_count + 1):
        page_data = _fetch_json(query_url + '&page=' + str(page))
        if page_data is not None and page_data.get('prods') is not None:
            items += collect_items(page_data)
    return items


def save_search_result(data):
    """Write ``data`` as UTF-8 JSON to ``json/<date>-<STORE>.json``.

    Args:
        data: Dict to serialise; ``data['date']`` is used as the file name
            prefix.

    The ``json`` directory is created when it does not exist yet, so the
    first run does not fail with ``FileNotFoundError``.
    """
    output_dir = 'json'
    os.makedirs(output_dir, exist_ok=True)
    file_path = os.path.join(output_dir, data['date'] + '-%s.json' % STORE)
    with open(file_path, 'w', encoding='UTF-8') as file:
        # ensure_ascii=False keeps non-ASCII product names readable in the file.
        json.dump(data, file, indent=2, ensure_ascii=False)


def main():
    """Search PChome for the target phone in a price range, print and save the hits."""
    keyword = 'iphone 7 128g plus'
    lowest, highest = 20000, 40000
    found = search_pchome(keyword, lowest, highest)
    date_label = time.strftime('%m-%d')  # month-day, also the file name prefix

    print('Search item \'%s\' from %s...' % (keyword, STORE))
    print('Search %d records on %s' % (len(found), date_label))
    for entry in found:
        print(entry)

    save_search_result({
        'date': date_label,
        'store': STORE,
        'items': found
    })


if __name__ == '__main__':
    main()

### 比價圖表程式
搜集了一定數量的資料後, 就可以用圖表來展示結果了.

In [None]:
import json
import os
from matplotlib import pyplot as plt


def get_avg_price(json_data):
    """Return the mean of the ``price`` fields of the given items.

    Args:
        json_data: Sequence of item dicts, each with a ``price`` value that
            ``int()`` accepts.

    Returns:
        The average price as a float, or ``0.0`` for an empty sequence
        (instead of raising ``ZeroDivisionError``).
    """
    if not json_data:
        return 0.0
    # Use the builtin sum() rather than shadowing it with a local variable.
    return sum(int(item['price']) for item in json_data) / len(json_data)


def main():
    """Plot the daily average price of each store from the saved JSON files.

    Reads every ``*.json`` file in the ``json`` directory, averages the item
    prices per store and date, prints the averages and shows a line chart
    with one line per store.

    The x axis covers every date seen for either store, and each store is
    only plotted on the dates it has data for, so a day missing from one
    store no longer raises ``KeyError``. Files with no items are skipped.
    """
    json_dir = 'json'
    json_files = [f for f in os.listdir(json_dir)
                  if os.path.isfile(os.path.join(json_dir, f)) and f.endswith('.json')]

    # store name -> {date: average price}
    avg_prices = {'momo': dict(), 'pchome': dict()}

    for json_file in json_files:
        with open(os.path.join(json_dir, json_file), 'r', encoding='UTF-8') as file:
            data = json.load(file)
        store_prices = avg_prices.get(data['store'])
        # Ignore unknown stores and empty searches; an empty item list has
        # no meaningful average to plot.
        if store_prices is None or not data['items']:
            continue
        store_prices[data['date']] = get_avg_price(data['items'])

    # x-axis: one tick per date, labelled with the date string
    dates = sorted(set(avg_prices['momo']) | set(avg_prices['pchome']))
    x_of_date = {date: index for index, date in enumerate(dates)}
    plt.xticks(list(range(len(dates))), dates)

    for store in ('momo', 'pchome'):
        prices = avg_prices[store]
        store_dates = [date for date in dates if date in prices]

        print(store)
        for date in store_dates:
            print(date, int(prices[date]))

        xs = [x_of_date[date] for date in store_dates]
        ys = [prices[date] for date in store_dates]
        plt.plot(xs, ys, marker='o', linestyle='solid', label=store)
        # Write the value next to each point.
        for a, b in zip(xs, ys):
            plt.text(a, b, str(int(b)))

    plt.legend()
    plt.show()


if __name__ == '__main__':
    main()