In [None]:
之前的章節有提到過開發爬蟲的手段, 如果有人已經提供你API了, 那當然就不用去爬網頁了, 接下來的幾隻爬蟲就是基於API來開發的.

### 八卦版鄉民從哪來?
這隻爬蟲會去爬當前八卦版前50篇文章, 然後看這些發文的鄉民是來自哪個國家:

In [1]:
import requests
import time
import json
import re
from bs4 import BeautifulSoup


PTT_URL = 'https://www.ptt.cc'
FREEGEOIP_API = 'http://freegeoip.net/json/'


def get_web_page(url):
    resp = requests.get(url=url, cookies={'over18': '1'})
    if resp.status_code != 200:
        print('Invalid url: ', resp.url)
        return None
    else:
        return resp.text


def get_articles(dom, date):
    soup = BeautifulSoup(dom, 'html5lib')
    # Retrieve the link of previous page
    paging_div = soup.find('div', 'btn-group btn-group-paging')
    prev_url = paging_div.find_all('a')[1]['href']

    articles = []
    divs = soup.find_all('div', 'r-ent')
    for d in divs:
        # If post date matched:
        if d.find('div', 'date').text.strip() == date:
            # To retrieve the push count:
            push_count = 0
            push_str = d.find('div', 'nrec').text
            if push_str:
                try:
                    push_count = int(push_str)
                except ValueError:
                    # If transform failed, it might be '爆', 'X1', 'X2', etc.
                    if push_str == '爆':
                        push_count = 99
                    elif push_str.startswith('X'):
                        push_count = -10

            # To retrieve title and href of the article:
            if d.find('a'):
                href = d.find('a')['href']
                title = d.find('a').text
                author = d.find('div', 'author').text if d.find('div', 'author') else ''
                articles.append({
                    'title': title,
                    'href': href,
                    'push_count': push_count,
                    'author': author
                })

    return articles, prev_url


def get_ip(dom):
    # e.g., ※ 發信站: 批踢踢實業坊(ptt.cc), 來自: 27.52.6.175
    pattern = '來自: \d+\.\d+\.\d+\.\d+'
    match = re.search(pattern, dom)
    if match:
        return match.group(0).replace('來自: ', '')
    else:
        return None


def get_country(ip):
    if ip:
        data = json.loads(requests.get(FREEGEOIP_API + ip).text)
        country_name = data['country_name'] if data['country_name'] else None
        return country_name
    return None


def main():
    print('取得今日文章列表:')
    current_page = get_web_page(PTT_URL + '/bbs/Gossiping/index.html')
    if current_page:
        articles = []
        today = time.strftime('%m/%d').lstrip('0')
        current_articles, prev_url = get_articles(current_page, today)
        while current_articles:
            articles += current_articles
            current_page = get_web_page(PTT_URL + prev_url)
            current_articles, prev_url = get_articles(current_page, today)
        print('共 %d 篇文章' % (len(articles)))

        print('取得前50篇文章的IP:')
        country_to_count = dict()
        for article in articles[:50]:
            print('查詢 IP:', article['title'])
            page = get_web_page(PTT_URL + article['href'])
            if page:
                ip = get_ip(page)
                country = get_country(ip)
                if country in country_to_count.keys():
                    country_to_count[country] += 1
                else:
                    country_to_count[country] = 1

        print('各國IP分佈: ')
        for k, v in country_to_count.items():
            print(k, v)


if __name__ == "__main__":
    main()

取得今日文章列表:
共 1077 篇文章
取得前50篇文章的IP:
查詢 IP: Re: [問卦] 我們的言論自由當初是誰幫忙爭取的？


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

### Facebook Graph API
* 要使用FB Graph API, 要先取得自己的token, 可以到這個網站去申請: https://developers.facebook.com/tools/explorer
* 點選"取得token" -> "取得用戶存取token" -> 勾選你想讓這個token可以取得的資訊 -> 得到token
* 把這個token記著, 等等程式裡面要用(ACCESS_TOKEN).

In [None]:
import requests


# To obtain the access token, go to https://developers.facebook.com/tools/explorer.
ACCESS_TOKEN = ''


def get_my_friends():
    url = 'https://graph.facebook.com/v2.9/me?fields=id,name,friends&access_token={}'.format(ACCESS_TOKEN)
    data = requests.get(url).json()
    print('My ID: ' + data['id'])
    print('My name: ' + data['name'])
    print('Total friends: ', data['friends']['summary']['total_count'], 'friends.')


def get_page_post(page_id):
    url = 'https://graph.facebook.com/v2.9/{0}/posts?access_token={1}'.format(page_id, ACCESS_TOKEN)
    data = requests.get(url).json()
    print('There are ', len(data['data']), ' posts on the fans page.')
    print('The latest post time is: ', data['data'][0]['created_time'])
    print('Content:', data['data'][0]['message'])


def main():
    get_my_friends()
    get_page_post(1707015819625206)


if __name__ == '__main__':
    main()

### imdb電影資訊查詢
imdb是很熱門的電影資訊網站, 不過其本身是沒有對外開放API的, 所以這邊會透過一個叫做OMDb API的第三方服務去取得imdb的電影資訊, 要使用OMDb API的服務, 必須要有API key, 這部分請自行付費取得(API_KEY).

In [None]:
import requests
import json
import math
from collections import Counter

# Please pay for the key yourself.
API_KEY = ''
OMDB_URL = 'http://www.omdbapi.com/?apikey=' + API_KEY


def get_movie_date(url):
    data = json.loads(requests.get(url).text)
    if data['Response'] == 'True':
        return data
    else:
        return None


def search_ids_by_keyword(keywords):
    movie_ids = list()
    # e.g., "Iron Man" -> Iron+Man
    query = '+'.join(keywords.split())
    url = OMDB_URL + '&s=' + query
    data = get_movie_date(url)

    if data:
        for item in data['Search']:
            movie_ids.append(item['imdbID'])
        total = int(data['totalResults'])
        num_pages = math.floor(total/10) + 1

        for i in range(2, num_pages+1):
            url = OMDB_URL + '&s=' + query + '&page=' + str(i)
            data = get_movie_date(url)
            if data:
                for item in data['Search']:
                    movie_ids.append(item['imdbID'])
    return movie_ids


def search_by_id(movie_id):
    url = OMDB_URL + '&i=' + movie_id
    data = get_movie_date(url)
    return data if data else None


def main():
    keyword = 'iron man'
    m_ids = search_ids_by_keyword(keyword)
    print('There are %s movies contain the keyword %s.' % (len(m_ids), keyword))
    print('Retrieving movie data...')
    movies = list()
    for m_id in m_ids:
        movies.append(search_by_id(m_id))
    print('Top 5 movie results:')
    for movie in movies[:5]:
        print(movie)
    years = [movie['Year'] for movie in movies]
    year_dist = Counter(years)
    print('Publish year distribution: ', year_dist)
    ratings = [float(movie['imdbRating']) for movie in movies if movie['imdbRating'] != 'N/A']
    print('Average rating: %.2f' % (sum(ratings)/len(ratings)))


if __name__ == '__main__':
    main()

### Google Finance API
3.4小節的Google Finance個股資訊是直接爬網頁來的, 這邊要示範怎麼透過Google Finance API達到類似的效果.

In [None]:
import requests
import json
from datetime import datetime, timedelta


GOOGLE_FINANCE_API_URL = 'http://finance.google.com/finance/info?client=ig&q='
GOOGLE_FINANCE_HISTORY_API_URL = 'http://www.google.com/finance/getprices?q='


def get_stock(query):
    # You can query for multiple stocks by splitting with ","
    resp = requests.get(GOOGLE_FINANCE_API_URL + query)
    if resp.status_code != 200:
        print('Invalid url or query param: ' + resp.url)
        return None
    else:
        # Need to remove the redundant chars '//' at the head of response
        return json.loads(resp.text.replace('//', ''))


def get_stock_history(stock_id, stock_mkt):
    resp = requests.get(GOOGLE_FINANCE_HISTORY_API_URL + stock_id + '&x=' + stock_mkt + '&i=86400&p=1M')
    ''' e.g.,
    EXCHANGE%3DTPE
    MARKET_OPEN_MINUTE=540
    MARKET_CLOSE_MINUTE=810
    INTERVAL=86400
    COLUMNS=DATE,CLOSE,HIGH,LOW,OPEN,VOLUME
    DATA=
    TIMEZONE_OFFSET=480
    a1488346200,186,188.5,186,188.5,46176000
    1,186,188.5,185,188,39914000
    2,184,185,184,184.5,28085000
    5,183.5,184.5,183.5,184,12527000
    ...
    '''
    index = -1
    records = resp.text.split('\n')
    for record in records:
        # 'a' means the start point of stock information
        if record.startswith('a'):
            index = records.index(record)
            break
    if index > 0:
        records = records[index:]
        # To transform the unix time to human readable time at the first line of stock info
        unix_time = int(records[0].split(',')[0][1:])
        init_time = datetime.fromtimestamp(unix_time)

        # To handle to first row
        first_row = records[0].split(',')
        first_row[0] = init_time

        history = list()
        history.append(first_row)

        # To handle the rest of stock records
        for record in records[1:]:
            if record:
                data = record.split(',')
                delta = int(data[0])
                data[0] = init_time + timedelta(days=delta)
                history.append(data)
        return history
    else:
        return None


def main():
    query = 'TPE:2330'
    print('Real time stock price for ' + query)
    stocks = get_stock(query)
    print(stocks[0])
    print('\n')
    stock_id = '2330'
    stock_mkt = 'TPE'
    print('Stock price history for ' + stock_mkt + ":" + stock_id)
    print('(Date, Close, High, Low, Open, Volume)')
    history = get_stock_history(stock_id, stock_mkt)
    for hist in history:
        print(hist[0].strftime("%Y/%m/%d"), hist[1:])


if __name__ == '__main__':
    main()

### 台灣證券交易所API
這個API長得大概像這樣:<br>
http://www.twse.com.tw/exchangeReport/STOCK_DAY?response=json&date=20160501&stockNo=2330 <br>
比較重要的地方是date這個參數, 基本上你給的值一定要是yyyyMMdd的形式, 但是真正作用的只有yyyy與MM, 因為他會把這段request解讀成你想要看stockNo股票在yyyy年MM月的紀錄, 所以dd基本上沒有太大意義, 但卻是不可少的部分.

In [None]:
import requests
import time


TWSE_URL = 'http://www.twse.com.tw/exchangeReport/STOCK_DAY?response=json'


def get_web_content(stock_id, current_date):
    resp = requests.get(TWSE_URL + '&date=' + current_date + '&stockNo=' + stock_id)
    if resp.status_code != 200:
        return None
    else:
        return resp.json()


def get_data(stock_id, current_date):
    info = list()
    resp = get_web_content(stock_id, current_date)
    if resp is None:
        return None
    else:
        if resp['data']:
            for data in resp['data']:
                record = {
                    '日期': data[0],
                    '開盤價': data[3],
                    '收盤價': data[6],
                    '成交筆數': data[8]
                }
                info.append(record)
        return info


def main():
    stock_id = '2330'
    current_date = time.strftime('%Y%m%d')
    current_year = time.strftime('%Y')
    current_month = time.strftime('%m')
    print('Processing data for %s %s...' % (current_year, current_month))
    get_data(stock_id, current_date)
    collected_info = get_data(stock_id, current_date)
    for info in collected_info:
        print(info)


if __name__ == '__main__':
    main()