# PTT Crawler

In [12]:
import json
import requests
from bs4 import BeautifulSoup

HOST = 'https://www.ptt.cc'

def list_crawler(url):
    '''
    This is the crawler that parses the list page of ptt
    https://www.ptt.cc/bbs/Gossiping/index.html
    '''
    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'lxml')
    
    article_list = []
    for tag in soup.select('div.title > a'):
        url = HOST + tag['href']
        article_dict = article_crawler(url)
        article_list.append(article_dict)
    return article_list

def article_crawler(url):
    '''
    This is the crawler that parsese the article page
    https://www.ptt.cc/bbs/SkiSnowboard/M.1482312528.A.3BB.html
    '''
    print("[DEBUG] Cralwing artilce url {}".format(url))
    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'lxml')
    
    article_dict = {}
    meta_values = soup.select('span.article-meta-value')
    article_dict['author'] = meta_values[0].text
    article_dict['board'] = meta_values[1].text
    article_dict['title'] = meta_values[2].text
    article_dict['tm'] = meta_values[3].text
    
    article_dict['ip'] = soup.select_one('span.f2').text.split(': ')[-1].strip()
    
    pushs = []
    try:
        for push in soup.select('div.push'):
            push_dict = {}
            push_spans = push.select('span')
            push_dict['push_type'] = push_spans[0].text.strip()
            push_dict['push_author'] = push_spans[1].text.strip()
            push_dict['push_content'] = push_spans[2].text.split(': ')[1].strip()
            push_dict['push_tm'] = push_spans[3].text.strip()
            pushs.append(push_dict)
    except Exception as e:
        print("[Error] Error while parsing push")
    article_dict['pushs'] = pushs
    
    
    for tag in soup.select('span.article-meta-value'):
        tag.extract()
    for tag in soup.select('span.article-meta-tag'):
        tag.extract()
    for tag in soup.select('span.f2'):
        tag.extract()
    for tag in soup.select('div.push'):
        tag.extract()
        
    article_dict['content'] = soup.select_one("#main-content").text.strip()
    return article_dict

def ptt_crawler(no_pages):
    '''
    This is the main function of the crawler
    '''
    res = requests.get(HOST + "/bbs/SkiSnowboard/index.html")
    soup = BeautifulSoup(res.text, 'lxml')
    total_page = int(soup.select('a.wide')[1]['href'].split('index')[1].split('.html')[0]) + 1
    
    
    for page in range(total_page, total_page - no_pages, -1):
        url = HOST + "/bbs/SkiSnowboard/index{}.html".format(page)
        print(url)
        article_list = list_crawler(url)
    
    with open('./ptt_skisnowboard.json', 'a') as f:
        f.write(json.dumps(article_list, ensure_ascii=False))

In [87]:
ptt_crawler(2)

https://www.ptt.cc/bbs/SkiSnowboard/index125.html
[DEBUG] Cralwing artilce url https://www.ptt.cc/bbs/SkiSnowboard/M.1482631120.A.7CE.html
[DEBUG] Cralwing artilce url https://www.ptt.cc/bbs/SkiSnowboard/M.1482654382.A.805.html
[DEBUG] Cralwing artilce url https://www.ptt.cc/bbs/SkiSnowboard/M.1482672665.A.5F6.html
[DEBUG] Cralwing artilce url https://www.ptt.cc/bbs/SkiSnowboard/M.1482676594.A.95B.html
[DEBUG] Cralwing artilce url https://www.ptt.cc/bbs/SkiSnowboard/M.1482737502.A.03B.html
[DEBUG] Cralwing artilce url https://www.ptt.cc/bbs/SkiSnowboard/M.1482738799.A.28E.html
[DEBUG] Cralwing artilce url https://www.ptt.cc/bbs/SkiSnowboard/M.1482762861.A.636.html
[DEBUG] Cralwing artilce url https://www.ptt.cc/bbs/SkiSnowboard/M.1482789229.A.2FE.html
[DEBUG] Cralwing artilce url https://www.ptt.cc/bbs/SkiSnowboard/M.1465218737.A.FB2.html
[DEBUG] Cralwing artilce url https://www.ptt.cc/bbs/SkiSnowboard/M.1476592107.A.50C.html
https://www.ptt.cc/bbs/SkiSnowboard/index124.html
[DEBUG] Cr

In [18]:
with open('./ptt_skisnowboard.json') as f:
    ptt_data = json.loads(f.read())

ptt_data

[{'author': 'grantgao (grantgao)',
  'board': 'SkiSnowboard',
  'content': 'The North Face含雪裙Gore-Tex化纖(PrimaLoft Silver Insulation)\n保暖滑雪外套，全新美國帶回，因太小售出，僅此一件\n尺寸：女生XS\n顏色：藍 (STELLAR BLUE)\n售價：7000元\n台北面交\nhttps://goo.gl/IKMtSC\nhttps://goo.gl/BNwDqp\nhttps://goo.gl/HGfFZO\nhttps://goo.gl/VPrDIO\n\n--',
  'ip': '140.109.222.53',
  'pushs': [],
  'title': '[出售] North Face滑雪外套',
  'tm': 'Sun Dec 25 09:58:38 2016'},
 {'author': 'echi (台北的天空)',
  'board': 'SkiSnowboard',
  'content': '時間：12/26～31\n地點：北志賀龍王\n出發地：北志賀龍王\n預算額度：看人數\nSki or Snowboard：snowboard\n總人數與預計招募人數：目前2人/預計招募2～4人\n主揪的話與其他注意事項：想share教練費 歡迎同樂\n\n意者站內信\n\n--',
  'ip': '1.1.125.74',
  'pushs': [],
  'title': '[揪團] 12/26～31 北志賀龍王',
  'tm': 'Sun Dec 25 16:26:19 2016'},
 {'author': 'momoya802 (水餃熟了)',
  'board': 'SkiSnowboard',
  'content': '各位大家晚安，因為已經訂好1/8-1/11要去琵琶湖，但是最近有意無意看了官網滑雪都是停\n\n現在要改去野澤，飯店貴得有點威，要改也得快動作了。不然都要去六甲山了，難過。\n\n麻煩最近前往的雪友告知了？非常謝謝\n\n雖然京都機票跟住宿加起來四天三夜才7300…\n\n-----\nSent from JPTT on my Sony D5833.\n\n--',
  'ip'

# Datetime

In [23]:
from datetime import datetime, timedelta

In [20]:
datetime

<module 'datetime' from '/Users/ian/.pyenv/versions/3.5.2/lib/python3.5/datetime.py'>

In [25]:
timedelta(days=1)

datetime.timedelta(1)

In [32]:
datetime.now()

datetime.datetime(2016, 12, 27, 9, 47, 57, 845401)

In [33]:
datetime.now().year

2016

In [34]:
datetime.now().month

12

In [35]:
datetime.now().day

27

In [27]:
datetime.now() - timedelta(days=1)

datetime.datetime(2016, 12, 26, 9, 43, 2, 840932)

In [21]:
push_tm = '12/26 21:45'

In [36]:
mydt = datetime.strptime('2016/' + push_tm, "%Y/%m/%d %H:%M")
mydt

datetime.datetime(2016, 12, 26, 21, 45)

In [37]:
mydt.strftime("%Y-%m-%d %H:%M")

'2016-12-26 21:45'

In [65]:
mydt.timestamp()

1482759900.0

In [64]:
article_tm = 'Sun Dec 25 22:36:32 2016'
datetime.strptime(article_tm, "%a %b %d %H:%M:%S %Y").timestamp()

1482676592.0

In [59]:
# Another way to get current UNIX timestamp

In [39]:
import time

In [58]:
time.time()

1482803420.155398

# Regular Expression

In [66]:
import re

In [90]:
mystr = 'I have a pen, I have an apple'

In [93]:
match = re.match('pen', mystr) # Match from beginning

In [94]:
match.group(0)

AttributeError: 'NoneType' object has no attribute 'group'

In [79]:
match = re.search('p.n', mystr) # Search in the string
match.group(0)

'pen'

In [86]:
match = re.search('p\wn', mystr)
match.group(0)

'p1n'

In [84]:
match = re.search('\w\w\w', mystr)
match.group(0)

'hav'

In [100]:
match = re.search('\w{3}', mystr)
match.group(0)

'hav'

In [102]:
match = re.search('[a-zA-Z]en', mystr)
match.group(0)

'pen'

In [None]:
match = re.search('')

In [99]:
err_str = "[Error] Error while parsing push"
match = re.search("^\[Err.*$", err_str)
match.group(0)

'[Error] Error while parsing push'

In [103]:
err_str = "[Error] Error while parsing push"
match = re.sub("^\[Error\]", '',err_str)
match

' Error while parsing push'

# Selenium

In [113]:
!pip install selenium

[33mYou are using pip version 8.1.1, however version 9.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [None]:
# Install Chromedriver
# https://sites.google.com/a/chromium.org/chromedriver/

In [114]:
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

In [134]:
driver = webdriver.Remote(command_executor='http://localhost:9515', desired_capabilities=DesiredCapabilities.CHROME)

In [135]:
driver.get("https://www.skyscanner.com.tw/transport/flights/tpet/cts/170105/170115/airfares-from-taipei-to-sapporo-chitose-in-january-2017.html?adults=1&children=0&adultsv2=1&childrenv2=&infants=0&cabinclass=economy&rtn=1&preferdirects=false&outboundaltsenabled=false&inboundaltsenabled=false&ref=home#results")

In [129]:
driver.page_source

'<!DOCTYPE html><html xmlns="http://www.w3.org/1999/xhtml" class="skyscanner  legacy-button  bpk-no-touch-support" lang="zh-TW"><head><script src="https://pagead2.googlesyndication.com/pagead/osd.js"></script><script src="https://securepubads.g.doubleclick.net/gampad/ads?gdfp_req=1&amp;correlator=3715581441601257&amp;output=json_html&amp;callback=googletag.impl.pubads.callbackProxy1&amp;impl=fifs&amp;json_a=1&amp;eid=108809080&amp;sc=1&amp;sfv=1-0-5&amp;iu_parts=24268069%2Cskyscanner.com.tw%2Cflights_funnel%2Cday_view%2Cbooking_panel%2Cinline%2Cleaderboard&amp;enc_prev_ius=%2F0%2F1%2F2%2F3%2F4%2C%2F0%2F1%2F2%2F3%2F5%2C%2F0%2F1%2F2%2F3%2F6&amp;prev_iu_szs=320x50%2C1024x66%2C728x90%7C468x60&amp;fluid=height%2C0%2C0&amp;prev_scp=divId%3DsponsoredPartner-adslot%7C%7C&amp;cust_params=domain%3Dhttps%253A%252F%252Fwww.skyscanner.com.tw%26nativeadjs%3D%252Fsttc%252Fstrevda%252Fjs%252Fnativead.9e02300dd1cf6d9cb7a7.js%26nativeadcss%3D%252Fsttc%252Fstrevda%252Fcss%252Fnativead.bbef3ab22ed30bd91ce

In [130]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(driver.page_source, 'lxml')
soup.select('li.day-list-item')

[<li class="day-list-item clearfix "><article class="card result clearfix no-details " data-cid="model_6829" data-deeplink="details" ontouchstart=""><div class="card-body clearfix operated-by"><div class="clearfix carrier"><div class="airline"><img alt="捷星航空" onerror="__imgErrRemove__(this)" src="//logos.skyscnr.com/images/airlines/favicon/JQ.png"/><span>捷星航空</span></div></div><section class="card-main leg clearfix dept" data-id="0"><div class="big-airline"><img alt="捷星航空" class="big" data-name="捷星航空" onerror="__imgErrReplace__(this)" src="//logos.skyscnr.com/images/airlines/small/JQ.png"/></div><div class="leg-details "><div class="depart"><span class="station-tooltip" data-id="17075"><span class="times">02:40</span><span class="stop-station" data-id="17075">TPE</span></span></div><div class="stops"><span class="duration">6 小時 15 分鐘</span><ul class="stop-line"><li class="stop-dot"></li><li class="stop-line"></li></ul><div class="leg-stops"><span class="leg-stops-red leg-stops-label">轉

In [136]:
next_button = driver.find_element_by_class_name('next')

In [137]:
# Skyscanner has crawler blocking....
next_button.click()

In [143]:
# Another selenium example
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

page_no = 1

driver = webdriver.Remote(
   command_executor='http://127.0.0.1:9515',
   desired_capabilities=DesiredCapabilities.CHROME)

driver.get('http://airbnb.com/');
#time.sleep(5) # Let the user actually see something!


wait = WebDriverWait(driver, 10)

# wait for the page to load
wait.until(
    EC.presence_of_element_located((By.NAME, "location"))
)

location_ele = driver.find_element_by_xpath('//*[@id="search-location"]')
location_ele.click()
location_ele.send_keys("Niseko")
location_ele.send_keys(Keys.RETURN)

check_in_ele = driver.find_element_by_xpath('//*[@id="startDate"]')
check_in_ele.send_keys('01/05/2017')

checkout_ele = driver.find_element_by_xpath('//*[@id="endDate"]')
checkout_ele.send_keys('01/15/2017')

guests_ele = driver.find_element_by_xpath('//*[@id="site-content"]/div/div/div[2]/div[1]/div[1]/div/form/div/div/div[3]/div/button')
guests_ele.click()
add_ele = driver.find_element_by_xpath('//*[@id="site-content"]/div/div/div[2]/div[1]/div[1]/div/form/div/div/div[3]/div/div/div/div/div[1]/div[2]/div/button[2]')
for x in range(4):
    add_ele.click()
submit_ele = driver.find_element_by_xpath('//*[@id="site-content"]/div/div/div[2]/div[1]/div[1]/div/form/div/div/div[4]/button')
submit_ele.click()


wait.until(
    EC.presence_of_element_located((By.XPATH, '//*[@id="katamari-container"]/div/div/div/div[1]/div/div[2]/div[3]/div[2]/div'))
)

with open('./niseko_page{}.html'.format(page_no), 'w') as f:
    f.write(driver.page_source)
    
for x in range(3):
    if not EC.presence_of_element_located((By.CLASS_NAME, 'icon-caret-right')):
        break
    navi_ele = driver.find_element_by_class_name('icon-caret-right')
    #navi_ele = driver.find_element_by_class_name('next')
    driver.execute_script("return arguments[0].scrollIntoView();", navi_ele)
    navi_ele.click()
    wait.until(
        EC.presence_of_element_located((By.XPATH, '//*[@id="katamari-container"]'))
    )
    page_no += 1
    with open('./niseko_page{}.html'.format(page_no), 'w') as f:
        f.write(driver.page_source)
    driver.implicitly_wait(5) # seconds
    if EC.presence_of_element_located((By.CLASS_NAME, 'modal-close')):
        modal_ele = driver.find_element_by_class_name('modal-close')
        modal_ele.click()
    driver.implicitly_wait(1) # seconds



WebDriverException: Message: unknown error: Element <i class="icon icon-caret-right" data-reactid=".24ijqylrkzk.0.2.0.3.2.2.0.1.0.6.0.1"></i> is not clickable at point (199, 769). Other element would receive the click: <div class="modal-cell" data-reactid=".4.0.$=1$modal.0.0">...</div>
  (Session info: chrome=55.0.2883.95)
  (Driver info: chromedriver=2.27.440174 (e97a722caafc2d3a8b807ee115bfb307f7d2cfd9),platform=Mac OS X 10.11.6 x86_64)


In [144]:
driver.close()