# PTT八卦板今日熱門文章

PTT web版的html結構算是比較有規則的, 所以也是拿來練爬蟲的好對象, 下面這隻爬蟲的目的是要去找出今日的熱門文章(50推以上), 同時也會去找出今天有哪些5566發文了:

In [5]:
import requests
import time
import json
from bs4 import BeautifulSoup


PTT_URL = 'https://www.ptt.cc'


def get_web_page(url):
    resp = requests.get(url=url, cookies={'over18': '1'})
    if resp.status_code != 200:
        print('Invalid url:', resp.url)
        return None  #有異常的話回傳None
    else:
        return resp.text #成功的話回傳網頁資訊


def get_articles(dom, date):
    soup = BeautifulSoup(dom, 'html5lib')  #"dom"是main()爬蟲主程式中的輸入值
    # 取得該頁面上一頁連結（因為爬完原頁資料後就要往前爬舊的資料，所以過程中要不斷往上一頁）
    paging_div = soup.find('div', 'btn-group btn-group-paging')  #換頁區塊標籤位置
    prev_url = paging_div.find_all('a')[1]['href']   #<a>的第2個'href'為'上一頁'的連結

    articles = []  # 用來儲存文章資料
    divs = soup.find_all('div', 'r-ent')  #文章區塊標籤
    for d in divs:
        # If post date matched:
        if d.find('div', 'date').text.strip() == date:  #"date"是main()爬蟲主程式中的輸入值
            # 取得推文數
            push_count = 0
            push_str = d.find('div', 'nrec').text
            if push_str:
                try:
                    push_count = int(push_str) #轉換字串為數字
                except ValueError:
                    # 若失敗，可能是有 '爆', 'X1', 'X2',....
                    # 若不是，不做任何事，push_count保持為0
                    if push_str == '爆':
                        push_count = 99
                    elif push_str.startswith('X'):  #startswith() :檢查字符串是否是以指定子字符串開頭，如果是則返回True否則False
                        push_count = -10

            # 取得文章連結和標題
            if d.find('a'):
                href = d.find('a')['href']
                title = d.find('a').text
                author = d.find('div', 'author').text if d.find('div', 'author') else ''  #後面看不懂
                articles.append({
                    'title': title,
                    'href': href,
                    'push_count': push_count,
                    'author': author
                })

    return articles, prev_url   #回傳文章list，和這頁的"上一頁"超連結
                                # 顯示為: [] /bbs/Gossiping/index39175.html


def get_author_ids(posts, pattern):  #只取作者id資訊
    ids = set()  #set() 函數創建一個無序不重複元素集，可進行關係測試，刪除重複數據，還可以計算交集、差集、並集等
    for post in posts:  #main()中的posts=articles，也就是前面的文章資料
        if pattern in post['author']:  #假如pattern再articles資料的['author']中
            ids.add(post['author'])   #就加入ids這個集合中
    return ids


def main():  #爬蟲主程式（只要抓今日的文章）
    current_page = get_web_page(PTT_URL + '/bbs/Gossiping/index.html') #先取得最新一頁的網頁
    if current_page:  #如果取得成功的話
        articles = [] #全部的今日文章
        today = time.strftime("%m/%d").lstrip('0')  #今天的日期，在這裡我們刪除開頭的0以匹配PTT日期的格式
                                    # strftime() : http://tw.gitbook.net/python/time_strftime.html
                                    # lstrip() : http://tw.gitbook.net/python/string_lstrip.html
        current_articles, prev_url = get_articles(current_page, today) #目前頁面的今日文章
        #current_articles:這頁的所有文章，prev_url這頁的"上一頁"超連結
        while current_articles: #若目前頁面有今日文章則加入articles，並回到上一頁尋找是否有今日文章
            articles += current_articles  #更新值 (就地加) ，每執行一次，articles = []就新增current_articles至articles
            current_page = get_web_page(PTT_URL + prev_url) #到"上一頁"取得網址
            current_articles, prev_url = get_articles(current_page, today) #再用"上一頁"的網址資訊取得今天的文章
            #while current_articles:就繼續迴圈往上一頁找今日文章，直到上一頁沒有今日文章，也就是current_articles空白的時候回圈就停止
        print("Today's 5566:")
        print(get_author_ids(articles, '5566'))

        print('\nThere are ', len(articles), ' posts today.')
        threshold = 50   #threshold 門檻訂為50
        print('Hot post(≥ %d push): ' % threshold)  #正則表達式-顯示為: "Hot post(≥ 50 push):" 
        for article in articles:
            if int(article['push_count']) > threshold:  #列出推數大於門檻的文章
                print(article)
                
        # with as: https://openhome.cc/Gossip/Python/WithAs.html
        # json.dump: http://python3-cookbook.readthedocs.io/zh_CN/latest/c06/p02_read-write_json_data.html
        with open('gossiping.json', 'w', encoding='UTF-8') as file:
            json.dump(articles, file, indent=2, sort_keys=True, ensure_ascii=False)

if __name__ == '__main__':
    main()

Today's 5566:
{'RS5566', 'Bonker5566', 'Rin5566', 'XDDDpupu5566', 'fun5566', 'lianpig5566', 'kameaki5566', 'stock5566', 'nikubou5566', 'Nigger5566', 'scum5566', 'zrct5566', 'Andy5566', 'gangster5566', 'purine5566', 'TomFord5566', 'junkjizz5566', 'laba5566', 'youtu5566', 'dickhole5566', 'fantacy5566', 'ce3255666', 'zyc5566', 'WOWO5566', 'vc5566', 'ARZT5566', 'aass5566', 'PA5566', 'AKB5566'}

There are  1002  posts today.
Hot post(≥ 50 push): 
{'title': '[爆卦] 台灣民眾黨-要求陸委會「講清楚說明白」!', 'href': '/bbs/Gossiping/M.1581478722.A.8B2.html', 'push_count': 99, 'author': 'PunkGrass'}
{'title': 'Re: [問卦] 有小明之亂的懶人包嗎？', 'href': '/bbs/Gossiping/M.1581476703.A.395.html', 'push_count': 99, 'author': 'terry1020'}
{'title': '[新聞] WHO命名COVID-19 我國維持簡稱"武漢肺炎"', 'href': '/bbs/Gossiping/M.1581476283.A.98D.html', 'push_count': 99, 'author': 'thouloveme'}
{'title': '[新聞] 入場泳客從寶瓶星號下船才1天 花蓮業者', 'href': '/bbs/Gossiping/M.1581475689.A.36C.html', 'push_count': 99, 'author': 'stevenchiang'}
{'title': '[新聞] 網路瘋傳仁寶工程師疑似感染武

In [9]:
a=2
b="b"
a,b = 3, 9,8
a,b

ValueError: too many values to unpack (expected 2)

# PTT表特版下載器

In [1]:
import requests
import time
from bs4 import BeautifulSoup
import os
import re
import urllib.request
import json


PTT_URL = 'https://www.ptt.cc'


def get_web_content(url):
    resp = requests.get(url=url, cookies={'over18': '1'})
    if resp.status_code != 200:
        print('Invalid url: ' + resp.url)
        return None
    else:
        return resp.text


def get_articles(dom, date):
    soup = BeautifulSoup(dom, 'html5lib')

    paging_dev = soup.find('div', 'btn-group btn-group-paging')
    prev_url = paging_dev.find_all('a')[1]['href']

    articles = []
    divs = soup.find_all('div', 'r-ent')
    for div in divs:
        if div.find('div', 'date').text.strip() == date:
            push_count = 0
            push_str = div.find('div', 'nrec').text
            if push_str:
                try:
                    push_count = int(push_str)
                except ValueError:
                    if push_str == '爆':
                        push_count = 99
                    elif push_str.startswith('X'):
                        push_count = -10

            if div.find('a'):
                href = div.find('a')['href']
                title = div.find('a').text
                author = div.find('div', 'author').text if div.find('div', 'author') else ''
                articles.append({
                    'title': title,
                    'href': href,
                    'push_count': push_count,
                    'author': author
                })
    return articles, prev_url


def parse(dom):
    soup = BeautifulSoup(dom, 'html.parser')
    links = soup.find(id='main-content').find_all('a')
    img_urls = []
    for link in links:
        if re.match(r'^https?://(i.)?(m.)?imgur.com', link['href']):
            img_urls.append(link['href'])
    return img_urls


def save(img_urls, title):
    if img_urls:
        try:
            folder_name = title.strip()
            os.makedirs(folder_name)
            for img_url in img_urls:
                # e.g. 'http://imgur.com/9487qqq.jpg'.split('//') -> ['http:', 'imgur.com/9487qqq.jpg']
                if img_url.split('//')[1].startswith('m.'):
                    img_url = img_url.replace('//m.', '//i.')
                if not img_url.split('//')[1].startswith('i.'):
                    img_url = img_url.split('//')[0] + '//i.' + img_url.split('//')[1]
                if not img_url.endswith('.jpg'):
                    img_url += '.jpg'
                file_name = img_url.split('/')[-1]
                urllib.request.urlretrieve(img_url, os.path.join(folder_name, file_name))
        except Exception as e:
            print(e)


def main():
    current_page = get_web_content(PTT_URL + '/bbs/Beauty/index.html')
    if current_page:
        articles = []
        date = time.strftime("%m/%d").lstrip('0')
        current_articles, prev_url = get_articles(current_page, date)
        while current_articles:
            articles += current_articles
            current_page = get_web_content(PTT_URL + prev_url)
            current_articles, prev_url = get_articles(current_page, date)

        for article in articles:
            print('Collecting beauty from:', article)
            page = get_web_content(PTT_URL + article['href'])
            if page:
                img_urls = parse(page)
                save(img_urls, article['title'])
                article['num_image'] = len(img_urls)

        with open('data.json', 'w', encoding='utf-8') as file:
            json.dump(articles, file, indent=2, sort_keys=True, ensure_ascii=False)


if __name__ == '__main__':
    main()

Collecting beauty from: {'title': '[帥哥] Vince Carter', 'href': '/bbs/Beauty/M.1583988574.A.9A9.html', 'push_count': 63, 'author': 'Boss741108'}
Collecting beauty from: {'title': '[帥哥] 福山雅治', 'href': '/bbs/Beauty/M.1583988635.A.F0B.html', 'push_count': 3, 'author': 'Gary5566'}
Collecting beauty from: {'title': '[正妹] 大尺碼女孩(20)', 'href': '/bbs/Beauty/M.1583989363.A.253.html', 'push_count': 4, 'author': 'ckpot'}
Collecting beauty from: {'title': '[新聞] 「屁孩殺手」女警背影網熱議跪求長相！', 'href': '/bbs/Beauty/M.1583990101.A.6C5.html', 'push_count': 0, 'author': 'KKKKJAY'}
Collecting beauty from: {'title': '[正妹] 兇 年輕的岩間香須美', 'href': '/bbs/Beauty/M.1583993459.A.1B9.html', 'push_count': 43, 'author': 'AmedRosario'}
Collecting beauty from: {'title': '[神人] 最右邊是誰阿', 'href': '/bbs/Beauty/M.1584006600.A.760.html', 'push_count': 16, 'author': 'QYIN712'}
Collecting beauty from: {'title': '[正妹] 雲科大學生', 'href': '/bbs/Beauty/M.1584008037.A.7BC.html', 'push_count': 97, 'author': 'a3268403'}
Collecting beauty from: {'tit

# 爬取PTT當日的討論版發文

In [None]:
#Ch9_2(ptt_bbs_crawler.py)-爬取PTT當日的討論版發文
import time
import requests
import json
from bs4 import BeautifulSoup

# 目標URL網址
URL = "https://www.ptt.cc"
TOPIC = "BASEBALL" #看板名稱選擇爬取Food板

def get_resource(url):
    headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
               "AppleWebKit/537.36 (KHTML, like Gecko)"
               "Chrome/63.0.3239.132 Safari/537.36"}
    return requests.get(url, headers=headers, cookies={"over18":"1"})

def parse_html(r):
    if r.status_code == requests.codes.ok:
        r.encoding = "utf8"
        soup = BeautifulSoup(r.text, "lxml")        
    else:
        print("HTTP請求錯誤..." + url)
        soup = None
    
    return soup    

def get_articles(soup, date): #第一個參數取得今天所有發文，第二個參數是今天日期
    articles = []  
    # 取得上一頁的超連結
    paging_div = soup.find("div", class_="btn-group btn-group-paging")
    paging_a = paging_div.find_all("a", class_="btn")
    prev_url = paging_a[1]["href"]

    tag_divs = soup.find_all("div", class_="r-ent")
    for tag in tag_divs:
        # 判斷文章的日期
        if tag.find("div",class_="date").text.strip() == date:
            push_count = 0    # 取得推文數
            push_str = tag.find("div", class_="nrec").text
            if push_str:
                try:
                    push_count = int(push_str)  # 轉換成數字
                except ValueError:  # 轉換失敗，可能是'爆'或 'X1','X2'
                    if push_str == '爆':  #爆轉換成99
                        push_count = 99
                    elif push_str.startswith('X'):
                        push_count = -10  #X開頭轉換成-10
            # 取得發文的超連結和標題文字
            if tag.find("a"):  # 有超連結，表示文章存在
                href = tag.find("a")["href"]
                title = tag.find("a").text
                author = tag.find("div", class_="author").string 
                articles.append({
                    "title": title,
                    "href": href,
                    "push_count": push_count,
                    "author": author
                })
    
    return articles, prev_url

def save_to_json(items, file):
    with open(file, "w", encoding="utf-8") as fp: # 寫入JSON檔案
        json.dump(items,fp,indent=2,sort_keys=True,ensure_ascii=False)

def web_scraping_bot(url):
    articles = []
    print("抓取網路資料中...")
    soup = parse_html(get_resource(url))
    if soup:
        # 取得今天日期, 去掉開頭'0'符合PTT的日期格式
        today = time.strftime("%m/%d").lstrip('0') 
        # 取得目前頁面的今日文章清單
        current_articles, prev_url = get_articles(soup, today) 
        while current_articles: 
            articles += current_articles
            print("等待2秒鐘...")
            time.sleep(2) 
             # 剖析上一頁繼續尋找是否有今日的文章
            soup = parse_html(get_resource(URL + prev_url))
            current_articles, prev_url = get_articles(soup, today)

    return articles

#建立指定看板的URL後，呼叫web_scraping_bot函數爬取發文資訊
if __name__ == '__main__':
    url = URL + "/bbs/" + TOPIC + "/index.html"
    print(url)
    articles = web_scraping_bot(url)
    for item in articles:
        print(item)
    save_to_json(articles, "articles.json")


# 八卦版鄉民從哪來?
這隻爬蟲會去爬當前八卦版前50篇文章, 然後看這些發文的鄉民是來自哪個國家:

In [1]:
import requests
import time
import json
import re
from bs4 import BeautifulSoup


PTT_URL = 'https://www.ptt.cc'
FREEGEOIP_API = 'http://freegeoip.net/json/'


def get_web_page(url):
    resp = requests.get(url=url, cookies={'over18': '1'})
    if resp.status_code != 200:
        print('Invalid url: ', resp.url)
        return None
    else:
        return resp.text


def get_articles(dom, date):
    soup = BeautifulSoup(dom, 'html5lib')
    # Retrieve the link of previous page
    paging_div = soup.find('div', 'btn-group btn-group-paging')
    prev_url = paging_div.find_all('a')[1]['href']

    articles = []
    divs = soup.find_all('div', 'r-ent')
    for d in divs:
        # If post date matched:
        if d.find('div', 'date').text.strip() == date:
            # To retrieve the push count:
            push_count = 0
            push_str = d.find('div', 'nrec').text
            if push_str:
                try:
                    push_count = int(push_str)
                except ValueError:
                    # If transform failed, it might be '爆', 'X1', 'X2', etc.
                    if push_str == '爆':
                        push_count = 99
                    elif push_str.startswith('X'):
                        push_count = -10

            # To retrieve title and href of the article:
            if d.find('a'):
                href = d.find('a')['href']
                title = d.find('a').text
                author = d.find('div', 'author').text if d.find('div', 'author') else ''
                articles.append({
                    'title': title,
                    'href': href,
                    'push_count': push_count,
                    'author': author
                })

    return articles, prev_url


def get_ip(dom):
    # e.g., ※ 發信站: 批踢踢實業坊(ptt.cc), 來自: 27.52.6.175
    pattern = '來自: \d+\.\d+\.\d+\.\d+'
    match = re.search(pattern, dom)
    if match:
        return match.group(0).replace('來自: ', '')
    else:
        return None


def get_country(ip):
    if ip:
        data = json.loads(requests.get(FREEGEOIP_API + ip).text)
        country_name = data['country_name'] if data['country_name'] else None
        return country_name
    return None


def main():
    print('取得今日文章列表:')
    current_page = get_web_page(PTT_URL + '/bbs/Gossiping/index.html')
    if current_page:
        articles = []
        today = time.strftime('%m/%d').lstrip('0')
        current_articles, prev_url = get_articles(current_page, today)
        while current_articles:
            articles += current_articles
            current_page = get_web_page(PTT_URL + prev_url)
            current_articles, prev_url = get_articles(current_page, today)
        print('共 %d 篇文章' % (len(articles)))

        print('取得前50篇文章的IP:')
        country_to_count = dict()
        for article in articles[:50]:
            print('查詢 IP:', article['title'])
            page = get_web_page(PTT_URL + article['href'])
            if page:
                ip = get_ip(page)
                country = get_country(ip)
                if country in country_to_count.keys():
                    country_to_count[country] += 1
                else:
                    country_to_count[country] = 1

        print('各國IP分佈: ')
        for k, v in country_to_count.items():
            print(k, v)


if __name__ == "__main__":
    main()

取得今日文章列表:
共 1402 篇文章
取得前50篇文章的IP:
查詢 IP: Re: [新聞] 飲恨身亡！李承翰父曾怨「速食店精神鑑定


JSONDecodeError: Expecting value: line 1 column 1 (char 0)