PTT is a popular online forum in Taiwan. An article is considered popular when its number of replies reaches 50 or more. This code finds today's popular articles on a specific board (topic) and stores them in a JSON file.

In [1]:
# Base address of the PTT web site; relative links are appended to it.
URL = 'https://www.ptt.cc'
# Board to scrape -- PTT hosts many different boards (topics).
topic = 'NBA'
# Address of the newest index page of the chosen board.
url = f'{URL}/bbs/{topic}/index.html'

In [2]:
import requests
def get_resource(url, timeout=10):
    """Fetch *url* with a browser-like user agent and the ``over18`` cookie.

    Args:
        url: Absolute address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block forever on a stalled
            connection.

    Returns:
        The ``requests`` response object; the status code is not checked here.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    headers = {"user-agent":"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)"}
    # The over18 cookie presumably skips the site's age-confirmation page.
    return requests.get(
        url,
        headers=headers,
        cookies={'over18':'1'},
        timeout=timeout,
    )

In [3]:
from bs4 import BeautifulSoup

def parse_html(r):
    """Turn a response into a BeautifulSoup tree.

    Args:
        r: Response object as returned by ``get_resource``.

    Returns:
        The parsed document when the status code is OK; otherwise ``None``
        (after printing ``error``).
    """
    # Guard clause: anything other than an OK status is reported and dropped.
    if r.status_code != requests.codes.ok:
        print('error')
        return None

    # Force UTF-8 before reading r.text so the body is decoded as UTF-8.
    r.encoding = 'utf-8'
    return BeautifulSoup(r.text, 'lxml')

In [4]:
import time

def web_scraping_bot(url):
    """Collect today's articles, walking back through older index pages.

    Starting at *url*, each index page is scanned for articles dated today.
    As long as a page yields at least one such article, the previous page is
    fetched (after a two-second pause) and scanned as well.

    Args:
        url: Address of the first (newest) index page to scan.

    Returns:
        A list of article dicts as produced by ``get_articles``; empty when
        the first page cannot be fetched or holds no article from today.
    """
    articles = []
    soup = parse_html(get_resource(url))
    if soup is None:
        return articles

    # Today's date with the leading zero removed, to match the date text
    # that get_articles compares against (e.g. '1/05', '11/30').
    today = time.strftime('%m/%d').lstrip('0')
    current_articles, prev_url = get_articles(soup, today)
    # Keep going back while the page just scanned still had articles from today.
    while current_articles:
        articles.extend(current_articles)
        print('wait two seconds...')
        time.sleep(2)  # pause between requests
        soup = parse_html(get_resource(URL + prev_url))
        if soup is None:
            # The previous page could not be fetched (parse_html already
            # reported it). Stop here and keep what was collected instead of
            # crashing inside get_articles on a missing document.
            break
        current_articles, prev_url = get_articles(soup, today)
    return articles

In [5]:
def get_articles(soup, date):
    """Extract the articles dated *date* from one parsed index page.

    Args:
        soup: Parsed index page (BeautifulSoup tree).
        date: Date text to match against each entry's stripped date cell.

    Returns:
        A tuple ``(articles, prev_url)``. ``articles`` is a list of dicts
        with the keys ``title``, ``href``, ``push_count`` and ``author``;
        ``prev_url`` is the ``href`` of the second "btn wide" link in the
        paging bar, used by the caller as the previous-page link.
    """
    # Previous-page link: second wide button of the paging bar.
    paging_bar = soup.find('div', class_='btn-group btn-group-paging')
    paging_links = paging_bar.find_all('a', class_='btn wide')
    prev_url = paging_links[1]['href']

    found = []
    for entry in soup.find_all('div', class_='r-ent'):
        # Only entries carrying the requested date are of interest.
        if entry.find('div', class_='date').text.strip() != date:
            continue

        # Translate the reply marker into a number.
        marker = entry.find('div', class_='nrec').text
        if not marker:
            push_count = 0
        elif marker == '爆':
            push_count = 99
        elif marker.startswith('X'):
            # NOTE(review): every 'X...' marker maps to -10, whatever follows.
            push_count = -10
        else:
            try:
                push_count = int(marker)
            except ValueError:
                push_count = 0

        # An entry without a link is treated as a removed article and skipped.
        link = entry.find('a')
        if not link:
            continue

        found.append({
            'title': link.text,
            'href': link['href'],
            'push_count': push_count,
            'author': entry.find('div', class_='author').text,
        })
    return found, prev_url

In [13]:
def popular_articles(articles, threshold=50):
    """Return the articles whose reply count reaches *threshold*.

    Args:
        articles: Iterable of article dicts, each with a ``push_count`` key.
        threshold: Minimum ``push_count`` (inclusive) for an article to count
            as popular. Defaults to 50.

    Returns:
        A new list with the qualifying articles, in their original order.
    """
    return [item for item in articles if item['push_count'] >= threshold]

In [15]:
import json

def save_to_json(articles, file):
    """Write *articles* to *file* as pretty-printed UTF-8 JSON.

    Args:
        articles: JSON-serialisable data to store (here: a list of article
            dicts).
        file: Path of the output file; it is created or overwritten.
    """
    with open(file, 'w', encoding='utf-8') as fp:
        # Dump the argument itself rather than a module-level name, so the
        # function writes exactly what the caller passed in.
        # ensure_ascii=False keeps non-ASCII titles readable in the file.
        json.dump(articles, fp, indent=2, sort_keys=True, ensure_ascii=False)

In [16]:
# Script entry point: scrape today's articles, print them, then store the
# popular ones in 'popular_articles.json'.
if __name__ == '__main__':
    articles = web_scraping_bot(url)
    # Show every article found today, popular or not.
    for item in articles:
        print(item)
    # NOTE(review): this rebinds the module-level name `popular_articles` from
    # the function to its result list. Running this cell a second time in the
    # same session would fail because the name is no longer callable. Check
    # for other readers of this global before renaming it.
    popular_articles = popular_articles(articles)
    save_to_json(popular_articles,'popular_articles.json')

wait two seconds...
wait two seconds...
wait two seconds...
{'title': '[情報] NBA Standings (Nov. 30, 2022)', 'href': '/bbs/NBA/M.1669793303.A.894.html', 'push_count': 2, 'author': 'guardyo'}
{'title': '[Live] 勇士 @ 獨行俠', 'href': '/bbs/NBA/M.1669766565.A.54B.html', 'push_count': 99, 'author': 'pneumo'}
{'title': '[花邊] 咖哩、唐西奇賽前半場三分表演', 'href': '/bbs/NBA/M.1669767680.A.C06.html', 'push_count': 27, 'author': 'ckpioneer'}
{'title': '[情報] Ben Simmons 膝蓋痠痛 將缺席明天比賽', 'href': '/bbs/NBA/M.1669768223.A.215.html', 'push_count': 20, 'author': 'thnlkj0665'}
{'title': '[BOX ] Knicks 140:110 Pistons 數據', 'href': '/bbs/NBA/M.1669774529.A.7C9.html', 'push_count': 16, 'author': 'Rambo'}
{'title': '[花邊] 反串網黑開嘲諷 KD: 你需要休息一下了', 'href': '/bbs/NBA/M.1669774944.A.8D8.html', 'push_count': 34, 'author': 'pneumo'}
{'title': '[情報] 老巴：小牛這樣打是拿不到冠軍的', 'href': '/bbs/NBA/M.1669775788.A.1E5.html', 'push_count': 99, 'author': 'pneumo'}
{'title': '[Live] 快艇 @ 拓荒者', 'href': '/bbs/NBA/M.1669777542.A.A57.html', 'push_count': 8