In [30]:
import requests
from bs4 import BeautifulSoup
import re
import urllib.request
import os
import time
import json

In [20]:
def get_web_page(url):
    resp = requests.get(url=url, cookies={'over18':'1'})
    if resp.status_code != 200:
        print('Invalid url: ', resp.url)
        return None
    else:
        return resp.text

In [26]:
def get_article(html, date):
    soup = BeautifulSoup(html, 'lxml')
    
    page_div = soup.find('div', {'class':'btn-group btn-group-paging'})
    prev_url = page_div.find_all('a')[1]['href']
    
    articles =[]
    divs = soup.find_all('div', 'r-ent')
    for d in divs:
        if d.find('div', 'date').text.strip() == date:
            push_count = 0
            push_str = d.find('div', 'nrec').text
            if push_str:
                try:
                    push_count = int(push_str)
                except ValueError:
                    if push_str == '爆':
                        push_count = 100
                    elif push_str.startswith('X'):
                        push_count = -10
            if d.find('a'):
                href = d.find('a')['href']
                title = d.find('a').text
                author = d.find('div', 'author').text if d.find('div', 'author') else ''
                articles.append({
                                'title': title,
                                'href': href,
                                'push': push_count,
                                'author': author})
    
    return articles, prev_url           

In [22]:
def parse(html):
    soup = BeautifulSoup(html, 'lxml')
    links = soup.find(id='main-container').find_all('a')
    img_urls = []
    for link in links:
        if re.match(r'^https?://(i.)?(m.)?imgur.com', link['href']):  # r 表示字串內容為原始字串 (raw string)
                                                                      # \d or \\d 為跳脫字元
                                                                      # ? 表示可有可無
                                                                      # (i.)? 表示可以有零或一個 i.
            img_urls.append(link['href'])
    return img_urls

In [36]:
def save(img_urls, title):
    if img_urls:
        try:
            dname = title.strip()
            os.makedirs(dname)
            for img_url in img_urls:
                if img_url.split('//')[1].startswith('m.'):
                    img_url = img_url.replace('//m.', '//i.')
                if not img_url.split('//')[1].startswith('i.'):
                    img_url = img_url.split('//')[0] + '//i.' + img_url.split('//')[1]
                if not img_url.endswith('.jpg'):
                    img_url += '.jpg'
                fname = img_url.split('/')[-1]
                urllib.request.urlretrieve(img_url, os.path.join(dname, fname)) # 將網址圖片下載到資料夾中
        except Exception as e:
            print(e)

In [37]:
ptt_url = 'https://www.ptt.cc'
current_page = get_web_page(ptt_url + '/bbs/Beauty/index.html')

if current_page:
    articles = []
    today = time.strftime('%m/%d').lstrip('0')
    current_articles, prev_url = get_article(current_page, today)
    
    while current_articles:
        articles += current_articles
        current_page = get_web_page(ptt_url + prev_url)
        current_articles, prev_url = get_article(current_page, today)
        
for article in articles:
    print(article)
    page = get_web_page(ptt_url + article['href'])
    if page:
        img_urls = parse(page)
        save(img_urls, article['title'])
        article['num_image'] = len(img_urls)
with open('data.json', 'w', encoding='utf-8') as file:
    json.dump(articles, file, indent=3, ensure_ascii=False, sort_keys=True)

{'title': '[正妹] 早安 吃漢堡', 'href': '/bbs/Beauty/M.1586223146.A.28C.html', 'push': 2, 'author': 'deeeplove'}
{'title': '[正妹] 兇(2)', 'href': '/bbs/Beauty/M.1586229213.A.6F7.html', 'push': 2, 'author': 'ckpot'}
{'title': '[神人] 櫃姐', 'href': '/bbs/Beauty/M.1586229998.A.C47.html', 'push': 0, 'author': 's8121524'}
{'title': 'Re: [正妹] 賓士的麵包', 'href': '/bbs/Beauty/M.1586235522.A.59B.html', 'push': 72, 'author': 'mizubishi'}
[Errno 17] File exists: 'Re: [正妹] 賓士的麵包'
{'title': '[討論] coser比較 戰起來', 'href': '/bbs/Beauty/M.1586237450.A.674.html', 'push': 4, 'author': 'harry6275'}
<urlopen error [Errno 60] Operation timed out>
{'title': '[廣告] 三上悠亞 前凸後翹的極致誘惑', 'href': '/bbs/Beauty/M.1586239923.A.090.html', 'push': 3, 'author': 'kelseyaya'}
{'title': '[公告] 文章刪除 警告', 'href': '/bbs/Beauty/M.1586240865.A.C28.html', 'push': 0, 'author': 'hateOnas'}
{'title': '[正妹] japan4號', 'href': '/bbs/Beauty/M.1586189047.A.35C.html', 'push': 1, 'author': 'ashin68'}
{'title': '[正妹] japan5號', 'href': '/bbs/Beauty/M.1586189167