In [1]:
import requests
from bs4 import BeautifulSoup
import re
import urllib.request
import os
import time
import json
from urllib.parse import urljoin
import csv
import random
import jieba

In [2]:
def get_article(url):   
    resp = requests.get(url=url, cookies={'over18':'1'})
    soup = BeautifulSoup(resp.text, 'lxml')

    prev_link = soup.find('div', 'btn-group-paging').find_all('a')[1]
    prev_link = prev_link['href'] if 'href' in prev_link.attrs else None

    pos_article = []
    neg_article = []

    for div in soup.find_all('div', 'r-ent'):
        href = div.find('div', 'title').a['href']
        title = div.find('div', 'title').text.strip()
        if re.match('\[.*\]', title):
            tag = re.match('\[.*\]', title).group(0)
            if '好' in tag:
                pos_article.append([title, href])
            if '爛' in tag or '負' in tag:
                neg_article.append([title, href])

    return prev_link, pos_article, neg_article

In [8]:
ptt_url = 'https://www.ptt.cc/bbs/movie/search?q='
movie_name = '復仇者聯盟2'
url = ptt_url + movie_name

postive_posts, negative_posts = [], []
prev_link, pos, neg = get_article(url)
postive_posts += pos
negative_posts += neg

while prev_link:
    url = urljoin(ptt_url, prev_link)
    prev_link, pos, neg = get_article(url)
    postive_posts += pos
    negative_posts += neg


print(len(postive_posts), postive_posts[:4])
print(len(negative_posts), negative_posts[:4])

with open('mov_pos.csv', 'w', encoding='utf-8', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['title', 'href'])
    writer.writerows(postive_posts)
    
with open('mov_neg.csv', 'w', encoding='utf-8', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['title', 'href'])
    writer.writerows(negative_posts) 


46 [['[好雷] 復仇者聯盟2-奧創紀元', '/bbs/movie/M.1552723525.A.914.html'], ['[好雷] 復仇者聯盟2:奧創紀元，家庭敘事', '/bbs/movie/M.1525322737.A.337.html'], ['[普好雷] 復仇者聯盟2 一些心得', '/bbs/movie/M.1512467672.A.8B1.html'], ['[好雷] 正義聯盟--復仇者聯盟2 2.0?', '/bbs/movie/M.1510840746.A.235.html']]
10 [['[負雷] 復仇者聯盟2疑似歹戲拖棚的片段', '/bbs/movie/M.1432016328.A.565.html'], ['[負雷]復仇者聯盟2', '/bbs/movie/M.1430545272.A.7F0.html'], ['[普負雷]《復仇者聯盟2：奧創紀元》在趕進度嗎?', '/bbs/movie/M.1430219814.A.B2F.html'], ['[負雷] 復仇者聯盟2 真心不推', '/bbs/movie/M.1430033678.A.F20.html']]


# Ptt內文

In [9]:
# \u4e00-\u9fa5 -> 保留中文
def clear_text(txt):
    expr = re.compile('[^\u4e00-\u9fa5。；，：“”（）、？「 」『』\s\w:/\-.]')
    txt = re.sub(expr, '', txt)
    txt = re.sub('[。;，:""()、？「」『』：/\-_.（）]', '', txt)
    txt = re.sub('(\s)+', '', txt)
    txt = txt.replace('--', '')
    txt = txt.lower()
    return txt

In [10]:
def get_post(url):
    resp = requests.get(url=url, cookies={'over18':'1'})
    soup = BeautifulSoup(resp.text, 'lxml')
    main_content = soup.find('div', id='main-content')
    
    for meta in main_content.find_all('div', 'article-metaline'):     # 移除標題區塊
        meta.extract()        
    for meta in main_content.find_all('div', 'article-metaline-right'):
        meta.extract()
    for push in main_content.find_all('div', 'push'):                 # 移除推文區塊
        push.extract()
        
    parsed = []
    
    for txt in main_content.stripped_strings:
        if txt[0] == '※' or txt[:2] == '--' or url in txt:
            continue
        txt = clear_text(txt)
        if txt:
            parsed.append(txt)
    return ' '.join(parsed)

In [22]:
def get_article_body(csv_file):
    id_to_body = {}
    with open(csv_file, 'r', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        for row in reader:
            print('處理中...', row['title'], row['href'])
            title = ' '.join(row['title'].split(']')[1:])
            title = clear_text(title)
            body = get_post(urljoin(ptt_url, row['href']))
            id_to_body[row['href']] = title + ' ' + body
            
            time.sleep(random.randint(1,3))
    return id_to_body

# 合體

In [23]:
pos_data = get_article_body('mov_pos.csv')
neg_data = get_article_body('mov_neg.csv')
id_to_body = {**pos_data, **neg_data}

處理中... [好雷] 復仇者聯盟2-奧創紀元 /bbs/movie/M.1552723525.A.914.html
處理中... [好雷] 復仇者聯盟2:奧創紀元，家庭敘事 /bbs/movie/M.1525322737.A.337.html
處理中... [普好雷] 復仇者聯盟2 一些心得 /bbs/movie/M.1512467672.A.8B1.html
處理中... [好雷] 正義聯盟--復仇者聯盟2 2.0? /bbs/movie/M.1510840746.A.235.html
處理中... [好雷] 復仇者聯盟2-原來看了美3再回味會變好看 /bbs/movie/M.1463627195.A.6CB.html
處理中... [好雷] 復仇者聯盟2: "最上流的符號隱喻" /bbs/movie/M.1439710329.A.D30.html
處理中... [微好雷]復仇者聯盟2 精彩中帶點可惜 /bbs/movie/M.1432019828.A.1E9.html
處理中... [普好雷]復仇者聯盟2：奧創紀元 /bbs/movie/M.1431162177.A.3DE.html
處理中... [算好雷]《復仇者聯盟2》 誠意十足也危機四伏 /bbs/movie/M.1430233849.A.3EC.html
處理中... [普好雷] 復仇者聯盟2一點雜感加疑問 /bbs/movie/M.1430229292.A.226.html
處理中... [好雷]復仇者聯盟2:劇情元素加重的大拜拜 /bbs/movie/M.1430194600.A.6A8.html
處理中... [好雷] 復仇者聯盟2 /bbs/movie/M.1430129701.A.8D1.html
處理中... [好雷] 復仇者聯盟2 奧創 /bbs/movie/M.1430118511.A.770.html
處理中... [無好雷]二刷的復仇者聯盟2 /bbs/movie/M.1430067483.A.195.html
處理中... [好雷] 看得很爽的復仇者聯盟2 /bbs/movie/M.1430062787.A.645.html
處理中... [好雷] 復仇者聯盟2 為了迎接無限制之戰 /bbs/movie/M.1430037476.A.1BF.html
處理中... [普好雷] 為

In [25]:
with open('id_to_body.json', 'w', encoding='utf-8') as file:
    json.dump(id_to_body, file, indent=3, ensure_ascii=False)

# 斷字

In [26]:
def load_data(csvfile, jsonfile, label):
    a_ids = []
    with open(csvfile, 'r', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        for row in reader:
            a_ids.append(row['href'])

    with open(jsonfile, 'r', encoding='utf-8') as file:
        id_to_body = json.load(file)

    data = []
    for a_id in a_ids:
        token_post = []
        txt = id_to_body[a_id]         # 抓內容
        for sent in txt.split():       # 斷詞
            filtered = [t for t in jieba.cut(sent) if t.split() and len(t) > 1]
            token_post += filtered
        data.append((token_post, label))
    return data

In [27]:
pos_load_data = load_data('mov_pos.csv', 'id_to_body.json', 'Good')
neg_load_data = load_data('mov_neg.csv', 'id_to_body.json', 'Bad')

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/24/c3c3h17d1fd_6698dpq5kc400000gn/T/jieba.cache
Loading model cost 1.430 seconds.
Prefix dict has been built successfully.


In [66]:
# #############################
# for post, label in pos_load_data[:3]:
#    print(post[:5],label)
# for post, label in neg_load_data[:3]:
#    print(post[:5],label)

['仇者', '聯盟', '不夠', '強大', '奧創'] Good
['二刷', '仇者', '聯盟', '女生', 'marvel'] Good
['仇者', '聯盟', '奧創', '大家', '給個'] Good
['仇者', '聯盟', '五金', '紀元', '抱持'] Bad
['仇者', '聯盟', '創紀元', '所以', '人家'] Bad
['仇者', '聯盟', '知道', '同樣', '核心'] Bad


In [173]:
new_neg_load_data = neg_load_data*2          # 增加資料量

data = (*pos_load_data, *new_neg_load_data)
trains = []
targets = []
for p, l in data:
    targets.append(l)
    trains.append(p)

# 訓練資料

In [170]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(trains, targets, test_size = 0.3, 
                                                     random_state=100)

x_trains = []
y_trains = []
x_tests = []
y_tests = []

for i in range(len(x_train)):
    x_trains.append(' '.join(x_train[i]))
    y_trains.append(' '.join(y_train[i]))
    
for i in range(len(x_test)):
    x_tests.append(' '.join(x_test[i]))
    y_tests.append(' '.join(y_test[i]))

In [126]:
# #######################
# random.seed(408)
# random.shuffle(pos_load_data)
# random.shuffle(neg_load_data)

# x_train, y_train, x_test, y_test = [], [], [], []

# for i in range(2):
#    x_train.append(' '.join(pos_load_data[i][0]))
#    x_train.append(' '.join(neg_load_data[i][0]))
#    y_train.append(' '.join(pos_load_data[i][1]))
#    y_train.append(' '.join(neg_load_data[i][1]))

# for i in range(2, len(neg_load_data)):
#    x_test.append(' '.join(pos_load_data[i][0]))
#    x_test.append(' '.join(neg_load_data[i][0]))
#    y_test.append(' '.join(pos_load_data[i][1]))
#    y_test.append(' '.join(neg_load_data[i][1]))

In [127]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score

In [171]:
vectorizer = CountVectorizer()
x_trains = vectorizer.fit_transform(x_trains)
transformer = TfidfTransformer()
x_trains = transformer.fit_transform(x_trains)
sgdclf = SGDClassifier(random_state=40)
sgdclf.fit(x_trains, y_trains)

x_tests = vectorizer.transform(x_tests)
x_tests = transformer.transform(x_tests)

y_pred = sgdclf.predict(x_tests)
print('Predict Result: ', list(y_pred))
print('Correct Answer: ', y_test)
print('Accuracy Rate: ', accuracy_score(y_tests, y_pred))

Predict Result:  ['G o o d', 'G o o d', 'G o o d', 'G o o d', 'B a d', 'G o o d', 'G o o d', 'B a d', 'G o o d', 'B a d', 'G o o d', 'G o o d', 'G o o d', 'B a d', 'G o o d', 'G o o d', 'G o o d', 'G o o d', 'G o o d', 'G o o d']
Correct Answer:  ['Good', 'Good', 'Good', 'Bad', 'Bad', 'Good', 'Good', 'Good', 'Good', 'Bad', 'Good', 'Good', 'Good', 'Bad', 'Good', 'Good', 'Good', 'Bad', 'Good', 'Good']
Accuracy Rate:  0.85


# 分析結果

In [172]:
sentences = [
    '很不錯 的 一部 電影',
    '下次 會想 再來看',
    '真的 爛透了',
    '非常 不好看'
]

analyze = vectorizer.build_analyzer()

print(analyze(sentences[0]))
print(analyze(sentences[1]))
print(analyze(sentences[2]))
print(analyze(sentences[3]))

custom_data = transformer.transform(vectorizer.transform(sentences))
print(sgdclf.predict(custom_data))

['很不錯', '一部', '電影']
['下次', '會想', '再來看']
['真的', '爛透了']
['非常', '不好看']
['G o o d' 'G o o d' 'B a d' 'B a d']
