In [None]:
# 第一部分：導入所需的套件
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd


In [None]:
# 第二部分：定義函數以抓取 PTT 文章
def fetch_ptt_articles(board, keyword, max_results):
    """Scrape a PTT board index for posts whose title contains a keyword.

    Starts at ``https://www.ptt.cc/bbs/{board}/index.html`` and keeps
    following the "上頁" (previous page) link until ``max_results`` matching
    posts have been collected, a page cannot be fetched, or no previous-page
    URL is available. Titles containing ``Re:`` or ``Fw:`` are skipped.
    Every match is printed, and if at least one match exists the results are
    written to ``PTT_{board}_{keyword}.xlsx`` in the current working
    directory.

    Args:
        board: Board name as used in the PTT URL (e.g. ``'stock'``).
        keyword: Text to look for in post titles; matching is
            case-insensitive.
        max_results: Maximum number of matching posts to collect.

    Returns:
        None. Results are reported on stdout and in the Excel file.
    """
    base_url = 'https://www.ptt.cc'
    url = f'{base_url}/bbs/{board}/index.html'
    headers = {'User-Agent': 'Mozilla/5.0'}
    keyword_lower = keyword.lower()  # Hoisted: invariant across all rows.

    articles_data = []  # One dict per matching post.

    # The context manager guarantees pooled connections are released even if
    # parsing raises part-way through.
    with requests.Session() as rs:
        rs.cookies.set('over18', '1')  # Bypass age verification

        while url and len(articles_data) < max_results:
            print(f'Fetching {url}')
            try:
                # Without a timeout a stalled connection would block forever.
                res = rs.get(url, headers=headers, timeout=10)
            except requests.exceptions.RequestException as exc:
                # Stop paging but still save whatever was collected so far.
                print(f'Error: Unable to fetch {url} ({exc})')
                break
            if res.status_code != 200:
                print(f'Error: Unable to fetch {url}')
                break

            soup = BeautifulSoup(res.text, 'html.parser')
            for article in soup.select('.r-ent'):
                if len(articles_data) >= max_results:
                    break

                title_tag = article.select_one('.title')
                if title_tag is None:
                    # Malformed row: nothing to match against.
                    continue
                title = title_tag.text.strip()

                # Ignore articles with 'Re:' or 'Fw:' in the title
                if 'Re:' in title or 'Fw:' in title:
                    continue

                # Rows without an anchor have no article URL to record.
                link = article.select_one('.title a')
                if not link or keyword_lower not in title.lower():
                    continue

                date_tag = article.select_one('.date')
                date = date_tag.text.strip() if date_tag else ''
                push_num_tag = article.select_one('.nrec span')
                push_num = push_num_tag.text if push_num_tag else '0'

                article_url = base_url + link['href']
                category = categorize_title(title)
                print(f'Title: {title}, URL: {article_url}')
                articles_data.append({
                    'Title': title,
                    'Time': date,
                    'Push_Num': push_num,
                    'URL': article_url,
                    'Category': category,
                })

            # Find the previous-page URL. The anchor is read with .get()
            # because it may carry no href (presumably PTT renders it as a
            # disabled button on the oldest page — verify); indexing it
            # directly would raise KeyError instead of ending the crawl.
            url = None
            for p in soup.select('div.btn-group-paging a'):
                href = p.get('href')
                if '上頁' in p.text and href:
                    url = base_url + href

    # Save results to Excel
    if articles_data:
        df = pd.DataFrame(articles_data)
        df.to_excel(f'PTT_{board}_{keyword}.xlsx', index=False)
        print(f'Results saved to PTT_{board}_{keyword}.xlsx')

In [None]:
# 第三部分：定義分類函數
def categorize_title(title):
    """Assign a coarse category to a post title by substring matching.

    The rules are tried in the order listed and the first rule with any
    marker occurring in *title* wins, so a title matching several rules gets
    the earliest one. '其他' is returned when no marker is found.
    """
    # (category, markers) pairs; order defines precedence.
    rules = (
        ('遊記', ('遊記', '旅遊')),
        ('新聞', ('新聞', '報導')),
        ('請益', ('請益', '求助')),
        ('問題', ('問題', '疑問')),
        ('心得', ('心得', '感想')),
    )
    for category, markers in rules:
        for marker in markers:
            if marker in title:
                return category
    return '其他'

In [None]:
# 第四部分：主函數，用於在 Google Colab 中執行
if __name__ == '__main__':
    # Collect the search parameters interactively. Surrounding whitespace is
    # stripped because the board name is placed directly into the request URL
    # and the keyword into the output file name.
    board = input('請輸入PTT看板名稱（例如: stock）: ').strip()
    keyword = input('請輸入搜尋關鍵字: ').strip()
    raw_max_results = input('請輸入要抓取的最多結果數量: ').strip()

    # Report a non-numeric count to the user instead of crashing with a raw
    # ValueError traceback.
    try:
        max_results = int(raw_max_results)
    except ValueError:
        max_results = None
        print(f'錯誤：「{raw_max_results}」不是有效的整數。')

    if max_results is not None:
        # Fetch the matching posts and save them to Excel.
        fetch_ptt_articles(board, keyword, max_results)
        print("程式已完成執行。")


請輸入PTT看板名稱（例如: stock）: PingTung
請輸入搜尋關鍵字: 墾丁
請輸入要抓取的最多結果數量: 100
Fetching https://www.ptt.cc/bbs/PingTung/index.html
Fetching https://www.ptt.cc/bbs/PingTung/index3494.html
Fetching https://www.ptt.cc/bbs/PingTung/index3493.html
Title: [新聞] 墾丁觀光最慘一年！ 全年旅遊人次將比疫, URL: https://www.ptt.cc/bbs/PingTung/M.1731558758.A.4C9.html
Fetching https://www.ptt.cc/bbs/PingTung/index3492.html
Fetching https://www.ptt.cc/bbs/PingTung/index3491.html
Fetching https://www.ptt.cc/bbs/PingTung/index3490.html
Fetching https://www.ptt.cc/bbs/PingTung/index3489.html
Fetching https://www.ptt.cc/bbs/PingTung/index3488.html
Fetching https://www.ptt.cc/bbs/PingTung/index3487.html
Title: [新聞] 墾丁警光會館30年沒漲價…每房擬漲200元, URL: https://www.ptt.cc/bbs/PingTung/M.1729659253.A.7B1.html
Title: [問題] 請問有推薦的屏東（墾丁）外燴嗎, URL: https://www.ptt.cc/bbs/PingTung/M.1729819155.A.2AF.html
Fetching https://www.ptt.cc/bbs/PingTung/index3486.html
Fetching https://www.ptt.cc/bbs/PingTung/index3485.html
Fetching https://www.ptt.cc/bbs/PingTung/ind

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import BertTokenizer, BertModel
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import torch
import jieba
import os
import re

def _title_category(title):
    """Return the text inside a leading ``[...]`` tag of *title*, or '其他'."""
    match = re.match(r'\[(.*?)\]', title)
    return match.group(1) if match else '其他'


def _keyword_tokens(text):
    """Segment *text* with jieba, dropping tokens with no word character.

    Without the filter, whitespace, brackets and punctuation are counted as
    terms and crowd real words out of the keyword list.
    """
    return [token for token in jieba.lcut(text) if re.search(r'\w', token)]


def _bert_cls_embeddings(titles, model_name='ckiplab/bert-base-chinese', batch_size=16):
    """Return one [CLS] embedding row per title as a 2-D NumPy array.

    Titles are encoded in batches padded to the longest title of the batch
    (truncated at 128 tokens) rather than one at a time padded to 128 tokens.
    """
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertModel.from_pretrained(model_name)
    model.eval()

    chunks = []
    with torch.no_grad():
        for start in range(0, len(titles), batch_size):
            batch = titles[start:start + batch_size]
            inputs = tokenizer(batch, return_tensors='pt', truncation=True,
                               max_length=128, padding=True)
            outputs = model(**inputs)
            # The [CLS] token (position 0) represents the whole sentence.
            chunks.append(outputs.last_hidden_state[:, 0, :])
    return torch.cat(chunks).numpy()


def analyze_excel_titles(file_path, labels=None):
    """Categorise, keyword-rank and sentiment-tag the titles in a workbook.

    Reads sheet ``'Sheet1'`` of *file_path*, then:

    1. sets ``Category`` to the text of the title's leading ``[...]`` tag
       ('其他' when there is none), replacing any existing ``Category`` column;
    2. prints up to 20 jieba-segmented terms ranked by summed TF-IDF weight;
    3. embeds each title with ``ckiplab/bert-base-chinese`` and fits a
       logistic-regression classifier on 80% of the rows;
    4. writes the classifier's prediction for every row to ``Sentiment``;
    5. saves the sheet next to the input as ``<name>_sentiment.xlsx``.

    Args:
        file_path: Path to an Excel workbook with a ``Title`` column on
            ``Sheet1``.
        labels: Optional sequence with one sentiment label per row, used to
            train the classifier. When omitted, placeholder labels cycling
            Positive/Negative/Neutral by row position are used, so the
            resulting ``Sentiment`` column is NOT meaningful.

    Returns:
        The DataFrame with ``Category`` and ``Sentiment`` columns added.

    Raises:
        ValueError: If the sheet has no rows, or *labels* does not have one
            entry per row.
    """
    import numpy as np  # Local import: numpy is not imported at file level.

    # Step 1: Load the sheet. read_excel opens and closes the file itself,
    # so no workbook handle is left open.
    sheet1 = pd.read_excel(file_path, sheet_name='Sheet1')

    # Step 2: Extract the titles as plain strings; empty cells load as NaN,
    # which would break regex matching and tokenisation.
    titles = sheet1['Title'].fillna('').astype(str).tolist()
    if not titles:
        raise ValueError(f"No titles found in 'Sheet1' of {file_path}")

    # Step 3: Derive categories from the title prefix [XX]
    sheet1['Category'] = [_title_category(title) for title in titles]

    # Step 4: TF-IDF keyword extraction with jieba segmentation.
    # token_pattern=None silences the "token_pattern will not be used"
    # warning that a custom tokenizer otherwise triggers.
    tfidf_vectorizer = TfidfVectorizer(tokenizer=_keyword_tokens, token_pattern=None,
                                       max_features=20)  # Limit to top 20 features
    tfidf_matrix = tfidf_vectorizer.fit_transform(titles)

    # Rank terms by total TF-IDF weight so the most important come first.
    keywords = tfidf_vectorizer.get_feature_names_out()
    scores = np.asarray(tfidf_matrix.sum(axis=0)).ravel()
    print("Top Keywords:")
    for idx in np.argsort(-scores):
        print(keywords[idx])

    # Step 5: Convert each title into a BERT [CLS] embedding
    embeddings = _bert_cls_embeddings(titles)

    # Step 6: Train a simple logistic regression model for sentiment analysis
    if labels is None:
        print("Warning: no training labels supplied; using placeholder labels, "
              "so the Sentiment column is for demonstration only.")
        cycle = ('Positive', 'Negative', 'Neutral')
        labels = [cycle[i % 3] for i in range(len(titles))]
    else:
        labels = list(labels)
        if len(labels) != len(titles):
            raise ValueError(
                f"labels has {len(labels)} entries but the sheet has {len(titles)} rows")

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        embeddings, labels, test_size=0.2, random_state=42)

    # The deprecated multi_class argument is omitted; with lbfgs the
    # multinomial formulation is already used for multi-class targets.
    classifier = LogisticRegression(max_iter=1000, solver='lbfgs')
    classifier.fit(X_train, y_train)
    print(f"Held-out accuracy: {classifier.score(X_test, y_test):.2f}")

    # Step 7: Predict a sentiment for every title (training rows included)
    sheet1['Sentiment'] = classifier.predict(embeddings)

    # Print the titles along with their assigned categories and sentiment
    print("\nTitles, their assigned categories, and sentiment:")
    for title, category, sentiment in zip(sheet1['Title'], sheet1['Category'],
                                          sheet1['Sentiment']):
        print(f"Title: {title} -> Category: {category}, Sentiment: {sentiment}")

    # Step 8: Save the results to a new Excel file
    output_file_path = os.path.splitext(file_path)[0] + '_sentiment.xlsx'
    sheet1.to_excel(output_file_path, index=False)
    print(f"Results saved to {output_file_path}")

    # Return the DataFrame with categories and sentiments for further use
    return sheet1


# Usage example: run the analysis on a workbook at a hard-coded path. The file
# name follows the PTT_{board}_{keyword}.xlsx pattern that fetch_ptt_articles
# writes; the /content/ prefix is presumably the Google Colab working
# directory — adjust it when running elsewhere.
file_path = '/content/PTT_PingTung_墾丁.xlsx'
# The returned DataFrame is not assigned to a name here.
analyze_excel_titles(file_path)




Top Keywords:
 
(
)
/
1
4
[
]
、
「
」
丁
問題
墾丁
大街
新聞
的
買賣
轉讓
！


Some weights of BertModel were not initialized from the model checkpoint at ckiplab/bert-base-chinese and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Titles, their assigned categories, and sentiment:
Title: [新聞] 墾丁觀光最慘一年！ 全年旅遊人次將比疫 -> Category: 新聞, Sentiment: Negative
Title: [新聞] 墾丁警光會館30年沒漲價…每房擬漲200元 -> Category: 新聞, Sentiment: Negative
Title: [問題] 請問有推薦的屏東（墾丁）外燴嗎 -> Category: 問題, Sentiment: Neutral
Title: [新聞] 墾丁買東山鴨頭510元嚇傻　老闆說話了 -> Category: 新聞, Sentiment: Positive
Title: [新聞] 搶救國旅？墾丁旅遊海報正妹「薄紗透視」 -> Category: 新聞, Sentiment: Positive
Title: [新聞] 國旅太慘！各縣市負評排名出爐　墾丁穩居 -> Category: 新聞, Sentiment: Neutral
Title: [遊記] 墾丁台東行 (上) -> Category: 遊記, Sentiment: Positive
Title: [新聞] 屠宰場惡名未停歇 墾丁人潮銳減 -> Category: 新聞, Sentiment: Negative
Title: [新聞] 6片櫛瓜100元！網紅實測墾丁大街物價　全 -> Category: 新聞, Sentiment: Neutral
Title: 4 間墾丁立槳 SUP，夏天海邊必玩活動！ -> Category: 其他, Sentiment: Positive
Title: [買賣] 徵求墾丁夏都飯店Spa卷 -> Category: 買賣, Sentiment: Negative
Title: [新聞] 墾丁遊客雪崩式下滑！飯店業者揭5原因…親吐「經營困難 -> Category: 新聞, Sentiment: Neutral
Title: [新聞] 高溫壟罩！墾丁、小琉球現珊瑚白化危機 -> Category: 新聞, Sentiment: Neutral
Title: [買賣] 徵求墾丁福華晚餐餐卷8張 -> Category: 買賣, Sentiment: Negative
Title: [新聞] 防



Unnamed: 0,Title,Time,Push_Num,URL,Category,Sentiment
0,[新聞] 墾丁觀光最慘一年！ 全年旅遊人次將比疫,11/14,7,https://www.ptt.cc/bbs/PingTung/M.1731558758.A...,新聞,Negative
1,[新聞] 墾丁警光會館30年沒漲價…每房擬漲200元,10/23,0,https://www.ptt.cc/bbs/PingTung/M.1729659253.A...,新聞,Negative
2,[問題] 請問有推薦的屏東（墾丁）外燴嗎,10/25,1,https://www.ptt.cc/bbs/PingTung/M.1729819155.A...,問題,Neutral
3,[新聞] 墾丁買東山鴨頭510元嚇傻　老闆說話了,10/12,5,https://www.ptt.cc/bbs/PingTung/M.1728715925.A...,新聞,Positive
4,[新聞] 搶救國旅？墾丁旅遊海報正妹「薄紗透視」,10/02,1,https://www.ptt.cc/bbs/PingTung/M.1727831216.A...,新聞,Positive
...,...,...,...,...,...,...
95,[新聞] 疫情南北狂燒也無懼！墾丁春節旅客退房又,1/23,8,https://www.ptt.cc/bbs/PingTung/M.1642941667.A...,新聞,Neutral
96,[買賣] 墾丁凱撒 晚餐雙人票券 年底前有效,12/20,5,https://www.ptt.cc/bbs/PingTung/M.1639970927.A...,買賣,Positive
97,[遊記] 恆春。墾丁，海都旅店,12/03,1,https://www.ptt.cc/bbs/PingTung/M.1638536108.A...,遊記,Negative
98,[問題] 請問墾丁哪裡有賣大張的拍立得底片,11/30,0,https://www.ptt.cc/bbs/PingTung/M.1638252453.A...,問題,Neutral
