In [1]:
# 安裝 Jieba 中文斷詞工具
!pip install jieba -q

import pandas as pd
import jieba
from collections import Counter
import re

# === 1. Load the cleaned corpus ===
df = pd.read_csv("1.2_jay_comments_clean.csv")  # or your own file name

# === 2. 使用 Jieba 對每則留言做斷詞 ===
# Every character that is not a CJK ideograph in U+4E00-U+9FA5, an ASCII
# letter or an ASCII digit (i.e. punctuation, emoji, whitespace, ...) is
# treated as a separator. Compiled once because the function runs per comment.
_NON_WORD_RE = re.compile(r'[^\u4e00-\u9fa5a-zA-Z0-9]')


def jieba_cut(text):
    """Segment one comment into a list of tokens using Jieba.

    Characters other than Chinese ideographs, ASCII letters and digits are
    replaced by a single space each before segmentation, so punctuation is
    removed rather than kept.

    Args:
        text: The comment to segment. Missing values (``None`` or a float
            NaN, which is what pandas yields for empty CSV cells) are
            accepted; any other non-string value is converted with ``str``.

    Returns:
        A list of token strings in the order Jieba produced them; an empty
        list when ``text`` is missing.
    """
    # NaN is the only float that is not equal to itself; without this guard
    # the regex substitution raises TypeError on missing cells.
    if text is None or (isinstance(text, float) and text != text):
        return []
    if not isinstance(text, str):
        text = str(text)
    normalized = _NON_WORD_RE.sub(' ', text)
    return list(jieba.cut(normalized))

# Tokenize every cleaned comment; each row receives its own token list.
df["cut_words"] = df["text_cleaned"].apply(jieba_cut)

# === 3. Flatten all tokens and count word frequencies ===
all_words = [token for tokens in df["cut_words"] for token in tokens]

# Drop one-character tokens and stopwords.
# NOTE: every stopword listed here is a single character, so the length
# check alone already removes all of them.
stopwords = {
    "的", "了", "啦", "呢", "嘛", "啊", "我", "你", "他", "她", "是",
    "在", "就", "也", "有", "和", "很", "都", "又", "還", "說", "吧",
}
filtered_words = [
    token
    for token in all_words
    if not (len(token) <= 1 or token in stopwords)
]

# Frequency table, most frequent word first.
word_freq = Counter(filtered_words)
df_freq = pd.DataFrame(
    word_freq.most_common(),
    columns=["word", "count"],
)

# === 4. Save the segmentation result and the frequency table ===
# utf-8-sig writes a BOM at the start of each file.
df.to_csv("jay_cut_words.csv", index=False, encoding="utf-8-sig")
df_freq.to_csv("jay_word_freq.csv", index=False, encoding="utf-8-sig")

print(f"✅ 斷詞完成，共收錄 {len(df)} 筆留言，詞彙總數 {len(df_freq)}")
df_freq.head(20)


Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
DEBUG:jieba:Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.560 seconds.
DEBUG:jieba:Loading model cost 0.560 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.


✅ 斷詞完成，共收錄 58315 筆留言，詞彙總數 62154


Unnamed: 0,word,count
0,首歌,5438
1,周杰倫,5006
2,真的,4263
3,還是,3009
4,什麼,2956
5,還在,2848
6,知道,2755
7,喜歡,2680
8,這首,2580
9,我們,2523
