In [1]:
import nltk
import jieba
import jieba.posseg as pseg
import codecs

from hanziconv import HanziConv
from nltk.corpus import stopwords 

from bs4 import BeautifulSoup
from urllib.request import urlopen
import re

import pandas as pd
import numpy as np

In [2]:
# import data
with open('./dataset/news_sohusite_xml.smarty.dat', 'rb') as f:
    data = f.readlines()
    lines =[]
    for line in data:
        doc = line.decode('gb18030')
        lines.append(doc)

In [3]:
# import Chinese stopwords and change them to traditional Chinese
with open('stopwords_zh.txt', encoding='utf-8') as f:
    data = f.readlines()
    stops =[]
    for line in data:
        stops.append(HanziConv.toTraditional(line.replace("\n",'')))

In [4]:
# a function for word tokenization and removing stopwords
def tokenize_clean(item):
    text = ' '.join(jieba.cut(item, cut_all=True))
    text = [word for word in text.split() if word not in stops]
    return " ".join(text)

In [5]:
# Process, clean and tokenize(using jieba) data
results = []
for line in lines:
    soup = BeautifulSoup(line, features='lxml')
    result = soup.content
    if result != None:
        # extract document between <content></content>
        result = re.sub(r"(</content>$)", "", str(result))
        result = re.sub(r"(^<content>)", "", result)
        # remove noise
        result = re.sub(r"(\u3000)", "", result)
        result = re.sub(r"(\ue40c)", "", result)
        result = re.sub(r"(\u7dab)","", result)
        # transform documents in to traiditional Chinese
        result = HanziConv.toTraditional(result)
        # tokenization and stopwords removing
        text = tokenize_clean(result)
        results.append(text)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Yan_Ling\AppData\Local\Temp\jieba.cache
Loading model cost 0.757 seconds.
Prefix dict has been built succesfully.


In [6]:
# Put processed data in to dataframe
content = pd.DataFrame(results, columns=['content'])
content['content'].replace('', np.nan, inplace=True)
content.dropna(inplace = True, axis=0)
content.reset_index()

Unnamed: 0,index,content
0,0,南都 訊 記 者 劉 周 昌 任 笑 繼 推 齣 日 票 深圳 將 設 地 鐵 頭 等 車 ...
1,2,同心 縣 地 處 寜 夏 中部 乾旱 帶 核心 區 鼕 寒 長 春 暖 遲 夏 熱 短 鞦 ...
2,3,滿 歲 永康 飽 經 病痛 摺 磨 孩子 年 月 日 齣 生 患有 先天 先天性 天性 心 ...
3,4,就 廢 棄 茶 葉 轉 手 事件 發 聲 明 本 報 訊 記 者 劉 俊 受害 受害者 昨日...
4,5,奬 勵 辦 法 率先 提交 前 創 意 項 目 經 評 估 優 先 資 助 實 施 谘 詢 ...
5,7,全民 健康 康生 生活 方式 行 動 年度 健康 血 壓 主 題 活 動 拉 開 帷幕 詳 細
6,9,年 月 中 國 慈善 導 航行 動 第一 第一季 一季 正式 啓 動 檔 大 傢 看法 精心...
7,10,年 東 風 標 桔 燈 鄉 村小 學 圖 書 館 計 劃 月 日 日 湖北 湖北省 武 漢 ...
8,11,紀 念 汶川 地震 周年 紀 念 月 日 國 傢 首 防 災 減 災 日 紀 念 索 弗利 ...
9,12,世 間 本 沒 歧 視 歧 視 源自 內 心 活 動 愛 名 年 中 國 艾滋 艾滋病 反 ...


In [7]:
# Import required libraries
from sklearn import feature_extraction 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD

In [8]:
# Calculate TF-IDF
vectorizer = TfidfVectorizer(analyzer='word', token_pattern=r'\w{2,}', lowercase = False) 
data_vectorized = vectorizer.fit_transform(content['content'])
analyze = vectorizer.build_analyzer()

In [9]:
# Just want a dictionary, but not always necessary
dictionary = vectorizer.vocabulary_.items()

In [10]:
# Decide the number of topics
NUM_TOPICS = 20

In [11]:
# Build a Latent Dirichlet Allocation Model
lda_model = LatentDirichletAllocation(n_topics=NUM_TOPICS, max_iter=20, learning_method='online')
lda_Z = lda_model.fit_transform(data_vectorized)
print(lda_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)
 
# Build a Non-Negative Matrix Factorization Model
nmf_model = NMF(n_components=NUM_TOPICS)
nmf_Z = nmf_model.fit_transform(data_vectorized)
print(nmf_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)
 
# Build a Latent Semantic Indexing Model
lsi_model = TruncatedSVD(n_components=NUM_TOPICS)
lsi_Z = lsi_model.fit_transform(data_vectorized)
print(lsi_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

# Let's see how the first document in the corpus looks like in different topic spaces
print(lda_Z[0])
print(nmf_Z[0])
print(lsi_Z[0])




(164, 20)
(164, 20)
(164, 20)
[0.00595283 0.00595283 0.00595283 0.00595283 0.00595283 0.00595283
 0.1082012  0.00595283 0.00595283 0.00595283 0.00595283 0.00595283
 0.00595283 0.00595283 0.00595283 0.06646929 0.7241314  0.00595283
 0.00595283 0.00595283]
[0.         0.         0.01764727 0.         0.15726837 0.02682631
 0.         0.         0.         0.00070057 0.         0.
 0.         0.         0.         0.00584209 0.         0.01594759
 0.00096599 0.01377027]
[ 0.06680788  0.04730325 -0.0373484  -0.10757552  0.08570865 -0.08929715
 -0.10790236  0.00267453  0.01943722 -0.03367413 -0.03531013 -0.03738751
 -0.01068913  0.00497523  0.00835196 -0.08259098  0.02069488 -0.04114814
 -0.04471041 -0.03421568]


In [12]:
def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
        for i in topic.argsort()[:-top_n - 1:-1]])
 
print("LDA Model:")
print_topics(lda_model, vectorizer)
print("=" * 20)
 
print("NMF Model:")
print_topics(nmf_model, vectorizer)
print("=" * 20)
 
print("LSI Model:")
print_topics(lsi_model, vectorizer)
print("=" * 20)

LDA Model:
Topic 0:
[('女孩', 0.8641858747988651), ('男孩', 0.8383783847281434), ('孩子', 0.7101576415883206), ('哥哥', 0.69678533278495), ('施暴', 0.6954826148358594), ('以前', 0.684083185413769), ('暑假', 0.6743411235552315), ('正品', 0.6491669514964714), ('珍珠', 0.6469881202409551), ('泡沫', 0.6469871579912317)]
Topic 1:
[('肺病', 1.670698428213754), ('民工', 1.4850405097729524), ('村小', 1.2429365986287373), ('希望', 1.2114010557133665), ('先生', 1.0599039446514218), ('期望', 1.0297182230420132), ('兄弟', 0.9706971545274432), ('孩子', 0.9412404613344807), ('公益', 0.9176330157694983), ('自己', 0.7563113634117284)]
Topic 2:
[('女性', 1.4816122634537787), ('周年', 1.1642287691259412), ('慈善', 0.8531214844340135), ('女人', 0.8378668966858533), ('成立', 0.8166552935049314), ('十周年', 0.7962270753568748), ('公益', 0.7944803693964648), ('十周', 0.7935158333549919), ('人群', 0.747018365925828), ('小手', 0.7405416332403678)]
Topic 3:
[('透析', 1.4461471440117608), ('慈善', 0.9838542578955647), ('大病', 0.9077062168667737), ('二次', 0.7870863796062705), (

Reference:
---------------------------------
1. <a href="https://nlpforhackers.io/topic-modeling/" title="Complete Guide to Topic Modeling">Complete Guide to Topic Modeling</a>
2. <a href="https://www.analyticsvidhya.com/blog/2016/08/beginners-guide-to-topic-modeling-in-python/">Beginners Guide to Topic Modeling in Python</a>


Something usefuel...  
--------------------------------

> <a href="https://www.jianshu.com/p/04cb736fd375">Python的正则表达式和re模块用法介绍</a>