In [1]:
import pandas as pd
import gensim
from gensim import corpora, models
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from pprint import pprint
import numpy as np
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\POPO\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
data = pd.read_csv('abcnews-date-text.csv', error_bad_lines=False);
data_text = data[['headline_text']]
data_text['index'] = data_text.index
documents = data_text
print(len(documents))
print(documents[:5])

1186018
                                       headline_text  index
0  aba decides against community broadcasting lic...      0
1     act fire witnesses must be aware of defamation      1
2     a g calls for infrastructure protection summit      2
3           air nz staff in aust strike for pay rise      3
4      air nz strike to affect australian travellers      4


In [6]:
stemmer = SnowballStemmer('english')
 
#提取词幹,词性還原
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
 
#過濾停用词和長度小於3的单词
def preprocess(text):
    #gensim.utils.simple_preprocess 
    """"
    將英文字母小寫化處理,
    刪除文本中所有的標點符號。
    刪除所有少於min_len(預設2)和大於max_len(預設15)個字符的單詞
    """
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [9]:
doc_sample = documents[documents['index'] == 100].values[0][0]
 
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['more', 'women', 'urged', 'to', 'become', 'councillors']


 tokenized and lemmatized document: 
['women', 'urg', 'councillor']


In [10]:
processed_docs = documents['headline_text'].map(preprocess)
processed_docs[:10]

0            [decid, communiti, broadcast, licenc]
1                               [wit, awar, defam]
2           [call, infrastructur, protect, summit]
3                      [staff, aust, strike, rise]
4             [strike, affect, australian, travel]
5               [ambiti, olsson, win, tripl, jump]
6           [antic, delight, record, break, barca]
7    [aussi, qualifi, stosur, wast, memphi, match]
8            [aust, address, secur, council, iraq]
9                         [australia, lock, timet]
Name: headline_text, dtype: object

In [11]:
dictionary = gensim.corpora.Dictionary(processed_docs)
for i in range(10):
    print(i,dictionary[i],dictionary.dfs[i])  #統計的是文檔數而非真正意義上的詞頻
print('number of total words:',len(dictionary))


0 broadcast 430
1 communiti 6074
2 decid 1365
3 licenc 1248
4 awar 589
5 defam 466
6 wit 1814
7 call 10377
8 infrastructur 1124
9 protect 3410
number of total words: 67118


In [12]:
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)
#過濾掉詞頻低於5次的詞
#過濾掉高於總詞頻50%的詞
#完成上述兩個步驟後，只保留前100000個最頻繁的詞
len(dictionary)
print('number of total words:',len(dictionary))

number of total words: 24642


In [14]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_doc_0 = bow_corpus[0]
for i in range(len(bow_doc_0)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_0[i][0], 
                                               dictionary[bow_doc_0[i][0]], 
bow_doc_0[i][1]))

Word 0 ("broadcast") appears 1 time.
Word 1 ("communiti") appears 1 time.
Word 2 ("decid") appears 1 time.
Word 3 ("licenc") appears 1 time.


In [15]:
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.5850076620505259),
 (1, 0.38947256567331934),
 (2, 0.4997099083387053),
 (3, 0.5063271308533074)]


In [22]:
print(len(bow_corpus))
print(len(processed_docs))

1186018
1186018


# 在詞袋語料庫上訓練LDA模型

In [23]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

訓練完成後,對於每個主題，我們查看一下該主題中出現的單詞及其相對權重:

In [62]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.025*"world" + 0.020*"market" + 0.020*"australia" + 0.017*"adelaid" + 0.016*"live" + 0.016*"open" + 0.014*"test" + 0.014*"final" + 0.014*"miss" + 0.013*"record"
Topic: 1 
Words: 0.034*"polic" + 0.029*"charg" + 0.026*"court" + 0.024*"murder" + 0.017*"face" + 0.017*"alleg" + 0.016*"jail" + 0.015*"accus" + 0.012*"death" + 0.012*"life"
Topic: 2 
Words: 0.040*"sydney" + 0.019*"water" + 0.019*"farmer" + 0.017*"trial" + 0.016*"investig" + 0.016*"child" + 0.016*"interview" + 0.015*"perth" + 0.015*"abus" + 0.012*"hear"
Topic: 3 
Words: 0.058*"australian" + 0.030*"kill" + 0.029*"attack" + 0.022*"shoot" + 0.014*"fall" + 0.014*"dead" + 0.013*"polic" + 0.012*"stori" + 0.010*"return" + 0.009*"star"
Topic: 4 
Words: 0.029*"year" + 0.022*"hous" + 0.021*"coast" + 0.018*"royal" + 0.017*"south" + 0.015*"home" + 0.014*"gold" + 0.013*"island" + 0.013*"time" + 0.011*"find"
Topic: 5 
Words: 0.053*"say" + 0.038*"australia" + 0.016*"protest" + 0.014*"rise" + 0.014*"claim" + 0.014*"speak" + 0.

# 使用LDA的詞袋模型對文檔進行分類的性能評估

In [45]:
docId=100
print(documents['headline_text'][docId])
print(processed_docs[docId])
print(lda_model[bow_corpus[docId]])

for index, score in sorted(lda_model[bow_corpus[docId]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic{}: {}".format(score,index, lda_model.print_topic(index, 10)))

more women urged to become councillors
['women', 'urg', 'councillor']
[(0, 0.025005981), (1, 0.025004901), (2, 0.025002966), (3, 0.025002966), (4, 0.025002966), (5, 0.025002966), (6, 0.5250369), (7, 0.025004596), (8, 0.27493283), (9, 0.025002966)]

Score: 0.5250368714332581	 
Topic6: 0.016*"school" + 0.014*"health" + 0.013*"help" + 0.013*"state" + 0.012*"tasmanian" + 0.011*"countri" + 0.011*"fund" + 0.011*"busi" + 0.011*"high" + 0.010*"children"

Score: 0.2749328315258026	 
Topic8: 0.028*"govern" + 0.015*"chang" + 0.014*"plan" + 0.011*"elect" + 0.011*"commiss" + 0.011*"peopl" + 0.011*"communiti" + 0.011*"meet" + 0.010*"park" + 0.010*"budget"

Score: 0.025005945935845375	 
Topic0: 0.025*"world" + 0.020*"market" + 0.020*"australia" + 0.017*"adelaid" + 0.016*"live" + 0.016*"open" + 0.014*"test" + 0.014*"final" + 0.014*"miss" + 0.013*"record"

Score: 0.025004902854561806	 
Topic1: 0.034*"polic" + 0.029*"charg" + 0.026*"court" + 0.024*"murder" + 0.017*"face" + 0.017*"alleg" + 0.016*"jail" +

In [64]:
lda_model.print_topic(-1)

'0.044*"trump" + 0.028*"queensland" + 0.020*"news" + 0.020*"north" + 0.018*"feder" + 0.015*"rural" + 0.014*"bushfir" + 0.014*"death" + 0.013*"nation" + 0.013*"victoria"'

# 在TF-IDF預料庫上訓練LDA模型

In [65]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
 
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.012*"royal" + 0.011*"health" + 0.010*"commiss" + 0.007*"mental" + 0.006*"abbott" + 0.006*"marriag" + 0.006*"hospit" + 0.005*"govern" + 0.005*"foreign" + 0.005*"toni"
Topic: 1 Word: 0.019*"polic" + 0.018*"charg" + 0.016*"murder" + 0.012*"alleg" + 0.012*"court" + 0.011*"woman" + 0.010*"death" + 0.010*"jail" + 0.009*"arrest" + 0.009*"shoot"
Topic: 2 Word: 0.014*"interview" + 0.010*"weather" + 0.007*"grandstand" + 0.006*"speak" + 0.005*"alan" + 0.005*"smith" + 0.005*"game" + 0.005*"john" + 0.005*"extend" + 0.005*"mount"
Topic: 3 Word: 0.013*"stori" + 0.010*"friday" + 0.010*"turnbul" + 0.010*"wednesday" + 0.009*"christma" + 0.009*"morrison" + 0.009*"sport" + 0.008*"david" + 0.006*"tasmanian" + 0.006*"malcolm"
Topic: 4 Word: 0.026*"trump" + 0.014*"donald" + 0.014*"crash" + 0.013*"kill" + 0.010*"dead" + 0.010*"die" + 0.007*"injur" + 0.006*"plead" + 0.006*"truck" + 0.005*"insid"
Topic: 5 Word: 0.009*"elect" + 0.008*"thursday" + 0.007*"sexual" + 0.007*"financ" + 0.006*"andrew" 

# 使用LDA的TF-IDF詞袋模型對文檔進行分類的性能評估

In [66]:
print(documents['headline_text'][docId])
print(processed_docs[docId])
print('-------------------------------------')
for index, score in sorted(lda_model_tfidf[bow_corpus[docId]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic{}: {}".format(score,index, lda_model_tfidf.print_topic(index, 10)))

more women urged to become councillors
['women', 'urg', 'councillor']
-------------------------------------

Score: 0.7749364972114563	 
Topic5: 0.009*"elect" + 0.008*"thursday" + 0.007*"sexual" + 0.007*"financ" + 0.006*"andrew" + 0.006*"fiji" + 0.006*"novemb" + 0.006*"liber" + 0.006*"parti" + 0.005*"music"

Score: 0.02501094713807106	 
Topic0: 0.012*"royal" + 0.011*"health" + 0.010*"commiss" + 0.007*"mental" + 0.006*"abbott" + 0.006*"marriag" + 0.006*"hospit" + 0.005*"govern" + 0.005*"foreign" + 0.005*"toni"

Score: 0.02500954270362854	 
Topic8: 0.010*"final" + 0.010*"world" + 0.009*"australia" + 0.006*"leagu" + 0.006*"march" + 0.006*"mark" + 0.006*"open" + 0.006*"cricket" + 0.006*"violenc" + 0.005*"septemb"

Score: 0.025008613243699074	 
Topic3: 0.013*"stori" + 0.010*"friday" + 0.010*"turnbul" + 0.010*"wednesday" + 0.009*"christma" + 0.009*"morrison" + 0.009*"sport" + 0.008*"david" + 0.006*"tasmanian" + 0.006*"malcolm"

Score: 0.02500835433602333	 
Topic9: 0.019*"countri" + 0.014*"ho

# 使用數據集以外的文檔對模型進行測試

In [68]:
unseen_document = "Five bystanders shot during police shootout in New Orleans"
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
 
for index, score in sorted(lda_model_tfidf[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.41241928935050964	 Topic: 0.034*"polic" + 0.029*"charg" + 0.026*"court" + 0.024*"murder" + 0.017*"face"
Score: 0.2250247448682785	 Topic: 0.058*"australian" + 0.030*"kill" + 0.029*"attack" + 0.022*"shoot" + 0.014*"fall"
Score: 0.2223723977804184	 Topic: 0.026*"crash" + 0.026*"woman" + 0.025*"donald" + 0.022*"die" + 0.014*"victim"
Score: 0.020027846097946167	 Topic: 0.029*"year" + 0.022*"hous" + 0.021*"coast" + 0.018*"royal" + 0.017*"south"
Score: 0.020026206970214844	 Topic: 0.028*"govern" + 0.015*"chang" + 0.014*"plan" + 0.011*"elect" + 0.011*"commiss"
Score: 0.020026087760925293	 Topic: 0.044*"trump" + 0.028*"queensland" + 0.020*"news" + 0.020*"north" + 0.018*"feder"
Score: 0.02002592757344246	 Topic: 0.025*"world" + 0.020*"market" + 0.020*"australia" + 0.017*"adelaid" + 0.016*"live"
Score: 0.020025884732604027	 Topic: 0.053*"say" + 0.038*"australia" + 0.016*"protest" + 0.014*"rise" + 0.014*"claim"
Score: 0.020025866106152534	 Topic: 0.016*"school" + 0.014*"health" + 0.013*"