In [1]:
###ref: https://github.com/susanli2016/NLP-with-Python/blob/master/LDA_news_headlines.ipynb
import pandas as pd
data = pd.read_csv('E:\\pythonprog\\LDA\\abcnews-date-text.csv', error_bad_lines=False);
data_text = data[['headline_text']]
data_text['index'] = data_text.index
documents = data_text

In [None]:
print(len(documents))
print(documents[:5])

In [3]:
###Data Preprocessing
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to E:\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
print(WordNetLemmatizer().lemmatize('went', pos='v'))

go


In [5]:
stemmer = SnowballStemmer('english')
original_words = ['caresses', 'flies', 'dies', 'mules', 'denied','died', 'agreed', 'owned', 
           'humbled', 'sized','meeting', 'stating', 'siezing', 'itemization','sensational', 
           'traditional', 'reference', 'colonizer','plotted']
singles = [stemmer.stem(plural) for plural in original_words]
pd.DataFrame(data = {'original word': original_words, 'stemmed': singles})

Unnamed: 0,original word,stemmed
0,caresses,caress
1,flies,fli
2,dies,die
3,mules,mule
4,denied,deni
5,died,die
6,agreed,agre
7,owned,own
8,humbled,humbl
9,sized,size


In [6]:
def lemmatize_stemming(text):
    stemmer=PorterStemmer()
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [7]:
doc_sample = documents[documents['index'] == 4310].values[0][0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print preprocess(doc_sample)

original document: 
['rain', 'helps', 'dampen', 'bushfires']


 tokenized and lemmatized document: 
[u'rain', u'help', u'dampen', u'bushfir']


In [8]:
processed_docs = documents['headline_text'].map(preprocess)
processed_docs[:10]

0               [decid, commun, broadcast, licenc]
1                               [wit, awar, defam]
2           [call, infrastructur, protect, summit]
3                      [staff, aust, strike, rise]
4             [strike, affect, australian, travel]
5               [ambiti, olsson, win, tripl, jump]
6           [antic, delight, record, break, barca]
7    [aussi, qualifi, stosur, wast, memphi, match]
8            [aust, address, secur, council, iraq]
9                         [australia, lock, timet]
Name: headline_text, dtype: object

In [9]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [10]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [11]:
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

(0, u'mdba')
(1, u'woodi')
(2, u'gavar')
(3, u'yellow')
(4, u'interchang')
(5, u'elvi')
(6, u'jihad')
(7, u'mdbp')
(8, u'authoris')
(9, u'reshuffl')
(10, u'scold')


In [12]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[4310]

[(2126, 1), (5885, 1), (10948, 1), (13151, 1)]

In [13]:
bow_doc_4310 = bow_corpus[4310]

for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                                     dictionary[bow_doc_4310[i][0]], 
                                                     bow_doc_4310[i][1]))

Word 2126 ("rain") appears 1 time.
Word 5885 ("help") appears 1 time.
Word 10948 ("dampen") appears 1 time.
Word 13151 ("bushfir") appears 1 time.


In [14]:
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)

In [15]:
corpus_tfidf = tfidf[bow_corpus]

In [16]:
from pprint import pprint

for doc in corpus_tfidf:
    pprint(doc)
    break

[(2805, 0.38524510107363613),
 (3963, 0.5055678583740412),
 (8390, 0.5903602896750699),
 (10270, 0.4974556071174764)]


In [17]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [18]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.027*queensland + 0.021*north + 0.019*warn + 0.015*tasmanian + 0.015*chang + 0.014*west + 0.012*victoria + 0.011*flood + 0.011*busi + 0.010*game
Topic: 1 
Words: 0.015*health + 0.015*fund + 0.015*rural + 0.013*indigen + 0.013*help + 0.012*commun + 0.012*servic + 0.011*farmer + 0.011*need + 0.010*say
Topic: 2 
Words: 0.031*govern + 0.026*australia + 0.024*attack + 0.021*kill + 0.020*australian + 0.016*south + 0.015*donald + 0.013*turnbul + 0.011*protest + 0.011*citi
Topic: 3 
Words: 0.023*perth + 0.023*canberra + 0.017*tasmania + 0.014*life + 0.013*sentenc + 0.012*case + 0.012*farm + 0.011*prison + 0.011*week + 0.010*question
Topic: 4 
Words: 0.054*polic + 0.025*death + 0.020*crash + 0.019*melbourn + 0.018*die + 0.017*interview + 0.016*miss + 0.016*shoot + 0.014*woman + 0.013*women
Topic: 5 
Words: 0.035*charg + 0.033*court + 0.023*murder + 0.021*face + 0.017*accus + 0.015*child + 0.015*alleg + 0.014*claim + 0.014*break + 0.013*jail
Topic: 6 
Words: 0.029*sydney + 0.01

In [19]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)

In [20]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.014*market + 0.010*news + 0.009*price + 0.008*share + 0.008*weather + 0.008*rise + 0.006*australian + 0.006*busi + 0.005*christma + 0.005*fall
Topic: 1 Word: 0.009*govern + 0.009*drum + 0.007*grandstand + 0.007*sport + 0.006*fund + 0.006*plan + 0.005*council + 0.005*rail + 0.004*green + 0.004*care
Topic: 2 Word: 0.012*podcast + 0.008*plead + 0.007*guilti + 0.007*wednesday + 0.007*tuesday + 0.006*dairi + 0.005*retir + 0.005*decemb + 0.005*father + 0.005*interview
Topic: 3 Word: 0.022*countri + 0.020*hour + 0.008*live + 0.006*mental + 0.006*korea + 0.006*health + 0.005*export + 0.005*program + 0.004*know + 0.004*say
Topic: 4 Word: 0.008*queensland + 0.007*victoria + 0.007*friday + 0.006*drought + 0.006*flood + 0.006*hobart + 0.006*rain + 0.006*storm + 0.006*june + 0.005*australia
Topic: 5 Word: 0.023*rural + 0.007*novemb + 0.007*peter + 0.006*young + 0.005*australia + 0.005*univers + 0.005*foreign + 0.005*grow + 0.005*futur + 0.005*say
Topic: 6 Word: 0.019*trump + 0.010*

In [21]:
processed_docs[4310]

[u'rain', u'help', u'dampen', u'bushfir']

In [22]:
for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.619929014035	 
Topic: 0.027*queensland + 0.021*north + 0.019*warn + 0.015*tasmanian + 0.015*chang + 0.014*west + 0.012*victoria + 0.011*flood + 0.011*busi + 0.010*game

Score: 0.220057543066	 
Topic: 0.023*perth + 0.023*canberra + 0.017*tasmania + 0.014*life + 0.013*sentenc + 0.012*case + 0.012*farm + 0.011*prison + 0.011*week + 0.010*question

Score: 0.0200109919823	 
Topic: 0.015*health + 0.015*fund + 0.015*rural + 0.013*indigen + 0.013*help + 0.012*commun + 0.012*servic + 0.011*farmer + 0.011*need + 0.010*say

Score: 0.0200012265798	 
Topic: 0.054*polic + 0.025*death + 0.020*crash + 0.019*melbourn + 0.018*die + 0.017*interview + 0.016*miss + 0.016*shoot + 0.014*woman + 0.013*women

Score: 0.020001093648	 
Topic: 0.033*year + 0.018*market + 0.017*countri + 0.013*record + 0.013*share + 0.013*peopl + 0.012*australian + 0.012*student + 0.011*fall + 0.010*royal

Score: 0.0200001306839	 
Topic: 0.029*sydney + 0.018*open + 0.017*world + 0.015*live + 0.015*final + 0.014*win + 0.01

In [23]:
for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.562866996195	 
Topic: 0.008*queensland + 0.007*victoria + 0.007*friday + 0.006*drought + 0.006*flood + 0.006*hobart + 0.006*rain + 0.006*storm + 0.006*june + 0.005*australia

Score: 0.277123232692	 
Topic: 0.014*market + 0.010*news + 0.009*price + 0.008*share + 0.008*weather + 0.008*rise + 0.006*australian + 0.006*busi + 0.005*christma + 0.005*fall

Score: 0.0200019367436	 
Topic: 0.022*countri + 0.020*hour + 0.008*live + 0.006*mental + 0.006*korea + 0.006*health + 0.005*export + 0.005*program + 0.004*know + 0.004*say

Score: 0.0200015714852	 
Topic: 0.009*govern + 0.009*drum + 0.007*grandstand + 0.007*sport + 0.006*fund + 0.006*plan + 0.005*council + 0.005*rail + 0.004*green + 0.004*care

Score: 0.0200015531603	 
Topic: 0.018*polic + 0.014*charg + 0.014*crash + 0.012*woman + 0.010*murder + 0.009*death + 0.009*shoot + 0.008*driver + 0.008*jail + 0.008*court

Score: 0.0200014854431	 
Topic: 0.012*podcast + 0.008*plead + 0.007*guilti + 0.007*wednesday + 0.007*tuesday + 0.006*da

In [24]:
unseen_document = 'How a Pentagon deal became an identity crisis for Google'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.293450485529	 Topic: 0.034*trump + 0.027*elect + 0.014*say + 0.014*labor + 0.013*call
Score: 0.239877715838	 Topic: 0.031*govern + 0.026*australia + 0.024*attack + 0.021*kill + 0.020*australian
Score: 0.183333334985	 Topic: 0.033*year + 0.018*market + 0.017*countri + 0.013*record + 0.013*share
Score: 0.1833330023	 Topic: 0.035*charg + 0.033*court + 0.023*murder + 0.021*face + 0.017*accus
Score: 0.0166717946987	 Topic: 0.015*health + 0.015*fund + 0.015*rural + 0.013*indigen + 0.013*help
Score: 0.0166669999745	 Topic: 0.054*polic + 0.025*death + 0.020*crash + 0.019*melbourn + 0.018*die
Score: 0.0166666666688	 Topic: 0.023*perth + 0.023*canberra + 0.017*tasmania + 0.014*life + 0.013*sentenc
Score: 0.0166666666687	 Topic: 0.023*school + 0.017*hour + 0.014*plan + 0.013*nation + 0.012*rise
Score: 0.0166666666685	 Topic: 0.027*queensland + 0.021*north + 0.019*warn + 0.015*tasmanian + 0.015*chang
Score: 0.0166666666683	 Topic: 0.029*sydney + 0.018*open + 0.017*world + 0.015*live + 0.0