In [40]:
# source https://github.com/susanli2016/NLP-with-Python/blob/master/LDA_news_headlines.ipynb
# https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

In [41]:
import pandas as pd

In [42]:
data = pd.read_csv('p:/datasets/abcnews-date-text.csv', error_bad_lines=False)

In [43]:
type(data)

pandas.core.frame.DataFrame

In [44]:
data[:5]

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [45]:
data.describe()

Unnamed: 0,publish_date
count,1082168.0
mean,20098380.0
std,39960.17
min,20030220.0
25%,20061130.0
50%,20100530.0
75%,20130820.0
max,20170630.0


In [46]:
data_text = data[['headline_text']]
data_text['index'] = data_text.index
documents = data_text

In [47]:
len(documents)

1082168

In [48]:
documents[:15]

Unnamed: 0,headline_text,index
0,aba decides against community broadcasting lic...,0
1,act fire witnesses must be aware of defamation,1
2,a g calls for infrastructure protection summit,2
3,air nz staff in aust strike for pay rise,3
4,air nz strike to affect australian travellers,4
5,ambitious olsson wins triple jump,5
6,antic delighted with record breaking barca,6
7,aussie qualifier stosur wastes four memphis match,7
8,aust addresses un security council over iraq,8
9,australia is locked into war timetable opp,9


In [49]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)

In [50]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\infoadmin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [51]:
print(WordNetLemmatizer().lemmatize('went', pos='v'))

go


In [52]:
stemmer = SnowballStemmer('english')
original_words = ['caresses', 'flies', 'dies', 'mules', 'denied','died', 'agreed', 'owned', 
           'humbled', 'sized','meeting', 'stating', 'siezing', 'itemization','sensational', 
           'traditional', 'reference', 'colonizer','plotted']
singles = [stemmer.stem(plural) for plural in original_words]
pd.DataFrame(data = {'original word': original_words, 'stemmed': singles})

Unnamed: 0,original word,stemmed
0,caresses,caress
1,flies,fli
2,dies,die
3,mules,mule
4,denied,deni
5,died,die
6,agreed,agre
7,owned,own
8,humbled,humbl
9,sized,size


In [53]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [54]:
documents[documents['index'] == 4310]

Unnamed: 0,headline_text,index
4310,rain helps dampen bushfires,4310


In [55]:
documents[documents['index'] == 4310].values[0][0]

'rain helps dampen bushfires'

In [56]:
doc_sample = documents[documents['index'] == 4310].values[0][0]

print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
    
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['rain', 'helps', 'dampen', 'bushfires']


 tokenized and lemmatized document: 
['rain', 'help', 'dampen', 'bushfir']


In [57]:
documents['headline_text']

0          aba decides against community broadcasting lic...
1             act fire witnesses must be aware of defamation
2             a g calls for infrastructure protection summit
3                   air nz staff in aust strike for pay rise
4              air nz strike to affect australian travellers
5                          ambitious olsson wins triple jump
6                 antic delighted with record breaking barca
7          aussie qualifier stosur wastes four memphis match
8               aust addresses un security council over iraq
9                 australia is locked into war timetable opp
10         australia to contribute 10 million in aid to iraq
11         barca take record as robson celebrates birthda...
12                                bathhouse plans move ahead
13             big hopes for launceston cycling championship
14                    big plan to boost paroo water supplies
15                    blizzard buries united states in bills
16            brigadier 

In [58]:
processed_docs = documents['headline_text'].map(preprocess)

In [59]:
processed_docs[:10]

0            [decid, communiti, broadcast, licenc]
1                               [wit, awar, defam]
2           [call, infrastructur, protect, summit]
3                      [staff, aust, strike, rise]
4             [strike, affect, australian, travel]
5               [ambiti, olsson, win, tripl, jump]
6           [antic, delight, record, break, barca]
7    [aussi, qualifi, stosur, wast, memphi, match]
8            [aust, address, secur, council, iraq]
9                         [australia, lock, timet]
Name: headline_text, dtype: object

In [60]:
dictionary = gensim.corpora.Dictionary(processed_docs) # Initialize the dictionary

In [61]:
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 broadcast
1 communiti
2 decid
3 licenc
4 awar
5 defam
6 wit
7 call
8 infrastructur
9 protect
10 summit


In [62]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [63]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[4310]

[(76, 1), (112, 1), (483, 1), (4009, 1)]

In [64]:
bow_corpus[:5]

[[(0, 1), (1, 1), (2, 1), (3, 1)],
 [(4, 1), (5, 1), (6, 1)],
 [(7, 1), (8, 1), (9, 1), (10, 1)],
 [(11, 1), (12, 1), (13, 1), (14, 1)],
 [(14, 1), (15, 1), (16, 1), (17, 1)]]

In [65]:
type(bow_corpus)

list

In [66]:
bow_corpus[4310]

[(76, 1), (112, 1), (483, 1), (4009, 1)]

In [67]:
type(bow_corpus[4310])

list

In [68]:
bow_doc_4310 = bow_corpus[4310]

for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                                     dictionary[bow_doc_4310[i][0]], 
                                                     bow_doc_4310[i][1]))

Word 76 ("bushfir") appears 1 time.
Word 112 ("help") appears 1 time.
Word 483 ("rain") appears 1 time.
Word 4009 ("dampen") appears 1 time.


In [69]:
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

from pprint import pprint

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.5902626795041239),
 (1, 0.3892065020004992),
 (2, 0.4955704490710528),
 (3, 0.5044979662918994)]


In [70]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [39]:
lda_model

<gensim.models.ldamulticore.LdaMulticore at 0x19c943688d0>

In [71]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.024*"south" + 0.024*"world" + 0.019*"coast" + 0.018*"australia" + 0.016*"women" + 0.015*"win" + 0.013*"gold" + 0.012*"leagu" + 0.010*"labor" + 0.009*"elect"
Topic: 1 
Words: 0.020*"final" + 0.017*"farmer" + 0.015*"budget" + 0.014*"meet" + 0.012*"say" + 0.012*"public" + 0.011*"royal" + 0.011*"australian" + 0.010*"trade" + 0.010*"commiss"
Topic: 2 
Words: 0.021*"market" + 0.014*"share" + 0.014*"servic" + 0.013*"health" + 0.013*"worker" + 0.012*"fall" + 0.011*"guilti" + 0.011*"news" + 0.011*"bank" + 0.010*"close"
Topic: 3 
Words: 0.054*"polic" + 0.020*"death" + 0.020*"perth" + 0.016*"miss" + 0.014*"shoot" + 0.013*"investig" + 0.011*"victoria" + 0.010*"offic" + 0.009*"search" + 0.009*"prison"
Topic: 4 
Words: 0.034*"australia" + 0.015*"tasmania" + 0.013*"record" + 0.012*"break" + 0.011*"take" + 0.011*"lead" + 0.009*"campaign" + 0.009*"hill" + 0.008*"melbourn" + 0.008*"storm"
Topic: 5 
Words: 0.018*"elect" + 0.017*"live" + 0.012*"turnbul" + 0.012*"life" + 0.011*"protest" 

In [72]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)

In [73]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.008*"juli" + 0.006*"island" + 0.006*"stori" + 0.005*"quiz" + 0.005*"breakfast" + 0.005*"port" + 0.005*"histori" + 0.004*"fiji" + 0.004*"video" + 0.004*"music"
Topic: 1 Word: 0.017*"crash" + 0.012*"polic" + 0.009*"miss" + 0.009*"search" + 0.009*"dead" + 0.008*"die" + 0.008*"driver" + 0.008*"woman" + 0.008*"kill" + 0.007*"fatal"
Topic: 2 Word: 0.009*"interview" + 0.007*"michael" + 0.007*"wednesday" + 0.007*"thursday" + 0.006*"decemb" + 0.006*"sexual" + 0.006*"jam" + 0.005*"harvest" + 0.005*"quarter" + 0.005*"open"
Topic: 3 Word: 0.015*"trump" + 0.009*"leagu" + 0.009*"australia" + 0.009*"drum" + 0.008*"world" + 0.008*"final" + 0.006*"donald" + 0.006*"rugbi" + 0.005*"test" + 0.005*"cricket"
Topic: 4 Word: 0.014*"podcast" + 0.012*"interview" + 0.011*"weather" + 0.007*"cattl" + 0.007*"octob" + 0.007*"live" + 0.007*"peter" + 0.007*"august" + 0.006*"christma" + 0.006*"northern"
Topic: 5 Word: 0.016*"countri" + 0.015*"hour" + 0.014*"rural" + 0.008*"health" + 0.007*"fund" + 0.00

In [74]:
processed_docs[4310]

['rain', 'help', 'dampen', 'bushfir']

In [75]:
for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.35238465666770935	 
Topic: 0.020*"final" + 0.017*"farmer" + 0.015*"budget" + 0.014*"meet" + 0.012*"say" + 0.012*"public" + 0.011*"royal" + 0.011*"australian" + 0.010*"trade" + 0.010*"commiss"

Score: 0.2873835265636444	 
Topic: 0.034*"australia" + 0.015*"tasmania" + 0.013*"record" + 0.012*"break" + 0.011*"take" + 0.011*"lead" + 0.009*"campaign" + 0.009*"hill" + 0.008*"melbourn" + 0.008*"storm"

Score: 0.2202247828245163	 
Topic: 0.054*"polic" + 0.020*"death" + 0.020*"perth" + 0.016*"miss" + 0.014*"shoot" + 0.013*"investig" + 0.011*"victoria" + 0.010*"offic" + 0.009*"search" + 0.009*"prison"

Score: 0.02000320330262184	 
Topic: 0.026*"govern" + 0.022*"trump" + 0.021*"plan" + 0.017*"countri" + 0.016*"council" + 0.015*"fund" + 0.014*"hour" + 0.014*"nation" + 0.011*"water" + 0.010*"communiti"

Score: 0.020002063363790512	 
Topic: 0.021*"market" + 0.014*"share" + 0.014*"servic" + 0.013*"health" + 0.013*"worker" + 0.012*"fall" + 0.011*"guilti" + 0.011*"news" + 0.011*"bank" + 0.010*

In [76]:
for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.8199766278266907	 
Topic: 0.013*"market" + 0.010*"price" + 0.009*"share" + 0.007*"grandstand" + 0.007*"rise" + 0.007*"mine" + 0.006*"australian" + 0.006*"queensland" + 0.005*"farmer" + 0.005*"dairi"

Score: 0.020004022866487503	 
Topic: 0.014*"podcast" + 0.012*"interview" + 0.011*"weather" + 0.007*"cattl" + 0.007*"octob" + 0.007*"live" + 0.007*"peter" + 0.007*"august" + 0.006*"christma" + 0.006*"northern"

Score: 0.02000393345952034	 
Topic: 0.017*"charg" + 0.016*"murder" + 0.013*"court" + 0.012*"news" + 0.010*"polic" + 0.010*"alleg" + 0.009*"assault" + 0.008*"accus" + 0.008*"guilti" + 0.008*"turnbul"

Score: 0.020003635436296463	 
Topic: 0.006*"govern" + 0.006*"cancer" + 0.006*"climat" + 0.006*"john" + 0.005*"malcolm" + 0.005*"pacif" + 0.005*"say" + 0.005*"drought" + 0.004*"marriag" + 0.004*"beef"

Score: 0.020003605633974075	 
Topic: 0.017*"crash" + 0.012*"polic" + 0.009*"miss" + 0.009*"search" + 0.009*"dead" + 0.008*"die" + 0.008*"driver" + 0.008*"woman" + 0.008*"kill" + 0

In [77]:
unseen_document = 'How a Pentagon deal became an identity crisis for Google'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.5166648030281067	 Topic: 0.020*"final" + 0.017*"farmer" + 0.015*"budget" + 0.014*"meet" + 0.012*"say"
Score: 0.34999850392341614	 Topic: 0.018*"elect" + 0.017*"live" + 0.012*"turnbul" + 0.012*"life" + 0.011*"protest"
Score: 0.01666928268969059	 Topic: 0.021*"market" + 0.014*"share" + 0.014*"servic" + 0.013*"health" + 0.013*"worker"
Score: 0.016667351126670837	 Topic: 0.026*"govern" + 0.022*"trump" + 0.021*"plan" + 0.017*"countri" + 0.016*"council"
Score: 0.01666666753590107	 Topic: 0.024*"south" + 0.024*"world" + 0.019*"coast" + 0.018*"australia" + 0.016*"women"
Score: 0.01666666753590107	 Topic: 0.054*"polic" + 0.020*"death" + 0.020*"perth" + 0.016*"miss" + 0.014*"shoot"
Score: 0.01666666753590107	 Topic: 0.034*"australia" + 0.015*"tasmania" + 0.013*"record" + 0.012*"break" + 0.011*"take"
Score: 0.01666666753590107	 Topic: 0.024*"sydney" + 0.018*"interview" + 0.018*"adelaid" + 0.014*"arrest" + 0.011*"sentenc"
Score: 0.01666666753590107	 Topic: 0.037*"australian" + 0.029*"quee