In [1]:
# In this workshop we perform topic modeling using gensim

# We are using the subnews dataset that we used last week. 
# The "Class" labels here are only used for sanity check of the topics discovered later.
# Remember, in actual use of topic modelling, the documents DON'T come with labeled classes.
# It's unsupervised learning.

import pandas as pd
news=pd.read_table('r8-train-all-terms.txt',header=None,names = ["Class", "Text"])
subnews=news[(news.Class=="trade")| (news.Class=='crude')|(news.Class=='money-fx') ]
subnews.head()

Unnamed: 0,Class,Text
15,trade,brazil anti inflation plan limps to anniversar...
43,crude,diamond shamrock dia cuts crude prices diamond...
55,crude,opec may have to meet to firm prices analysts ...
76,crude,texaco canada cuts crude prices canadian cts b...
77,crude,texaco canada txc lowers crude postings texaco...


In [2]:
# Let's use the similar preprocessing we used last week.
# The output of each document is a list of tokens.

import nltk
from nltk.corpus import stopwords
mystopwords=stopwords.words("English") + ['one', 'become', 'get', 'make', 'take']
WNlemma = nltk.WordNetLemmatizer()

def pre_process(text):
    tokens = nltk.word_tokenize(text)
    tokens=[ WNlemma.lemmatize(t.lower()) for t in tokens]
    tokens=[ t for t in tokens if t not in mystopwords]
    tokens = [ t for t in tokens if len(t) >= 3 ]
    return(tokens)

In [3]:
#split the data into training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(subnews.Text, subnews.Class, test_size=0.33, random_state=12)


In [4]:
# Apply preprocessing to every document in the training set.
toks_train = X_train.apply(pre_process)


In [5]:
import logging
import gensim 
from gensim import corpora

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [6]:
# Prepare a vocabulary dictionary.
dictionary = corpora.Dictionary(toks_train)
print(dictionary)

2017-09-30 12:03:45,789 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2017-09-30 12:03:45,859 : INFO : built Dictionary(5971 unique tokens: ['multi', 'mobil', 'grown', 'interim', 'expense']...) from 475 documents (total 57987 corpus positions)


Dictionary(5971 unique tokens: ['multi', 'mobil', 'grown', 'interim', 'expense']...)


In [7]:
# It's mappings between ids and tokens
# To get a token's id
dictionary.token2id['exchange']

109

In [8]:
# To get the token of an id
dictionary[157]

'three'

In [9]:
# Filter off any words with document frequency less than 3, or appearing in more than 80% documents
dictionary.filter_extremes(no_below=3, no_above=0.7)
print(dictionary)

2017-09-30 12:03:45,894 : INFO : discarding 3537 tokens: [('mandating', 2), ('frightening', 2), ('toughening', 2), ('wedge', 2), ('said', 403), ('reuter', 408), ('moyana', 1), ('receipt', 1), ('inward', 1), ('delay', 1)]...
2017-09-30 12:03:45,896 : INFO : keeping 2434 tokens which were in no less than 3 and no more than 332 (=70.0%) documents
2017-09-30 12:03:45,899 : INFO : resulting dictionary: Dictionary(2434 unique tokens: ['region', 'mobil', 'must', 'grown', 'shift']...)


Dictionary(2434 unique tokens: ['region', 'mobil', 'must', 'grown', 'shift']...)


In [10]:
# Use the dictionary to prepare a DTM (using TF)
dtm_train = [dictionary.doc2bow(d) for d in toks_train ]

In [33]:
# Build an LDA model for 3 topics out of the DTM
%time lda = gensim.models.ldamodel.LdaModel(dtm_train, num_topics = 3, id2word = dictionary, passes = 20)


2017-09-30 12:08:51,519 : INFO : using symmetric alpha at 0.3333333333333333
2017-09-30 12:08:51,522 : INFO : using symmetric eta at 0.0004108463434675431
2017-09-30 12:08:51,525 : INFO : using serial LDA version on this node
2017-09-30 12:08:51,571 : INFO : running online (multi-pass) LDA training, 3 topics, 100 passes over the supplied corpus of 475 documents, updating model once every 475 documents, evaluating perplexity every 475 documents, iterating 50x with a convergence threshold of 0.001000
2017-09-30 12:08:53,441 : INFO : -8.311 per-word bound, 317.5 perplexity estimate based on a held-out corpus of 475 documents with 50286 words
2017-09-30 12:08:53,441 : INFO : PROGRESS: pass 0, at document #475/475
2017-09-30 12:08:54,202 : INFO : topic #0 (0.333): 0.016*"trade" + 0.015*"mln" + 0.014*"oil" + 0.011*"dlrs" + 0.010*"billion" + 0.010*"would" + 0.009*"market" + 0.009*"price" + 0.008*"bank" + 0.007*"bpd"
2017-09-30 12:08:54,203 : INFO : topic #1 (0.333): 0.017*"trade" + 0.014*"oil

2017-09-30 12:09:11,233 : INFO : topic #2 (0.333): 0.022*"trade" + 0.017*"japan" + 0.011*"would" + 0.009*"japanese" + 0.009*"market" + 0.008*"year" + 0.007*"oil" + 0.006*"official" + 0.006*"government" + 0.006*"bank"
2017-09-30 12:09:11,234 : INFO : topic diff=0.072753, rho=0.316228
2017-09-30 12:09:12,764 : INFO : -6.904 per-word bound, 119.7 perplexity estimate based on a held-out corpus of 475 documents with 50286 words
2017-09-30 12:09:12,765 : INFO : PROGRESS: pass 9, at document #475/475
2017-09-30 12:09:13,209 : INFO : topic #0 (0.333): 0.034*"mln" + 0.025*"oil" + 0.023*"dlrs" + 0.019*"billion" + 0.017*"bank" + 0.014*"crude" + 0.014*"bpd" + 0.014*"price" + 0.012*"market" + 0.011*"pct"
2017-09-30 12:09:13,210 : INFO : topic #1 (0.333): 0.017*"oil" + 0.015*"trade" + 0.012*"opec" + 0.011*"would" + 0.010*"price" + 0.008*"country" + 0.008*"minister" + 0.007*"year" + 0.007*"official" + 0.007*"last"
2017-09-30 12:09:13,211 : INFO : topic #2 (0.333): 0.023*"trade" + 0.017*"japan" + 0.01

2017-09-30 12:09:28,750 : INFO : topic #2 (0.333): 0.027*"trade" + 0.017*"japan" + 0.012*"would" + 0.009*"japanese" + 0.009*"market" + 0.008*"year" + 0.007*"official" + 0.006*"country" + 0.006*"foreign" + 0.006*"agreement"
2017-09-30 12:09:28,751 : INFO : topic diff=0.027923, rho=0.229416
2017-09-30 12:09:30,301 : INFO : -6.874 per-word bound, 117.3 perplexity estimate based on a held-out corpus of 475 documents with 50286 words
2017-09-30 12:09:30,302 : INFO : PROGRESS: pass 18, at document #475/475
2017-09-30 12:09:30,702 : INFO : topic #0 (0.333): 0.035*"mln" + 0.029*"oil" + 0.024*"dlrs" + 0.019*"billion" + 0.018*"bank" + 0.015*"price" + 0.015*"crude" + 0.014*"bpd" + 0.013*"pct" + 0.012*"market"
2017-09-30 12:09:30,703 : INFO : topic #1 (0.333): 0.019*"oil" + 0.013*"opec" + 0.011*"price" + 0.011*"trade" + 0.011*"would" + 0.009*"minister" + 0.008*"country" + 0.007*"official" + 0.007*"ecuador" + 0.007*"year"
2017-09-30 12:09:30,704 : INFO : topic #2 (0.333): 0.027*"trade" + 0.017*"jap

2017-09-30 12:09:50,289 : INFO : topic #2 (0.333): 0.028*"trade" + 0.017*"japan" + 0.012*"would" + 0.010*"japanese" + 0.009*"market" + 0.008*"year" + 0.007*"official" + 0.007*"country" + 0.006*"foreign" + 0.006*"agreement"
2017-09-30 12:09:50,289 : INFO : topic diff=0.015316, rho=0.188982
2017-09-30 12:09:51,796 : INFO : -6.865 per-word bound, 116.6 perplexity estimate based on a held-out corpus of 475 documents with 50286 words
2017-09-30 12:09:51,796 : INFO : PROGRESS: pass 27, at document #475/475
2017-09-30 12:09:52,180 : INFO : topic #0 (0.333): 0.035*"mln" + 0.029*"oil" + 0.024*"dlrs" + 0.019*"billion" + 0.018*"bank" + 0.015*"price" + 0.014*"pct" + 0.014*"crude" + 0.014*"bpd" + 0.012*"market"
2017-09-30 12:09:52,181 : INFO : topic #1 (0.333): 0.021*"oil" + 0.014*"opec" + 0.011*"price" + 0.010*"would" + 0.009*"minister" + 0.008*"trade" + 0.007*"ecuador" + 0.007*"country" + 0.007*"official" + 0.007*"last"
2017-09-30 12:09:52,182 : INFO : topic #2 (0.333): 0.029*"trade" + 0.017*"jap

2017-09-30 12:10:07,292 : INFO : topic #2 (0.333): 0.029*"trade" + 0.017*"japan" + 0.012*"would" + 0.009*"japanese" + 0.009*"market" + 0.008*"year" + 0.007*"country" + 0.007*"official" + 0.006*"foreign" + 0.006*"agreement"
2017-09-30 12:10:07,293 : INFO : topic diff=0.011180, rho=0.164399
2017-09-30 12:10:08,992 : INFO : -6.858 per-word bound, 116.0 perplexity estimate based on a held-out corpus of 475 documents with 50286 words
2017-09-30 12:10:08,993 : INFO : PROGRESS: pass 36, at document #475/475
2017-09-30 12:10:09,420 : INFO : topic #0 (0.333): 0.035*"mln" + 0.029*"oil" + 0.024*"dlrs" + 0.019*"billion" + 0.018*"bank" + 0.015*"pct" + 0.015*"price" + 0.014*"crude" + 0.013*"bpd" + 0.012*"market"
2017-09-30 12:10:09,421 : INFO : topic #1 (0.333): 0.023*"oil" + 0.015*"opec" + 0.012*"price" + 0.010*"would" + 0.009*"minister" + 0.008*"ecuador" + 0.007*"country" + 0.007*"official" + 0.007*"last" + 0.006*"china"
2017-09-30 12:10:09,421 : INFO : topic #2 (0.333): 0.030*"trade" + 0.017*"jap

2017-09-30 12:10:26,938 : INFO : topic #2 (0.333): 0.030*"trade" + 0.017*"japan" + 0.012*"would" + 0.009*"japanese" + 0.009*"market" + 0.008*"year" + 0.007*"country" + 0.007*"official" + 0.006*"foreign" + 0.006*"agreement"
2017-09-30 12:10:26,939 : INFO : topic diff=0.009036, rho=0.147442
2017-09-30 12:10:28,714 : INFO : -6.853 per-word bound, 115.6 perplexity estimate based on a held-out corpus of 475 documents with 50286 words
2017-09-30 12:10:28,716 : INFO : PROGRESS: pass 45, at document #475/475
2017-09-30 12:10:29,187 : INFO : topic #0 (0.333): 0.035*"mln" + 0.028*"oil" + 0.024*"dlrs" + 0.019*"billion" + 0.018*"bank" + 0.015*"pct" + 0.014*"price" + 0.014*"crude" + 0.013*"bpd" + 0.012*"market"
2017-09-30 12:10:29,189 : INFO : topic #1 (0.333): 0.026*"oil" + 0.016*"opec" + 0.013*"price" + 0.010*"would" + 0.009*"minister" + 0.008*"ecuador" + 0.007*"official" + 0.007*"country" + 0.007*"last" + 0.006*"china"
2017-09-30 12:10:29,190 : INFO : topic #2 (0.333): 0.030*"trade" + 0.017*"jap

2017-09-30 12:10:45,167 : INFO : topic #2 (0.333): 0.031*"trade" + 0.017*"japan" + 0.012*"would" + 0.009*"japanese" + 0.009*"market" + 0.008*"year" + 0.007*"country" + 0.007*"official" + 0.006*"foreign" + 0.006*"agreement"
2017-09-30 12:10:45,168 : INFO : topic diff=0.007737, rho=0.134840
2017-09-30 12:10:46,603 : INFO : -6.849 per-word bound, 115.3 perplexity estimate based on a held-out corpus of 475 documents with 50286 words
2017-09-30 12:10:46,604 : INFO : PROGRESS: pass 54, at document #475/475
2017-09-30 12:10:46,952 : INFO : topic #0 (0.333): 0.035*"mln" + 0.026*"oil" + 0.024*"dlrs" + 0.020*"billion" + 0.018*"bank" + 0.016*"pct" + 0.014*"crude" + 0.014*"price" + 0.012*"bpd" + 0.012*"market"
2017-09-30 12:10:46,953 : INFO : topic #1 (0.333): 0.027*"oil" + 0.017*"opec" + 0.014*"price" + 0.010*"would" + 0.009*"minister" + 0.008*"ecuador" + 0.007*"official" + 0.007*"country" + 0.007*"last" + 0.006*"china"
2017-09-30 12:10:46,954 : INFO : topic #2 (0.333): 0.031*"trade" + 0.017*"jap

2017-09-30 12:11:01,787 : INFO : topic #2 (0.333): 0.031*"trade" + 0.016*"japan" + 0.012*"would" + 0.009*"japanese" + 0.009*"market" + 0.008*"year" + 0.007*"country" + 0.007*"official" + 0.006*"foreign" + 0.006*"agreement"
2017-09-30 12:11:01,788 : INFO : topic diff=0.006750, rho=0.125000
2017-09-30 12:11:03,561 : INFO : -6.846 per-word bound, 115.1 perplexity estimate based on a held-out corpus of 475 documents with 50286 words
2017-09-30 12:11:03,561 : INFO : PROGRESS: pass 63, at document #475/475
2017-09-30 12:11:03,988 : INFO : topic #0 (0.333): 0.035*"mln" + 0.025*"oil" + 0.024*"dlrs" + 0.020*"billion" + 0.018*"bank" + 0.016*"pct" + 0.014*"crude" + 0.013*"price" + 0.012*"market" + 0.012*"bpd"
2017-09-30 12:11:03,989 : INFO : topic #1 (0.333): 0.029*"oil" + 0.017*"opec" + 0.015*"price" + 0.010*"would" + 0.008*"minister" + 0.008*"ecuador" + 0.007*"official" + 0.007*"last" + 0.007*"country" + 0.006*"china"
2017-09-30 12:11:03,990 : INFO : topic #2 (0.333): 0.031*"trade" + 0.016*"jap

2017-09-30 12:11:20,595 : INFO : topic #2 (0.333): 0.031*"trade" + 0.016*"japan" + 0.012*"would" + 0.009*"japanese" + 0.009*"market" + 0.008*"year" + 0.007*"country" + 0.007*"official" + 0.006*"foreign" + 0.006*"agreement"
2017-09-30 12:11:20,596 : INFO : topic diff=0.005921, rho=0.117041
2017-09-30 12:11:22,043 : INFO : -6.844 per-word bound, 114.8 perplexity estimate based on a held-out corpus of 475 documents with 50286 words
2017-09-30 12:11:22,044 : INFO : PROGRESS: pass 72, at document #475/475
2017-09-30 12:11:22,390 : INFO : topic #0 (0.333): 0.035*"mln" + 0.024*"dlrs" + 0.024*"oil" + 0.020*"billion" + 0.018*"bank" + 0.016*"pct" + 0.013*"crude" + 0.013*"price" + 0.012*"market" + 0.011*"bpd"
2017-09-30 12:11:22,391 : INFO : topic #1 (0.333): 0.031*"oil" + 0.018*"opec" + 0.015*"price" + 0.010*"would" + 0.008*"minister" + 0.008*"ecuador" + 0.007*"official" + 0.007*"last" + 0.006*"country" + 0.006*"china"
2017-09-30 12:11:22,392 : INFO : topic #2 (0.333): 0.031*"trade" + 0.016*"jap

2017-09-30 12:11:37,878 : INFO : topic #2 (0.333): 0.031*"trade" + 0.016*"japan" + 0.012*"would" + 0.009*"japanese" + 0.009*"market" + 0.008*"year" + 0.007*"country" + 0.007*"official" + 0.006*"foreign" + 0.006*"agreement"
2017-09-30 12:11:37,879 : INFO : topic diff=0.005210, rho=0.110432
2017-09-30 12:11:39,579 : INFO : -6.841 per-word bound, 114.7 perplexity estimate based on a held-out corpus of 475 documents with 50286 words
2017-09-30 12:11:39,579 : INFO : PROGRESS: pass 81, at document #475/475
2017-09-30 12:11:39,968 : INFO : topic #0 (0.333): 0.035*"mln" + 0.024*"dlrs" + 0.023*"oil" + 0.021*"billion" + 0.019*"bank" + 0.016*"pct" + 0.013*"crude" + 0.012*"market" + 0.012*"price" + 0.011*"stg"
2017-09-30 12:11:39,969 : INFO : topic #1 (0.333): 0.032*"oil" + 0.018*"opec" + 0.016*"price" + 0.010*"would" + 0.008*"minister" + 0.008*"ecuador" + 0.007*"official" + 0.007*"last" + 0.007*"bpd" + 0.007*"barrel"
2017-09-30 12:11:39,970 : INFO : topic #2 (0.333): 0.031*"trade" + 0.016*"japan"

2017-09-30 12:11:54,422 : INFO : topic #2 (0.333): 0.031*"trade" + 0.016*"japan" + 0.012*"would" + 0.009*"japanese" + 0.009*"market" + 0.008*"year" + 0.007*"country" + 0.007*"official" + 0.006*"foreign" + 0.006*"agreement"
2017-09-30 12:11:54,423 : INFO : topic diff=0.004591, rho=0.104828
2017-09-30 12:11:55,854 : INFO : -6.839 per-word bound, 114.5 perplexity estimate based on a held-out corpus of 475 documents with 50286 words
2017-09-30 12:11:55,855 : INFO : PROGRESS: pass 90, at document #475/475
2017-09-30 12:11:56,188 : INFO : topic #0 (0.333): 0.035*"mln" + 0.025*"dlrs" + 0.022*"oil" + 0.021*"billion" + 0.019*"bank" + 0.017*"pct" + 0.013*"crude" + 0.012*"market" + 0.012*"price" + 0.011*"stg"
2017-09-30 12:11:56,189 : INFO : topic #1 (0.333): 0.033*"oil" + 0.018*"opec" + 0.016*"price" + 0.010*"would" + 0.008*"ecuador" + 0.008*"minister" + 0.007*"bpd" + 0.007*"official" + 0.007*"barrel" + 0.007*"last"
2017-09-30 12:11:56,190 : INFO : topic #2 (0.333): 0.031*"trade" + 0.016*"japan"

2017-09-30 12:12:10,337 : INFO : topic #2 (0.333): 0.031*"trade" + 0.016*"japan" + 0.012*"would" + 0.009*"japanese" + 0.009*"market" + 0.008*"year" + 0.008*"country" + 0.007*"official" + 0.006*"foreign" + 0.006*"agreement"
2017-09-30 12:12:10,338 : INFO : topic diff=0.004195, rho=0.100000
2017-09-30 12:12:11,770 : INFO : -6.838 per-word bound, 114.4 perplexity estimate based on a held-out corpus of 475 documents with 50286 words
2017-09-30 12:12:11,771 : INFO : PROGRESS: pass 99, at document #475/475
2017-09-30 12:12:12,107 : INFO : topic #0 (0.333): 0.035*"mln" + 0.025*"dlrs" + 0.021*"oil" + 0.021*"billion" + 0.019*"bank" + 0.017*"pct" + 0.013*"crude" + 0.012*"market" + 0.011*"price" + 0.011*"stg"
2017-09-30 12:12:12,108 : INFO : topic #1 (0.333): 0.034*"oil" + 0.019*"opec" + 0.017*"price" + 0.010*"would" + 0.008*"ecuador" + 0.008*"bpd" + 0.008*"minister" + 0.007*"official" + 0.007*"barrel" + 0.007*"last"
2017-09-30 12:12:12,108 : INFO : topic #2 (0.333): 0.031*"trade" + 0.016*"japan"

CPU times: user 3min 17s, sys: 888 ms, total: 3min 18s
Wall time: 3min 20s


In [34]:
# To see the topics, with the most probable words in each topic. What topics to you see? 
lda.show_topics()

[(0,
  '0.035*"mln" + 0.025*"dlrs" + 0.021*"oil" + 0.021*"billion" + 0.019*"bank" + 0.017*"pct" + 0.013*"crude" + 0.012*"market" + 0.011*"price" + 0.011*"stg"'),
 (1,
  '0.034*"oil" + 0.019*"opec" + 0.017*"price" + 0.010*"would" + 0.008*"ecuador" + 0.008*"bpd" + 0.008*"minister" + 0.007*"official" + 0.007*"barrel" + 0.007*"last"'),
 (2,
  '0.031*"trade" + 0.016*"japan" + 0.012*"would" + 0.009*"japanese" + 0.009*"market" + 0.008*"year" + 0.008*"country" + 0.007*"official" + 0.007*"foreign" + 0.006*"agreement"')]

In [35]:
# You can also request to see more words per topic
lda.show_topics(num_words=20)

[(0,
  '0.035*"mln" + 0.025*"dlrs" + 0.021*"oil" + 0.021*"billion" + 0.019*"bank" + 0.017*"pct" + 0.013*"crude" + 0.012*"market" + 0.011*"price" + 0.011*"stg" + 0.010*"barrel" + 0.010*"bpd" + 0.009*"year" + 0.009*"february" + 0.007*"today" + 0.007*"money" + 0.007*"january" + 0.006*"company" + 0.006*"last" + 0.006*"stock"'),
 (1,
  '0.034*"oil" + 0.019*"opec" + 0.017*"price" + 0.010*"would" + 0.008*"ecuador" + 0.008*"bpd" + 0.008*"minister" + 0.007*"official" + 0.007*"barrel" + 0.007*"last" + 0.006*"china" + 0.006*"dlrs" + 0.006*"mln" + 0.006*"country" + 0.006*"crude" + 0.005*"energy" + 0.005*"output" + 0.005*"year" + 0.005*"production" + 0.005*"export"'),
 (2,
  '0.031*"trade" + 0.016*"japan" + 0.012*"would" + 0.009*"japanese" + 0.009*"market" + 0.008*"year" + 0.008*"country" + 0.007*"official" + 0.007*"foreign" + 0.006*"agreement" + 0.006*"state" + 0.006*"import" + 0.005*"rate" + 0.005*"dollar" + 0.005*"last" + 0.005*"exchange" + 0.005*"united" + 0.005*"currency" + 0.005*"bank" + 0.00

In [14]:
# A similar function showing each topic with its most probable words and its topic coherence score
lda.top_topics(dtm_train)

[([(0.02265210370344296, 'trade'),
   (0.019098322737379322, 'japan'),
   (0.011465547957815311, 'market'),
   (0.011463130369687052, 'japanese'),
   (0.010105434955204213, 'would'),
   (0.009499749678240842, 'dollar'),
   (0.0089814549647898086, 'official'),
   (0.0075209149361294371, 'currency'),
   (0.0068566801430538825, 'bank'),
   (0.0068488216247778082, 'agreement'),
   (0.0066598665018184908, 'country'),
   (0.0062523624245382449, 'rate'),
   (0.0062205844242593338, 'state'),
   (0.0062118442743486759, 'exchange'),
   (0.0059963228262647659, 'minister'),
   (0.005033302849512305, 'united'),
   (0.0047398540956122125, 'year'),
   (0.0046345186141297859, 'world'),
   (0.0046291941342265618, 'last'),
   (0.0045628490437653914, 'nation')],
  -1.0883520694273221),
 ([(0.029287565475462136, 'oil'),
   (0.025727843127202426, 'mln'),
   (0.018353473593037451, 'dlrs'),
   (0.017517664277337593, 'price'),
   (0.01576714224619152, 'opec'),
   (0.013263002487102849, 'crude'),
   (0.0128664

In [15]:
# We can therefore derive the average topic coherence, as a way to evaluate the topic models
import numpy as np
lda_coherence = [ n for _, n in lda.top_topics(dtm_train) ]
np.mean(lda_coherence)

-1.5164294504346489

In [16]:
# Another metric for gauging goodness of models, perplexity, is accessed using bound() function
lda.bound(dtm_train)

-344555.61384163948

In [17]:
# Get the topic distribution of documents
dtopics_train = lda.get_document_topics(dtm_train)

In [18]:
# Get the topic likelihood for the first document in train set
for i in range(0, 5):
    print(dtopics_train[i])

[(2, 0.9933326808101347)]
[(0, 0.61447506018849996), (1, 0.37764094072656679)]
[(0, 0.3248011624549641), (2, 0.66892040293521571)]
[(0, 0.50506141753641587), (1, 0.044412136193555156), (2, 0.45052644627002908)]
[(1, 0.9896036830008591)]


In [19]:
# Pick the topic with the highest probability for each document, map it to the label
# NOTE: the mapping may change in a different run
from operator import itemgetter
top_train = [ max(t, key=itemgetter(1))[0] for t in dtopics_train ]
dict = {1: 'crude', 0: 'money-fx', 2: 'trade'}
topic_train = [ dict[t] for t in top_train ]

In [20]:
# Now let's see how well these topics match the actual categories

from sklearn import metrics
print(metrics.confusion_matrix(topic_train, y_train))
print(np.mean(topic_train == y_train) )
print(metrics.classification_report(topic_train, y_train))

[[118  58  18]
 [  3  62 107]
 [ 53   6  50]]
0.484210526316
             precision    recall  f1-score   support

      crude       0.68      0.61      0.64       194
   money-fx       0.49      0.36      0.42       172
      trade       0.29      0.46      0.35       109

avg / total       0.52      0.48      0.49       475



In [21]:
# The typical practice is to use the reserved test set for evaluation
toks_test = X_test.apply(pre_process)
dtm_test = [dictionary.doc2bow(d) for d in toks_test ]
dtopics_test = lda.get_document_topics(dtm_test)
top_test = [ max(t,key=itemgetter(1))[0] for t in dtopics_test ]
topic_test = [ dict[t] for t in top_test ]
print(metrics.confusion_matrix(topic_test, y_test))
print(np.mean(topic_test == y_test) )
print(metrics.classification_report(topic_test, y_test))

[[51 37 13]
 [ 3 40 44]
 [25  3 19]]
0.468085106383
             precision    recall  f1-score   support

      crude       0.65      0.50      0.57       101
   money-fx       0.50      0.46      0.48        87
      trade       0.25      0.40      0.31        47

avg / total       0.51      0.47      0.48       235



In [22]:
# In actual use case, we wouldn't have class labels for comparison.
# So mean topic coherence and model perplexity can be checked.
test_coherence = [ n for _, n in lda.top_topics(dtm_test) ]
np.mean(test_coherence)


-2.3245713610388639

In [23]:
lda.bound(dtm_test)

-162133.88252964919

In [24]:
# Different models can be compared using such metrics

%time lda4 = gensim.models.ldamodel.LdaModel(dtm_train, num_topics = 4, id2word = dictionary, passes = 20)

2017-09-30 12:04:34,145 : INFO : using symmetric alpha at 0.25
2017-09-30 12:04:34,147 : INFO : using symmetric eta at 0.0004108463434675431
2017-09-30 12:04:34,150 : INFO : using serial LDA version on this node
2017-09-30 12:04:34,199 : INFO : running online (multi-pass) LDA training, 4 topics, 20 passes over the supplied corpus of 475 documents, updating model once every 475 documents, evaluating perplexity every 475 documents, iterating 50x with a convergence threshold of 0.001000
2017-09-30 12:04:36,147 : INFO : -8.408 per-word bound, 339.8 perplexity estimate based on a held-out corpus of 475 documents with 50286 words
2017-09-30 12:04:36,147 : INFO : PROGRESS: pass 0, at document #475/475
2017-09-30 12:04:36,845 : INFO : topic #0 (0.250): 0.019*"trade" + 0.011*"oil" + 0.010*"price" + 0.009*"would" + 0.008*"opec" + 0.008*"market" + 0.007*"country" + 0.007*"mln" + 0.006*"official" + 0.006*"currency"
2017-09-30 12:04:36,846 : INFO : topic #1 (0.250): 0.022*"oil" + 0.010*"trade" + 0.

2017-09-30 12:04:49,210 : INFO : topic diff=0.113997, rho=0.353553
2017-09-30 12:04:50,734 : INFO : -6.898 per-word bound, 119.2 perplexity estimate based on a held-out corpus of 475 documents with 50286 words
2017-09-30 12:04:50,735 : INFO : PROGRESS: pass 7, at document #475/475
2017-09-30 12:04:51,173 : INFO : topic #0 (0.250): 0.015*"opec" + 0.014*"price" + 0.012*"market" + 0.010*"dollar" + 0.010*"would" + 0.009*"currency" + 0.009*"trade" + 0.008*"country" + 0.008*"bank" + 0.007*"official"
2017-09-30 12:04:51,174 : INFO : topic #1 (0.250): 0.031*"oil" + 0.011*"bank" + 0.009*"crude" + 0.009*"would" + 0.009*"price" + 0.008*"minister" + 0.008*"dlrs" + 0.008*"ecuador" + 0.008*"rate" + 0.008*"saudi"
2017-09-30 12:04:51,175 : INFO : topic #2 (0.250): 0.037*"trade" + 0.020*"japan" + 0.013*"market" + 0.012*"would" + 0.012*"japanese" + 0.010*"mln" + 0.009*"official" + 0.009*"billion" + 0.009*"bill" + 0.008*"stg"
2017-09-30 12:04:51,176 : INFO : topic #3 (0.250): 0.024*"oil" + 0.019*"dlrs" +

2017-09-30 12:05:04,702 : INFO : topic #1 (0.250): 0.033*"oil" + 0.011*"ecuador" + 0.010*"crude" + 0.010*"bank" + 0.010*"would" + 0.009*"minister" + 0.009*"dlrs" + 0.009*"price" + 0.008*"saudi" + 0.008*"barrel"
2017-09-30 12:05:04,702 : INFO : topic #2 (0.250): 0.040*"trade" + 0.020*"japan" + 0.013*"market" + 0.013*"would" + 0.012*"japanese" + 0.009*"mln" + 0.009*"official" + 0.009*"bill" + 0.008*"stg" + 0.008*"import"
2017-09-30 12:05:04,703 : INFO : topic #3 (0.250): 0.028*"oil" + 0.020*"mln" + 0.020*"dlrs" + 0.015*"billion" + 0.013*"year" + 0.012*"pct" + 0.011*"price" + 0.010*"bpd" + 0.010*"barrel" + 0.009*"crude"
2017-09-30 12:05:04,704 : INFO : topic diff=0.037041, rho=0.250000
2017-09-30 12:05:06,197 : INFO : -6.866 per-word bound, 116.7 perplexity estimate based on a held-out corpus of 475 documents with 50286 words
2017-09-30 12:05:06,198 : INFO : PROGRESS: pass 15, at document #475/475
2017-09-30 12:05:06,591 : INFO : topic #0 (0.250): 0.014*"opec" + 0.014*"market" + 0.013*"do

CPU times: user 39.9 s, sys: 98.8 ms, total: 40 s
Wall time: 40 s


In [25]:
lda4.show_topics()

[(0,
  '0.014*"dollar" + 0.014*"market" + 0.013*"bank" + 0.012*"opec" + 0.012*"price" + 0.011*"currency" + 0.010*"would" + 0.009*"rate" + 0.009*"exchange" + 0.008*"country"'),
 (1,
  '0.035*"oil" + 0.012*"ecuador" + 0.011*"crude" + 0.010*"would" + 0.010*"minister" + 0.009*"dlrs" + 0.009*"price" + 0.009*"saudi" + 0.008*"bank" + 0.008*"barrel"'),
 (2,
  '0.040*"trade" + 0.020*"japan" + 0.013*"would" + 0.013*"market" + 0.012*"japanese" + 0.009*"mln" + 0.009*"official" + 0.009*"bill" + 0.008*"stg" + 0.008*"import"'),
 (3,
  '0.029*"oil" + 0.021*"mln" + 0.020*"dlrs" + 0.016*"billion" + 0.013*"year" + 0.013*"pct" + 0.011*"price" + 0.011*"bpd" + 0.010*"barrel" + 0.010*"crude"')]

In [26]:
test_coherence4 = [ n for _, n in lda4.top_topics(dtm_test) ]
np.mean(test_coherence4)

-2.4709021811762817

In [27]:
%time lda2 = gensim.models.ldamodel.LdaModel(dtm_train, num_topics = 2, id2word = dictionary, passes = 20)
lda2.show_topics()

2017-09-30 12:05:14,180 : INFO : using symmetric alpha at 0.5
2017-09-30 12:05:14,181 : INFO : using symmetric eta at 0.0004108463434675431
2017-09-30 12:05:14,183 : INFO : using serial LDA version on this node
2017-09-30 12:05:14,209 : INFO : running online (multi-pass) LDA training, 2 topics, 20 passes over the supplied corpus of 475 documents, updating model once every 475 documents, evaluating perplexity every 475 documents, iterating 50x with a convergence threshold of 0.001000
2017-09-30 12:05:15,982 : INFO : -8.212 per-word bound, 296.4 perplexity estimate based on a held-out corpus of 475 documents with 50286 words
2017-09-30 12:05:15,983 : INFO : PROGRESS: pass 0, at document #475/475
2017-09-30 12:05:16,668 : INFO : topic #0 (0.500): 0.014*"oil" + 0.013*"trade" + 0.011*"would" + 0.009*"dlrs" + 0.009*"mln" + 0.009*"billion" + 0.008*"year" + 0.008*"bank" + 0.007*"japan" + 0.007*"pct"
2017-09-30 12:05:16,669 : INFO : topic #1 (0.500): 0.020*"trade" + 0.015*"oil" + 0.011*"mln" + 

2017-09-30 12:05:40,178 : INFO : PROGRESS: pass 11, at document #475/475
2017-09-30 12:05:40,670 : INFO : topic #0 (0.500): 0.015*"oil" + 0.013*"bank" + 0.012*"mln" + 0.012*"billion" + 0.011*"dlrs" + 0.009*"pct" + 0.009*"would" + 0.008*"market" + 0.008*"year" + 0.008*"dollar"
2017-09-30 12:05:40,672 : INFO : topic #1 (0.500): 0.027*"trade" + 0.014*"oil" + 0.013*"japan" + 0.011*"opec" + 0.011*"price" + 0.010*"would" + 0.009*"official" + 0.009*"market" + 0.009*"japanese" + 0.008*"mln"
2017-09-30 12:05:40,672 : INFO : topic diff=0.027262, rho=0.277350
2017-09-30 12:05:42,253 : INFO : -6.927 per-word bound, 121.7 perplexity estimate based on a held-out corpus of 475 documents with 50286 words
2017-09-30 12:05:42,254 : INFO : PROGRESS: pass 12, at document #475/475
2017-09-30 12:05:42,733 : INFO : topic #0 (0.500): 0.015*"oil" + 0.013*"bank" + 0.012*"mln" + 0.012*"billion" + 0.012*"dlrs" + 0.010*"pct" + 0.009*"would" + 0.008*"market" + 0.008*"year" + 0.008*"dollar"
2017-09-30 12:05:42,734 :

CPU times: user 42 s, sys: 118 ms, total: 42.1 s
Wall time: 42.2 s


[(0,
  '0.017*"oil" + 0.014*"bank" + 0.013*"mln" + 0.012*"dlrs" + 0.012*"billion" + 0.010*"pct" + 0.008*"market" + 0.008*"would" + 0.008*"dollar" + 0.008*"year"'),
 (1,
  '0.028*"trade" + 0.013*"japan" + 0.012*"oil" + 0.011*"opec" + 0.010*"would" + 0.010*"price" + 0.009*"official" + 0.009*"japanese" + 0.008*"market" + 0.007*"bpd"')]

In [28]:
test_coherence2 = [ n for _, n in lda2.top_topics(dtm_test) ]
np.mean(test_coherence2)

-2.1574553582018892

In [29]:
%time lsi = gensim.models.LsiModel(dtm_train, id2word=dictionary, num_topics=200)


2017-09-30 12:05:56,389 : INFO : using serial LSI version on this node
2017-09-30 12:05:56,391 : INFO : updating model with new documents
2017-09-30 12:05:56,392 : INFO : preparing a new chunk of documents
2017-09-30 12:05:56,404 : INFO : using 100 extra samples and 2 power iterations
2017-09-30 12:05:56,406 : INFO : 1st phase: constructing (2434, 300) action matrix
2017-09-30 12:05:56,421 : INFO : orthonormalizing (2434, 300) action matrix
2017-09-30 12:05:56,531 : INFO : 2nd phase: running dense svd on (300, 475) matrix
2017-09-30 12:05:56,553 : INFO : computing the final decomposition
2017-09-30 12:05:56,554 : INFO : keeping 200 factors (discarding 3.966% of energy spectrum)
2017-09-30 12:05:56,567 : INFO : processed documents up to #475
2017-09-30 12:05:56,568 : INFO : topic #0(149.574): 0.337*"trade" + 0.324*"oil" + 0.208*"price" + 0.204*"mln" + 0.189*"would" + 0.181*"opec" + 0.180*"dlrs" + 0.164*"japan" + 0.162*"market" + 0.162*"year"
2017-09-30 12:05:56,569 : INFO : topic #1(107

CPU times: user 302 ms, sys: 28.4 ms, total: 330 ms
Wall time: 184 ms


In [30]:
lsi.print_topics(5)


2017-09-30 12:05:56,579 : INFO : topic #0(149.574): 0.337*"trade" + 0.324*"oil" + 0.208*"price" + 0.204*"mln" + 0.189*"would" + 0.181*"opec" + 0.180*"dlrs" + 0.164*"japan" + 0.162*"market" + 0.162*"year"
2017-09-30 12:05:56,580 : INFO : topic #1(107.523): 0.519*"trade" + -0.388*"oil" + -0.268*"opec" + -0.237*"bpd" + -0.226*"mln" + 0.224*"japan" + -0.214*"price" + -0.138*"crude" + -0.120*"barrel" + 0.119*"japanese"
2017-09-30 12:05:56,581 : INFO : topic #2(76.511): -0.418*"dollar" + -0.405*"bank" + 0.370*"trade" + -0.238*"currency" + -0.199*"market" + -0.158*"rate" + -0.147*"yen" + -0.146*"dealer" + -0.136*"central" + -0.127*"exchange"
2017-09-30 12:05:56,583 : INFO : topic #3(66.218): 0.424*"oil" + -0.395*"mln" + -0.307*"bpd" + -0.262*"opec" + 0.219*"dlrs" + -0.189*"japan" + 0.171*"billion" + 0.147*"pct" + -0.139*"japanese" + -0.136*"output"
2017-09-30 12:05:56,584 : INFO : topic #4(64.927): -0.490*"mln" + 0.310*"price" + -0.275*"billion" + 0.244*"opec" + 0.218*"japan" + 0.189*"japanes

[(0,
  '0.337*"trade" + 0.324*"oil" + 0.208*"price" + 0.204*"mln" + 0.189*"would" + 0.181*"opec" + 0.180*"dlrs" + 0.164*"japan" + 0.162*"market" + 0.162*"year"'),
 (1,
  '0.519*"trade" + -0.388*"oil" + -0.268*"opec" + -0.237*"bpd" + -0.226*"mln" + 0.224*"japan" + -0.214*"price" + -0.138*"crude" + -0.120*"barrel" + 0.119*"japanese"'),
 (2,
  '-0.418*"dollar" + -0.405*"bank" + 0.370*"trade" + -0.238*"currency" + -0.199*"market" + -0.158*"rate" + -0.147*"yen" + -0.146*"dealer" + -0.136*"central" + -0.127*"exchange"'),
 (3,
  '0.424*"oil" + -0.395*"mln" + -0.307*"bpd" + -0.262*"opec" + 0.219*"dlrs" + -0.189*"japan" + 0.171*"billion" + 0.147*"pct" + -0.139*"japanese" + -0.136*"output"'),
 (4,
  '-0.490*"mln" + 0.310*"price" + -0.275*"billion" + 0.244*"opec" + 0.218*"japan" + 0.189*"japanese" + -0.181*"bank" + -0.174*"bpd" + 0.162*"official" + -0.122*"stg"')]

In [31]:
lsi_test = lsi[dtm_test]

In [32]:
lsi_test[0]


[(0, 2.8907380134824248),
 (1, 1.7909116503288862),
 (2, 0.53732206998022403),
 (3, 1.8450129666330857),
 (4, -4.3388467814575149),
 (5, -0.59046079259362416),
 (6, -5.5712525778407649),
 (7, -2.6101993682396474),
 (8, -3.4441187832214868),
 (9, 0.29787990358551064),
 (10, 0.40228719627415355),
 (11, -3.5223807190680203),
 (12, 0.18187134074060279),
 (13, -2.927972338628666),
 (14, -0.18466128367876561),
 (15, 0.099085363438607577),
 (16, 0.21464097942929763),
 (17, 0.49463065760229241),
 (18, 0.22348374876822699),
 (19, -1.8086568914286665),
 (20, 0.18854768490759369),
 (21, 1.6099524171346722),
 (22, -1.3746290019633916),
 (23, -0.152365894594062),
 (24, -0.99847647272122553),
 (25, 1.4038651305260095),
 (26, 2.1472986468808428),
 (27, -1.7718058031406143),
 (28, 1.3820569709156052),
 (29, 0.59678488475779023),
 (30, 0.19047721534648515),
 (31, 0.047812233361052496),
 (32, 0.16049539411763813),
 (33, -0.86957451749894488),
 (34, 0.96116223372872656),
 (35, 0.83582009232554311),
 (36,