In [2]:
# In this workshop we perform topic modeling using gensim

# The "Class" labels here are only used for sanity check of the topics discovered later.
# Remember, in actual use of topic modelling, the documents DON'T come with labeled classes.
# It's unsupervised learning.

import pandas as pd
import os
os.chdir(r"C:\Users\hp\Desktop\TM\Week4")
news=pd.read_table('r8-train-all-terms.txt',header=None,names = ["Class", "Text"])
subnews=news[(news.Class=="trade")| (news.Class=='crude')|(news.Class=='money-fx') ]
subnews.head()

Unnamed: 0,Class,Text
15,trade,brazil anti inflation plan limps to anniversar...
43,crude,diamond shamrock dia cuts crude prices diamond...
55,crude,opec may have to meet to firm prices analysts ...
76,crude,texaco canada cuts crude prices canadian cts b...
77,crude,texaco canada txc lowers crude postings texaco...


In [3]:
# Let's use the similar preprocessing we used last week.
# The output of each document is a list of tokens.

import nltk
from nltk.corpus import stopwords
mystopwords=stopwords.words("English") + ['one', 'become', 'get', 'make', 'take']
WNlemma = nltk.WordNetLemmatizer()

def pre_process(text):
    tokens = nltk.word_tokenize(text)
    tokens=[ WNlemma.lemmatize(t.lower()) for t in tokens]
    tokens=[ t for t in tokens if t not in mystopwords]
    tokens = [ t for t in tokens if len(t) >= 3 ]
    return(tokens)

In [4]:
#split the data into training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(subnews.Text, subnews.Class, test_size=0.33, random_state=12)


In [5]:
# Apply preprocessing to every document in the training set.
toks_train = X_train.apply(pre_process)
toks_train

1807    [house, panel, approves, trade, bill, house, w...
3295    [zimbabwe, dollar, devaluation, central, bank,...
5162    [swedish, prime, minister, china, visit, boost...
2151    [canada, prime, minister, say, major, trade, d...
4418    [mexican, first, qtr, crude, export, seen, dlr...
3531    [canada, outline, gatt, agricultural, reform, ...
2539    [kuwait, say, opec, member, violating, output,...
2257    [brazil, trade, surplus, doubled, february, br...
3548    [yeutter, say, japan, trade, difficulty, remai...
319     [china, call, better, trade, deal, china, call...
2581    [rotterdam, sign, cooperation, agreement, toky...
5386    [italy, say, give, chance, verify, tokyo, acco...
3915    [ecuador, negotiates, nigeria, lending, oil, e...
4632    [stock, market, overreacts, tariff, yeutter, t...
4826    [money, market, offered, early, assistance, ba...
4718    [senior, official, visit, japan, trade, row, g...
78      [marathon, petroleum, reduces, crude, posting,...
1829    [bank,

In [6]:
import logging
import gensim 
from gensim import corpora

#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)



In [7]:
# Prepare a vocabulary dictionary.
dictionary = corpora.Dictionary(toks_train)
print(dictionary)

Dictionary(5971 unique tokens: ['house', 'panel', 'approves', 'trade', 'bill']...)


In [9]:
# It's mappings between ids and tokens
# To get a token's id
dictionary.token2id['exchange']

108

In [10]:
# To get the token of an id
dictionary[157]

'help'

In [8]:
# Filter off any words with document frequency less than 3, or appearing in more than 80% documents
dictionary.filter_extremes(no_below=3, no_above=0.7)
print(dictionary)

Dictionary(2434 unique tokens: ['house', 'panel', 'approves', 'trade', 'bill']...)


In [9]:
# Use the dictionary to prepare a DTM (using TF)
dtm_train = [dictionary.doc2bow(d) for d in toks_train ]

In [13]:
# Build an LDA model for 3 topics out of the DTM
%time lda = gensim.models.ldamodel.LdaModel(dtm_train, num_topics = 3, id2word = dictionary, passes = 20)


2017-10-03 21:16:03,293 : INFO : using symmetric alpha at 0.3333333333333333
2017-10-03 21:16:03,297 : INFO : using symmetric eta at 0.0004108463434675431
2017-10-03 21:16:03,300 : INFO : using serial LDA version on this node
2017-10-03 21:16:03,354 : INFO : running online (multi-pass) LDA training, 3 topics, 20 passes over the supplied corpus of 475 documents, updating model once every 475 documents, evaluating perplexity every 475 documents, iterating 50x with a convergence threshold of 0.001000
2017-10-03 21:16:07,938 : INFO : -8.320 per-word bound, 319.5 perplexity estimate based on a held-out corpus of 475 documents with 50286 words
2017-10-03 21:16:07,939 : INFO : PROGRESS: pass 0, at document #475/475
2017-10-03 21:16:09,624 : INFO : topic #0 (0.333): 0.019*"oil" + 0.014*"trade" + 0.011*"mln" + 0.010*"would" + 0.009*"billion" + 0.009*"year" + 0.008*"market" + 0.008*"dlrs" + 0.008*"bank" + 0.008*"last"
2017-10-03 21:16:09,628 : INFO : topic #1 (0.333): 0.019*"trade" + 0.011*"japa

2017-10-03 21:16:50,684 : INFO : topic #2 (0.333): 0.017*"bank" + 0.016*"oil" + 0.016*"mln" + 0.015*"market" + 0.013*"stg" + 0.012*"dlrs" + 0.012*"dollar" + 0.011*"price" + 0.009*"crude" + 0.008*"today"
2017-10-03 21:16:50,688 : INFO : topic diff=0.063487, rho=0.316228
2017-10-03 21:16:54,438 : INFO : -6.866 per-word bound, 116.6 perplexity estimate based on a held-out corpus of 475 documents with 50286 words
2017-10-03 21:16:54,440 : INFO : PROGRESS: pass 9, at document #475/475
2017-10-03 21:16:55,252 : INFO : topic #0 (0.333): 0.030*"oil" + 0.017*"mln" + 0.013*"opec" + 0.013*"dlrs" + 0.013*"price" + 0.012*"billion" + 0.012*"bpd" + 0.010*"pct" + 0.010*"year" + 0.008*"last"
2017-10-03 21:16:55,255 : INFO : topic #1 (0.333): 0.033*"trade" + 0.016*"japan" + 0.012*"would" + 0.009*"japanese" + 0.008*"country" + 0.008*"market" + 0.008*"official" + 0.008*"year" + 0.007*"agreement" + 0.006*"state"
2017-10-03 21:16:55,260 : INFO : topic #2 (0.333): 0.018*"bank" + 0.016*"mln" + 0.016*"oil" + 0

2017-10-03 21:17:26,885 : INFO : topic diff=0.023519, rho=0.229416
2017-10-03 21:17:29,186 : INFO : -6.845 per-word bound, 115.0 perplexity estimate based on a held-out corpus of 475 documents with 50286 words
2017-10-03 21:17:29,187 : INFO : PROGRESS: pass 18, at document #475/475
2017-10-03 21:17:29,770 : INFO : topic #0 (0.333): 0.033*"oil" + 0.018*"mln" + 0.015*"price" + 0.015*"opec" + 0.014*"dlrs" + 0.012*"bpd" + 0.012*"billion" + 0.010*"pct" + 0.010*"year" + 0.009*"barrel"
2017-10-03 21:17:29,771 : INFO : topic #1 (0.333): 0.033*"trade" + 0.016*"japan" + 0.012*"would" + 0.009*"japanese" + 0.008*"country" + 0.008*"year" + 0.008*"market" + 0.008*"official" + 0.007*"agreement" + 0.006*"state"
2017-10-03 21:17:29,774 : INFO : topic #2 (0.333): 0.024*"bank" + 0.018*"market" + 0.017*"mln" + 0.016*"dollar" + 0.015*"stg" + 0.013*"oil" + 0.011*"dlrs" + 0.010*"today" + 0.009*"money" + 0.009*"price"
2017-10-03 21:17:29,776 : INFO : topic diff=0.021925, rho=0.223607
2017-10-03 21:17:32,033 :

Wall time: 1min 29s


In [14]:
# To see the topics, with the most probable words in each topic. What topics to you see? 
lda.show_topics()

[(0,
  '0.034*"oil" + 0.018*"mln" + 0.015*"price" + 0.015*"opec" + 0.014*"dlrs" + 0.012*"bpd" + 0.012*"billion" + 0.010*"pct" + 0.010*"year" + 0.009*"barrel"'),
 (1,
  '0.033*"trade" + 0.016*"japan" + 0.012*"would" + 0.009*"japanese" + 0.008*"country" + 0.008*"year" + 0.008*"market" + 0.008*"official" + 0.007*"agreement" + 0.006*"state"'),
 (2,
  '0.025*"bank" + 0.018*"market" + 0.017*"mln" + 0.017*"dollar" + 0.015*"stg" + 0.012*"oil" + 0.011*"dlrs" + 0.010*"today" + 0.010*"money" + 0.009*"currency"')]

In [15]:
# You can also request to see more words per topic
lda.show_topics(num_words=20)

[(0,
  '0.034*"oil" + 0.018*"mln" + 0.015*"price" + 0.015*"opec" + 0.014*"dlrs" + 0.012*"bpd" + 0.012*"billion" + 0.010*"pct" + 0.010*"year" + 0.009*"barrel" + 0.009*"crude" + 0.008*"last" + 0.008*"would" + 0.006*"export" + 0.006*"ecuador" + 0.006*"production" + 0.006*"saudi" + 0.006*"february" + 0.006*"output" + 0.005*"day"'),
 (1,
  '0.033*"trade" + 0.016*"japan" + 0.012*"would" + 0.009*"japanese" + 0.008*"country" + 0.008*"year" + 0.008*"market" + 0.008*"official" + 0.007*"agreement" + 0.006*"state" + 0.006*"foreign" + 0.006*"import" + 0.005*"last" + 0.005*"minister" + 0.005*"world" + 0.005*"united" + 0.005*"export" + 0.005*"gatt" + 0.005*"economic" + 0.005*"tariff"'),
 (2,
  '0.025*"bank" + 0.018*"market" + 0.017*"mln" + 0.017*"dollar" + 0.015*"stg" + 0.012*"oil" + 0.011*"dlrs" + 0.010*"today" + 0.010*"money" + 0.009*"currency" + 0.009*"rate" + 0.009*"price" + 0.008*"crude" + 0.008*"exchange" + 0.007*"canada" + 0.007*"pct" + 0.007*"yen" + 0.007*"around" + 0.006*"central" + 0.006*"p

In [16]:
# A similar function showing each topic with its most probable words and its topic coherence score
lda.top_topics(dtm_train)

[([(0.033248659356522464, 'trade'),
   (0.015861212710266071, 'japan'),
   (0.012301104840762366, 'would'),
   (0.0088471521864590986, 'japanese'),
   (0.0083297970173540556, 'country'),
   (0.0079208636502411268, 'year'),
   (0.0077763195740064026, 'market'),
   (0.007684073694710295, 'official'),
   (0.0066064385340074642, 'agreement'),
   (0.0062598500869903108, 'state'),
   (0.0057483198812265544, 'foreign'),
   (0.0055023687278431058, 'import'),
   (0.0053915523730109225, 'last'),
   (0.0051648919906105073, 'minister'),
   (0.0051149630467510407, 'world'),
   (0.0050929184390505686, 'united'),
   (0.0049989542470142753, 'export'),
   (0.0048016461879245992, 'gatt'),
   (0.0045341884751624977, 'economic'),
   (0.0045180893731752887, 'tariff')],
  -1.1128509952861618),
 ([(0.033510021512893916, 'oil'),
   (0.018118762065835653, 'mln'),
   (0.01510540212215775, 'price'),
   (0.014811866280453024, 'opec'),
   (0.014310576371810305, 'dlrs'),
   (0.012195776831997797, 'bpd'),
   (0.0119

In [17]:
# We can therefore derive the average topic coherence, as a way to evaluate the topic models
import numpy as np
lda_coherence = [ n for _, n in lda.top_topics(dtm_train) ]
np.mean(lda_coherence)

-1.7142716864650136

In [18]:
# Another metric for gauging goodness of models, perplexity, is accessed using bound() function
lda.bound(dtm_train)

-344107.22313101846

In [19]:
# Get the topic distribution of documents
dtopics_train = lda.get_document_topics(dtm_train)

In [20]:
# Get the topic likelihood for the first document in train set
for i in range(0, 5):
    print(dtopics_train[i])

[(1, 0.99375180020339848)]
[(0, 0.45938784658817433), (2, 0.53086222942136096)]
[(0, 0.43491041341446957), (1, 0.55912285878508083)]
[(0, 0.044535510354251087), (1, 0.88899277576490243), (2, 0.066471713880846467)]
[(0, 0.98983114486066948)]


In [22]:
print(y_train)

1807       trade
3295    money-fx
5162       trade
2151       trade
4418       crude
3531       trade
2539       crude
2257       trade
3548       trade
319        trade
2581       trade
5386    money-fx
3915       crude
4632       trade
4826    money-fx
4718       trade
78         crude
1829       trade
2429       crude
710        crude
114        crude
757        crude
2356       trade
1797       crude
4234    money-fx
2650    money-fx
1560       crude
4339    money-fx
4238    money-fx
4730       trade
          ...   
3053       trade
3621    money-fx
1196       crude
5279       crude
3512       crude
3514       trade
975        trade
2330    money-fx
847        trade
715        crude
2100    money-fx
1768       crude
1028       crude
3288       trade
3318       crude
696        crude
4756       crude
3742    money-fx
3513    money-fx
3819       trade
5148       trade
476        crude
2188       trade
3545       crude
2039       trade
4890    money-fx
1949       trade
2021    money-

In [24]:
# Pick the topic with the highest probability for each document, map it to the label
# NOTE: the mapping may change in a different run
from operator import itemgetter
top_train = [ max(t, key=itemgetter(1))[0] for t in dtopics_train ]
dict = {1: 'crude', 0: 'money-fx', 2: 'trade'}
topic_train = [ dict[t] for t in top_train ]

In [25]:
# Now let's see how well these topics match the actual categories

from sklearn import metrics
print(metrics.confusion_matrix(topic_train, y_train))
print(np.mean(topic_train == y_train) )
print(metrics.classification_report(topic_train, y_train))

[[  4  26 151]
 [120  28  22]
 [ 50  72   2]]
0.0715789473684
             precision    recall  f1-score   support

      crude       0.02      0.02      0.02       181
   money-fx       0.22      0.16      0.19       170
      trade       0.01      0.02      0.01       124

avg / total       0.09      0.07      0.08       475



In [26]:
# The typical practice is to use the reserved test set for evaluation
toks_test = X_test.apply(pre_process)
dtm_test = [dictionary.doc2bow(d) for d in toks_test ]
dtopics_test = lda.get_document_topics(dtm_test)
top_test = [ max(t,key=itemgetter(1))[0] for t in dtopics_test ]
topic_test = [ dict[t] for t in top_test ]
print(metrics.confusion_matrix(topic_test, y_test))
print(np.mean(topic_test == y_test) )
print(metrics.classification_report(topic_test, y_test))

[[ 6 28 62]
 [57 16 14]
 [16 36  0]]
0.0936170212766
             precision    recall  f1-score   support

      crude       0.08      0.06      0.07        96
   money-fx       0.20      0.18      0.19        87
      trade       0.00      0.00      0.00        52

avg / total       0.11      0.09      0.10       235



In [27]:
# In actual use case, we wouldn't have class labels for comparison.
# So mean topic coherence and model perplexity can be checked.
test_coherence = [ n for _, n in lda.top_topics(dtm_test) ]
np.mean(test_coherence)



-2.1187980706387388

In [28]:
lda.bound(dtm_test)

-161594.44843775372

In [29]:
# Different models can be compared using such metrics

%time lda4 = gensim.models.ldamodel.LdaModel(dtm_train, num_topics = 4, id2word = dictionary, passes = 20)

2017-10-03 21:39:05,241 : INFO : using symmetric alpha at 0.25
2017-10-03 21:39:05,245 : INFO : using symmetric eta at 0.0004108463434675431
2017-10-03 21:39:05,248 : INFO : using serial LDA version on this node
2017-10-03 21:39:05,313 : INFO : running online (multi-pass) LDA training, 4 topics, 20 passes over the supplied corpus of 475 documents, updating model once every 475 documents, evaluating perplexity every 475 documents, iterating 50x with a convergence threshold of 0.001000
2017-10-03 21:39:10,459 : INFO : -8.412 per-word bound, 340.6 perplexity estimate based on a held-out corpus of 475 documents with 50286 words
2017-10-03 21:39:10,462 : INFO : PROGRESS: pass 0, at document #475/475
2017-10-03 21:39:12,100 : INFO : topic #0 (0.250): 0.017*"mln" + 0.012*"would" + 0.011*"oil" + 0.010*"bank" + 0.010*"market" + 0.009*"opec" + 0.008*"year" + 0.008*"price" + 0.008*"pct" + 0.007*"dlrs"
2017-10-03 21:39:12,103 : INFO : topic #1 (0.250): 0.025*"trade" + 0.019*"oil" + 0.010*"dlrs" + 

2017-10-03 21:39:43,610 : INFO : topic diff=0.126428, rho=0.353553
2017-10-03 21:39:47,632 : INFO : -6.868 per-word bound, 116.8 perplexity estimate based on a held-out corpus of 475 documents with 50286 words
2017-10-03 21:39:47,634 : INFO : PROGRESS: pass 7, at document #475/475
2017-10-03 21:39:48,599 : INFO : topic #0 (0.250): 0.024*"mln" + 0.019*"bank" + 0.015*"market" + 0.013*"opec" + 0.011*"dollar" + 0.010*"currency" + 0.010*"saudi" + 0.010*"stg" + 0.009*"price" + 0.009*"oil"
2017-10-03 21:39:48,602 : INFO : topic #1 (0.250): 0.024*"trade" + 0.010*"year" + 0.010*"country" + 0.010*"oil" + 0.007*"world" + 0.007*"price" + 0.007*"japan" + 0.007*"dlrs" + 0.007*"last" + 0.006*"would"
2017-10-03 21:39:48,607 : INFO : topic #2 (0.250): 0.046*"oil" + 0.018*"crude" + 0.017*"dlrs" + 0.017*"price" + 0.016*"barrel" + 0.012*"mln" + 0.011*"ecuador" + 0.011*"company" + 0.011*"bpd" + 0.009*"would"
2017-10-03 21:39:48,611 : INFO : topic #3 (0.250): 0.035*"trade" + 0.019*"japan" + 0.014*"would" + 

2017-10-03 21:40:22,734 : INFO : topic #1 (0.250): 0.022*"trade" + 0.010*"year" + 0.010*"country" + 0.009*"world" + 0.007*"gatt" + 0.007*"would" + 0.007*"export" + 0.006*"analyst" + 0.006*"agreement" + 0.006*"last"
2017-10-03 21:40:22,738 : INFO : topic #2 (0.250): 0.052*"oil" + 0.020*"crude" + 0.018*"dlrs" + 0.018*"price" + 0.017*"barrel" + 0.013*"mln" + 0.012*"bpd" + 0.011*"company" + 0.010*"ecuador" + 0.009*"pct"
2017-10-03 21:40:22,741 : INFO : topic #3 (0.250): 0.038*"trade" + 0.021*"japan" + 0.014*"would" + 0.012*"japanese" + 0.009*"official" + 0.009*"market" + 0.009*"import" + 0.008*"year" + 0.008*"foreign" + 0.008*"billion"
2017-10-03 21:40:22,744 : INFO : topic diff=0.041728, rho=0.250000
2017-10-03 21:40:26,541 : INFO : -6.830 per-word bound, 113.7 perplexity estimate based on a held-out corpus of 475 documents with 50286 words
2017-10-03 21:40:26,543 : INFO : PROGRESS: pass 15, at document #475/475
2017-10-03 21:40:27,458 : INFO : topic #0 (0.250): 0.023*"mln" + 0.020*"bank"

Wall time: 1min 39s


In [30]:
lda4.show_topics()

[(0,
  '0.023*"mln" + 0.021*"bank" + 0.017*"market" + 0.014*"opec" + 0.014*"billion" + 0.014*"dollar" + 0.011*"currency" + 0.010*"saudi" + 0.010*"stg" + 0.009*"february"'),
 (1,
  '0.022*"trade" + 0.011*"year" + 0.011*"country" + 0.009*"world" + 0.008*"would" + 0.007*"gatt" + 0.007*"export" + 0.007*"analyst" + 0.006*"china" + 0.006*"agreement"'),
 (2,
  '0.054*"oil" + 0.020*"crude" + 0.019*"dlrs" + 0.018*"price" + 0.018*"barrel" + 0.014*"mln" + 0.013*"bpd" + 0.011*"company" + 0.010*"ecuador" + 0.009*"pct"'),
 (3,
  '0.039*"trade" + 0.022*"japan" + 0.014*"would" + 0.013*"japanese" + 0.009*"official" + 0.009*"market" + 0.009*"import" + 0.008*"year" + 0.008*"foreign" + 0.007*"state"')]

In [31]:
test_coherence4 = [ n for _, n in lda4.top_topics(dtm_test) ]
np.mean(test_coherence4)

-2.4483883157716071

In [32]:
%time lda2 = gensim.models.ldamodel.LdaModel(dtm_train, num_topics = 2, id2word = dictionary, passes = 20)
lda2.show_topics()

2017-10-03 21:42:24,321 : INFO : using symmetric alpha at 0.5
2017-10-03 21:42:24,324 : INFO : using symmetric eta at 0.0004108463434675431
2017-10-03 21:42:24,326 : INFO : using serial LDA version on this node
2017-10-03 21:42:24,361 : INFO : running online (multi-pass) LDA training, 2 topics, 20 passes over the supplied corpus of 475 documents, updating model once every 475 documents, evaluating perplexity every 475 documents, iterating 50x with a convergence threshold of 0.001000
2017-10-03 21:42:29,230 : INFO : -8.218 per-word bound, 297.7 perplexity estimate based on a held-out corpus of 475 documents with 50286 words
2017-10-03 21:42:29,231 : INFO : PROGRESS: pass 0, at document #475/475
2017-10-03 21:42:30,931 : INFO : topic #0 (0.500): 0.018*"trade" + 0.015*"oil" + 0.013*"mln" + 0.010*"would" + 0.009*"market" + 0.009*"price" + 0.008*"japan" + 0.008*"dlrs" + 0.007*"bank" + 0.007*"billion"
2017-10-03 21:42:30,934 : INFO : topic #1 (0.500): 0.014*"oil" + 0.014*"trade" + 0.010*"dlr

2017-10-03 21:43:28,880 : INFO : PROGRESS: pass 11, at document #475/475
2017-10-03 21:43:30,221 : INFO : topic #0 (0.500): 0.019*"oil" + 0.017*"mln" + 0.013*"bank" + 0.011*"opec" + 0.010*"price" + 0.010*"billion" + 0.010*"market" + 0.010*"dlrs" + 0.009*"would" + 0.009*"bpd"
2017-10-03 21:43:30,224 : INFO : topic #1 (0.500): 0.025*"trade" + 0.014*"japan" + 0.010*"oil" + 0.010*"would" + 0.009*"japanese" + 0.007*"official" + 0.007*"dlrs" + 0.007*"market" + 0.007*"year" + 0.006*"minister"
2017-10-03 21:43:30,228 : INFO : topic diff=0.033943, rho=0.277350
2017-10-03 21:43:33,899 : INFO : -6.932 per-word bound, 122.1 perplexity estimate based on a held-out corpus of 475 documents with 50286 words
2017-10-03 21:43:33,902 : INFO : PROGRESS: pass 12, at document #475/475
2017-10-03 21:43:35,056 : INFO : topic #0 (0.500): 0.019*"oil" + 0.017*"mln" + 0.013*"bank" + 0.011*"price" + 0.011*"opec" + 0.010*"billion" + 0.010*"dlrs" + 0.010*"market" + 0.009*"would" + 0.009*"bpd"
2017-10-03 21:43:35,059

Wall time: 1min 45s


[(0,
  '0.022*"oil" + 0.018*"mln" + 0.014*"bank" + 0.011*"price" + 0.011*"dlrs" + 0.011*"opec" + 0.011*"billion" + 0.010*"market" + 0.009*"bpd" + 0.008*"pct"'),
 (1,
  '0.029*"trade" + 0.015*"japan" + 0.010*"would" + 0.009*"japanese" + 0.007*"year" + 0.007*"official" + 0.007*"oil" + 0.007*"market" + 0.006*"country" + 0.006*"dlrs"')]

In [None]:
test_coherence2 = [ n for _, n in lda2.top_topics(dtm_test) ]
np.mean(test_coherence2)

In [None]:
%time lsi = gensim.models.LsiModel(dtm_train, id2word=dictionary, num_topics=200)


In [None]:
lsi.print_topics(5)


In [None]:
lsi_test = lsi[dtm_test]

In [None]:
lsi_test[0]
