# gensim library for topic modelling

In [30]:
from gensim import corpora, models

In [31]:
# Load the corpus stored in the UCI bag-of-words format
# (document/word counts file plus the matching vocabulary file),
# then build the id -> token dictionary from that corpus.
data = corpora.UciCorpus(fname='docword.xkcd.txt', fname_vocab='vocab.xkcd.txt')
dictionary = data.create_dictionary()

In [40]:
# one-line summary of the loaded corpus
print(str(data))

MmCorpus(1265 documents, 14352 features, 87409 non-zero entries)


In [41]:
# one-line summary of the dictionary built from the corpus
print(str(dictionary))

Dictionary(14352 unique tokens: [b'boy', b'sits', b'barrel', b'floating', b'ocean']...)


In [32]:
# fit the model
%time ldamodel = models.ldamodel.LdaModel(data, id2word = dictionary, num_topics=5, passes=40, alpha=1.25, eta=1.25)

Wall time: 49.7 s


In [33]:
# persist the fitted model to disk so the slow fitting step does not have to be repeated
ldamodel.save(fname='ldamodel_xkcd')

In [34]:
# restore the previously saved model from disk
ldamodel = models.LdaModel.load('ldamodel_xkcd')

In [35]:
# Show the 10 highest-weighted words of every topic.
# The number of topics is read from the model itself, so it always matches
# the num_topics the model was fitted with.
for topic_id, top_words in ldamodel.print_topics(num_topics=ldamodel.num_topics, num_words=10):
    print(f'topic: {topic_id} : {top_words}')

topic: 0 : 0.001*"b'paul'" + 0.001*"b'within'" + 0.001*"b'goggles'" + 0.001*"b'ron'" + 0.001*"b'relation'" + 0.001*"b'jelly'" + 0.001*"b'accurate'" + 0.001*"b'bean'" + 0.001*"b'link'" + 0.001*"b'han'"
topic: 1 : 0.002*"b'scientist'" + 0.001*"b'degree'" + 0.001*"b'mark'" + 0.001*"b'leopard'" + 0.001*"b'nathan'" + 0.001*"b'summer'" + 0.001*"b'centigrade'" + 0.001*"b'shark'" + 0.001*"b'hurricane'" + 0.001*"b'marie'"
topic: 2 : 0.001*"b'man'" + 0.001*"b'reference'" + 0.001*"b'turtle'" + 0.001*"b'boomerang'" + 0.001*"b'radio'" + 0.001*"b'destroy'" + 0.001*"b'yada'" + 0.001*"b'flu'" + 0.001*"b'boom'" + 0.001*"b'lake'"
topic: 3 : 0.015*"b'man'" + 0.012*"b'text'" + 0.012*"b'person'" + 0.010*"b'title'" + 0.008*"b'guy'" + 0.007*"b'one'" + 0.005*"b'girl'" + 0.005*"b'just'" + 0.005*"b'two'" + 0.005*"b'hat'"
topic: 4 : 0.025*"b'man'" + 0.017*"b'woman'" + 0.004*"b'boy'" + 0.003*"b'text'" + 0.003*"b'title'" + 0.002*"b'day'" + 0.001*"b'get'" + 0.001*"b'voice'" + 0.001*"b'female'" + 0.001*"b'girl'"


In [36]:
# log_perplexity gives the logarithmic (per-word) bound for the corpus,
# i.e. the exponent of the perplexity with the sign flipped
perplexity = ldamodel.log_perplexity(list(data))
# print the raw value first, then bring it to another form: perplexity = 2 ** (-bound)
print(perplexity, pow(2, -perplexity), sep='\n')

-8.473038992306169
355.3357397418994


In [37]:
# Variational bound for the whole corpus. Dividing it by the total number of
# word occurrences gives the per-word bound, and 2 ** (-per-word bound) is the perplexity.
perp = ldamodel.bound(data)
# total word count, taken from the corpus itself so the cell keeps working for any corpus
n_words = sum(count for document in data for _, count in document)
print(perp)
print(2**(-perp/float(n_words)))

-740619.8654311746
355.33574017213266


In [38]:
# Continue training the model on documents from a new corpus.
# Since the teacher forgot to upload the 2nd corpus, the same files are loaded again.
data2 = corpora.UciCorpus(fname='docword.xkcd.txt', fname_vocab='vocab.xkcd.txt')
ldamodel.update(corpus=data2, passes=10)

In [39]:
# Topic distribution for one specific document (the first one in the corpus).
# Only the first document is read; the rest of the corpus is not loaded into memory.
doc = next(iter(data))
print(doc)
print(ldamodel.get_document_topics(doc))
# In the recorded run the most important topic for this document is topic id 3
# (i.e. the 4th topic), with a weight of about 0.53.

[(0, 2.0), (1, 1.0), (2, 2.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0), (8, 1.0), (9, 1.0), (10, 1.0), (11, 1.0), (12, 1.0), (13, 1.0), (14, 1.0), (15, 1.0), (16, 1.0)]
[(0, 0.05430645), (1, 0.0633613), (2, 0.057095762), (3, 0.5326352), (4, 0.29260126)]
