In [1]:
#!pip install gensim

In [2]:
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary

In [3]:
common_texts

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

In [4]:
common_dictionary = Dictionary(common_texts)
print(common_dictionary)

Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...)


In [5]:
common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]

In [6]:
print(common_corpus) #prints word_id and the frequency, within each list the words are sorted aplhabetically & then encoded

[[(0, 1), (1, 1), (2, 1)], [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(2, 1), (5, 1), (7, 1), (8, 1)], [(1, 1), (5, 2), (8, 1)], [(3, 1), (6, 1), (7, 1)], [(9, 1)], [(9, 1), (10, 1)], [(9, 1), (10, 1), (11, 1)], [(4, 1), (10, 1), (11, 1)]]


In [7]:
from gensim.models import LdaModel
lda = LdaModel(common_corpus, num_topics=10)

In [8]:
other_texts = [
    ['computer', 'time', 'graph'],
    ['survey', 'response', 'eps'],
    ['human', 'system', 'computer']
]
other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]
print(other_corpus[0], ), print(other_corpus[2])

[(0, 1), (6, 1), (10, 1)]
[(0, 1), (1, 1), (5, 1)]


(None, None)

In [9]:
unseen_doc = other_corpus[0]
vector = lda[unseen_doc]  # get topic probability distribution for a document

In [10]:
vector #Here the numbers correspnd to the topics

[(0, 0.025000058),
 (1, 0.5249818),
 (2, 0.025000062),
 (3, 0.025000062),
 (4, 0.025009863),
 (5, 0.025008721),
 (6, 0.025008475),
 (7, 0.025000062),
 (8, 0.025000062),
 (9, 0.2749908)]

In [11]:
lda.update(other_corpus)

In [12]:
vector = lda[unseen_doc]

In [13]:
lda.get_term_topics(word_id=2)

[(0, 0.04549327), (5, 0.027164787)]

In [14]:
lda.get_topics()

array([[0.01923259, 0.01923246, 0.11537796, 0.01924255, 0.01924222,
        0.11539391, 0.0192322 , 0.11538   , 0.49996135, 0.01923716,
        0.01923504, 0.01923256],
       [0.07317638, 0.01219622, 0.0121959 , 0.31705397, 0.31705308,
        0.07317476, 0.07317404, 0.07316791, 0.01221507, 0.01219904,
        0.01219769, 0.01219586],
       [0.08333218, 0.08332966, 0.08333268, 0.08332815, 0.08333118,
        0.0833353 , 0.08333049, 0.08333336, 0.08332878, 0.08335013,
        0.08333875, 0.08332936],
       [0.08333126, 0.08333287, 0.08333357, 0.08332969, 0.08333286,
        0.08333209, 0.08333017, 0.08333121, 0.08333219, 0.08334821,
        0.08333516, 0.0833307 ],
       [0.01174141, 0.1079501 , 0.01173986, 0.01173933, 0.01173927,
        0.36385012, 0.01173915, 0.01174003, 0.07043037, 0.1877849 ,
        0.12911813, 0.07042734],
       [0.5488594 , 0.27209938, 0.07159013, 0.01193375, 0.01193414,
        0.01195708, 0.01194423, 0.01193358, 0.01193358, 0.01193608,
        0.01194525,

In [21]:
lda.dtype

dtype('float32')

In [23]:
lda.show_topics()

[(0,
  '0.500*"8" + 0.115*"5" + 0.115*"7" + 0.115*"2" + 0.019*"3" + 0.019*"4" + 0.019*"9" + 0.019*"10" + 0.019*"0" + 0.019*"11"'),
 (1,
  '0.317*"3" + 0.317*"4" + 0.073*"0" + 0.073*"5" + 0.073*"6" + 0.073*"7" + 0.012*"8" + 0.012*"9" + 0.012*"10" + 0.012*"1"'),
 (2,
  '0.083*"9" + 0.083*"10" + 0.083*"5" + 0.083*"7" + 0.083*"2" + 0.083*"0" + 0.083*"4" + 0.083*"6" + 0.083*"1" + 0.083*"11"'),
 (3,
  '0.083*"9" + 0.083*"10" + 0.083*"2" + 0.083*"1" + 0.083*"4" + 0.083*"8" + 0.083*"5" + 0.083*"0" + 0.083*"7" + 0.083*"11"'),
 (4,
  '0.364*"5" + 0.188*"9" + 0.129*"10" + 0.108*"1" + 0.070*"8" + 0.070*"11" + 0.012*"0" + 0.012*"7" + 0.012*"2" + 0.012*"3"'),
 (5,
  '0.549*"0" + 0.272*"1" + 0.072*"2" + 0.012*"5" + 0.012*"10" + 0.012*"6" + 0.012*"9" + 0.012*"4" + 0.012*"3" + 0.012*"8"'),
 (6,
  '0.553*"6" + 0.128*"3" + 0.128*"7" + 0.021*"10" + 0.021*"0" + 0.021*"9" + 0.021*"5" + 0.021*"8" + 0.021*"4" + 0.021*"11"'),
 (7,
  '0.083*"9" + 0.083*"10" + 0.083*"5" + 0.083*"7" + 0.083*"4" + 0.083*"2" + 0.08

In [None]:
lda.