# Пример использования библиотеки gensim для тематического моделирования

In [67]:
from gensim import corpora, models
import numpy as np



In [None]:
def cos(x, y):
    """Return the cosine similarity of two 1-D vectors.

    This is a similarity, not a distance: the result is the cosine of the
    angle between ``x`` and ``y``, so a larger value means the vectors are
    more alike (1.0 when they point in the same direction).

    Args:
        x: 1-D numeric array or array-like.
        y: 1-D numeric array or array-like of the same length as ``x``.

    Returns:
        ``x . y / (|x| * |y|)``, or ``0.0`` when either vector has zero norm.
    """
    x = np.asarray(x)
    y = np.asarray(y)
    norm_product = np.sqrt(x.dot(x)) * np.sqrt(y.dot(y))
    # The cosine is undefined for a zero vector; returning 0.0 instead of the
    # NaN produced by 0/0 keeps later max()/argmax() comparisons meaningful.
    if norm_product == 0:
        return 0.0
    return x.dot(y) / norm_product

In [None]:
def argmax(x):
    """Return the index of the first element of ``x`` equal to ``max(x)``.

    Args:
        x: Non-empty sequence of mutually comparable values.

    Returns:
        The position of the first maximal element, or ``None`` if no element
        compares equal to the maximum.

    Raises:
        ValueError: If ``x`` is empty (raised by ``max``).
    """
    peak = max(x)
    return next((pos for pos, item in enumerate(x) if item == peak), None)

In [None]:
def to_vec(x, num_topics=5):
    """Convert a sparse topic distribution into a dense vector.

    Args:
        x: Iterable of ``(topic_id, probability)`` pairs, in the form returned
            by ``LdaModel.get_document_topics``.
        num_topics: Length of the resulting vector. It must be greater than
            every topic id in ``x``. The default of 5 keeps the previous
            behaviour; pass the model's topic count for larger models.

    Returns:
        A 1-D ``numpy`` float array of length ``num_topics`` holding each
        listed probability at its topic id and 0.0 everywhere else.

    Raises:
        IndexError: If a topic id is not smaller than ``num_topics``.
    """
    vec = np.zeros(num_topics)
    for topic_id, probability in x:
        vec[topic_id] = probability
    return vec

In [201]:
# Load the training corpus in UCI Bag-of-Words format (a docword file plus a vocab file).
# Alternative data set, kept for reference:
#data = corpora.UciCorpus("docword.xkcd.txt", "vocab.xkcd.txt")
data = corpora.UciCorpus("docword.bow.txt","vocab.bow.txt")
# Dictionary built from the corpus; it is handed to the model as id2word when training.
dictionary = data.create_dictionary()

In [None]:
# Number of topics the LDA model is trained with (passed as num_topics to LdaModel).
Number_of_topics = 10

In [373]:
# Train the LDA model on the corpus. %time is an IPython line magic that reports
# how long the statement took, so this cell only runs inside IPython/Jupyter.
# passes is the number of passes over the corpus; alpha and eta are the prior
# settings given to gensim (see the LdaModel documentation).
%time ldamodel = models.ldamodel.LdaModel(
    data, id2word=dictionary, num_topics=Number_of_topics, passes=20, alpha=1.25, eta=1.25)

Wall time: 2.21 s


In [70]:
# Save the trained model to disk under the name "ldamodel_xkcd".
ldamodel.save("ldamodel_xkcd")

In [71]:
# Load the model back from "ldamodel_xkcd", replacing the in-memory ldamodel.
ldamodel = models.ldamodel.LdaModel.load("ldamodel_xkcd")

In [72]:
#ldamodel.print_topics(num_topics=10, num_words=10)

In [374]:
# Print the top words of the topics: at most 5 topics, 20 words per topic.
for t, top_words in ldamodel.print_topics(num_topics=5, num_words=20):
    # Each item is a (topic id, formatted 'weight*"word" + ...' string) pair.
    print("Topic", t, ":", top_words,'\n')

Topic 0 : 0.011*"b'eto'" + 0.007*"b'rambler'" + 0.006*"b'kompaniya'" + 0.006*"b'internet'" + 0.006*"b'god'" + 0.005*"b'obuchenie'" + 0.004*"b'kotoryj'" + 0.004*"b'odin'" + 0.003*"b'nash'" + 0.003*"b'nuzhno'" + 0.003*"b'chelovek'" + 0.003*"b'denga'" + 0.003*"b'svoj'" + 0.003*"b'zadacha'" + 0.003*"b'eshh'" + 0.003*"b'samyj'" + 0.003*"b'million'" + 0.003*"b'vs'" + 0.003*"b'viktor'" + 0.003*"b'dollar'" 

Topic 1 : 0.004*"b'kotoryj'" + 0.003*"b'dat'" + 0.003*"b'eto'" + 0.003*"b'process'" + 0.002*"b'fajlovyj'" + 0.002*"b'bufer'" + 0.002*"b'rabotat'" + 0.002*"b'fajl'" + 0.002*"b'sistema'" + 0.002*"b'funkciya'" + 0.002*"b'pamyat'" + 0.002*"b'problema'" + 0.001*"b'operaciya'" + 0.001*"b'vremya'" + 0.001*"b'ispolzovat'" + 0.001*"b'vs'" + 0.001*"b'kakoj'" + 0.001*"b'proekt'" + 0.001*"b'odin'" + 0.001*"b'izmenenie'" 

Topic 2 : 0.007*"b'set'" + 0.005*"b'eto'" + 0.004*"b'kotoryj'" + 0.004*"b'nejronnyj'" + 0.003*"b'odin'" + 0.003*"b'sputnik'" + 0.002*"b'takoj'" + 0.002*"b'nash'" + 0.002*"b'zadacha'"

In [74]:
# Compute the log-perplexity and convert it to the conventional perplexity form.
# NOTE: despite its name, `perplexity` holds the value returned by
# log_perplexity (a per-word bound, per the gensim documentation); the
# perplexity itself is 2**(-value), which is what gets printed.
perplexity = ldamodel.log_perplexity(list(data))
print(2**(-perplexity))

366.6874481344705


In [75]:
# Same perplexity, derived from the bound over the whole corpus: divide the
# bound by the number of word occurrences to get a per-word value, then take
# 2**(-value). The token count is computed from the corpus rather than
# hard-coded, so the cell stays correct when the data set changes.
# NOTE(review): this replaces the literal 87409, presumably the token count of
# the corpus used at the time - confirm the result still matches.
perp = ldamodel.bound(data)
total_words = sum(count for document in data for _, count in document)
2**(-perp/float(total_words))

366.6874412065536

In [76]:
# Добавление в модель новых документов, содержащихся в новом корпусе data2
#ldamodel.update(data2, passes=10)

In [383]:
# Second corpus (UCI Bag-of-Words format) whose documents are compared against the model.
# NOTE(review): ldamodel was trained with the vocabulary from "vocab.bow.txt";
# confirm that "vocab.kek3.txt" uses the same word ids, otherwise the inferred
# topics for these documents are not meaningful.
data2 = corpora.UciCorpus("docword.kek_rambler.txt","vocab.kek3.txt")

In [None]:
# Get the topic distribution of one specific document: the first one in data2.
# next(iter(...)) reads only that document instead of loading the whole
# corpus into a list just to take element 0.
doc = next(iter(data2))
# Sparse form: a list of (topic id, probability) pairs.
kl_ = ldamodel.get_document_topics(doc)
print(kl_)
# Dense form; kl_ is the query vector for the cosine comparison cell below.
kl_ = to_vec(kl_)
kl_

In [378]:
from itertools import islice

# Show the topic distribution of the first 20 documents of the training corpus.
# islice streams just those documents in a single pass, so the corpus is not
# rebuilt as a list on every iteration. Fewer than 20 documents is handled too.
for i, doc in enumerate(islice(data, 20)):
    k = ldamodel.get_document_topics(doc)
    # Labels start at 2 (i + 2).
    # NOTE(review): presumably this matches an external numbering of the documents - confirm.
    print(i + 2, k)

2 [(1, 0.98620445)]
3 [(0, 0.14087127), (1, 0.031190626), (2, 0.034708347), (3, 0.020744458), (4, 0.7724853)]
4 [(0, 0.086568385), (1, 0.041553225), (2, 0.7901978), (3, 0.035377093), (4, 0.046303492)]
5 [(0, 0.56662786), (1, 0.03861747), (2, 0.3341466), (3, 0.027783664), (4, 0.032824434)]
6 [(0, 0.043654118), (1, 0.91410834), (2, 0.020216929), (4, 0.015667593)]
7 [(2, 0.9840497)]
8 [(0, 0.99054784)]
9 [(0, 0.9826675)]
10 [(0, 0.16159981), (1, 0.012995518), (2, 0.4982982), (4, 0.3173621)]
11 [(0, 0.035356943), (1, 0.017809497), (2, 0.020076368), (3, 0.016860072), (4, 0.90989715)]
12 [(0, 0.41668534), (2, 0.56382704)]
13 [(2, 0.9952539)]
14 [(0, 0.0127186775), (2, 0.97458464)]
15 [(0, 0.9988435)]
16 [(0, 0.9824118)]
17 [(2, 0.99464136)]
18 [(0, 0.9441777), (1, 0.014646235), (2, 0.019585935), (4, 0.012483825)]
19 [(4, 0.992704)]
20 [(4, 0.98760265)]
21 [(0, 0.010017615), (2, 0.9652571)]


In [385]:

from itertools import islice

# Cosine similarity between the query vector kl_ and each of the first 20
# documents of the training corpus; x collects the scores in document order.
x = []
# islice streams the first 20 documents in a single pass over the corpus.
for i, doc in enumerate(islice(data, 20)):
    k = to_vec(ldamodel.get_document_topics(doc))
    similarity = cos(kl_, k)  # computed once, then both stored and printed
    x.append(similarity)
    print(i + 2, similarity)

# Best match: its label (same i + 2 numbering as above) and its score.
print('-------\n', argmax(x) + 2, max(x))

2 0.44022673278607355
3 0.5574887942026674
4 0.5348550275650954
5 0.6980877365162738
6 0.47949908128961277
7 0.4178722405535795
8 0.48993210394077324
9 0.48993210394077324
10 0.7011141526042133
11 0.47444327098765904
12 0.6272450550457097
13 0.4178722405535795
14 0.4242279030126884
15 0.4899321039407733
16 0.4899321039407733
17 0.4178722405535795
18 0.5108941728580404
19 0.4297862328041485
20 0.4297862328041485
21 0.4229357765321494
-------
 10 0.7011141526042133
