In [1]:
from gensim import corpora, models, similarities, matutils
import numpy as np
import os 

In [2]:
# documents = ["Human machine interface for lab abc computer applications",
#              "A survey of user opinion of computer system response time",
#              "The EPS user interface management system",
#              "System and human system engineering testing of EPS",
#              "Relation of user perceived response time to error measurement",
#              "The generation of random binary unordered trees",
#              "The intersection graph of paths in trees",
#              "Graph minors IV Widths of trees and well quasi ordering",
#               "Graph minors A survey"]

In [3]:
if (os.path.exists("tmp/deerwester.dict")):
    dictionary = corpora.Dictionary.load('tmp/deerwester.dict')
    corpus = corpora.MmCorpus('tmp/deerwester.mm')
    print("Used files generated from first tutorial")
else:
    print("Please run first tutorial to generate data set")



Used files generated from first tutorial


In [4]:
for d in corpus:
    print(d)

[(0, 1.0), (1, 1.0), (2, 1.0)]
[(0, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0)]
[(2, 1.0), (5, 1.0), (7, 1.0), (8, 1.0)]
[(1, 1.0), (5, 2.0), (8, 1.0)]
[(3, 1.0), (6, 1.0), (7, 1.0)]
[(9, 1.0)]
[(9, 1.0), (10, 1.0)]
[(9, 1.0), (10, 1.0), (11, 1.0)]
[(4, 1.0), (10, 1.0), (11, 1.0)]


In [5]:
scipy_csc_matrix = matutils.corpus2csc(corpus)
print(scipy_csc_matrix.todense())

[[1. 1. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 1. 0. 0. 0. 0. 0.]
 [1. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 1.]
 [0. 1. 1. 2. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 1. 0. 0. 0. 0.]
 [0. 1. 1. 0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 1. 1. 0.]
 [0. 0. 0. 0. 0. 0. 1. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1. 1.]]


In [6]:
print(dictionary.token2id)

{'computer': 0, 'human': 1, 'interface': 2, 'response': 3, 'survey': 4, 'system': 5, 'time': 6, 'user': 7, 'eps': 8, 'trees': 9, 'graph': 10, 'minors': 11}


# Budujemy model LDA
Budujemy model LDA i transformujemy dane

* **num_topics=2** oznacza ilość modelowanych tematów

In [7]:
model = models.LdaModel(corpus, id2word=dictionary, num_topics=2)
corpus_lda = model[corpus] # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
for d in corpus_lda:
    print(d)

[(0, 0.83729), (1, 0.16271001)]
[(0, 0.90050095), (1, 0.09949899)]
[(0, 0.19252324), (1, 0.80747676)]
[(0, 0.12373279), (1, 0.8762672)]
[(0, 0.8622306), (1, 0.13776943)]
[(0, 0.3121528), (1, 0.6878472)]
[(0, 0.19222331), (1, 0.8077767)]
[(0, 0.14043865), (1, 0.8595613)]
[(0, 0.15467012), (1, 0.8453299)]


### Dla każdego dokumentu dostajemy prawdopodobieństwo przynależności dokumentu do danego tematu.

Możemy też zobaczyć z czego składają się tematy:

In [8]:
model.show_topics()

[(0,
  '0.124*"user" + 0.122*"computer" + 0.120*"response" + 0.115*"time" + 0.082*"system" + 0.082*"interface" + 0.078*"survey" + 0.077*"human" + 0.076*"trees" + 0.050*"graph"'),
 (1,
  '0.157*"system" + 0.139*"graph" + 0.116*"trees" + 0.108*"eps" + 0.103*"minors" + 0.074*"user" + 0.070*"human" + 0.069*"survey" + 0.066*"interface" + 0.037*"time"')]

# Zad

Chcemy postortowac słowa każdego tematu i wybrać 5 najważniejszych - co można powiedzieć o tematach?

Proszę zobaczyć na funkcje typu get_topics(), get_term_topics(...): https://radimrehurek.com/gensim/models/ldamodel.html

In [9]:
import numpy as np
topics = np.argsort(model.get_topics()[0,:])[::-1] #::-1 sortowanie w odwrotnej kolejności
for x in topics[:5]:
    print(dictionary[x])

user
computer
response
time
system


In [10]:
model.get_topic_terms(topicid=0)

[(7, 0.124267705),
 (0, 0.12216605),
 (3, 0.12027098),
 (6, 0.11450522),
 (5, 0.08195062),
 (2, 0.081943735),
 (4, 0.078475155),
 (1, 0.077084005),
 (9, 0.07631046),
 (10, 0.050397925)]

In [11]:
topic2_terms = model.get_topic_terms(topicid=0)
topic2_words = [
    (dictionary.get(i), j)
    for i,j in topic2_terms
]
topic2_words

[('user', 0.124267705),
 ('computer', 0.12216605),
 ('response', 0.12027098),
 ('time', 0.11450522),
 ('system', 0.08195062),
 ('interface', 0.081943735),
 ('survey', 0.078475155),
 ('human', 0.077084005),
 ('trees', 0.07631046),
 ('graph', 0.050397925)]

In [12]:
[
    (dictionary.get(i), j)
    for i,j in model.get_topic_terms(topicid=0)
]

[('user', 0.124267705),
 ('computer', 0.12216605),
 ('response', 0.12027098),
 ('time', 0.11450522),
 ('system', 0.08195062),
 ('interface', 0.081943735),
 ('survey', 0.078475155),
 ('human', 0.077084005),
 ('trees', 0.07631046),
 ('graph', 0.050397925)]

# Zad 

Proszę posortować zdania najbardziej pasujące do danego tematu. 

In [13]:
import gensim

# for d in corpus_lda:
#     print(d)
    
# for d in corpus_lda:
#     print(d[0])

#print(corpus_lda[0][0])

numpy_corpus = gensim.matutils.corpus2dense(corpus_lda, num_terms=2)
docs = np.argsort(numpy_corpus[0,:])[::-1]
for x in docs[:5]:
    print(corpus[x])
    
#trzeba by wypisać raczej zdania niż ich reprezentacje bag-of-words, ale tu nie mam dostepu do tekstu

[(0, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0)]
[(3, 1.0), (6, 1.0), (7, 1.0)]
[(0, 1.0), (1, 1.0), (2, 1.0)]
[(9, 1.0)]
[(2, 1.0), (5, 1.0), (7, 1.0), (8, 1.0)]


# Zad. 
Sprawdzić do jakiego tematu pasuje nowy dokument i jakie są mu najbliższe

In [14]:
doc = "Human computer interaction"

In [15]:
doc_rep = dictionary.doc2bow(doc.split(' '))
# print(doc_rep)
doc_assignments = model[doc_rep]
print(doc_assignments)

[(0, 0.7407068), (1, 0.25929314)]


In [16]:
index = similarities.MatrixSimilarity(corpus_lda)

sims = index[doc_assignments]
print(list(enumerate(sims)))

sims = sorted(enumerate(sims), key=lambda item: -item[1])
print(sims)

[(0, 0.9895207), (1, 0.9744165), (2, 0.54026634), (3, 0.4591714), (4, 0.98414946), (5, 0.69093746), (6, 0.53994584), (7, 0.47826633), (8, 0.49494594)]
[(0, 0.9895207), (4, 0.98414946), (1, 0.9744165), (5, 0.69093746), (2, 0.54026634), (6, 0.53994584), (8, 0.49494594), (7, 0.47826633), (3, 0.4591714)]


# Wizualizacja modelu LDA:

pyLDAvis

http://nbviewer.jupyter.org/github/bmabey/pyLDAvis/blob/master/notebooks/pyLDAvis_overview.ipynb

http://www.kennyshirley.com/LDAvis/

In [17]:
import pyLDAvis.gensim

In [18]:
pyLDAvis.enable_notebook()

In [19]:
# pyLDAvis.gensim.prepare??

In [20]:
pyLDAvis.gensim.prepare(model, corpus, dictionary)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
