In [32]:
%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns

# sklearn text data tutorial
http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [33]:
# http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
from sklearn.datasets import fetch_20newsgroups

In [34]:
categories=['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
data = fetch_20newsgroups(categories=categories)

In [35]:
print(len(data.data))
print(data.target_names)

2257
['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']


In [36]:
print(data.data[0])

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton
Organization: The City University
Lines: 14

Does anyone know of a good way (standard PC application/PD utility) to
convert tif/img/tga files into LaserJet III format.  We would also like to
do the same, converting to HPGL (HP plotter) files.

Please email any response.

Is this the correct group?

Thanks in advance.  Michael.
-- 
Michael Collier (Programmer)                 The Computer Unit,
Email: M.P.Collier@uk.ac.city                The City University,
Tel: 071 477-8000 x3769                      London,
Fax: 071 477-8565                            EC1V 0HB.



In [37]:
from sklearn.feature_extraction.text import CountVectorizer

In [38]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(data.data)
print(X_train_counts.shape)

(2257, 35788)


In [39]:
count_vect.vocabulary_.get(u'algorithm')

4690

In [40]:
from sklearn.feature_extraction.text import TfidfTransformer

In [41]:
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
print(X_train_tf.shape)

(2257, 35788)


In [42]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(X_train_tfidf.shape)

(2257, 35788)


In [43]:
from sklearn.naive_bayes import MultinomialNB

In [44]:
clf = MultinomialNB().fit(X_train_tfidf, data.target)

In [45]:
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

In [46]:
predicted = clf.predict(X_new_tfidf)

In [47]:
for doc, category in zip(docs_new, predicted):
    print("'{0} => {1}".format(doc, data.target_names[category]))

'God is love => soc.religion.christian
'OpenGL on the GPU is fast => comp.graphics


In [48]:
from sklearn.pipeline import Pipeline

In [49]:
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
])

In [50]:
text_clf = text_clf.fit(data.data, data.target)

In [51]:
import numpy as np

In [52]:
data_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)

In [53]:
docs_test = data_test.data

In [54]:
predicted = text_clf.predict(docs_test)
predicted[:10]

array([2, 2, 3, 0, 3, 0, 1, 3, 2, 3])

In [55]:
np.mean(predicted == data_test.target)

0.83488681757656458

In [56]:
from sklearn.linear_model import SGDClassifier

In [57]:
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge',
                                           penalty='l2',
                                           alpha=1e-3,
                                           n_iter=5,
                                           random_state=42)),
])

In [58]:
_ = text_clf.fit(data.data, data.target)
predicted = text_clf.predict(docs_test)
np.mean(predicted == data_test.target)

0.9127829560585885

In [59]:
from sklearn import metrics

In [60]:
print(metrics.classification_report(data_test.target, predicted,
      target_names=data_test.target_names))

                        precision    recall  f1-score   support

           alt.atheism       0.95      0.81      0.87       319
         comp.graphics       0.88      0.97      0.92       389
               sci.med       0.94      0.90      0.92       396
soc.religion.christian       0.90      0.95      0.93       398

           avg / total       0.92      0.91      0.91      1502



In [61]:
metrics.confusion_matrix(data_test.target, predicted)

array([[258,  11,  15,  35],
       [  4, 379,   3,   3],
       [  5,  33, 355,   3],
       [  5,  10,   4, 379]])

In [62]:
from sklearn.model_selection import GridSearchCV

In [63]:
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3),
}

In [64]:
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)

In [65]:
gs_clf.fit(data.data[:400], data.target[:400])

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...     penalty='l2', power_t=0.5, random_state=42, shuffle=True, verbose=0,
       warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'clf__alpha': (0.01, 0.001), 'tfidf__use_idf': (True, False), 'vect__ngram_range': [(1, 1), (1, 2)]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [66]:
data.target_names[gs_clf.predict(['God is love'])[0]]

'soc.religion.christian'

In [67]:
print(gs_clf.best_score_)

0.9


In [68]:
for param_name in sorted(parameters.keys()):
    print("{0}: '{1}'".format(param_name, gs_clf.best_params_[param_name]))

clf__alpha: '0.001'
tfidf__use_idf: 'True'
vect__ngram_range: '(1, 1)'


---

# python lda package
https://pypi.python.org/pypi/lda

In [69]:
import numpy as np
import lda
import lda.datasets

In [70]:
X = lda.datasets.load_reuters()
vocab = lda.datasets.load_reuters_vocab()
titles = lda.datasets.load_reuters_titles()

In [71]:
len(titles[0])

75

In [72]:
len(vocab)

4258

In [73]:
X.shape

(395, 4258)

In [74]:
titles[0:10]

('0 UK: Prince Charles spearheads British royal revolution. LONDON 1996-08-20',
 '1 GERMANY: Historic Dresden church rising from WW2 ashes. DRESDEN, Germany 1996-08-21',
 "2 INDIA: Mother Teresa's condition said still unstable. CALCUTTA 1996-08-23",
 '3 UK: Palace warns British weekly over Charles pictures. LONDON 1996-08-25',
 '4 INDIA: Mother Teresa, slightly stronger, blesses nuns. CALCUTTA 1996-08-25',
 "5 INDIA: Mother Teresa's condition unchanged, thousands pray. CALCUTTA 1996-08-25",
 '6 INDIA: Mother Teresa shows signs of strength, blesses nuns. CALCUTTA 1996-08-26',
 "7 INDIA: Mother Teresa's condition improves, many pray. CALCUTTA, India 1996-08-25",
 '8 INDIA: Mother Teresa improves, nuns pray for "miracle". CALCUTTA 1996-08-26',
 '9 UK: Charles under fire over prospect of Queen Camilla. LONDON 1996-08-26')

In [75]:
model = lda.LDA(n_topics=20, n_iter=1500, random_state=1)

In [76]:
model.fit(X)

INFO:lda:n_documents: 395
INFO:lda:vocab_size: 4258
INFO:lda:n_words: 84010
INFO:lda:n_topics: 20
INFO:lda:n_iter: 1500
INFO:lda:<0> log likelihood: -1051748
INFO:lda:<10> log likelihood: -719800
INFO:lda:<20> log likelihood: -699115
INFO:lda:<30> log likelihood: -689370
INFO:lda:<40> log likelihood: -684918
INFO:lda:<50> log likelihood: -681322
INFO:lda:<60> log likelihood: -678979
INFO:lda:<70> log likelihood: -676598
INFO:lda:<80> log likelihood: -675383
INFO:lda:<90> log likelihood: -673316
INFO:lda:<100> log likelihood: -672761
INFO:lda:<110> log likelihood: -671320
INFO:lda:<120> log likelihood: -669744
INFO:lda:<130> log likelihood: -669292
INFO:lda:<140> log likelihood: -667940
INFO:lda:<150> log likelihood: -668038
INFO:lda:<160> log likelihood: -667429
INFO:lda:<170> log likelihood: -666475
INFO:lda:<180> log likelihood: -665562
INFO:lda:<190> log likelihood: -664920
INFO:lda:<200> log likelihood: -664979
INFO:lda:<210> log likelihood: -664722
INFO:lda:<220> log likelihood: -

<lda.lda.LDA at 0x7fbc5c2a8dd8>

In [77]:
topic_word = model.topic_word_

In [78]:
n_top_words = 8

In [79]:
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    print('Topic {0}: {1}'.format(i, ' '.join(topic_words)))

Topic 0: british churchill sale million major letters west britain
Topic 1: church government political country state people party against
Topic 2: elvis king fans presley life concert young death
Topic 3: yeltsin russian russia president kremlin moscow michael operation
Topic 4: pope vatican paul john surgery hospital pontiff rome
Topic 5: family funeral police miami versace cunanan city service
Topic 6: simpson former years court president wife south church
Topic 7: order mother successor election nuns church nirmala head
Topic 8: charles prince diana royal king queen parker bowles
Topic 9: film french france against bardot paris poster animal
Topic 10: germany german war nazi letter christian book jews
Topic 11: east peace prize award timor quebec belo leader
Topic 12: n't life show told very love television father
Topic 13: years year time last church world people say
Topic 14: mother teresa heart calcutta charity nun hospital missionaries
Topic 15: city salonika capital buddhist c

In [80]:
doc_topic = model.doc_topic_
for i in range(10):
    print("{0} (top topic: {1})".format(titles[i], doc_topic[i].argmax()))

0 UK: Prince Charles spearheads British royal revolution. LONDON 1996-08-20 (top topic: 8)
1 GERMANY: Historic Dresden church rising from WW2 ashes. DRESDEN, Germany 1996-08-21 (top topic: 13)
2 INDIA: Mother Teresa's condition said still unstable. CALCUTTA 1996-08-23 (top topic: 14)
3 UK: Palace warns British weekly over Charles pictures. LONDON 1996-08-25 (top topic: 8)
4 INDIA: Mother Teresa, slightly stronger, blesses nuns. CALCUTTA 1996-08-25 (top topic: 14)
5 INDIA: Mother Teresa's condition unchanged, thousands pray. CALCUTTA 1996-08-25 (top topic: 14)
6 INDIA: Mother Teresa shows signs of strength, blesses nuns. CALCUTTA 1996-08-26 (top topic: 14)
7 INDIA: Mother Teresa's condition improves, many pray. CALCUTTA, India 1996-08-25 (top topic: 14)
8 INDIA: Mother Teresa improves, nuns pray for "miracle". CALCUTTA 1996-08-26 (top topic: 14)
9 UK: Charles under fire over prospect of Queen Camilla. LONDON 1996-08-26 (top topic: 8)


---

# gensim

## simple example first
https://radimrehurek.com/gensim/tutorial.html

In [85]:
import gensim
from gensim import corpora, similarities, models

In [92]:
corpus = [[(0, 1.0), (1, 1.0), (2, 1.0)],
          [(2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (8, 1.0)],
          [(1, 1.0), (3, 1.0), (4, 1.0), (7, 1.0)],
          [(0, 1.0), (4, 2.0), (7, 1.0)],
          [(3, 1.0), (5, 1.0), (6, 1.0)],
          [(9, 1.0)],
          [(9, 1.0), (10, 1.0)],
          [(9, 1.0), (10, 1.0), (11, 1.0)],
          [(8, 1.0), (10, 1.0), (11, 1.0)]]

In [93]:
tfidf = models.TfidfModel(corpus)

INFO:gensim.models.tfidfmodel:collecting document frequencies
INFO:gensim.models.tfidfmodel:PROGRESS: processing document #0
INFO:gensim.models.tfidfmodel:calculating IDF weights for 9 documents and 11 features (28 matrix non-zeros)


In [94]:
vec = [(0, 1), (4, 1)]
print(tfidf[vec])

[(0, 0.8075244024440723), (4, 0.5898341626740045)]


In [95]:
index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=12)

INFO:gensim.similarities.docsim:creating sparse index
INFO:gensim.matutils:creating sparse matrix from corpus
INFO:gensim.matutils:PROGRESS: at document #0
INFO:gensim.similarities.docsim:created <9x12 sparse matrix of type '<class 'numpy.float32'>'
	with 28 stored elements in Compressed Sparse Row format>


In [97]:
sims = index[tfidf[vec]]
print(list(enumerate(sims)))

[(0, 0.4662244), (1, 0.19139354), (2, 0.24600551), (3, 0.82094586), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0)]


## now the tutorials: tutorial 1
https://radimrehurek.com/gensim/tut1.html

In [98]:
documents = ["Human machine interface for lab abc computer applications",
             "A survey of user opinion of computer system response time",
             "The EPS user interface management system",
             "System and human system engineering testing of EPS",
             "Relation of user perceived response time to error measurement",
             "The generation of random binary unordered trees",
             "The intersection graph of paths in trees",
             "Graph minors IV Widths of trees and well quasi ordering",
             "Graph minors A survey"]

In [99]:
# remove common words and tokenize
stoplist = set('for a of the and to in'.split())
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in documents]

# remove words that appear only once
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] > 1]
         for text in texts]

from pprint import pprint  # pretty-printer
pprint(texts)

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]


In [100]:
dictionary = corpora.Dictionary(texts)
dictionary.save('/tmp/deerwester.dict')  # store the dictionary, for future reference
print(dictionary)

INFO:gensim.corpora.dictionary:adding document #0 to Dictionary(0 unique tokens: [])
INFO:gensim.corpora.dictionary:built Dictionary(12 unique tokens: ['computer', 'interface', 'eps', 'response', 'trees']...) from 9 documents (total 29 corpus positions)
INFO:gensim.utils:saving Dictionary object under /tmp/deerwester.dict, separately None


Dictionary(12 unique tokens: ['computer', 'interface', 'eps', 'response', 'trees']...)


In [101]:
print(dictionary.token2id)

{'computer': 0, 'interface': 1, 'eps': 8, 'response': 3, 'trees': 9, 'user': 5, 'graph': 10, 'minors': 11, 'survey': 4, 'human': 2, 'system': 6, 'time': 7}


In [102]:
print(dictionary)

Dictionary(12 unique tokens: ['computer', 'interface', 'eps', 'response', 'trees']...)


In [103]:
new_doc = "Human computer interaction"
new_vec = dictionary.doc2bow(new_doc.lower().split())
print(new_vec)   # the word 'interaction' does not appear in the dictionary and is ignored

[(0, 1), (2, 1)]


In [105]:
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)  # store to disk, for later use
print(corpus)

INFO:gensim.corpora.mmcorpus:storing corpus in Matrix Market format to /tmp/deerwester.mm
INFO:gensim.matutils:saving sparse matrix to /tmp/deerwester.mm
INFO:gensim.matutils:PROGRESS: saving document #0
INFO:gensim.matutils:saved 9x12 matrix, density=25.926% (28/108)
INFO:gensim.corpora.indexedcorpus:saving MmCorpus index to /tmp/deerwester.mm.index


[[(0, 1), (1, 1), (2, 1)], [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(1, 1), (5, 1), (6, 1), (8, 1)], [(2, 1), (6, 2), (8, 1)], [(3, 1), (5, 1), (7, 1)], [(9, 1)], [(9, 1), (10, 1)], [(9, 1), (10, 1), (11, 1)], [(4, 1), (10, 1), (11, 1)]]


In [107]:
# memory-friendly corpus streaming, one document at a time
class MyCorpus(object):
    def __iter__(self):
        for line in open('/tmp/mycorpus.txt'):
            # assume there's one document per line, tokens separated by whitespace
            yield dictionary.doc2bow(line.lower().split())

In [108]:
corpus_memory_friendly = MyCorpus()
print(corpus_memory_friendly)

<__main__.MyCorpus object at 0x7fbc4419bac8>


In [109]:
for vector in corpus_memory_friendly:
    print(vector)

[(0, 1), (1, 1), (2, 1)]
[(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]
[(1, 1), (5, 1), (6, 1), (8, 1)]
[(2, 1), (6, 2), (8, 1)]
[(3, 1), (5, 1), (7, 1)]
[(9, 1)]
[(9, 1), (10, 1)]
[(9, 1), (10, 1), (11, 1)]
[(4, 1), (10, 1), (11, 1)]


## tutorial 2
https://radimrehurek.com/gensim/tut2.html

In [111]:
import os
from gensim import corpora, models, similarities

if (os.path.exists("/tmp/deerwester.dict")):
   dictionary = corpora.Dictionary.load('/tmp/deerwester.dict')
   corpus = corpora.MmCorpus('/tmp/deerwester.mm')
   print("Used files generated from first tutorial")
else:
   print("Please run first tutorial to generate data set")

INFO:gensim.utils:loading Dictionary object from /tmp/deerwester.dict
INFO:gensim.corpora.indexedcorpus:loaded corpus index from /tmp/deerwester.mm.index
INFO:gensim.matutils:initializing corpus reader from /tmp/deerwester.mm
INFO:gensim.matutils:accepted corpus with 9 documents, 12 features, 28 non-zero entries


Used files generated from first tutorial


In [113]:
tfidf = models.TfidfModel(corpus) # step 1 -- initialize a model

INFO:gensim.models.tfidfmodel:collecting document frequencies
INFO:gensim.models.tfidfmodel:PROGRESS: processing document #0
INFO:gensim.models.tfidfmodel:calculating IDF weights for 9 documents and 11 features (28 matrix non-zeros)


In [114]:
doc_bow = [(0, 1), (1, 1)]
print(tfidf[doc_bow]) # step 2 -- use the model to transform vectors

[(0, 0.7071067811865476), (1, 0.7071067811865476)]


In [115]:
corpus_tfidf = tfidf[corpus]
for doc in corpus_tfidf:
    print(doc)

[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]
[(0, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.3244870206138555), (6, 0.3244870206138555), (7, 0.44424552527467476)]
[(1, 0.5710059809418182), (5, 0.4170757362022777), (6, 0.4170757362022777), (8, 0.5710059809418182)]
[(2, 0.49182558987264147), (6, 0.7184811607083769), (8, 0.49182558987264147)]
[(3, 0.6282580468670046), (5, 0.45889394536615247), (7, 0.6282580468670046)]
[(9, 1.0)]
[(9, 0.7071067811865475), (10, 0.7071067811865475)]
[(9, 0.5080429008916749), (10, 0.5080429008916749), (11, 0.695546419520037)]
[(4, 0.6282580468670046), (10, 0.45889394536615247), (11, 0.6282580468670046)]


In [116]:
# Transformations can also be serialized, one on top of another, in a sort of chain:
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2) # initialize an LSI transformation
corpus_lsi = lsi[corpus_tfidf] # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi

INFO:gensim.models.lsimodel:using serial LSI version on this node
INFO:gensim.models.lsimodel:updating model with new documents
INFO:gensim.models.lsimodel:preparing a new chunk of documents
INFO:gensim.models.lsimodel:using 100 extra samples and 2 power iterations
INFO:gensim.models.lsimodel:1st phase: constructing (12, 102) action matrix
INFO:gensim.models.lsimodel:orthonormalizing (12, 102) action matrix
INFO:gensim.models.lsimodel:2nd phase: running dense svd on (12, 9) matrix
INFO:gensim.models.lsimodel:computing the final decomposition
INFO:gensim.models.lsimodel:keeping 2 factors (discarding 47.565% of energy spectrum)
INFO:gensim.models.lsimodel:processed documents up to #9
INFO:gensim.models.lsimodel:topic #0(1.594): 0.703*"trees" + 0.538*"graph" + 0.402*"minors" + 0.187*"survey" + 0.061*"system" + 0.060*"time" + 0.060*"response" + 0.058*"user" + 0.049*"computer" + 0.035*"interface"
INFO:gensim.models.lsimodel:topic #1(1.476): -0.460*"system" + -0.373*"user" + -0.332*"eps" + -

In [117]:
lsi.print_topics(2)

INFO:gensim.models.lsimodel:topic #0(1.594): 0.703*"trees" + 0.538*"graph" + 0.402*"minors" + 0.187*"survey" + 0.061*"system" + 0.060*"time" + 0.060*"response" + 0.058*"user" + 0.049*"computer" + 0.035*"interface"
INFO:gensim.models.lsimodel:topic #1(1.476): -0.460*"system" + -0.373*"user" + -0.332*"eps" + -0.328*"interface" + -0.320*"response" + -0.320*"time" + -0.293*"computer" + -0.280*"human" + -0.171*"survey" + 0.161*"trees"


[(0,
  '0.703*"trees" + 0.538*"graph" + 0.402*"minors" + 0.187*"survey" + 0.061*"system" + 0.060*"time" + 0.060*"response" + 0.058*"user" + 0.049*"computer" + 0.035*"interface"'),
 (1,
  '-0.460*"system" + -0.373*"user" + -0.332*"eps" + -0.328*"interface" + -0.320*"response" + -0.320*"time" + -0.293*"computer" + -0.280*"human" + -0.171*"survey" + 0.161*"trees"')]

In [118]:
for doc in corpus_lsi: # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
    print(doc)

[(0, 0.066007833960904982), (1, -0.52007033063618591)]
[(0, 0.19667592859142663), (1, -0.76095631677000486)]
[(0, 0.08992639972446613), (1, -0.7241860626752511)]
[(0, 0.075858476521783361), (1, -0.632055158600343)]
[(0, 0.10150299184980255), (1, -0.57373084830029542)]
[(0, 0.70321089393783076), (1, 0.16115180214025968)]
[(0, 0.87747876731198282), (1, 0.16758906864659642)]
[(0, 0.90986246868185738), (1, 0.1408655362871927)]
[(0, 0.61658253505692806), (1, -0.053929075663891851)]


In [119]:
lsi.save('/tmp/model.lsi') # same for tfidf, lda, ...
lsi = models.LsiModel.load('/tmp/model.lsi')

INFO:gensim.utils:saving Projection object under /tmp/model.lsi.projection, separately None
INFO:gensim.utils:saving LsiModel object under /tmp/model.lsi, separately None
INFO:gensim.utils:not storing attribute dispatcher
INFO:gensim.utils:not storing attribute projection
INFO:gensim.utils:loading LsiModel object from /tmp/model.lsi
INFO:gensim.utils:loading id2word recursively from /tmp/model.lsi.id2word.* with mmap=None
INFO:gensim.utils:setting ignored attribute dispatcher to None
INFO:gensim.utils:setting ignored attribute projection to None
INFO:gensim.utils:loading LsiModel object from /tmp/model.lsi.projection


In [128]:
# TF-IDF
model = models.TfidfModel(corpus, normalize=True)

INFO:gensim.models.tfidfmodel:collecting document frequencies
INFO:gensim.models.tfidfmodel:PROGRESS: processing document #0
INFO:gensim.models.tfidfmodel:calculating IDF weights for 9 documents and 11 features (28 matrix non-zeros)


In [130]:
# Latent Semantic Indexing (LSI, or sometimes LSA)
model = models.LsiModel(corpus, id2word=dictionary, num_topics=300)  # should be tfidf_corpus

INFO:gensim.models.lsimodel:using serial LSI version on this node
INFO:gensim.models.lsimodel:updating model with new documents
INFO:gensim.models.lsimodel:preparing a new chunk of documents
INFO:gensim.models.lsimodel:using 100 extra samples and 2 power iterations
INFO:gensim.models.lsimodel:1st phase: constructing (12, 400) action matrix
INFO:gensim.models.lsimodel:orthonormalizing (12, 400) action matrix
INFO:gensim.models.lsimodel:2nd phase: running dense svd on (12, 9) matrix
INFO:gensim.models.lsimodel:computing the final decomposition
INFO:gensim.models.lsimodel:keeping 9 factors (discarding 0.000% of energy spectrum)
INFO:gensim.models.lsimodel:processed documents up to #9
INFO:gensim.models.lsimodel:topic #0(3.341): 0.644*"system" + 0.404*"user" + 0.301*"eps" + 0.265*"time" + 0.265*"response" + 0.240*"computer" + 0.221*"human" + 0.206*"survey" + 0.198*"interface" + 0.036*"graph"
INFO:gensim.models.lsimodel:topic #1(2.542): 0.623*"graph" + 0.490*"trees" + 0.451*"minors" + 0.274

In [131]:
# Latent Dirirchlet Allocation
model = models.LdaModel(corpus, id2word=dictionary, num_topics=100)

INFO:gensim.models.ldamodel:using symmetric alpha at 0.01
INFO:gensim.models.ldamodel:using symmetric eta at 0.01
INFO:gensim.models.ldamodel:using serial LDA version on this node
INFO:gensim.models.ldamodel:running online LDA training, 100 topics, 1 passes over the supplied corpus of 9 documents, updating model once every 9 documents, evaluating perplexity every 9 documents, iterating 50x with a convergence threshold of 0.001000
INFO:gensim.models.ldamodel:-115.905 per-word bound, 77766935683978659199970645983100928.0 perplexity estimate based on a held-out corpus of 9 documents with 29 words
INFO:gensim.models.ldamodel:PROGRESS: pass 0, at document #9/9
INFO:gensim.models.ldamodel:topic #32 (0.010): 0.083*time + 0.083*user + 0.083*graph + 0.083*trees + 0.083*eps + 0.083*computer + 0.083*system + 0.083*human + 0.083*response + 0.083*interface
INFO:gensim.models.ldamodel:topic #13 (0.010): 0.083*time + 0.083*user + 0.083*graph + 0.083*trees + 0.083*eps + 0.083*computer + 0.083*system +

In [132]:
# Hierarchical Dirichlet Process
model = models.HdpModel(corpus, id2word=dictionary)

INFO:gensim.models.hdpmodel:topic 0: 0.302*time + 0.149*graph + 0.139*eps + 0.139*minors + 0.075*user + 0.073*response + 0.044*survey + 0.035*interface + 0.016*computer + 0.012*human + 0.010*trees + 0.006*system
INFO:gensim.models.hdpmodel:topic 1: 0.311*minors + 0.247*human + 0.171*survey + 0.091*graph + 0.070*user + 0.040*interface + 0.032*eps + 0.014*trees + 0.012*system + 0.009*computer + 0.002*time + 0.001*response
INFO:gensim.models.hdpmodel:topic 2: 0.264*time + 0.142*response + 0.136*survey + 0.100*trees + 0.068*minors + 0.064*interface + 0.061*system + 0.051*user + 0.037*graph + 0.036*eps + 0.026*human + 0.015*computer
INFO:gensim.models.hdpmodel:topic 3: 0.419*graph + 0.136*trees + 0.112*human + 0.076*survey + 0.071*user + 0.064*response + 0.047*computer + 0.040*minors + 0.018*eps + 0.007*time + 0.006*interface + 0.004*system
INFO:gensim.models.hdpmodel:topic 4: 0.271*user + 0.200*trees + 0.121*graph + 0.081*computer + 0.065*survey + 0.059*eps + 0.057*minors + 0.044*interface

## Seminal Papers

- Deerwester, LSI: http://lsa3.colorado.edu/papers/JASIS.lsi.90.pdf
- Blei, LDA: https://www.cs.princeton.edu/~blei/papers/BleiNgJordan2003.pdf
- Blei, Online LDA: http://www.cs.princeton.edu/~blei/papers/HoffmanBleiBach2010b.pdf

## Other Tutorials

- http://chrisstrelioff.ws/sandbox/2014/11/13/getting_started_with_latent_dirichlet_allocation_in_python.html
- https://ariddell.org/lda.html
- https://rstudio-pubs-static.s3.amazonaws.com/79360_850b2a69980c4488b1db95987a24867a.html
- https://de.dariah.eu/tatom/topic_model_mallet.html#topic-model-mallet
- https://de.dariah.eu/tatom/topic_model_python.html

## Resources

- http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html