In [1]:
import numpy as np
import lda
import lda.datasets
import pandas as pd

# LDA: Reuters News Data Example

In [2]:
# X is a term-document matrix
# Sparse matrices are accepted
X = lda.datasets.load_reuters()
print type(X)

<type 'numpy.ndarray'>


In [3]:
print X.shape
print X.sum()
print X[0]

(395, 4258)
84010
[1 0 1 ..., 0 0 0]


In [4]:
vocab = lda.datasets.load_reuters_vocab()
titles = lda.datasets.load_reuters_titles()

In [11]:
print type(vocab)
print len(vocab)
print vocab[:5]

<type 'tuple'>
4258
('church', 'pope', 'years', 'people', 'mother')


In [47]:
print type(titles)
print len(titles)
print titles[:5]

SyntaxError: invalid syntax (<ipython-input-47-06d5f45990da>, line 1)

In [None]:
model = lda.LDA(n_topics=20, n_iter=1500, random_state=1)
model.fit(X)  # model.fit_transform(X) is also available

In [None]:
topic_word = model.topic_word_  # model.components_ also works

In [16]:
print type(topic_word)
print topic_word.shape
print topic_word[0][:20]
print topic_word[0].sum()

<type 'numpy.ndarray'>
(20, 4258)
[  3.62505347e-06   3.62505347e-06   3.62505347e-06   3.62505347e-06
   3.62505347e-06   2.17865714e-03   3.62505347e-06   3.62505347e-06
   3.62505347e-06   3.62505347e-06   3.62505347e-06   3.62505347e-06
   3.62505347e-06   3.62505347e-06   3.62505347e-06   3.26617318e-03
   3.62505347e-06   3.62505347e-06   3.62505347e-06   3.62505347e-06]
1.0


In [5]:
n_top_words = 8
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

Topic 0: british churchill sale million major letters west britain
Topic 1: church government political country state people party against
Topic 2: elvis king fans presley life concert young death
Topic 3: yeltsin russian russia president kremlin moscow michael operation
Topic 4: pope vatican paul john surgery hospital pontiff rome
Topic 5: family funeral police miami versace cunanan city service
Topic 6: simpson former years court president wife south church
Topic 7: order mother successor election nuns church nirmala head
Topic 8: charles prince diana royal king queen parker bowles
Topic 9: film french france against bardot paris poster animal
Topic 10: germany german war nazi letter christian book jews
Topic 11: east peace prize award timor quebec belo leader
Topic 12: n't life show told very love television father
Topic 13: years year time last church world people say
Topic 14: mother teresa heart calcutta charity nun hospital missionaries
Topic 15: city salonika capital buddhist c

In [27]:
# document topic distributions
doc_topic = model.doc_topic_ 

In [31]:
print type(doc_topic)
print doc_topic.shape
print doc_topic[0].sum()
print doc_topic[0]

<type 'numpy.ndarray'>
(395, 20)
1.0
[  4.34782609e-04   3.52173913e-02   4.34782609e-04   9.13043478e-03
   4.78260870e-03   4.34782609e-04   9.13043478e-03   3.08695652e-02
   5.04782609e-01   4.78260870e-03   4.34782609e-04   4.34782609e-04
   3.08695652e-02   2.17826087e-01   4.34782609e-04   4.34782609e-04
   4.34782609e-04   3.95652174e-02   4.34782609e-04   1.09130435e-01]


In [33]:
for i in range(15):
    print("{} (top topic: {})".format(titles[i], doc_topic[i].argmax()))

0 UK: Prince Charles spearheads British royal revolution. LONDON 1996-08-20 (top topic: 8)
1 GERMANY: Historic Dresden church rising from WW2 ashes. DRESDEN, Germany 1996-08-21 (top topic: 13)
2 INDIA: Mother Teresa's condition said still unstable. CALCUTTA 1996-08-23 (top topic: 14)
3 UK: Palace warns British weekly over Charles pictures. LONDON 1996-08-25 (top topic: 8)
4 INDIA: Mother Teresa, slightly stronger, blesses nuns. CALCUTTA 1996-08-25 (top topic: 14)
5 INDIA: Mother Teresa's condition unchanged, thousands pray. CALCUTTA 1996-08-25 (top topic: 14)
6 INDIA: Mother Teresa shows signs of strength, blesses nuns. CALCUTTA 1996-08-26 (top topic: 14)
7 INDIA: Mother Teresa's condition improves, many pray. CALCUTTA, India 1996-08-25 (top topic: 14)
8 INDIA: Mother Teresa improves, nuns pray for "miracle". CALCUTTA 1996-08-26 (top topic: 14)
9 UK: Charles under fire over prospect of Queen Camilla. LONDON 1996-08-26 (top topic: 8)
10 UK: Britain tells Charles to forget Camilla. LONDO

In [34]:
n_top_words = 8
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

Topic 0: british churchill sale million major letters west britain
Topic 1: church government political country state people party against
Topic 2: elvis king fans presley life concert young death
Topic 3: yeltsin russian russia president kremlin moscow michael operation
Topic 4: pope vatican paul john surgery hospital pontiff rome
Topic 5: family funeral police miami versace cunanan city service
Topic 6: simpson former years court president wife south church
Topic 7: order mother successor election nuns church nirmala head
Topic 8: charles prince diana royal king queen parker bowles
Topic 9: film french france against bardot paris poster animal
Topic 10: germany german war nazi letter christian book jews
Topic 11: east peace prize award timor quebec belo leader
Topic 12: n't life show told very love television father
Topic 13: years year time last church world people say
Topic 14: mother teresa heart calcutta charity nun hospital missionaries
Topic 15: city salonika capital buddhist c

# SKLEARN: News Group Example

In [35]:
from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

In [36]:
n_samples = 2000
n_features = 1000
n_topics = 10
n_top_words = 20

In [37]:
# Load the 20 newsgroups dataset and vectorize it. We use a few heuristics
# to filter out useless terms early on: the posts are stripped of headers,
# footers and quoted replies, and common English words, words occurring in
# only one document or in at least 95% of the documents are removed.

print("Loading dataset...")
t0 = time()
dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'))
data_samples = dataset.data
print("done in %0.3fs." % (time() - t0))

Loading dataset...
done in 2.356s.


In [56]:
print(type(dataset))
print(type(dataset.data))
print(type(data_samples))
print(len(data_samples))
print(data_samples[0])

<class 'sklearn.datasets.base.Bunch'>
<type 'list'>
<type 'list'>
11314
Well i'm not sure about the story nad it did seem biased. What
I disagree with is your statement that the U.S. Media is out to
ruin Israels reputation. That is rediculous. The U.S. media is
the most pro-israeli media in the world. Having lived in Europe
I realize that incidences such as the one described in the
letter have occured. The U.S. media as a whole seem to try to
ignore them. The U.S. is subsidizing Israels existance and the
Europeans are not (at least not to the same degree). So I think
that might be a reason they report more clearly on the
atrocities.
	What is a shame is that in Austria, daily reports of
the inhuman acts commited by Israeli soldiers and the blessing
received from the Government makes some of the Holocaust guilt
go away. After all, look how the Jews are treating other races
when they got power. It is unfortunate.



In [38]:
# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features,
                                stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

Extracting tf features for LDA...
done in 3.019s.


In [None]:
print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..."% (n_samples, n_features))
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online', learning_offset=50.,
                                random_state=0)

t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()

In [67]:
print(type(tf_feature_names))
print(len(tf_feature_names))
print(tf_feature_names[:10])
print(tf_feature_names[-10:])

<type 'list'>
1000
[u'00', u'000', u'01', u'02', u'03', u'04', u'0d', u'0t', u'10', u'100']
[u'written', u'wrong', u'wrote', u'x11', u'xt', u'year', u'years', u'yes', u'york', u'young']


In [73]:
print(type(model.components_))
print(model.components_.shape)
print(model.components_[0])
print(model.components_[0].sum())

<type 'numpy.ndarray'>
(20, 4258)
[  3.62505347e-06   3.62505347e-06   3.62505347e-06 ...,   3.62505347e-06
   3.62505347e-06   3.62505347e-06]
1.0


In [76]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic {0}:".format(topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

In [77]:
print_top_words(lda, tf_feature_names, n_top_words)

Topic 0:
government people mr law gun state president states public use right rights national new control american security encryption health united
Topic 1:
drive card disk bit scsi use mac memory thanks pc does video hard speed apple problem used data monitor software
Topic 2:
said people armenian armenians turkish did saw went came women killed children turkey told dead didn left started greek war
Topic 3:
year good just time game car team years like think don got new play games ago did season better ll
Topic 4:
10 00 15 25 12 11 20 14 17 16 db 13 18 24 30 19 27 50 21 40
Topic 5:
windows window program version file dos use files available display server using application set edu motif package code ms software
Topic 6:
edu file space com information mail data send available program ftp email entry info list output nasa address anonymous internet
Topic 7:
ax max b8f g9v a86 pl 145 1d9 0t 34u 1t 3t giz bhj wm 2di 75u 2tm bxn 7ey
Topic 8:
god people jesus believe does say think israel c