In [9]:
import numpy as np
import pyspark
import lda
import lda.datasets

In [4]:
X = lda.datasets.load_reuters() #document-term matrix
vocab = lda.datasets.load_reuters_vocab()
titles = lda.datasets.load_reuters_titles()

In [10]:
model = lda.LDA(n_topics=20, n_iter=500, random_state=1)
model.fit(X)

topic_word = model.topic_word_  # model.components_ also works
n_top_words = 8

for i, topic_dist in enumerate(topic_word):
     topic_words = np.array(vocab)[np.argsort(topic_dist)][:-n_top_words:-1]
     print('Topic {}: {}'.format(i, ' '.join(topic_words)))
    
doc_topic = model.doc_topic_
for i in range(10):
    print("{} (top topic: {})".format(titles[i], doc_topic[i].argmax()))

Topic 0: government british minister west group letters party
Topic 1: church first during people political country ceremony
Topic 2: elvis king wright fans presley concert life
Topic 3: yeltsin russian russia president kremlin michael romania
Topic 4: pope vatican paul surgery pontiff john hospital
Topic 5: family police miami versace cunanan funeral home
Topic 6: south simpson born york white north african
Topic 7: order church mother successor since election religious
Topic 8: charles prince diana royal queen king parker
Topic 9: film france french against actor paris bardot
Topic 10: germany german war nazi christian letter book
Topic 11: east prize peace timor quebec belo indonesia
Topic 12: n't told life people church show very
Topic 13: years world time year last say three
Topic 14: mother teresa heart charity calcutta missionaries sister
Topic 15: city salonika exhibition buddhist byzantine vietnam swiss
Topic 16: music first people tour including off opera
Topic 17: church cat

In [19]:
doc_id = 0
word_id = 3117

print("doc id: {} word id: {}".format(doc_id, word_id))
print("-- count: {}".format(X[doc_id, word_id]))
print("-- word : {}".format(vocab[word_id]))
print("-- doc  : {}".format(titles[doc_id]))

doc id: 0 word id: 3117
-- count: 2
-- word : heir-to-the-throne
-- doc  : 0 UK: Prince Charles spearheads British royal revolution. LONDON 1996-08-20


In [21]:
# Topic-word Probabilities

topic_word = model.topic_word_
for n in range(5):
    sum_pr = sum(topic_word[n,:])
    print("topic: {} sum: {}".format(n, sum_pr))
    
#Top 5 words for each topic

n = 5
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n+1):-1]
    print('*Topic {}\n- {}'.format(i, ' '.join(topic_words)))
    


topic: 0 sum: 1.0000000000000875
topic: 1 sum: 1.0000000000001148
topic: 2 sum: 0.9999999999998656
topic: 3 sum: 1.0000000000000042
topic: 4 sum: 1.0000000000000928
*Topic 0
- government british minister west group
*Topic 1
- church first during people political
*Topic 2
- elvis king wright fans presley
*Topic 3
- yeltsin russian russia president kremlin
*Topic 4
- pope vatican paul surgery pontiff
*Topic 5
- family police miami versace cunanan
*Topic 6
- south simpson born york white
*Topic 7
- order church mother successor since
*Topic 8
- charles prince diana royal queen
*Topic 9
- film france french against actor
*Topic 10
- germany german war nazi christian
*Topic 11
- east prize peace timor quebec
*Topic 12
- n't told life people church
*Topic 13
- years world time year last
*Topic 14
- mother teresa heart charity calcutta
*Topic 15
- city salonika exhibition buddhist byzantine
*Topic 16
- music first people tour including
*Topic 17
- church catholic bernardin cardinal bishop
*To

In [23]:
# Document-topic Probab

doc_topic = model.doc_topic_

for n in range(5):
    sum_pr = sum(doc_topic[n,:])
    print("document: {} sum: {}".format(n, sum_pr))

for n in range(10):
    topic_most_pr = doc_topic[n].argmax()
    print("doc: {} topic: {}\n{}...".format(n,
                                            topic_most_pr,
                                            titles[n][:50]))


document: 0 sum: 1.0000000000000002
document: 1 sum: 0.9999999999999998
document: 2 sum: 1.0
document: 3 sum: 1.0000000000000002
document: 4 sum: 0.9999999999999997
doc: 0 topic: 8
0 UK: Prince Charles spearheads British royal revo...
doc: 1 topic: 1
1 GERMANY: Historic Dresden church rising from WW2...
doc: 2 topic: 14
2 INDIA: Mother Teresa's condition said still unst...
doc: 3 topic: 8
3 UK: Palace warns British weekly over Charles pic...
doc: 4 topic: 14
4 INDIA: Mother Teresa, slightly stronger, blesses...
doc: 5 topic: 14
5 INDIA: Mother Teresa's condition unchanged, thou...
doc: 6 topic: 14
6 INDIA: Mother Teresa shows signs of strength, bl...
doc: 7 topic: 14
7 INDIA: Mother Teresa's condition improves, many ...
doc: 8 topic: 14
8 INDIA: Mother Teresa improves, nuns pray for "mi...
doc: 9 topic: 8
9 UK: Charles under fire over prospect of Queen Ca...


In [None]:
## Visualizing 

import matplotlib.pyplot as plt

# use matplotlib style sheet
try:
    plt.style.use('ggplot')
except:
    # version of matplotlib might not be recent
    pass

# f, ax= plt.subplots(5, 1, figsize=(8, 6), sharex=True)
# for i, k in enumerate([0, 5, 9, 14, 19]):
#     ax[i].stem(topic_word[k,:], linefmt='b-',
#                markerfmt='bo', basefmt='w-')
#     ax[i].set_xlim(-50,4350)
#     ax[i].set_ylim(0, 0.08)
#     ax[i].set_ylabel("Prob")
#     ax[i].set_title("topic {}".format(k))

# ax[4].set_xlabel("word")

# plt.tight_layout()
# plt.show()

f, ax= plt.subplots(5, 1, figsize=(8, 6), sharex=True)
for i, k in enumerate([1, 3, 4, 8, 9]):
    ax[i].stem(doc_topic[k,:], linefmt='r-',
               markerfmt='ro', basefmt='w-')
    ax[i].set_xlim(-1, 21)
    ax[i].set_ylim(0, 1)
    ax[i].set_ylabel("Prob")
    ax[i].set_title("Document {}".format(k))

ax[4].set_xlabel("Topic")

plt.tight_layout()
plt.show()

