In [25]:
import numpy as np
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# Vocabulary size cap, passed as max_features to both vectorizers in this notebook.
no_features = 1000

In [2]:
from sklearn.datasets import fetch_20newsgroups

# Load the 20 Newsgroups posts with headers, footers and quoted replies
# stripped; the shuffle is seeded so the document order is repeatable.
dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes')
)
documents = dataset.data

In [4]:
# NMF can work on tf-idf weighted features, so build a tf-idf matrix for it.
# Terms in more than 95% of documents or in fewer than 2 documents are dropped,
# English stop words are removed, and the vocabulary is capped at no_features.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
# NOTE(review): get_feature_names() matches the scikit-learn release this
# notebook was written against; newer releases provide get_feature_names_out().
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
# Parenthesised single-argument print: identical output on Python 2 and valid
# syntax on Python 3 (the bare print statement is a SyntaxError there).
print(tfidf_feature_names)

[u'00', u'000', u'01', u'02', u'03', u'04', u'0d', u'0t', u'10', u'100', u'11', u'12', u'128', u'13', u'14', u'145', u'15', u'16', u'17', u'18', u'19', u'1990', u'1991', u'1992', u'1993', u'1d9', u'1st', u'1t', u'20', u'200', u'21', u'22', u'23', u'24', u'25', u'250', u'26', u'27', u'28', u'29', u'2di', u'2tm', u'30', u'300', u'31', u'32', u'33', u'34', u'34u', u'35', u'36', u'37', u'38', u'39', u'3d', u'3t', u'40', u'42', u'43', u'44', u'45', u'50', u'500', u'55', u'60', u'64', u'6ei', u'70', u'75', u'75u', u'7ey', u'7u', u'80', u'800', u'86', u'90', u'91', u'92', u'93', u'9v', u'a86', u'able', u'ac', u'accept', u'access', u'according', u'act', u'action', u'actually', u'add', u'addition', u'address', u'administration', u'advance', u'age', u'ago', u'agree', u'ah', u'air', u'al', u'algorithm', u'allow', u'allowed', u'alt', u'america', u'american', u'analysis', u'anonymous', u'answer', u'answers', u'anti', u'anybody', u'apparently', u'appears', u'apple', u'application', u'applications', 

In [7]:
# LDA is a probabilistic graphical model over word counts, so it needs raw
# term counts rather than tf-idf weights. The vectorizer settings mirror the
# tf-idf vectorizer: same document-frequency cut-offs, stop words and size cap.
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
# NOTE(review): get_feature_names() matches the scikit-learn release this
# notebook was written against; newer releases provide get_feature_names_out().
tf_feature_names = tf_vectorizer.get_feature_names()
# Parenthesised single-argument print: identical output on Python 2 and valid
# syntax on Python 3 (the bare print statement is a SyntaxError there).
print(tf_feature_names)

[u'00', u'000', u'01', u'02', u'03', u'04', u'0d', u'0t', u'10', u'100', u'11', u'12', u'128', u'13', u'14', u'145', u'15', u'16', u'17', u'18', u'19', u'1990', u'1991', u'1992', u'1993', u'1d9', u'1st', u'1t', u'20', u'200', u'21', u'22', u'23', u'24', u'25', u'250', u'26', u'27', u'28', u'29', u'2di', u'2tm', u'30', u'300', u'31', u'32', u'33', u'34', u'34u', u'35', u'36', u'37', u'38', u'39', u'3d', u'3t', u'40', u'42', u'43', u'44', u'45', u'50', u'500', u'55', u'60', u'64', u'6ei', u'70', u'75', u'75u', u'7ey', u'7u', u'80', u'800', u'86', u'90', u'91', u'92', u'93', u'9v', u'a86', u'able', u'ac', u'accept', u'access', u'according', u'act', u'action', u'actually', u'add', u'addition', u'address', u'administration', u'advance', u'age', u'ago', u'agree', u'ah', u'air', u'al', u'algorithm', u'allow', u'allowed', u'alt', u'america', u'american', u'analysis', u'anonymous', u'answer', u'answers', u'anti', u'anybody', u'apparently', u'appears', u'apple', u'application', u'applications', 

In [30]:
# Number of topics (components) requested from both the NMF and the LDA model.
no_topics = 20
# How many of the highest-weighted words display_topics prints per topic.
no_top_words = 10
# How many of the highest-weighted documents display_topics prints per topic.
no_top_documents = 1

In [27]:
# Fit NMF on the tf-idf matrix, then pull out both factors:
#   nmf_W - per-document topic weights (indexed as W[:, topic] downstream)
#   nmf_H - per-topic term weights (one row per topic)
nmf_model = NMF(
    n_components=no_topics,
    random_state=1,
    alpha=.1,
    l1_ratio=.5,
    init='nndsvd'
)
nmf_model.fit(tfidf)
nmf_W = nmf_model.transform(tfidf)
nmf_H = nmf_model.components_

In [23]:
# Fit LDA on the raw term-count matrix, then pull out both factors:
#   lda_W - per-document topic weights (indexed as W[:, topic] downstream)
#   lda_H - per-topic term weights (one row per topic)
# NOTE(review): n_topics is the keyword of the scikit-learn release this
# notebook targets; later releases call it n_components - confirm the installed
# version before renaming.
lda_model = LatentDirichletAllocation(
    n_topics=no_topics,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0
)
lda_model.fit(tf)
lda_W = lda_model.transform(tf)
lda_H = lda_model.components_

In [32]:
def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
    """Print every topic's top words followed by its top-weighted documents.

    Parameters
    ----------
    H : 2-D numpy array
        Topic-term weights; iterated row by row, one row per topic.
    W : 2-D numpy array
        Document-topic weights; column ``topic_idx`` holds every document's
        weight for that topic.
    feature_names : sequence of str
        Vocabulary, indexed by the column positions of ``H``.
    documents : sequence of str
        Raw texts, indexed by the row positions of ``W``.
    no_top_words : int
        Number of highest-weighted words printed per topic.
    no_top_documents : int
        Number of highest-weighted documents printed per topic.

    Returns
    -------
    None. Everything is written to standard output.
    """
    for topic_idx, topic in enumerate(H):
        # Parenthesised single-argument print behaves the same on Python 2
        # and is valid on Python 3, unlike the bare print statement.
        print("Topic %d:" % topic_idx)
        # argsort is ascending, so walk backwards from the end to get the
        # no_top_words largest weights in descending order.
        top_word_indices = topic.argsort()[:-no_top_words - 1:-1]
        print(" ".join(feature_names[i] for i in top_word_indices))
        # Same idea for documents: descending by this topic's weight.
        top_doc_indices = np.argsort(W[:, topic_idx])[::-1][:no_top_documents]
        for doc_index in top_doc_indices:
            # Only the first 1000 characters, to keep the output readable.
            print(documents[doc_index][:1000])

In [29]:
# Show the NMF topics, labelled with the tf-idf vocabulary the model was fitted on.
display_topics(
    H=nmf_H,
    W=nmf_W,
    feature_names=tfidf_feature_names,
    documents=documents,
    no_top_words=no_top_words,
    no_top_documents=no_top_documents
)

Topic 0:
don think time good did really say make way want
Accounts of Anti-Armenian Human Right Violations in Azerbaijan #012
                 Prelude to Current Events in Nagorno-Karabakh

        +---------------------------------------------------------+
        |                                                         |
        |  I saw a naked girl with her hair down. They were       |
        |  dragging her. She kept falling because they were       |
        |  pushing her and kicking her. She fell down, it was     |
        |  muddy there, and later other witnesses who saw it from |
        |  their balconies told us, they seized her by the hair   |
        |  and dragged her a couple of blocks, as far as the      |
        |  mortgage bank, that's a good block and a half or two   |
        |  from here. I know this for sure because I saw it       |
        |  myself.                                                |
        |                                                     

In [33]:
# LDA was fitted on the CountVectorizer matrix, so label its topics with that
# vectorizer's vocabulary (tf_feature_names) rather than the tf-idf one.
display_topics(lda_H, lda_W, tf_feature_names, documents, no_top_words, no_top_documents)

Topic 0:
does think people just don believe point time case say
Archive-name: atheism/introduction
Alt-atheism-archive-name: introduction
Last-modified: 5 April 1993
Version: 1.2

-----BEGIN PGP SIGNED MESSAGE-----

                          An Introduction to Atheism
                       by mathew <mathew@mantis.co.uk>

This article attempts to provide a general introduction to atheism.  Whilst I
have tried to be as neutral as possible regarding contentious issues, you
should always remember that this document represents only one viewpoint.  I
would encourage you to read widely and draw your own conclusions; some
relevant books are listed in a companion article.

To provide a sense of cohesion and progression, I have presented this article
as an imaginary conversation between an atheist and a theist.  All the
questions asked by the imaginary theist are questions which have been cropped
up repeatedly on alt.atheism since the newsgroup was created.  Some other
frequently asked questio