In [None]:
%matplotlib inline


# Topic extraction with Non-negative Matrix Factorization and Latent Dirichlet Allocation


This is an example of applying Non-negative Matrix Factorization
and Latent Dirichlet Allocation on a corpus of documents and
extract additive models of the topic structure of the corpus.
The output is a list of topics, each represented as a list of terms
(weights are not shown).

The default parameters (n_samples / n_features / n_topics) should make
the example runnable in a couple of tens of seconds. You can try to
increase the dimensions of the problem, but be aware that the time
complexity is polynomial in NMF. In LDA, the time complexity is
proportional to (n_samples * iterations).


In [1]:
# Author: Olivier Grisel <olivier.grisel@ensta.org>
#         Lars Buitinck
#         Chyi-Kwei Yau <chyikwei.yau@gmail.com>
# License: BSD 3 clause

from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

n_samples = 200
n_features = 100
n_topics = 10
n_top_words = 20


def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()


# Load the 20 newsgroups dataset and vectorize it. We use a few heuristics
# to filter out useless terms early on: the posts are stripped of headers,
# footers and quoted replies, and common English words, words occurring in
# only one document or in at least 95% of the documents are removed.

print("Loading dataset...")
t0 = time()
dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'))
data_samples = dataset.data[:n_samples]
print("done in %0.3fs." % (time() - t0))

# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   max_features=n_features,
                                   stop_words='english')
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

# Fit the NMF model
print("Fitting the NMF model with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_topics, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Downloading dataset from http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz (14 MB)


Loading dataset...
done in 85.016s.
Extracting tf-idf features for NMF...
done in 0.051s.
Extracting tf features for LDA...
done in 0.041s.
Fitting the NMF model with tf-idf features, n_samples=200 and n_features=100...
done in 0.089s.

Topics in NMF model:
Topic #0:
just use like want long got way really bit make right years software work doesn new 10 used going cache
Topic #1:
think going sure computer year way got try did lot win com message look package called life software gm bit
Topic #2:
edu work good using mail problem new systems try little program drive site make use computer does look years format
Topic #3:
know people time say like does real help israel things want new mail life make information lot called program support
Topic #4:
don problem sure doesn hard years want way make like check did lot does based little time ll year say
Topic #5:
ve good year better win ll does years lot got support things life card 10 way using long com really
Topic #6:
key chip government publ

In [65]:
unseen_document = ["How often to evaluate perplexity. Only used in fit method. set it to 0 or negative number to not evalute perplexity in training at all. Evaluating perplexity can help you check convergence in training process, but it will also increase total training time. Evaluating perplexity in every iteration might increase training time up to two-fold."]

In [68]:
tv2 = TfidfVectorizer(vocabulary=tfidf_vectorizer.vocabulary_)

In [70]:
unseen_document_tfidf = tv2.fit_transform([unseen_document])

In [72]:
lda.fit_transform(unseen_document_tfidf)

array([[ 0.01576719,  0.01576723,  0.01576717,  0.01576696,  0.01576716,
         0.01576937,  0.01576692,  0.01576715,  0.01576731,  0.85809354]])

In [73]:
lda.components_.shape

(10, 100)

In [74]:
dataset2 = fetch_20newsgroups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'))

In [75]:
data_samples_2 = dataset2.data[:10]

In [96]:
unseen_document = data_samples[0]
tv2 = TfidfVectorizer(vocabulary=tfidf_vectorizer.vocabulary_)
unseen_document_tfidf = tv2.fit_transform([unseen_document])
print(lda.transform(unseen_document_tfidf))
print(nmf.transform(unseen_document_tfidf))

[[ 0.02612796  0.02612841  0.02612937  0.02612825  0.76484196  0.02612865
   0.02612902  0.02612822  0.02612913  0.02612903]]
[[ 0.          0.18420905  0.          0.          0.0087455   0.
   0.09262394  0.22387758  0.          0.        ]]


In [78]:
unseen_document

"Well i'm not sure about the story nad it did seem biased. What\nI disagree with is your statement that the U.S. Media is out to\nruin Israels reputation. That is rediculous. The U.S. media is\nthe most pro-israeli media in the world. Having lived in Europe\nI realize that incidences such as the one described in the\nletter have occured. The U.S. media as a whole seem to try to\nignore them. The U.S. is subsidizing Israels existance and the\nEuropeans are not (at least not to the same degree). So I think\nthat might be a reason they report more clearly on the\natrocities.\n\tWhat is a shame is that in Austria, daily reports of\nthe inhuman acts commited by Israeli soldiers and the blessing\nreceived from the Government makes some of the Holocaust guilt\ngo away. After all, look how the Jews are treating other races\nwhen they got power. It is unfortunate.\n"

In [80]:
type(tf_feature_names)

list

In [83]:
topic = lda.components_[6,:]

In [85]:
print(" ".join([tf_feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))

make win know real drive contact think info did problem sure phone really new windows 24 like format things stuff
