# NMF and LDA Topic Extraction

In [1]:
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

In [44]:
# Tunable parameters shared by the cells below.
n_samples = 200      # number of documents sliced from the loaded dataset
n_features = 1000    # passed as max_features to both vectorizers
n_components = 10    # n_components for the NMF and LDA estimators
n_top_words = 10     # words printed per topic by print_top_words

In [3]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted feature names for every topic of a model.

    Parameters
    ----------
    model : object exposing ``components_``
        ``components_`` is iterated row by row; each row must support
        ``argsort()``.
    feature_names : sequence of str
        Indexed with the positions returned by ``argsort()``.
    n_top_words : int
        How many names to list for each topic.

    Output is one line per topic of the form ``Topic #<idx>: w1 w2 ...``
    (heaviest word first), followed by a single blank line.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort() is ascending, so reading the trailing n_top_words
        # positions backwards yields the largest weights first.
        ranked = weights.argsort()[:-n_top_words - 1:-1]
        top_words = [feature_names[pos] for pos in ranked]
        print("Topic #%d: " % topic_idx + " ".join(top_words))
    print()

In [37]:
# Load the 20 newsgroups corpus (shuffled with a fixed seed, with headers,
# footers and quotes removed) and keep only the first n_samples documents.
# The elapsed wall-clock time of the load is printed.
t0 = time()
dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'))
data_samples = dataset.data[:n_samples]
print("done in %0.3fs." % (time() - t0))

done in 1.874s.


In [38]:
# Inspect the first raw document (shown as the cell's output).
data_samples[0]

"Well i'm not sure about the story nad it did seem biased. What\nI disagree with is your statement that the U.S. Media is out to\nruin Israels reputation. That is rediculous. The U.S. media is\nthe most pro-israeli media in the world. Having lived in Europe\nI realize that incidences such as the one described in the\nletter have occured. The U.S. media as a whole seem to try to\nignore them. The U.S. is subsidizing Israels existance and the\nEuropeans are not (at least not to the same degree). So I think\nthat might be a reason they report more clearly on the\natrocities.\n\tWhat is a shame is that in Austria, daily reports of\nthe inhuman acts commited by Israeli soldiers and the blessing\nreceived from the Government makes some of the Holocaust guilt\ngo away. After all, look how the Jews are treating other races\nwhen they got power. It is unfortunate.\n"

In [39]:
# Inspect the second raw document (shown as the cell's output).
data_samples[1]

"\n\n\n\n\n\n\nYeah, do you expect people to read the FAQ, etc. and actually accept hard\natheism?  No, you need a little leap of faith, Jimmy.  Your logic runs out\nof steam!\n\n\n\n\n\n\n\nJim,\n\nSorry I can't pity you, Jim.  And I'm sorry that you have these feelings of\ndenial about the faith you need to get by.  Oh well, just pretend that it will\nall end happily ever after anyway.  Maybe if you start a new newsgroup,\nalt.atheist.hard, you won't be bummin' so much?\n\n\n\n\n\n\nBye-Bye, Big Jim.  Don't forget your Flintstone's Chewables!  :) \n--\nBake Timmons, III"

In [33]:
# Tiny hand-written corpus (animals / Apple / cars) for eyeballing the pipeline.
# It is bound to its own name on purpose: assigning it to `data_samples` would,
# when the notebook is run top to bottom, replace the 20 newsgroups sample that
# the vectorizer cells below consume and invalidate every recorded output.
# To experiment with it, pass `toy_samples` to a vectorizer explicitly.
toy_samples = [
    "cat and dogs are animals",
    "i love animals but not cats",
    "steve jobs was the founder of apple",
    "jobs made back to the apple after getting fired",
    "car has brakes and gear",
    "brakes of the car failed",
]

In [40]:
# tf-idf vectorizer: drops English stop words, ignores terms whose document
# frequency is above 0.95 (max_df) or below 2 (min_df), and caps the
# vocabulary at n_features terms.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   max_features=n_features,
                                   stop_words='english')

In [41]:
# Fit the tf-idf vectorizer on data_samples and build the tf-idf matrix that
# the NMF cells below are fitted on; the elapsed time is printed.
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

done in 0.066s.


In [42]:
# Term-count features for LDA, built with the same max_df / min_df /
# max_features / stop_words settings as the tf-idf vectorizer.
# Only the fit_transform call is timed, not the vectorizer construction.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

Extracting tf features for LDA...
done in 0.065s.


In [50]:
# Fit NMF on the tf-idf matrix without specifying beta_loss or solver (the
# printed banner labels this run "Frobenius norm"), using a fixed seed and
# regularisation settings alpha=.1, l1_ratio=.5.
# NOTE: the banner reports the configured n_samples / n_features constants,
# not the actual shape of `tfidf`.
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

# Show the n_top_words highest-weighted vocabulary terms of each component.
print("\nTopics in NMF model (Frobenius norm):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=200 and n_features=1000...
done in 0.082s.

Topics in NMF model (Frobenius norm):
Topic #0: don just like know people good say time does make
Topic #1: vram simms simm board need favorite buy 30 cost correct
Topic #2: key chip keys clipper session receiving door knows needs government
Topic #3: info drivers mail help anybody monitor does windows hi tell
Topic #4: memory goes story person 49 free data real probably digital
Topic #5: bike riding know rights rear motorcycle people luck following mean
Topic #6: game heard flyers sure regular season remember red comes mean
Topic #7: win means favorite event baseball bet text runs read does
Topic #8: captain traded mike currently pittsburgh season course real toronto time
Topic #9: think pontiac pretty european sure assembly switch signal easy just



In [46]:
# Dump the tf-idf vocabulary.
# NOTE(review): this prints every feature name in one line; consider slicing
# (e.g. the first few dozen entries) to keep the cell output readable.
print(tfidf_feature_names)

['000', '040', '10', '100', '1000', '11', '12', '128', '13', '14', '15', '16', '17', '18', '19', '1993', '20', '200', '21', '22', '23', '24', '25', '250', '26', '27', '28', '30', '32', '32k', '33', '386bsd', '3d', '40', '42', '49', '50', '500', '604', '66', '72', '75', '80', '90', 'abc', 'able', 'ac', 'academic', 'accept', 'access', 'act', 'actually', 'add', 'addition', 'additional', 'address', 'administration', 'ago', 'agree', 'algorithm', 'allowed', 'americans', 'analysis', 'animation', 'announced', 'annual', 'answer', 'answers', 'anybody', 'appeared', 'appears', 'apple', 'application', 'applications', 'appropriate', 'april', 'area', 'article', 'aside', 'ask', 'asked', 'asking', 'assault', 'assembly', 'assume', 'attack', 'attacks', 'attempt', 'au', 'author', 'automatic', 'available', 'average', 'away', 'bad', 'baltimore', 'ban', 'bank', 'base', 'baseball', 'based', 'basically', 'begin', 'believe', 'best', 'bet', 'better', 'bible', 'big', 'bigger', 'bike', 'bikes', 'bit', 'bitnet', 'b

In [49]:
# Fit a second NMF model on the same tf-idf matrix, this time with
# beta_loss='kullback-leibler', the 'mu' solver and up to 1000 iterations.
# NOTE: this rebinds `nmf`, so the model fitted in the Frobenius-norm cell is
# no longer reachable under that name after this cell runs.
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

# Show the n_top_words highest-weighted vocabulary terms of each component.
print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

Fitting the NMF model (generalized Kullback-Leibler divergence) with tf-idf features, n_samples=200 and n_features=1000...
done in 0.288s.

Topics in NMF model (generalized Kullback-Leibler divergence):
Topic #0: good use like just years things don ve make thing
Topic #1: work using edu program systems lines use running speed school
Topic #2: government people doesn rights right long non weapons public going
Topic #3: know does thanks don says say called mean change people
Topic #4: really year memory vram need card runs season don ll
Topic #5: pretty look think just want don know problem friend work
Topic #6: said new believe time info windows heard write drive leafs
Topic #7: like just usually used doing thanks problem running trying width
Topic #8: time new trip traded season mike good use way just
Topic #9: think subject group 11 try start understanding read power fact



In [51]:

# Fit LDA on the term-count matrix `tf` using the 'online' learning method,
# 5 iterations and a fixed seed. Only the fit() call is timed.
# NOTE: the banner reports the configured n_samples / n_features constants,
# not the actual shape of `tf`.
print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

Fitting LDA models with tf features, n_samples=200 and n_features=1000...
done in 0.452s.


In [52]:
# Show the n_top_words highest-weighted terms of each LDA topic, labelled with
# the vocabulary of the count vectorizer.
print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)


Topics in LDA model:
Topic #0: insurance know good drive car does don riding bike cable
Topic #1: try current problem sense range keyboard hand thought university suspension
Topic #2: win like way really better year don contact going think
Topic #3: think don like just good people power use time coming
Topic #4: program drive errors hard read cache went keyboard power gs
Topic #5: edu graphics mail send 128 3d com file format objects
Topic #6: just captain traded use new hp blood believe rules going
Topic #7: don like just key chip know plane years problem used
Topic #8: israel israeli people time lebanese peace soldiers men accept members
Topic #9: gm game cache john time st vs baltimore card ram



In [55]:
# Experiment: fit LDA on the tf-idf matrix instead of the term counts.
# A fresh estimator with the same hyper-parameters is used so that the
# count-based `lda` fitted above is not overwritten; re-running the earlier
# "Topics in LDA model" cell therefore keeps producing the same topics.
# NOTE(review): the notebook builds the `tf` count features specifically for
# LDA; topics obtained from tf-idf weights should be treated as exploratory.
lda_tfidf = LatentDirichletAllocation(**lda.get_params()).fit(tfidf)
print("\nTopics in LDA model:")
# The model was fitted on `tfidf`, so take the column labels from the
# vectorizer that produced that matrix.
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(lda_tfidf, tfidf_feature_names, n_top_words)


Topics in LDA model:
Topic #0: bike riding know mistake luck chips rights rear mean reading
Topic #1: power time speed think opinions members just say thought colorado
Topic #2: drive radio problems money vram pc end cache phigs gs
Topic #3: don did law piece edu right 200 self evidence called
Topic #4: mail program ago errors does cities baseball monitor help discussion
Topic #5: book think problem certain faith make really just actually white
Topic #6: just traded going captain mike peter long got posting season
Topic #7: chip effective bob community like processor heat channel problem normal
Topic #8: fpu israeli israel usually makes jim solution ban turn seen
Topic #9: game looking memory mind insurance packages boston time help week



In [25]:
# Print the estimator's repr to review the hyper-parameters it is configured with.
print(lda)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_components=10, n_jobs=1, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)
