In [10]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [11]:
def word_topic(tfidf, solution, wordlist):
    """Link words to topics by computing each word's loading on each component.

    Parameters
    ----------
    tfidf : 2-D matrix (scipy sparse matrix/array or numpy array)
        Its second axis must line up with ``wordlist`` and its first axis
        with the first axis of ``solution``.
    solution : 2-D array
        One row per row of ``tfidf``, one column per topic/component.
    wordlist : sequence
        Labels used as the index of the result, one per column of ``tfidf``.

    Returns
    -------
    pandas.DataFrame
        The product ``tfidf.T @ solution``, indexed by ``wordlist`` with one
        column per topic/component.
    """
    # Use `@` rather than `*`: `*` is a matrix product only for scipy.sparse
    # *matrix* objects; for sparse arrays and numpy arrays it is element-wise.
    words_by_topic = tfidf.T @ solution

    # Linking the loadings to the words in an easy-to-read way.
    return pd.DataFrame(words_by_topic, index=wordlist)

def top_words(components, n_top_words):
    """Extract the top N words and their loadings for each topic.

    Parameters
    ----------
    components : pandas.DataFrame
        One row per word (the index holds the words) and one column per topic.
    n_top_words : int
        Maximum number of words to keep per topic. If ``components`` has fewer
        rows than this, every word is returned for each topic.

    Returns
    -------
    pandas.Series
        Object-dtype Series of strings of the form ``"<word> <loading>"`` with
        the loading rounded to two decimals. The index holds the topic's column
        position, repeated once per entry, so ``result.loc[k]`` selects the
        entries of topic ``k``.

    Raises
    ------
    ValueError
        If ``n_top_words`` is negative.
    """
    if n_top_words < 0:
        raise ValueError("n_top_words must be non-negative")

    topic_labels = []
    entries = []
    for column in range(components.shape[1]):
        # Sort the column so that highest loadings are at the top, then keep
        # (at most) the N highest loadings.
        chosen = (
            components.iloc[:, column]
            .sort_values(ascending=False)
            .iloc[:n_top_words]
        )
        # Combine word and rounded loading into a single string.
        entries.extend(
            "{} {}".format(word, loading)
            for word, loading in zip(chosen.index, chosen.round(2).tolist())
        )
        topic_labels.extend([column] * len(chosen))

    # Build the Series in one go with an explicit object dtype, instead of
    # filling a dtype-less placeholder Series with strings slice by slice.
    return pd.Series(
        entries,
        index=pd.Index(topic_labels, dtype="int64"),
        dtype=object,
    )

# How many of the highest-loading words to report per topic.
n_top_words = 10

# How many topics/components to extract.
ntopics = 5

In [12]:
from sklearn.datasets import fetch_20newsgroups

# Newsgroup categories to include in the corpus.
categs = [
    'rec.autos',
    'sci.space',
    'soc.religion.christian',
    'talk.politics.mideast',
]

# Fetch the posts for those categories with headers, footers and quoted
# replies removed; shuffling is seeded so the document order is repeatable.
news_raw = fetch_20newsgroups(
    categories=categs,
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=1,
)

In [13]:
news = pd.DataFrame({'text': news_raw.data})

# Replace every character that is not an ASCII letter or '#' with a space.
# regex=True must be explicit: since pandas 2.0, Series.str.replace treats the
# pattern as a literal string by default, which would leave the text unchanged.
news['text_clean'] = news['text'].str.replace("[^a-zA-Z#]", " ", regex=True)

In [16]:
# Preview the first five rows: raw text next to its cleaned version.
news.head(n=5)

Unnamed: 0,text,text_clean
0,Hiya \n\nI'm a VERY amuture astronomer in Adel...,Hiya I m a VERY amuture astronomer in Adelai...
1,"\n\n\tYou mean he talks about those Jews, who,...",You mean he talks about those Jews who be...
2,From: Center for Policy Research <cpr>\nSubjec...,From Center for Policy Research cpr Subject...
3,"\n\nFirst of all, ""ceremonial law"" is an extra...",First of all ceremonial law is an extraSc...
4,"\nCarrying a pistol, loaded or unloaded, in th...",Carrying a pistol loaded or unloaded in the...


In [17]:
# Creating the tf-idf matrix.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english')
news_tfidf = vectorizer.fit_transform(news['text_clean'])

# Getting the word list.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocabulary = list(vectorizer.get_feature_names_out())
else:
    vocabulary = vectorizer.get_feature_names()

In [18]:
# LSA
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

# TruncatedSVD uses a randomized solver by default, so pin random_state to
# make the topics reproducible across kernel restarts (as the NMF cell does).
svd = TruncatedSVD(n_components=ntopics, random_state=0)
# Reduce the tf-idf matrix to `ntopics` components, then rescale each
# document's component vector with Normalizer.
lsa = make_pipeline(svd, Normalizer(copy=False))
news_lsa = lsa.fit_transform(news_tfidf)

# Word-by-topic loadings for the LSA solution.
news_components_lsa = word_topic(news_tfidf, news_lsa, vocabulary)

# Start the comparison table; each model adds one column of top words.
topwords = pd.DataFrame()
topwords['LSA'] = top_words(news_components_lsa, n_top_words)

In [20]:
# NNMF

from sklearn.decomposition import NMF

# NOTE: the former `alpha=0.0` argument is intentionally omitted. The `alpha`
# keyword was removed in scikit-learn 1.2 (replaced by alpha_W / alpha_H), and
# zero regularization is the default in both old and new versions.
nmf = NMF(init='nndsvdar', # how starting value are calculated
          l1_ratio=0.0, # Sets whether regularization is L2 (0), L1 (1), or a combination (values between 0 and 1)
          max_iter=200, # when to stop even if the model is not converging (to prevent running forever)
          n_components=ntopics,
          random_state=0,
          solver='cd', # Use Coordinate Descent to solve
          tol=0.0001, # tolerance of the stopping condition
          verbose=0 # amount of output to give while iterating
         )
news_nmf = nmf.fit_transform(news_tfidf)

# Word-by-topic loadings for the NMF solution.
news_components_nmf = word_topic(news_tfidf, news_nmf, vocabulary)

topwords['NNMF'] = top_words(news_components_nmf, n_top_words)

In [21]:
# Show, topic by topic, the top words found by each model side by side.
for topic_id in range(ntopics):
    print(f'Topic {topic_id}:')
    print(topwords.loc[topic_id])

Topic 0:
            LSA          LDA         NNMF
0     god 30.62    just 1.45     car 2.96
0  people 29.67  people 1.23    like 1.22
0    just 28.65    like 1.22    just 1.19
0     like 26.9     car 1.18     don 0.95
0   think 24.64     god 1.13    cars 0.94
0     don 24.56    think 1.1    know 0.93
0    know 24.39     know 1.1    good 0.88
0    does 20.15   space 1.08   think 0.81
0    time 18.94   loser 1.06     new 0.73
0     say 17.78     don 1.04  engine 0.71
Topic 1:
               LSA          LDA             NNMF
1        god 24.27    like 1.41         god 5.51
1      jesus 11.67    just 1.41       jesus 2.64
1       faith 7.23     don 1.18       people 2.0
1       church 6.9     car 1.18     believe 1.69
1       bible 6.68    know 1.13       faith 1.59
1      christ 6.53     god 1.04       bible 1.53
1      believe 6.1  people 1.03      christ 1.48
1  christians 5.98    good 1.01       think 1.47
1         hell 5.1     does 1.0  christians 1.42
1   christian 4.89    time 0.9

**LSA model** was able to reproduce 3 of the 4 newsgroup topics included in this analysis: 'sci.space' (Topic 4), 'soc.religion.christian' (Topics 0 & 1), and 'talk.politics.mideast' (Topics 2 & 3). <br>
**LDA model** was not able to reproduce any of the topics represented by the newsgroups. Every LDA topic contains words from more than one newsgroup.<br>
**NNMF model** was able to reproduce all 4 of the newsgroup topics included in this analysis: 'sci.space' (Topic 4), 'soc.religion.christian' (Topic 1), 'talk.politics.mideast' (Topics 2 & 3), and 'rec.autos' (Topic 0). <br>

Therefore, the best model is NNMF.