In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from sklearn.datasets import fetch_20newsgroups
# Fetch the 20 Newsgroups posts with headers, footers and quoted replies
# stripped; shuffle=True with a fixed random_state keeps the order repeatable.
dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
)

# Raw post texts, taken from the returned object's .data attribute.
documents = dataset.data

In [3]:
# Keep the raw posts in one column so the cleaned text can sit beside them.
news_df = pd.DataFrame({'document': documents})

# Build the cleaned text in a single chained pass:
#   1. replace every character that is not an ASCII letter or '#' with a space.
#      regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
#      literal string by default, which would silently skip this step;
#   2. drop short words (3 characters or fewer);
#   3. make all text lowercase.
news_df['clean_doc'] = (
    news_df['document']
    .str.replace("[^a-zA-Z#]", " ", regex=True)
    .apply(lambda doc: ' '.join(word for word in doc.split() if len(word) > 3))
    .str.lower()
)

In [4]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# Set copy used only for lookups: testing membership against the list would
# rescan every stop word for every token in the corpus.
stop_word_set = set(stop_words)

# tokenization: split each cleaned document on whitespace
tokenized_doc = news_df['clean_doc'].apply(lambda doc: doc.split())

# remove stop-words
tokenized_doc = tokenized_doc.apply(
    lambda tokens: [token for token in tokens if token not in stop_word_set]
)

# de-tokenization: iterate positionally rather than by label (tokenized_doc[i])
# so the result does not depend on news_df having a default 0..n-1 index.
detokenized_doc = [' '.join(tokens) for tokens in tokenized_doc]

news_df['clean_doc'] = detokenized_doc

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=1000,  # keep top 1000 terms
    max_df=0.5,
    smooth_idf=True,
)

# Sparse tf-idf matrix: one row per document, one column per kept term.
X = vectorizer.fit_transform(news_df['clean_doc'])

# Getting the word list.
# get_feature_names() was removed in scikit-learn 1.2; use the replacement
# get_feature_names_out() when it exists and fall back on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = list(vectorizer.get_feature_names_out())
else:
    terms = vectorizer.get_feature_names()

# Number of topics.
ntopics = 20

# Linking words to topics
def word_topic(tfidf, solution, wordlist):
    """Project a document-topic solution back onto the vocabulary.

    Parameters
    ----------
    tfidf : matrix with one row per document and one column per term
        (scipy sparse or dense).
    solution : array with one row per document and one column per topic,
        e.g. the output of a decomposition's ``fit_transform``.
    wordlist : sequence of terms, one per column of ``tfidf``; used as the
        row index of the result.

    Returns
    -------
    pandas.DataFrame
        One row per term and one column per topic, holding the loading of
        each word on each topic.
    """
    # Use the explicit matrix-product operator: `*` only means matrix
    # multiplication for scipy.sparse *matrix* types and is element-wise for
    # dense ndarrays and sparse arrays.
    words_by_topic = tfidf.T @ solution

    # Linking the loadings to the words in an easy-to-read way.
    components = pd.DataFrame(words_by_topic, index=wordlist)

    return components

# Extracts the top N words and their loadings for each topic.
def top_words(components, n_top_words):
    """Return the highest-loading words of every topic as "word loading" strings.

    Parameters
    ----------
    components : pandas.DataFrame
        One row per word (the index holds the words) and one column per topic.
    n_top_words : int
        How many words to keep per topic. If the frame has fewer rows than
        this, all rows are returned for each topic.

    Returns
    -------
    pandas.Series
        Object-dtype Series of strings such as ``"like 164.83"``. The index
        is the topic number, repeated once per kept word, so
        ``result.loc[topic]`` yields that topic's words in descending order
        of loading.
    """
    n_topics = components.shape[1]
    # Clamp so a vocabulary smaller than n_top_words (or a non-positive
    # request) cannot produce an index/value length mismatch.
    n_keep = max(0, min(n_top_words, components.shape[0]))

    labels = []
    for column in range(n_topics):
        # Sort the column so that highest loadings are at the top, then keep
        # the first n_keep of them.
        chosen = components.iloc[:, column].sort_values(ascending=False).iloc[:n_keep]
        # Combine word and rounded loading into a single string.
        loadings = chosen.round(2).map(str)
        labels.extend(
            f"{word} {loading}" for word, loading in zip(chosen.index, loadings)
        )

    # Build the Series in one go with an explicit object dtype rather than
    # creating an untyped empty Series and filling it label by label.
    index = np.repeat(np.arange(n_topics), n_keep)
    return pd.Series(labels, index=index, dtype=object)

# Number of words to look at for each topic.
# Passed as the n_top_words argument of top_words() in the LSA, LDA and NNMF cells.
n_top_words = 10


In [7]:
# LSA

from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

# Fix the seed so the SVD gives the same topics on every run, matching the
# random_state=0 used for the LDA and NNMF models.
svd = TruncatedSVD(ntopics, random_state=0)

# Reduce the tf-idf matrix to ntopics components, then pass each document's
# component vector through Normalizer.
lsa = make_pipeline(svd, Normalizer(copy=False))
news_lsa = lsa.fit_transform(X)

# Word-by-topic loadings for the LSA solution.
components_lsa = word_topic(X, news_lsa, terms)

# Collects the top words of each method side by side, one column per method.
topwords = pd.DataFrame()
topwords['LSA'] = top_words(components_lsa, n_top_words)


In [9]:
# LDA
from sklearn.decomposition import LatentDirichletAllocation as LDA

# Hyperparameters gathered in one place so they can be read and tuned together.
lda_params = dict(
    n_components=ntopics,
    doc_topic_prior=None,  # None -> scikit-learn default, 1 / n_components
    topic_word_prior=1/ntopics,
    learning_decay=0.7,  # Controls the learning rate in the online learning method.
    learning_offset=10.0,  # Downweights early iterations in online learning.
    max_iter=10,  # Upper bound on passes over the data, even if not converged.
    evaluate_every=-1,  # Do not evaluate perplexity, as it slows training time.
    mean_change_tol=0.001,  # Stopping tolerance for the document-topic update in the E-step.
    max_doc_update_iter=100,  # Cap on E-step document-topic update iterations.
    n_jobs=-1,  # Use all available CPUs to speed up processing time.
    verbose=0,  # Amount of output to give while iterating.
    random_state=0,
)
lda = LDA(**lda_params)

# Document-by-topic matrix for the fitted model.
news_lda = lda.fit_transform(X)

# Word-by-topic loadings, then the top words per topic for the comparison table.
components_lda = word_topic(X, news_lda, terms)
topwords['LDA'] = top_words(components_lda, n_top_words)


In [10]:
# NNMF

from sklearn.decomposition import NMF

# The former `alpha=0.0` argument is omitted on purpose: the keyword was
# removed from NMF in scikit-learn 1.2, and 0.0 (no regularization) was its
# default, so leaving it out gives the same model on old and new releases.
nmf = NMF(
    n_components=ntopics,
    init='nndsvdar',  # how starting values are calculated
    l1_ratio=0.0,  # Sets whether regularization is L2 (0), L1 (1), or a combination (values between 0 and 1)
    max_iter=200,  # when to stop even if the model is not converging (to prevent running forever)
    random_state=0,
    solver='cd',  # Use Coordinate Descent to solve
    tol=0.0001,  # tolerance of the stopping condition
    verbose=0,  # amount of output to give while iterating
)

# Document-by-topic matrix for the fitted model.
news_nmf = nmf.fit_transform(X)

# Word-by-topic loadings, then the top words per topic for the comparison table.
components_nmf = word_topic(X, news_nmf, terms)

topwords['NNMF'] = top_words(components_nmf, n_top_words)

In [11]:
# Show the top words of all three methods side by side, one topic at a time.
topic_header = 'Topic {}:'
for topic_id in range(ntopics):
    print(topic_header.format(topic_id))
    # topwords is indexed by topic number, so .loc selects that topic's rows.
    topic_rows = topwords.loc[topic_id]
    print(topic_rows)

Topic 0:
             LSA              LDA         NNMF
0    like 164.83       sale 18.32  really 3.82
0     know 151.4   condition 15.6    like 3.77
0  people 144.61      offer 12.99    make 3.58
0    think 134.3      price 10.94    know 3.33
0    good 114.38  excellent 10.12    want 3.32
0    time 111.31       email 9.68  people 3.19
0      make 95.0        good 9.47   think 3.13
0     want 90.11      asking 9.28    sure 3.06
0    right 86.04        best 8.56     good 2.8
0   really 83.42        sell 6.91    going 2.8
Topic 1:
              LSA          LDA              NNMF
1    thanks 75.23   like 11.55      thanks 13.63
1   windows 58.76    bike 8.68          mail 5.3
1      card 36.01    good 7.77      advance 4.67
1      mail 35.31    know 7.51         know 3.51
1     drive 32.96    time 7.17      looking 3.09
1      file 30.79    dave 7.08         help 2.72
1  software 30.29  thanks 7.03         info 2.69
1   advance 29.39    mike 5.82  information 2.38
1   program 27.62  sound

LSA 2 is rec.sport.hockey and LSA 13 is sci.space. The other LSA topics are either indistinguishable from others in a similar category or indistinguishable overall. LDA 0 is misc.forsale, LDA 6 is talk.religion.misc, LDA 7 is sci.med, LDA 9 is sci.space, LDA 15 is soc.religion.christian, and LDA 17 is rec.sport.hockey. Overall, the topics were more distinguishable in LDA than in LSA. NNMF 2 is rec.sport.hockey, NNMF 13 is sci.space, NNMF 14 is misc.forsale, NNMF 15 is talk.religion.misc, and NNMF 18 is soc.religion.christian. Some of the groups in NNMF were particularly unclear. Overall, I felt that LDA did the best job of creating topics.