In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import networkx as nx
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
%matplotlib inline

In [2]:
from sklearn.datasets import fetch_20newsgroups

#select the categories with which I'm familiar (so that I could actually evaluate the performance of the algorithms)

categs =['alt.atheism',
         'rec.autos',
         'sci.electronics',
         'sci.med',
         'sci.space',
         'soc.religion.christian',
         'talk.politics.guns',
         'talk.politics.mideast']

In [13]:
#remove headers, footers, quotes to make it fair

news_train = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'), categories = categs)

In [18]:
#creating tf idf matrix
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words = 'english')
news_train_tfidf = vectorizer.fit_transform(news_train.data)

In [23]:
news_train.data[:1]

['\nI attended a colloquium at Goddard last fall where the head of the \noperations section of NASA was talking about what future missions\nwere going to be funded.  I don\'t remember his name or title off hand\nand I have discarded the colloquia announcement. In any case, he was \nasked about that very matter: "Why can\'t we spend a few million more\nto keep instruments that we already have in place going?"\n\nHis responce was that there are only so many $ available to him and\nthe lead time on an instrument like a COBE, Magellan, Hubble, etc\nis 5-10 years minumum.  If he spent all that could be spent on using\ncurrent instruments in the current budget enviroment he would have\nvery little to nothing for future projects.  If he did that, sure\nin the short run the science would be wonderful and he would be popular,\nhowever starting a few years after he had retired he would become\none of the greatest villans ever seen in the space community for not\nfunding the early stages of the n

In [5]:
news_train_tfidf.shape

(4561, 46431)

In [20]:
#get the vocabulary
vocabulary = vectorizer.get_feature_names()
vocabulary

['00',
 '000',
 '0000',
 '00000',
 '000000',
 '000062david42',
 '00014',
 '000152',
 '00041032',
 '0004136',
 '0004246',
 '0004422',
 '00044513',
 '0004847546',
 '0005',
 '00090711',
 '000mi',
 '000miles',
 '000th',
 '001',
 '0010',
 '0012',
 '001319',
 '0014',
 '0018',
 '002',
 '0020',
 '0022',
 '0028',
 '0029',
 '003',
 '0033',
 '0034',
 '004',
 '004021809',
 '004418',
 '005',
 '00500',
 '006',
 '0065',
 '007',
 '0078',
 '008',
 '008561',
 '009',
 '0096b294',
 '0098',
 '00am',
 '00pm',
 '00xkv',
 '01',
 '010',
 '011',
 '0119',
 '012',
 '013',
 '014',
 '015',
 '016',
 '01720',
 '01760',
 '018',
 '01826',
 '01830',
 '018b',
 '0195',
 '02',
 '020359',
 '02115',
 '02118',
 '02138',
 '02139',
 '02160',
 '02173',
 '02178',
 '022',
 '02215',
 '0235',
 '023b',
 '024246',
 '0245',
 '025',
 '025258',
 '0278',
 '02790',
 '02908',
 '03',
 '030',
 '0300',
 '0306',
 '031349',
 '0318',
 '0320',
 '033',
 '033213',
 '0335',
 '034',
 '034101',
 '035',
 '0358',
 '037',
 '0372',
 '03756',
 '04',
 '040',

In [7]:
# Linking words to topics
def word_topic(tfidf,solution, wordlist):
    
    # Loading scores for each word on each topic/component.
    words_by_topic=tfidf.T * solution

    # Linking the loadings to the words in an easy-to-read way.
    components=pd.DataFrame(words_by_topic,index=wordlist)
    
    return components

# Extracts the top N words and their loadings for each topic.
def top_words(components, n_top_words):
    n_topics = range(components.shape[1])
    index= np.repeat(n_topics, n_top_words, axis=0)
    topwords=pd.Series(index=index)
    for column in range(components.shape[1]):
        # Sort the column so that highest loadings are at the top.
        sortedwords=components.iloc[:,column].sort_values(ascending=False)
        # Choose the N highest loadings.
        chosen=sortedwords[:n_top_words]
        # Combine loading and index into a string.
        chosenlist=chosen.index +" "+round(chosen,2).map(str) 
        topwords.loc[column]=chosenlist
    return(topwords)

# Number of words to look at for each topic.
n_top_words = 10
ntopics=8

In [8]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

svd= TruncatedSVD(ntopics)
lsa = make_pipeline(svd, Normalizer(copy=False))
news_train_lsa = lsa.fit_transform(news_train_tfidf)

components_lsa = word_topic(news_train_tfidf, news_train_lsa, vocabulary)

topwords=pd.DataFrame()
topwords['LSA']=top_words(components_lsa, n_top_words)

In [9]:
from sklearn.decomposition import LatentDirichletAllocation as LDA

lda = LDA(n_components=ntopics, 
          doc_topic_prior=None, # Prior = 1/n_documents
          topic_word_prior=1/ntopics,
          learning_decay=0.7, # Convergence rate.
          learning_offset=10.0, # Causes earlier iterations to have less influence on the learning
          max_iter=10, # when to stop even if the model is not converging (to prevent running forever)
          evaluate_every=-1, # Do not evaluate perplexity, as it slows training time.
          mean_change_tol=0.001, # Stop updating the document topic distribution in the E-step when mean change is < tol
          max_doc_update_iter=100, # When to stop updating the document topic distribution in the E-step even if tol is not reached
          n_jobs=1, 
          verbose=0, # amount of output to give while iterating
          random_state=0
         )

news_train_lda = lda.fit_transform(news_train_tfidf) 

components_lda = word_topic(news_train_tfidf, news_train_lda, vocabulary)

topwords['LDA']=top_words(components_lda, n_top_words)

In [10]:
from sklearn.decomposition import NMF

nmf = NMF(alpha=0.0, 
          init='nndsvdar', # how starting value are calculated
          l1_ratio=0.0, # Sets whether regularization is L2 (0), L1 (1), or a combination (values between 0 and 1)
          max_iter=200, # when to stop even if the model is not converging (to prevent running forever)
          n_components=ntopics, 
          random_state=0, 
          solver='cd', # Use Coordinate Descent to solve
          tol=0.0001, # model will stop if tfidf-WH <= tol
          verbose=0 # amount of output to give while iterating
         )
news_train_nmf = nmf.fit_transform(news_train_tfidf) 

components_nmf = word_topic(news_train_tfidf, news_train_nmf, vocabulary)

topwords['NNMF']=top_words(components_nmf, n_top_words)

In [11]:
for topic in range(ntopics):
    print('Topic {}:'.format(topic))
    print(topwords.loc[topic])

Topic 0:
   LSA  LDA NNMF
0  NaN  NaN  NaN
0  NaN  NaN  NaN
0  NaN  NaN  NaN
0  NaN  NaN  NaN
0  NaN  NaN  NaN
0  NaN  NaN  NaN
0  NaN  NaN  NaN
0  NaN  NaN  NaN
0  NaN  NaN  NaN
0  NaN  NaN  NaN
Topic 1:
   LSA  LDA NNMF
1  NaN  NaN  NaN
1  NaN  NaN  NaN
1  NaN  NaN  NaN
1  NaN  NaN  NaN
1  NaN  NaN  NaN
1  NaN  NaN  NaN
1  NaN  NaN  NaN
1  NaN  NaN  NaN
1  NaN  NaN  NaN
1  NaN  NaN  NaN
Topic 2:
   LSA  LDA NNMF
2  NaN  NaN  NaN
2  NaN  NaN  NaN
2  NaN  NaN  NaN
2  NaN  NaN  NaN
2  NaN  NaN  NaN
2  NaN  NaN  NaN
2  NaN  NaN  NaN
2  NaN  NaN  NaN
2  NaN  NaN  NaN
2  NaN  NaN  NaN
Topic 3:
   LSA  LDA NNMF
3  NaN  NaN  NaN
3  NaN  NaN  NaN
3  NaN  NaN  NaN
3  NaN  NaN  NaN
3  NaN  NaN  NaN
3  NaN  NaN  NaN
3  NaN  NaN  NaN
3  NaN  NaN  NaN
3  NaN  NaN  NaN
3  NaN  NaN  NaN
Topic 4:
   LSA  LDA NNMF
4  NaN  NaN  NaN
4  NaN  NaN  NaN
4  NaN  NaN  NaN
4  NaN  NaN  NaN
4  NaN  NaN  NaN
4  NaN  NaN  NaN
4  NaN  NaN  NaN
4  NaN  NaN  NaN
4  NaN  NaN  NaN
4  NaN  NaN  NaN
Topic 5:
   LSA  LDA