# Data imports

We import pandas, numpy and scipy for data structures. We use gensim for LDA, and sklearn for NMF.

In [1]:
import pandas as pd;
import numpy as np;
import scipy as sp;
import sklearn;
import sys;
from nltk.corpus import stopwords;
import nltk;
from gensim.models import ldamodel
import gensim.corpora;
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer;
from sklearn.decomposition import NMF;
from sklearn.preprocessing import normalize;
import pickle;

# Loading the data

We are using the ABC News headlines dataset. Some lines are badly formatted (very few), so we are skipping those.

In [2]:
# Load the ABC News headlines, skipping the few malformed rows instead of aborting.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in pandas 2.0;
# `on_bad_lines='skip'` is its replacement. Older pandas versions reject the new
# keyword with a TypeError, so fall back to the legacy spelling there.
try:
    data = pd.read_csv('../input/abcnews-date-text.csv', on_bad_lines='skip')
except TypeError:
    data = pd.read_csv('../input/abcnews-date-text.csv', error_bad_lines=False)

In [3]:
# Keep only the 'headline_text' column. Selecting with a one-element list
# yields a single-column DataFrame rather than a Series.
data_text = data.loc[:, ['headline_text']]

In [None]:
# The full dataset takes too long to process on Kaggle, so work on a random subsample.
# Seed numpy's global RNG so the same rows are drawn on every run.
np.random.seed(1024)
# Draw row positions WITHOUT replacement: np.random.choice defaults to replace=True,
# which would put duplicate headlines into the subsample. The size is capped at the
# number of available rows so the draw cannot fail on a smaller input.
sample_size = min(10000, len(data_text))
data_text = data_text.iloc[np.random.choice(len(data_text), sample_size, replace=False)]

We need to remove stopwords first. Casting all values to strings first guarantees that every headline can be split into words.

In [21]:
# Make sure every cell holds text so that it can be split into words afterwards.
data_text = data_text.astype(str)

In [22]:
# Build the stop-word lookup ONCE. Calling stopwords.words() inside the comprehension
# re-reads the NLTK word lists for every single word, which is what made this step so slow.
# A set also gives constant-time membership tests. As before, stopwords.words() without
# an argument returns the stop words of every language NLTK ships.
stop_words = set(stopwords.words())

total_rows = len(data_text)
tokenized_headlines = []
for idx, headline in enumerate(data_text['headline_text']):

    # split the headline on single spaces and keep only the non-stop-words
    tokenized_headlines.append([word for word in headline.split(' ') if word not in stop_words])

    # print logs to monitor progress
    if idx % 1000 == 0:
        sys.stdout.write('\rc = ' + str(idx) + ' / ' + str(total_rows))

# Replace the column in a single assignment. The previous chained form
# `data_text.iloc[idx]['headline_text'] = ...` writes to a temporary row object and is
# not guaranteed to update the DataFrame itself.
data_text = pd.DataFrame({'headline_text': tokenized_headlines}, index=data_text.index)

In [105]:
# Save the tokenized headlines because stop-word removal is expensive to repeat.
# The context manager makes sure the file is flushed and closed after writing.
with open('data_text.dat', 'wb') as cache_file:
    pickle.dump(data_text, cache_file)

In [24]:
# Pull the first (and only) column out as a plain Python list, one entry per headline,
# which is the input format used for LDA below.
train_headlines = data_text.iloc[:, 0].tolist()

In [26]:
#number of topics we will cluster for: 10
#this value is passed to the LDA model, the NMF model and the topic-table helpers.
num_topics = 10;

# LDA

We will use the gensim library for LDA. First, we obtain an id-to-word dictionary. Then, for each headline, we use the dictionary to obtain a mapping from word ids to their word counts (a bag of words). The LDA model uses both of these mappings.

In [27]:
#build the gensim Dictionary from the tokenized headlines; it is passed to LdaModel as id2word.
id2word = gensim.corpora.Dictionary(train_headlines)

In [28]:
# Convert every tokenized headline into its bag-of-words representation via the dictionary.
corpus = list(map(id2word.doc2bow, train_headlines))

In [30]:
# Train the LDA model on the bag-of-words corpus.
# random_state pins the model's random initialisation so the topics are the same on
# every run, regardless of the order in which cells were executed or whether numpy's
# global RNG was seeded beforehand.
lda = ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics, random_state=1024)

# generating LDA topics

We will iterate over the number of topics, get the top words in each cluster and add them to a dataframe.

In [31]:
def get_lda_topics(model, num_topics, n_top_words=20):
    """Return a DataFrame with the top words of each LDA topic.

    Parameters
    ----------
    model : object exposing ``show_topic(topic_id, topn=...)``
        The call must return a sequence of ``(word, weight)`` pairs.
    num_topics : int
        Number of topics to list; topics ``0 .. num_topics - 1`` are queried.
    n_top_words : int, default 20
        How many words to list per topic. The default keeps the previous
        hard-coded behaviour.

    Returns
    -------
    pandas.DataFrame
        One column per topic, named ``'Topic # 01'``, ``'Topic # 02'``, ...,
        each holding that topic's words in the order the model returned them.
    """
    word_dict = {}
    for topic_idx in range(num_topics):
        top_terms = model.show_topic(topic_idx, topn=n_top_words)
        # keep only the word from each (word, weight) pair
        column_name = 'Topic # ' + '{:02d}'.format(topic_idx + 1)
        word_dict[column_name] = [pair[0] for pair in top_terms]
    return pd.DataFrame(word_dict)

In [33]:
#display the table of top words, one column per LDA topic.
get_lda_topics(lda, num_topics)

# NMF

For NMF, we need to obtain a design matrix. To improve results, I am going to apply TfIdf transformation to the counts.

In [34]:
# CountVectorizer expects one string per document rather than a token list,
# so glue each headline's tokens back together with single spaces.
train_headlines_sentences = list(map(' '.join, train_headlines))

In [35]:
#obtain a counts design matrix (rows = headlines, columns = vocabulary words).
#because the full vocabulary would make the matrix very large, max_features caps it at 5000 words.
#the fitted vectorizer is kept because get_nmf_topics uses it to map column ids back to words.
vectorizer = CountVectorizer(analyzer='word', max_features=5000);
x_counts = vectorizer.fit_transform(train_headlines_sentences);

In [36]:
#set up a TF-IDF transformer (with idf smoothing switched off) and re-weight the raw counts with it.
transformer = TfidfTransformer(smooth_idf=False);
x_tfidf = transformer.fit_transform(x_counts);

In [37]:
#re-scale each row (axis=1) of the TF-IDF matrix with the L1 norm, i.e. so that the
#absolute values in every row sum to 1. Note this is the L1 norm, not Euclidean unit length.
xtfidf_norm = normalize(x_tfidf, norm='l1', axis=1)

In [38]:
#create the NMF model with one component per topic, using the 'nndsvd' initialisation.
#the model is only constructed here; fitting happens in the next cell.
model = NMF(n_components=num_topics, init='nndsvd');

In [39]:
#fit the NMF model on the L1-normalized TF-IDF matrix; afterwards model.components_
#holds one row of word weights per topic (read by get_nmf_topics).
model.fit(xtfidf_norm)

In [40]:
def get_nmf_topics(model, n_top_words):
    """Return a DataFrame with the highest-weighted words of each NMF topic.

    Parameters
    ----------
    model : fitted model exposing ``components_``
        2-D array with one row of word weights per topic.
    n_top_words : int
        Number of words to list per topic. Previously this argument was
        ignored and 20 was always used.

    Returns
    -------
    pandas.DataFrame
        One column per topic, named ``'Topic # 01'``, ``'Topic # 02'``, ...,
        with words ordered from highest to lowest weight.

    Notes
    -----
    Relies on the module-level ``vectorizer`` (the fitted CountVectorizer) to
    map column indices back to words.
    """
    # the word ids obtained need to be reverse-mapped to the words so we can print the topic names.
    # get_feature_names() was removed in scikit-learn 1.2; prefer its replacement when available.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feat_names = vectorizer.get_feature_names_out()
    else:
        feat_names = vectorizer.get_feature_names()

    word_dict = {}
    # iterate over the topics the model actually has instead of a global topic count
    for topic_idx, topic_weights in enumerate(model.components_):

        # argsort is ascending, so walk backwards from the end to take the
        # n_top_words largest weights in descending order.
        top_word_ids = topic_weights.argsort()[:-n_top_words - 1:-1]
        column_name = 'Topic # ' + '{:02d}'.format(topic_idx + 1)
        word_dict[column_name] = [feat_names[word_id] for word_id in top_word_ids]

    return pd.DataFrame(word_dict)

In [41]:
#display the table of the top 20 words, one column per NMF topic.
get_nmf_topics(model, 20)