# Data imports

We import pandas, numpy and scipy for data structures. We use gensim for LDA and sklearn for NMF.

In [1]:
import pandas as pd;
import numpy as np;
import scipy as sp;
import sklearn;
import sys;
from nltk.corpus import stopwords;
import nltk;
from gensim.models import ldamodel
import gensim.corpora;
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer;
from sklearn.decomposition import NMF;
from sklearn.preprocessing import normalize;
import pickle;

Using TensorFlow backend.


# Loading the data

We are using the ABC News headlines dataset. Some lines are badly formatted (very few), so we are skipping those.

In [2]:
# Location of the ABC News headlines CSV, relative to the notebook.
DATA_PATH = '../input/abcnews-date-text.csv'

# Skip malformed rows but still report them. `error_bad_lines` was deprecated in
# pandas 1.3 and removed in 2.0; `on_bad_lines='warn'` is its replacement
# (skip the row, emit a warning).
try:
    data = pd.read_csv(DATA_PATH, on_bad_lines='warn')
except TypeError:
    # pandas older than 1.3 does not know `on_bad_lines`; use the legacy flag.
    data = pd.read_csv(DATA_PATH, error_bad_lines=False)

b'Skipping line 637987: expected 2 fields, saw 3\nSkipping line 672395: expected 2 fields, saw 3\nSkipping line 697406: expected 2 fields, saw 3\nSkipping line 724169: expected 2 fields, saw 3\nSkipping line 738471: expected 2 fields, saw 3\nSkipping line 753796: expected 2 fields, saw 3\nSkipping line 759008: expected 2 fields, saw 3\nSkipping line 761317: expected 2 fields, saw 3\nSkipping line 761491: expected 2 fields, saw 3\nSkipping line 761778: expected 2 fields, saw 3\nSkipping line 763261: expected 2 fields, saw 3\nSkipping line 766836: expected 2 fields, saw 3\nSkipping line 767743: expected 2 fields, saw 3\nSkipping line 768084: expected 2 fields, saw 3\nSkipping line 770979: expected 2 fields, saw 3\nSkipping line 778212: expected 2 fields, saw 3\nSkipping line 781216: expected 2 fields, saw 3\nSkipping line 782529: expected 2 fields, saw 3\nSkipping line 784936: expected 2 fields, saw 3\nSkipping line 785692: expected 2 fields, saw 3\n'
b'Skipping line 787820: expected 2 f

In [3]:
# Only the headline text is used from here on; keep it as a one-column DataFrame.
data_text = data.loc[:, ['headline_text']]

In [4]:
# Running on the full dataset is too slow on Kaggle, so work on a random subset.
# The seed is fixed first so the same rows are picked on every run.
np.random.seed(1024)
# np.random.choice draws with replacement by default, so a row position may repeat.
row_positions = np.random.choice(len(data_text), 10000)
data_text = data_text.iloc[row_positions]

We need to remove stopwords first. Casting all values to strings will make them easier to iterate over and split into words.

In [5]:
# Make sure every headline is a string so it can be split into words below.
data_text = data_text.astype(str)

In [6]:
# Build the stop-word lookup once, as a set. The previous version evaluated
# stopwords.words() once per token inside the comprehension and then searched the
# resulting list, which is what made this cell so slow.
# NOTE(review): stopwords.words() without a language argument is kept as-is here;
# stopwords.words('english') may be what was intended for English headlines - confirm.
stop_words = set(stopwords.words())

total = len(data_text)
tokenized_headlines = []
for idx, headline in enumerate(data_text['headline_text']):

    #split each headline on single spaces and keep only the words that are not stop words.
    tokenized_headlines.append([word for word in headline.split(' ') if word not in stop_words])

    #print logs to monitor output
    if idx % 1000 == 0:
        sys.stdout.write('\rc = ' + str(idx) + ' / ' + str(total))

# Replace the column in a single assignment. The chained form
# data_text.iloc[idx]['headline_text'] = ... writes into a temporary row object,
# which pandas does not guarantee to propagate back into data_text.
# The Series reuses data_text's own index, so values are matched row for row.
data_text['headline_text'] = pd.Series(tokenized_headlines, index=data_text.index)

c = 9000 / 10000

In [7]:
#save data because it takes very long to remove stop words
# The with-block closes (and flushes) the file even if pickling fails; the bare
# open(...) used before never closed the handle.
with open('data_text.dat', 'wb') as cache_file:
    pickle.dump(data_text, cache_file)

In [8]:
# Pull the per-headline word lists out of the single column as a plain Python list (LDA input).
train_headlines = data_text['headline_text'].tolist()

In [9]:
# Both the LDA and the NMF model below are fitted with this many topics.
num_topics = 10

# LDA

We will use the gensim library for LDA. First, we build an id-to-word dictionary. Then, for each headline, we use the dictionary to obtain a mapping from each word id to its count in that headline. The LDA model uses both of these mappings.

In [10]:
# Build the id-to-word dictionary from the tokenized headlines.
id2word = gensim.corpora.Dictionary(train_headlines)

In [11]:
# Turn every tokenized headline into its bag-of-words representation via the dictionary.
corpus = list(map(id2word.doc2bow, train_headlines))

In [12]:
# Fit LDA on the bag-of-words corpus with num_topics topics; id2word is passed so
# topics can be reported as words. No other hyperparameters are set here.
lda = ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics)

# generating LDA topics

We will iterate over the number of topics, get the top words in each cluster and add them to a dataframe.

In [13]:
def get_lda_topics(model, num_topics, topn=20):
    """Collect the top words of each LDA topic into a DataFrame.

    Parameters
    ----------
    model
        Fitted topic model exposing ``show_topic(topic_id, topn=...)``, whose
        return value is a sequence of entries with the word at position 0.
    num_topics : int
        Number of topics to read; topic ids ``0 .. num_topics - 1`` are queried.
    topn : int, default 20
        How many words to request per topic (previously hard-coded to 20).

    Returns
    -------
    pandas.DataFrame
        One column per topic, named ``'Topic # 01'``, ``'Topic # 02'``, ...;
        rows follow the order in which ``show_topic`` returns the words.
    """
    word_dict = {}
    for topic_idx in range(num_topics):
        top_terms = model.show_topic(topic_idx, topn=topn)
        # Column labels are 1-based and zero-padded so they sort naturally.
        column = 'Topic # ' + '{:02d}'.format(topic_idx + 1)
        word_dict[column] = [entry[0] for entry in top_terms]
    return pd.DataFrame(word_dict)

In [14]:
# Display the top words of every LDA topic, one column per topic.
get_lda_topics(lda, num_topics)

Unnamed: 0,Topic # 01,Topic # 02,Topic # 03,Topic # 04,Topic # 05,Topic # 06,Topic # 07,Topic # 08,Topic # 09,Topic # 10
0,new,dead,interview,health,police,new,us,,plans,election
1,backs,murder,community,fire,win,business,high,court,probe,mayor
2,driver,accused,death,police,gets,woman,claims,new,could,case
3,qld,abc,post,day,new,hospital,funding,west,council,budget
4,jobs,bank,ban,water,missing,health,nt,children,report,new
5,coal,charges,calls,rural,found,car,nsw,killed,title,bail
6,charged,cup,hobart,sex,body,sydney,australian,seek,lead,china
7,accident,takes,years,help,crash,minister,govt,seeks,boost,court
8,open,jailed,act,three,get,news,hits,storm,state,wa
9,hopes,blaze,research,second,boost,inquest,drought,flood,wa,ahead


# NMF

For NMF, we need to obtain a design matrix. To improve results, I am going to apply TfIdf transformation to the counts.

In [15]:
# CountVectorizer works on strings rather than word lists, so glue each
# headline's words back together with single spaces.
train_headlines_sentences = list(map(' '.join, train_headlines))

In [16]:
#obtain a Counts design matrix. Because the size of the matrix will be large, we can set the max_features to 5000.
# fit_transform fits the vectorizer on the joined headlines and returns their
# count matrix in one call; the fitted vectorizer is reused later to map
# feature ids back to words.
vectorizer = CountVectorizer(analyzer='word', max_features=5000);
x_counts = vectorizer.fit_transform(train_headlines_sentences);

In [17]:
#set a TfIdf transformer, and transform the counts with the model.
# smooth_idf=False is the only option set explicitly; everything else keeps the library default.
transformer = TfidfTransformer(smooth_idf=False);
x_tfidf = transformer.fit_transform(x_counts);

In [18]:
#L1-normalize the TfIdf values row by row (axis=1), so the absolute values in each row sum to 1.
#Note: this is not unit Euclidean length; that would be norm='l2'.
xtfidf_norm = normalize(x_tfidf, norm='l1', axis=1)

In [19]:
#obtain a NMF model.
# One component per topic (num_topics), initialised with the 'nndsvd' scheme.
model = NMF(n_components=num_topics, init='nndsvd');

In [20]:
#fit the model
# Fit on the L1-normalized TfIdf matrix; the per-topic word weights are read
# from model.components_ afterwards.
model.fit(xtfidf_norm)

NMF(alpha=0.0, beta_loss='frobenius', init='nndsvd', l1_ratio=0.0,
  max_iter=200, n_components=10, random_state=None, shuffle=False,
  solver='cd', tol=0.0001, verbose=0)

In [21]:
def get_nmf_topics(model, n_top_words, feature_names=None):
    """Collect the highest-weighted words of each NMF topic into a DataFrame.

    Parameters
    ----------
    model
        Fitted model exposing ``components_``: one row of word weights per topic.
    n_top_words : int
        Number of words to keep per topic. (This argument used to be ignored
        in favour of a hard-coded 20.)
    feature_names : sequence of str, optional
        Word for each column index of ``components_``. When omitted, the names
        are taken from the notebook-level ``vectorizer``.

    Returns
    -------
    pandas.DataFrame
        One column per topic, named ``'Topic # 01'``, ``'Topic # 02'``, ...;
        rows are ordered from the largest weight downwards.
    """
    if feature_names is None:
        #the word ids obtained need to be reverse-mapped to the words so we can print the topic names.
        # Newer scikit-learn only offers get_feature_names_out(); older releases
        # only offer get_feature_names(). Support both.
        if hasattr(vectorizer, 'get_feature_names_out'):
            feature_names = vectorizer.get_feature_names_out()
        else:
            feature_names = vectorizer.get_feature_names()

    word_dict = {}
    # Iterate over the model's own topics instead of the global num_topics, so the
    # function stays correct for a model fitted with a different topic count.
    for topic_idx, weights in enumerate(model.components_):

        #for each topic, obtain the largest values, and add the words they map to into the dictionary.
        top_ids = weights.argsort()[::-1][:n_top_words]
        column = 'Topic # ' + '{:02d}'.format(topic_idx + 1)
        word_dict[column] = [feature_names[word_id] for word_id in top_ids]

    return pd.DataFrame(word_dict)

In [22]:
# Display the top words of every NMF topic, one column per topic.
get_nmf_topics(model, 20)

Unnamed: 0,Topic # 01,Topic # 02,Topic # 03,Topic # 04,Topic # 05,Topic # 06,Topic # 07,Topic # 08,Topic # 09,Topic # 10
0,interview,seconds,police,new,rural,abc,death,water,closer,charged
1,john,90,investigate,hospital,qld,sport,investigate,says,am1,court
2,matthew,business,missing,gets,podcast,entertainment,woman,council,news,murder
3,michael,news,crash,drug,national,speaks,fire,plan,am2,assault
4,andrew,weather,search,zealand,news,news,toll,us,pm,accused
5,tim,sport,hunt,ceo,nsw,market,coroner,australia,pm1,drug
6,jack,confidence,probe,deal,beef,weather,suspicious,govt,step,trial
7,scott,market,arrest,pacific,drought,analysis,shooting,health,basslink,charge
8,matt,ninety,dead,england,doctors,business,investigation,fire,inquiry,charges
9,ivan,analysis,car,capital,reporter,peter,hospital,call,national,alleged
