In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['abcnews-date-text.csv']


In [2]:
data = pd.read_csv("../input/abcnews-date-text.csv", error_bad_lines=False)

In [3]:
data.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [7]:
text = data[['headline_text']]

In [8]:
text.head()

Unnamed: 0,headline_text
0,aba decides against community broadcasting lic...
1,act fire witnesses must be aware of defamation
2,a g calls for infrastructure protection summit
3,air nz staff in aust strike for pay rise
4,air nz strike to affect australian travellers


In [9]:
text['index'] = text.index

In [10]:
text.head()

Unnamed: 0,headline_text,index
0,aba decides against community broadcasting lic...,0
1,act fire witnesses must be aware of defamation,1
2,a g calls for infrastructure protection summit,2
3,air nz staff in aust strike for pay rise,3
4,air nz strike to affect australian travellers,4


In [11]:
documents = text

In [12]:
documents.head()

Unnamed: 0,headline_text,index
0,aba decides against community broadcasting lic...,0
1,act fire witnesses must be aware of defamation,1
2,a g calls for infrastructure protection summit,2
3,air nz staff in aust strike for pay rise,3
4,air nz strike to affect australian travellers,4


In [13]:
print("Total length of the documents: {}".format(len(documents)))

Total length of the documents: 1103663


## Data Pre-Processing

In [15]:
# importing the gensim and nltk libraries

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

import nltk
np.random.seed(42)

In [22]:
def preprocessing(sentence):
    stemmer = SnowballStemmer('english')
    return stemmer.stem(WordNetLemmatizer().lemmatize(sentence, pos='v'))

def preprocess(sentence):
    result = []
    
    for token in gensim.utils.simple_preprocess(sentence):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(preprocessing(token))
            
    return result

In [18]:
sample = documents[documents['index'] == 4310].values[0][0]

print("Sample document is selected for pre-processing: {}".format(sample))

Sample document is selected for pre-processing: rain helps dampen bushfires


In [20]:
words = []

for word in sample.split(' '):
    words.append(word)
    
print("Words found after splitting the sample document: {}".format(words))

Words found after splitting the sample document: ['rain', 'helps', 'dampen', 'bushfires']


In [23]:
print("Tokenized and lemmatized document: {}".format(preprocess(sample)))

Tokenized and lemmatized document: ['rain', 'help', 'dampen', 'bushfir']


In [24]:
# pre-processing all the documents

preprocessed_documents = documents['headline_text'].map(preprocess)

In [25]:
preprocessed_documents[:10]

0            [decid, communiti, broadcast, licenc]
1                               [wit, awar, defam]
2           [call, infrastructur, protect, summit]
3                      [staff, aust, strike, rise]
4             [strike, affect, australian, travel]
5               [ambiti, olsson, win, tripl, jump]
6           [antic, delight, record, break, barca]
7    [aussi, qualifi, stosur, wast, memphi, match]
8            [aust, address, secur, council, iraq]
9                         [australia, lock, timet]
Name: headline_text, dtype: object

In [26]:
# creating a dictionary from the above processed documents

dictionary = gensim.corpora.Dictionary(preprocessed_documents)

In [28]:
count = 0

for key, value in dictionary.iteritems():
    print("Key: {} and Value: {}".format(key, value))
    count += 1
    
    if count > 10:
        break

Key: 0 and Value: broadcast
Key: 1 and Value: communiti
Key: 2 and Value: decid
Key: 3 and Value: licenc
Key: 4 and Value: awar
Key: 5 and Value: defam
Key: 6 and Value: wit
Key: 7 and Value: call
Key: 8 and Value: infrastructur
Key: 9 and Value: protect
Key: 10 and Value: summit


In [29]:
# filter out extreme tokens in the document

dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [30]:
bag_of_words = [dictionary.doc2bow(document=document) for document in preprocessed_documents]

In [31]:
bag_of_words[4310]

[(76, 1), (112, 1), (483, 1), (4014, 1)]

In [35]:
## preview of bag of words of our sample preprocessed document

sample_bag_of_words = bag_of_words[4310]

for i in range(len(sample_bag_of_words)):
    print("Word: {} (\"{}\") appears: {} times.".format(sample_bag_of_words[i][0], dictionary[sample_bag_of_words[i][0]], sample_bag_of_words[i][1]))

Word: 76 ("bushfir") appears: 1 times.
Word: 112 ("help") appears: 1 times.
Word: 483 ("rain") appears: 1 times.
Word: 4014 ("dampen") appears: 1 times.


# TF-IDF

In [38]:
from gensim import corpora, models

tfidf = models.TfidfModel(bag_of_words)
corpus_tfidf = tfidf[bag_of_words]

from pprint import pprint

for document in corpus_tfidf:
    pprint(document)
    break

[(0, 0.5892908867507543),
 (1, 0.38929654337861147),
 (2, 0.4964985175717023),
 (3, 0.5046520327464028)]


# Running LDA using Bag of words 

In [39]:
# training our model using gensim LdaMulticore

model = gensim.models.LdaMulticore(bag_of_words, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [40]:
for index, topic in model.print_topics(-1):
    print("Topic: {} \n Words: {}".format(index, topic))

Topic: 0 
 Words: 0.033*"charg" + 0.031*"court" + 0.020*"face" + 0.020*"murder" + 0.018*"alleg" + 0.017*"brisban" + 0.017*"final" + 0.016*"accus" + 0.015*"child" + 0.013*"trial"
Topic: 1 
 Words: 0.034*"govern" + 0.018*"year" + 0.016*"hour" + 0.016*"elect" + 0.015*"adelaid" + 0.014*"china" + 0.013*"break" + 0.012*"student" + 0.012*"busi" + 0.011*"news"
Topic: 2 
 Words: 0.055*"polic" + 0.026*"death" + 0.020*"perth" + 0.017*"hous" + 0.016*"miss" + 0.015*"donald" + 0.014*"arrest" + 0.013*"investig" + 0.013*"woman" + 0.011*"children"
Topic: 3 
 Words: 0.023*"nation" + 0.021*"home" + 0.019*"countri" + 0.019*"women" + 0.018*"australia" + 0.018*"rural" + 0.016*"world" + 0.016*"rise" + 0.015*"test" + 0.013*"leagu"
Topic: 4 
 Words: 0.020*"hospit" + 0.016*"leav" + 0.015*"deal" + 0.012*"melbourn" + 0.012*"campaign" + 0.011*"john" + 0.011*"adelaid" + 0.011*"season" + 0.010*"lose" + 0.010*"unit"
Topic: 5 
 Words: 0.043*"australia" + 0.021*"market" + 0.017*"drug" + 0.015*"australian" + 0.015*"reco

In [41]:
tfidf_model = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)

for index, topic in tfidf_model.print_topics(-1):
    print("Topic: {}, Words: {}".format(index, topic))

Topic: 0, Words: 0.021*"countri" + 0.020*"hour" + 0.010*"price" + 0.009*"market" + 0.007*"rise" + 0.007*"share" + 0.007*"farm" + 0.007*"cattl" + 0.006*"rat" + 0.006*"monday"
Topic: 1, Words: 0.006*"live" + 0.006*"christma" + 0.006*"abbott" + 0.006*"australia" + 0.006*"fish" + 0.005*"central" + 0.005*"queensland" + 0.005*"histori" + 0.005*"australian" + 0.005*"hunter"
Topic: 2, Words: 0.010*"grandstand" + 0.009*"dead" + 0.008*"korea" + 0.008*"climat" + 0.008*"kill" + 0.008*"novemb" + 0.007*"dairi" + 0.007*"june" + 0.006*"islam" + 0.006*"syria"
Topic: 3, Words: 0.018*"polic" + 0.017*"charg" + 0.014*"murder" + 0.013*"crash" + 0.012*"woman" + 0.012*"court" + 0.010*"alleg" + 0.010*"death" + 0.009*"jail" + 0.008*"assault"
Topic: 4, Words: 0.026*"rural" + 0.011*"turnbul" + 0.011*"donald" + 0.008*"juli" + 0.008*"health" + 0.007*"wednesday" + 0.007*"mental" + 0.007*"thursday" + 0.007*"care" + 0.007*"august"
Topic: 5, Words: 0.011*"weather" + 0.008*"bushfir" + 0.008*"michael" + 0.007*"flood" + 0

## Evaluation by classifing simple document using LDA Bag of words model

In [42]:
preprocessed_documents[4310]

['rain', 'help', 'dampen', 'bushfir']

In [43]:
for index, score in sorted(tfidf_model[bag_of_words[4310]], key=lambda tup: -1 * tup[1]):
    print("\nScore: {} \t \nTopic: {}".format(score, tfidf_model.print_topics(index, 10)))


Score: 0.5588955879211426 	 
Topic: [(6, '0.027*"trump" + 0.014*"drum" + 0.009*"celebr" + 0.009*"friday" + 0.008*"tuesday" + 0.007*"post" + 0.007*"april" + 0.006*"scott" + 0.006*"presid" + 0.006*"wall"'), (5, '0.011*"weather" + 0.008*"bushfir" + 0.008*"michael" + 0.007*"flood" + 0.006*"fire" + 0.006*"burn" + 0.006*"cyclon" + 0.006*"jam" + 0.006*"home" + 0.005*"quiz"'), (9, '0.010*"govern" + 0.010*"elect" + 0.007*"fund" + 0.007*"labor" + 0.007*"sport" + 0.006*"council" + 0.006*"say" + 0.006*"budget" + 0.006*"liber" + 0.006*"feder"'), (7, '0.018*"news" + 0.009*"busi" + 0.008*"octob" + 0.007*"peter" + 0.006*"zealand" + 0.006*"right" + 0.006*"anim" + 0.005*"know" + 0.005*"human" + 0.005*"breakfast"'), (3, '0.018*"polic" + 0.017*"charg" + 0.014*"murder" + 0.013*"crash" + 0.012*"woman" + 0.012*"court" + 0.010*"alleg" + 0.010*"death" + 0.009*"jail" + 0.008*"assault"')]

Score: 0.2810365855693817 	 
Topic: [(4, '0.026*"rural" + 0.011*"turnbul" + 0.011*"donald" + 0.008*"juli" + 0.008*"health" 

### Testing on unseen document

In [44]:
test_document = "How a Pentgon deal became an identity crisis for Google"

bag_of_words_vector = dictionary.doc2bow(preprocess(test_document))

In [47]:
for index, score in sorted(tfidf_model[bag_of_words_vector], key=lambda tup: -1 * tup[1]):
    print("Score: {} \t Topic: {}\n".format(score, tfidf_model.print_topics(index, 5)))

Score: 0.5894848108291626 	 Topic: [(4, '0.026*"rural" + 0.011*"turnbul" + 0.011*"donald" + 0.008*"juli" + 0.008*"health"'), (2, '0.010*"grandstand" + 0.009*"dead" + 0.008*"korea" + 0.008*"climat" + 0.008*"kill"'), (9, '0.010*"govern" + 0.010*"elect" + 0.007*"fund" + 0.007*"labor" + 0.007*"sport"'), (7, '0.018*"news" + 0.009*"busi" + 0.008*"octob" + 0.007*"peter" + 0.006*"zealand"'), (3, '0.018*"polic" + 0.017*"charg" + 0.014*"murder" + 0.013*"crash" + 0.012*"woman"'), (6, '0.027*"trump" + 0.014*"drum" + 0.009*"celebr" + 0.009*"friday" + 0.008*"tuesday"'), (8, '0.018*"interview" + 0.010*"final" + 0.009*"podcast" + 0.009*"leagu" + 0.008*"world"'), (5, '0.011*"weather" + 0.008*"bushfir" + 0.008*"michael" + 0.007*"flood" + 0.006*"fire"'), (0, '0.021*"countri" + 0.020*"hour" + 0.010*"price" + 0.009*"market" + 0.007*"rise"'), (1, '0.006*"live" + 0.006*"christma" + 0.006*"abbott" + 0.006*"australia" + 0.006*"fish"')]

Score: 0.2504499852657318 	 Topic: [(1, '0.006*"live" + 0.006*"christma" +