In [10]:
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
import nltk
from gensim import corpora, models
from pprint import pprint
from tqdm import tqdm 

In [11]:
data = pd.read_csv('C:/Users/Sidhanth Krishna/Downloads/Datasets/kaggle/abcnews-date-text.csv',error_bad_lines=False)
data_text = data[['headline_text']]
data_text['index'] = data_text.index
documents = data_text
data.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [12]:
# Tokenization is donw using simple_preprocess
# Remove words that have characters below 3 or if they are stopwords
# The resultant words are then lemmatized to change from third person to first person
# Stemming is done to reduce to root word

stemmer = SnowballStemmer('english')

def lem_stem(text):
    lemma = WordNetLemmatizer().lemmatize(text,pos='v')
    lemmatize_stemming = stemmer.stem(lemma)
    return lemmatize_stemming

def preprocess(text):
    result_text = []
    for token in simple_preprocess(text):
        if token not in STOPWORDS and len(token)>3:
            result_text.append(lem_stem(token))
    return result_text

In [13]:
doc_sample = documents[documents['index']==4310].values[0][0]
print(doc_sample)
print(preprocess(doc_sample))

rain helps dampen bushfires
['rain', 'help', 'dampen', 'bushfir']


In [14]:
processed_docs = documents['headline_text'].map(preprocess)
processed_docs.head()

0     [decid, communiti, broadcast, licenc]
1                        [wit, awar, defam]
2    [call, infrastructur, protect, summit]
3               [staff, aust, strike, rise]
4      [strike, affect, australian, travel]
Name: headline_text, dtype: object

In [15]:
# Vectorizing using Bag of words
# Using words based on it's frequency
dictionary = gensim.corpora.Dictionary(processed_docs)
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100)
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
print(bow_corpus[4310])

[(19, 1)]


In [16]:
# Vectorizing using tfidf 
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]


count = 0
for doc in corpus_tfidf:
    pprint(doc)
    count+=1
    print()
    if count>9:
        break


[]

[]

[(0, 1.0)]

[(1, 1.0)]

[(2, 1.0)]

[(3, 1.0)]

[(4, 1.0)]

[]

[(5, 1.0)]

[(6, 1.0)]



In [36]:
# LDA using BOW
LDA_model = gensim.models.LdaMulticore(bow_corpus, 
                                       num_topics=10, 
                                       id2word=dictionary, 
                                       passes=2, workers=2)

for idx,topic in LDA_model.print_topics(-1):
    print('Topic: '+str(idx)+'\nWords: '+str(topic)+'\n')

Topic: 0
Words: 0.022*"home" + 0.019*"live" + 0.015*"peopl" + 0.013*"student" + 0.012*"help" + 0.012*"industri" + 0.011*"farmer" + 0.010*"build" + 0.010*"worker" + 0.010*"research"

Topic: 1
Words: 0.026*"north" + 0.023*"market" + 0.016*"share" + 0.015*"lose" + 0.013*"bank" + 0.013*"talk" + 0.013*"week" + 0.012*"west" + 0.012*"close" + 0.012*"hobart"

Topic: 2
Words: 0.043*"polic" + 0.025*"charg" + 0.024*"court" + 0.018*"murder" + 0.016*"woman" + 0.014*"die" + 0.014*"death" + 0.014*"alleg" + 0.013*"interview" + 0.013*"jail"

Topic: 3
Words: 0.030*"melbourn" + 0.026*"adelaid" + 0.019*"sydney" + 0.018*"hospit" + 0.017*"donald" + 0.014*"concern" + 0.013*"flood" + 0.012*"guilti" + 0.011*"senat" + 0.010*"question"

Topic: 4
Words: 0.049*"australia" + 0.031*"trump" + 0.023*"world" + 0.018*"open" + 0.018*"canberra" + 0.016*"women" + 0.015*"final" + 0.013*"win" + 0.013*"australian" + 0.011*"leagu"

Topic: 5
Words: 0.024*"govern" + 0.021*"queensland" + 0.018*"plan" + 0.018*"elect" + 0.016*"chan

In [50]:
# Evaluation of LDA using BOW
print(str(processed_docs[4310])+'\n')

for index,score in sorted(LDA_model[bow_corpus[4310]], key=lambda tup:-1*tup[1]):
    print('Score: '+str(score)+'\nTopic'+str(LDA_model.print_topic(index,5))+'\n')

['rain', 'help', 'dampen', 'bushfir']

Score: 0.42031497
Topic0.030*"melbourn" + 0.026*"adelaid" + 0.019*"sydney" + 0.018*"hospit" + 0.017*"donald"

Score: 0.22015883
Topic0.028*"south" + 0.022*"test" + 0.018*"tasmania" + 0.014*"fall" + 0.014*"busi"

Score: 0.2193984
Topic0.026*"north" + 0.023*"market" + 0.016*"share" + 0.015*"lose" + 0.013*"bank"

Score: 0.020026032
Topic0.022*"home" + 0.019*"live" + 0.015*"peopl" + 0.013*"student" + 0.012*"help"

Score: 0.02001862
Topic0.024*"govern" + 0.021*"queensland" + 0.018*"plan" + 0.018*"elect" + 0.016*"chang"

Score: 0.020016627
Topic0.043*"polic" + 0.025*"charg" + 0.024*"court" + 0.018*"murder" + 0.016*"woman"

Score: 0.020016627
Topic0.049*"australia" + 0.031*"trump" + 0.023*"world" + 0.018*"open" + 0.018*"canberra"

Score: 0.020016627
Topic0.026*"coast" + 0.024*"countri" + 0.019*"hour" + 0.018*"time" + 0.017*"price"

Score: 0.020016627
Topic0.027*"report" + 0.023*"warn" + 0.018*"tasmanian" + 0.017*"power" + 0.014*"victoria"

Score: 0.02001

In [38]:
# LDA using tfidf
LDA_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf,
                                             num_topics=10,
                                             id2word=dictionary,
                                             passes=2, workers=2)
for idx,topic in LDA_model_tfidf.print_topics(-1):
    print('Topic: '+str(idx)+'\nWords: '+str(topic)+'\n')

Topic: 0
Words: 0.018*"polic" + 0.016*"charg" + 0.015*"murder" + 0.012*"alleg" + 0.012*"woman" + 0.011*"court" + 0.011*"death" + 0.010*"shoot" + 0.009*"arrest" + 0.008*"jail"

Topic: 1
Words: 0.019*"live" + 0.017*"turnbul" + 0.012*"victorian" + 0.012*"hobart" + 0.010*"malcolm" + 0.009*"weather" + 0.008*"northern" + 0.007*"islam" + 0.007*"explain" + 0.006*"blog"

Topic: 2
Words: 0.013*"rural" + 0.011*"govern" + 0.007*"budget" + 0.006*"fund" + 0.006*"share" + 0.006*"market" + 0.006*"council" + 0.006*"region" + 0.006*"dollar" + 0.005*"say"

Topic: 3
Words: 0.012*"drum" + 0.012*"news" + 0.010*"royal" + 0.008*"final" + 0.008*"australia" + 0.008*"commiss" + 0.008*"world" + 0.007*"australian" + 0.007*"open" + 0.006*"win"

Topic: 4
Words: 0.038*"trump" + 0.018*"donald" + 0.011*"christma" + 0.011*"marriag" + 0.009*"energi" + 0.008*"sexual" + 0.008*"stori" + 0.008*"speak" + 0.008*"south" + 0.007*"plead"

Topic: 5
Words: 0.015*"tasmania" + 0.012*"victoria" + 0.010*"street" + 0.008*"wall" + 0.007*

In [49]:
# Evaluation of LDA using tf-idf
print(str(processed_docs[4310])+'\n')

for index,score in sorted(LDA_model_tfidf[bow_corpus[4310]], key=lambda tup:-1*tup[1]):
    print('Score: '+str(score)+'\nTopic'+str(LDA_model_tfidf.print_topic(index,5))+'\n')

['rain', 'help', 'dampen', 'bushfir']

Score: 0.58291334
Topic0.030*"queensland" + 0.010*"elect" + 0.008*"mental" + 0.008*"juli" + 0.008*"presid"

Score: 0.25702325
Topic0.014*"interview" + 0.010*"korea" + 0.009*"north" + 0.008*"celebr" + 0.008*"ash"

Score: 0.020012975
Topic0.015*"tasmania" + 0.012*"victoria" + 0.010*"street" + 0.008*"wall" + 0.007*"price"

Score: 0.020009546
Topic0.013*"rural" + 0.011*"govern" + 0.007*"budget" + 0.006*"fund" + 0.006*"share"

Score: 0.020008529
Topic0.019*"live" + 0.017*"turnbul" + 0.012*"victorian" + 0.012*"hobart" + 0.010*"malcolm"

Score: 0.020007392
Topic0.018*"polic" + 0.016*"charg" + 0.015*"murder" + 0.012*"alleg" + 0.012*"woman"

Score: 0.020006895
Topic0.021*"countri" + 0.019*"hour" + 0.013*"crash" + 0.009*"die" + 0.009*"leagu"

Score: 0.02000666
Topic0.010*"john" + 0.009*"farm" + 0.009*"sport" + 0.008*"michael" + 0.008*"climat"

Score: 0.02000629
Topic0.038*"trump" + 0.018*"donald" + 0.011*"christma" + 0.011*"marriag" + 0.009*"energi"

Score:

In [48]:
# Testing on unseen document
unseen_doc = 'How a Pentagon deal became an identitiy crisis for Google'
bow_vector = dictionary.doc2bow(preprocess(unseen_doc))

for index,score in sorted(LDA_model[bow_vector],
                          key=lambda tup:-1*tup[1]):
    print('Score: '+str(score)+'\nTopic'+str(LDA_model.print_topic(index,5))+'\n')

Score: 0.6202708
Topic0.026*"north" + 0.023*"market" + 0.016*"share" + 0.015*"lose" + 0.013*"bank"

Score: 0.21963663
Topic0.027*"kill" + 0.018*"dead" + 0.018*"attack" + 0.017*"turnbul" + 0.016*"leav"

Score: 0.020013858
Topic0.024*"govern" + 0.021*"queensland" + 0.018*"plan" + 0.018*"elect" + 0.016*"chang"

Score: 0.02001239
Topic0.028*"south" + 0.022*"test" + 0.018*"tasmania" + 0.014*"fall" + 0.014*"busi"

Score: 0.020011669
Topic0.027*"report" + 0.023*"warn" + 0.018*"tasmanian" + 0.017*"power" + 0.014*"victoria"

Score: 0.020011362
Topic0.030*"melbourn" + 0.026*"adelaid" + 0.019*"sydney" + 0.018*"hospit" + 0.017*"donald"

Score: 0.02001136
Topic0.022*"home" + 0.019*"live" + 0.015*"peopl" + 0.013*"student" + 0.012*"help"

Score: 0.02001064
Topic0.026*"coast" + 0.024*"countri" + 0.019*"hour" + 0.018*"time" + 0.017*"price"

Score: 0.020010639
Topic0.043*"polic" + 0.025*"charg" + 0.024*"court" + 0.018*"murder" + 0.016*"woman"

Score: 0.020010639
Topic0.049*"australia" + 0.031*"trump" + 