In [1]:
import pandas as pd
data = pd.read_csv('abcnews-date-text.csv', error_bad_lines=False);
data_text = data[['headline_text']]
data_text['index'] = data_text.index
documents = data_text

In [2]:
print(len(documents))

1186018


In [3]:
print(documents)

                                             headline_text    index
0        aba decides against community broadcasting lic...        0
1           act fire witnesses must be aware of defamation        1
2           a g calls for infrastructure protection summit        2
3                 air nz staff in aust strike for pay rise        3
4            air nz strike to affect australian travellers        4
...                                                    ...      ...
1186013  vision of flames approaching corryong in victoria  1186013
1186014  wa police and government backflip on drug amne...  1186014
1186015  we have fears for their safety: victorian premier  1186015
1186016                              when do the 20s start  1186016
1186017  yarraville shooting woman dead man critically ...  1186017

[1186018 rows x 2 columns]


In [4]:
#Data_PreProcessing_Starts_Here
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)


import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
#Lemmitization Example
print(WordNetLemmatizer().lemmatize('went', pos='v'))

go


In [6]:
#Stemming Example
stemmer = SnowballStemmer('english')
original_words = ['caresses', 'flies', 'dies', 'mules', 'denied','died', 'agreed', 'owned', 
           'humbled', 'sized','meeting', 'stating', 'siezing', 'itemization','sensational', 
           'traditional', 'reference', 'colonizer','plotted']
singles = [stemmer.stem(plural) for plural in original_words]
pd.DataFrame(data = {'original word': original_words, 'stemmed': singles})

Unnamed: 0,original word,stemmed
0,caresses,caress
1,flies,fli
2,dies,die
3,mules,mule
4,denied,deni
5,died,die
6,agreed,agre
7,owned,own
8,humbled,humbl
9,sized,size


In [7]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3: #filtering with length less than 3
            result.append(lemmatize_stemming(token))
    return result

In [9]:
doc_sample = documents[documents['index'] == 4310].values[0][0]
print(doc_sample)
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))


ratepayers group wants compulsory local govt voting
original document: 
['ratepayers', 'group', 'wants', 'compulsory', 'local', 'govt', 'voting']


 tokenized and lemmatized document: 
['ratepay', 'group', 'want', 'compulsori', 'local', 'govt', 'vote']


In [10]:
processed_docs = documents['headline_text'].map(preprocess)

In [11]:
processed_docs[:10]

0            [decid, communiti, broadcast, licenc]
1                               [wit, awar, defam]
2           [call, infrastructur, protect, summit]
3                      [staff, aust, strike, rise]
4             [strike, affect, australian, travel]
5               [ambiti, olsson, win, tripl, jump]
6           [antic, delight, record, break, barca]
7    [aussi, qualifi, stosur, wast, memphi, match]
8            [aust, address, secur, council, iraq]
9                         [australia, lock, timet]
Name: headline_text, dtype: object

In [12]:
#Bag_of_Words
dictionary = gensim.corpora.Dictionary(processed_docs)

In [13]:
print(dictionary)

Dictionary(67118 unique tokens: ['broadcast', 'communiti', 'decid', 'licenc', 'awar']...)


In [14]:
#Filtering_those_available_in_less_than_15_documents_and_in_less_than_50%_of_the_document.Last_keeing_100000_most_frequent
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
print(dictionary)

Dictionary(14939 unique tokens: ['broadcast', 'communiti', 'decid', 'licenc', 'awar']...)


In [15]:
#count = 0
#for k, v in dictionary.iteritems():
#    print(k, v)
#    count += 1
#    if count > 10:
#       break

In [16]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
print(bow_corpus)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [17]:
print(bow_corpus[4310])

[(162, 1), (240, 1), (292, 1), (589, 1), (838, 1), (3567, 1), (3568, 1)]


In [18]:
bow_doc_4310 = bow_corpus[4310]

for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                                     dictionary[bow_doc_4310[i][0]], 
                                                     bow_doc_4310[i][1]))

Word 162 ("govt") appears 1 time.
Word 240 ("group") appears 1 time.
Word 292 ("vote") appears 1 time.
Word 589 ("local") appears 1 time.
Word 838 ("want") appears 1 time.
Word 3567 ("compulsori") appears 1 time.
Word 3568 ("ratepay") appears 1 time.


In [19]:
#TF-IDF starts here

In [20]:
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)

In [21]:
corpus_tfidf = tfidf[bow_corpus]

In [22]:
from pprint import pprint

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.5850076620505259),
 (1, 0.38947256567331934),
 (2, 0.4997099083387053),
 (3, 0.5063271308533074)]


In [23]:
#Running LDA using Bag of Words

In [29]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=20, id2word=dictionary, passes=2, workers=2)

In [33]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic {}: {}  \nWords: {}'.format(idx, dictionary[idx], topic))

Topic 0: broadcast  
Words: 0.048*"help" + 0.046*"live" + 0.038*"farmer" + 0.030*"break" + 0.028*"water" + 0.028*"drum" + 0.025*"need" + 0.017*"appeal" + 0.017*"plead" + 0.017*"monday"
Topic 1: communiti  
Words: 0.051*"nation" + 0.040*"tasmania" + 0.032*"speak" + 0.031*"interview" + 0.030*"case" + 0.027*"releas" + 0.025*"prison" + 0.025*"want" + 0.024*"park" + 0.024*"polit"
Topic 2: decid  
Words: 0.048*"news" + 0.046*"school" + 0.031*"work" + 0.031*"take" + 0.028*"student" + 0.028*"farm" + 0.024*"chines" + 0.020*"program" + 0.019*"lead" + 0.017*"action"
Topic 3: licenc  
Words: 0.063*"queensland" + 0.040*"health" + 0.033*"bushfir" + 0.031*"minist" + 0.026*"servic" + 0.026*"darwin" + 0.024*"beat" + 0.023*"scott" + 0.022*"find" + 0.020*"like"
Topic 4: awar  
Words: 0.047*"adelaid" + 0.044*"open" + 0.029*"street" + 0.021*"media" + 0.020*"australian" + 0.019*"head" + 0.018*"beach" + 0.017*"compani" + 0.017*"right" + 0.017*"mark"
Topic 5: defam  
Words: 0.060*"charg" + 0.054*"court" + 0.0

In [34]:
#Running LDA using TF-IDF

In [35]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)

In [42]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic {}: {} \nWord: {}'.format(idx, dictionary[idx], topic))

Topic 0: broadcast 
Word: 0.019*"donald" + 0.013*"guilti" + 0.009*"plead" + 0.008*"monday" + 0.008*"peopl" + 0.007*"insid" + 0.007*"know" + 0.006*"human" + 0.006*"spring" + 0.006*"decemb"
Topic 1: communiti 
Word: 0.009*"michael" + 0.009*"scott" + 0.008*"juli" + 0.008*"hobart" + 0.006*"award" + 0.006*"june" + 0.006*"pacif" + 0.006*"america" + 0.006*"leagu" + 0.005*"brief"
Topic 2: decid 
Word: 0.020*"countri" + 0.015*"hour" + 0.015*"drum" + 0.010*"climat" + 0.009*"tuesday" + 0.008*"search" + 0.007*"miss" + 0.006*"beach" + 0.006*"bodi" + 0.006*"ash"
Topic 3: licenc 
Word: 0.011*"health" + 0.008*"fund" + 0.007*"friday" + 0.007*"servic" + 0.006*"budget" + 0.006*"sport" + 0.006*"care" + 0.005*"mental" + 0.005*"school" + 0.005*"say"
Topic 4: awar 
Word: 0.020*"news" + 0.015*"rural" + 0.013*"elect" + 0.010*"nation" + 0.006*"vote" + 0.006*"fiji" + 0.006*"august" + 0.006*"septemb" + 0.006*"presid" + 0.006*"liber"
Topic 5: defam 
Word: 0.020*"trump" + 0.013*"market" + 0.009*"price" + 0.008*"roy

In [38]:
#Classification of the topics
#Performance evaluation by classifying sample document using LDA Bag of Words model

In [43]:
processed_docs[4310]

['ratepay', 'group', 'want', 'compulsori', 'local', 'govt', 'vote']

In [47]:
for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 11)))


Score: 0.6313902139663696	 
Topic: 0.043*"plan" + 0.033*"chang" + 0.032*"rural" + 0.030*"fund" + 0.030*"council" + 0.027*"state" + 0.024*"industri" + 0.023*"concern" + 0.022*"region" + 0.021*"support" + 0.021*"vote"

Score: 0.1312827616930008	 
Topic: 0.051*"nation" + 0.040*"tasmania" + 0.032*"speak" + 0.031*"interview" + 0.030*"case" + 0.027*"releas" + 0.025*"prison" + 0.025*"want" + 0.024*"park" + 0.024*"polit" + 0.021*"bodi"

Score: 0.13104446232318878	 
Topic: 0.059*"world" + 0.040*"die" + 0.031*"crash" + 0.031*"record" + 0.031*"island" + 0.022*"climat" + 0.021*"make" + 0.020*"turnbul" + 0.019*"leagu" + 0.019*"race" + 0.017*"law"


In [48]:
#Performance evaluation by classifying sample document using LDA TF-IDF model

In [52]:
for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 11)))


Score: 0.37577593326568604	 
Topic: 0.011*"health" + 0.008*"fund" + 0.007*"friday" + 0.007*"servic" + 0.006*"budget" + 0.006*"sport" + 0.006*"care" + 0.005*"mental" + 0.005*"school" + 0.005*"say" + 0.005*"plan"

Score: 0.22491025924682617	 
Topic: 0.020*"trump" + 0.013*"market" + 0.009*"price" + 0.008*"royal" + 0.007*"farmer" + 0.007*"commiss" + 0.006*"weather" + 0.006*"govern" + 0.006*"bushfir" + 0.006*"queensland" + 0.006*"busi"

Score: 0.1806265413761139	 
Topic: 0.019*"donald" + 0.013*"guilti" + 0.009*"plead" + 0.008*"monday" + 0.008*"peopl" + 0.007*"insid" + 0.007*"know" + 0.006*"human" + 0.006*"spring" + 0.006*"decemb" + 0.005*"right"

Score: 0.14364171028137207	 
Topic: 0.020*"news" + 0.015*"rural" + 0.013*"elect" + 0.010*"nation" + 0.006*"vote" + 0.006*"fiji" + 0.006*"august" + 0.006*"septemb" + 0.006*"presid" + 0.006*"liber" + 0.006*"jam"

Score: 0.012508916668593884	 
Topic: 0.009*"michael" + 0.009*"scott" + 0.008*"juli" + 0.008*"hobart" + 0.006*"award" + 0.006*"june" + 0.00

In [53]:
#Testing model on unseen document

In [54]:
unseen_document = 'How a Pentagon deal became an identity crisis for Google'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.21005107462406158	 Topic: 0.051*"famili" + 0.046*"coast" + 0.032*"children" + 0.031*"gold" + 0.031*"deal"
Score: 0.21004332602024078	 Topic: 0.059*"world" + 0.040*"die" + 0.031*"crash" + 0.031*"record" + 0.031*"island"
Score: 0.20999836921691895	 Topic: 0.051*"nation" + 0.040*"tasmania" + 0.032*"speak" + 0.031*"interview" + 0.030*"case"
Score: 0.20986396074295044	 Topic: 0.060*"kill" + 0.054*"report" + 0.052*"south" + 0.045*"north" + 0.045*"attack"
Score: 0.010002706199884415	 Topic: 0.048*"help" + 0.046*"live" + 0.038*"farmer" + 0.030*"break" + 0.028*"water"
Score: 0.010002706199884415	 Topic: 0.048*"news" + 0.046*"school" + 0.031*"work" + 0.031*"take" + 0.028*"student"
Score: 0.010002706199884415	 Topic: 0.063*"queensland" + 0.040*"health" + 0.033*"bushfir" + 0.031*"minist" + 0.026*"servic"
Score: 0.010002706199884415	 Topic: 0.047*"adelaid" + 0.044*"open" + 0.029*"street" + 0.021*"media" + 0.020*"australian"
Score: 0.010002706199884415	 Topic: 0.060*"charg" + 0.054*"court" 