In [1]:
#news headlines published over a period of 15 years form kaggle
import pandas as pd
data = pd.read_csv('abcnews-date-text.csv', error_bad_lines=False)
data.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [2]:
len(data)

1226258

In [3]:
# taking 1000 documents
data_text=data[['headline_text']][:1000]
data_text.head()

Unnamed: 0,headline_text
0,aba decides against community broadcasting lic...
1,act fire witnesses must be aware of defamation
2,a g calls for infrastructure protection summit
3,air nz staff in aust strike for pay rise
4,air nz strike to affect australian travellers


In [4]:
data_text['index']=data_text.index
data_text.head()

Unnamed: 0,headline_text,index
0,aba decides against community broadcasting lic...,0
1,act fire witnesses must be aware of defamation,1
2,a g calls for infrastructure protection summit,2
3,air nz staff in aust strike for pay rise,3
4,air nz strike to affect australian travellers,4


In [5]:
len(data_text)

1000

In [6]:
data_text['headline_text']

0      aba decides against community broadcasting lic...
1         act fire witnesses must be aware of defamation
2         a g calls for infrastructure protection summit
3               air nz staff in aust strike for pay rise
4          air nz strike to affect australian travellers
                             ...                        
995                  conference to focus on tuna fishery
996                        council hosts farewell for mp
997                  council resists eba roster pressure
998                     customs house restoration opened
999                dam water levels still critically low
Name: headline_text, Length: 1000, dtype: object

In [7]:
# Assigning data_text as documents
documents=data_text

In [8]:
print(len(documents))

1000


In [9]:
print(documents[:5])

                                       headline_text  index
0  aba decides against community broadcasting lic...      0
1     act fire witnesses must be aware of defamation      1
2     a g calls for infrastructure protection summit      2
3           air nz staff in aust strike for pay rise      3
4      air nz strike to affect australian travellers      4


In [10]:
import gensim
from gensim.utils import simple_preprocess # convert a document into a list of tokens
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')
from nltk import PorterStemmer
stemmer=PorterStemmer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\srini\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
"""import re

import nltk
from nltk.corpus import stopwords

from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

list=[]
for i in range(len(data_text)):
    review=re.sub('[^a-zA-Z]', ' ', data_text['headline_text'][i])
    review=review.lower()
    review=review.split()
    review= [lemmatizer.lemmatize(word) for word in review if word not in (stopwords.words('english'))]
    review=' '.join(review)
    list.append(review) """


"import re\n\nimport nltk\nfrom nltk.corpus import stopwords\n\nfrom nltk.stem import WordNetLemmatizer\nlemmatizer=WordNetLemmatizer()\n\nlist=[]\nfor i in range(len(data_text)):\n    review=re.sub('[^a-zA-Z]', ' ', data_text['headline_text'][i])\n    review=review.lower()\n    review=review.split()\n    review= [lemmatizer.lemmatize(word) for word in review if word not in (stopwords.words('english'))]\n    review=' '.join(review)\n    list.append(review) "

In [12]:

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [13]:
documents[documents['index'] == 99]

Unnamed: 0,headline_text,index
99,more water restrictions predicted for northern...,99


In [14]:
doc_sample = documents[documents['index'] == 99].values[0][0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['more', 'water', 'restrictions', 'predicted', 'for', 'northern', 'tas']


 tokenized and lemmatized document: 
['water', 'restrict', 'predict', 'northern']


In [15]:
processed_docs = documents['headline_text'].map(preprocess)
processed_docs[:10]

0               [decid, commun, broadcast, licenc]
1                               [wit, awar, defam]
2           [call, infrastructur, protect, summit]
3                      [staff, aust, strike, rise]
4             [strike, affect, australian, travel]
5               [ambiti, olsson, win, tripl, jump]
6           [antic, delight, record, break, barca]
7    [aussi, qualifi, stosur, wast, memphi, match]
8            [aust, address, secur, council, iraq]
9                         [australia, lock, timet]
Name: headline_text, dtype: object

In [16]:
#    Dictionary encapsulates the mapping between normalized words and their integer ids.

#The main function is doc2bow, which converts a collection of words to its bag-of-words representation: a list of (word_id, word_frequency) 2-tuples.

dictionary = gensim.corpora.Dictionary(processed_docs)


In [17]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[800]

[(275, 1), (609, 1), (1212, 1), (1437, 1), (1720, 1)]

In [18]:
#TF-IDF
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus) #This module implements functionality related to the Term Frequency - Inverse Document Frequency
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.5081330240297587),
 (1, 0.4057017483173504),
 (2, 0.5648077116975213),
 (3, 0.5081330240297587)]


In [19]:
# Running LDA using Bag of Words
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [20]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.007*"plan" + 0.006*"anti" + 0.006*"protest" + 0.006*"water" + 0.006*"firefight" + 0.005*"investig" + 0.005*"face" + 0.005*"court" + 0.005*"rain" + 0.005*"turkey"
Topic: 1 
Words: 0.010*"govt" + 0.007*"iraq" + 0.007*"fund" + 0.007*"race" + 0.006*"critic" + 0.005*"crean" + 0.005*"high" + 0.005*"patterson" + 0.005*"assur" + 0.005*"lead"
Topic: 2 
Words: 0.014*"plan" + 0.009*"polic" + 0.009*"crash" + 0.007*"rule" + 0.007*"charg" + 0.007*"australian" + 0.005*"injur" + 0.005*"say" + 0.005*"hous" + 0.005*"affect"
Topic: 3 
Words: 0.009*"urg" + 0.008*"water" + 0.008*"help" + 0.008*"public" + 0.008*"delay" + 0.007*"troop" + 0.007*"council" + 0.007*"probe" + 0.007*"protect" + 0.006*"hospit"
Topic: 4 
Words: 0.011*"face" + 0.008*"court" + 0.007*"call" + 0.007*"suppli" + 0.007*"rain" + 0.006*"trial" + 0.006*"hear" + 0.006*"action" + 0.006*"ahead" + 0.006*"melbourn"
Topic: 5 
Words: 0.008*"polic" + 0.008*"plan" + 0.007*"lead" + 0.007*"miss" + 0.007*"price" + 0.006*"consid" + 0.00

In [21]:
# Running LDA using TF-IDF
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4) # parallelized LDA
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.006*"govt" + 0.005*"price" + 0.004*"investig" + 0.004*"fuel" + 0.004*"hold" + 0.004*"predict" + 0.003*"struggl" + 0.003*"park" + 0.003*"contribut" + 0.003*"turkey"
Topic: 1 Word: 0.005*"fund" + 0.004*"iraqi" + 0.004*"report" + 0.004*"decis" + 0.004*"protest" + 0.004*"hold" + 0.004*"charg" + 0.003*"pilot" + 0.003*"iraq" + 0.003*"warn"
Topic: 2 Word: 0.005*"patterson" + 0.004*"elect" + 0.004*"kill" + 0.004*"polic" + 0.004*"turn" + 0.003*"player" + 0.003*"state" + 0.003*"rule" + 0.003*"protest" + 0.003*"telstra"
Topic: 3 Word: 0.005*"kill" + 0.004*"work" + 0.004*"anti" + 0.004*"critic" + 0.004*"court" + 0.004*"nation" + 0.003*"protest" + 0.003*"worker" + 0.003*"claim" + 0.003*"turkey"
Topic: 4 Word: 0.006*"nightclub" + 0.005*"rain" + 0.005*"polic" + 0.004*"land" + 0.004*"search" + 0.004*"council" + 0.004*"defenc" + 0.004*"warn" + 0.004*"talk" + 0.004*"claim"
Topic: 5 Word: 0.006*"drought" + 0.006*"murder" + 0.005*"charg" + 0.005*"break" + 0.005*"stuttl" + 0.004*"polic" + 

In [22]:
#Performance evaluation by classifying sample document using LDA Bag of Words model
processed_docs[10] #test document
for index, score in sorted(lda_model[bow_corpus[10]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.8199659585952759	 
Topic: 0.009*"iraqi" + 0.009*"iraq" + 0.007*"polic" + 0.005*"seek" + 0.005*"call" + 0.005*"england" + 0.005*"station" + 0.005*"readi" + 0.005*"raid" + 0.005*"tent"

Score: 0.020007481798529625	 
Topic: 0.007*"plan" + 0.006*"anti" + 0.006*"protest" + 0.006*"water" + 0.006*"firefight" + 0.005*"investig" + 0.005*"face" + 0.005*"court" + 0.005*"rain" + 0.005*"turkey"

Score: 0.020006537437438965	 
Topic: 0.010*"death" + 0.009*"iraq" + 0.008*"rise" + 0.008*"club" + 0.007*"charg" + 0.007*"report" + 0.007*"match" + 0.007*"continu" + 0.007*"govt" + 0.007*"murder"

Score: 0.020006027072668076	 
Topic: 0.017*"rain" + 0.012*"council" + 0.011*"drought" + 0.011*"break" + 0.008*"qanta" + 0.006*"continu" + 0.006*"iraq" + 0.006*"secur" + 0.006*"talk" + 0.006*"korean"

Score: 0.020003627985715866	 
Topic: 0.009*"urg" + 0.008*"water" + 0.008*"help" + 0.008*"public" + 0.008*"delay" + 0.007*"troop" + 0.007*"council" + 0.007*"probe" + 0.007*"protect" + 0.006*"hospit"

Score: 0.

In [23]:
#Performance evaluation by classifying sample document using LDA TF-IDF model.
for index, score in sorted(lda_model_tfidf[bow_corpus[10]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.8198614716529846	 
Topic: 0.006*"govt" + 0.005*"price" + 0.004*"investig" + 0.004*"fuel" + 0.004*"hold" + 0.004*"predict" + 0.003*"struggl" + 0.003*"park" + 0.003*"contribut" + 0.003*"turkey"

Score: 0.02003416419029236	 
Topic: 0.005*"record" + 0.005*"cairn" + 0.004*"brawl" + 0.004*"rain" + 0.004*"report" + 0.004*"iraq" + 0.004*"council" + 0.004*"river" + 0.004*"boost" + 0.003*"injur"

Score: 0.0200207456946373	 
Topic: 0.004*"reject" + 0.004*"iraq" + 0.004*"probe" + 0.004*"open" + 0.004*"famili" + 0.003*"council" + 0.003*"lead" + 0.003*"win" + 0.003*"levi" + 0.003*"northern"

Score: 0.020017843693494797	 
Topic: 0.005*"fund" + 0.004*"iraqi" + 0.004*"report" + 0.004*"decis" + 0.004*"protest" + 0.004*"hold" + 0.004*"charg" + 0.003*"pilot" + 0.003*"iraq" + 0.003*"warn"

Score: 0.020017601549625397	 
Topic: 0.006*"drought" + 0.006*"murder" + 0.005*"charg" + 0.005*"break" + 0.005*"stuttl" + 0.004*"polic" + 0.004*"rain" + 0.004*"court" + 0.004*"hous" + 0.004*"seat"

Score: 0.0200

In [24]:
#Testing model on unseen document
unseen_document = 'How a Pentagon deal became an identity crisis for Google'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.525023341178894	 Topic: 0.008*"polic" + 0.008*"plan" + 0.007*"lead" + 0.007*"miss" + 0.007*"price"
Score: 0.2748906910419464	 Topic: 0.012*"claim" + 0.008*"charg" + 0.007*"concern" + 0.007*"probe" + 0.007*"warn"
Score: 0.02502259984612465	 Topic: 0.010*"death" + 0.009*"iraq" + 0.008*"rise" + 0.008*"club" + 0.007*"charg"
Score: 0.02501627616584301	 Topic: 0.007*"plan" + 0.006*"anti" + 0.006*"protest" + 0.006*"water" + 0.006*"firefight"
Score: 0.025012152269482613	 Topic: 0.010*"govt" + 0.007*"iraq" + 0.007*"fund" + 0.007*"race" + 0.006*"critic"
Score: 0.025006990879774094	 Topic: 0.014*"plan" + 0.009*"polic" + 0.009*"crash" + 0.007*"rule" + 0.007*"charg"
Score: 0.025006990879774094	 Topic: 0.009*"urg" + 0.008*"water" + 0.008*"help" + 0.008*"public" + 0.008*"delay"
Score: 0.025006990879774094	 Topic: 0.011*"face" + 0.008*"court" + 0.007*"call" + 0.007*"suppli" + 0.007*"rain"
Score: 0.025006990879774094	 Topic: 0.009*"iraqi" + 0.009*"iraq" + 0.007*"polic" + 0.005*"seek" + 0.005*"