In [0]:
import pandas as pd

In [2]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving abcnews-date-text.csv to abcnews-date-text.csv
User uploaded file "abcnews-date-text.csv" with length 55392904 bytes


In [0]:
# download data from kaggle :          https://www.kaggle.com/therohk/million-headlines/data

df = pd.read_csv('abcnews-date-text.csv', error_bad_lines=False);
data_text = df[['headline_text']]
data_text['index'] = data_text.index
documents = data_text

In [6]:
# data length & data view


print(len(documents))
print(documents[:6])

1103663
                                       headline_text  index
0  aba decides against community broadcasting lic...      0
1     act fire witnesses must be aware of defamation      1
2     a g calls for infrastructure protection summit      2
3           air nz staff in aust strike for pay rise      3
4      air nz strike to affect australian travellers      4
5                  ambitious olsson wins triple jump      5


# Data Preprocessing

Tokenization: Split the text into sentences and the sentences into words.

Lowercase the words and remove punctuation.

Words that have fewer than 3 characters are removed.

All stopwords are removed.

Words are lemmatized — words in third person are changed to first person and verbs in past and future tenses are changed into present.

Words are stemmed — words are reduced to their root form.

In [0]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)

In [8]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [0]:
stemmer = SnowballStemmer('english')


def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

# Select a document to preview after preprocessing.

In [13]:
doc_sample = documents[documents['index'] == 4310].values[0][0]

print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['rain', 'helps', 'dampen', 'bushfires']


 tokenized and lemmatized document: 
['rain', 'help', 'dampen', 'bushfir']


# Preprocess the headline text, saving the results as ‘processed_docs’

In [14]:
processed_docs = documents['headline_text'].map(preprocess)
processed_docs[:5]

0     [decid, communiti, broadcast, licenc]
1                        [wit, awar, defam]
2    [call, infrastructur, protect, summit]
3               [staff, aust, strike, rise]
4      [strike, affect, australian, travel]
Name: headline_text, dtype: object

# Bag of words on the dataset

In [15]:
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 broadcast
1 communiti
2 decid
3 licenc
4 awar
5 defam
6 wit
7 call
8 infrastructur
9 protect
10 summit


# Filter out tokens that appear in

less than 15 documents (absolute number) or     
more than 0.5 documents (fraction of total corpus size, not absolute number).  
after the above two steps, keep only the first 100000 most frequent tokens.

In [0]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)


# Gensim doc2bow
For each document we create a dictionary reporting how many
words and how many times those words appear. Save this to ‘bow_corpus’, then check our selected document earlier.

In [17]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[4310]

[(76, 1), (112, 1), (484, 1), (4015, 1)]

# Preview Bag Of Words for our sample preprocessed document.

In [18]:
bow_doc_4310 = bow_corpus[4310]
for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                               dictionary[bow_doc_4310[i][0]], 
bow_doc_4310[i][1]))

Word 76 ("bushfir") appears 1 time.
Word 112 ("help") appears 1 time.
Word 484 ("rain") appears 1 time.
Word 4015 ("dampen") appears 1 time.


# TF-IDF

In [0]:

from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)

corpus_tfidf = tfidf[bow_corpus]

In [20]:

from pprint import pprint

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.5893154583024485),
 (1, 0.3892866165028569),
 (2, 0.49651921997736453),
 (3, 0.5046106271280878)]


# Running LDA using Bag of Words
Train our lda model using gensim.models.LdaMulticore and save it to ‘lda_model’

In [0]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [22]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.023*"death" + 0.020*"hospit" + 0.017*"trial" + 0.016*"deal" + 0.014*"driver" + 0.012*"street" + 0.010*"flood" + 0.010*"babi" + 0.009*"east" + 0.009*"wall"
Topic: 1 
Words: 0.025*"world" + 0.018*"test" + 0.016*"interview" + 0.015*"australia" + 0.012*"leagu" + 0.010*"close" + 0.010*"game" + 0.010*"darwin" + 0.009*"take" + 0.008*"star"
Topic: 2 
Words: 0.019*"council" + 0.018*"countri" + 0.016*"nation" + 0.016*"plan" + 0.015*"tasmanian" + 0.015*"hour" + 0.012*"say" + 0.011*"citi" + 0.011*"news" + 0.010*"green"
Topic: 3 
Words: 0.023*"perth" + 0.020*"women" + 0.018*"miss" + 0.017*"sydney" + 0.013*"protest" + 0.012*"guilti" + 0.011*"search" + 0.011*"water" + 0.011*"john" + 0.010*"continu"
Topic: 4 
Words: 0.028*"elect" + 0.019*"say" + 0.017*"live" + 0.017*"famili" + 0.016*"chang" + 0.011*"park" + 0.011*"polit" + 0.010*"vote" + 0.010*"leader" + 0.009*"violenc"
Topic: 5 
Words: 0.039*"australia" + 0.035*"trump" + 0.028*"queensland" + 0.015*"tasmania" + 0.014*"turnbul" + 0.0

# Running LDA using TF-IDF

In [23]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.009*"govern" + 0.008*"health" + 0.007*"fund" + 0.006*"plan" + 0.006*"farm" + 0.006*"council" + 0.005*"industri" + 0.005*"sport" + 0.005*"budget" + 0.005*"region"
Topic: 1 Word: 0.018*"polic" + 0.016*"charg" + 0.013*"murder" + 0.011*"crash" + 0.011*"court" + 0.011*"woman" + 0.009*"alleg" + 0.009*"arrest" + 0.008*"jail" + 0.008*"kill"
Topic: 2 Word: 0.011*"podcast" + 0.010*"final" + 0.006*"climat" + 0.006*"world" + 0.006*"monday" + 0.006*"tuesday" + 0.005*"beat" + 0.005*"leagu" + 0.005*"season" + 0.005*"wrap"
Topic: 3 Word: 0.022*"countri" + 0.021*"hour" + 0.010*"turnbul" + 0.008*"donald" + 0.006*"water" + 0.006*"drought" + 0.005*"murray" + 0.005*"quiz" + 0.004*"social" + 0.004*"farmer"
Topic: 4 Word: 0.012*"drum" + 0.009*"septemb" + 0.008*"plead" + 0.008*"guilti" + 0.007*"asylum" + 0.007*"august" + 0.007*"pacif" + 0.006*"sexual" + 0.006*"seeker" + 0.006*"teenag"
Topic: 5 Word: 0.016*"market" + 0.013*"news" + 0.012*"rural" + 0.009*"share" + 0.008*"nation" + 0.007*"elect"

# Performance evaluation by classifying sample document using LDA Bag of Words model

In [24]:
processed_docs[4310]

['rain', 'help', 'dampen', 'bushfir']

In [25]:
for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.30915534496307373	 
Topic: 0.031*"govern" + 0.018*"market" + 0.013*"share" + 0.012*"australian" + 0.012*"warn" + 0.011*"busi" + 0.011*"bank" + 0.010*"high" + 0.010*"break" + 0.009*"fund"

Score: 0.2901693284511566	 
Topic: 0.016*"health" + 0.015*"rural" + 0.014*"power" + 0.012*"price" + 0.012*"hous" + 0.012*"communiti" + 0.012*"concern" + 0.010*"servic" + 0.010*"farm" + 0.010*"report"

Score: 0.26066818833351135	 
Topic: 0.028*"elect" + 0.019*"say" + 0.017*"live" + 0.017*"famili" + 0.016*"chang" + 0.011*"park" + 0.011*"polit" + 0.010*"vote" + 0.010*"leader" + 0.009*"violenc"

Score: 0.02000299282371998	 
Topic: 0.023*"perth" + 0.020*"women" + 0.018*"miss" + 0.017*"sydney" + 0.013*"protest" + 0.012*"guilti" + 0.011*"search" + 0.011*"water" + 0.011*"john" + 0.010*"continu"

Score: 0.020001640543341637	 
Topic: 0.023*"death" + 0.020*"hospit" + 0.017*"trial" + 0.016*"deal" + 0.014*"driver" + 0.012*"street" + 0.010*"flood" + 0.010*"babi" + 0.009*"east" + 0.009*"wall"

Score: 0.020

# Performance evaluation by classifying sample document using LDA TF-IDF model.

In [26]:
for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.5362162590026855	 
Topic: 0.021*"trump" + 0.008*"royal" + 0.007*"tasmania" + 0.006*"hunter" + 0.006*"energi" + 0.006*"burn" + 0.005*"commiss" + 0.005*"bushfir" + 0.005*"coast" + 0.004*"blaze"

Score: 0.30373987555503845	 
Topic: 0.018*"interview" + 0.009*"hobart" + 0.008*"michael" + 0.008*"friday" + 0.007*"marriag" + 0.007*"novemb" + 0.006*"capit" + 0.006*"parliament" + 0.005*"syrian" + 0.005*"indonesia"

Score: 0.020016519352793694	 
Topic: 0.010*"weather" + 0.007*"juli" + 0.007*"festiv" + 0.007*"rugbi" + 0.007*"wednesday" + 0.006*"peter" + 0.006*"thursday" + 0.006*"stori" + 0.006*"andrew" + 0.006*"june"

Score: 0.020010028034448624	 
Topic: 0.022*"countri" + 0.021*"hour" + 0.010*"turnbul" + 0.008*"donald" + 0.006*"water" + 0.006*"drought" + 0.005*"murray" + 0.005*"quiz" + 0.004*"social" + 0.004*"farmer"

Score: 0.0200051162391901	 
Topic: 0.009*"govern" + 0.008*"health" + 0.007*"fund" + 0.006*"plan" + 0.006*"farm" + 0.006*"council" + 0.005*"industri" + 0.005*"sport" + 0.005

# Testing model on unseen document

In [27]:
unseen_document = 'How a Pentagon deal became an identity crisis for Google'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.3498546779155731	 Topic: 0.028*"elect" + 0.019*"say" + 0.017*"live" + 0.017*"famili" + 0.016*"chang"
Score: 0.1834772229194641	 Topic: 0.023*"death" + 0.020*"hospit" + 0.017*"trial" + 0.016*"deal" + 0.014*"driver"
Score: 0.18333333730697632	 Topic: 0.031*"govern" + 0.018*"market" + 0.013*"share" + 0.012*"australian" + 0.012*"warn"
Score: 0.183333158493042	 Topic: 0.039*"australia" + 0.035*"trump" + 0.028*"queensland" + 0.015*"tasmania" + 0.014*"turnbul"
Score: 0.01666819490492344	 Topic: 0.016*"health" + 0.015*"rural" + 0.014*"power" + 0.012*"price" + 0.012*"hous"
Score: 0.016666723415255547	 Topic: 0.023*"perth" + 0.020*"women" + 0.018*"miss" + 0.017*"sydney" + 0.013*"protest"
Score: 0.016666676849126816	 Topic: 0.019*"council" + 0.018*"countri" + 0.016*"nation" + 0.016*"plan" + 0.015*"tasmanian"
Score: 0.01666666753590107	 Topic: 0.025*"world" + 0.018*"test" + 0.016*"interview" + 0.015*"australia" + 0.012*"leagu"
Score: 0.01666666753590107	 Topic: 0.031*"charg" + 0.030*"cour