### Dataset

In [None]:
import pandas as pd

# Load the ABC News headlines, skipping malformed CSV rows instead of raising.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0; its
# replacement is `on_bad_lines='skip'`. The fallback keeps older pandas working.
try:
    data = pd.read_csv('abcnews-date-text.csv', on_bad_lines='skip')
except TypeError:  # pandas < 1.3 does not accept `on_bad_lines`
    data = pd.read_csv('abcnews-date-text.csv', error_bad_lines=False)

# Take an explicit copy of the headline column before adding to it; writing a
# new column into the bare `data[[...]]` selection raises SettingWithCopyWarning.
data_text = data[['headline_text']].copy()
data_text['index'] = data_text.index
documents = data_text

### Pre-Processing

In [None]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
import numpy as np
# Seed NumPy's global random generator so repeated runs start from the same state.
np.random.seed(2018)

In [None]:
import nltk
# Download the WordNet corpus used by WordNetLemmatizer during preprocessing.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Shruti\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
def lemmatize_stemming(text):
    """Return the WordNet verb lemma of a single token.

    Despite the name, no stemming is applied here: the token is only
    lemmatized with ``pos='v'``.
    """
    lemmatizer = WordNetLemmatizer()
    lemma = lemmatizer.lemmatize(text, pos='v')
    return lemma

def preprocess(text):
    """Tokenize ``text`` and return the lemmas of the tokens worth keeping.

    Tokens produced by ``gensim.utils.simple_preprocess`` are discarded when
    they are gensim stopwords or are three characters long or shorter. The
    remaining tokens are passed through ``lemmatize_stemming`` and returned
    as a list in their original order.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(word)
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 3 and word not in stopwords
    ]

In [None]:
# Run the preprocessing pipeline over every headline, then preview the first ten results.
processed_docs = documents['headline_text'].map(preprocess)
processed_docs.head(10)

0              [decide, community, broadcast, licence]
1                         [witness, aware, defamation]
2           [call, infrastructure, protection, summit]
3                          [staff, aust, strike, rise]
4             [strike, affect, australian, travellers]
5               [ambitious, olsson, win, triple, jump]
6               [antic, delight, record, break, barca]
7    [aussie, qualifier, stosur, waste, memphis, ma...
8             [aust, address, security, council, iraq]
9                         [australia, lock, timetable]
Name: headline_text, dtype: object

In [None]:
# Build the token -> integer id vocabulary from all preprocessed headlines.
dictionary = gensim.corpora.Dictionary(processed_docs)

In [None]:
# Prune the vocabulary in place: drop tokens that occur in fewer than 15 documents
# or in more than 50% of all documents, then keep at most the 100000 most frequent.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
# Convert every headline into a sparse bag-of-words list of (token_id, count) pairs.
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [None]:
from gensim import corpora, models

# Fit a TF-IDF model on the bag-of-words corpus, then wrap that corpus so each
# document is returned with TF-IDF weights instead of raw counts.
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
# Peek at the first TF-IDF-weighted document only; `break` stops after one item.
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.5918674193999763),
 (1, 0.3937180767686992),
 (2, 0.5009876624450964),
 (3, 0.49365007440105513)]


In [None]:

# Train a 10-topic LDA model on the raw bag-of-words counts (2 passes, 2 workers).
# NOTE(review): no random_state is passed here; confirm that the earlier
# np.random.seed call is enough to make the topics reproducible across runs.
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [None]:
# Print the weighted top terms of every topic in the bag-of-words model
# (-1 asks print_topics for all topics).
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.057*"australia" + 0.041*"trump" + 0.024*"australian" + 0.022*"china" + 0.019*"world" + 0.019*"sydney" + 0.017*"open" + 0.017*"coronavirus" + 0.015*"border" + 0.012*"win"
Topic: 1 
Words: 0.023*"market" + 0.019*"year" + 0.016*"record" + 0.012*"care" + 0.012*"price" + 0.012*"years" + 0.012*"australian" + 0.011*"business" + 0.011*"country" + 0.010*"age"
Topic: 2 
Words: 0.065*"coronavirus" + 0.032*"covid" + 0.029*"government" + 0.015*"rise" + 0.015*"restrictions" + 0.014*"water" + 0.012*"royal" + 0.012*"scott" + 0.011*"tasmanian" + 0.010*"commission"
Topic: 3 
Words: 0.027*"kill" + 0.022*"die" + 0.019*"coast" + 0.018*"shoot" + 0.017*"miss" + 0.016*"crash" + 0.015*"attack" + 0.015*"gold" + 0.015*"dead" + 0.014*"island"
Topic: 4 
Words: 0.040*"police" + 0.026*"charge" + 0.026*"case" + 0.025*"court" + 0.020*"death" + 0.020*"murder" + 0.017*"face" + 0.013*"jail" + 0.013*"people" + 0.012*"arrest"
Topic: 5 
Words: 0.027*"test" + 0.020*"tasmania" + 0.015*"morrison" + 0.014*"dr

In [None]:
# Train a second 10-topic LDA model, this time on the TF-IDF-weighted corpus.
# NOTE(review): workers=4 here versus workers=2 for the bag-of-words model; confirm the difference is intentional.
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)

In [None]:
# Print the weighted top terms of every topic in the TF-IDF model
# (-1 asks print_topics for all topics).
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.007*"coronavirus" + 0.007*"border" + 0.007*"kill" + 0.006*"september" + 0.005*"footage" + 0.005*"china" + 0.005*"brexit" + 0.005*"protesters" + 0.005*"biden" + 0.005*"attack"
Topic: 1 Word: 0.013*"crash" + 0.011*"queensland" + 0.010*"bushfire" + 0.010*"coast" + 0.007*"weather" + 0.007*"die" + 0.007*"michael" + 0.007*"police" + 0.006*"beach" + 0.006*"death"
Topic: 2 Word: 0.006*"plan" + 0.005*"coal" + 0.005*"water" + 0.005*"action" + 0.005*"government" + 0.005*"legal" + 0.005*"council" + 0.004*"spring" + 0.004*"closure" + 0.004*"coronavirus"
Topic: 3 Word: 0.021*"coronavirus" + 0.016*"covid" + 0.010*"health" + 0.009*"government" + 0.007*"royal" + 0.007*"commission" + 0.007*"federal" + 0.006*"care" + 0.006*"update" + 0.006*"sport"
Topic: 4 Word: 0.012*"scott" + 0.011*"morrison" + 0.010*"restrictions" + 0.010*"coronavirus" + 0.009*"christmas" + 0.008*"turnbull" + 0.007*"island" + 0.007*"stories" + 0.006*"morning" + 0.005*"malcolm"
Topic: 5 Word: 0.025*"news" + 0.016*"rura

### Testing both the models

In [None]:
from gensim.models.coherencemodel import CoherenceModel
# log_perplexity returns gensim's per-word likelihood bound, not the perplexity itself.
# Perplexity is 2 ** (-bound), so a bound closer to zero (i.e. higher) is better.
print('\nPerplexity: ', lda_model.log_perplexity(bow_corpus))  # prints the per-word bound, despite the label

# c_v topic coherence of the bag-of-words model, computed against the tokenized headlines.
coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_docs, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -9.118709657723862

Coherence Score:  0.24535051024041796


In [None]:
from gensim.models.coherencemodel import CoherenceModel
# log_perplexity returns gensim's per-word likelihood bound, not the perplexity itself.
# Perplexity is 2 ** (-bound), so a bound closer to zero (i.e. higher) is better.
# NOTE(review): the TF-IDF-trained model is scored on bow_corpus, not corpus_tfidf; confirm this is intended.
print('\nPerplexity: ', lda_model_tfidf.log_perplexity(bow_corpus))  # prints the per-word bound, despite the label


# c_v topic coherence of the TF-IDF model, computed against the tokenized headlines.
# NOTE(review): this reuses the names coherence_model_lda / coherence_lda, overwriting the bag-of-words results.
coherence_model_lda = CoherenceModel(model=lda_model_tfidf, texts=processed_docs, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.997436855159867

Coherence Score:  0.30983791499722557
