### Topic Modeling and Latent Dirichlet Allocation (LDA) in Python

Source: Susan Li https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

In [54]:
import pandas as pd

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models
from pprint import pprint
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

import numpy as np
np.random.seed(2018)

import nltk

In [39]:
# Import the data

data = pd.read_csv('abcnews-date-text.csv', error_bad_lines = False)    # Bad lines will be dropped from the dataframe
data.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [40]:
# Create the dataframe 'documents'

data_text = data[['headline_text']]
data_text['index'] = data_text.index
documents = data_text

In [41]:
documents.head()

Unnamed: 0,headline_text,index
0,aba decides against community broadcasting lic...,0
1,act fire witnesses must be aware of defamation,1
2,a g calls for infrastructure protection summit,2
3,air nz staff in aust strike for pay rise,3
4,air nz strike to affect australian travellers,4


In [42]:
documents.shape

(1226258, 2)

In [43]:
# Function to perform preprocessing steps: 
# Removing tokens less than 3 characters, remove stopwords, lemmatization and stemming

def lemmatize(text):
    return WordNetLemmatizer().lemmatize(text, pos = 'v')

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize(token))
    return result


In [44]:
# Select a sample and preview after preprocessing 

doc_sample = documents[documents['index'] == 4310].values[0][0]

In [45]:
doc_sample

'ratepayers group wants compulsory local govt voting'

In [46]:
words = []
for word in doc_sample.split(' '):      # Tokenize sentence
    words.append(word)
print('original document: ', words)

original document:  ['ratepayers', 'group', 'wants', 'compulsory', 'local', 'govt', 'voting']


In [47]:
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))



 tokenized and lemmatized document: 
['ratepayers', 'group', 'want', 'compulsory', 'local', 'govt', 'vote']


In [48]:
# Preprocess all headline texts

processed_docs = documents['headline_text'].map(preprocess)
processed_docs.head(10)

0              [decide, community, broadcast, licence]
1                         [witness, aware, defamation]
2           [call, infrastructure, protection, summit]
3                          [staff, aust, strike, rise]
4             [strike, affect, australian, travellers]
5               [ambitious, olsson, win, triple, jump]
6               [antic, delight, record, break, barca]
7    [aussie, qualifier, stosur, waste, memphis, ma...
8             [aust, address, security, council, iraq]
9                         [australia, lock, timetable]
Name: headline_text, dtype: object

In [49]:
# Bag of words on the Data set
# Create a dictionary from 'processed_docs' containing the number of times a word appears in the training set

dictionary = gensim.corpora.Dictionary(processed_docs)

count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 broadcast
1 community
2 decide
3 licence
4 aware
5 defamation
6 witness
7 call
8 infrastructure
9 protection
10 summit


In [50]:
# Gensim filter extremes

# Filter out tokens that appear in:
# Less than 15 documents, more than 0.5 documents, and keep only 100000 most frquent tokens

dictionary.filter_extremes(no_below = 15, no_above = 0.5, keep_n = 100000)

### BoW

A bag-of-words is a representation of text that describes the occurrence of words within a document. It involves two things:

A vocabulary of known words.
A measure of the presence of known words.
It is called a “bag” of words, because any information about the order or structure of words in the document is discarded. The model is only concerned with whether known words occur in the document, not where in the document.

https://machinelearningmastery.com/gentle-introduction-bag-words-model/

In [51]:
# Gensim doc2bow
# Create a dictionary reporting how many words and how many times those words appear

bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[4310]

[(164, 1), (241, 1), (295, 1), (600, 1), (864, 1), (3915, 1), (3916, 1)]

In [52]:
# Preview Bag Of Words for our sample preprocessed document

bow_doc_4310 = bow_corpus[4310]

for i in range (len(bow_doc_4310)):
    print('Word {} {}\') appears {} time.'.format(bow_doc_4310[i][0],
                        dictionary[bow_doc_4310[i][0]], 
                                bow_doc_4310[i][1]))

Word 164 govt') appears 1 time.
Word 241 group') appears 1 time.
Word 295 vote') appears 1 time.
Word 600 local') appears 1 time.
Word 864 want') appears 1 time.
Word 3915 compulsory') appears 1 time.
Word 3916 ratepayers') appears 1 time.


In [55]:
# TF-IDF

tfidf = models.TfidfModel(bow_corpus)      # Create a TF-IDF model
corpus_tfidf = tfidf[bow_corpus]           # Apply transformation to the entire corpus and call it ‘corpus_tfidf’

for doc in corpus_tfidf:       # Preview for the first document
    pprint(doc)
    break

[(0, 0.5918674193999763),
 (1, 0.3937180767686992),
 (2, 0.5009876624450964),
 (3, 0.49365007440105513)]


In [56]:
# Running LDA using bag of words

# Define the model

lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics = 10, id2word = dictionary, 
                                      passes = 2, workers = 2)

In [57]:
# Explore the words occurring in that topic and its relative weight

for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.059*"australia" + 0.034*"coronavirus" + 0.024*"australian" + 0.023*"china" + 0.020*"sydney" + 0.019*"world" + 0.017*"open" + 0.016*"border" + 0.012*"win" + 0.011*"take"
Topic: 1 
Words: 0.022*"market" + 0.021*"record" + 0.019*"year" + 0.013*"care" + 0.012*"years" + 0.012*"price" + 0.011*"australian" + 0.011*"business" + 0.011*"country" + 0.010*"age"
Topic: 2 
Words: 0.043*"coronavirus" + 0.030*"government" + 0.025*"covid" + 0.015*"restrictions" + 0.015*"rise" + 0.014*"water" + 0.012*"royal" + 0.012*"scott" + 0.012*"tasmanian" + 0.011*"say"
Topic: 3 
Words: 0.026*"kill" + 0.022*"die" + 0.018*"coast" + 0.018*"shoot" + 0.017*"miss" + 0.016*"crash" + 0.015*"attack" + 0.014*"gold" + 0.014*"dead" + 0.013*"island"
Topic: 4 
Words: 0.040*"police" + 0.027*"case" + 0.026*"charge" + 0.026*"court" + 0.020*"death" + 0.020*"murder" + 0.017*"face" + 0.013*"jail" + 0.013*"people" + 0.012*"arrest"
Topic: 5 
Words: 0.056*"trump" + 0.028*"test" + 0.020*"tasmania" + 0.015*"morrison" + 0

In [58]:
# Running the LDA using TD-IDF

lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics = 10, id2word = dictionary, 
                                             passes = 2, workers = 4)

In [59]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.026*"trump" + 0.012*"drum" + 0.010*"australia" + 0.009*"world" + 0.008*"scott" + 0.008*"final" + 0.008*"tuesday" + 0.007*"league" + 0.006*"cricket" + 0.006*"test"
Topic: 1 Word: 0.013*"hour" + 0.009*"david" + 0.008*"pandemic" + 0.008*"march" + 0.008*"plead" + 0.007*"june" + 0.006*"dollar" + 0.006*"reopen" + 0.005*"sunday" + 0.005*"suicide"
Topic: 2 Word: 0.023*"coronavirus" + 0.015*"covid" + 0.012*"donald" + 0.009*"queensland" + 0.009*"restrictions" + 0.008*"victoria" + 0.006*"rain" + 0.006*"australia" + 0.006*"farmers" + 0.006*"drought"
Topic: 3 Word: 0.016*"police" + 0.015*"charge" + 0.013*"crash" + 0.012*"murder" + 0.012*"kill" + 0.011*"woman" + 0.010*"court" + 0.009*"shoot" + 0.008*"arrest" + 0.008*"die"
Topic: 4 Word: 0.009*"morrison" + 0.009*"health" + 0.009*"care" + 0.007*"age" + 0.007*"coronavirus" + 0.006*"mental" + 0.005*"government" + 0.005*"legal" + 0.004*"say" + 0.004*"remote"
Topic: 5 Word: 0.010*"coronavirus" + 0.009*"monday" + 0.008*"michael" + 0.008*"m

In [60]:
# Performance evaluation by classifying sample document using LDA Bag of Words model

# Checking where our test document would be classified

processed_docs[4310]

['ratepayers', 'group', 'want', 'compulsory', 'local', 'govt', 'vote']

In [61]:
for index, score in sorted(lda_model[bow_corpus[4310]], key = lambda tup: -1 * tup[1]):
    print('\nScore: {}\t \nTopic: {}'.format(score, lda_model.print_topic(index, 10)))


Score: 0.5026649236679077	 
Topic: 0.043*"coronavirus" + 0.030*"government" + 0.025*"covid" + 0.015*"restrictions" + 0.015*"rise" + 0.014*"water" + 0.012*"royal" + 0.012*"scott" + 0.012*"tasmanian" + 0.011*"say"

Score: 0.3973022997379303	 
Topic: 0.036*"queensland" + 0.027*"election" + 0.022*"live" + 0.013*"federal" + 0.012*"work" + 0.012*"school" + 0.010*"council" + 0.010*"farm" + 0.009*"fund" + 0.009*"vote"

Score: 0.012505652382969856	 
Topic: 0.037*"say" + 0.025*"victoria" + 0.022*"health" + 0.019*"change" + 0.018*"adelaide" + 0.014*"indigenous" + 0.010*"rural" + 0.010*"service" + 0.010*"covid" + 0.010*"national"

Score: 0.01250387355685234	 
Topic: 0.059*"australia" + 0.034*"coronavirus" + 0.024*"australian" + 0.023*"china" + 0.020*"sydney" + 0.019*"world" + 0.017*"open" + 0.016*"border" + 0.012*"win" + 0.011*"take"

Score: 0.01250387355685234	 
Topic: 0.022*"market" + 0.021*"record" + 0.019*"year" + 0.013*"care" + 0.012*"years" + 0.012*"price" + 0.011*"australian" + 0.011*"busi

In [62]:
# Performance evaluation by classifying sample document using the LDA TF-IDF model

for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key = lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.5778061747550964	 
Topic: 0.017*"news" + 0.010*"rural" + 0.010*"election" + 0.008*"government" + 0.007*"federal" + 0.007*"wednesday" + 0.007*"thursday" + 0.007*"business" + 0.007*"national" + 0.007*"john"

Score: 0.177941232919693	 
Topic: 0.013*"hour" + 0.009*"david" + 0.008*"pandemic" + 0.008*"march" + 0.008*"plead" + 0.007*"june" + 0.006*"dollar" + 0.006*"reopen" + 0.005*"sunday" + 0.005*"suicide"

Score: 0.15668842196464539	 
Topic: 0.026*"trump" + 0.012*"drum" + 0.010*"australia" + 0.009*"world" + 0.008*"scott" + 0.008*"final" + 0.008*"tuesday" + 0.007*"league" + 0.006*"cricket" + 0.006*"test"

Score: 0.012510434724390507	 
Topic: 0.006*"kohler" + 0.006*"government" + 0.006*"jam" + 0.005*"people" + 0.005*"school" + 0.005*"coronavirus" + 0.005*"online" + 0.005*"territory" + 0.005*"sell" + 0.005*"smith"

Score: 0.012510214000940323	 
Topic: 0.009*"morrison" + 0.009*"health" + 0.009*"care" + 0.007*"age" + 0.007*"coronavirus" + 0.006*"mental" + 0.005*"government" + 0.005*"le

In [63]:
# Testing model on unseen document

unseen_document = 'How a pentagon deal became an indentity crisis for Google'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector], key = lambda tup: -1 * tup[1]):
    print('Score: {}\t Topic: {}'.format(score, lda_model.print_topic(index,5)))

Score: 0.3823586702346802	 Topic: 0.043*"coronavirus" + 0.030*"government" + 0.025*"covid" + 0.015*"restrictions" + 0.015*"rise"
Score: 0.2578660845756531	 Topic: 0.022*"market" + 0.021*"record" + 0.019*"year" + 0.013*"care" + 0.012*"years"
Score: 0.21968549489974976	 Topic: 0.026*"kill" + 0.022*"die" + 0.018*"coast" + 0.018*"shoot" + 0.017*"miss"
Score: 0.02001577988266945	 Topic: 0.056*"trump" + 0.028*"test" + 0.020*"tasmania" + 0.015*"morrison" + 0.014*"drum"
Score: 0.02001461572945118	 Topic: 0.037*"say" + 0.025*"victoria" + 0.022*"health" + 0.019*"change" + 0.018*"adelaide"
Score: 0.02001415751874447	 Topic: 0.059*"australia" + 0.034*"coronavirus" + 0.024*"australian" + 0.023*"china" + 0.020*"sydney"
Score: 0.02001303620636463	 Topic: 0.036*"queensland" + 0.027*"election" + 0.022*"live" + 0.013*"federal" + 0.012*"work"
Score: 0.020010707899928093	 Topic: 0.040*"police" + 0.027*"case" + 0.026*"charge" + 0.026*"court" + 0.020*"death"
Score: 0.020010707899928093	 Topic: 0.019*"bushfi