# Topic Modeling Using LDA

Description: Topic Modeling is a type of statistical modeling for discovering the abstract topics that occur in a collection of documents.

Latent Dirichlet Allocation (LDA) is an example of topic model and is used to classify text in a document to a particular topic. It builds a topic per document model and words per topic model, modeled as Dirichlet distributions.

## Import Libraries

### Main Libraries

In [1]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

### NLP Libraries

In [2]:
import gensim
from gensim import corpora, models
from pprint import pprint

## Load Dataset

In [3]:
# Load dataset
df = pd.read_csv('fullstatCleaned_withLabels.tsv', sep='\t')
# Inspect df
df.head()

Unnamed: 0,by,category,comment_likes_count,comments_base,comments_count_fb,comments_replies,comments_retrieved,engagement_fb,likes_count_fb,post_message,post_published,rea_ANGRY,rea_HAHA,rea_LOVE,rea_SAD,rea_THANKFUL,rea_WOW,reactions_count_fb,shares_count_fb,type
0,post_page_155027942462,App Update,0,0,11,0,0,22,11,"['paying', 'bill', 'autopay', 'ensures', 'bill...",2019-06-27T14:30:13+0000,0,0,0,0,0,0,11,0,photo
1,post_page_155027942462,Engagement,0,0,30,0,0,110,70,"['raise', 'hand', 'excited', 'first', 'day', '...",2019-06-21T14:31:06+0000,1,0,4,0,0,1,76,4,photo
2,post_page_155027942462,Engagement,0,0,11,0,0,17,5,"['couple', 'save', 'together', 'stay', 'foreve...",2019-06-20T15:00:28+0000,1,0,0,0,0,0,6,0,photo
3,post_page_155027942462,Engagement,0,0,10,0,0,19,8,"['case', 'forgotten', 'make', 'saving', 'daily...",2019-06-17T14:30:11+0000,1,0,0,0,0,0,9,0,photo
4,post_page_155027942462,Engagement,0,0,8,0,0,17,6,"['father', 'day', 'approaching', 'got', 'smart...",2019-06-14T14:30:02+0000,3,0,0,0,0,0,9,0,video


In [4]:
# Selct necessary columns
processed_words = df['post_message']

## Bag of Words on the Dataset

In [5]:
# Create a tokenized string for BoW input
processed_words = [words.split() for words in processed_words]

In [6]:
# Create a dictionary from ‘processed_words’ containing the number of times a word appears in the training set
dictionary = gensim.corpora.Dictionary(processed_words)

In [7]:
# Filter out tokens
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [8]:
# Create a dictionary reporting how many words and how many times those words appear
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_words]

In [9]:
# Preview Bag of Words for a sample preprocessed document
bow_doc_500 = bow_corpus[500]

for i in range(len(bow_doc_500)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_500[i][0], 
                                                     dictionary[bow_doc_500[i][0]], 
                                                     bow_doc_500[i][1]))

Word 48 ("'payday',") appears 1 time.
Word 60 ("'see',") appears 1 time.
Word 92 ("'week',") appears 1 time.
Word 102 ("'bonus',") appears 1 time.
Word 148 ("'winner']") appears 1 time.


## TF-IDF

In [10]:
# Create tf-idf model object
tfidf = models.TfidfModel(bow_corpus)

In [11]:
# Apply transformation to the entire corpus
corpus_tfidf = tfidf[bow_corpus]

In [12]:
# Preview TF-IDF scores for the first document
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.8509381336929653), (1, 0.40105490505841873), (2, 0.33920385575007195)]


## Running LDA using Bag of Words

In [13]:
# Train the lda model using gensim.models.LdaMulticore and bow_corpus
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [14]:
# For each topic, explore the words occuring in that topic and its relative weight.
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}\n'.format(idx, topic))

Topic: 0 
Words: 0.083*"'get'," + 0.063*"'pay'," + 0.050*"'day'," + 0.040*"'period'," + 0.040*"'deposit'," + 0.024*"'like'," + 0.023*"'direct'," + 0.023*"'employer'," + 0.022*"'may'," + 0.022*"'notice',"

Topic: 1 
Words: 0.065*"'u'," + 0.046*"'tell'," + 0.043*"'dot'," + 0.043*"'green'," + 0.038*"'comment']" + 0.030*"'get'," + 0.025*"'back'," + 0.025*"'see'," + 0.022*"'weekend'," + 0.019*"'tax',"

Topic: 2 
Words: 0.085*"'see'," + 0.077*"'week'," + 0.069*"'payday'," + 0.063*"'bonus'," + 0.053*"'someone'," + 0.035*"'check'," + 0.034*"'person']" + 0.032*"'ontheblog']" + 0.028*"'winner']" + 0.027*"['learn',"

Topic: 3 
Words: 0.045*"'direct'," + 0.042*"'get'," + 0.042*"'asap'," + 0.040*"'deposit'," + 0.036*"'paid'," + 0.032*"'payment'," + 0.031*"'finance'," + 0.031*"'win'," + 0.030*"'day'," + 0.030*"'play',"

Topic: 4 
Words: 0.054*"'u'," + 0.047*"'tip'," + 0.047*"'like'," + 0.038*"'comment'," + 0.034*"'see'," + 0.030*"'sweepstakes'," + 0.030*"'financial'," + 0.027*"'page'," + 0.025*"'tel

## Running LDA using TF-IDF

In [15]:
# Train the lda model using gensim.models.LdaMulticore and corpus_tfidf
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)

In [16]:
# For each topic, explore the words occuring in that topic and its relative weight.
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} \nWord: {}\n'.format(idx, topic))

Topic: 0 
Word: 0.097*"'financial'," + 0.037*"'goal'," + 0.030*"'new'," + 0.029*"'comment']" + 0.028*"'best'," + 0.027*"'someone'," + 0.027*"'credit'," + 0.025*"['learn'," + 0.024*"'one'," + 0.024*"'learn',"

Topic: 1 
Word: 0.056*"['green'," + 0.034*"'tax'," + 0.029*"'comment'," + 0.027*"'dot'," + 0.027*"'money'," + 0.022*"'tip']" + 0.021*"'prize'," + 0.021*"'finance'," + 0.021*"'smart'," + 0.021*"'giving',"

Topic: 2 
Word: 0.047*"'u'," + 0.046*"'tell'," + 0.038*"'day'," + 0.035*"'credit'," + 0.027*"'moneypak'," + 0.026*"'card'," + 0.023*"'check'," + 0.023*"'get'," + 0.020*"'comment'," + 0.018*"'using',"

Topic: 3 
Word: 0.060*"'money'," + 0.051*"'read'," + 0.049*"'save'," + 0.043*"'ontheblog']" + 0.039*"'finance'," + 0.031*"'financial'," + 0.025*"['learn'," + 0.025*"'way'," + 0.025*"'read']" + 0.021*"'u',"

Topic: 4 
Word: 0.035*"'tip'," + 0.035*"'budget'," + 0.026*"'back'," + 0.025*"'like'," + 0.023*"'u'," + 0.023*"'saving'," + 0.023*"'page'," + 0.022*"'cash'," + 0.022*"'sweepstake

## Classification of the Topics

### Performance evaluation by classifying sample document using LDA Bag of Words model

In [17]:
processed_words[500]

["['someone',",
 "'cloud',",
 "'nine',",
 "'see',",
 "'week',",
 "'payday',",
 "'bonus',",
 "'winner']"]

In [18]:
for index, score in sorted(lda_model[bow_corpus[500]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.849994421005249	 
Topic: 0.085*"'see'," + 0.077*"'week'," + 0.069*"'payday'," + 0.063*"'bonus'," + 0.053*"'someone'," + 0.035*"'check'," + 0.034*"'person']" + 0.032*"'ontheblog']" + 0.028*"'winner']" + 0.027*"['learn',"

Score: 0.01666816882789135	 
Topic: 0.080*"'card'," + 0.078*"'dot'," + 0.059*"'green'," + 0.032*"'mobile'," + 0.027*"'prepaid'," + 0.027*"'tax'," + 0.026*"'app'," + 0.026*"'account'," + 0.026*"'credit'," + 0.025*"'balance',"

Score: 0.016667695716023445	 
Topic: 0.058*"'dot'," + 0.056*"'green'," + 0.054*"'sweepstakes'," + 0.042*"'money'," + 0.031*"'advice'," + 0.025*"'see'," + 0.023*"'u'," + 0.023*"'love'," + 0.022*"'entrant']" + 0.021*"'comment',"

Score: 0.016667524352669716	 
Topic: 0.065*"'u'," + 0.046*"'tell'," + 0.043*"'dot'," + 0.043*"'green'," + 0.038*"'comment']" + 0.030*"'get'," + 0.025*"'back'," + 0.025*"'see'," + 0.022*"'weekend'," + 0.019*"'tax',"

Score: 0.016667494550347328	 
Topic: 0.046*"'card'," + 0.043*"'win'," + 0.030*"'fee'," + 0.028*"'ca

### Performance evaluation by classifying sample document using LDA TF-IDF model

In [19]:
for index, score in sorted(lda_model_tfidf[bow_corpus[500]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.8499804139137268	 
Topic: 0.049*"'back'," + 0.042*"'cash'," + 0.034*"'love'," + 0.028*"'visa'," + 0.026*"'green'," + 0.026*"'dot'," + 0.026*"'card'," + 0.025*"'debit'," + 0.025*"'favorite'," + 0.023*"'bonus',"

Score: 0.016671495512127876	 
Topic: 0.063*"'payment'," + 0.060*"'ssi'," + 0.034*"'stay'," + 0.032*"'see'," + 0.031*"'tip'," + 0.029*"'benefit'," + 0.029*"'time'," + 0.028*"'person']" + 0.024*"'week'," + 0.023*"'someone',"

Score: 0.01667030341923237	 
Topic: 0.060*"'money'," + 0.051*"'read'," + 0.049*"'save'," + 0.043*"'ontheblog']" + 0.039*"'finance'," + 0.031*"'financial'," + 0.025*"['learn'," + 0.025*"'way'," + 0.025*"'read']" + 0.021*"'u',"

Score: 0.01667006127536297	 
Topic: 0.035*"'way'," + 0.033*"'advice'," + 0.033*"'green'," + 0.030*"'dot'," + 0.029*"'app'," + 0.025*"'mobile'," + 0.023*"'win'," + 0.022*"'learn']" + 0.021*"'sweepstakes'," + 0.020*"'winner']"

Score: 0.016669511795043945	 
Topic: 0.097*"'financial'," + 0.037*"'goal'," + 0.030*"'new'," + 0.029*"