# Topic Modeling Using LDA

Description: Topic Modeling is a type of statistical modeling for discovering the abstract topics that occur in a collection of documents.

Latent Dirichlet Allocation (LDA) is an example of topic model and is used to classify text in a document to a particular topic. It builds a topic per document model and words per topic model, modeled as Dirichlet distributions.

## Import Libraries

### Main Libraries

In [1]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

### NLP Libraries

In [2]:
import gensim
from gensim import corpora, models
from pprint import pprint

## Load Dataset

In [3]:
# Load dataset
df = pd.read_csv('fullstatCleaned_withLabels.tsv', sep='\t')
# Inspect df
df.head()

Unnamed: 0,by,category,comment_likes_count,comments_base,comments_count_fb,comments_replies,comments_retrieved,engagement_fb,likes_count_fb,post_message,post_published,rea_ANGRY,rea_HAHA,rea_LOVE,rea_SAD,rea_THANKFUL,rea_WOW,reactions_count_fb,shares_count_fb,type
0,post_page_155027942462,App Update,0,0,11,0,0,22,11,"['paying', 'bill', 'autopay', 'ensures', 'bill...",2019-06-27T14:30:13+0000,0,0,0,0,0,0,11,0,photo
1,post_page_155027942462,Engagement,0,0,30,0,0,110,70,"['raise', 'hand', 'excited', 'first', 'day', '...",2019-06-21T14:31:06+0000,1,0,4,0,0,1,76,4,photo
2,post_page_155027942462,Engagement,0,0,11,0,0,17,5,"['couple', 'save', 'together', 'stay', 'foreve...",2019-06-20T15:00:28+0000,1,0,0,0,0,0,6,0,photo
3,post_page_155027942462,Engagement,0,0,10,0,0,19,8,"['case', 'forgotten', 'make', 'saving', 'daily...",2019-06-17T14:30:11+0000,1,0,0,0,0,0,9,0,photo
4,post_page_155027942462,Engagement,0,0,8,0,0,17,6,"['father', 'day', 'approaching', 'got', 'smart...",2019-06-14T14:30:02+0000,3,0,0,0,0,0,9,0,video


In [4]:
# Selct necessary columns
processed_words = df['post_message']

## Bag of Words on the Dataset

In [5]:
# Create a tokenized string for BoW input
processed_words = [words.split() for words in processed_words]

In [6]:
# Create a dictionary from ‘processed_words’ containing the number of times a word appears in the training set
dictionary = gensim.corpora.Dictionary(processed_words)

In [7]:
# Filter out tokens
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [8]:
# Create a dictionary reporting how many words and how many times those words appear
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_words]

In [9]:
# Preview Bag of Words for a sample preprocessed document
bow_doc_500 = bow_corpus[500]

for i in range(len(bow_doc_500)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_500[i][0], 
                                                     dictionary[bow_doc_500[i][0]], 
                                                     bow_doc_500[i][1]))

Word 48 ("'payday',") appears 1 time.
Word 60 ("'see',") appears 1 time.
Word 92 ("'week',") appears 1 time.
Word 102 ("'bonus',") appears 1 time.
Word 148 ("'winner']") appears 1 time.


## TF-IDF

In [10]:
# Create tf-idf model object
tfidf = models.TfidfModel(bow_corpus)

In [11]:
# Apply transformation to the entire corpus
corpus_tfidf = tfidf[bow_corpus]

In [12]:
# Preview TF-IDF scores for the first document
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.8509381336929653), (1, 0.40105490505841873), (2, 0.33920385575007195)]


## Running LDA using Bag of Words

In [13]:
# Train the lda model using gensim.models.LdaMulticore and bow_corpus
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [14]:
# For each topic, explore the words occuring in that topic and its relative weight.
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.065*"'payday'," + 0.060*"'bonus'," + 0.044*"'extra'," + 0.040*"'payment'," + 0.039*"'entrant'," + 0.037*"'customer'," + 0.034*"'statement'," + 0.034*"'reflects'," + 0.034*"'ssi'," + 0.032*"'experience']"
Topic: 1 
Words: 0.085*"'cash'," + 0.074*"'back'," + 0.055*"'card'," + 0.048*"'dot'," + 0.046*"'green'," + 0.043*"'money'," + 0.039*"'visa'," + 0.037*"'debit'," + 0.036*"'earn'," + 0.028*"'annually',"
Topic: 2 
Words: 0.076*"'green'," + 0.070*"'dot'," + 0.051*"'money'," + 0.030*"'app'," + 0.030*"'read'," + 0.028*"'mobile'," + 0.026*"'u'," + 0.025*"'card'," + 0.024*"'balance'," + 0.023*"'learn']"
Topic: 3 
Words: 0.036*"'family'," + 0.034*"'play'," + 0.032*"'win'," + 0.030*"'back'," + 0.029*"'spin'," + 0.028*"'day'," + 0.027*"'cash'," + 0.026*"'get'," + 0.025*"'game'," + 0.022*"'tip',"
Topic: 4 
Words: 0.081*"'u'," + 0.060*"'financial'," + 0.049*"'tell'," + 0.041*"'make'," + 0.036*"'tip'," + 0.031*"'finance'," + 0.029*"'comment']" + 0.028*"'stay'," + 0.028*"'ontheblog

## Running LDA using TF-IDF

In [15]:
# Train the lda model using gensim.models.LdaMulticore and corpus_tfidf
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)

In [16]:
# For each topic, explore the words occuring in that topic and its relative weight.
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.059*"'financial'," + 0.040*"'week'," + 0.038*"'bonus'," + 0.037*"'payday'," + 0.031*"'see'," + 0.029*"'someone'," + 0.029*"'balance'," + 0.028*"'check'," + 0.028*"'day'," + 0.027*"'winner',"
Topic: 1 Word: 0.041*"'tell'," + 0.038*"'comment']" + 0.035*"'u'," + 0.028*"'like'," + 0.028*"'summer'," + 0.026*"'sweepstakes'," + 0.025*"'tip'," + 0.025*"'comment'," + 0.023*"'page'," + 0.023*"'weekend',"
Topic: 2 Word: 0.082*"['learn'," + 0.055*"'tip'," + 0.048*"'new'," + 0.036*"'save'," + 0.032*"'time'," + 0.027*"'using'," + 0.024*"'today'," + 0.020*"'every'," + 0.020*"'ontheblog']" + 0.019*"'credit',"
Topic: 3 Word: 0.057*"'back'," + 0.047*"'cash'," + 0.044*"'get'," + 0.035*"['happy'," + 0.032*"'shopping'," + 0.031*"'visa'," + 0.026*"'card'," + 0.026*"'check'," + 0.026*"'annually'," + 0.024*"'green',"
Topic: 4 Word: 0.041*"'credit'," + 0.034*"'like'," + 0.031*"'learn'," + 0.029*"'financialliteracymonth']" + 0.028*"'page'," + 0.027*"'back'," + 0.025*"'read']" + 0.022*"'budgetfr

## Classification of the Topics

### Performance evaluation by classifying sample document using LDA Bag of Words model

In [17]:
processed_words[500]

["['someone',",
 "'cloud',",
 "'nine',",
 "'see',",
 "'week',",
 "'payday',",
 "'bonus',",
 "'winner']"]

In [18]:
for index, score in sorted(lda_model[bow_corpus[500]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.8499889969825745	 
Topic: 0.069*"'see'," + 0.046*"'comment'," + 0.043*"'week'," + 0.039*"'sweepstakes'," + 0.039*"'bonus'," + 0.038*"'payday'," + 0.032*"'winner'," + 0.025*"'dot'," + 0.025*"'green'," + 0.025*"'share',"

Score: 0.016671868041157722	 
Topic: 0.065*"'payday'," + 0.060*"'bonus'," + 0.044*"'extra'," + 0.040*"'payment'," + 0.039*"'entrant'," + 0.037*"'customer'," + 0.034*"'statement'," + 0.034*"'reflects'," + 0.034*"'ssi'," + 0.032*"'experience']"

Score: 0.016668537631630898	 
Topic: 0.065*"'tax'," + 0.035*"'dot'," + 0.033*"'card'," + 0.031*"'refund'," + 0.031*"'green'," + 0.029*"'win'," + 0.026*"'see'," + 0.025*"'prize'," + 0.025*"'deposit'," + 0.023*"'direct',"

Score: 0.016668010503053665	 
Topic: 0.036*"'family'," + 0.034*"'play'," + 0.032*"'win'," + 0.030*"'back'," + 0.029*"'spin'," + 0.028*"'day'," + 0.027*"'cash'," + 0.026*"'get'," + 0.025*"'game'," + 0.022*"'tip',"

Score: 0.01666739396750927	 
Topic: 0.076*"'green'," + 0.070*"'dot'," + 0.051*"'money'," + 

### Performance evaluation by classifying sample document using LDA TF-IDF model

In [19]:
for index, score in sorted(lda_model_tfidf[bow_corpus[500]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.8499898314476013	 
Topic: 0.059*"'financial'," + 0.040*"'week'," + 0.038*"'bonus'," + 0.037*"'payday'," + 0.031*"'see'," + 0.029*"'someone'," + 0.029*"'balance'," + 0.028*"'check'," + 0.028*"'day'," + 0.027*"'winner',"

Score: 0.016670944169163704	 
Topic: 0.035*"'see'," + 0.035*"'favorite'," + 0.029*"'read'," + 0.027*"'winner']" + 0.025*"'u'," + 0.023*"'make'," + 0.023*"'love'," + 0.022*"'comment'," + 0.022*"'dot'," + 0.022*"'week',"

Score: 0.01666988432407379	 
Topic: 0.050*"'payment'," + 0.045*"'ssi'," + 0.038*"'advice'," + 0.032*"'bonus'," + 0.031*"'payday'," + 0.029*"'love'," + 0.027*"'entrant'," + 0.026*"'benefit'," + 0.026*"'sweepstakes'," + 0.025*"'customer',"

Score: 0.016667841002345085	 
Topic: 0.041*"'tell'," + 0.038*"'comment']" + 0.035*"'u'," + 0.028*"'like'," + 0.028*"'summer'," + 0.026*"'sweepstakes'," + 0.025*"'tip'," + 0.025*"'comment'," + 0.023*"'page'," + 0.023*"'weekend',"

Score: 0.016667108982801437	 
Topic: 0.082*"['learn'," + 0.055*"'tip'," + 0.048*"