# Topic Modeling Using LDA

Description: Topic Modeling is a type of statistical modeling for discovering the abstract topics that occur in a collection of documents.

Latent Dirichlet Allocation (LDA) is an example of topic model and is used to classify text in a document to a particular topic. It builds a topic per document model and words per topic model, modeled as Dirichlet distributions.

## Import Libraries

### Main Libraries

In [1]:
import pandas as pd

### NLP Libraries

In [2]:
import gensim
from gensim import corpora, models
from gensim.models import CoherenceModel
from pprint import pprint
from nltk.tokenize import word_tokenize

## Load Dataset

In [3]:
# Load dataset
df = pd.read_csv('C:/Users/cherryb/Desktop/Personal Projects/Datasets/Telus - Fintech/cleaned/fullstatCleaned_withLabels.tsv', index_col='Unnamed: 0', sep='\t')
# Inspect df
df.head()

Unnamed: 0,by,category,comment_likes_count,comments_base,comments_count_fb,comments_replies,comments_retrieved,engagement_fb,likes_count_fb,post_id,...,post_published,rea_ANGRY,rea_HAHA,rea_LOVE,rea_SAD,rea_THANKFUL,rea_WOW,reactions_count_fb,shares_count_fb,type
0,post_page_155027942462,App Update,0,0,11,0,0,22,11,155027942462_10157348020627463,...,2019-06-27T14:30:13+0000,0,0,0,0,0,0,11,0,photo
1,post_page_155027942462,Engagement,0,0,30,0,0,110,70,155027942462_10157333387457463,...,2019-06-21T14:31:06+0000,1,0,4,0,0,1,76,4,photo
2,post_page_155027942462,Engagement,0,0,11,0,0,17,5,155027942462_10157330985232463,...,2019-06-20T15:00:28+0000,1,0,0,0,0,0,6,0,photo
3,post_page_155027942462,Engagement,0,0,10,0,0,19,8,155027942462_10157323881577463,...,2019-06-17T14:30:11+0000,1,0,0,0,0,0,9,0,photo
4,post_page_155027942462,Engagement,0,0,8,0,0,17,6,155027942462_10157315990422463,...,2019-06-14T14:30:02+0000,3,0,0,0,0,0,9,0,video


In [4]:
# Tokenize post_message column
df['post_message'] = df['post_message'].apply(lambda list_words: word_tokenize(list_words))

## Data Pre-processing

In [5]:
# Select df['post_message']
processed_words = df['post_message']

In [6]:
# Create a list of processed_words
processed_words = [words for words in processed_words]

### Bag of Word (BoW)

In [7]:
# Create a dictionary from ‘processed_words’ containing the number of times a word appears in the training set
dictionary = corpora.Dictionary(processed_words)

In [8]:
# Filter out tokens
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [9]:
# Create a dictionary reporting how many words and how many times those words appear
corpus_bow = [dictionary.doc2bow(doc) for doc in processed_words]

In [10]:
# Preview Bag of Words for a sample preprocessed document
bow_doc_500 = corpus_bow[500]

for i in range(len(bow_doc_500)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_500[i][0], dictionary[bow_doc_500[i][0]], bow_doc_500[i][1]))

Word 57 ("paydai") appears 1 time.
Word 68 ("see") appears 1 time.
Word 108 ("week") appears 1 time.
Word 118 ("someon") appears 1 time.
Word 119 ("bonu") appears 1 time.
Word 163 ("winner") appears 1 time.


### TFIDF

In [11]:
# Fit model
tfidf = models.TfidfModel(corpus_bow)

In [12]:
# Apply transformation to the entire corpus
corpus_tfidf = tfidf[corpus_bow]

In [13]:
# Preview TF-IDF scores for the first document
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.8057762139865111),
 (1, 0.3489304433755792),
 (2, 0.39392744378681527),
 (3, 0.2716494205605292)]


## Topic Modeling - LDA

### Using BoW

In [14]:
# Train the lda model using gensim.models.LdaMulticore and corpus_bow
lda_model = gensim.models.LdaMulticore(corpus_bow, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [15]:
# For each topic, explore the words occuring in that topic and its relative weight.
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}\n'.format(idx, topic))

Topic: 0 
Words: 0.039*"credit" + 0.035*"wai" + 0.033*"save" + 0.028*"u" + 0.025*"make" + 0.025*"monei" + 0.023*"dai" + 0.023*"learn" + 0.022*"read" + 0.022*"comment"

Topic: 1 
Words: 0.042*"green" + 0.042*"get" + 0.041*"dot" + 0.033*"card" + 0.031*"sweepstak" + 0.028*"advic" + 0.023*"back" + 0.020*"cash" + 0.018*"help" + 0.018*"monei"

Topic: 2 
Words: 0.040*"app" + 0.039*"u" + 0.039*"mobil" + 0.034*"balanc" + 0.029*"account" + 0.029*"dot" + 0.029*"green" + 0.028*"tell" + 0.028*"learn" + 0.021*"bank"

Topic: 3 
Words: 0.063*"green" + 0.061*"dot" + 0.048*"tip" + 0.041*"card" + 0.036*"tax" + 0.030*"like" + 0.026*"love" + 0.020*"page" + 0.018*"season" + 0.016*"enter"

Topic: 4 
Words: 0.073*"card" + 0.063*"cash" + 0.049*"back" + 0.045*"dot" + 0.044*"green" + 0.042*"appli" + 0.040*"fee" + 0.033*"debit" + 0.029*"visa" + 0.027*"get"

Topic: 5 
Words: 0.035*"see" + 0.035*"green" + 0.033*"win" + 0.033*"dot" + 0.030*"winner" + 0.029*"dai" + 0.026*"week" + 0.025*"tip" + 0.023*"plai" + 0.022*"g

### Using TFIDF

In [16]:
# Train the lda model using gensim.models.LdaMulticore and corpus_tfidf
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)

In [17]:
# For each topic, explore the words occuring in that topic and its relative weight.
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} \nWord: {}\n'.format(idx, topic))

Topic: 0 
Word: 0.033*"payment" + 0.033*"back" + 0.031*"schedul" + 0.031*"learn" + 0.028*"ssi" + 0.026*"get" + 0.025*"cash" + 0.022*"card" + 0.022*"talk" + 0.019*"bill"

Topic: 1 
Word: 0.038*"summer" + 0.029*"dai" + 0.025*"ontheblog" + 0.023*"make" + 0.020*"start" + 0.019*"go" + 0.018*"famili" + 0.017*"card" + 0.017*"dot" + 0.016*"green"

Topic: 2 
Word: 0.036*"happi" + 0.024*"read" + 0.024*"stai" + 0.021*"dot" + 0.021*"green" + 0.021*"new" + 0.020*"want" + 0.019*"like" + 0.019*"kind" + 0.017*"sweepstak"

Topic: 3 
Word: 0.043*"tip" + 0.028*"monei" + 0.027*"good" + 0.025*"save" + 0.024*"extra" + 0.023*"smart" + 0.023*"entrant" + 0.023*"experi" + 0.022*"custom" + 0.022*"statement"

Topic: 4 
Word: 0.041*"comment" + 0.038*"share" + 0.031*"u" + 0.029*"budget" + 0.024*"tell" + 0.021*"save" + 0.019*"wai" + 0.018*"dai" + 0.018*"prepaid" + 0.017*"goal"

Topic: 5 
Word: 0.056*"financi" + 0.044*"green" + 0.043*"dot" + 0.031*"love" + 0.029*"advic" + 0.026*"credit" + 0.026*"shop" + 0.025*"ontheb

## Classification of the Topics

### Performance evaluation by classifying sample document using LDA Bag of Words model

In [18]:
processed_words[500]

['someon', 'cloud', 'nine', 'see', 'week', 'paydai', 'bonu', 'winner']

In [19]:
for index, score in sorted(lda_model[corpus_bow[500]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.8714189529418945	 
Topic: 0.054*"see" + 0.045*"u" + 0.043*"comment" + 0.040*"bonu" + 0.038*"paydai" + 0.033*"week" + 0.032*"tell" + 0.027*"financi" + 0.027*"like" + 0.024*"winner"

Score: 0.014288805425167084	 
Topic: 0.035*"see" + 0.035*"green" + 0.033*"win" + 0.033*"dot" + 0.030*"winner" + 0.029*"dai" + 0.026*"week" + 0.025*"tip" + 0.023*"plai" + 0.022*"get"

Score: 0.014287449419498444	 
Topic: 0.042*"back" + 0.038*"card" + 0.038*"dot" + 0.037*"green" + 0.037*"financi" + 0.034*"cash" + 0.028*"paydai" + 0.028*"bonu" + 0.023*"visa" + 0.023*"entrant"

Score: 0.014286912977695465	 
Topic: 0.110*"get" + 0.055*"pai" + 0.042*"dai" + 0.041*"period" + 0.038*"deposit" + 0.025*"time" + 0.024*"mai" + 0.024*"depend" + 0.023*"employ" + 0.022*"like"

Score: 0.014286698773503304	 
Topic: 0.048*"share" + 0.041*"monei" + 0.036*"save" + 0.036*"comment" + 0.035*"learn" + 0.033*"financ" + 0.026*"u" + 0.026*"see" + 0.025*"win" + 0.021*"thank"

Score: 0.01428657490760088	 
Topic: 0.042*"green" +

### Performance evaluation by classifying sample document using LDA TF-IDF model

In [20]:
for index, score in sorted(lda_model_tfidf[corpus_bow[500]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.8714226484298706	 
Topic: 0.053*"week" + 0.049*"paydai" + 0.047*"bonu" + 0.037*"see" + 0.034*"someon" + 0.032*"winner" + 0.024*"treat" + 0.024*"pai" + 0.024*"get" + 0.023*"person"

Score: 0.01428710762411356	 
Topic: 0.045*"monei" + 0.035*"season" + 0.026*"see" + 0.025*"free" + 0.024*"save" + 0.021*"thank" + 0.021*"plan" + 0.018*"extra" + 0.018*"u" + 0.018*"credit"

Score: 0.014286509715020657	 
Topic: 0.043*"tip" + 0.028*"monei" + 0.027*"good" + 0.025*"save" + 0.024*"extra" + 0.023*"smart" + 0.023*"entrant" + 0.023*"experi" + 0.022*"custom" + 0.022*"statement"

Score: 0.014286419376730919	 
Topic: 0.041*"comment" + 0.038*"share" + 0.031*"u" + 0.029*"budget" + 0.024*"tell" + 0.021*"save" + 0.019*"wai" + 0.018*"dai" + 0.018*"prepaid" + 0.017*"goal"

Score: 0.01428639329969883	 
Topic: 0.056*"financi" + 0.044*"green" + 0.043*"dot" + 0.031*"love" + 0.029*"advic" + 0.026*"credit" + 0.026*"shop" + 0.025*"ontheblog" + 0.025*"read" + 0.025*"monei"

Score: 0.014286370016634464	 
Topi

## Evaluating the LDA Model

"As a rule of thumb for a good LDA model, the perplexity score should be low while coherence should be high"

### LDA using BoW

In [21]:
# Print Perplexity:
print('\nPerplexity:', lda_model.log_perplexity(corpus_bow))

# Compute for coherence score
coherence_score_lda = CoherenceModel(model=lda_model, texts=processed_words, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_score_lda.get_coherence()
# Print Coherence score:
print('\nCoherence Score:', coherence_score)


Perplexity: -4.999184949024548

Coherence Score: 0.4279726410034108


### LDA using TFIDF

In [22]:
# Print Perplexity:
print('\nPerplexity:', lda_model_tfidf.log_perplexity(corpus_tfidf))

# Compute for coherence score
coherence_score_lda_tf = CoherenceModel(model=lda_model_tfidf, texts=processed_words, dictionary=dictionary, coherence='c_v')
coherence_score_tf = coherence_score_lda_tf.get_coherence()
# Print Coherence Score:
print('\nCoherence Score:', coherence_score_tf)


Perplexity: -6.525632269902833

Coherence Score: 0.3643418335681687


-----------------

## Transform topics into Features

In [23]:
def get_listScore(topic):
    '''
    Get scores and put it in a list
    e.g. [1,2,3,4]
    
    Note: There are corpus that does not have a topic,
    so it gets an error of IndexError, to combat the IndexError,
    append 0 instead.
    '''
    scores = []
    for i in range(len(corpus_bow)):
        try:
            score = lda_model_tfidf[corpus_bow[i]][topic][1]
            scores.append(score)
        except IndexError:
            scores.append(0)
    return scores

In [24]:
# Create empty dictionary_scores
dictionary_scores = {}

# Set dictionary
numTopics = 10

for topic in range(numTopics):
    key = 'Topic ' + str(topic)
    dictionary_scores[key] = get_listScore(topic)

In [25]:
# Convert dictionary to dataFrame
topic_features = pd.DataFrame(dictionary_scores)
topic_features.head()

Unnamed: 0,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9
0,0.849985,0.016668,0.016669,0.016668,0.016668,0.016668,0.016668,0.016672,0.016667,0.016667
1,0.033335,0.699976,0.033337,0.033337,0.033338,0.033337,0.033335,0.033336,0.033334,0.033334
2,0.025002,0.025002,0.025012,0.77496,0.025004,0.025007,0.025003,0.025003,0.025002,0.025006
3,0.025003,0.025015,0.025002,0.025014,0.025006,0.774933,0.025009,0.025002,0.025003,0.337901
4,0.012502,0.012504,0.012504,0.545986,0.354294,0.012505,0.012502,0.012501,0.012502,0.012501


### Save as topic_features.tsv

In [26]:
topic_features.to_csv('C:/Users/cherryb/Desktop/Personal Projects/Datasets/Telus - Fintech/results/topic_features.tsv', sep='\t')