### Import Libraries

In [10]:
import pandas as pd
import numpy as np

import string
import spacy

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

#Plotting tools
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pyLDAvis
import pyLDAvis.sklearn


# Machine Learning
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV

### Load dataset

In [11]:
# Load dataset
# The path is relative, so the CSV is looked up in the notebook's working directory.

dataset = pd.read_csv('sample_review_c1.csv')
dataset.head()

Unnamed: 0,id,time,participant,message,truth
0,1,2004-11-11 11:11:11,P1,nice story. i like the way snape is developed....,False
1,2,2004-11-11 11:11:11,P2,w,False
2,3,2004-11-11 11:11:11,P3,Please please please update! You can't leave i...,False
3,4,2004-11-11 11:11:11,P4,noooo it can't be the end!,False
4,5,2004-11-11 11:11:11,P5,Noooo you can't top here! It's such an awesom...,True


In [12]:
dataset.shape

(4499, 5)

In [13]:
# Extract the reviews column 
reviews = dataset['message'].tolist()
reviews[0:10]

['nice story. i like the way snape is developed. though i wonder how sirius and snape will go',
 'w',
 "Please please please update! You can't leave it at that! Please  just one more chapter! I want to see the misunderstanding between Harry and Snape to be solved and.. Please!  Why don't I ever learn! I find an truly incredible fic and get all excited about it and in the last updated chapter I realize that there's a note saying that the author has abandoned the story or s/he han't updated it in years!  But please  I'm begging you  please update this! I simply love this  and I really think you got Sev and Harry spot on! Actually  all the characters have been acting in-character  as far as I can see it. Your writing style is just inspiring and I just want to read more and more and more! And I can't bear to see Harry thinking that Snape doesn't like nor want him anymore  while Snape is basically thinking the same! Please? It would mean the world to me.",
 "noooo it can't be the end!",
 "N

### Data cleaning

In [14]:
# Lookup sets used by clean_doc: NLTK's English stop-word list and the ASCII
# punctuation characters from the standard library.
# NOTE: stopwords.words() needs the NLTK 'stopwords' corpus to be installed
# locally (nltk.download('stopwords')); no download call is visible in this notebook.
stop = set(stopwords.words('english'))
punc = set(string.punctuation)

In [15]:
lemma = WordNetLemmatizer()

In [16]:
def clean_doc(doc):
    """Normalize one review for bag-of-words modelling.

    Steps, in order: lower-case and split on whitespace, drop tokens found in
    ``stop``, delete every character found in ``punc``, then lemmatize each
    remaining token with ``lemma``.

    Parameters
    ----------
    doc : str
        Raw review text. Any non-string value (for example the float NaN that
        pandas produces for an empty CSV field) is treated as an empty document
        instead of raising AttributeError on ``.lower()``.

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces; "" when the
        input is not a string or nothing survives.
    """
    if not isinstance(doc, str):
        return ""
    # Stop words are filtered before punctuation is stripped, so a token with
    # punctuation still attached (e.g. "the,") is not matched against `stop` here.
    kept_tokens = [token for token in doc.lower().split() if token not in stop]
    punc_free = "".join(ch for ch in " ".join(kept_tokens) if ch not in punc)
    return " ".join(lemma.lemmatize(word) for word in punc_free.split())

In [17]:
doc_clean = [clean_doc(doc) for doc in reviews]

### Create Document Term Matrix

In [18]:
# Bag-of-words vectorizer for the cleaned reviews:
#   max_features=1000    keep only the 1000 most frequent terms
#   max_df=0.95          drop terms that occur in more than 95% of the documents
#   min_df=2             drop terms that occur in fewer than 2 documents
#   stop_words='english' apply scikit-learn's built-in English stop-word list
tf_vectorizer = CountVectorizer(max_features=1000, max_df=0.95, min_df=2, stop_words='english')

In [19]:
tf = tf_vectorizer.fit_transform(doc_clean)

In [20]:
# Vocabulary terms, in the column order of the document-term matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() and fall back on older installations.
tf_feature_names = list(
    tf_vectorizer.get_feature_names_out()
    if hasattr(tf_vectorizer, "get_feature_names_out")
    else tf_vectorizer.get_feature_names()
)

### Create LDA Model

In [21]:
# Let's start with 20 topics (passed as n_components to the first LDA fit)
no_topics = 20

In [22]:
# Baseline model: online learning, at most 5 passes over the data, fixed seed.
lda = LatentDirichletAllocation(
    n_components=no_topics,
    max_iter=5,
    learning_method='online',
    random_state=123,
).fit(tf)

In [23]:
lda.components_.shape

(20, 1000)

### Output Topics

In [25]:
def display_topics(model, feature_names, no_top_words):
    """Print every topic's index followed by its highest-weighted terms.

    Parameters
    ----------
    model
        Fitted estimator exposing ``components_``; each row holds the term
        weights of one topic.
    feature_names
        Sequence that maps a column index of ``components_`` to its term.
    no_top_words : int
        How many terms to print per topic, strongest first.
    """
    for topic_idx, weights in enumerate(model.components_):
        # Ascending argsort reversed -> strongest terms first, then truncate.
        ranked = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[i] for i in ranked]
        print("Topic %d:" % topic_idx)
        print(" ".join(top_terms))

In [26]:
display_topics(lda, tf_feature_names, 10)

Topic 0:
continue twilight hope day story fun say fantastic great sure
Topic 1:
story im like thing interesting really know good thank wow
Topic 2:
make think pretty said yeah book good hate thats like
Topic 3:
way relationship chapter thing plot luna omg die mean ive
Topic 4:
wait spike brilliant best yay got read believe twilight word
Topic 5:
chapter love story great new know thought write fic like
Topic 6:
celestia coming dusk saw learn nightmare apple ray se equestria
Topic 7:
look que bad add forward lovely black magic kill lord
Topic 8:
rose like feel scene song dont personality glad certainly especially
Topic 9:
loved looking sequel story gonna thanks forward reading big im
Topic 10:
really story like good love end want come read character
Topic 11:
nice bravo wonder cutie girl totally dude vinyl amy play
Topic 12:
doctor better ending time need know written jack shit got
Topic 13:
story good awesome work great im read think make far
Topic 14:
xd funny like liked time long bit 

In [27]:
print(lda)
print("Log likelihood: ", lda.score(tf))
print("Perplexity:", lda.perplexity(tf) )

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_components=20, n_jobs=1, n_topics=None, perp_tol=0.1,
             random_state=123, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)
Log likelihood:  -317567.2353866406
Perplexity: 710.7538934431799


### Grid Search to improve model performance

In [28]:
lda = LatentDirichletAllocation(max_iter=5, learning_method='online', random_state=123)

In [29]:
search_params = {'n_components':[10, 15, 20, 25, 30], 'learning_decay': [0.5, 0.7, 0.9]}

In [30]:
# No `scoring` is passed, so candidates are ranked with the estimator's own
# score() method (for LatentDirichletAllocation: approximate log-likelihood),
# evaluated on each held-out fold. The number of folds is the library default.
model = GridSearchCV(lda, param_grid=search_params)

In [31]:
model.fit(tf)

GridSearchCV(cv=None, error_score='raise',
       estimator=LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_components=10, n_jobs=1, n_topics=None, perp_tol=0.1,
             random_state=123, topic_word_prior=None,
             total_samples=1000000.0, verbose=0),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_components': [10, 15, 20, 25, 30], 'learning_decay': [0.5, 0.7, 0.9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [32]:
print(model.best_estimator_)
print(model.best_params_)
print(model.best_score_)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.9,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_components=10, n_jobs=1, n_topics=None, perp_tol=0.1,
             random_state=123, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)
{'learning_decay': 0.9, 'n_components': 10}
-120768.71205628286


In [42]:
# Second search round: smaller topic counts and higher learning_decay values.
search_params = {
    'n_components': [4, 6, 8, 10],
    'learning_decay': [0.9, 0.95, 1.0],
}
lda = LatentDirichletAllocation(
    max_iter=5,
    learning_method='online',
    random_state=123,
)
model = GridSearchCV(lda, param_grid=search_params)
model.fit(tf)
for best in (model.best_estimator_, model.best_params_, model.best_score_):
    print(best)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=1.0,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_components=4, n_jobs=1, n_topics=None, perp_tol=0.1,
             random_state=123, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)
{'learning_decay': 1.0, 'n_components': 4}
-112294.04041280781


In [43]:
# Third search round: learning_decay fixed at 1.0, only the topic count (1-4) varies.
search_params = {'n_components': [1, 2, 3, 4]}
lda = LatentDirichletAllocation(
    max_iter=5,
    learning_method='online',
    learning_decay=1.0,
    random_state=123,
)
model = GridSearchCV(lda, param_grid=search_params)
model.fit(tf)
for best in (model.best_estimator_, model.best_params_, model.best_score_):
    print(best)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=1.0,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_components=1, n_jobs=1, n_topics=None, perp_tol=0.1,
             random_state=123, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)
{'n_components': 1}
-105584.94062393658


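Although the grid search keeps preferring fewer topics (down to a single topic), we need to extract at least 10 topics so that they can be compared with the qualitative analysis of the reviews. Hence, let's keep the number of topics at 10.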

In [48]:
# Fourth search round: n_components fixed at 10; tune learning_offset and max_iter.
search_params = {
    'learning_offset': [10, 20, 30, 40, 50],
    'max_iter': [5, 10, 15, 20],
}
lda = LatentDirichletAllocation(
    n_components=10,
    max_iter=5,
    learning_method='online',
    learning_decay=1.0,
    random_state=123,
)
model = GridSearchCV(lda, param_grid=search_params)
model.fit(tf)
for best in (model.best_estimator_, model.best_params_, model.best_score_):
    print(best)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=1.0,
             learning_method='online', learning_offset=50,
             max_doc_update_iter=100, max_iter=15, mean_change_tol=0.001,
             n_components=10, n_jobs=1, n_topics=None, perp_tol=0.1,
             random_state=123, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)
{'learning_offset': 50, 'max_iter': 15}
-117460.98436591157


In [49]:
# Fifth search round: max_iter fixed at 15; try larger learning_offset values.
search_params = {'learning_offset': [50, 60, 70, 80, 90]}
lda = LatentDirichletAllocation(
    n_components=10,
    max_iter=15,
    learning_method='online',
    learning_decay=1.0,
    random_state=123,
)
model = GridSearchCV(lda, param_grid=search_params)
model.fit(tf)
for best in (model.best_estimator_, model.best_params_, model.best_score_):
    print(best)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=1.0,
             learning_method='online', learning_offset=90,
             max_doc_update_iter=100, max_iter=15, mean_change_tol=0.001,
             n_components=10, n_jobs=1, n_topics=None, perp_tol=0.1,
             random_state=123, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)
{'learning_offset': 90}
-117140.1633264214


### LDA Model with best chosen parameters

In [41]:
# Fit one model on the full document-term matrix with the chosen settings and
# report its log-likelihood and perplexity on that same matrix.
# NOTE(review): learning_offset is 50 here, but the last grid search above
# reported learning_offset=90 as best. This cell's execution count (In [41]) is
# also lower than those searches (In [48], In [49]), i.e. it was run before
# them - confirm which offset is intended and re-run in order.
lda = LatentDirichletAllocation(n_components=10, max_iter=15, learning_method='online', learning_decay =1.0, learning_offset= 50. , random_state=123)
lda.fit(tf)
print(lda)
print("Log likelihood: ", lda.score(tf))
print("Perplexity:", lda.perplexity(tf))

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=1.0,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=15, mean_change_tol=0.001,
             n_components=10, n_jobs=1, n_topics=None, perp_tol=0.1,
             random_state=123, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)
Log likelihood:  -309783.1263153893
Perplexity: 605.0883539849148


In [34]:
lda_output = lda.transform(tf)
lda_output

array([[0.01000134, 0.77719539, 0.14279435, ..., 0.01000183, 0.01000006,
        0.01000004],
       [0.1       , 0.1       , 0.1       , ..., 0.1       , 0.1       ,
        0.1       ],
       [0.00200061, 0.8994866 , 0.0020002 , ..., 0.0845114 , 0.00200027,
        0.0020001 ],
       ...,
       [0.00769248, 0.00769413, 0.65085501, ..., 0.00769248, 0.00769231,
        0.00769232],
       [0.03333334, 0.69998244, 0.03333919, ..., 0.0333334 , 0.03333336,
        0.0333334 ],
       [0.05000001, 0.05000026, 0.54999157, ..., 0.05000001, 0.05000001,
        0.05000001]])

In [35]:
topicnames = ["Topic " + str(i) for i in range(lda.n_components)]
docnames = ["Doc " + str(i) for i in range(len(reviews))]
print(len(topicnames))
print(len(docnames))

10
4499


In [36]:
# Document-topic table: one row per document, one column per topic.
# Values are rounded to 2 decimals here, so the dominant-topic argmax computed
# from this frame operates on the rounded numbers, not on raw lda_output.
df_document_topic = pd.DataFrame(np.round(lda_output,2), columns=topicnames, index=docnames)

In [37]:
# Take the argmax over the topic-probability columns only. Selecting
# `topicnames` keeps this cell idempotent: on a re-run the previously added
# 'dominant_topic' column would otherwise take part in the argmax itself.
df_document_topic['dominant_topic'] = np.argmax(df_document_topic[topicnames].values, axis=1)

### Dominant Topics in top 15 documents

In [38]:
# Styling
def color_green(val):
    """Return a CSS colour rule: green when ``val`` exceeds 0.1, black otherwise."""
    chosen = 'black'
    if val > .1:
        chosen = 'green'
    return 'color: {col}'.format(col=chosen)

def make_bold(val):
    """Return a CSS font-weight rule: 700 when ``val`` exceeds 0.1, 400 otherwise."""
    if val > .1:
        weight = 700
    else:
        weight = 400
    return 'font-weight: {weight}'.format(weight=weight)

# Apply Style
# Restrict the styling to the topic-probability columns. Without `subset` the
# integer 'dominant_topic' column is styled as well, which highlights every
# topic index except 0 because those values are all > .1.
df_document_topics = (
    df_document_topic.head(15)
    .style
    .applymap(color_green, subset=topicnames)
    .applymap(make_bold, subset=topicnames)
)
df_document_topics

Unnamed: 0,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,dominant_topic
Doc 0,0.01,0.78,0.14,0.01,0.01,0.01,0.01,0.01,0.01,0.01,1
Doc 1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0
Doc 2,0.0,0.9,0.0,0.0,0.0,0.0,0.0,0.08,0.0,0.0,1
Doc 3,0.05,0.05,0.55,0.05,0.05,0.05,0.05,0.05,0.05,0.05,2
Doc 4,0.03,0.03,0.03,0.03,0.03,0.7,0.03,0.03,0.03,0.03,5
Doc 5,0.02,0.02,0.02,0.02,0.02,0.82,0.02,0.02,0.02,0.02,5
Doc 6,0.03,0.77,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,1
Doc 7,0.01,0.44,0.01,0.01,0.01,0.52,0.01,0.01,0.01,0.01,5
Doc 8,0.03,0.03,0.03,0.03,0.03,0.7,0.03,0.03,0.03,0.03,5
Doc 9,0.01,0.01,0.01,0.01,0.01,0.87,0.01,0.01,0.01,0.01,5


### Visualization for the topic modelling

In [39]:
# Build the interactive pyLDAvis panel from the fitted model, the document-term
# matrix and the vectorizer; mds='tsne' selects t-SNE for the 2-D topic layout.
# NOTE(review): newer pyLDAvis releases appear to expose this module as
# `pyLDAvis.lda_model` instead of `pyLDAvis.sklearn` - confirm against the
# installed version.
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer, mds='tsne')
panel

In [64]:
pyLDAvis.save_html(panel, 'Topic_Viz_Reviews.html')

In [40]:
# Number of documents per dominant topic, most frequent topic first.
df_topic_distribution = (
    df_document_topic['dominant_topic']
    .value_counts()
    .rename_axis('Topic Num')
    .reset_index(name='Num Documents')
)
df_topic_distribution

Unnamed: 0,Topic Num,Num Documents
0,1,2424
1,5,1158
2,2,294
3,0,173
4,7,165
5,3,113
6,6,73
7,4,63
8,9,19
9,8,17


### Topic Keyword matrix

In [61]:
# Topic-Keyword Matrix: one row per topic, one column per vocabulary term,
# values are the fitted lda.components_ weights.
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() and fall back on older installations.
df_topic_keywords = pd.DataFrame(
    lda.components_,
    index=topicnames,
    columns=(
        tf_vectorizer.get_feature_names_out()
        if hasattr(tf_vectorizer, "get_feature_names_out")
        else tf_vectorizer.get_feature_names()
    ),
)

# View
df_topic_keywords

Unnamed: 0,10,abandoned,able,absolute,absolutely,account,act,acting,action,actual,...,wrote,wtf,xd,xx,ya,yay,yeah,year,yes,young
Topic 0,0.167609,0.219702,0.171191,0.190754,0.184304,0.172416,0.170378,4.213048,0.172295,0.172456,...,0.16409,0.177581,0.190119,0.181601,0.197556,0.252572,0.172685,0.162563,0.177315,1.963681
Topic 1,15.073117,14.289338,36.731738,10.627052,51.234938,11.010247,8.179865,7.873979,21.321601,20.347844,...,21.78569,0.203538,16.072571,13.407809,4.515031,22.600202,28.580463,123.118688,21.090646,3.986388
Topic 2,1.322842,0.236382,0.280247,0.255397,0.18613,0.186556,2.789865,0.634275,0.262915,0.186604,...,0.228694,0.182958,15.836726,0.533594,0.455302,0.602918,3.412523,0.356524,48.864631,0.534855
Topic 3,0.177701,0.175751,0.184613,0.414845,0.327281,0.178984,0.196025,0.191027,0.184215,1.137761,...,0.234035,0.179932,26.629287,0.464477,11.034679,0.169098,3.031586,0.156462,0.401592,0.175262
Topic 4,0.173916,0.173647,0.173958,0.28641,0.169228,0.27901,0.189313,1.008155,0.184681,0.233507,...,1.412481,11.664406,0.986032,0.238237,0.326461,0.173073,0.22723,0.19102,0.235375,0.49932
Topic 5,0.31059,0.174173,0.256152,0.638304,29.367372,1.742764,0.249351,0.286447,3.029551,0.349827,...,3.881998,0.30214,74.4515,5.791795,12.098589,42.399545,17.10342,0.323919,7.019961,0.420834
Topic 6,0.177273,0.176239,0.193159,0.43594,9.6741,0.173538,0.192373,0.209495,0.187013,0.263882,...,0.171314,0.178909,0.178679,0.17083,0.175129,0.197359,0.239045,0.175282,0.305796,0.218415
Topic 7,2.742963,4.721133,4.509284,0.208967,0.496881,0.170985,2.807852,0.439089,0.193264,0.174258,...,0.321478,0.174207,0.210551,0.165542,0.223747,0.194969,0.468104,6.1333,2.827492,6.90068
Topic 8,0.191071,0.188292,0.177659,0.159206,0.175084,0.169165,0.175606,0.174488,0.183111,0.175912,...,0.171224,0.178612,0.202767,0.174692,0.206307,0.199518,0.183599,0.174287,0.174687,0.170391
Topic 9,0.17603,0.187967,0.170367,0.192285,0.172795,0.183681,0.189897,0.169969,0.178964,0.163829,...,0.177457,0.176113,0.191495,0.317775,0.22597,0.18105,0.175981,0.287929,0.190626,0.261406


In [62]:
# Show top n keywords for each topic
def show_topics(vectorizer, lda_model, n_words=20):
    """Return the ``n_words`` highest-weighted terms of every topic.

    Parameters
    ----------
    vectorizer
        Fitted vectorizer. Its vocabulary is read through
        ``get_feature_names_out()`` when that method exists, otherwise through
        the legacy ``get_feature_names()``.
    lda_model
        Fitted model exposing ``components_``; each row holds the term weights
        of one topic, aligned with the vectorizer's vocabulary order.
    n_words : int, default 20
        Number of terms to keep per topic.

    Returns
    -------
    list of numpy.ndarray
        One array of terms per topic, ordered from highest to lowest weight.
    """
    # scikit-learn 1.2 removed get_feature_names(); keep the old call only as a
    # fallback so the function works on both old and new installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        vocabulary = vectorizer.get_feature_names_out()
    else:
        vocabulary = vectorizer.get_feature_names()
    keywords = np.asarray(vocabulary)
    # Negating the weights makes the ascending argsort rank strongest first.
    return [
        keywords.take((-topic_weights).argsort()[:n_words])
        for topic_weights in lda_model.components_
    ]

# Top 15 keywords per topic, strongest first.
topic_keywords = show_topics(vectorizer=tf_vectorizer, lda_model=lda, n_words=15)

# Topic - Keywords Dataframe
# (this rebinds df_topic_keywords, replacing the weight matrix built earlier)
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = [
    'Word ' + str(pos) for pos, _ in enumerate(df_topic_keywords.columns)
]
df_topic_keywords.index = [
    'Topic ' + str(pos) for pos, _ in enumerate(df_topic_keywords.index)
]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,celestia,twilight,oo,woman,suck,van,lot,british,sex,date,near,talk,guy,princess,acting
Topic 1,story,really,im,good,like,chapter,read,update,know,love,time,think,thing,great,reading
Topic 2,nice,rainbow,like,fluttershy,cute,twilight,yes,luna,pony,rarity,relationship,dash,good,scootaloo,chapter
Topic 3,que,la,por,shit,el,en,xd,se,su,lo,fuck,bueno,pero,para,los
Topic 4,spike,sweetie,angel,mad,little,moar,run,wtf,shining,dragon,razorfang,ship,twilight,bitch,scootaloo
Topic 5,love,story,chapter,doctor,great,oh,rose,wait,like,loved,think,god,sweet,amazing,awesome
Topic 6,dusk,aw,cadence,ray,coming,doctor,exciting,absolutely,wedding,cliffhanger,truly,clara,come,amazing,gone
Topic 7,pony,magic,harry,later,spell,awesome,bad,right,die,got,mark,dark,want,death,cutie
Topic 8,discord,element,surprise,cool,place,harmony,fall,ah,queen,whats,control,old,magical,married,order
Topic 9,hogwarts,scary,fly,sir,au,hermione,party,battle,filly,kid,ron,stuck,fics,sharing,trying
