In [None]:
### In this notebook, we will perform Topic Modelling on the collection of Restaurants' reviews

In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
#pip install pyLDAvis
import pyLDAvis.gensim
import pickle 
import pyLDAvis

In [2]:
#data_dir = "C:/Users/april.chow/Documents/tripreviews.txt"
data_dir = "C:/Users/Sam/Desktop/tripadvisor_reviews_collection.csv"

data_df = pd.read_csv(data_dir)
data_df = data_df.rename({"rest_id": "ID", "rank": "Rank", "review_id": "reviewid", "comment": "Text"}, axis=1)
text = data_df["Text"]

In [3]:
# 1st iteration to run with defaut stopword list
mystopwords=stopwords.words("english")
WNlemma = nltk.WordNetLemmatizer()

# 2nd iteration to remove topics found in 1st iteration
#mystopwords=stopwords.words("english") + ['dish','service', 'food']

# 3rd iteration to remove topics found in 2nd iteration
#mystopwords=stopwords.words("english") + ['dish','service', 'food', 
#                                         'great', 'good', 'staff', 'friendly', 'experience', 'excellent', 'attentive', 'ambience', 'amazing', 
#                                          'wonderful', 'served', 'dining', 'recommend',
#                                         'restaurant', 'table', 'order', 'like', 'even', 'n\'t', 'would', 'get', 'could', 'still', 'take', 
#                                          'better', 'make', 'asked', 'course', 'first', 'made', 'wait', 'since',
#                                         'taste', 'ordered', 'really', 'try', 'also', 'well', 'nice', 'delicious', 'meal', 'course', 'like', 'main']

def pre_process(text):
    tokens = nltk.word_tokenize(text)
    tokens=[ WNlemma.lemmatize(t.lower()) for t in tokens]
    tokens=[ t for t in tokens if t not in mystopwords]
    tokens = [ t for t in tokens if len(t) >= 3 ]
    return(tokens)

toks = text.apply(pre_process)

# Use dictionary (built from corpus) to prepare a DTM (using frequency)
import logging
#pip install gensim '3.8.1'
import gensim 
from gensim import corpora

#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


# Filter off any words with document frequency less than 2, or appearing in more than 90% documents
dictionary = corpora.Dictionary(toks)
print(dictionary)
dictionary.filter_extremes(no_below=2, no_above=0.8)
"""
        no_below : int, optional
            Keep tokens which are contained in at least `no_below` documents.
        no_above : float, optional
            Keep tokens which are contained in no more than `no_above` documents
            (fraction of total corpus size, not an absolute number).
"""
#print(dictionary)


#dtm here is a list of lists, which is exactly a matrix
# here we are only looking for word frequency. Hence we use this approach to create DTM. 
dtm = [dictionary.doc2bow(d) for d in toks]
###########################################################
lda = gensim.models.ldamodel.LdaModel(dtm, num_topics = 3, id2word = dictionary, passes=10,chunksize=128,random_state=10)
# guessing that there are 5 types of theres. Hence running 5 topics 

# 0.009*set + 0.008*agent........... There will be 5 sets of such equation
# n\'t can be removed
# based on the equation can come up with some labels
# equation 2 can be further split up...seems to have a seem topics n one. 

# this prints the word tokens
lda.show_topics(10)


# In[15]:


##Evaluate the coherence score of LDA models
'''
u_mass:prefer the model close to 0 
c_v: [0,1], prefer bigger value   ( DO not fully rely on coherence score. Might not be align with classification problem)
Do not fully rely on the coherence score
'''
from gensim.models.coherencemodel import CoherenceModel
cm_umass = CoherenceModel(lda,  dictionary=dictionary, corpus=dtm, coherence='u_mass')
cm_cv = CoherenceModel(lda,  dictionary=dictionary, texts=toks, coherence='c_v')
lda_umass = cm_umass.get_coherence()
lda_cv = cm_cv.get_coherence()

print(lda_umass)
print(lda_cv)


scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


Dictionary(23737 unique tokens: ['beautiful', 'breast', 'cancer', 'delicious', 'donate']...)
-2.0346451546841346
0.48622215274106734


In [4]:
# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim.prepare(lda, dtm, dictionary)
#pyLDAvis.show(LDAvis_prepared)
pyLDAvis.display(LDAvis_prepared)

In [5]:
# Get the topic distribution of documents
doc_topics = lda.get_document_topics(dtm)

from operator import itemgetter
#show the topic distributions for the first 5 docs, 
# for 1st doc, the dominating topic is topic 4. for the second doc, the dominating topic is topic 0. 
for i in range(0, 5):
    print(doc_topics[i])
    print(max(doc_topics[i], key=itemgetter(1))[0]) 
    

    

#Select the best topic (with highest score) for each document
top_topic = [ max(t, key=itemgetter(1))[0] for t in doc_topics ]
print (top_topic)
# for each document, we are selecting the best candidate. Each document, will have a topic. 

[(0, 0.16925003), (1, 0.8122758), (2, 0.018474188)]
1
[(0, 0.0134603055), (1, 0.9727361), (2, 0.013803621)]
1
[(0, 0.0119165275), (1, 0.72977823), (2, 0.25830525)]
1
[(0, 0.09204522), (1, 0.68146235), (2, 0.22649242)]
1
[(0, 0.022470966), (1, 0.66877997), (2, 0.30874908)]
1
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1