In [None]:
import pandas as pd
import gensim
from gensim.models import CoherenceModel

import matplotlib.pyplot as plt
from pprint import pprint

import pyLDAvis.gensim
import pickle 
import pyLDAvis
import os
import numpy as np
import tqdm

In [None]:
# Load the cleaned restaurant reviews; the CSV's first column is used as the index.
data = pd.read_csv('scrape/restaurant-data/cleaned_restaurant_reviews.csv', index_col=0)

In [None]:
# Swap the index read from the CSV for a fresh default one; drop=True discards the old index.
data = data.reset_index(drop=True)

In [None]:
# Display the loaded review table.
data

In [None]:
# Collapse the reviews to one row per restaurant URL by joining every
# cleaned review text of that URL with single spaces.
restaurant_review_df = (
    data[['url', 'cleaned_text']]
    .groupby(['url'], as_index=False)
    .agg({'cleaned_text': ' '.join})
)

In [None]:
# Display the per-URL aggregated review text.
restaurant_review_df

## LDA

In [None]:
# create dictionary
# One document per review row.
# pd.read_csv parses empty fields as NaN, so a review whose cleaned text is
# empty would arrive as a float and make `.split()` raise. Treat such rows as
# empty documents so the corpus stays aligned row-for-row with `data`.
docs = data['cleaned_text'].fillna('') ###
# Whitespace-tokenise each document.
processed_docs = [d.split() for d in docs]
# Map every distinct token to an integer id.
dictionary = gensim.corpora.Dictionary(processed_docs)

In [None]:
# term document frequency
# Bag-of-words form of every tokenised document, as produced by dictionary.doc2bow.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

In [None]:
# Baseline LDA model: 10 topics with fixed priors.
lda_model = gensim.models.LdaMulticore(
    corpus=bow_corpus,
    id2word=dictionary,
    num_topics=10,
    alpha=0.1,         # document-topic density: higher alpha, documents composed of more topics
    eta=0.01,          # topic-word density (a.k.a. beta): higher value, topics composed of more words
    chunksize=100,     # number of documents to consider at once
    passes=10,         # number of times to go through the entire corpus
    random_state=100,  # fixed seed
)

In [None]:
# Print the keywords of the topics in the 10-topic model
pprint(lda_model.print_topics())
# Apply the fitted model to the bag-of-words corpus and keep the result as doc_lda.
doc_lda = lda_model[bow_corpus]

In [None]:
# Score the baseline model with the 'c_v' coherence measure.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=processed_docs,
    dictionary=dictionary,
    coherence='c_v',
)
lda_score = coherence_model_lda.get_coherence()
lda_score

In [None]:
def compute_coherence_values(k, a=0.1, b=0.01):
    """Train an LDA model with ``k`` topics and return its 'c_v' coherence.

    Reads the notebook-level ``bow_corpus``, ``dictionary`` and
    ``processed_docs``; those cells must have been run first.

    Parameters
    ----------
    k : int
        Number of topics to fit.
    a : float or str, optional
        Value forwarded to gensim as ``alpha`` (document-topic density).
        Defaults to 0.1, the value previously fixed inside this function.
    b : float or str, optional
        Value forwarded to gensim as ``eta`` (topic-word density).
        Defaults to 0.01, the value previously fixed inside this function.

    Returns
    -------
    The value returned by ``CoherenceModel.get_coherence()`` for the fitted model.
    """
    model = gensim.models.LdaMulticore(
        corpus=bow_corpus,
        id2word=dictionary,
        num_topics=k,
        alpha=a,           # document-topic density: higher alpha, documents composed of more topics
        eta=b,             # topic-word density: higher value, topics composed of more words
        chunksize=100,     # number of documents to consider at once
        passes=10,         # number of times to go through the entire corpus
        random_state=100,  # fixed seed so runs with the same arguments are comparable
    )

    coherence_model = CoherenceModel(
        model=model,
        texts=processed_docs,
        dictionary=dictionary,
        coherence='c_v',
    )

    return coherence_model.get_coherence()

In [None]:
# grid = {}
# grid['Validation_Set'] = {}

# # Topics range
# min_topics = 2
# max_topics = 11
# step_size = 1
# topics_range = range(min_topics, max_topics, step_size)

# # Alpha parameter
# alpha = list(np.arange(0.01, 1, 0.3))
# alpha.append('symmetric')
# alpha.append('asymmetric')

# # Beta parameter
# beta = list(np.arange(0.01, 1, 0.3))
# beta.append('symmetric')


# model_results = {
#                  'Topics': [],
#                  'Alpha': [],
#                  'Beta': [],
#                  'Coherence': []
#                 }

# # Can take a long time to run
# if 1 == 1:
#     pbar = tqdm.tqdm(total=(len(beta)*len(alpha)*len(topics_range)))
    
#     # iterate through number of topics
#     for k in topics_range:
#         # iterate through alpha values
#         for a in alpha:
#             # iterate through beta values
#             for b in beta:
#                 # get the coherence score for the given parameters
#                 cv = compute_coherence_values(k=k, a=a, b=b)
#                 # Save the model results
#                 model_results['Topics'].append(k)
#                 model_results['Alpha'].append(a)
#                 model_results['Beta'].append(b)
#                 model_results['Coherence'].append(cv)
                
#                 pbar.update(1)
#     pd.DataFrame(model_results).to_csv('restaurant_lda_tuning_results.csv', index=False)
#     pbar.close()

In [None]:
# Fit one model per topic count (2 through 10) and collect its coherence score.
topics_range = range(2, 11)
coherence_values = []

for k in topics_range:
    value = compute_coherence_values(k)
    # Topic count on one line, its coherence on the next.
    print(k, value, sep="\n")
    coherence_values.append(value)

In [None]:
# Show graph: coherence score as a function of the number of topics.
# The line is labelled through plot(label=...): a bare parenthesised string
# handed to legend() is not a tuple and gets consumed one character at a time,
# which would label the line just "c".
plt.plot(topics_range, coherence_values, label="coherence_values")
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(loc='best')
plt.show()

# Choose num_topics = 6 instead, since the coherence score is stable between 6 and 7 topics.

In [None]:
# Topic count selected from the coherence plot.
num_topics = 6 ### CHANGE THIS

# Refit the LDA model with the selected topic count; all other settings
# match the earlier models.
lda_model = gensim.models.LdaMulticore(
    corpus=bow_corpus,
    id2word=dictionary,
    num_topics=num_topics,
    alpha=0.1,         # document-topic density: higher alpha, documents composed of more topics
    eta=0.01,          # topic-word density (a.k.a. beta): higher value, topics composed of more words
    chunksize=100,     # number of documents to consider at once
    passes=10,         # number of times to go through the entire corpus
    random_state=100,  # fixed seed
)

# Show the keywords of each topic, then apply the model to the corpus.
pprint(lda_model.print_topics())
doc_lda = lda_model[bow_corpus]

In [None]:
# Render the fitted model as an interactive pyLDAvis panel.
# NOTE(review): newer pyLDAvis releases appear to expose this module as
# pyLDAvis.gensim_models - confirm against the installed version.
pyLDAvis.enable_notebook()
LDAvis_data_filepath = 'ldavis_prepared_{}_'.format(num_topics)

# Preparing the visualisation data is a bit time consuming. Change this
# condition to False to skip it and reuse the pickle from an earlier run.
if True:
    LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

# Load the prepared data back from disk. Unpickling runs arbitrary code, so
# only ever point this at a file this notebook wrote itself.
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)

# Also export a standalone HTML copy, then display the panel inline.
pyLDAvis.save_html(LDAvis_prepared, 'ldavis_prepared_{}.html'.format(num_topics))
LDAvis_prepared

### https://we1s.ucsb.edu/research/we1s-tools-and-software/topic-model-observatory/tmo-guide/tmo-guide-pyldavis/
# A “relevance metric” slider scale at the top of the right panel controls how the words for a topic are sorted.
# lambda 1: sorts words by their frequency in the topic (red bars)
# lambda 0: sorts words by their "lift". Words whose red bars are nearly as long as their blue bars will be at the top.
# Lift means how much a word's frequency sticks out in a topic above the baseline of its overall frequency in the model.