# Topic Modeling of Reviews for the Cell Phones and Accessories Category on Amazon

In [None]:
import numpy as np
from gensim.models import CoherenceModel
np.set_printoptions(precision=2, linewidth=80)
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint
import gensim
import gensim.corpora as corpora
import spacy
import pyLDAvis.gensim  # don't skip this
import numpy as np
import utils
import pandas as pd
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors
from collections import Counter

In [None]:
# Notebook-wide setup: suppress warnings before the heavier libraries are used.
warnings.filterwarnings("ignore")

# Load the spaCy pipeline registered as 'en_vecs'; the keyword flags are passed
# straight through to spacy.load (presumably to skip parsing, tagging and
# entity recognition -- confirm against the installed spaCy version).
nlp = spacy.load(
    'en_vecs',
    parse=False,
    tag=False,
    entity=False,
)

# Print NumPy arrays with two decimals and an 80-character line width.
np.set_printoptions(linewidth=80, precision=2)

# Switch for the topic-count search cell: it only runs when this is True.
runOptimizeCounts = False

# Load normalized data from processed file


In [None]:
# Read the processed review data back from disk. The names on the right are
# the keys handed to utils.readFromDisk, listed in the same order as the
# variables they are unpacked into.
(
    reviews,
    reviews_tokens,
    positive_reviews,
    negative_reviews,
    positive_reviews_tokens,
    negative_reviews_tokens,
) = [
    utils.readFromDisk(dataset_name)
    for dataset_name in (
        'reviews',
        'reviews_tokens',
        'reviews_positive',
        'reviews_negative',
        'reviews_positive_tokens',
        'reviews_negative_tokens',
    )
]

# Global set up

In [None]:
# For each review set, build a gensim Dictionary from its token lists and then
# convert every token list to a bag-of-words vector with that same dictionary.

# All reviews
id2word_all = corpora.Dictionary(reviews_tokens)
tdf_all = list(map(id2word_all.doc2bow, reviews_tokens))

# Positive reviews
id2word_positive = corpora.Dictionary(positive_reviews_tokens)
tdf_positive = list(map(id2word_positive.doc2bow, positive_reviews_tokens))

# Negative reviews
id2word_negative = corpora.Dictionary(negative_reviews_tokens)
tdf_negative = list(map(id2word_negative.doc2bow, negative_reviews_tokens))


In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """Train one LDA model per candidate topic count and score each with c_v.

    Candidate topic counts are ``range(start, limit, step)``.

    Args:
        dictionary: gensim dictionary passed as ``id2word`` to the model and
            as ``dictionary`` to the coherence model.
        corpus: Bag-of-words corpus the models are trained on.
        texts: Tokenised documents handed to the coherence model.
        limit: Exclusive upper bound of the topic-count range.
        start: First topic count to try.
        step: Increment between successive topic counts.

    Returns:
        A ``(model_list, coherence_values)`` tuple of two lists aligned by
        position: the trained models and their coherence scores.
    """
    fitted_models = []
    scores = []

    for topic_count in range(start, limit, step):
        lda = gensim.models.LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=topic_count,
        )
        fitted_models.append(lda)

        scorer = CoherenceModel(
            model=lda,
            texts=texts,
            dictionary=dictionary,
            coherence='c_v',
        )
        scores.append(scorer.get_coherence())

    return fitted_models, scores


In [None]:
def runmodel(dictionary, corpus, texts, num_topics=16, workers=16, random_state=100):
    """Train an ``LdaMulticore`` topic model and print its diagnostics.

    Prints the value of ``log_perplexity`` on the training corpus, the c_v
    coherence score and the keywords of every topic.

    Args:
        dictionary: gensim dictionary, used as ``id2word`` for the model and
            as ``dictionary`` for the coherence model.
        corpus: Bag-of-words corpus to train on. It must have been built with
            ``dictionary``.
        texts: Tokenised documents handed to the coherence model.
        num_topics: Number of topics to fit (default 16, the value that was
            previously hard-coded).
        workers: Number of worker processes for ``LdaMulticore``.
        random_state: Seed passed to the model for reproducible runs.

    Returns:
        The trained ``LdaMulticore`` model.
    """
    topicmodel = gensim.models.ldamulticore.LdaMulticore(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        workers=workers,
        random_state=random_state,
    )

    # NOTE: per the gensim docs, log_perplexity returns the per-word
    # likelihood *bound*, not the perplexity itself (perplexity is
    # 2 ** -bound), so a higher (less negative) value indicates a better fit.
    print('\nPerplexity: ', topicmodel.log_perplexity(corpus))

    # Compute Coherence Score
    coherence_model = CoherenceModel(
        model=topicmodel,
        texts=texts,
        dictionary=dictionary,
        coherence='c_v',
    )
    coherence = coherence_model.get_coherence()
    print('\nCoherence Score: ', coherence)

    # Print the keywords of every topic. print_topics() lists at most 20
    # topics by default, so pass the topic count explicitly to keep the
    # listing complete for larger models.
    pprint(topicmodel.print_topics(num_topics=num_topics))

    return topicmodel

    


In [None]:
def wordcloud(topicmodel):
    """Draw a 2x2 grid of word clouds, one per topic, and show the figure.

    The panels display, in order, the first entries returned by
    ``topicmodel.show_topics(formatted=False)``; each panel is titled with its
    position in that list and drawn in its own Tableau colour. If the model
    yields fewer topics than there are panels, the spare panels are left
    blank instead of raising ``IndexError``.

    Args:
        topicmodel: Model whose ``show_topics(formatted=False)`` returns
            entries whose second element is a sequence of
            ``(word, weight)`` pairs.
    """
    palette = list(mcolors.TABLEAU_COLORS.values())  # more colors: 'mcolors.XKCD_COLORS'
    topics = topicmodel.show_topics(formatted=False)

    _, axes = plt.subplots(2, 2, figsize=(10, 10), sharex=True, sharey=True)

    for i, ax in enumerate(axes.flatten()):
        ax.axis('off')
        if i >= len(topics):
            # Fewer topics than panels: leave this panel empty.
            continue

        # Bind the colour as a default argument so the callback does not
        # depend on the loop variable's value at call time.
        panel_color = palette[i % len(palette)]
        cloud = WordCloud(stopwords=utils.final_stop_words,
                          background_color='white',
                          width=2500,
                          height=1800,
                          max_words=10,
                          colormap='tab10',
                          color_func=lambda *args, _color=panel_color, **kwargs: _color,
                          prefer_horizontal=1.0)

        topic_words = dict(topics[i][1])
        cloud.generate_from_frequencies(topic_words, max_font_size=300)
        ax.imshow(cloud)
        ax.set_title('Topic ' + str(i), fontdict=dict(size=16))

    plt.subplots_adjust(wspace=0, hspace=0)
    plt.margins(x=0, y=0)
    plt.tight_layout()
    plt.show()


# Finding the optimal number of topics

In [None]:
# Find the optimal number of topics: train one model per candidate topic count
# and plot the coherence score of each. Only runs when runOptimizeCounts is set.
if runOptimizeCounts:
    # Define the search range once so the models and the x-axis always agree.
    start, limit, step = 1, 50, 5

    model_list, coherence_values = compute_coherence_values(
        dictionary=id2word_all,
        corpus=tdf_all,
        texts=reviews_tokens,
        start=start,
        limit=limit,
        step=step,
    )
    print(model_list)

    # Show graph
    x = range(start, limit, step)
    plt.plot(x, coherence_values)
    plt.xlabel("Num Topics")
    plt.ylabel("Coherence score")
    # The labels must be a sequence: a bare string would be iterated character
    # by character and the line would be labelled "c".
    plt.legend(("coherence_values",), loc='best')
    plt.show()

    # Print the coherence scores
    for m, cv in zip(x, coherence_values):
        print("Num Topics =", m, " has Coherence Value of", round(cv, 4))
    

# Topic Modeling on all reviews

In [None]:

    # Train and report the topic model for the full review set.
    topicmodel = runmodel(
        dictionary=id2word_all,
        corpus=tdf_all,
        texts=reviews_tokens,
    )


In [None]:
    # Render word clouds for the model trained on all reviews.
    wordcloud(topicmodel=topicmodel)

In [None]:
    # Interactive pyLDAvis view of the all-reviews model: keywords per topic.
    pyLDAvis.enable_notebook()
    vis = pyLDAvis.gensim.prepare(
        topicmodel,
        corpus=tdf_all,
        dictionary=id2word_all,
        mds='mmds',
    )

    # Leave the prepared object as the cell's last expression so it is displayed.
    vis


# Topic Modeling on positive reviews

In [None]:

    # Train and report the topic model for the positive reviews only.
    topicmodel = runmodel(
        dictionary=id2word_positive,
        corpus=tdf_positive,
        texts=positive_reviews_tokens,
    )


In [None]:
    # Render word clouds for the model trained on the positive reviews.
    wordcloud(topicmodel=topicmodel)

In [None]:
    # Interactive pyLDAvis view of the positive-reviews model: keywords per topic.
    pyLDAvis.enable_notebook()
    vis = pyLDAvis.gensim.prepare(
        topicmodel,
        corpus=tdf_positive,
        dictionary=id2word_positive,
        mds='mmds',
    )

    # Leave the prepared object as the cell's last expression so it is displayed.
    vis


# Topic Modeling on negative reviews

In [None]:

    # Train and report the topic model for the negative reviews only.
    # The corpus must be the one built with id2word_negative: token ids from
    # the positive corpus do not correspond to entries in this dictionary.
    topicmodel = runmodel(id2word_negative, tdf_negative, negative_reviews_tokens)


In [None]:
    # Render word clouds for the model trained on the negative reviews.
    wordcloud(topicmodel=topicmodel)

In [None]:
    # Interactive pyLDAvis view of the negative-reviews model: keywords per topic.
    pyLDAvis.enable_notebook()
    vis = pyLDAvis.gensim.prepare(
        topicmodel,
        corpus=tdf_negative,
        dictionary=id2word_negative,
        mds='mmds',
    )

    # Leave the prepared object as the cell's last expression so it is displayed.
    vis
