# Topic Modeling of Reviews for Cellphone and Accessories category on Amazon 

In [None]:
import numpy as np
from sklearn.utils import shuffle
np.set_printoptions(precision=2, linewidth=80)
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint
import gensim
import gensim.corpora as corpora
import spacy
import pyLDAvis.gensim  # don't skip this
import numpy as np
import pandas as pd
import pickle

In [None]:
# Suppress every warning for the remainder of the session.
warnings.filterwarnings("ignore")

# NOTE(review): the 'en_vecs' model name and the parse/tag/entity keywords
# look like an older spaCy API - confirm against the installed version.
nlp = spacy.load('en_vecs', parse=False, tag=False, entity=False)

# Print numpy arrays with two decimals, wrapped at 80 columns.
np.set_printoptions(precision=2, linewidth=80)

# Pickle file holding the already-processed reviews.
PROCESSED_FILENAME = './data/amazon_reviews_processed.pickle'


# Load normalized data from processed file


In [None]:
# Load the processed dataset from disk. The context manager guarantees the
# file handle is closed even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only
# load pickles produced by a trusted source.
with open(PROCESSED_FILENAME, "rb") as f:
    dfdb = pickle.load(f)

# Optional (disabled): filter out rows that have fewer than 20 word tokens.
#dfdb = dfdb[dfdb['Clean_Review_Tokens'].apply(lambda x: len(x) >= 20)]


# Prune data for development if needed


In [None]:
# Maximum number of rows to keep per rating bucket for local/development
# runs. A value of 0 (or less) keeps every row of each bucket.
trial = 0

# Boolean row masks, one per bucket of the 'overall' rating column.
five = dfdb['overall'] == 5.0
four = (dfdb['overall'] >= 4.0) & (dfdb['overall'] < 5.0)
three = dfdb['overall'] == 3.0
two = dfdb['overall'] == 2.0
one = dfdb['overall'] == 1.0
zero = dfdb['overall'] == 0.0

# NOTE(review): `three` is defined but has never been part of the working
# subset - presumably neutral reviews are excluded on purpose; confirm.
# DataFrame.append was removed in pandas 2.0, so the buckets are stacked with
# pd.concat instead. Slicing with `.iloc[:None]` keeps the whole bucket.
row_limit = trial if trial > 0 else None
df = pd.concat(
    [dfdb[mask].iloc[:row_limit] for mask in (five, four, two, one, zero)]
)

# Randomize the row order of the dataset.
df = shuffle(df)

# Sample of the loaded processed data (note the Clean_Review column)


In [None]:
# Report the size of the working subset, then display its first 20 rows.
print('Total Rows on processed dataset: ' + str(df.shape[0]))
print('Sample of processed dataset. Notice the column named Clean_Review')
df.head(20)



# Extract reviews, tokens and sentiment labels; split tokens by sentiment


In [None]:
# Pull the columns used downstream out of the DataFrame as numpy arrays.
reviews = np.array(df['Clean_Review'])
sentiments = np.array(df['sentiment'])
reviews_tokens = np.array(df['Clean_Review_Tokens'])

# Token lists split by sentiment label (1 selects the positive set,
# 0 the negative set).
positive_reviews_tokens = np.array(
    df.loc[df['sentiment'] == 1, 'Clean_Review_Tokens']
)
negative_reviews_tokens = np.array(
    df.loc[df['sentiment'] == 0, 'Clean_Review_Tokens']
)


# Extract features from positive and negative reviews


In [None]:
# Pair every review with its sentiment label (not consumed later in this cell).
zipped = zip(reviews, sentiments)

# Unigram TF-IDF features fitted on the positive reviews only (label == 1).
positive_reviews = [
    text for text, label in zip(reviews, sentiments) if label == 1
]
ptvf = TfidfVectorizer(ngram_range=(1, 1))
ptvf_features = ptvf.fit_transform(positive_reviews)

# Unigram TF-IDF features fitted on the negative reviews only (label == 0).
negative_reviews = [
    text for text, label in zip(reviews, sentiments) if label == 0
]
ntvf = TfidfVectorizer(ngram_range=(1, 1))
ntvf_features = ntvf.fit_transform(negative_reviews)

# Uncomment to inspect the dimensions of both feature sets.
#print(ptvf_features.shape, ntvf_features.shape)


In [None]:
def test(tokens, **lda_kwargs):
    """Fit a gensim LDA topic model on tokenized documents.

    Args:
        tokens: Collection of documents, each a sequence of string tokens.
            It is iterated twice (once to build the dictionary, once to
            build the bag-of-words corpus), so it must be re-iterable
            rather than a one-shot generator.
        **lda_kwargs: Optional keyword arguments forwarded unchanged to
            ``LdaMulticore`` (for example ``num_topics``, ``passes`` or
            ``random_state``). When omitted, the library defaults apply,
            which is the previous behaviour. ``corpus`` and ``id2word``
            are supplied by this function and must not be passed here.

    Returns:
        A tuple ``(lda_model, tdf, id2word)``: the fitted model, the
        bag-of-words corpus it was trained on, and the token dictionary.
    """
    # Map every distinct token to an integer id.
    id2word = corpora.Dictionary(tokens)

    # Bag-of-words representation of each document.
    tdf = [id2word.doc2bow(doc) for doc in tokens]

    # Build the LDA model on the bag-of-words corpus.
    lda_model = gensim.models.ldamulticore.LdaMulticore(
        corpus=tdf, id2word=id2word, **lda_kwargs
    )

    return lda_model, tdf, id2word


# Topic Modeling on all reviews


In [None]:
# Fit the topic model on the tokens of every review in the working subset.
lda_model, tdf, id2word = test(reviews_tokens)

# Show the keywords of the topics reported by print_topics(); it is called
# without arguments, so the library defaults decide how many are listed.
pprint(lda_model.print_topics())

# Apply the fitted model to the training corpus (not used further here).
doc_lda = lda_model[tdf]

# Build the interactive pyLDAvis view; leaving `vis` as the last expression
# displays it in the notebook.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, tdf, id2word, mds='mmds')
vis
    

# Topic modeling of positive reviews

In [None]:
# Fit the topic model on the tokens of the positive reviews only.
lda_model, tdf, id2word = test(positive_reviews_tokens)

# Show the keywords of the topics reported by print_topics(); it is called
# without arguments, so the library defaults decide how many are listed.
pprint(lda_model.print_topics())

# Apply the fitted model to the training corpus (not used further here).
doc_lda = lda_model[tdf]

# Build the interactive pyLDAvis view; leaving `vis` as the last expression
# displays it in the notebook.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, tdf, id2word, mds='mmds')
vis
        

# Topic modeling of negative reviews

In [None]:
# Fit the topic model on the tokens of the negative reviews only.
lda_model, tdf, id2word = test(negative_reviews_tokens)

# Show the keywords of the topics reported by print_topics(); it is called
# without arguments, so the library defaults decide how many are listed.
pprint(lda_model.print_topics())

# Apply the fitted model to the training corpus (not used further here).
doc_lda = lda_model[tdf]

# Build the interactive pyLDAvis view; leaving `vis` as the last expression
# displays it in the notebook.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, tdf, id2word, mds='mmds')
vis

In [None]:
# Compute Perplexity
#print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
#coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
#coherence_lda = coherence_model_lda.get_coherence()
#print('\nCoherence Score: ', coherence_lda)