In [9]:
!pip install pyLDAvis

In [8]:
# packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import re
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy
import nltk
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
import pyLDAvis
import pyLDAvis.gensim_models
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()
pd.set_option("display.max_columns", None)
pd.options.display.max_colwidth = 50
import warnings; 
warnings.simplefilter('ignore')
warnings.filterwarnings("ignore", category=DeprecationWarning) 

READING FILES

In [None]:
review_df = pd.read_csv("")

print(review_df.info())

LOOKING AT THE DATA & SPLITTINT IT

In [None]:
review_df.head()

In [None]:
#negative review data
neg_data = review_df.loc[review_df['rating'] < 3]
neg_data.reset_index(drop=True, inplace=True)
print(neg_data.info())

In [None]:
#positive review data
pos_data = review_df.loc[review_df['rating'] > 3]
pos_data.reset_index(drop=True, inplace=True)
print(pos_data.info())

PREPROCESSING

In [12]:
#Tokenizing sentences into a list of words and removing uncessary characters
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  

neg_data['data_words'] = list(sent_to_words(neg_data['review-en']))

#Stop word removal and tokenizing 
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(["throw","application","question","ask","answer", "YKS", "TYT", "LGS", "AYT"])  #extending stop words

def process_words(texts, stop_words=stop_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
    texts_out = []
    nlp = spacy.load("en_core_web_sm",disable=['parser', 'ner'])
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    texts_out = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts_out]    
    return texts_out

neg_data['data_ready'] = process_words(neg_data.data_words)  

LDA

In [None]:
#Dict
id2word = corpora.Dictionary(neg_data.data_ready)
#Corpus
corpus = [id2word.doc2bow(text) for text in neg_data.data_ready]
#model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics= 8,  #8 topics                           
                                           random_state=100,
                                           update_every=1,
                                           chunksize=10,  
                                           passes=10,   
                                           alpha='symmetric',
                                           iterations=100,
                                           per_word_topics=True)

#print topics and keywords
print(lda_model.print_topics())

In [15]:
#Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

#Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=neg_data.data_ready, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -6.308499112298337

Coherence Score:  0.4633786311709297


TOPIC KEYWORDS

In [None]:
for idx, topic in lda_model.show_topics(formatted=False, num_words= 15):
    print('Topic: {} --> Words: {}'.format(idx, '/'.join([w[0] for w in topic])))

In [17]:
#Calculating the topic distribution

from matplotlib.ticker import FuncFormatter

#Finding the dominant topics
def topics_per_review(model, corpus, start=0, end=1):
    corpus_sel = corpus[start:end]
    dominant_topics = []
    topic_percentages = []
    for i, corp in enumerate(corpus_sel):
        topic_percs, wordid_topics, wordid_phivalues = model[corp]
        dominant_topic = sorted(topic_percs, key = lambda x: x[1], reverse=True)[0][0]
        dominant_topics.append((i, dominant_topic))
        topic_percentages.append(topic_percs)
    return(dominant_topics, topic_percentages)

dominant_topics, topic_percentages = topics_per_review(model=lda_model, corpus=corpus, end=-1) 

In [None]:
#distribution of topics in reviews
df = pd.DataFrame(dominant_topics, columns=['Document_Id', 'Dominant_Topic'])
dominant_topic_in_each_rev = df.groupby('Dominant_Topic').size()
df_dominant_topic_in_each_rev = dominant_topic_in_each_rev.to_frame(name='count').reset_index()
display(df_dominant_topic_in_each_rev)

In [None]:
# total distrubution 
topic_weightage_by_rev = pd.DataFrame([dict(t) for t in topic_percentages])
df_topic_weightage_by_rev = topic_weightage_by_rev.sum().to_frame(name='count').reset_index()

display(df_topic_weightage_by_rev)

In [None]:
# Plots 
from matplotlib.ticker import FuncFormatter

# Top  Keywords for topics
topic_top_n_words = [(i, topic) for i, topics in lda_model.show_topics(formatted=False) 
                                 for j, (topic, wt) in enumerate(topics) if j < 5]  # for 5 key words

df_top_n_words_stacked = pd.DataFrame(topic_top_n_words, columns=['topic_id', 'words'])
df_top_n_words = df_top_n_words_stacked.groupby('topic_id').agg(', \n'.join)
df_top_n_words.reset_index(level=0,inplace=True)

# Plot
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(6, 4), dpi=120, sharey=True)

# Topic Distribution by Dominant Topics
ax1.bar(x='Dominant_Topic', height='count', data=df_dominant_topic_in_each_rev, width=.5, color='#9ECBEA')
ax1.set_xticks(range(df_dominant_topic_in_each_rev.Dominant_Topic.unique().__len__()))
tick_formatter = FuncFormatter(lambda x, pos: 'Topic ' + str(x)+ '\n' + df_top_n_words.loc[df_top_n_words.topic_id==x, 'words'].values[0])
ax1.tick_params(labelsize=4)
ax1.set_title('Number of Negative Reviews by Dominant Topic - APPNAME GooglePlay', fontdict=dict(size=8))
ax1.set_ylabel('Number of Reviews', fontsize = 6)
ax1.set_ylim(0, 100)


# Topic Distribution by Topic Weights
ax2.bar(x='index', height='count', data=df_topic_weightage_by_rev, width=.5, color='#EADA9E')
ax2.set_xticks(range(df_topic_weightage_by_rev.index.unique().__len__()))
ax2.xaxis.set_major_formatter(tick_formatter)
ax2.tick_params(labelsize=4)
ax2.set_title('Number of Negative Reviews  by Topic Weightage - APPNAME GooglePlay', fontdict=dict(size=8))
ax2.set_ylabel('Number of Review', fontsize = 6)

plt.show()