In [14]:
#import nltk; nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Seema\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [1]:
import pandas as pd
import numpy as np
import re
from pprint import pprint
import pickle

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS as SW
from gensim.models import LdaMulticore
from wordcloud import STOPWORDS
stopwords = set(STOPWORDS)
from nltk.corpus import stopwords as nltk_SW

# spacy for lemmatization
import spacy
from spacy.tokenizer import Tokenizer

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim 
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [2]:
% run config.ipynb

In [3]:
% run data_preprocessing.ipynb

In [15]:
% run nlp_preprocessing.ipynb

In [31]:
% run topic_model.ipynb

In [5]:
# Load processed .pkl file
df = load_processed_data()

After removing outliers
Total Rows: 295390 , Total Columns: 6 , Total Memory Usage (Bytes): 16541840


In [6]:
df.head()

Unnamed: 0,business_id,review_id,stars,date,text,text_len
4,PfOCPjBrlQAnz__NXj9h_w,sa6lbcuzaMkjVegOT21t8Q,5,2017-05-29,Been going here for years and it's still my fa...,132
10,PfOCPjBrlQAnz__NXj9h_w,BRHnn4cqgGfMfZXOpcqZHQ,5,2016-01-13,everyone was so friendly and the drinks were g...,157
11,PfOCPjBrlQAnz__NXj9h_w,QyForzRygHhPsHY2b174mg,5,2016-02-24,Good beer and even better food. Staff is atten...,180
14,PfOCPjBrlQAnz__NXj9h_w,fQR2uXHOOs-hPvdRvwJ-OA,4,2017-08-05,Never been in the area and decided to stop wit...,177
18,PfOCPjBrlQAnz__NXj9h_w,Nb-hzSY3AbSvHVhEHP58Qw,3,2016-11-06,"Good service, nice atmosphere, the filet is ho...",149


## NLP Processing

In [35]:
# Load pre trained model

#spacy.cli.download("en_core_web_lg")

✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_lg')


In [7]:
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [8]:
# Clean Text
df['clean_text'] = df['text'].apply(clean_text)

In [9]:
data_lemmatized = generate_lemma_text(df['clean_text'])

In [10]:
df['data_lemmatized'] = data_lemmatized

# Save lemma output
save_preprocessed_data(df, file_name_lemmatized)

## Checkpoint  for clean lemma tokens

In [11]:
df = load_pkl_file(file_name_lemmatized)

In [12]:
df.head()

Unnamed: 0,business_id,review_id,stars,date,text,text_len,clean_text,data_lemmatized
4,PfOCPjBrlQAnz__NXj9h_w,sa6lbcuzaMkjVegOT21t8Q,5,2017-05-29,Been going here for years and it's still my fa...,132,been going here for years and it's still my fa...,"[go, year, still, favorite, server, friendly, ..."
10,PfOCPjBrlQAnz__NXj9h_w,BRHnn4cqgGfMfZXOpcqZHQ,5,2016-01-13,everyone was so friendly and the drinks were g...,157,everyone was so friendly and the drinks were g...,"[friendly, drink, good, food, amazing, comfy, ..."
11,PfOCPjBrlQAnz__NXj9h_w,QyForzRygHhPsHY2b174mg,5,2016-02-24,Good beer and even better food. Staff is atten...,180,good beer and even better food staff is attent...,"[good, beer, even, well, food, staff, attentiv..."
14,PfOCPjBrlQAnz__NXj9h_w,fQR2uXHOOs-hPvdRvwJ-OA,4,2017-08-05,Never been in the area and decided to stop wit...,177,never been in the area and decided to stop wit...,"[never, area, decide, stop, friend, cool, atmo..."
18,PfOCPjBrlQAnz__NXj9h_w,Nb-hzSY3AbSvHVhEHP58Qw,3,2016-11-06,"Good service, nice atmosphere, the filet is ho...",149,good service nice atmosphere the filet is horr...,"[good, service, nice, atmosphere, horrendous, ..."


## Vectorization

In [25]:
df = df[df['date']>='2017-07-01']
print(df.shape)

id2word, corpus = vectorization(data_lemmatized)

(73232, 8)


## LDA Model

In [32]:
lda_model, perplexity_lda, coherence_lda = generate_topic_model_lda(id2word, corpus, data_lemmatized)

[(0,
  '0.076*"love" + 0.051*"sushi" + 0.038*"favorite" + 0.034*"new" + '
  '0.033*"good" + 0.031*"roll" + 0.026*"spot" + 0.025*"work" + 0.014*"people" '
  '+ 0.013*"fresh"'),
 (1,
  '0.140*"always" + 0.051*"food" + 0.049*"great" + 0.045*"visit" + '
  '0.043*"time" + 0.040*"love" + 0.034*"service" + 0.028*"good" + 0.026*"eat" '
  '+ 0.020*"year"'),
 (2,
  '0.143*"time" + 0.059*"wait" + 0.052*"first" + 0.047*"back" + 0.042*"food" + '
  '0.037*"long" + 0.024*"service" + 0.021*"come" + 0.019*"good" + 0.017*"go"'),
 (3,
  '0.050*"never" + 0.036*"disappoint" + 0.028*"soup" + 0.022*"food" + '
  '0.021*"bad" + 0.021*"review" + 0.020*"delivery" + 0.020*"chinese" + '
  '0.020*"good" + 0.016*"order"'),
 (4,
  '0.191*"pizza" + 0.046*"close" + 0.041*"open" + 0.020*"twice" + 0.019*"good" '
  '+ 0.018*"late" + 0.016*"call" + 0.015*"pm" + 0.014*"say" + 0.014*"crust"'),
 (5,
  '0.089*"good" + 0.060*"food" + 0.048*"beer" + 0.045*"great" + '
  '0.039*"selection" + 0.034*"nice" + 0.026*"price" + 0.024*"s

In [33]:
model_name = 'lda_model_baseline.pkl'
with open(file_path + model_name, 'wb') as f:
        pickle.dump(lda_model, f)

In [34]:
%%time
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

Wall time: 7min 55s


In [None]:
# Can take a long time to run.
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=2, limit=30, step=6)
# Show graph

In [None]:
plot_coherence_values(coherence_values)

In [None]:
# Print the coherence scores
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

In [35]:
# Select the model and print the topics
optimal_model = lda_model # model_list[3]
model_topics = optimal_model.show_topics(formatted=False)
pprint(optimal_model.print_topics(num_words=10))

[(0,
  '0.076*"love" + 0.051*"sushi" + 0.038*"favorite" + 0.034*"new" + '
  '0.033*"good" + 0.031*"roll" + 0.026*"spot" + 0.025*"work" + 0.014*"people" '
  '+ 0.013*"fresh"'),
 (1,
  '0.140*"always" + 0.051*"food" + 0.049*"great" + 0.045*"visit" + '
  '0.043*"time" + 0.040*"love" + 0.034*"service" + 0.028*"good" + 0.026*"eat" '
  '+ 0.020*"year"'),
 (2,
  '0.143*"time" + 0.059*"wait" + 0.052*"first" + 0.047*"back" + 0.042*"food" + '
  '0.037*"long" + 0.024*"service" + 0.021*"come" + 0.019*"good" + 0.017*"go"'),
 (3,
  '0.050*"never" + 0.036*"disappoint" + 0.028*"soup" + 0.022*"food" + '
  '0.021*"bad" + 0.021*"review" + 0.020*"delivery" + 0.020*"chinese" + '
  '0.020*"good" + 0.016*"order"'),
 (4,
  '0.191*"pizza" + 0.046*"close" + 0.041*"open" + 0.020*"twice" + 0.019*"good" '
  '+ 0.018*"late" + 0.016*"call" + 0.015*"pm" + 0.014*"say" + 0.014*"crust"'),
 (5,
  '0.089*"good" + 0.060*"food" + 0.048*"beer" + 0.045*"great" + '
  '0.039*"selection" + 0.034*"nice" + 0.026*"price" + 0.024*"s