# Review Data EDA

In [1]:
%matplotlib inline
import numpy as np
import math
import matplotlib.pyplot as plt
import pandas as pd
import datetime
from glob import glob

import seaborn as sns
sns.set()

In [2]:
def time_marker(text=''):
    print('[{}] {}'.format(datetime.datetime.now().time(), text))

In [3]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

import matplotlib
font = {'size' : 50}
matplotlib.rc('font', **font)

TITLE_FONT_SIZE = 25
LABEL_FONT_SIZE = 15
TICK_FONT_SIZE  = 15

FIG_SIZE = (15,6)
DO_WRITE_CHARTS = False

# Load Review Data for Arizona Restaurants

In [4]:
time_marker(text='Loading Review Data...')

reviews = pd.DataFrame()
file_path_slug = '../clean_data/az_restaurant_reviews.csv'
file_list = glob(file_path_slug)

# Chunk Settings
chunks = list()
chunksize = 10000
for ii, file in enumerate(sorted(file_list)):
    time_marker('Reading {} of {} {}...'.format(ii+1, len(file_list), file))
    # import file in chunks
    for jj, chunk in enumerate(pd.read_csv(file, chunksize=chunksize, iterator=True, index_col=0, parse_dates=['date'])):
        
        # append chunk to chunks list
        chunks.append(chunk)
        
time_marker(text='merging to dataframe...')
reviews = pd.concat(chunks)

time_marker('reseting index...')
reviews.reset_index(inplace=True, drop=True)
time_marker(text='Complete!')


[17:21:25.556042] Loading Review Data...
[17:21:25.558329] Reading 1 of 1 ../clean_data/az_restaurant_reviews.csv...
[17:21:34.320888] merging to dataframe...
[17:21:35.500623] reseting index...
[17:21:35.510243] Complete!


In [5]:
time_marker('Dropping records with NaN values...')
reviews.dropna(how='any', inplace=True)
reviews.reset_index(inplace=True, drop=True)

time_marker('Cleaning data types...')
reviews['cool'] = reviews['cool'].astype('int')
reviews['funny'] = reviews['funny'].astype('int')
reviews['stars'] = reviews['stars'].astype('int')
reviews['useful'] = reviews['useful'].astype('int')
reviews['review_len'] = reviews['review_len'].astype('int')
reviews['is_fast_food'] = reviews['is_fast_food'].apply(lambda x: True if x == 1 else False)
reviews['date'] = pd.to_datetime(reviews['date'])

time_marker('assiging \'Positive\' or \'Negative\' classification to reviews...')
reviews['is_positive'] = reviews.stars.apply(lambda x: True if x > 3 else False)
           

[17:21:37.006823] Dropping records with NaN values...
[17:21:37.805531] Cleaning data types...
[17:21:38.362676] assiging 'Positive' or 'Negative' classification to reviews...


In [6]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 495893 entries, 0 to 495892
Data columns (total 12 columns):
business_id     495893 non-null object
cool            495893 non-null int64
date            495893 non-null datetime64[ns]
funny           495893 non-null int64
review_id       495893 non-null object
stars           495893 non-null int64
text            495893 non-null object
useful          495893 non-null int64
user_id         495893 non-null object
is_fast_food    495893 non-null bool
review_len      495893 non-null int64
is_positive     495893 non-null bool
dtypes: bool(2), datetime64[ns](1), int64(5), object(4)
memory usage: 38.8+ MB


In [7]:
reviews.head(3)

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id,is_fast_food,review_len,is_positive
0,JlNeaOymdVbE6_bubqjohg,0,2014-08-09,0,BF0ANB54sc_f-3_howQBCg,1,we always go to the chevo's in chandler which ...,3,ssuXFjkH4neiBgwv-oN4IA,False,422,False
1,0Rni7ocMC_Lg2UH0lDeKMQ,0,2014-08-09,0,DbLUpPT61ykLTakknCF9CQ,1,this place is always so dirty and grimy been t...,6,ssuXFjkH4neiBgwv-oN4IA,False,111,False
2,S-oLPRdhlyL5HAknBKTUcQ,0,2017-11-30,0,z_mVLygzPn8uHp63SSCErw,4,holy portion sizes! you get a lot of bang for ...,0,MzEnYCyZlRYQRISNMXTWIg,False,130,True


# Normalizing Review Text

In [8]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import string

In [9]:
# import stop words
stop_words = set(stopwords.words('english'))

# terms and characters to ignore, we dont care about punctuation
exclude = set(string.punctuation)

contractions = ["'s", "n't", "'ll", "'t", "'s", "'re"]

# lemma
lemma = WordNetLemmatizer()

exclusion_terms = list(set(set(stop_words) | set(exclude) | set(contractions)))

In [11]:
def clean_review(doc):
    ''' remove stop words, remove punctuation, and lemmatize a text document'''

    # lemmatize, tokenize and remove stop words, puncuation and contractions
    # remove non alpha tokens
    tokens = [lemma.lemmatize(word) for word in word_tokenize(doc) if word not in exclusion_terms and word.isalpha()]
    
    return tokens

# Prepare Review Corpus for LDA

In [12]:
from gensim.corpora.dictionary import Dictionary
from collections import defaultdict
import itertools

In [13]:
def lda_prep(corpus=None, n_terms=5):
    '''
    
        @ params:
            corpus   : a list of 
            n_terms  : the number of top terms to preview to the console
    
        returns:
            a list of 3 items
                dictionary        :  a gensim dictionary object built from the corpus
                corpus            :  a bag of words sparce array of corpus terms
                total_word_count  :  a defaultdict with key word identifier in dictionary, and value the count of times that word appears in the corpus
    
    '''
    if corpus == None:
        return False  
    else:
        time_marker('building gensim dict...')
        # build gensim dict, key=token, value=count
        dictionary = Dictionary(corpus)
        # print('dictionary Tokens to ID {}'.format(dictionary.token2id))

        # create a gensim corpus
        time_marker('building gensim corpus...')
        corpus = [dictionary.doc2bow(doc) for doc in clean_docs]
        # print('gensim Corpus {}'.format(corpus[0]))

        # create a defaultdict
        total_word_count = defaultdict(int)

        # loop over corpus and count the number of times each word appears
        for word_id, word_count in itertools.chain.from_iterable(corpus):
            total_word_count[word_id] += word_count

        # create a sorted list from the defaultdict
        sorted_word_count = sorted(total_word_count.items(), key=lambda w: w[1], reverse=True)

        # print top n_terms words across all documents
        print('Top {:d} words across all documents'.format(n_terms))
        for word_id, word_count in sorted_word_count[:n_terms]:
            print('{:20}{}'.format(dictionary.get(word_id), word_count))
        
        return [dictionary, corpus, total_word_count]

# Split up by `Fast Food` and `Non Fast Food` Restaurant Reviews

In [14]:
ff_reviews = reviews[reviews.is_fast_food == True].copy()
ff_reviews.reset_index(inplace=True, drop=True)
nff_reviews = reviews[reviews.is_fast_food == False].copy()
nff_reviews.reset_index(inplace=True, drop=True)

In [26]:
print('============================== Fast Food Reviews ==============================')
print('{:45}{:d}\t{:2.4f}'.format('Number of Fast Food Reviews', ff_reviews.shape[0], 100.*ff_reviews.shape[0] / reviews.shape[0]))
print('{:45}{:d}\t{:2.4f}'.format('Number of Positive Fast Food Reviews', ff_reviews[ff_reviews.is_positive == True].shape[0], (100.*ff_reviews[ff_reviews.is_positive == True].shape[0]/ff_reviews.shape[0])))
print('{:45}{:d}\t{:2.4f}'.format('Number of Negative Fast Food Reviews', ff_reviews[ff_reviews.is_positive == False].shape[0], (100.*ff_reviews[ff_reviews.is_positive == False].shape[0]/ff_reviews.shape[0])))

print()
print('============================ Non Fast Food Reviews ============================')
print('{:45}{:d}\t{:2.4f}'.format('Number of Non Fast Food Reviews', nff_reviews.shape[0], 100.*nff_reviews.shape[0] / reviews.shape[0]))
print('{:45}{:d}\t{:2.4f}'.format('Number of Positive Non Fast Food Reviews', nff_reviews[nff_reviews.is_positive == True].shape[0], (100.*nff_reviews[nff_reviews.is_positive == True].shape[0]/nff_reviews.shape[0])))
print('{:45}{:d}\t{:2.4f}'.format('Number of Negative Non Fast Food Reviews', nff_reviews[nff_reviews.is_positive == False].shape[0], (100.*nff_reviews[nff_reviews.is_positive == False].shape[0]/nff_reviews.shape[0])))

Number of Fast Food Reviews                  39907	8.0475
Number of Positive Fast Food Reviews         19488	48.8335
Number of Negative Fast Food Reviews         20419	51.1665

Number of Non Fast Food Reviews              455986	91.9525
Number of Positive Non Fast Food Reviews     303285	66.5119
Number of Negative Non Fast Food Reviews     152701	33.4881


# Fast Food Reviews

<p><b>Step 1: </b>Subset to only evaluate `Fast Food` Reviews</p>

In [30]:
ff_reviews.head(3)

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id,is_fast_food,review_len,is_positive
0,iIjVO7cLD1UEmIO7G05Ujw,0,2016-06-11,0,xatycgntu_F_Ioyny3iflw,4,flavor was actually pretty good. not used to e...,0,vaXJ7-xLrnD6FAEhUqYKwQ,True,309,True
1,8F-CalsRSKiPjjsx8ql8Lg,9,2009-12-22,8,xWvUUQ-tO-x9pAsG8JEnOQ,4,i really want to give this place four stars. g...,6,dyhTHLIf6eWBvU78Y3T06A,True,2349,True
6,W2CzAePJakvARgoQuohbOA,0,2011-08-30,0,B9Eq_4FSMD4zCfCDiEXazQ,5,times have changed. unfortunately i am no long...,0,Jt4u7qnfrk35buainfOuGA,True,468,True


In [31]:
ff_reviews.shape[0]

19488

<p><b>Step 2: </b>Tokenize and Normalize review text</p>

In [32]:
time_marker('tokenizing and normalizing text...')
ff_reviews['tokens'] = ff_reviews.text.apply(lambda r: clean_review(r))
ff_reviews['norm_text'] = ff_reviews.tokens.apply(lambda t: ' '.join(t))
time_marker('done!')

[17:30:47.209000] tokenizing and normalizing text...
[17:31:22.268828] done!


In [33]:
ff_reviews.head(3).transpose()

Unnamed: 0,0,1,6
business_id,iIjVO7cLD1UEmIO7G05Ujw,8F-CalsRSKiPjjsx8ql8Lg,W2CzAePJakvARgoQuohbOA
cool,0,9,0
date,2016-06-11 00:00:00,2009-12-22 00:00:00,2011-08-30 00:00:00
funny,0,8,0
review_id,xatycgntu_F_Ioyny3iflw,xWvUUQ-tO-x9pAsG8JEnOQ,B9Eq_4FSMD4zCfCDiEXazQ
stars,4,4,5
text,flavor was actually pretty good. not used to e...,i really want to give this place four stars. g...,times have changed. unfortunately i am no long...
useful,0,6,0
user_id,vaXJ7-xLrnD6FAEhUqYKwQ,dyhTHLIf6eWBvU78Y3T06A,Jt4u7qnfrk35buainfOuGA
is_fast_food,True,True,True


<p><b>Step 3: </b>Build our term dictionary, document term matrix, and preview the most common terms</p>

In [145]:
# collect all cleaned review strings into a list of strings
clean_docs = list(ff_reviews.tokens[:1000].values)

# create dictionary, corpus, and word counts with custom function
dictionary, doc_term_matrix, total_word_count = lda_prep(corpus=clean_docs, n_terms=25)

[18:16:51.263525] building gensim dict...
[18:16:51.350837] building gensim corpus...
Top 25 words across all documents
food                636
place               553
good                530
great               461
burger              357
always              342
get                 328
service             313
like                312
time                295
love                272
one                 271
fry                 257
friendly            250
fast                241
go                  236
order               232
location            231
back                227
pizza               205
sandwich            197
really              187
staff               187
fresh               179
delicious           171


<p><b>Step 4: </b>Using a Multicore LDA model, attempt to identify topics from the dictionary</p>

In [180]:
from gensim.models.ldamulticore import LdaMulticore
ldam = LdaMulticore

num_topics = 50
num_words  = 10
num_passes = 50

In [147]:
time_marker('started generating lda multicore model')
ldam_model = ldam(doc_term_matrix, num_topics=num_topics, id2word=dictionary, passes=num_passes)
time_marker('done!')

results = ldam_model.print_topics(num_topics=num_topics, num_words=num_words)

[18:16:53.233327] started generating lda multicore model
[18:24:26.656576] done!


<p><b>Step 6: </b>View Results</p>

# Print Model Terms

In [148]:
from collections import Counter

In [182]:
def print_topic_terms(model, num_topics=num_topics, num_words=10, unique=False):
    results = model.print_topics(num_topics=num_topics, num_words=num_words)
    if not unique:
        print('=============================== Terms Per Topic ===============================')
        for r in results:
            topic = r[0]
            term_list = r[1]

            term_list = term_list.split('"')[1::2]
            topic_terms = [term for term in term_list]
            print('{}\t{}'.format(topic, topic_terms))
    else:
        terms = [x[1] for x in results]
        term_lists = [x.split('"')[1::2] for x in terms]

        flatList = itertools.chain.from_iterable(term_lists)
        term_counts = Counter(flatList)

        # non_unique_terms = term_counts
        test = dict(term_counts)

        # extract terms that appear more than once
        non_unique_terms = [key for key, value in test.items() if value > 1]
        
        
        print('============================ Unique Terms Per Topic ===========================')
        for r in results:
            topic = r[0]
            term_list = r[1]

            term_list = term_list.split('"')[1::2]
            topic_terms = [term for term in term_list if term not in non_unique_terms]
            print('{}\t{}'.format(topic, topic_terms))

In [183]:
print_topic_terms(ldam_model, num_topics=num_topics, num_words=10, unique=False)

0	['food', 'good', 'place', 'sandwich', 'like', 'fry', 'burger', 'would', 'one', 'fast']
1	['food', 'order', 'little', 'delicious', 'also', 'fry', 'much', 'place', 'one', 'get']
2	['dog', 'hot', 'chicago', 'beef', 'good', 'sandwich', 'place', 'chili', 'fry', 'try']
3	['fish', 'place', 'chip', 'n', 'time', 'food', 'go', 'year', 'always', 'good']
4	['chicken', 'good', 'get', 'eat', 'place', 'like', 'usually', 'love', 'enjoy', 'wing']
5	['wait', 'food', 'incredible', 'service', 'long', 'great', 'well', 'ca', 'restaurant', 'night']
6	['good', 'place', 'food', 'one', 'really', 'go', 'chicken', 'like', 'better', 'restaurant']
7	['coney', 'detroit', 'great', 'dog', 'good', 'place', 'go', 'owner', 'fry', 'say']
8	['burger', 'u', 'food', 'back', 'got', 'time', 'one', 'mcdonald', 'nice', 'fast']
9	['glad', 'fast', 'food', 'breakfast', 'english', 'fish', 'u', 'chang', 'plate', 'place']
10	['taco', 'delicious', 'place', 'found', 'food', 'really', 'good', 'try', 'go', 'pizza']
11	['place', 'burger'

In [184]:
print_topic_terms(ldam_model, num_topics=num_topics, num_words=10, unique=True)

0	[]
1	[]
2	['hot', 'chicago', 'beef', 'chili']
3	['chip', 'n', 'year']
4	['eat', 'usually', 'enjoy', 'wing']
5	['wait', 'incredible', 'long', 'ca', 'night']
6	['better']
7	['coney', 'detroit', 'owner', 'say']
8	['mcdonald']
9	['glad', 'english', 'chang', 'plate']
10	['found']
11	[]
12	['loved', 'wanted']
13	['lobby']
14	['three', 'extra']
15	['first']
16	['favorite', 'want']
17	['two']
18	['inside', 'though']
19	[]
20	['sub', 'mike', 'subway', 'firehouse', 'jersey']
21	['choice']
22	[]
23	['topping']
24	['inch', 'wrap', 'know', 'goodcents']
25	[]
26	[]
27	['five', 'guy']
28	['drive', 'still', 'quickly']
29	['simple', 'open', 'real']
30	['waffle', 'enough', 'pretty', 'poutine', 'yes']
31	[]
32	['pollo', 'salad', 'el', 'secret']
33	['wendy', 'tasty']
34	['menu']
35	['chinese']
36	['carne', 'asada', 'salsa']
37	['home']
38	[]
39	['green', 'sample']
40	['amazing']
41	['take', 'husband']
42	['family']
43	[]
44	['excellent']
45	['employee', 'customer']
46	['highly', 'reasonable']
47	[]
48	[

# Save Model

In [185]:
# save model to disk (no need to use pickle module)
term = 'test'
file_suffix = '{}_{:d}_topics_{:d}_terms_{}_passes'.format(term, num_topics, num_words, num_passes)
ldam_model.save('../models/ldam_{}.model'.format(file_suffix))