# Review Data EDA

In [60]:
%matplotlib inline
import numpy as np
import math
import matplotlib.pyplot as plt
import pandas as pd
import datetime
from glob import glob

import seaborn as sns
sns.set()

In [61]:
def time_marker(text=''):
    print('[{}] {}'.format(datetime.datetime.now().time(), text))

In [62]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

import matplotlib
font = {'size' : 50}
matplotlib.rc('font', **font)

TITLE_FONT_SIZE = 25
LABEL_FONT_SIZE = 15
TICK_FONT_SIZE  = 15

FIG_SIZE = (15,6)
DO_WRITE_CHARTS = False

# Load Review Data for Arizona Restaurants

In [63]:
time_marker(text='Loading Review Data...')

reviews = pd.DataFrame()
file_path_slug = '../clean_data/az_restaurant_reviews.csv'
file_list = glob(file_path_slug)

# Chunk Settings
chunks = list()
chunksize = 10000
for ii, file in enumerate(sorted(file_list)):
    time_marker('Reading {} of {} {}...'.format(ii+1, len(file_list), file))
    num_chunks = math.ceil(sum(1 for row in open(file, 'r'))/chunksize)
    format_width = len(str(num_chunks))

    # import file in chunks
    for jj, chunk in enumerate(pd.read_csv(file, chunksize=chunksize, iterator=True, index_col=0, parse_dates=['date'])):

        
        # append chunk to chunks list
        chunks.append(chunk)
        
time_marker(text='merging to dataframe...')
reviews = pd.concat(chunks)

time_marker('reseting index...')
reviews.reset_index(inplace=True, drop=True)
time_marker(text='Complete!')


[02:37:36.271084] Loading Review Data...
[02:37:36.387878] Reading 1 of 1 ../clean_data/az_restaurant_reviews.csv...
[02:37:47.383567] merging to dataframe...
[02:37:50.782648] reseting index...
[02:37:50.801102] Complete!


In [64]:
reviews.dropna(how='any', inplace=True)
reviews.reset_index(inplace=True, drop=True)

In [65]:
reviews['cool'] = reviews['cool'].astype('int')
reviews['funny'] = reviews['funny'].astype('int')
reviews['stars'] = reviews['stars'].astype('int')
reviews['useful'] = reviews['useful'].astype('int')
reviews['review_len'] = reviews['review_len'].astype('int')
reviews['is_fast_food'] = reviews['is_fast_food'].astype('int')

reviews['date'] = pd.to_datetime(reviews['date'])

In [66]:
reviews.head(3)

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id,is_fast_food,review_len
0,JlNeaOymdVbE6_bubqjohg,0,2014-08-09,0,BF0ANB54sc_f-3_howQBCg,1,we always go to the chevo's in chandler which ...,3,ssuXFjkH4neiBgwv-oN4IA,0,422
1,0Rni7ocMC_Lg2UH0lDeKMQ,0,2014-08-09,0,DbLUpPT61ykLTakknCF9CQ,1,this place is always so dirty and grimy been t...,6,ssuXFjkH4neiBgwv-oN4IA,0,111
2,S-oLPRdhlyL5HAknBKTUcQ,0,2017-11-30,0,z_mVLygzPn8uHp63SSCErw,4,holy portion sizes! you get a lot of bang for ...,0,MzEnYCyZlRYQRISNMXTWIg,0,130


In [67]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 495893 entries, 0 to 495892
Data columns (total 11 columns):
business_id     495893 non-null object
cool            495893 non-null int64
date            495893 non-null datetime64[ns]
funny           495893 non-null int64
review_id       495893 non-null object
stars           495893 non-null int64
text            495893 non-null object
useful          495893 non-null int64
user_id         495893 non-null object
is_fast_food    495893 non-null int64
review_len      495893 non-null int64
dtypes: datetime64[ns](1), int64(6), object(4)
memory usage: 41.6+ MB


# Split up by Review Ranking

In [73]:
one_star_reviews   = reviews[reviews.stars == 1].copy()
two_star_reviews   = reviews[reviews.stars == 2].copy()
three_star_reviews = reviews[reviews.stars == 3].copy()
four_star_reviews  = reviews[reviews.stars == 4].copy()
five_star_reviews  = reviews[reviews.stars == 5].copy()

print('one_star_reviews    {:d}'.format(one_star_reviews.shape[0]))
print('two_star_reviews    {:d}'.format(two_star_reviews.shape[0]))
print('three_star_reviews  {:d}'.format(three_star_reviews.shape[0]))
print('four_star_reviews  {:d}'.format(four_star_reviews.shape[0]))
print('five_star_reviews  {:d}'.format(five_star_reviews.shape[0]))

one_star_reviews    65786
two_star_reviews    47729
three_star_reviews  59605
four_star_reviews  124348
five_star_reviews  198425


# Identify Key Terms in Reviews

In [80]:
sample_one_star = one_star_reviews.iloc[10219,:].text

sample_one_star

"this is my first post. i'm also in the service industry. this restaurant experience was by far the worst i've ever experienced in fine dining. i'm sure it will be closed soon. service and food was way below par. we had a table of six two birthdays which we called ahead for. it took them 15-20 min to bring our drinks which the server got wrong. took them another 20-30 min to f-up one side salad order. also our dinner orders were wrong. i asked for a 6oz fliet and got salmon. i got my entree when everyone was finished and they never apologized. we had to send a smoke signal to our server. ther restaurant was only a 1/4 full of tables that were busy on a friday . food was not worth the money and never acknowledged our two birthdays. we brought all complaints to management and he looked at us like he could care less. we paid our bill like troopers no apologies from management no tip which is not in my very being. soo disappointed. i'm trying to get our money back. it's just principle. i c

In [81]:
import spacy

nlp = spacy.load('en')

In [82]:
parsed_one_star = nlp(sample_one_star)

## Parse Sentences

In [84]:
for num, sentence in enumerate(parsed_one_star.sents):
    print('Sentence {}: {}'.format(num + 1, sentence))

Sentence 1: this is my first post.
Sentence 2: i'm also in the service industry.
Sentence 3: this restaurant experience was by far the worst i've ever experienced in fine dining.
Sentence 4: i'm sure it will be closed soon.
Sentence 5: service and food was way below par.
Sentence 6: we had a table of six two birthdays which we called ahead for.
Sentence 7: it took them 15-20 min to bring our drinks which the server got wrong.
Sentence 8: took them another 20-30 min to f-up one side salad order.
Sentence 9: also our dinner orders were wrong.
Sentence 10: i asked for a 6oz fliet and got salmon.
Sentence 11: i got my entree when everyone was finished and they never apologized.
Sentence 12: we had to send a smoke signal to our server.
Sentence 13: ther restaurant was only a 1/4 full of tables that were busy on a friday .
Sentence 14: food was not worth the money and never acknowledged our two birthdays.
Sentence 15: we brought all complaints to management and he looked at us like he could 

## Parts of Speech Tagging

In [87]:
token_text = [token.orth_ for token in parsed_one_star]
token_pos = [token.pos_ for token in parsed_one_star]

pos = pd.DataFrame({'token_text': token_text, 'part_of_speech': token_pos})

nouns_1star = pos[pos.part_of_speech =='NOUN'].copy()
adjs_1star = pos[pos.part_of_speech =='ADJ'].copy()

In [89]:
adjs_1star

Unnamed: 0,part_of_speech,token_text
2,ADJ,my
3,ADJ,first
21,ADJ,worst
27,ADJ,fine
32,ADJ,sure
55,ADJ,which
70,ADJ,our
72,ADJ,which
76,ADJ,wrong
95,ADJ,our


## Topic Modeling with Latent Dirichlet Allocation (LDA)

In [95]:
# tokenize text
from gensim.corpora.dictionary import Dictionary
from nltk.tokenize import word_tokenize

In [98]:
type(parsed_one_star)

spacy.tokens.doc.Doc

In [96]:
tokens = word_tokenize(parsed_one_star)
tokens

TypeError: expected string or bytes-like object