# Review Data EDA

In [1]:
%matplotlib inline
import numpy as np
import math
import matplotlib.pyplot as plt
import pandas as pd
import datetime
from glob import glob

import seaborn as sns
sns.set_style('whitegrid')
sns.set_context("poster")

In [2]:
def time_marker(text=''):
    print('[{}] {}'.format(datetime.datetime.now().time(), text))

In [3]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

import matplotlib
font = {'size' : 50}
matplotlib.rc('font', **font)

TITLE_FONT_SIZE = 25
LABEL_FONT_SIZE = 15
TICK_FONT_SIZE  = 15

FIG_SIZE = (15,6)
FIG_SIZE_SHORT = (15,3)
GRID_DIMS = 15

DO_WRITE_CHARTS = False

In [4]:
fname = '../clean_data/_analysis/us_restaurant_bids.txt'

with open(fname) as f:
    us_rest_bids = f.readlines()
us_rest_bids = [x.strip() for x in us_rest_bids] 

# Load Review Data for US Restaurants

In [15]:
time_marker(text='Loading Review Data...')

restaurants = pd.DataFrame()
file_path_slug = '../clean_data/business/*.csv'
file_list = glob(file_path_slug)

# Chunk Settings
chunks = list()
chunksize = 10000
for ii, file in enumerate(sorted(file_list)):
    time_marker('Reading {} of {} {}...'.format(ii+1, len(file_list), file))
    num_chunks = math.ceil(sum(1 for row in open(file, 'r'))/chunksize)
    format_width = len(str(num_chunks))

    # import file in chunks
    for jj, chunk in enumerate(pd.read_csv(file, chunksize=chunksize, iterator=True, index_col=0)):

        # only consider reviews for businesses that are US Restaurants
        chunk = chunk[chunk.business_id.isin(us_rest_bids)].copy()
        
#         # drop reviews with missing review text
#         chunk = chunk[(~chunk.text.isnull()) & (~chunk.user_id.isnull())].copy()
        
        # append chunk to chunks list
        chunks.append(chunk)
        
time_marker(text='merging to dataframe...')
restaurants = pd.concat(chunks)

time_marker('reseting index...')
restaurants.reset_index(inplace=True, drop=True)
time_marker(text='Complete!')



[16:36:04.169175] Loading Review Data...
[16:36:04.171935] Reading 1 of 64 ../clean_data/business/00_-_business_clean.csv...


  interactivity=interactivity, compiler=compiler, result=result)


[16:36:04.860399] Reading 2 of 64 ../clean_data/business/01_0_business_clean.csv...


  interactivity=interactivity, compiler=compiler, result=result)


[16:36:05.548520] Reading 3 of 64 ../clean_data/business/02_1_business_clean.csv...


  interactivity=interactivity, compiler=compiler, result=result)


[16:36:06.179619] Reading 4 of 64 ../clean_data/business/03_2_business_clean.csv...


  interactivity=interactivity, compiler=compiler, result=result)


[16:36:06.796784] Reading 5 of 64 ../clean_data/business/04_3_business_clean.csv...
[16:36:07.494158] Reading 6 of 64 ../clean_data/business/05_4_business_clean.csv...
[16:36:08.202546] Reading 7 of 64 ../clean_data/business/06_5_business_clean.csv...


  interactivity=interactivity, compiler=compiler, result=result)


[16:36:08.927587] Reading 8 of 64 ../clean_data/business/07_6_business_clean.csv...
[16:36:09.933690] Reading 9 of 64 ../clean_data/business/08_7_business_clean.csv...
[16:36:10.859345] Reading 10 of 64 ../clean_data/business/09_8_business_clean.csv...
[16:36:11.653541] Reading 11 of 64 ../clean_data/business/10_9_business_clean.csv...
[16:36:12.485600] Reading 12 of 64 ../clean_data/business/11_A_business_clean.csv...
[16:36:13.159966] Reading 13 of 64 ../clean_data/business/12_B_business_clean.csv...
[16:36:13.790704] Reading 14 of 64 ../clean_data/business/13_C_business_clean.csv...
[16:36:14.484786] Reading 15 of 64 ../clean_data/business/14_D_business_clean.csv...
[16:36:15.117438] Reading 16 of 64 ../clean_data/business/15_E_business_clean.csv...
[16:36:15.743999] Reading 17 of 64 ../clean_data/business/16_F_business_clean.csv...
[16:36:16.396174] Reading 18 of 64 ../clean_data/business/17_G_business_clean.csv...
[16:36:17.040494] Reading 19 of 64 ../clean_data/business/18_H_busi

  interactivity=interactivity, compiler=compiler, result=result)


[16:36:20.983213] Reading 25 of 64 ../clean_data/business/24_N_business_clean.csv...
[16:36:21.656259] Reading 26 of 64 ../clean_data/business/25_O_business_clean.csv...
[16:36:22.506655] Reading 27 of 64 ../clean_data/business/26_P_business_clean.csv...
[16:36:23.147817] Reading 28 of 64 ../clean_data/business/27_Q_business_clean.csv...
[16:36:23.835085] Reading 29 of 64 ../clean_data/business/28_R_business_clean.csv...
[16:36:24.572237] Reading 30 of 64 ../clean_data/business/29_S_business_clean.csv...
[16:36:25.350154] Reading 31 of 64 ../clean_data/business/30_T_business_clean.csv...
[16:36:26.189277] Reading 32 of 64 ../clean_data/business/31_U_business_clean.csv...
[16:36:26.922428] Reading 33 of 64 ../clean_data/business/32_V_business_clean.csv...
[16:36:27.919528] Reading 34 of 64 ../clean_data/business/33_W_business_clean.csv...
[16:36:28.712520] Reading 35 of 64 ../clean_data/business/34_X_business_clean.csv...
[16:36:29.561092] Reading 36 of 64 ../clean_data/business/35_Y_bu

  interactivity=interactivity, compiler=compiler, result=result)


[16:36:42.144854] Reading 53 of 64 ../clean_data/business/52_o_business_clean.csv...
[16:36:42.928896] Reading 54 of 64 ../clean_data/business/53_p_business_clean.csv...
[16:36:43.647812] Reading 55 of 64 ../clean_data/business/54_q_business_clean.csv...
[16:36:44.294204] Reading 56 of 64 ../clean_data/business/55_r_business_clean.csv...
[16:36:44.959122] Reading 57 of 64 ../clean_data/business/56_s_business_clean.csv...
[16:36:45.605718] Reading 58 of 64 ../clean_data/business/57_t_business_clean.csv...
[16:36:46.251162] Reading 59 of 64 ../clean_data/business/58_u_business_clean.csv...
[16:36:46.996305] Reading 60 of 64 ../clean_data/business/59_v_business_clean.csv...
[16:36:47.707101] Reading 61 of 64 ../clean_data/business/60_w_business_clean.csv...
[16:36:48.943690] Reading 62 of 64 ../clean_data/business/61_x_business_clean.csv...
[16:36:49.767518] Reading 63 of 64 ../clean_data/business/62_y_business_clean.csv...
[16:36:50.530788] Reading 64 of 64 ../clean_data/business/63_z_bu

In [21]:
# drop columns of all zeros
restaurants = restaurants.loc[:, (restaurants != 0).any(axis=0)]

In [22]:
restaurants.head(3)

Unnamed: 0,business_id,address,city,is_open,latitude,longitude,name,neighborhood,postal_code,review_count,...,thai,turkish,ukrainian,uzbek,vegan,vegetarian,venezuelan,vietnamese,waffles,wraps
0,-2q4dnUw0gGJniGW2aPamQ,1805 S Neil St,Champaign,0,40.094068,-88.245785,Fiesta Ranchera,,61820,4,...,0,0,0,0,0,0,0,0,0,0
1,-49WY_TEa9ZEcRk_GnuLog,5205 Detroit Rd,Sheffield Village,1,41.425861,-82.080963,Cracker Barrel Old Country Store,,44054,27,...,0,0,0,0,0,0,0,0,0,0
2,--9e1ONYQuAa-CB_Rrw7Tw,3355 Las Vegas Blvd S,Las Vegas,1,36.123183,-115.16919,Delmonico Steakhouse,The Strip,89109,1389,...,0,0,0,0,0,0,0,0,0,0


In [45]:
category_columns = restaurants.columns[-128:]
cols_of_interest = ['business_id', 'name', 'review_count', 'stars', 'state']

for c in category_columns :
    cols_of_interest.append(c)


In [52]:
for c in category_columns:
#     print(c)
#     print(restaurants[category_columns][c].sum())
    
    print('{} {:d}'.format(str(c).rjust(25), restaurants[category_columns][c].sum()))
    

                   afghan 16
                  african 23
           american_(new) 1551
   american_(traditional) 2283
                  arabian 8
                argentine 7
                 armenian 5
             asian_fusion 627
               australian 1
                 austrian 1
              bangladeshi 1
                 barbeque 601
                  belgian 5
                  bistros 3
                brazilian 26
       breakfast_&_brunch 1204
                  british 5
                  buffets 493
                bulgarian 1
                  burgers 2091
                  burmese 5
             cajun/creole 105
                cambodian 11
                cantonese 47
                caribbean 93
             cheesesteaks 129
            chicken_wings 1013
                  chinese 1807
                colombian 11
             comfort_food 91
                creperies 18
                    cuban 30
                    delis 642
                  dim_sum 47
       

In [61]:
# limit to only businesses in these categories

categories_of_interest = ['chicken_wings','breakfast_&_brunch',
                          'american_(new)','italian','chinese',
                          'burgers','american_(traditional)',
                          'mexican','sandwiches','pizza','fast_food']

restaurants_of_interest = pd.DataFrame()
chunks = []
for cat in categories_of_interest:
    chunk = restaurants[restaurants[cat] == 1].copy()
    
    chunks.append(chunk)

restaurants_of_interest = pd.concat(chunks)


# drop columns we dont care about    
for col in restaurants_of_interest.columns:
    if col not in cols_of_interest:
        restaurants_of_interest.drop(col, axis=1, inplace=True)
    
# drop columns of all zeros
restaurants_of_interest = restaurants_of_interest.loc[:, (restaurants_of_interest != 0).any(axis=0)]   




In [62]:
restaurants_of_interest.head()

Unnamed: 0,business_id,name,review_count,stars,state,afghan,african,american_(new),american_(traditional),arabian,...,thai,turkish,ukrainian,uzbek,vegan,vegetarian,venezuelan,vietnamese,waffles,wraps
8,-sCaUNqEfFOYwtu8WCw5Wg,Anthony's Coal Fired Pizza,50,3.5,PA,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19,--GM_ORV2cYS-h38DSaCLw,Mm Mm Pizza,7,4.0,PA,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
52,-fbpSXmv2RhFc-h6JyC0aw,Pangea Tapas Bar & Grill,4,4.0,NC,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
58,-CZwX0Fy14DGCY_2aFkhKg,K Jun Chicken,5,3.5,NV,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59,-qqMAXJh3CIlfUr7DO0yHQ,KFC,4,1.0,OH,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [64]:
restaurants_of_interest.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23777 entries, 8 to 19969
Columns: 125 entries, business_id to wraps
dtypes: float64(1), int64(121), object(3)
memory usage: 23.5+ MB


# Exploratory Data Analysis

In [None]:
ax = plt.figure(figsize=FIG_SIZE)
ax = sns.countplot(x="stars", data=reviews, palette="Reds")

ax.set_xlabel('Review Stars', size=LABEL_FONT_SIZE)
ax.set_ylabel('Number of Reviews', size=LABEL_FONT_SIZE)
ax.set_title('Review Star Distribution', size=TITLE_FONT_SIZE)

plt.show()
plt.close()


In [None]:
ax = plt.figure(figsize=FIG_SIZE)
ax = sns.boxplot(x="stars", y="review_length", data=reviews, palette="Reds")

ax.set_xlabel('Review Stars', size=LABEL_FONT_SIZE)
ax.set_ylabel('Reviews Length', size=LABEL_FONT_SIZE)
ax.set_title('Review Length vs Stars', size=TITLE_FONT_SIZE)

ax.set_ylim([0,2000])

plt.show()
plt.close()

In [None]:
# reviews.shape[0]
print('Total number of reviews         {:d}'.format(reviews.shape[0]))
print('Number of reviews tagged as \'cool\'   {:d}'.format(reviews[reviews.cool > 0].shape[0]))
print('Number of reviews tagged as \'funny\'  {:d}'.format(reviews[reviews.funny > 0].shape[0]))
print('Number of reviews tagged as \'useful\' {:d}'.format(reviews[reviews.useful > 0].shape[0]))

In [None]:
19976 unique business_id     values
       118 unique cool            values
      4305 unique date            values
       102 unique funny           values
   1323672 unique review_id       values
         5 unique stars           values
   1323232 unique text            values
       127 unique useful          values
    496209 unique user_id         values
      4819 unique review_length   values