# Business Data EDA

In [1]:
%matplotlib inline
import numpy as np
import math
import matplotlib.pyplot as plt
import pandas as pd
import datetime
from glob import glob

import seaborn as sns
sns.set_style('whitegrid')
sns.set_context("poster")

In [2]:
def time_marker(text=''):
    print('[{}] {}'.format(datetime.datetime.now().time(), text))

In [3]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

import matplotlib
font = {'size' : 50}
matplotlib.rc('font', **font)

TITLE_FONT_SIZE = 25
LABEL_FONT_SIZE = 15
TICK_FONT_SIZE  = 15

day_labels = ['MON','TUE','WED','THU','FRI','SAT','SUN']
day_labels_full = ['MONDAY','TUESDAY','WEDNESDAY','THURSDAY','FRIDAY','SATURDAY','SUNDAY']
month_labels = ['JAN','FEB','MAR','APR','MAY','JUN','JUL','AUG','SEP','OCT','NOV','DEC']
month_labels_full = ['JANUARY','FEBRUARY','MARCH','APRIL','MAY','JUNE','JULY','AUGUST','SEPTEMBER','OCTOBER','NOVEMBER','DECEMBER']

FIG_SIZE = (15,6)
FIG_SIZE_SHORT = (15,3)
GRID_DIMS = 15

DO_WRITE_CHARTS = False

# Load Review Data

In [4]:
time_marker(text='Loading Review Data...')

business = pd.DataFrame()
file_path_slug = '../clean_data/business/*.csv'
file_list = glob(file_path_slug)


# Chunk Settings
chunks = list()
chunksize = 10000
newest_review_date = pd.Timestamp('2017-07-26')

for ii, file in enumerate(file_list):
    time_marker('Reading {} of {} {}...'.format(ii+1, len(file_list), file))
    num_chunks = math.ceil(sum(1 for row in open(file, 'r'))/chunksize)
    format_width = len(str(num_chunks))

    # import file in chunks
    for jj, chunk in enumerate(pd.read_csv(file, chunksize=chunksize, iterator=True, index_col=0)):

        # append chunk to chunks list
        chunks.append(chunk)        
        
        if jj % 10 == 0 or jj == num_chunks:
            time_marker(text='\tfinished chunk {} of {}'.format(str(jj+1).rjust(format_width), str(num_chunks).rjust(format_width)))

time_marker(text='merging to dataframe...')
business = pd.concat(chunks)

time_marker('reseting index...')
business.reset_index(inplace=True, drop=True)
time_marker(text='Complete!')


[22:28:17.617711] Loading Review Data...
[22:28:17.621964] Reading 1 of 64 ../clean_data/business/07_6_business_clean.csv...


  interactivity=interactivity, compiler=compiler, result=result)


[22:28:18.402027] 	finished chunk 1 of 1
[22:28:18.402438] Reading 2 of 64 ../clean_data/business/26_P_business_clean.csv...


  interactivity=interactivity, compiler=compiler, result=result)


[22:28:19.121854] 	finished chunk 1 of 1
[22:28:19.122150] Reading 3 of 64 ../clean_data/business/33_W_business_clean.csv...


  interactivity=interactivity, compiler=compiler, result=result)


[22:28:19.804177] 	finished chunk 1 of 1
[22:28:19.804507] Reading 4 of 64 ../clean_data/business/24_N_business_clean.csv...
[22:28:20.481309] 	finished chunk 1 of 1
[22:28:20.481581] Reading 5 of 64 ../clean_data/business/36_Z_business_clean.csv...
[22:28:21.176895] 	finished chunk 1 of 1
[22:28:21.177141] Reading 6 of 64 ../clean_data/business/49_l_business_clean.csv...


  interactivity=interactivity, compiler=compiler, result=result)


[22:28:21.852362] 	finished chunk 1 of 1
[22:28:21.852615] Reading 7 of 64 ../clean_data/business/13_C_business_clean.csv...
[22:28:22.565864] 	finished chunk 1 of 1
[22:28:22.566112] Reading 8 of 64 ../clean_data/business/57_t_business_clean.csv...
[22:28:23.347932] 	finished chunk 1 of 1
[22:28:23.348158] Reading 9 of 64 ../clean_data/business/29_S_business_clean.csv...
[22:28:24.191732] 	finished chunk 1 of 1
[22:28:24.191964] Reading 10 of 64 ../clean_data/business/18_H_business_clean.csv...
[22:28:24.851707] 	finished chunk 1 of 1
[22:28:24.851972] Reading 11 of 64 ../clean_data/business/22_L_business_clean.csv...
[22:28:25.579349] 	finished chunk 1 of 1
[22:28:25.579646] Reading 12 of 64 ../clean_data/business/14_D_business_clean.csv...
[22:28:26.289870] 	finished chunk 1 of 1
[22:28:26.290166] Reading 13 of 64 ../clean_data/business/47_j_business_clean.csv...
[22:28:27.190260] 	finished chunk 1 of 1
[22:28:27.190494] Reading 14 of 64 ../clean_data/business/40_c_business_clean.cs

  interactivity=interactivity, compiler=compiler, result=result)


[22:28:31.996791] 	finished chunk 1 of 1
[22:28:31.997029] Reading 21 of 64 ../clean_data/business/11_A_business_clean.csv...
[22:28:32.710243] 	finished chunk 1 of 1
[22:28:32.710464] Reading 22 of 64 ../clean_data/business/06_5_business_clean.csv...
[22:28:33.424012] 	finished chunk 1 of 1
[22:28:33.424260] Reading 23 of 64 ../clean_data/business/09_8_business_clean.csv...
[22:28:34.090695] 	finished chunk 1 of 1
[22:28:34.090941] Reading 24 of 64 ../clean_data/business/34_X_business_clean.csv...
[22:28:34.756300] 	finished chunk 1 of 1
[22:28:34.756547] Reading 25 of 64 ../clean_data/business/31_U_business_clean.csv...
[22:28:35.393763] 	finished chunk 1 of 1
[22:28:35.394008] Reading 26 of 64 ../clean_data/business/05_4_business_clean.csv...
[22:28:36.059316] 	finished chunk 1 of 1
[22:28:36.059637] Reading 27 of 64 ../clean_data/business/50_m_business_clean.csv...
[22:28:36.705764] 	finished chunk 1 of 1
[22:28:36.706012] Reading 28 of 64 ../clean_data/business/59_v_business_clean

  interactivity=interactivity, compiler=compiler, result=result)


[22:28:40.646639] 	finished chunk 1 of 1
[22:28:40.646880] Reading 34 of 64 ../clean_data/business/19_I_business_clean.csv...
[22:28:41.301493] 	finished chunk 1 of 1
[22:28:41.301788] Reading 35 of 64 ../clean_data/business/43_f_business_clean.csv...
[22:28:41.950129] 	finished chunk 1 of 1
[22:28:41.950380] Reading 36 of 64 ../clean_data/business/28_R_business_clean.csv...
[22:28:42.582427] 	finished chunk 1 of 1
[22:28:42.583483] Reading 37 of 64 ../clean_data/business/38_a_business_clean.csv...
[22:28:43.272227] 	finished chunk 1 of 1
[22:28:43.272473] Reading 38 of 64 ../clean_data/business/54_q_business_clean.csv...
[22:28:43.896352] 	finished chunk 1 of 1
[22:28:43.896597] Reading 39 of 64 ../clean_data/business/12_B_business_clean.csv...
[22:28:44.543347] 	finished chunk 1 of 1
[22:28:44.543601] Reading 40 of 64 ../clean_data/business/62_y_business_clean.csv...
[22:28:45.214904] 	finished chunk 1 of 1
[22:28:45.215146] Reading 41 of 64 ../clean_data/business/04_3_business_clean

  interactivity=interactivity, compiler=compiler, result=result)


[22:28:58.293903] 	finished chunk 1 of 1
[22:28:58.294146] Reading 59 of 64 ../clean_data/business/58_u_business_clean.csv...
[22:28:59.035078] 	finished chunk 1 of 1
[22:28:59.035386] Reading 60 of 64 ../clean_data/business/20_J_business_clean.csv...
[22:28:59.728465] 	finished chunk 1 of 1
[22:28:59.728714] Reading 61 of 64 ../clean_data/business/42_e_business_clean.csv...
[22:29:00.405198] 	finished chunk 1 of 1
[22:29:00.405454] Reading 62 of 64 ../clean_data/business/08_7_business_clean.csv...
[22:29:01.109628] 	finished chunk 1 of 1
[22:29:01.109892] Reading 63 of 64 ../clean_data/business/03_2_business_clean.csv...
[22:29:01.925841] 	finished chunk 1 of 1
[22:29:01.927391] Reading 64 of 64 ../clean_data/business/52_o_business_clean.csv...
[22:29:02.614377] 	finished chunk 1 of 1
[22:29:02.614693] merging to dataframe...
[22:29:10.339327] reseting index...
[22:29:10.342177] Complete!


In [5]:
business.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156639 entries, 0 to 156638
Columns: 1347 entries, business_id to zoos
dtypes: float64(28), int64(1242), object(77)
memory usage: 1.6+ GB


## Manually selected list of all Restaurant, Bar, and Eatery columns

In [6]:
food_cols = ['acai_bowls', 'afghan', 'african', 'alsatian', 'american_(new)', 'american_(traditional)', 'arabian', 'argentine', 'armenian', 
             'asian_fusion', 'australian', 'austrian', 'ayurveda', 'baden', 'bagels', 'bakeries', 'bangladeshi', 
             'barbeque', 'bars', 'bavarian', 'beach_bars', 'bed_&_breakfast', 'beer', 'beer_bar', 'beer_garden', 
             'beer_gardens', 'beer_hall', 'belgian', 'bistros', 'brazilian', 'breakfast_&_brunch', 'breweries', 
             'british', 'bubble_tea', 'buffets', 'bulgarian', 'burgers', 'burmese', 'cajun/creole', 'canadian_(new)', 'cambodian', 
             'cantonese', 'caribbean', 'champagne_bars', 'cheese_shops', 'cheesesteaks', 'chicken_wings', 'chinese', 
             'chocolatiers_&_shops', 'cocktail_bars', 'coffee_&_tea', 'coffee_roasteries', 'coffeeshops', 'colombian', 
             'comfort_food', 'creperies', 'cuban', 'cupcakes', 'curry_sausage', 'czech', 'czech/slovakian', 
             'delicatessen', 'delis', 'desserts', 'dim_sum', 'diners', 'dive_bars', 'dominican', 'donuts', 
             'eastern_european', 'eastern_german', 'egyptian', 'empanadas', 'ethiopian', 'ethnic_food', 'falafel', 
             'fast_food', 'filipino', 'fischbroetchen', 'fish_&_chips', 'flatbread', 'fondue', 'food_court', 
             'food_stands', 'food_tours', 'food_trucks', 'french', 'friterie', 'gastropubs', 'gelato', 'georgian', 
             'german', 'gluten-free', 'greek', 'guamanian', 'hainan', 'haitian', 'hakka', 'halal', 'hawaiian', 
             'himalayan/nepalese', 'honduran', 'hong_kong_style_cafe', 'hot_dogs', 'hot_pot', 'hungarian', 'iberian', 
             'indian', 'indonesian', 'irish', 'irish_pub', 'island_pub', 'italian', 'izakaya', 'japanese', 'japanese_curry', 
             'juice_bars_&_smoothies', 'kebab', 'kombucha', 'korean', 'kosher', 'laotian', 'latin_american', 
             'lebanese', 'local_flavor', 'macarons', 'malaysian', 'mauritius', 'mediterranean', 'mexican', 
             'middle_eastern', 'milkshake_bars', 'minho', 'modern_european', 'mongolian', 'moroccan', 'muay_thai', 
             'new_mexican_cuisine', 'nicaraguan', 'noodles', 'olive_oil', 'oriental', 'paint_&_sip', 'pakistani', 
             'palatine', 'pan_asian', 'pasta_shops', 'patisserie/cake_shop', 'persian/iranian', 'peruvian', 'piano_bars', 
             'pita', 'pizza', 'poke', 'polish', 'pop-up_restaurants', 'popcorn_shops', 'portuguese', 'poutineries', 
             'pretzels', 'pub_food', 'pubs', 'puerto_rican', 'ramen', 'restaurants', 'rotisserie_chicken', 'russian', 
             'salad', 'salvadoran', 'sandwiches', 'scandinavian', 'scottish', 'seafood', 'senegalese', 'serbo_croatian', 
             'shanghainese', 'shaved_ice', 'singaporean', 'slovakian', 'soba', 'soul_food', 'soup', 'south_african', 
             'southern', 'spanish', 'speakeasies', 'specialty_food', 'sports_bars', 'sri_lankan', 'steakhouses', 
             'sugar_shacks', 'sushi_bars', 'swabian', 'swiss_food', 'syrian', 'szechuan', 'tacos', 'taiwanese', 
             'tapas_bars', 'tapas/small_plates', 'tea_rooms', 'tempura', 'teppanyaki', 'tex-mex', 'thai', 'themed_cafes', 
             'tiki_bars', 'tonkatsu', 'turkish', 'tuscan', 'udon', 'ukrainian', 'uzbek', 'vegan', 'vegetarian', 
             'venezuelan', 'vietnamese', 'waffles', 'whiskey_bars', 'wine_&_spirits', 'wine_bars', 'wok', 'wraps']

# Narrow down dataframe to only food and restaurants

In [50]:
time_marker('subset businesses with food and restaurant category columns set to 1...')
food_idx = list()
for ii, col in enumerate(food_cols):
    for idx in list(business[business[col] == 1].index):
        if idx not in food_idx:
            food_idx.append(idx)

restaurants = business.iloc[food_idx,:].copy()
time_marker('done')

[23:15:17.460588] collecting list of businesses with food and restaurant category columns set to 1...
[23:17:19.160134] done


In [51]:
time_marker('collecting list of businesses with non food and restaurant category columns...')
non_food_cols = list()
for col in list(restaurants.columns):
    if col not in food_cols:
        non_food_cols.append(col)
non_food_cols = non_food_cols[107:]

[23:18:53.272225] collecting list of businesses with non food and restaurant category columns...


In [52]:
time_marker('prune businesses with non food and restaurant category columns set to 1...')
non_food_idx = list()
for ii, col in enumerate(non_food_cols):
    for idx in list(restaurants[restaurants[col] == 1].index):
        if idx not in non_food_idx:
            non_food_idx.append(idx)

restaurants = restaurants.loc[~restaurants.index.isin(non_food_idx)]
time_marker('done')

[23:19:01.083985] collecting list of businesses with non food and restaurant category columns set to 1...
[23:19:19.502268] done


In [53]:
# restaurants = business.iloc[food_selection_index,:].copy()

time_marker('dropping columns of all zeros')
restaurants = restaurants.loc[:, (restaurants != 0).any(axis=0)].copy()

time_marker('dropping columns of all nan')
restaurants.replace('NaN', np.nan, inplace=True)
restaurants.replace(' NaN', np.nan, inplace=True)
restaurants.dropna(axis=1, how='all', inplace=True)

# reset index
time_marker('resetting index...')
restaurants.reset_index(inplace=True, drop=True)
time_marker('done')

[23:19:54.089772] dropping columns of all zeros
[23:19:55.950218] dropping columns of all nan
[23:20:00.503745] resetting index...
[23:20:00.504376] done


In [54]:
print('Restaurants                {:d}'.format(restaurants.shape[0]))
print('All businesses             {:d}'.format(business.shape[0]))
print('Ratio of Restaurants       {:2.4f}'.format((restaurants.shape[0]/business.shape[0] * 100.)))

Restaurants                33497
All businesses             156639
Ratio of Restaurants       21.3848


# Write to File

In [55]:
restaurants.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33497 entries, 0 to 33496
Columns: 252 entries, business_id to wraps
dtypes: float64(26), int64(158), object(68)
memory usage: 64.4+ MB


In [56]:
restaurants.head()

Unnamed: 0,business_id,address,city,is_open,latitude,longitude,name,neighborhood,postal_code,review_count,...,turkish,ukrainian,uzbek,vegan,vegetarian,venezuelan,vietnamese,waffles,wok,wraps
0,6Cl15LapBVhw4ZfBIeVJJA,"1674 N Higley Rd, Ste 104",Gilbert,0,33.381279,-111.724168,Rania's Kabob Grill,,85234,320,...,0,0,0,0,0,0,0,0,0,0
1,Pi7dsdOXqrMaC53V5TOqBQ,4929 W Ray Rd,Chandler,0,33.319526,-111.925763,Zaytoon Kabob Bistro,,85226,3,...,0,0,0,0,0,0,0,0,0,0
2,NCayC7-QUer-1m-UW_71Yg,1450 Kingston Road,Pickering,0,43.841453,-79.083996,Mahipar Kabab,,L1V 1C1,6,...,0,0,0,0,0,0,0,0,0,0
3,NSfAp377ZRl5Mee430KTcA,17310 Yonge Street,Newmarket,1,44.050083,-79.479545,Pita & Grill,,L3Y 7R8,13,...,0,0,0,0,0,0,0,0,0,0
4,ZCdXbef51G3yyjVEXB7AYw,3595 Sheppard Avenue E,Toronto,1,43.78084,-79.298867,Alanoor Afghan Kabob,Scarborough,M1T 3K8,15,...,0,0,0,0,0,0,0,0,0,0


In [62]:
time_marker('writing to file...')
restaurants.to_csv('../clean_data/_analysis/restaurants.csv', encoding=('utf-8'))
time_marker('done')

[23:21:44.524178] writing to file...
[23:21:50.240295] done


# Exploratorty Data Analysis

<p><b>1.</b> Number of restaurants in each category</p>

In [64]:
for ii, col in enumerate(restaurants.columns[96:]):
    df = restaurants[restaurants[col] == 1].copy()
    print('{} {}'.format(str(col).ljust(30), df.shape[0]))

afghan                         75
african                        88
american_(new)                 1709
american_(traditional)         2770
arabian                        38
argentine                      21
armenian                       6
asian_fusion                   1138
australian                     3
austrian                       7
baden                          1
bangladeshi                    12
barbeque                       932
bavarian                       10
beer_garden                    35
beer_hall                      2
belgian                        17
bistros                        47
brazilian                      47
breakfast_&_brunch             1936
british                        101
buffets                        639
bulgarian                      1
burgers                        2992
burmese                        8
cajun/creole                   167
cambodian                      27
canadian_(new)                 676
cantonese                      58
caribb

<p><b>2.</b> Number of open and closed restaurants</p>

In [65]:
print('Number of open restaurants   {:d}'.format(restaurants[restaurants.is_open == 1].shape[0]))
print('Number of closed restaurants {:d}'.format(restaurants[restaurants.is_open == 0].shape[0]))

Number of open restaurants   24454
Number of closed restaurants 9043


<p><b>3.</b> Unique values in non categorical columns</p>

In [68]:
# restaurants.columns[:96]
for ii, col in enumerate(restaurants.columns[:96]):
    print('{} {}\n{}'.format(str(col).ljust(30), len(restaurants[col].unique()),restaurants[col].unique()))
    print('-'*80)

business_id                    33497
['6Cl15LapBVhw4ZfBIeVJJA' 'Pi7dsdOXqrMaC53V5TOqBQ' 'NCayC7-QUer-1m-UW_71Yg'
 ..., 'oehWJ_dAlzU9C65b7zzKbQ' 'okop_ufDnpNze_l1SXOf2g'
 'oz-U184llqjVIt398hLntQ']
--------------------------------------------------------------------------------
address                        28897
['1674 N Higley Rd, Ste 104' '4929 W Ray Rd' '1450 Kingston Road' ...,
 '4th Floor, 33 Castle Street' '11729 Detroit Ave' '2155 E University Dr']
--------------------------------------------------------------------------------
city                           636
['Gilbert' 'Chandler' 'Pickering' 'Newmarket' 'Toronto' 'Mississauga'
 'Dollard-des-Ormeaux' 'Madison' 'North Olmsted' 'Las Vegas' 'Ajax'
 'Phoenix' 'Brampton' 'Vaughan' 'Pineville' 'Richmond Hill' 'Laval'
 'Montréal' 'Edinburgh' 'Mesa' 'Markham' 'East York' 'Whitby' 'Stuttgart'
 'Pittsburgh' 'Scarborough' 'Glendale' 'Charlotte' 'Montreal' 'Ludwigsburg'
 'Etobicoke' 'Champaign' 'Mercier' 'Oakville' 'Cleveland'
 'Clevelan

saturday_close                 62
['22:00:00' nan '02:00:00' '23:00:00' '00:00:00' '20:00:00' '21:00:00'
 '21:30:00' '22:30:00' '23:30:00' '01:00:00' '20:30:00' '15:00:00'
 '02:30:00' '18:00:00' '19:00:00' '03:00:00' '04:00:00' '20:45:00'
 '14:00:00' '01:30:00' '16:00:00' '12:00:00' '17:00:00' '19:30:00'
 '14:30:00' '05:00:00' '06:00:00' '08:00:00' '00:30:00' '22:15:00'
 '16:30:00' '11:00:00' '23:45:00' '05:30:00' '13:00:00' '15:30:00'
 '17:30:00' '12:30:00' '04:30:00' '10:30:00' '13:30:00' '11:15:00'
 '23:59:00' '18:30:00' '22:45:00' '03:30:00' '06:30:00' '09:00:00'
 '07:00:00' '01:45:00' '08:30:00' '03:59:00' '21:45:00' '23:15:00'
 '01:10:00' '10:00:00' '11:30:00' '02:45:00' '15:45:00' '03:45:00'
 '00:45:00']
--------------------------------------------------------------------------------
sunday_open                    50
['11:00:00' nan '16:00:00' '11:30:00' '12:00:00' '13:00:00' '17:00:00'
 '08:00:00' '14:00:00' '15:00:00' '10:00:00' '18:00:00' '17:30:00'
 '06:30:00' '12:30:00' '16

[nan False]
--------------------------------------------------------------------------------
music_live                     2
[nan False]
--------------------------------------------------------------------------------
music_no_music                 2
[nan False]
--------------------------------------------------------------------------------
music_video                    2
[nan False]
--------------------------------------------------------------------------------
noiselevel                     5
['average' nan 'quiet' 'loud' 'very_loud']
--------------------------------------------------------------------------------
open24hours                    3
[ nan   0.   1.]
--------------------------------------------------------------------------------
outdoorseating                 3
[  0.   1.  nan]
--------------------------------------------------------------------------------
restaurantsattire              4
['casual' nan 'dressy' 'formal']
--------------------------------------------

<p><b>4.</b> Number of locations grouped by name</p>

In [80]:
tmp = restaurants.groupby('name').count()['business_id'].to_frame()
tmp = tmp[tmp.business_id > 1]
tmp.sort_values(['business_id'], ascending=False)[:20]

Unnamed: 0_level_0,business_id
name,Unnamed: 1_level_1
McDonald's,618
Subway,578
Taco Bell,276
Pizza Hut,269
Burger King,247
Wendy's,228
Chipotle Mexican Grill,164
Domino's Pizza,160
KFC,140
Panda Express,123
