# Yelp Business Wrangling

In [1]:
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import datetime

import json

import seaborn as sns
sns.set()
# sns.set_style('whitegrid')
# sns.set_context("poster")

In [2]:
# Widen the notebook container so wide DataFrame previews are easier to read.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

# Load Business Data

In [3]:
# Load the business file into `biz_df`, printing a timestamp before each stage.
print('[%s] Loading Business Data...' % datetime.datetime.now().time())

# NOTE(review): `data` is not used anywhere in the visible notebook; it is kept
# only in case a later cell reads it -- confirm and remove if unused.
data = pd.DataFrame()
source_data_file = '../source_data/business.json'

# Each line of the source file is parsed as its own JSON document.
# The `with` block guarantees the file handle is closed even if parsing fails;
# blank lines are skipped because `json.loads('')` raises.
with open(source_data_file, 'r') as source_file:
    biz_list = [json.loads(line) for line in source_file if line.strip()]

print('[%s] creating dataframe...' % datetime.datetime.now().time())
biz_df = pd.DataFrame(biz_list)

print('[%s] data type cleanup...' % datetime.datetime.now().time())
# No dtype conversions are applied to the business frame at this stage.

print('[%s] Complete!' % datetime.datetime.now().time())

[16:07:01.452429] Loading Business Data...
[16:07:21.019387] creating dataframe...
[16:07:24.057651] data type cleanup...
[16:07:24.057939] Complete!


In [4]:
# Print the column names, non-null counts, dtypes and memory usage of the raw business frame.
biz_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156639 entries, 0 to 156638
Data columns (total 15 columns):
address         156639 non-null object
attributes      156639 non-null object
business_id     156639 non-null object
categories      156639 non-null object
city            156639 non-null object
hours           156639 non-null object
is_open         156639 non-null int64
latitude        156638 non-null float64
longitude       156638 non-null float64
name            156639 non-null object
neighborhood    156639 non-null object
postal_code     156639 non-null object
review_count    156639 non-null int64
stars           156639 non-null float64
state           156639 non-null object
dtypes: float64(3), int64(2), object(10)
memory usage: 17.9+ MB


In [5]:
# Preview the first rows; `attributes` and `hours` still hold nested dicts here.
biz_df.head()

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,neighborhood,postal_code,review_count,stars,state
0,691 Richmond Rd,"{'RestaurantsPriceRange2': 2, 'BusinessParking...",YDf95gJZaq05wvo7hTQbbQ,"[Shopping, Shopping Centers]",Richmond Heights,"{'Monday': '10:00-21:00', 'Tuesday': '10:00-21...",1,41.541716,-81.493116,Richmond Town Square,,44143,17,2.0,OH
1,2824 Milton Rd,"{'GoodForMeal': {'dessert': False, 'latenight'...",mLwM-h2YhXl2NCgdS84_Bw,"[Food, Soul Food, Convenience Stores, Restaura...",Charlotte,"{'Monday': '10:00-22:00', 'Tuesday': '10:00-22...",0,35.23687,-80.741976,South Florida Style Chicken & Ribs,Eastland,28215,4,4.5,NC
2,337 Danforth Avenue,"{'BusinessParking': {'garage': False, 'street'...",v2WhjAB3PIBA8J8VxG3wEg,"[Food, Coffee & Tea]",Toronto,"{'Monday': '10:00-19:00', 'Tuesday': '10:00-19...",0,43.677126,-79.353285,The Tea Emporium,Riverdale,M4K 1N7,7,4.5,ON
3,"7702 E Doubletree Ranch Rd, Ste 300",{},CVtCbSB1zUcUWg-9TNGTuQ,"[Professional Services, Matchmakers]",Scottsdale,"{'Friday': '9:00-17:00', 'Tuesday': '9:00-17:0...",1,33.565082,-111.9164,TRUmatch,,85258,3,3.0,AZ
4,4719 N 20Th St,"{'RestaurantsTableService': False, 'GoodForMea...",duHFBe87uNSXImQmvBh87Q,"[Sandwiches, Restaurants]",Phoenix,{},0,33.505928,-112.038847,Blimpie,,85016,10,4.5,AZ


In [6]:
def unpack(df, column, fillna=None):
    """Expand a column of dicts into one column per dict key.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing `column`. It is not modified.
    column : str
        Name of the column whose cells are dicts. A cell that is not a dict
        (e.g. None or NaN) is treated as an empty dict and yields missing
        values in every expanded column.
    fillna : scalar, optional
        If given, missing values in the *expanded* columns only are replaced
        with this value. Columns already in `df` are left untouched.

    Returns
    -------
    pandas.DataFrame
        A new frame with `column` removed and the expanded key columns
        appended on the right, sharing the index of `df`.
    """
    # Iterate the Series directly: `Series.iteritems` was removed in pandas 2.0.
    records = [cell if isinstance(cell, dict) else {} for cell in df[column]]

    # Reuse df's index so the axis=1 concat pairs each expanded row with its
    # source row even when df does not carry a default 0..n-1 index.
    expanded = pd.DataFrame(records, index=df.index)
    if fillna is not None:
        expanded = expanded.fillna(fillna)

    return pd.concat([df.drop(column, axis=1), expanded], axis=1)

# Flatten both nested dict columns: `attributes` first, then `hours`.
unpack_biz_df = unpack(unpack(biz_df, 'attributes'), 'hours')

In [7]:
# Preview the flattened frame after `attributes` and `hours` were unpacked.
unpack_biz_df.head()

Unnamed: 0,address,business_id,categories,city,is_open,latitude,longitude,name,neighborhood,postal_code,...,Smoking,WheelchairAccessible,WiFi,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
0,691 Richmond Rd,YDf95gJZaq05wvo7hTQbbQ,"[Shopping, Shopping Centers]",Richmond Heights,1,41.541716,-81.493116,Richmond Town Square,,44143,...,,True,,10:00-21:00,10:00-21:00,10:00-21:00,11:00-18:00,10:00-21:00,10:00-21:00,10:00-21:00
1,2824 Milton Rd,mLwM-h2YhXl2NCgdS84_Bw,"[Food, Soul Food, Convenience Stores, Restaura...",Charlotte,0,35.23687,-80.741976,South Florida Style Chicken & Ribs,Eastland,28215,...,,,,10:00-22:00,10:00-22:00,10:00-22:00,10:00-22:00,10:00-22:00,10:00-22:00,10:00-22:00
2,337 Danforth Avenue,v2WhjAB3PIBA8J8VxG3wEg,"[Food, Coffee & Tea]",Toronto,0,43.677126,-79.353285,The Tea Emporium,Riverdale,M4K 1N7,...,,True,no,10:00-19:00,10:00-19:00,10:00-18:00,12:00-17:00,10:00-19:00,10:00-19:00,10:00-19:00
3,"7702 E Doubletree Ranch Rd, Ste 300",CVtCbSB1zUcUWg-9TNGTuQ,"[Professional Services, Matchmakers]",Scottsdale,1,33.565082,-111.9164,TRUmatch,,85258,...,,,,9:00-17:00,9:00-17:00,,,9:00-17:00,9:00-17:00,9:00-17:00
4,4719 N 20Th St,duHFBe87uNSXImQmvBh87Q,"[Sandwiches, Restaurants]",Phoenix,0,33.505928,-112.038847,Blimpie,,85016,...,,,no,,,,,,,


In [8]:
# Print the column names, non-null counts and dtypes of the flattened frame.
unpack_biz_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156639 entries, 0 to 156638
Data columns (total 59 columns):
address                       156639 non-null object
business_id                   156639 non-null object
categories                    156639 non-null object
city                          156639 non-null object
is_open                       156639 non-null int64
latitude                      156638 non-null float64
longitude                     156638 non-null float64
name                          156639 non-null object
neighborhood                  156639 non-null object
postal_code                   156639 non-null object
review_count                  156639 non-null int64
stars                         156639 non-null float64
state                         156639 non-null object
AcceptsInsurance              8576 non-null object
AgesAllowed                   395 non-null object
Alcohol                       44240 non-null object
Ambience                      42900 non-null o

In [9]:
# Write one CSV per distinct star rating, in ascending rating order.
for stars in sorted(unpack_biz_df.stars.unique()):
    # Select the businesses with this rating and renumber the rows from 0.
    # The non-inplace `reset_index` returns a new frame instead of mutating
    # the filtered selection.
    star_biz_df = unpack_biz_df[unpack_biz_df.stars == stars].reset_index(drop=True)

    # The rating's decimal point is dropped for the file name, e.g. 4.5 -> '45'.
    csv_path = '../clean_data/business/%s_star_businesses.csv' % str(stars).replace('.', '')
    star_biz_df.to_csv(csv_path)
    print(csv_path)
    
    

../clean_data/business/10_star_businesses.csv
../clean_data/business/15_star_businesses.csv
../clean_data/business/20_star_businesses.csv
../clean_data/business/25_star_businesses.csv
../clean_data/business/30_star_businesses.csv
../clean_data/business/35_star_businesses.csv
../clean_data/business/40_star_businesses.csv
../clean_data/business/45_star_businesses.csv
../clean_data/business/50_star_businesses.csv
