# Yelp Business Wrangling

In [1]:
%matplotlib inline

import datetime
import json
import os

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
sns.set()
# sns.set_style('whitegrid')
# sns.set_context("poster")

In [2]:
# Widen the notebook container to 95% of the browser window so wide
# DataFrame previews are easier to read.
# `IPython.display` is the public import path; importing `display` from
# `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

# Load Business Data

In [3]:
print('[%s] Loading Business Data...' % datetime.datetime.now().time())

# NOTE(review): `data` is not referenced by any visible cell; kept only in
# case a later cell relies on the name existing -- confirm and remove.
data = pd.DataFrame()
source_data_file = '../source_data/business.json'

# The source file is parsed line by line: each non-blank line is one JSON
# object. The `with` block guarantees the handle is closed, the explicit
# encoding avoids platform-dependent decoding, and blank lines (e.g. a
# trailing newline) are skipped instead of crashing json.loads.
with open(source_data_file, 'r', encoding='utf-8') as source:
    biz_list = [json.loads(line) for line in source if line.strip()]

print('[%s] creating dataframe...' % datetime.datetime.now().time())
biz_df = pd.DataFrame(biz_list)

print('[%s] Complete!' % datetime.datetime.now().time())

[03:47:55.135078] Loading Business Data...
[03:48:05.236833] creating dataframe...
[03:48:06.661212] Complete!


In [4]:
# Preview the first rows of the freshly loaded business frame.
biz_df.head()

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,neighborhood,postal_code,review_count,stars,state
0,691 Richmond Rd,"{'RestaurantsPriceRange2': 2, 'BusinessParking...",YDf95gJZaq05wvo7hTQbbQ,"[Shopping, Shopping Centers]",Richmond Heights,"{'Monday': '10:00-21:00', 'Tuesday': '10:00-21...",1,41.541716,-81.493116,Richmond Town Square,,44143,17,2.0,OH
1,2824 Milton Rd,"{'GoodForMeal': {'dessert': False, 'latenight'...",mLwM-h2YhXl2NCgdS84_Bw,"[Food, Soul Food, Convenience Stores, Restaura...",Charlotte,"{'Monday': '10:00-22:00', 'Tuesday': '10:00-22...",0,35.23687,-80.741976,South Florida Style Chicken & Ribs,Eastland,28215,4,4.5,NC
2,337 Danforth Avenue,"{'BusinessParking': {'garage': False, 'street'...",v2WhjAB3PIBA8J8VxG3wEg,"[Food, Coffee & Tea]",Toronto,"{'Monday': '10:00-19:00', 'Tuesday': '10:00-19...",0,43.677126,-79.353285,The Tea Emporium,Riverdale,M4K 1N7,7,4.5,ON
3,"7702 E Doubletree Ranch Rd, Ste 300",{},CVtCbSB1zUcUWg-9TNGTuQ,"[Professional Services, Matchmakers]",Scottsdale,"{'Friday': '9:00-17:00', 'Tuesday': '9:00-17:0...",1,33.565082,-111.9164,TRUmatch,,85258,3,3.0,AZ
4,4719 N 20Th St,"{'RestaurantsTableService': False, 'GoodForMea...",duHFBe87uNSXImQmvBh87Q,"[Sandwiches, Restaurants]",Phoenix,{},0,33.505928,-112.038847,Blimpie,,85016,10,4.5,AZ


# Separate `hours` column into daily open and close columns

In [5]:
def _clock_to_decimal_hours(clock):
    """Convert a Series of 'H:MM' strings to fractional hours on a 24-hour clock.

    For example '17:30' -> 17.5. Missing values stay NaN.
    """
    # reindex guarantees both pieces exist even if no value contained a ':'.
    parts = clock.str.split(':', n=1, expand=True).reindex(columns=[0, 1])
    return parts[0].astype('float') + parts[1].astype('float') / 60.


# Expand the per-business `hours` dict into one column per day name, each
# holding an 'H:MM-H:MM' string (NaN where a business lists nothing for that
# day). Building the frame from a list of dicts is far cheaper than
# `.apply(pd.Series)`; non-dict entries are treated as "no hours listed".
daily_hours = pd.DataFrame(
    [hours if isinstance(hours, dict) else {} for hours in biz_df['hours']],
    index=biz_df.index,
)

# Turn every day column into numeric '{day}_open' / '{day}_close' columns.
biz_hours = biz_df[['business_id']].copy()
for day in daily_hours.columns:
    # '11:00-19:00' -> column 0 = '11:00' (open), column 1 = '19:00' (close).
    # `n` is passed by keyword and `expand=True` replaces tuple-unpacking the
    # `.str` accessor, both of which newer pandas releases reject.
    open_close = daily_hours[day].str.split('-', n=1, expand=True).reindex(columns=[0, 1])
    biz_hours['{}_open'.format(day.lower())] = _clock_to_decimal_hours(open_close[0])
    biz_hours['{}_close'.format(day.lower())] = _clock_to_decimal_hours(open_close[1])

# NOTE(review): filling with 0 makes "no hours listed" indistinguishable from
# a genuine 0:00 open/close time; kept as-is so the exported data is unchanged.
biz_hours = biz_hours.fillna(0)

# Merge the numeric hours back onto the business frame and drop the original
# dict-valued `hours` column.
# NOTE(review): this rebinds `biz_df`, so re-running this cell without first
# re-running the load cell fails because `hours` is already gone.
biz_df = biz_df.merge(biz_hours, on='business_id').drop(['hours'], axis=1)

# Unpack `attributes` column into separate columns

In [6]:
def unpack(df, column, fillna=None):
    """Expand a column of dicts into one new column per dict key.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``. It is not modified.
    column : str
        Name of the column whose values are dicts. Non-dict values
        (e.g. None or NaN) are treated as empty dicts.
    fillna : scalar, optional
        When given, missing entries in the *unpacked* columns are replaced
        with this value. The pre-existing columns of ``df`` are untouched.

    Returns
    -------
    pandas.DataFrame
        The columns of ``df`` without ``column``, followed by one column per
        key found in the dicts.
    """
    records = [value if isinstance(value, dict) else {} for value in df[column]]
    # Reuse df's index so the axis=1 concat pairs rows positionally even when
    # df has been filtered or sorted and no longer carries a 0..n-1 index.
    unpacked = pd.DataFrame(records, index=df.index)
    if fillna is not None:
        unpacked = unpacked.fillna(fillna)
    return pd.concat([df.drop([column], axis=1), unpacked], axis=1)

# Expand the `attributes` column into separate columns; `fillna` is left at
# its default (None), so no fill value is applied to the new columns.
unpacked_biz_df = unpack(biz_df, 'attributes')

In [7]:
# Preview the first rows of the frame after unpacking `attributes`.
unpacked_biz_df.head()

Unnamed: 0,address,business_id,categories,city,is_open,latitude,longitude,name,neighborhood,postal_code,...,RestaurantsCounterService,RestaurantsDelivery,RestaurantsGoodForGroups,RestaurantsPriceRange2,RestaurantsReservations,RestaurantsTableService,RestaurantsTakeOut,Smoking,WheelchairAccessible,WiFi
0,691 Richmond Rd,YDf95gJZaq05wvo7hTQbbQ,"[Shopping, Shopping Centers]",Richmond Heights,1,41.541716,-81.493116,Richmond Town Square,,44143,...,,,,2.0,,,,,True,
1,2824 Milton Rd,mLwM-h2YhXl2NCgdS84_Bw,"[Food, Soul Food, Convenience Stores, Restaura...",Charlotte,0,35.23687,-80.741976,South Florida Style Chicken & Ribs,Eastland,28215,...,,True,True,2.0,False,,True,,,
2,337 Danforth Avenue,v2WhjAB3PIBA8J8VxG3wEg,"[Food, Coffee & Tea]",Toronto,0,43.677126,-79.353285,The Tea Emporium,Riverdale,M4K 1N7,...,,,,2.0,,,,,True,no
3,"7702 E Doubletree Ranch Rd, Ste 300",CVtCbSB1zUcUWg-9TNGTuQ,"[Professional Services, Matchmakers]",Scottsdale,1,33.565082,-111.9164,TRUmatch,,85258,...,,,,,,,,,,
4,4719 N 20Th St,duHFBe87uNSXImQmvBh87Q,"[Sandwiches, Restaurants]",Phoenix,0,33.505928,-112.038847,Blimpie,,85016,...,,False,True,1.0,False,False,True,,,no


In [9]:
# make all columns lower case
unpacked_biz_df.columns = [str(x).lower() for x in unpacked_biz_df.columns]

# Create the output directory up front so to_csv does not fail when it is
# missing (e.g. on a fresh checkout).
out_dir = '../clean_data/business'
os.makedirs(out_dir, exist_ok=True)

# Write one CSV per star rating. groupby yields the ratings in ascending
# order and skips rows whose rating is NaN, whereas sorted() over values
# containing NaN is unreliable and `== NaN` never matches any row.
for stars, star_biz_df in unpacked_biz_df.groupby('stars'):
    # reset_index returns a new frame instead of mutating a slice in place.
    star_biz_df = star_biz_df.reset_index(drop=True)
    # The rating's decimal point is stripped for the file name, e.g. 4.5 -> '45'.
    out_path = '%s/%s_star_businesses.csv' % (out_dir, str(stars).replace('.', ''))
    star_biz_df.to_csv(out_path)
    print(out_path)
    
    

../clean_data/business/10_star_businesses.csv
../clean_data/business/15_star_businesses.csv
../clean_data/business/20_star_businesses.csv
../clean_data/business/25_star_businesses.csv
../clean_data/business/30_star_businesses.csv
../clean_data/business/35_star_businesses.csv
../clean_data/business/40_star_businesses.csv
../clean_data/business/45_star_businesses.csv
../clean_data/business/50_star_businesses.csv
