# Yelp Business Wrangling

In [7]:
import numpy as np
import pandas as pd
import datetime
import json

DRY_RUN = False

In [8]:
# Widen the notebook container to 95% of the browser window so wide
# DataFrames are easier to read.
# `display` and `HTML` come from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [9]:
# Day names, ordered Monday through Sunday.
day_labels = [
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
    'Saturday',
    'Sunday',
]

def time_marker(text=''):
    """Print ``text`` in title case, prefixed by the current time of day in brackets."""
    now = datetime.datetime.now().time()
    label = text.title()
    print('[{}] {}'.format(now, label))

def unpack(df, column, fillna=None):
    """Expand a column of dicts into one column per dict key.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``. It is not modified.
    column : label
        Name of the column whose cells are dicts.
    fillna : scalar, optional
        When given, missing values in the *unpacked* columns only are
        replaced with this value. ``None`` (the default) leaves them as NaN.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``column`` removed and the unpacked columns
        appended on the right, carrying the same index as ``df``.
    """
    # Cells that are not dicts (e.g. None/NaN) contribute no keys instead of
    # breaking the DataFrame constructor.
    records = [cell if isinstance(cell, dict) else {} for cell in df[column].tolist()]

    # Re-use df's index so the column-wise concat below pairs each row with its
    # own unpacked values even when df does not have a default 0..n-1 index.
    unpacked = pd.DataFrame(records, index=df.index)
    if fillna is not None:
        unpacked = unpacked.fillna(fillna)

    # Drop the source column before concatenating so that an unpacked key with
    # the same name as `column` is not removed along with it.
    return pd.concat([df.drop(column, axis=1), unpacked], axis=1)

# Load Business Data

In [5]:
time_marker(text='Loading Business Info Data...')

# NOTE(review): `data` is not referenced anywhere else in the visible notebook;
# kept only in case another cell relies on it. Confirm and remove.
data = pd.DataFrame()
source_data_file = '../source_data/business.json'

# Each non-blank line of the file is parsed as one JSON document. The file is
# opened in a `with` block so the handle is always closed, and read as UTF-8
# to match the encoding used when the cleaned CSVs are written.
biz_list = []
with open(source_data_file, 'r', encoding='utf-8') as source_file:
    for line in source_file:
        if line.strip():
            biz_list.append(json.loads(line))

time_marker(text='creting dataframe...')
biz_df = pd.DataFrame(biz_list)

biz_df.head(3)

[20:52:48.057231] Loading Business Info Data...
[20:52:56.436567] Creting Dataframe...


Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,neighborhood,postal_code,review_count,stars,state
0,691 Richmond Rd,"{'RestaurantsPriceRange2': 2, 'BusinessParking...",YDf95gJZaq05wvo7hTQbbQ,"[Shopping, Shopping Centers]",Richmond Heights,"{'Monday': '10:00-21:00', 'Tuesday': '10:00-21...",1,41.541716,-81.493116,Richmond Town Square,,44143,17,2.0,OH
1,2824 Milton Rd,"{'GoodForMeal': {'dessert': False, 'latenight'...",mLwM-h2YhXl2NCgdS84_Bw,"[Food, Soul Food, Convenience Stores, Restaura...",Charlotte,"{'Monday': '10:00-22:00', 'Tuesday': '10:00-22...",0,35.23687,-80.741976,South Florida Style Chicken & Ribs,Eastland,28215,4,4.5,NC
2,337 Danforth Avenue,"{'BusinessParking': {'garage': False, 'street'...",v2WhjAB3PIBA8J8VxG3wEg,"[Food, Coffee & Tea]",Toronto,"{'Monday': '10:00-19:00', 'Tuesday': '10:00-19...",0,43.677126,-79.353285,The Tea Emporium,Riverdale,M4K 1N7,7,4.5,ON


In [6]:
# Summarize the loaded frame: column dtypes, non-null counts and memory usage.
biz_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156639 entries, 0 to 156638
Data columns (total 15 columns):
address         156639 non-null object
attributes      156639 non-null object
business_id     156639 non-null object
categories      156639 non-null object
city            156639 non-null object
hours           156639 non-null object
is_open         156639 non-null int64
latitude        156638 non-null float64
longitude       156638 non-null float64
name            156639 non-null object
neighborhood    156639 non-null object
postal_code     156639 non-null object
review_count    156639 non-null int64
stars           156639 non-null float64
state           156639 non-null object
dtypes: float64(3), int64(2), object(10)
memory usage: 17.9+ MB


# Separate `hours` column into daily open and close columns

In [10]:
def _hours_to_decimal(hours_range):
    """Convert an 'HH:MM-HH:MM' string into ``(open, close)`` decimal hours.

    Minutes become a fraction of an hour on the 24-hour clock, e.g.
    '11:00-17:30' -> (11.0, 17.5). A value that is not a string (the day is
    absent from the business's hours) yields ``(nan, nan)``.
    """
    if not isinstance(hours_range, str):
        return np.nan, np.nan
    opening, closing = hours_range.split('-', 1)
    open_hour, open_minute = opening.split(':', 1)
    close_hour, close_minute = closing.split(':', 1)
    return (float(open_hour) + float(open_minute) / 60.,
            float(close_hour) + float(close_minute) / 60.)


time_marker(text='collecting business Hours...')
# One dict of {day name: 'HH:MM-HH:MM'} per business; anything that is not a
# dict is treated as "no hours listed".
hours_by_business = [h if isinstance(h, dict) else {} for h in biz_df['hours']]
biz_hours = biz_df[['business_id']].copy()

time_marker(text='Split hours into open and close...')
# Build '{day}_open' and '{day}_close' columns for every day in `day_labels`.
# Iterating `day_labels` (instead of whatever keys happen to appear in the data)
# fixes the column order to Monday..Sunday and guarantees all days are present.
# Parsing is done in plain Python so it does not rely on tuple-unpacking the
# pandas `.str` accessor.
for day in day_labels:
    parsed = [_hours_to_decimal(hours.get(day)) for hours in hours_by_business]
    prefix = day.lower()
    biz_hours['{}_open'.format(prefix)] = [pair[0] for pair in parsed]
    biz_hours['{}_close'.format(prefix)] = [pair[1] for pair in parsed]

# Days without listed hours become 0 for both open and close.
biz_hours = biz_hours.fillna(0)

# merge back to original data frame
time_marker(text='merge open and close hours to business data...')
biz_df = biz_df.merge(biz_hours, on='business_id')

# drop the original 'hours' column of dicts
biz_df = biz_df.drop(['hours'], axis=1)

[20:56:47.378855] Collecting Business Hours...
[20:58:03.893685] Split Hours Into Open And Close...
[20:58:21.107357] Merge Open And Close Hours To Business Data...


# Unpack `attributes` column into separate columns

In [11]:
# Expand the `attributes` column into one column per attribute key.
unpacked_biz_df = unpack(df=biz_df, column='attributes')

In [12]:
time_marker(text='cleaning up and reset index...')
# Replace the current index with a fresh 0..n-1 range, discarding the old one.
unpacked_biz_df = unpacked_biz_df.reset_index(drop=True)

[20:58:24.335261] Cleaning Up And Reset Index...


In [13]:
# Normalize every column label to a lower-case string.
unpacked_biz_df.columns = list(map(lambda label: str(label).lower(), unpacked_biz_df.columns))

In [None]:

time_marker(text='Writing to files...')

# Write one CSV per distinct star rating.
# NOTE(review): this exports `biz_df`, so the unpacked / lower-cased
# `unpacked_biz_df` built in the preceding cells is never written out.
# Confirm which frame is meant to be exported.
for rating in biz_df.stars.unique():
    rating_df = biz_df[biz_df.stars == rating].reset_index(drop=True)

    file_name = '../clean_data/business/{}_star_business_clean.csv'.format(rating)
    time_marker(text='Writing {} rated records file...'.format(rating))
    # DRY_RUN exercises the loop and logging without touching the disk.
    if not DRY_RUN:
        rating_df.to_csv(file_name, encoding='utf-8')