# Yelp Business Wrangling

In [1]:
import numpy as np
import pandas as pd
import datetime
import json

# When True, the final export cell skips writing the CSV files.
DRY_RUN = False

In [2]:
# Widen the notebook container so wide DataFrames are easier to read.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [3]:
# Weekday names in calendar order; used to order the '{day}_open' / '{day}_close' columns.
day_labels = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

def time_marker(text=''):
    """Print a progress line of the form ``[<current time>] <Text In Title Case>``."""
    stamp = datetime.datetime.now().time()
    label = text.title()
    print(f'[{stamp}] {label}')

def unpack(df, column, fillna=None):
    """Expand a column of dicts into one column per dict key.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``. It is not modified.
    column : label
        Name of the column whose cells are dicts.
    fillna : scalar, optional
        When given, missing values in the unpacked columns (only) are
        replaced with this value.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns except ``column``, followed by
        the unpacked key columns. Row labels are those of ``df``.
    """
    # Cells that are not dicts (e.g. None/NaN) contribute no keys.
    records = [cell if isinstance(cell, dict) else {} for cell in df[column]]

    # Reuse df's index so concat lines rows up by label rather than by a
    # fresh 0..n-1 range.
    unpacked = pd.DataFrame(records, index=df.index)
    if fillna is not None:
        unpacked = unpacked.fillna(fillna)

    return pd.concat([df.drop(columns=[column]), unpacked], axis=1)

# Load Business Data

In [4]:
time_marker(text='Loading Business Info Data...')

# Empty placeholder frame; no cell visible in this notebook reads it.
data = pd.DataFrame()
source_data_file = '../source_data/business.json'

# The source is JSON Lines: one business record per line.
# The context manager closes the file, and the explicit encoding avoids
# platform-dependent decoding of non-ASCII business names.
with open(source_data_file, 'r', encoding='utf-8') as source_file:
    biz_list = [json.loads(line) for line in source_file if line.strip()]

time_marker(text='creating dataframe...')
biz_df = pd.DataFrame(biz_list)

time_marker(text='set index to business_id...')
biz_df = biz_df.set_index('business_id', drop=True)

biz_df.head(3)

[16:37:03.293527] Loading Business Info Data...
[16:37:10.724892] Creating Dataframe...
[16:37:11.807155] Set Index To Business_Id...


Unnamed: 0_level_0,address,attributes,categories,city,hours,is_open,latitude,longitude,name,neighborhood,postal_code,review_count,stars,state
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
YDf95gJZaq05wvo7hTQbbQ,691 Richmond Rd,"{'RestaurantsPriceRange2': 2, 'BusinessParking...","[Shopping, Shopping Centers]",Richmond Heights,"{'Monday': '10:00-21:00', 'Tuesday': '10:00-21...",1,41.541716,-81.493116,Richmond Town Square,,44143,17,2.0,OH
mLwM-h2YhXl2NCgdS84_Bw,2824 Milton Rd,"{'GoodForMeal': {'dessert': False, 'latenight'...","[Food, Soul Food, Convenience Stores, Restaura...",Charlotte,"{'Monday': '10:00-22:00', 'Tuesday': '10:00-22...",0,35.23687,-80.741976,South Florida Style Chicken & Ribs,Eastland,28215,4,4.5,NC
v2WhjAB3PIBA8J8VxG3wEg,337 Danforth Avenue,"{'BusinessParking': {'garage': False, 'street'...","[Food, Coffee & Tea]",Toronto,"{'Monday': '10:00-19:00', 'Tuesday': '10:00-19...",0,43.677126,-79.353285,The Tea Emporium,Riverdale,M4K 1N7,7,4.5,ON


# Separate `hours` column into daily open and close columns

In [5]:
time_marker(text='collecting business Hours...')
# One dict per business, e.g. {'Monday': '10:00-21:00', ...}.
# Cells that are not dicts are treated as "no hours listed".
hours_records = [h if isinstance(h, dict) else {} for h in biz_df['hours']]

time_marker('splitting hours into individual columns...')
# Pull each weekday out with plain dict lookups instead of the per-row
# `apply(pd.Series)`, which builds one Series object per business.
daily_hours = {
    day: pd.Series([h.get(day) for h in hours_records], index=biz_df.index, dtype=object)
    for day in day_labels
}

time_marker(text='Split hours into open and close...')
# split each 'HH:MM-HH:MM' string into '{day}_open' and '{day}_close' time columns
open_close = {}
for day in day_labels:
    # `n` must be passed by keyword: pandas 2.x no longer accepts it positionally.
    bounds = daily_hours[day].str.split('-', n=1)
    open_close['{}_open'.format(day.lower())] = pd.to_datetime(bounds.str[0], format='%H:%M').dt.time
    open_close['{}_close'.format(day.lower())] = pd.to_datetime(bounds.str[1], format='%H:%M').dt.time

time_marker('sorting day columns order...')
# Columns come out as monday_open, monday_close, ..., sunday_close because
# the dict was filled in `day_labels` order; a weekday that no business lists
# still gets an (all-missing) pair of columns instead of raising KeyError.
biz_hours = pd.DataFrame(open_close, index=biz_df.index)

# merge back to original data frame
time_marker(text='merge open and close hours to business data...')
# Both frames share the same index, so a column-wise concat aligns row for row.
biz_df = pd.concat([biz_df.drop(columns=['hours']), biz_hours], axis=1)

biz_df.head(10)

[16:37:14.768932] Collecting Business Hours...
[16:37:14.786055] Splitting Hours Into Individual Columns...
[16:38:29.307331] Split Hours Into Open And Close...
[21:40:09.775531] Sorting Day Columns Order...
[21:40:10.034447] Merge Open And Close Hours To Business Data...


Unnamed: 0_level_0,address,attributes,categories,city,is_open,latitude,longitude,name,neighborhood,postal_code,...,wednesday_open,wednesday_close,thursday_open,thursday_close,friday_open,friday_close,saturday_open,saturday_close,sunday_open,sunday_close
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
YDf95gJZaq05wvo7hTQbbQ,691 Richmond Rd,"{'RestaurantsPriceRange2': 2, 'BusinessParking...","[Shopping, Shopping Centers]",Richmond Heights,1,41.541716,-81.493116,Richmond Town Square,,44143,...,10:00:00,21:00:00,10:00:00,21:00:00,10:00:00,21:00:00,10:00:00,21:00:00,11:00:00,18:00:00
mLwM-h2YhXl2NCgdS84_Bw,2824 Milton Rd,"{'GoodForMeal': {'dessert': False, 'latenight'...","[Food, Soul Food, Convenience Stores, Restaura...",Charlotte,0,35.23687,-80.741976,South Florida Style Chicken & Ribs,Eastland,28215,...,10:00:00,22:00:00,10:00:00,22:00:00,10:00:00,22:00:00,10:00:00,22:00:00,10:00:00,22:00:00
v2WhjAB3PIBA8J8VxG3wEg,337 Danforth Avenue,"{'BusinessParking': {'garage': False, 'street'...","[Food, Coffee & Tea]",Toronto,0,43.677126,-79.353285,The Tea Emporium,Riverdale,M4K 1N7,...,10:00:00,19:00:00,10:00:00,19:00:00,10:00:00,19:00:00,10:00:00,18:00:00,12:00:00,17:00:00
CVtCbSB1zUcUWg-9TNGTuQ,"7702 E Doubletree Ranch Rd, Ste 300",{},"[Professional Services, Matchmakers]",Scottsdale,1,33.565082,-111.9164,TRUmatch,,85258,...,09:00:00,17:00:00,09:00:00,17:00:00,09:00:00,17:00:00,,,,
duHFBe87uNSXImQmvBh87Q,4719 N 20Th St,"{'RestaurantsTableService': False, 'GoodForMea...","[Sandwiches, Restaurants]",Phoenix,0,33.505928,-112.038847,Blimpie,,85016,...,,,,,,,,,,
uUEMrhJiL1a1pCA_I1SU7Q,2017 E Camelback Rd,"{'BusinessAcceptsCreditCards': True, 'Restaura...","[Shopping, Tobacco Shops]",Phoenix,0,33.508068,-112.037552,Baxter's Cigars,,85016,...,10:00:00,20:00:00,10:00:00,20:00:00,10:00:00,21:00:00,10:00:00,21:00:00,10:00:00,19:00:00
2eJEUJIP54tex7T9YOcLSw,"4425 N 24th St, Ste 125","{'AcceptsInsurance': True, 'ByAppointmentOnly'...","[Chiropractors, Health & Medical]",Phoenix,1,33.502848,-112.012696,Back-Health Chiropractic,,85016,...,14:30:00,17:00:00,14:00:00,19:00:00,09:00:00,12:00:00,,,,
fEylCY3UEH8YJ0Xa7lu6lA,5770 Butler St,{'BusinessAcceptsCreditCards': True},"[Automotive, Oil Change Stations, Car Wash, Au...",Pittsburgh,1,40.485939,-79.943727,Auto Bathouse,Lawrenceville,15201,...,08:00:00,17:00:00,08:00:00,17:00:00,08:00:00,17:00:00,08:00:00,17:00:00,08:00:00,16:30:00
kFtuYklkAIlmYw8RZAieGw,3220 Washington Rd,"{'DogsAllowed': True, 'BusinessParking': {'gar...","[Jewelry Repair, Gold Buyers, Local Services, ...",McMurray,1,40.290498,-80.110021,JAB Jewelry Designs,,15317,...,10:00:00,20:00:00,10:00:00,20:00:00,10:00:00,18:00:00,10:00:00,16:00:00,,
NqiQdFa93wzUJGo29NbTPQ,"1425 S Higley Rd, Ste 103","{'AcceptsInsurance': True, 'ByAppointmentOnly'...","[Health & Medical, Optometrists]",Gilbert,1,33.324539,-111.720449,Neighborhood Vision Center,,85296,...,07:30:00,17:00:00,07:30:00,17:00:00,07:30:00,11:00:00,,,,


# Expand the `attributes` column into one column per attribute

In [None]:
# Expand the `attributes` dict of EVERY business into one column per key.
# (A previous hard-coded `[:2000]` slice meant that the later inner merge back
# into `biz_df` kept only the first 2000 businesses.)
# Cells that are not dicts are treated as "no attributes".
attribute_records = [
    attrs if isinstance(attrs, dict) else {}
    for attrs in biz_df['attributes']
]
# Building the frame from a list of dicts avoids the very slow per-row
# `apply(pd.Series)`.
attributes_df = pd.DataFrame(attribute_records, index=biz_df.index)
attributes_df.columns = [str(x).lower() for x in attributes_df.columns]

In [None]:
# Attribute columns known to hold nested dicts. Kept for reference only: the
# loop below inspects every column, so it does not depend on this list.
expandable_cols = ['businessparking','goodformeal','ambience','hairspecializesin','music','bestnights','dietaryrestrictions']

# Expand each attribute column:
#   * dict cells   -> one '{column}_{key}' column per key
#   * other cells  -> a single '{column}_0' column (renamed to '{column}' below)
expanded_parts = []
for excol in attributes_df.columns:
    if str(excol) == '0':
        # Residue column from non-dict `attributes` cells (its name has already
        # been stringified, so compare as text); nothing to expand.
        expanded_parts.append(attributes_df[[excol]])
        continue

    nested = pd.DataFrame(
        [cell if isinstance(cell, dict) else {0: cell} for cell in attributes_df[excol]],
        index=attributes_df.index,
    )
    nested.columns = ['{}_{}'.format(excol, str(x).lower()) for x in nested.columns]
    expanded_parts.append(nested)

# One concat at the end replaces a merge + drop per column.
if expanded_parts:
    attributes_df = pd.concat(expanded_parts, axis=1)

# if attribute column ends in '_0', trim it
attributes_df.columns = [
    col[:-2] if col.endswith('_0') else col
    for col in attributes_df.columns
]

# TODO: optionally encode 0/1 valued columns as False/True.

time_marker('drop')
# Drop columns that are entirely missing. This must act on `attributes_df`;
# pruning the loop's temporary frame has no effect on the result.
attributes_df = attributes_df.dropna(axis=1, how='all')

In [None]:
# List each attribute column next to its distinct values.
for name in attributes_df.columns:
    distinct_values = attributes_df[name].unique()
    padded_name = str(name).ljust(40)
    print('{}{}'.format(padded_name, distinct_values))

### Merge back to biz_df

In [None]:
# Left merge on the index: every business is kept even if `attributes_df`
# covers only part of `biz_df` (the default inner merge would drop the rest).
biz_df = (
    biz_df
    .merge(attributes_df, how='left', left_index=True, right_index=True)
    .drop(columns=['attributes'])
)

biz_df.head(3)

In [None]:
# Print the distinct values of every column from the third one onward,
# sorted when the values can be compared with each other.
for col in biz_df.columns[2:]:
    print('\n')
    print('-' * 80)
    distinct_values = biz_df[col].unique()
    try:
        distinct_values = sorted(distinct_values)
    except (TypeError, ValueError):
        # Values are not mutually comparable (e.g. strings mixed with NaN):
        # show them unsorted instead of hiding every possible error.
        pass
    print('{}{}'.format(str(col).ljust(40), distinct_values))

In [None]:
time_marker(text='cleaning up and reset index...')
# NOTE(review): this cell used to operate on `unpacked_biz_df`, which is never
# defined in this notebook (NameError on a fresh kernel). `biz_df` is the frame
# every other cell works on — confirm this is the intended target.
# `business_id` is moved from the index into a regular column rather than
# dropped, because the category encoding cell selects it as a column.
# The guard keeps the cell safe to re-run.
if 'business_id' not in biz_df.columns:
    biz_df = biz_df.reset_index()

In [None]:
# make all columns lower case
# NOTE(review): previously targeted `unpacked_biz_df`, a name that is never
# defined in this notebook; `biz_df` is assumed to be the intended frame.
biz_df.columns = [str(x).lower() for x in biz_df.columns]

# One Hot Encode `categories` with sklearn
<p>Save to separate file</p>

In [None]:
# NOTE(review): this import sits mid-notebook, while the other imports are in the first cell.
from sklearn.preprocessing import MultiLabelBinarizer
# Binarizer instance used by the one-hot encoding cell below.
mlb = MultiLabelBinarizer()

In [None]:
# `business_id` may be either the index of `biz_df` (as set when the data is
# loaded) or a regular column; selecting it as a column only works in the
# latter case, so normalise first.
categories_source = biz_df if 'business_id' in biz_df.columns else biz_df.reset_index()

# New frame: business_id as index, `categories` as the only column.
biz_cat = categories_source[['business_id', 'categories']].set_index('business_id')
biz_cat.head(3)

In [None]:
time_marker('one hot encoding of categories started...')
# Take the category lists out of biz_cat (the column is removed in place),
# binarize them into one 0/1 column per category, and attach the result.
category_lists = biz_cat.pop('categories')
category_flags = pd.DataFrame(
    mlb.fit_transform(category_lists),
    columns=mlb.classes_,
    index=biz_cat.index,
)
biz_cat = biz_cat.join(category_flags)
time_marker('complete!')

In [None]:
# Businesses flagged as 'Restaurants' or 'Food'.
is_food_business = biz_cat['Restaurants'].eq(1) | biz_cat['Food'].eq(1)

# Turn 0 flags into NaN so that categories used by none of the selected
# businesses become all-NaN columns, then drop those columns.
restaurants = (
    biz_cat.loc[is_food_business]
    .replace(0, np.nan)
    .dropna(axis=1, how='all')
)
restaurants.shape

In [None]:
# Show the (rows, columns) shape of the business frame.
biz_df.shape

# Write to Files

In [None]:

time_marker(text='Writing to files...')

# A named index (e.g. business_id) carries data: turn it into a regular column
# so it is not lost when each per-rating frame gets a fresh 0..n-1 index.
if biz_df.index.name and biz_df.index.name not in biz_df.columns:
    export_df = biz_df.reset_index()
else:
    export_df = biz_df

# One CSV per star rating. groupby visits the ratings in ascending order and
# skips rows with a missing rating, so no empty 'nan' file is produced.
for rating, df in export_df.groupby('stars'):
    df = df.reset_index(drop=True)

    file_name = '../clean_data/business/{}_star_business_clean.csv'.format(rating)
    time_marker(text='Writing {} rated records file...'.format(rating))
    # DRY_RUN exercises the loop without touching the file system.
    if not DRY_RUN:
        df.to_csv(file_name, encoding='utf-8')