# Yelp Business Wrangling

In [1]:
import numpy as np
import pandas as pd
import datetime
import json

DRY_RUN = False

In [None]:
sample = pd.read_csv('../clean_data/re')

In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [3]:
day_labels = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

def time_marker(text=''):
    print('[{}] {}'.format(datetime.datetime.now().time(), text.lower()))

def unpack(df, column, fillna=None):
    ret = None
    if fillna is None:
        ret = pd.concat([df, pd.DataFrame((d for idx, d in df[column].iteritems()))], axis=1)
        del ret[column]
    else:
        ret = pd.concat([df, pd.DataFrame((d for idx, d in df[column].iteritems())).fillna(fillna)], axis=1)
        del ret[column]
    return ret

# Load Business Data

In [4]:
time_marker(text='Loading Business Info Data...')

data = pd.DataFrame()
source_data_file = '../source_data/business.json'

biz_list = []
for line in open(source_data_file, 'r'):
    biz_list.append(json.loads(line))

time_marker(text='creating dataframe...')
data = pd.DataFrame(biz_list)

time_marker(text='set index to business_id...')
data.set_index('business_id', inplace=True, drop=True)
    
data.head(3)

[15:05:45.845550] loading business info data...
[15:05:54.323582] creating dataframe...
[15:05:55.437396] append business_id prefix column for file sorting...
[15:05:55.529088] set index to business_id...


Unnamed: 0_level_0,address,attributes,categories,city,hours,is_open,latitude,longitude,name,neighborhood,postal_code,review_count,stars,state,bid_prefix
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
YDf95gJZaq05wvo7hTQbbQ,691 Richmond Rd,"{'RestaurantsPriceRange2': 2, 'BusinessParking...","[Shopping, Shopping Centers]",Richmond Heights,"{'Monday': '10:00-21:00', 'Tuesday': '10:00-21...",1,41.541716,-81.493116,Richmond Town Square,,44143,17,2.0,OH,Y
mLwM-h2YhXl2NCgdS84_Bw,2824 Milton Rd,"{'GoodForMeal': {'dessert': False, 'latenight'...","[Food, Soul Food, Convenience Stores, Restaura...",Charlotte,"{'Monday': '10:00-22:00', 'Tuesday': '10:00-22...",0,35.23687,-80.741976,South Florida Style Chicken & Ribs,Eastland,28215,4,4.5,NC,m
v2WhjAB3PIBA8J8VxG3wEg,337 Danforth Avenue,"{'BusinessParking': {'garage': False, 'street'...","[Food, Coffee & Tea]",Toronto,"{'Monday': '10:00-19:00', 'Tuesday': '10:00-19...",0,43.677126,-79.353285,The Tea Emporium,Riverdale,M4K 1N7,7,4.5,ON,v


# Separate `hours` column into daily open and close columns

In [6]:
time_marker(text='collecting business Hours...')
hours = data[['hours']].copy()

time_marker('splitting hours into individual columns...')
hours = pd.concat([hours.drop(['hours'], axis=1), hours['hours'].apply(pd.Series)], axis=1)

time_marker(text='Split hours into open and close...')
# split daily hours columns into '{DAY}_open' and '{DAY}_close'
for col in hours.columns:

    # split hours column of [11:00-19:00] into '{original_name}_open' and '{original_name}_close' columns
    hours['{}_open'.format(col.lower())]  = pd.to_datetime(hours[col].str.split('-', 1).str[0], format='%H:%M').dt.time
    hours['{}_close'.format(col.lower())] = pd.to_datetime(hours[col].str.split('-', 1).str[1], format='%H:%M').dt.time
    
    # drop original day columns
    hours.drop(col, axis=1, inplace=True)

time_marker('sorting day columns order...')
cols = [['{}_open'.format(x.lower()), '{}_close'.format(x.lower())] for x in day_labels]
ordered_cols = list()
for day in cols:
    for time in day:
        ordered_cols.append(time)
        
hours = hours[ordered_cols].copy()

# merge back to original data frame
time_marker(text='merge open and close hours to business data...')
data = data.merge(hours, left_index=True, right_index=True)

data.drop(['hours'], axis=1, inplace=True)

data.head(10)

[15:08:11.888856] collecting business hours...
[15:08:11.904895] splitting hours into individual columns...
[15:09:30.504588] split hours into open and close...
[15:09:58.999652] sorting day columns order...
[15:09:59.102873] merge open and close hours to business data...


Unnamed: 0_level_0,address,attributes,categories,city,is_open,latitude,longitude,name,neighborhood,postal_code,...,wednesday_open,wednesday_close,thursday_open,thursday_close,friday_open,friday_close,saturday_open,saturday_close,sunday_open,sunday_close
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
YDf95gJZaq05wvo7hTQbbQ,691 Richmond Rd,"{'RestaurantsPriceRange2': 2, 'BusinessParking...","[Shopping, Shopping Centers]",Richmond Heights,1,41.541716,-81.493116,Richmond Town Square,,44143,...,10:00:00,21:00:00,10:00:00,21:00:00,10:00:00,21:00:00,10:00:00,21:00:00,11:00:00,18:00:00
mLwM-h2YhXl2NCgdS84_Bw,2824 Milton Rd,"{'GoodForMeal': {'dessert': False, 'latenight'...","[Food, Soul Food, Convenience Stores, Restaura...",Charlotte,0,35.23687,-80.741976,South Florida Style Chicken & Ribs,Eastland,28215,...,10:00:00,22:00:00,10:00:00,22:00:00,10:00:00,22:00:00,10:00:00,22:00:00,10:00:00,22:00:00
v2WhjAB3PIBA8J8VxG3wEg,337 Danforth Avenue,"{'BusinessParking': {'garage': False, 'street'...","[Food, Coffee & Tea]",Toronto,0,43.677126,-79.353285,The Tea Emporium,Riverdale,M4K 1N7,...,10:00:00,19:00:00,10:00:00,19:00:00,10:00:00,19:00:00,10:00:00,18:00:00,12:00:00,17:00:00
CVtCbSB1zUcUWg-9TNGTuQ,"7702 E Doubletree Ranch Rd, Ste 300",{},"[Professional Services, Matchmakers]",Scottsdale,1,33.565082,-111.9164,TRUmatch,,85258,...,09:00:00,17:00:00,09:00:00,17:00:00,09:00:00,17:00:00,,,,
duHFBe87uNSXImQmvBh87Q,4719 N 20Th St,"{'RestaurantsTableService': False, 'GoodForMea...","[Sandwiches, Restaurants]",Phoenix,0,33.505928,-112.038847,Blimpie,,85016,...,,,,,,,,,,
uUEMrhJiL1a1pCA_I1SU7Q,2017 E Camelback Rd,"{'BusinessAcceptsCreditCards': True, 'Restaura...","[Shopping, Tobacco Shops]",Phoenix,0,33.508068,-112.037552,Baxter's Cigars,,85016,...,10:00:00,20:00:00,10:00:00,20:00:00,10:00:00,21:00:00,10:00:00,21:00:00,10:00:00,19:00:00
2eJEUJIP54tex7T9YOcLSw,"4425 N 24th St, Ste 125","{'AcceptsInsurance': True, 'ByAppointmentOnly'...","[Chiropractors, Health & Medical]",Phoenix,1,33.502848,-112.012696,Back-Health Chiropractic,,85016,...,14:30:00,17:00:00,14:00:00,19:00:00,09:00:00,12:00:00,,,,
fEylCY3UEH8YJ0Xa7lu6lA,5770 Butler St,{'BusinessAcceptsCreditCards': True},"[Automotive, Oil Change Stations, Car Wash, Au...",Pittsburgh,1,40.485939,-79.943727,Auto Bathouse,Lawrenceville,15201,...,08:00:00,17:00:00,08:00:00,17:00:00,08:00:00,17:00:00,08:00:00,17:00:00,08:00:00,16:30:00
kFtuYklkAIlmYw8RZAieGw,3220 Washington Rd,"{'DogsAllowed': True, 'BusinessParking': {'gar...","[Jewelry Repair, Gold Buyers, Local Services, ...",McMurray,1,40.290498,-80.110021,JAB Jewelry Designs,,15317,...,10:00:00,20:00:00,10:00:00,20:00:00,10:00:00,18:00:00,10:00:00,16:00:00,,
NqiQdFa93wzUJGo29NbTPQ,"1425 S Higley Rd, Ste 103","{'AcceptsInsurance': True, 'ByAppointmentOnly'...","[Health & Medical, Optometrists]",Gilbert,1,33.324539,-111.720449,Neighborhood Vision Center,,85296,...,07:30:00,17:00:00,07:30:00,17:00:00,07:30:00,11:00:00,,,,


# Separate `attributes` column into columns

In [68]:
time_marker('splitting out attributes columns...')
attributes_df = data['attributes'].apply(pd.Series)
attributes_df.columns = [str(x).lower() for x in attributes_df.columns]
time_marker('done')

[15:30:19.854574] splitting out attributes columns...


KeyboardInterrupt: 

In [66]:
time_marker('expanding attributes...')
expandable_cols = ['businessparking','goodformeal','ambience','hairspecializesin','music','bestnights','dietaryrestrictions']

for excol in [col for col in attributes_df.columns if col != 0]:
    time_marker('\texpanding "{}"...'.format(excol))
    df = attributes_df[excol].apply(pd.Series)
    df.columns = ['{}_{}'.format(excol, str(x).lower()) for x in df.columns]
    
    # append to attributes_df
    attributes_df = attributes_df.merge(df, left_index=True, right_index=True)
    
    # drop original column
    attributes_df.drop([excol], axis=1, inplace=True)

# if attribute column ends in '_0', trim it
time_marker('trimming odd columns...')
col_names = list()
for col in attributes_df.columns:
    if col.endswith('_0'):
        col_names.append(col[:-2])
    else:
        col_names.append(col)
# correct collumn names
attributes_df.columns = col_names

time_marker('dropping columns of all nan...')
attributes_df=attributes_df.dropna(axis=1,how='all')

[15:25:49.474095] expanding acceptsinsurance...
[15:25:49.487835] expanding alcohol...
[15:25:49.500423] expanding ambience...
[15:25:49.526968] expanding bikeparking...
[15:25:49.538870] expanding businessacceptsbitcoin...
[15:25:49.549766] expanding businessacceptscreditcards...
[15:25:49.562097] expanding businessparking...
[15:25:49.584688] expanding byappointmentonly...
[15:25:49.595647] expanding caters...
[15:25:49.609705] expanding coatcheck...
[15:25:49.620464] expanding dogsallowed...
[15:25:49.632662] expanding drivethru...
[15:25:49.644475] expanding goodfordancing...
[15:25:49.656696] expanding goodforkids...
[15:25:49.667902] expanding goodformeal...


  result = result.union(other)
  union = _union_indexes(indexes)
  union = _union_indexes(indexes)
  result = result.union(other)


[15:25:49.694075] expanding hairspecializesin...
[15:25:49.719891] expanding happyhour...
[15:25:49.737525] expanding hastv...
[15:25:49.754010] expanding music...
[15:25:49.782390] expanding noiselevel...
[15:25:49.795498] expanding outdoorseating...
[15:25:49.808579] expanding restaurantsattire...
[15:25:49.823871] expanding restaurantsdelivery...
[15:25:49.837789] expanding restaurantsgoodforgroups...
[15:25:49.850584] expanding restaurantspricerange2...
[15:25:49.863195] expanding restaurantsreservations...
[15:25:49.876486] expanding restaurantstableservice...
[15:25:49.888548] expanding restaurantstakeout...
[15:25:49.900703] expanding wheelchairaccessible...
[15:25:49.913229] expanding wifi...
[15:25:49.927153] trimming odd columns...
[15:25:49.927843] dropping columns of all nan...


### Merge back to biz_df

In [None]:
time_marker('merging attributes to main business dataframe...')
data = data.merge(attributes_df, left_index=True, right_index=True)
data.drop(['attributes'], axis=1, inplace=True)
time_marker('done')
data.head(3)

# One Hot Encode `categories` with sklearn
<p>Save to separate file</p>

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

In [None]:
data = data[['business_id', 'categories']].copy()
data.set_index('business_id', inplace=True)
data.head(3)

In [None]:
time_marker('one hot encoding of categories started...')
data = data.join(pd.DataFrame(mlb.fit_transform(data.pop('categories')),
                          columns=mlb.classes_,
                          index=data.index))
time_marker('complete!')

# Write to Files

In [None]:
time_marker('append business_id prefix column for file sorting...')
data['bid_prefix'] = data.business_id.apply(lambda x: x[:1])

In [None]:
time_marker(text='Writing to files...')
file_count = len(reviews.bid_prefix.unique())

for i, prefix in enumerate(sorted(data.bid_prefix.unique())):
    df = reviews[reviews.bid_prefix == prefix].iloc[:,:-1].copy()
    df.reset_index(inplace=True, drop=True)
    file_name = '../clean_data/business/{}_{}_business_clean.csv'.format(str(i).zfill(2), prefix)
    time_marker(text='Writing {:d} records to file {}'.format(df.shape[0], file_name))
    if DRY_RUN:
        pass
    else:
        df.to_csv(file_name, encoding='utf-8')
time_marker(text='Done!')