In [154]:
import numpy as np
import pandas as pd
import math
import time
from datetime import timedelta

In [155]:
df = pd.read_json('data/yelp_academic_dataset_business.json', lines=True)

In [156]:
print("df.shape", df.shape)
df.head()

df.shape (150346, 14)


Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


In [157]:
# drop all records with NA values and drop irrelevant columns
df = df.dropna()
df = df.drop(columns=['name', 'longitude', 'latitude', 'address', 'city'])
print("df.shape", df.shape)

df.shape (117618, 9)


In [158]:
# only keep currently open businesses
df = df[df['is_open']==1]
# remove is_open column since it only contains 1s now
df = df.drop(columns='is_open')
print("df.shape", df.shape)
df.head()

df.shape (94976, 8)


Unnamed: 0,business_id,state,postal_code,stars,review_count,attributes,categories,hours
1,mpf3x-BjTdTEA3yCZrAYPw,MO,63123,3.0,15,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
3,MTSW4McQd7CbVtyjqoe9mw,PA,19107,4.0,80,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,PA,18054,4.5,13,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."
5,CF33F8-E6oudUQ46HnavjQ,TN,37015,2.0,6,"{'BusinessParking': 'None', 'BusinessAcceptsCr...","Burgers, Fast Food, Sandwiches, Food, Ice Crea...","{'Monday': '0:0-0:0', 'Tuesday': '6:0-22:0', '..."
6,n_0UpQx1hsNbnPUSlodU8w,MO,63144,2.5,13,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Sporting Goods, Fashion, Shoe Stores, Shopping...","{'Monday': '0:0-0:0', 'Tuesday': '10:0-18:0', ..."


In [159]:
# only keep businesses that are categorizes as 'Restaurants'
df = df[df['categories'].str.contains('Restaurants')]
print("df.shape", df.shape)
df.head()

df.shape (31357, 8)


Unnamed: 0,business_id,state,postal_code,stars,review_count,attributes,categories,hours
3,MTSW4McQd7CbVtyjqoe9mw,PA,19107,4.0,80,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
5,CF33F8-E6oudUQ46HnavjQ,TN,37015,2.0,6,"{'BusinessParking': 'None', 'BusinessAcceptsCr...","Burgers, Fast Food, Sandwiches, Food, Ice Crea...","{'Monday': '0:0-0:0', 'Tuesday': '6:0-22:0', '..."
9,bBDDEgkFA1Otx9Lfe7BZUQ,TN,37207,1.5,10,"{'RestaurantsAttire': ''casual'', 'Restaurants...","Ice Cream & Frozen Yogurt, Fast Food, Burgers,...","{'Monday': '0:0-0:0', 'Tuesday': '6:0-21:0', '..."
11,eEOYSgkmpB90uNA7lDOMRA,FL,33602,4.0,10,"{'Alcohol': ''none'', 'OutdoorSeating': 'None'...","Vietnamese, Food, Restaurants, Food Trucks","{'Monday': '11:0-14:0', 'Tuesday': '11:0-14:0'..."
12,il_Ro8jwPlHresjw9EGmBg,IN,46227,2.5,28,"{'RestaurantsReservations': 'False', 'Restaura...","American (Traditional), Restaurants, Diners, B...","{'Monday': '6:0-22:0', 'Tuesday': '6:0-22:0', ..."


In [160]:
df = df.join(pd.json_normalize(df['attributes']))
df = df.join(pd.json_normalize(df['hours']))
df = df.drop(columns=['attributes', 'hours'])

In [161]:
def parse_hours(day_hours_str):
    if pd.isna(day_hours_str):
        return 0
    
    time_endpoints = str(day_hours_str).split('-')

    if time_endpoints[0] == time_endpoints[1]:
        # 0:0-0:0
        return 0
    
    start_time = time.strptime(time_endpoints[0], "%H:%M")
    end_time = time.strptime(time_endpoints[1], "%H:%M")

    start_time_td = timedelta(hours=start_time.tm_hour, minutes=start_time.tm_min)
    end_time_td = timedelta(hours=end_time.tm_hour, minutes=end_time.tm_min)

    duration = end_time_td - start_time_td

    return duration.total_seconds() / 3600

total_hours_arr = []

for ind in df.index:
    total_hours = 0

    monday_hours = df['Monday'][ind]
    total_hours += parse_hours(monday_hours)

    tuesday_hours = df['Tuesday'][ind]
    total_hours += parse_hours(tuesday_hours)

    wednesday_hours = df['Wednesday'][ind]
    total_hours += parse_hours(wednesday_hours)

    thursday_hours = df['Thursday'][ind]
    total_hours += parse_hours(thursday_hours)

    friday_hours = df['Friday'][ind]
    total_hours += parse_hours(friday_hours)

    saturday_hours = df['Saturday'][ind]
    total_hours += parse_hours(saturday_hours)

    sunday_hours = df['Sunday'][ind]
    total_hours += parse_hours(sunday_hours)

    total_hours_arr.append(total_hours)

df['total_hours'] = pd.Series(total_hours_arr)
df.head()

Unnamed: 0,business_id,state,postal_code,stars,review_count,categories,RestaurantsDelivery,OutdoorSeating,BusinessAcceptsCreditCards,BusinessParking,...,DietaryRestrictions,HairSpecializesIn,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday,total_hours
3,MTSW4McQd7CbVtyjqoe9mw,PA,19107,4.0,80,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",,,,"{'garage': False, 'street': False, 'validated'...",...,,,11:0-14:0,11:0-14:0,11:0-14:0,11:0-14:0,11:0-14:0,5:0-10:0,15:0-18:0,36.0
5,CF33F8-E6oudUQ46HnavjQ,TN,37015,2.0,6,"Burgers, Fast Food, Sandwiches, Food, Ice Crea...",True,True,True,"{u'valet': False, u'garage': None, u'street': ...",...,,,,13:30-22:0,13:30-22:0,13:30-22:0,13:30-23:0,13:30-23:0,13:30-22:0,-36.0
9,bBDDEgkFA1Otx9Lfe7BZUQ,TN,37207,1.5,10,"Ice Cream & Frozen Yogurt, Fast Food, Burgers,...",,,,,...,,,10:0-0:0,10:0-0:0,10:0-0:0,10:0-0:0,10:0-1:0,10:0-1:0,10:0-0:0,68.0
11,eEOYSgkmpB90uNA7lDOMRA,FL,33602,4.0,10,"Vietnamese, Food, Restaurants, Food Trucks",True,True,True,"{'garage': False, 'street': False, 'validated'...",...,,,0:0-0:0,,16:0-22:0,16:0-22:0,16:0-19:0,11:0-23:0,11:0-20:0,75.0
12,il_Ro8jwPlHresjw9EGmBg,IN,46227,2.5,28,"American (Traditional), Restaurants, Diners, B...",False,True,True,"{'garage': False, 'street': True, 'validated':...",...,,,0:0-0:0,16:0-21:30,16:0-21:30,16:0-21:30,16:0-22:0,16:0-22:0,16:0-21:30,61.0
