In [277]:
import numpy as np
import pandas as pd
import re
from datetime import datetime  
from datetime import timedelta
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [278]:
# Load the combined inspection + Yelp dataset (path is relative to the notebook's working directory).
df = pd.read_csv('yelp_data_all.csv')

In [279]:
# Show the available column names and the (rows, columns) size of the raw frame.
print(df.columns.values)
print(df.shape)

['CAMIS' 'INSP_DATE' 'CRIT' 'CRIT_TYPE' 'HIST_CRIT' 'HIST_CRIT_TYPE'
 'LAST_INSP_DATE' 'LAST_CRIT_DATE' 'CRIT_TIMES' 'INSP_TIMES' 'CRIT_RATE'
 'DBA' 'BORO' 'ZIPCODE' 'PHONE' 'CUISINE DESCRIPTION' 'ACTION' 'SCORE'
 'GRADE' 'GRADE DATE' 'RECORD DATE' 'INSPECTION TYPE' 'Dsince_LAST_INSP'
 'Dsince_LAST_CRIT' 'last_crit_over_last_insp' 'NO_HISTCRIT' 'NEWLY_INSP'
 'Target' 'ADDRESS' 'yelp_id' 'yelp_name' 'yelp_is_closed' 'yelp_url'
 'yelp_phone' 'yelp_review_count' 'yelp_categories_a' 'yelp_categories_t'
 'yelp_rating' 'yelp_address1' 'yelp_city' 'yelp_state' 'yelp_zip_code'
 'yelp_latitude' 'yelp_longitude' 'yelp_price' 'yelp_transactions'
 'yelp_day0' 'yelp_day1' 'yelp_day2' 'yelp_day3' 'yelp_day4' 'yelp_day5'
 'yelp_day6']
(25750, 53)


In [280]:
# Cleaning plan for this notebook:
#   1. drop rows where no Yelp business was found
#   2. drop rows where the Yelp match is wrong
#   3. handle missing data
#   4. add rodent, weather, demographic and Yelp features
# Number of rows flagged NEWLY_INSP == 1.
df['NEWLY_INSP'].eq(1).sum()

1813

In [281]:
# Drop rows where no Yelp business was found (yelp_id is null).
no_yelp_match = df['yelp_id'].isnull()
df = df.drop(index=df.index[no_yelp_match])


In [282]:
def getspurious(d):
    '''Flag rows of ``d`` that fail the strictest match between the inspection
    record and its Yelp record.

    Names are compared lower-cased with spaces removed. Addresses are compared
    lower-cased with spaces, hyphens and common street-type words removed.
    Zip codes are compared as stored.

    Parameters
    ----------
    d : pandas.DataFrame
        Must contain 'DBA', 'yelp_name', 'ADDRESS', 'yelp_address1',
        'ZIPCODE' and 'yelp_zip_code'.

    Returns
    -------
    (nmask, amask, zmask) : tuple of boolean pandas.Series indexed like ``d``
        True where the name / address / zip code differ. A missing zip code
        compares as different (NaN != NaN), so it is flagged too.
    '''
    # Street-type words stripped before comparing addresses. Longer words come
    # before their prefixes ('avenue' before 'ave', 'street' before 'st') so
    # the regex alternation consumes the whole word.
    street_words = ["avenue", "ave", 'highway', 'street', 'hwy', 'st', 'pkwy', 'blvd', '-']
    pattern = re.compile("|".join(re.escape(word) for word in street_words))

    def squash(value):
        # str() keeps a missing value from raising; it becomes the text 'nan'.
        return str(value).lower().replace(' ', '')

    def normalize_address(value):
        return pattern.sub('', squash(value))

    # Everything below reads from the argument `d`. The previous version read
    # the address and zip columns from the notebook-level `df`, so the masks
    # described a different frame than the one passed in.
    nmask = d['DBA'].apply(squash) != d['yelp_name'].apply(squash)
    amask = d['ADDRESS'].apply(normalize_address) != d['yelp_address1'].apply(normalize_address)
    zmask = d['ZIPCODE'] != d['yelp_zip_code']

    return nmask,amask,zmask

def getspurious2(d):
    '''Flag rows whose Yelp address shares no meaningful token with the
    inspection address (a looser check than the exact comparison).

    The inspection 'ADDRESS' is upper-cased with spaces removed. The Yelp
    'yelp_address1' is upper-cased, stripped of hyphens and split on
    whitespace. A row counts as matching as soon as one Yelp token that is not
    a direction/street-type word is a substring of the inspection address.

    Parameters
    ----------
    d : pandas.DataFrame
        Must contain 'ADDRESS' and 'yelp_address1'.

    Returns
    -------
    list of bool
        One entry per row of ``d`` in positional order; True means no token
        matched (the record looks spurious).
    '''
    # Tokens too generic to count as evidence of a match.
    skip_tokens = {'W', 'E', 'N', 'S', 'PARK', 'AVE', 'ST', 'BLVD', 'RD'}

    ad1 = d['ADDRESS'].apply(lambda x: str(x).upper().replace(' ',''))
    # split() with no argument never yields empty tokens. split(' ') did on
    # doubled spaces (also produced when a lone '-' is removed), and the empty
    # string is a substring of every address, so such rows always "matched".
    ad2 = d['yelp_address1'].apply(lambda x: str(x).upper().replace('-','').split())

    mask = []
    for inspection_addr, yelp_tokens in zip(ad1, ad2):
        matched = any(token in inspection_addr
                      for token in yelp_tokens
                      if token not in skip_tokens)
        mask.append(not matched)

    return mask

In [283]:
# Get the indexes of records failing each match condition.
# Many restaurants share a name but have different addresses (chain
# restaurants), so an exact match on either zipcode or address is treated as
# generally safe. We therefore first check records that fail on both address
# and zipcode.
# Reviewing them showed that address parsing could be improved: most of these
# records are at the same address, so an exact match is too strict.

a,b,c = getspurious(df)  # a: name mismatch, b: address mismatch, c: zipcode mismatch
e = df.yelp_address1.isnull()  # rows with no Yelp address
# Alternative mask combinations tried during review (left disabled):
#d = m&c
#d = b&e
#d = b&c

In [284]:
# Manual review: rows failing all three checks (name, address, zipcode),
# shown from the 61st such row onward.
review_cols = ['yelp_address1', 'ADDRESS', 'DBA', 'yelp_name',
               'yelp_zip_code', 'ZIPCODE', 'yelp_longitude', 'yelp_latitude']
all_failed = a & b & c
df.loc[all_failed, review_cols].iloc[60:]

Unnamed: 0,yelp_address1,ADDRESS,DBA,yelp_name,yelp_zip_code,ZIPCODE,yelp_longitude,yelp_latitude
6029,John F Kennedy International Airport,0JFK INTERNATIONAL AIRPORT,NEW YORK SPORTS GRILL,New York Sports Bar,11464.0,11430.0,-73.754360,40.650090
6327,275 Essex St,86103 AVENUE,PEGUSUS COFFEE SHOP,Flicker Coffee and Tea Shop,10002.0,11209.0,-73.882410,40.677370
6479,,000126 TH & ROOSEVELT AVENUE,STAND 420 (BIG APPLE BREWS),Big Apple Sports,,,,
6486,,000126TH ST &ROOSEVELT AVENUE,337 - BURGERS & DOGS,Wall Street Burgers,,,-74.007140,40.714550
6647,48 S 4th St,48SOUTH 4 STREET,THE WOODS/ THE BEAR (BACKYARD),The Woods,11211.0,11249.0,-73.966361,40.712808
6970,102ND St And Atlantic Ave,10202ATLANTIC AVENUE,DUNKIN' DONUTS,Dunkin Donuts,11418.0,11416.0,-73.842557,40.689680
7128,805 Coney Island Ave,805807CONEY ISLAND AVENUE,LOS MARIACHIS RESTAURANT,Los Mariachis Mexican Restaurant,11218.0,,-73.968047,40.637732
7182,381 Park Ave S,59610 AVENUE,MR. BIGGS BAR AND GRILL,Mr. Biggs Bar & Grill,10016.0,10036.0,-73.984460,40.742510
7270,496 Laguardia Pl,2103326 AVENUE,FIVE GUYS FAMOUS BURGERS AND FRIES,Five Guys Burgers and Fries,10012.0,11360.0,-73.999778,40.727470
7344,100 Chambers St,5401108th STREET,"DUNKIN' DONUTS, BASKIN ROBBINS",Dunkin Donuts & Baskin Robbins,10007.0,11368.0,-74.007812,40.714649


In [285]:
# Index labels of the rows removed in the next step.
# NOTE(review): presumably hand-picked while reviewing the mismatched records — confirm.
# 21126 and 12491 each appear twice in the list.
dropindbca = [292,435,1708,2258,2371,2563,3588,4103,21126,4566,5582,5589,5592,5893,6327,
              6479,6486,7344,7425,8134,8346,8564,9560,9673,9693,9803,9968,10614,10762,
              10621,10955,10956,10957,11332,11407,11419,11570,11575,11801,11974,12407,
              12440,12533,12543,12651,12758,12806,13321,13612,13866,14015,14045,14898,
              15282,15470,15580,16302,16337,16515,16810,17027,17150,17170,17301,17519,
              17769,18067,18206,18267,18313,18361,18517,18806,19204,19295,19582,20118,
              20714,20885,21126,21610,22171,23491,23504,23528,24141,24476,24563,24660,
              24712,24933,25002,25074,25153,25298,25359,25646,25546,25667,2922,12491,
              13304,19237,23860,7182,7270,8152,8436,10484,12491,10466,7694,4810]

# Remove the listed rows, then recompute the match masks on what is left.
df = df.drop(index=dropindbca)
a, b, c = getspurious(df)
m = getspurious2(df)

# Also remove rows that fail the loose address check together with the zipcode
# check, except rows that fail all three strict checks.
loose_fail_idx = df[m & c].index
strict_fail_idx = df[a & b & c].index
df = df.drop(index=set(loose_fail_idx) - set(strict_fail_idx))

In [286]:
# (rows, columns) after removing unmatched and mismatched records.
df.shape

(17101, 53)

In [287]:
# Number of missing values per column.
df.isna().sum()

CAMIS                          0
INSP_DATE                      0
CRIT                           0
CRIT_TYPE                   2635
HIST_CRIT                      0
HIST_CRIT_TYPE              1631
LAST_INSP_DATE                 0
LAST_CRIT_DATE              1631
CRIT_TIMES                     0
INSP_TIMES                     0
CRIT_RATE                   1134
DBA                            0
BORO                           0
ZIPCODE                      166
PHONE                          0
CUISINE DESCRIPTION            0
ACTION                         0
SCORE                          0
GRADE                          0
GRADE DATE                   269
RECORD DATE                    0
INSPECTION TYPE                0
Dsince_LAST_INSP               0
Dsince_LAST_CRIT               0
last_crit_over_last_insp       0
NO_HISTCRIT                    0
NEWLY_INSP                     0
Target                         0
ADDRESS                        0
yelp_id                        0
yelp_name 

In [288]:
# Two alternatives left disabled by the author:
#df.loc[df[df.ZIPCODE.isnull()].index,'ZIPCODE'] = df.loc[df[df.ZIPCODE.isnull()].index,'yelp_zip_code']
#df.drop(columns = 'GRADE DATE',axis = 1,inplace = True)# GRADE DATE is exactly the same as inspection date

# Remove the Yelp identity/contact columns in place.
yelp_cols_to_drop = ['yelp_phone', 'yelp_zip_code', 'yelp_url', 'yelp_is_closed',
                     'yelp_address1', 'yelp_city', 'yelp_state', 'yelp_name', 'yelp_id']
df.drop(columns=yelp_cols_to_drop, inplace=True)

In [289]:
# Manual correction: set BORO to 'BROOKLYN' for the row with index label 18143.
# The membership check matters because assigning through .loc to a label that
# is not in the index does not fail; pandas appends a new, otherwise-empty row.
if 18143 in df.index:
    df.loc[18143,'BORO'] = 'BROOKLYN'
else:
    print('Row 18143 is not in df; BORO correction skipped')

In [290]:
# Load the rodent-complaint records (path is relative to the notebook's working directory).
dfrodent = pd.read_csv('rodent_process.csv')

In [291]:
def keepday(x):
    """Parse the leading 'MM/DD/YYYY' part of a timestamp string.

    Only the first 10 characters of ``x`` are used; anything after them
    (for example a time of day) is ignored.

    Returns a ``datetime`` at midnight of that day. Raises ``ValueError`` when
    the first 10 characters do not match '%m/%d/%Y'.
    """
    return datetime.strptime(x[:10], '%m/%d/%Y')

# Reduce 'Created Date' to a day-level datetime, then keep complaints created
# on or after 2017-01-01. The two prints show the row count before and after.
dfrodent['Created Date'] = dfrodent['Created Date'].map(keepday)
print(len(dfrodent))
cutoff = datetime(2017, 1, 1)
dfrodent = dfrodent.loc[dfrodent['Created Date'] >= cutoff]
print(len(dfrodent))

237407
65281


In [292]:
# For each descriptor, add a column holding the number of complaints with that
# descriptor in the row's ZIPCODE (the same value on every row of a zip).
lst = [
    'Rat Sighting',
    'Mouse Sighting',
    'Condition Attracting Rodents',
    'Signs of Rodents',
    'Rodent Bite - PCS Only',
]
for i in lst:
    per_zip_count = dfrodent.groupby('ZIPCODE')['Descriptor'].transform(lambda x: x.eq(i).sum())
    dfrodent[i] = per_zip_count

# Remove the 'Unnamed: 0' column in place.
dfrodent.drop(columns=['Unnamed: 0'], inplace=True)

In [293]:
# Keep one rodent row per ZIPCODE (the per-zip counts repeat on every row of a zip).
dfrodent.drop_duplicates(subset='ZIPCODE', keep='first', inplace=True)

# Right join: every rodent ZIPCODE is kept (inspection columns are NaN for zips
# with no inspections) and inspections whose ZIPCODE has no rodent row are dropped.
df = df.merge(dfrodent, on='ZIPCODE', how='right')

In [295]:
# Remove the rodent columns that are not kept as features.
rodent_extra_cols = ['Rodent Bite - PCS Only', 'Unique Key', 'Park Borough',
                     'Descriptor', 'Created Date', 'Closed Date']
df.drop(columns=rodent_extra_cols, inplace=True)

# Remove rows with no CAMIS, i.e. rodent zip codes that matched no inspection
# record in the right join.
df.drop(index=df.index[df['CAMIS'].isnull()], inplace=True)

In [296]:
# (rows, columns) after attaching the rodent counts.
df.shape

(16815, 48)

In [297]:
# Drop rows where all seven yelp_day columns are missing.
hour_cols = ['yelp_day0', 'yelp_day1', 'yelp_day2', 'yelp_day3',
             'yelp_day4', 'yelp_day5', 'yelp_day6']
ind = df.index[df[hour_cols].isnull().all(axis=1)]
df.drop(index=ind, inplace=True)
df.shape

(13720, 48)

In [310]:
def getdailyhour(x):
    """Convert one encoded opening-hours value into whole hours open.

    ``x`` is read as a number whose last four digits are the closing time and
    whose leading digits (after zero-padding to 8 digits) are the opening
    time, both in HHMM-style form.

    Returns
    -------
    int
        0 when ``x`` is NaN, 24 when ``x`` is 0, otherwise
        ``int((close - start) / 100)``, with 2400 added to ``close`` first
        when it is smaller than ``start``.
    """
    if x != x:  # NaN is the only value that is not equal to itself
        return 0

    hours_code = int(x)
    if hours_code == 0:
        return 24

    digits = str(hours_code).zfill(8)
    start, close = int(digits[:-4]), int(digits[-4:])
    if close < start:
        # closing time is numerically before opening time: shift it by 24h
        close += 2400
    return int((close - start)/100)
    

# For each of the seven yelp_day columns: add dayN_daily (hours open that day)
# and re-encode yelp_dayN as a string, using 'n' for a missing value.
for i in range(7):
    raw_col = 'yelp_day{}'.format(i)
    df['day{}_daily'.format(i)] = df[raw_col].apply(getdailyhour)
    df[raw_col] = df[raw_col].apply(lambda x: 'n' if x != x else str(int(x)))


In [317]:
# Weekly opening-hours features.
#
# The cell that dropped the raw yelp_dayN columns and the cell that builds the
# weekly features are merged here, with the drop moved to the end. Step 6
# still reads yelp_dayN, so dropping them first made a top-to-bottom run fail
# with KeyError.
#
# The weekly features are written with df['name'] = ... because assigning
# df.name = ... for a name that is not yet a column only sets a Python
# attribute, so the values never became columns of df.

# Earlier approach, left disabled by the author:
# df['yelp_weekly_day'] = 7-((df.day0_daily == 0)*1+(df.day1_daily == 0)*1 +(df.day2_daily == 0)*1+(df.day3_daily == 0)*1+(df.day4_daily == 0)*1
# +(df.day5_daily == 0)*1+(df.day6_daily == 0)*1)
# inddf = [91,161,  1451,  1469,  1929,  2281,  2544,  3225,  3240,
#             4281,  4332,  4374,  4405,  4577,  4685,  4973,  5314,  5757,
#             5762,  6940,  7295,  7336,  7556,  7633,  8219,  8478,  8485,
#             8825,  8864,  9456,  9467,  9558,  9706,  9955, 10004, 10098,
#            10225, 10408, 10416, 10439, 10481, 10500, 11130, 11303, 11534,
#            11811, 12631, 12950]
# df.loc[inddf,'yelp_weekly_day'] = 7
# df.loc[7556,'yelp_weekly_day'] = 6


# # step1: create open indicator for each business day. change the dtype of yelp_day
# for i in range(7):
#     df['day'+str(i)+'_open'] = df['day'+str(i)+'_daily'].astype(bool).astype(int)
#     #df['yelp_day'+str(i)] = df['yelp_day'+str(i)].astype(int).astype(str)

# # step2: get the index of unmatched records (who is actually 24hr open but daily_hr = 0), re-assign value.
# for i in range(7):
#     locls = df[(df['yelp_day'+str(i)].apply(len) > 6) & (df['day'+str(i)+'_daily'] == 0)].index
#     df.loc[locls,'day'+str(i)+'_daily'] = 24

# # step3: repeat step1 to rewrite the open indicator
# for i in range(7):
#     df['day'+str(i)+'_open'] = df['day'+str(i)+'_daily'].astype(bool).astype(int)
#     #df['yelp_day'+str(i)] = df['yelp_day'+str(i)].astype(int).astype(str)

# open indicator: 1 when the day has more than 0 opening hours
for i in range(7):
    df['day'+str(i)+'_open'] = (df['day'+str(i)+'_daily'] >0)*1

# step4: recalculate the yelp_weekly_day and yelp_weekly_hr
open_cols = ['day0_open', 'day1_open', 'day2_open', 'day3_open',
             'day4_open', 'day5_open', 'day6_open']
daily_cols = ['day0_daily', 'day1_daily', 'day2_daily', 'day3_daily',
              'day4_daily', 'day5_daily', 'day6_daily']
df['yelp_weekly_day'] = df[open_cols].sum(axis=1)
df['yelp_weekly_hr'] = df[daily_cols].sum(axis=1)

# step5: fix overnight (a day counts as overnight when its daily hours equal 24)
for i in range(7):
    df['day'+str(i)+'_overnight'] = 0
    locls = df[df['day'+str(i)+'_daily'] == 24].index
    df.loc[locls,'day'+str(i)+'_overnight'] = 1

# step6: drop records whose start or end time is missing
# (encoded string of length 3 or 4)
for i in range(7):
    code_len = df['yelp_day'+str(i)].apply(len)
    idx = df[(code_len < 5) & (code_len > 2)].index
    df.drop(idx, inplace=True, axis=0)

# step7: recalculate weekly overnight numbers
overnight_cols = ['day0_overnight', 'day1_overnight', 'day2_overnight',
                  'day3_overnight', 'day4_overnight', 'day5_overnight',
                  'day6_overnight']
df['yelp_overnight_times'] = df[overnight_cols].sum(axis=1)

# The raw per-day strings are no longer needed once step 6 has run.
df.drop(columns=['yelp_day0', 'yelp_day1', 'yelp_day2', 'yelp_day3', 'yelp_day4',
                 'yelp_day5', 'yelp_day6'], inplace=True)


In [None]:
# Load the NOAA weather columns used below and print their shape and names.
weather_cols = ['DATE', 'HOURLYDRYBULBTEMPF', 'HOURLYRelativeHumidity',
                'DAILYMaximumDryBulbTemp', 'DAILYMinimumDryBulbTemp',
                'DAILYAverageRelativeHumidity']
n = pd.read_csv('weather_data_noaa.csv')[weather_cols]
print(n.shape)
print(n.columns)

# Per-column completeness of df: count of missing values and percentage of
# rows filled, shown from least to most complete.
total_rows = df.shape[0]
missing_df = df.isnull().sum(axis=0).reset_index()
missing_df.columns = ['variable', 'missing values']
missing_df['filling factor (%)'] = (total_rows - missing_df['missing values'])/total_rows*100
missing_df.sort_values('filling factor (%)').reset_index(drop = True)

In [None]:
# Build one row of weather features per calendar day from the records in `n`.
n['DATE'] = n.DATE.astype(str)
n['OnlyDate'] = n.DATE.apply(lambda x: x[:10])  # first 10 characters of the timestamp text

# .copy() gives dfn its own data, so the casts below do not trigger pandas'
# SettingWithCopyWarning for writing into a frame derived from `n`.
dfn = n.dropna(subset=['HOURLYDRYBULBTEMPF', 'HOURLYRelativeHumidity']).copy()
print(dfn.shape)
dfn['HOURLYDRYBULBTEMPF'] = dfn['HOURLYDRYBULBTEMPF'].astype(int)
dfn['HOURLYRelativeHumidity'] = dfn['HOURLYRelativeHumidity'].astype(int)

# Group by 'OnlyDate': mean and max temperature, mean relative humidity.
grouped = dfn['HOURLYDRYBULBTEMPF'].groupby(dfn['OnlyDate'])
grouped2 = dfn['HOURLYRelativeHumidity'].groupby(dfn['OnlyDate'])
avetemp = grouped.mean()
maxtemp = grouped.max()
avehum = grouped2.mean()
print(type(avetemp))
print(len(avehum), len(avetemp))

w1 = avetemp.to_frame()
w2 = maxtemp.to_frame()
w3 = avehum.to_frame()

w1.columns = ['AveTemp']
w2.columns = ['MaxTemp']
w3.columns = ['Avehum']

w = w1.join(w2, how='left')
w = w.join(w3, how='left')
print(w.shape)

# Rolling statistics over 3 consecutive rows (the current day and the 2 rows
# before it); the first two rows are NaN.
# NOTE(review): this is a 3-calendar-day window only if every day is present
# in the weather file and the date strings sort chronologically — confirm.
wdf = w
rolling3 = wdf.rolling(3)
wdf1 = rolling3.max()
wdf1.columns = ['AveTemp_3dayMax', 'MaxTemp_3dayMax', 'AveHum_3dayMax']
wdf2 = rolling3.mean()
wdf2.columns = ['AveTemp_3dayAve', 'MaxTemp_3dayAve', 'AveHum_3dayAve']
wdf3 = rolling3.max() - rolling3.min()
wdf3.columns = ['AveTemp_3dayRange', 'MaxTemp_3dayRange', 'AveHum_3dayRange']

# ww: one row per day (index 'OnlyDate') with the nine rolling features.
ww = wdf1.join(wdf2, how='left')
ww = ww.join(wdf3, how='left')

ww.head()

In [None]:
# Attach the rolling weather features to each inspection by date.
# `ww` carries the day as its index (named 'OnlyDate') and has no
# 'Weather_Date' column, so merging with right_on='Weather_Date' raised
# KeyError. Expose the index as a 'Weather_Date' column first.
# NOTE(review): assumes INSP_DATE holds the same 10-character date text as the
# weather index — confirm the two formats agree.
if 'Weather_Date' in ww.columns:
    weather_daily = ww
else:
    weather_daily = ww.rename_axis('Weather_Date').reset_index()
df = df.merge(weather_daily, left_on='INSP_DATE', right_on='Weather_Date', how='left')

In [None]:
# Reviews: attach 'gmap_reviews' to each inspection row by CAMIS.
# The key is renamed to 'CAMIS1' on the review side, so the merged frame keeps
# both 'CAMIS' and 'CAMIS1'.
g = pd.read_csv('gmap_reviews.csv')[['CAMIS', 'gmap_reviews']]
g.columns = ['CAMIS1', 'gmap_reviews']
print(g.shape)

df = df.merge(g, how='left', left_on='CAMIS', right_on='CAMIS1')
print(df.shape)
df.columns

In [None]:
# Month: characters 5-6 of the INSP_DATE string, stored as an integer.
df['Month'] = df['INSP_DATE'].str[5:7].astype(int)

In [None]:
# Population / demographic tables.
# `puma` is the zip-to-PUMA lookup; it is also written back out unchanged as
# 'zipcode_subboro.csv'.
puma = pd.read_csv('nyc_zcta_to_puma_assigned.csv')
print(puma.shape)
puma.to_csv('zipcode_subboro.csv', index=False)


def _load_2016_indicator(csv_path, value_name):
    """Read one indicator file and return its 'Sub-Borough Area' and '2016'
    columns, sorted by area, with '2016' renamed to ``value_name``."""
    table = pd.read_csv(csv_path).sort_values(by=['Sub-Borough Area'])
    table = table[['Sub-Borough Area', '2016']]
    table.columns = ['Sub-Borough Area', value_name]
    return table


d1 = _load_2016_indicator('householdswithchildrenunder18yearsold.csv', 'ChildrenUnder18')
d2 = _load_2016_indicator('incomediversityratio.csv', 'IncomeDiv')
d3 = _load_2016_indicator('medianhouseholdincome2017.csv', 'MedianIncome')
d4 = _load_2016_indicator('population.csv', 'Population')
d5 = _load_2016_indicator('populationdensity1000personspersquaremile.csv', 'PopDensity')
d6 = _load_2016_indicator('povertyrate.csv', 'PovertyRate')
d7 = _load_2016_indicator('racialdiversityindex.csv', 'RacialDiv')
d8 = _load_2016_indicator('unemploymentrate.csv', 'UnemployRate')
d9 = _load_2016_indicator('medianrentburden.csv', 'MedianRentB')
d10 = _load_2016_indicator('populationaged25withoutahighschooldiploma.csv', '25NoHighSchool')

# Loaded as-is; not reshaped in this cell.
d11 = pd.read_csv('seriouscrimerateper1000residents.csv')

In [None]:
from functools import reduce

# Combine the ten indicator tables into one frame keyed by 'Sub-Borough Area'.
dfs = [d1, d2, d3, d4, d5, d6, d7, d8, d9, d10]
final = reduce(lambda left,right: pd.merge(left,right,on='Sub-Borough Area'), dfs)

# `final` is keyed by 'Sub-Borough Area' and has no 'sub_boro' column, so
# merging with on='sub_boro' raised KeyError. Rename the key on the indicator
# side so both frames share it.
# NOTE(review): assumes puma['sub_boro'] holds the same area names as
# 'Sub-Borough Area' — confirm against the lookup file.
boro = puma.merge(final.rename(columns={'Sub-Borough Area': 'sub_boro'}),
                  on='sub_boro', how='left')

# One row per PUMA, with the borough name replaced by its short code
# (each code keeps its trailing space).
puma1 = puma.drop_duplicates(subset=['puma10'], keep='first', inplace=False)
dic = {'Bronx': "BX ", 'Brooklyn': "BK ", 'Manhattan':'MN ', 'Queens':'QN ', 'Staten Island':'SI '}
puma1 = puma1.replace({"boro": dic})

In [None]:
from itertools import chain

# Flatten a Series of ' & '-separated strings into one list of the pieces.
def chainer(s):
    """Split every string in ``s`` on ' & ' and return all pieces as a single
    flat list, in row order."""
    return [piece for parts in s.str.split(' & ') for piece in parts]

# Number of ' & '-separated districts listed in each row's 'CD' value.
cd_parts = puma1['CD'].str.split(' & ')
lens = cd_parts.map(len)

# One output row per listed district: puma10 and boro are repeated `lens`
# times and CD holds the individual pieces.
res = pd.DataFrame({
    'puma10': np.repeat(puma1['puma10'], lens),
    'boro': np.repeat(puma1['boro'], lens),
    'CD': chainer(puma1['CD']),
})
res.shape

In [None]:
# Zero-pad each district string to two characters and prefix it with the
# borough code to form CD_code.
res['CD'] = res['CD'].str.zfill(2)
res['CD_code'] = res['boro'] + res['CD']
res = res[['CD_code', 'puma10']]

# Attach the demographic columns to each inspection row by ZIPCODE.
# NOTE(review): requires `boro` to carry a 'ZIPCODE' column, which is not
# visible in this notebook — confirm.
df = df.merge(boro, on='ZIPCODE', how='left')

In [None]:
# Crime: build one row of 2016 crime rates per puma10.
def _load_crime_rate(csv_path, rate_name):
    """Read one crime-rate file and return its 'Community District' and '2016'
    columns, sorted by district and renamed to 'CD' and ``rate_name``."""
    table = pd.read_csv(csv_path).sort_values(by=['Community District'])
    table = table[['Community District', '2016']]
    table.columns = ['CD', rate_name]
    return table


c1 = _load_crime_rate('seriouscrimerateper1000residents.csv', 'CrimeRate')
c2 = _load_crime_rate('seriouscrimerateviolentper1000residents.csv', 'CrimeRateViolent')
c3 = _load_crime_rate('seriouscrimeratepropertyper1000residents.csv', 'CrimeRateProperty')

cs = [c1, c2, c3]
cdf = reduce(lambda left,right: pd.merge(left,right,on='CD'), cs)

# CD_code is the text before ' - ' in the district label; use it to look up puma10.
cdf['CD_code'] = cdf['CD'].str.split(' - ').str[0]
cdf = cdf.merge(res, on='CD_code', how='left')
print(cdf.shape)

# Manual override of the three rates for the rows labelled 33 and 34.
# NOTE(review): the original cell then inspected puma10 == 3807, presumably
# these two rows — confirm.
for rate_col, rate_value in [('CrimeRate', 58.91219),
                             ('CrimeRateViolent', 7.93719),
                             ('CrimeRateProperty', 50.97500)]:
    cdf.loc[[33,34], rate_col] = rate_value

# Keep the first row per puma10.
crime_cols = ['puma10', 'CrimeRate', 'CrimeRateViolent', 'CrimeRateProperty']
crime = cdf[crime_cols].drop_duplicates(subset=['puma10'], keep='first')
crime.shape


In [None]:
# Attach the per-PUMA crime rates; left join keeps every inspection row.
df = df.merge(crime, on='puma10', how='left')

In [None]:
# Interaction features between Yelp price, rating and review count.
# NOTE(review): the divisions need yelp_price to be numeric — confirm its dtype.
rating = df['yelp_rating']
price = df['yelp_price']
df['price_over_rating'] = price / rating
df['rating_over_price'] = rating / price
df['rating_count_prod'] = df['yelp_review_count'] * rating

In [None]:
# Write the cleaned dataset to the working directory, without the index column.
df.to_csv('dataset_cleaned.csv',index = False)