In [29]:
%matplotlib inline
import numpy as np
import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
import geocoder
import matplotlib.pyplot as plt

In [93]:
# Load the cleaned train and test splits.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../data_cleaned/cleaned_train.csv", "../data_cleaned/cleaned_test.csv")
)

In [94]:
# Load the geocoded sensor data.
df_sensor = pd.read_csv(filepath_or_buffer='../data_cleaned/geocoded_sensor.csv')

### clean up train and test

In [95]:
# Drop unnecessary columns. Both frames lose the same set; `Real.Spots` is
# additionally dropped from train only.
# Note: re-running this cell raises KeyError because the columns are already gone.
common_drop_cols = ['Unnamed: 0', 'from_lat', 'from_long', 'to_lat', 'to_long', 'Date', 'Time']
train = train.drop(common_drop_cols + ['Real.Spots'], axis=1)
test = test.drop(common_drop_cols, axis=1)

In [96]:
#add Block ID for each street-from-to combo
# The ids are numbered once over train and test together. Calling ngroup() on
# each frame separately numbers the groups independently, so the same
# (Street, From, To) block would receive unrelated ids in train and in test and
# the feature would not carry over from one frame to the other.
block_keys = ['Street', 'From', 'To']
all_blocks = pd.concat([train[block_keys], test[block_keys]], ignore_index=True)
all_block_ids = all_blocks.groupby(block_keys).ngroup().values
n_train_blocks = len(train)
# Positional assignment: the first len(train) ids belong to train, the rest to test.
train['block_id'] = all_block_ids[:n_train_blocks]
test['block_id'] = all_block_ids[n_train_blocks:]

# Lower-case street names (vectorised string accessor instead of a per-row lambda).
train['Street'] = train['Street'].str.lower()
test['Street'] = test['Street'].str.lower()

In [97]:
# Preview the first rows of the cleaned training frame (head() defaults to 5 rows).
train.head()

Unnamed: 0,Street,From,To,Street.Length,any_spot,street_from,street_to,block_lat,block_long,hour,month,dayofweek,is_weekend,block_id
0,mission street,25th Street,26th Street,179.13297,1,"Mission Street & 25th Street, San Francisco CA","Mission Street & 26th Street, San Francisco CA",37.749846,-122.418241,16,1,1,0,42
1,polk street,Ellis Street,Olive Street,52.74021,0,"Polk Street & Ellis Street, San Francisco CA","Polk Street & Olive Street, San Francisco CA",37.784273,-122.419429,20,1,5,1,60
2,van ness avenue,Geary Boulevard,Myrtle Street,52.51784,0,"Van Ness Avenue & Geary Boulevard, San Francis...","Van Ness Avenue & Myrtle Street, San Francisco CA",37.785451,-122.421355,20,1,5,1,80
3,van ness avenue,Bush Street,Fern Street,52.405315,0,"Van Ness Avenue & Bush Street, San Francisco CA","Van Ness Avenue & Fern Street, San Francisco CA",37.788271,-122.421847,20,1,5,1,77
4,van ness avenue,Daniel Burnham Court,Post Street,52.191193,0,"Van Ness Avenue & Daniel Burnham Court, San Fr...","Van Ness Avenue & Post Street, San Francisco CA",37.786863,-122.421637,20,1,5,1,79


### clean up sensor 

In [98]:
# Clean up the sensor frame: rename STREET_NAME -> Street and DAY_TYPE -> is_weekend,
# then drop the leftover 'Unnamed: 0' column.
# `index=str` maps every row label through str(), so the index holds strings afterwards.
sensor_column_names = {"STREET_NAME": "Street", "DAY_TYPE": "is_weekend"}
df_sensor = (
    df_sensor
    .rename(index=str, columns=sensor_column_names)
    .drop(['Unnamed: 0'], axis=1)
)

In [99]:
# Derive calendar features from the raw sensor date/time columns.
# `infer_datetime_format` is no longer passed: it is deprecated since pandas 2.0
# (where it only emits a warning), and the parsed result does not depend on it.
df_sensor['Date'] = pd.to_datetime(df_sensor['CAL_DATE'])
# Hour is TIME_OF_DAY divided by 100, truncated to an integer.
df_sensor['hour'] = (df_sensor['TIME_OF_DAY'] / 100).astype(int)
# 'Date' is already datetime64, so the .dt accessor is used directly instead of
# converting the column again.
df_sensor['month'] = df_sensor['Date'].dt.month
df_sensor['dayofweek'] = df_sensor['Date'].dt.weekday
# weekday is 0=Monday ... 6=Sunday, so values 5 and 6 are the weekend.
# This overwrites the `is_weekend` column that came from renaming DAY_TYPE.
df_sensor['is_weekend'] = (df_sensor['dayofweek'] >= 5).astype(int)

In [100]:
# Work on a copy of the sensor frame with the raw date/label columns removed;
# `df_sensor` itself is left untouched.
sensor_cols_to_drop = ['Date', 'START_TIME_DT', 'CAL_DATE', 'STREET_BLOCK_renamed', 'coord_block']
df_sensor_new = df_sensor.copy().drop(sensor_cols_to_drop, axis=1)

In [101]:
# Share of the total time that was vacant (TOTAL_VACANT_TIME / TOTAL_TIME).
df_sensor_new['TOTAL_VAC_RATIO'] = df_sensor['TOTAL_VACANT_TIME'].div(df_sensor['TOTAL_TIME'])

In [102]:
# Raise the column display limit so the wide sensor frame is shown in full.
pd.options.display.max_columns = 500
df_sensor_new.head(5)

Unnamed: 0,BLOCK_ID,Street,BLOCK_NUM,STREET_BLOCK,AREA_TYPE,PM_DISTRICT_NAME,RATE,TOTAL_TIME,TOTAL_OCCUPIED_TIME,TOTAL_VACANT_TIME,TOTAL_UNKNOWN_TIME,OP_TIME,OP_OCCUPIED_TIME,OP_VACANT_TIME,OP_UNKNOWN_TIME,NONOP_TIME,NONOP_OCCUPIED_TIME,NONOP_VACANT_TIME,NONOP_UNKNOWN_TIME,GMP_TIME,GMP_OCCUPIED_TIME,GMP_VACANT_TIME,GMP_UNKNOWN_TIME,COMM_TIME,COMM_OCCUPIED_TIME,COMM_VACANT_TIME,COMM_UNKNOWN_TIME,CAL_MONTH_NAME,CAL_YEAR,is_weekend,TIME_OF_DAY,block_lat,block_long,hour,month,dayofweek,TOTAL_VAC_RATIO
0,50001,jackson street,1.0,JACKSON ST 100,Pilot,Downtown,,79200.0,39985.0,39215.0,0.0,0.0,0.0,0.0,0.0,79200.0,39985.0,39215.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,July,2012.0,1,200.0,37.797099,-122.398361,2,7,6,0.495139
1,50002,jackson street,2.0,JACKSON ST 200,Pilot,Downtown,,82800.0,30465.0,52335.0,0.0,0.0,0.0,0.0,0.0,82800.0,30465.0,52335.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,July,2012.0,1,800.0,37.797029,-122.399959,8,7,6,0.632065
2,50004,jackson street,4.0,JACKSON ST 400,Pilot,Downtown,,118800.0,62914.0,55886.0,0.0,0.0,0.0,0.0,0.0,118800.0,62914.0,55886.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,July,2012.0,1,2100.0,37.796633,-122.401883,21,7,6,0.470421
3,50023,jackson street,23.0,JACKSON ST 2300,Pilot,Fillmore,,28800.0,18000.0,10800.0,0.0,0.0,0.0,0.0,0.0,28800.0,18000.0,10800.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,July,2012.0,1,500.0,37.792687,-122.432951,5,7,6,0.375
4,50228,jones street,28.0,JONES ST 2800,Pilot,Fisherman's Wharf,3.5,28800.0,21508.0,7292.0,0.0,28800.0,21508.0,7292.0,0.0,0.0,0.0,0.0,0.0,28800.0,21508.0,7292.0,0.0,0.0,0.0,0.0,0.0,July,2012.0,1,1600.0,37.807149,-122.41729,16,7,6,0.253194


### find means on grouped sensor file by street and weekend

In [103]:
# Average the sensor measurements per (Street, is_weekend) pair.
#only keep the columns that make sense to group by mean
# NOTE(review): OP_VACANT_TIME and OP_UNKNOWN_TIME are not in this list although
# their NONOP/GMP/COMM counterparts are -- confirm that this is intentional.
sensor_mean_cols = [
    'RATE', 'TOTAL_TIME', 'TOTAL_OCCUPIED_TIME', 'TOTAL_VACANT_TIME', 'TOTAL_UNKNOWN_TIME',
    'OP_TIME', 'OP_OCCUPIED_TIME',
    'NONOP_TIME', 'NONOP_OCCUPIED_TIME', 'NONOP_VACANT_TIME', 'NONOP_UNKNOWN_TIME',
    'GMP_TIME', 'GMP_OCCUPIED_TIME', 'GMP_VACANT_TIME', 'GMP_UNKNOWN_TIME',
    'COMM_TIME', 'COMM_OCCUPIED_TIME', 'COMM_VACANT_TIME', 'COMM_UNKNOWN_TIME',
    'TOTAL_VAC_RATIO',
]

# Select the columns BEFORE aggregating. Calling .mean() on the whole grouped
# frame also tries to average the string columns (STREET_BLOCK, AREA_TYPE, ...),
# which raises TypeError on pandas >= 2.0 and is wasted work on older versions.
sensor_means = df_sensor_new.groupby(['Street', 'is_weekend'])[sensor_mean_cols].mean()

# Turn the group keys back into ordinary columns so they can be used as join keys.
sensor_means_df = sensor_means.reset_index()
sensor_means_df

Unnamed: 0,Street,is_weekend,RATE,TOTAL_TIME,TOTAL_OCCUPIED_TIME,TOTAL_VACANT_TIME,TOTAL_UNKNOWN_TIME,OP_TIME,OP_OCCUPIED_TIME,NONOP_TIME,NONOP_OCCUPIED_TIME,NONOP_VACANT_TIME,NONOP_UNKNOWN_TIME,GMP_TIME,GMP_OCCUPIED_TIME,GMP_VACANT_TIME,GMP_UNKNOWN_TIME,COMM_TIME,COMM_OCCUPIED_TIME,COMM_VACANT_TIME,COMM_UNKNOWN_TIME,TOTAL_VAC_RATIO
0,23rd street,0,1.577751,84072.906404,42615.112753,33267.676724,8190.116927,33544.08867,18001.390941,50528.817734,24613.721812,20864.992269,5050.103654,19861.083744,11138.65052,6355.95806,2366.475164,13683.004926,6862.740421,6046.726396,773.538109,0.392916
1,23rd street,1,2.614159,84127.868852,50593.060622,25193.574966,8341.233265,18675.614754,11562.536031,65452.254098,39030.52459,20363.773907,6057.955601,15569.262295,9910.091018,3534.231557,2124.93972,3106.352459,1652.445014,1295.569501,158.337944,0.298229
2,battery street,0,4.025045,48546.243842,17973.883296,24881.581623,5690.778924,19467.302956,8513.66499,29078.940887,9460.218305,16028.767105,3589.955477,7208.374384,5141.551673,1471.566246,595.256466,12258.928571,3372.113318,7381.248272,1505.566981,0.482129
3,battery street,1,2.994005,48562.653689,22238.935067,23368.329022,2955.3896,12545.440574,3721.888405,36017.213115,18517.046662,15531.925546,1968.240907,12545.440574,3721.888405,7836.403475,987.148694,0.0,0.0,0.0,0.0,0.449002
4,bryant street,0,2.161522,122997.352217,40454.789973,74304.893165,8237.669078,53100.862069,23336.246836,69896.490148,17118.543138,48522.118038,4255.828972,48809.421182,22527.74894,22691.455904,3590.216338,4291.440887,808.497896,3091.319222,391.623768,0.594282
5,bryant street,1,1.733539,123047.336066,32011.54474,83185.260886,7850.530439,35247.540984,8657.877988,87799.795082,23353.666752,59607.497353,4838.630977,33143.237705,8493.315488,21819.308999,2830.613217,2104.303279,164.5625,1758.454534,181.286245,0.663253
6,bush street,0,3.5,39429.14615,9355.464781,21372.728444,8700.952925,9426.380304,3572.88088,30002.765846,5782.583901,15615.828881,8604.353064,0.0,0.0,0.0,0.0,9426.380304,3572.88088,5756.899563,96.59986,0.544352
7,bush street,1,3.5,39388.493724,15189.557096,23761.648361,437.288267,10890.690377,3400.843794,28497.803347,11788.713302,16408.79864,300.291405,8947.280335,2891.363319,5942.148971,113.768044,1943.410042,509.480474,1410.70075,23.228818,0.60896
8,columbus avenue,0,1.894069,56228.571429,20045.815921,27581.386323,8601.369184,28074.384236,11584.341988,28154.187192,8461.473933,15372.333368,4320.379892,28074.384236,11584.341988,12209.052956,4280.989293,0.0,0.0,0.0,0.0,0.487568
9,columbus avenue,1,2.752818,56257.377049,27353.453552,20307.067367,8596.85613,28128.688525,14869.357497,28128.688525,12484.096055,11346.500598,4298.091872,28128.688525,14869.357497,8960.566769,4298.764259,0.0,0.0,0.0,0.0,0.360396


### attach sensor data to train and test by street and weekend

In [104]:
# Left-join the per-street sensor averages onto both splits. Rows whose
# (Street, is_weekend) pair has no match keep NaN in the sensor columns.
cols_for_join = ['Street', 'is_weekend']

train_sensor_merged = train.merge(sensor_means_df, how='left', on=cols_for_join)
test_sensor_merged = test.merge(sensor_means_df, how='left', on=cols_for_join)

### label encoding

In [105]:
# Preview the first three merged training rows.
train_sensor_merged.head(n=3)

Unnamed: 0,Street,From,To,Street.Length,any_spot,street_from,street_to,block_lat,block_long,hour,month,dayofweek,is_weekend,block_id,RATE,TOTAL_TIME,TOTAL_OCCUPIED_TIME,TOTAL_VACANT_TIME,TOTAL_UNKNOWN_TIME,OP_TIME,OP_OCCUPIED_TIME,NONOP_TIME,NONOP_OCCUPIED_TIME,NONOP_VACANT_TIME,NONOP_UNKNOWN_TIME,GMP_TIME,GMP_OCCUPIED_TIME,GMP_VACANT_TIME,GMP_UNKNOWN_TIME,COMM_TIME,COMM_OCCUPIED_TIME,COMM_VACANT_TIME,COMM_UNKNOWN_TIME,TOTAL_VAC_RATIO
0,mission street,25th Street,26th Street,179.13297,1,"Mission Street & 25th Street, San Francisco CA","Mission Street & 26th Street, San Francisco CA",37.749846,-122.418241,16,1,1,0,42,2.944775,99453.235064,52092.437399,42209.900055,5150.89761,35453.49319,24220.779699,63999.741874,27871.6577,31183.235732,4944.848442,27734.04884,20454.694393,7120.745859,158.608588,7719.44435,3766.085306,3905.918464,47.44058,0.425016
1,polk street,Ellis Street,Olive Street,52.74021,0,"Polk Street & Ellis Street, San Francisco CA","Polk Street & Olive Street, San Francisco CA",37.784273,-122.419429,20,1,5,1,60,2.280039,37451.803279,16063.416598,19844.934119,1543.452561,11774.139344,6088.767589,25677.663934,9974.64901,14646.284085,1056.73084,11609.139344,6033.478005,5089.174829,486.48651,165.0,55.289583,109.475205,0.235212,0.500695
2,van ness avenue,Geary Boulevard,Myrtle Street,52.51784,0,"Van Ness Avenue & Geary Boulevard, San Francis...","Van Ness Avenue & Myrtle Street, San Francisco CA",37.785451,-122.421355,20,1,5,1,80,1.314128,48816.393443,15206.722108,33560.860115,48.811219,11324.590164,3504.582309,37491.803279,11702.1398,25753.765881,35.897598,10392.520492,3297.419456,7082.399932,12.701104,932.069672,207.162853,724.694302,0.212517,0.688941


In [106]:
def proc_col(col):
    """Label-encode a pandas column as contiguous integer ids.

    Every distinct value is numbered 0, 1, 2, ... in the order it first shows
    up in ``col``. Returns a numpy array holding the id of each row.
    """
    name2idx = {}
    for position, value in enumerate(col.unique()):
        name2idx[value] = position
    encoded = [name2idx[value] for value in col]
    return np.array(encoded)

# String columns that are replaced by integer codes.
cols_str_2_num = [
    'Street',
    'From',
    'To',
    'street_from',
    'street_to',
]

# Encode train and test with ONE shared mapping per column. Encoding each frame
# on its own numbers values by order of first appearance within that frame, so
# the same street would get different ids in train and test and the model would
# see unrelated values at prediction time.
# Train rows come first in the concatenation, so train ids are the same as when
# train is encoded alone; values that occur only in test get the next free ids.
n_train_rows = len(train_sensor_merged)
for col in cols_str_2_num:
    combined = pd.concat([train_sensor_merged[col], test_sensor_merged[col]], ignore_index=True)
    codes = proc_col(combined)
    train_sensor_merged[col+'_num'] = codes[:n_train_rows]
    test_sensor_merged[col+'_num'] = codes[n_train_rows:]

In [107]:
# Remove the original string columns now that their '_num' encodings exist.
train_sensor_merged, test_sensor_merged = (
    frame.drop(cols_str_2_num, axis=1)
    for frame in (train_sensor_merged, test_sensor_merged)
)

In [108]:
# Count missing values per column in the merged training frame.
pd.isnull(train_sensor_merged).sum()

Street.Length          0
any_spot               0
block_lat              0
block_long             0
hour                   0
month                  0
dayofweek              0
is_weekend             0
block_id               0
RATE                   0
TOTAL_TIME             0
TOTAL_OCCUPIED_TIME    0
TOTAL_VACANT_TIME      0
TOTAL_UNKNOWN_TIME     0
OP_TIME                0
OP_OCCUPIED_TIME       0
NONOP_TIME             0
NONOP_OCCUPIED_TIME    0
NONOP_VACANT_TIME      0
NONOP_UNKNOWN_TIME     0
GMP_TIME               0
GMP_OCCUPIED_TIME      0
GMP_VACANT_TIME        0
GMP_UNKNOWN_TIME       0
COMM_TIME              0
COMM_OCCUPIED_TIME     0
COMM_VACANT_TIME       0
COMM_UNKNOWN_TIME      0
TOTAL_VAC_RATIO        0
Street_num             0
From_num               0
To_num                 0
street_from_num        0
street_to_num          0
dtype: int64

### fit model

In [109]:
from sklearn.model_selection import train_test_split
# Separate the target (`any_spot`) from the feature columns.
y_data = train_sensor_merged['any_spot']
X_data = train_sensor_merged.drop('any_spot', axis=1)

In [110]:
# Hold out 25% of the labelled rows for validation; the seed keeps the split fixed.
X_train, X_val, y_train, y_val = train_test_split(
    X_data,
    y_data,
    test_size=0.25,
    random_state=0,
)

In [111]:
# Shapes of the four split parts, in the order X_train, X_val, y_train, y_val.
tuple(part.shape for part in (X_train, X_val, y_train, y_val))

((825, 33), (275, 33), (825,), (275,))

#### tune hyperparameters

In [80]:
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Seed the forest so the search, and therefore the winning configuration,
# is reproducible between runs.
model = RandomForestClassifier(random_state=0)

# Hyperparameter grid: 7 * 6 * 4 = 168 candidate configurations.
parameters = {
    "n_estimators" :  [1, 50, 80, 100, 200, 400, 600],
    "max_depth" :  [1, 5, 10, 20, 30, 50],
    "min_samples_leaf" : [1, 2, 5, 10],
}

# F-beta with beta=0.5 weights precision more heavily than recall.
f_point5_scorer = make_scorer(fbeta_score, beta=0.5)

gsc = GridSearchCV(model, parameters, n_jobs=-1, verbose=1, scoring=f_point5_scorer)
gsc.fit(X_train, y_train)

# `grid_scores_` was removed from GridSearchCV in scikit-learn 0.20, so the
# winner is read from `best_params_` / `best_score_` (the configuration with the
# highest mean cross-validated score and that score).
best_parameters = gsc.best_params_
score = gsc.best_score_
# NOTE(review): the recorded output of this cell is a perfect score of 1.0;
# worth checking the features for target leakage before trusting it.
print(best_parameters)
print(score)

Fitting 3 folds for each of 168 candidates, totalling 504 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:   22.8s
[Parallel(n_jobs=-1)]: Done 246 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 504 out of 504 | elapsed:  2.6min finished


{'n_estimators': 100, 'max_depth': 5, 'min_samples_leaf': 1}
1.0




In [112]:
# Refit a random forest with the winning grid-search settings and predict the
# test set. `best_parameters` is produced by the grid-search cell, which must
# have been run before this one.
max_depth_best = best_parameters['max_depth']
min_samples_leaf_best = best_parameters['min_samples_leaf']
n_estimator_best = best_parameters['n_estimators']

m = RandomForestClassifier(min_samples_leaf=min_samples_leaf_best,
                           n_estimators=n_estimator_best,
                           n_jobs=-1,
                           max_depth=max_depth_best,
                           random_state=0)  # seeded so the predictions are reproducible


# NOTE(review): the model is fitted on X_train only, and X_val / y_val are not
# evaluated in the cells visible here -- confirm whether a validation score or a
# refit on all labelled rows was intended.
m.fit(X_train, y_train)
# Select the test columns by name, in the training feature order. A missing
# feature then fails loudly with a KeyError instead of the model silently
# reading columns in the wrong positions.
y_pred_test = m.predict(test_sensor_merged[X_train.columns])

1.0


In [115]:
# Build the submission table: 1-based row id plus the predicted label,
# then write it out without the DataFrame index.
df = pd.DataFrame(
    {
        "id": np.arange(1, len(y_pred_test) + 1),
        "any_spot": y_pred_test,
    },
    columns=["id", "any_spot"],
)
df.to_csv("preds_SM1.csv", index=False)