# Predict Bike Wait Times

In [1]:
import pandas as pd
import numpy as np
import random
import sklearn

In [28]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier

# Use full dataset with weather, cluster, and time data

In [73]:
df_train = pd.read_csv('202109-stations-capacity/all_data_train_cluster.csv')
df_test = pd.read_csv('202109-stations-capacity/all_data_test_cluster.csv')

In [74]:
df_train['seconds_from_hour'] = 60*df_train['minute'] + df_train['second']
df_test['seconds_from_hour'] = 60*df_test['minute'] + df_test['second']

In [75]:
X_train = df_train.loc[:, df_train.columns != 'decrement']
X_train = X_train[X_train['increment']==1]
X_train = X_train[X_train['wait_time']!=0]

In [93]:
X_train

Unnamed: 0,time,capacity,station,increment,wait_time,year,month,day_of_week,day,hour,...,Total.docks,Temp.y,Rain.y,cluster1,cluster2,cluster3,cluster4,cluster5,cluster6,seconds_from_hour
42,03:02:00,20,Galileo Galilei Way at Main Street,1,1056,2021,9,Wednesday,1,0,...,19.0,68,1,0,1,0,0,0,0,182
54,03:54:00,15,Lesley University,1,1034,2021,9,Wednesday,1,0,...,15.0,68,1,0,1,0,0,0,0,234
63,04:46:00,21,Beacon St at Massachusetts Ave,1,664,2021,9,Wednesday,1,0,...,19.0,68,1,0,0,0,0,1,0,286
70,05:12:00,24,Roxbury Crossing T Stop - Columbus Ave at Trem...,1,1744,2021,9,Wednesday,1,0,...,23.0,68,1,0,1,0,0,0,0,312
85,05:55:00,16,Cambridge St at Joy St,1,506,2021,9,Wednesday,1,0,...,15.0,68,1,0,0,0,0,0,1,355
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
730380,,28,Harvard Square at Brattle St / Eliot St,1,344,2021,9,Saturday,25,23,...,15.0,72,0,0,1,0,0,0,0,3585
730381,,17,Tremont St at E Berkeley St,1,18,2021,9,Saturday,25,23,...,19.0,72,0,0,1,0,0,0,0,3587
730382,,46,Landmark Center - Brookline Ave at Park Dr,1,808,2021,9,Saturday,25,23,...,15.0,72,0,0,0,0,0,0,1,3587
730384,,-90,Warren St at Chelsea St,1,1582,2021,9,Saturday,25,23,...,23.0,72,0,0,1,0,0,0,0,3591


In [55]:
features = ['station', 
            'day_of_week', 
            'day', 
            'hour', 
            'seconds_from_hour',
            'Temp.x',
            'Rain.x',
            'avgBikeIn',
            'avgBikeOut',
            'avgwait',
            'Total.docks',
            'cluster1',
            'cluster2',
            'cluster3',
            'cluster4',
            'cluster5',
            'cluster6'
           ]

In [90]:
X_train = X_train[X_train['Total.docks'].notnull()]

In [91]:
y_train = X_train['wait_time'] > 300
y_train = y_train.replace(True, 1)
y_train = y_train.replace(False, 0)


In [92]:
y_train

42        1
54        1
63        1
70        1
85        1
         ..
730380    1
730381    0
730382    1
730384    1
730385    1
Name: wait_time, Length: 360491, dtype: int64

In [94]:
X_train = X_train[features]
X_train = pd.get_dummies(X_train, columns = ['station', 
                                             'day_of_week', 
                                             'day', 
                                             'hour',
                                             ], drop_first = True)


In [101]:
X_test = df_test.loc[:, df_test.columns != 'decrement']
X_test = X_test[X_test['increment']==1]
X_test = X_test[X_test['wait_time']!=0]

In [103]:
X_test = X_test[X_test['Total.docks'].notnull()]

In [104]:
y_test = X_test['wait_time'] > 300
y_test = y_test.replace(True, 1)
y_test = y_test.replace(False, 0)
;

''

In [105]:
X_test = X_test[features]
X_test = pd.get_dummies(X_test, columns = ['station', 
                                             'day_of_week', 
                                             'day', 
                                             'hour',
                                             ], drop_first = True)

In [99]:
X_test

Unnamed: 0,seconds_from_hour,Temp.x,Rain.x,avgBikeIn,avgBikeOut,avgwait,Total.docks,cluster1,cluster2,cluster3,...,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23
0,0,77,1,10.80,12.20,8080.232210,19.0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,2,77,1,93.52,93.24,937.673913,19.0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,4,77,1,87.96,85.44,990.988062,18.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,77,1,56.84,57.00,1525.680565,19.0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5,6,77,1,108.88,107.68,798.374444,16.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131691,579,68,0,65.88,66.92,1319.646267,47.0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
131695,2806,68,0,65.88,66.92,1319.646267,47.0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
131696,91,68,0,11.72,10.32,7527.965157,18.0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
131706,3150,68,0,11.72,10.32,7527.965157,18.0,0,0,1,...,0,0,0,0,0,0,0,0,1,0


In [108]:
'seconds_from_hour' in X_test

True

In [109]:
for col in X_train.columns:
    if col not in X_test:
        X_test[col] = 0

In [113]:
for col in X_test.columns:
    if col not in X_train:
        X_train[col] = 0

In [95]:
# Select the model type
model = RandomForestClassifier()

# Fit the model to our data
model.fit(X_train, y_train)

RandomForestClassifier()

In [112]:
from sklearn import metrics
#y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# r2_train = metrics.r2_score(y_train, y_train_pred)
# r2_test = metrics.r2_score(y_test, y_test_pred)

#auc_train = metrics.roc_auc_score(y_train, y_train_pred)
auc_test = metrics.roc_auc_score(y_test, y_test_pred)

Feature names unseen at fit time:
- day_26
- day_27
- day_28
- day_29
- day_30
Feature names must be in the same order as they were in fit.



ValueError: X has 477 features, but RandomForestClassifier is expecting 472 features as input.

In [14]:
# Old version

In [5]:
df = pd.read_csv('202109-stations-capacity/all_data.csv')

In [6]:
df

Unnamed: 0,time,capacity,station,increment,decrement,wait_time,year,month,day_of_week,day,hour,minute,second,seconds_from_hour
0,00:10.3,15,The Dimock Center,0,1,0,2021,9,Wednesday,1,0,0,10,10
1,00:11.2,18,Boylston St at Fairfield St,0,1,0,2021,9,Wednesday,1,0,0,11,11
2,00:11.4,17,Boylston St at Fairfield St,0,1,0,2021,9,Wednesday,1,0,0,11,11
3,00:13.3,14,Congress St at Northern Ave,0,1,0,2021,9,Wednesday,1,0,0,13,13
4,00:21.2,24,Inman Square at Springfield St.,0,1,0,2021,9,Wednesday,1,0,0,21,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
862139,45:24.9,105,E Cottage St at Columbia Rd,1,0,0,2021,10,Sunday,3,21,45,24,2724
862140,28:07.5,13,Christian Science Plaza - Massachusetts Ave at...,1,0,0,2021,10,Monday,4,0,28,7,1687
862141,05:12.1,27,Boston Landing,1,0,0,2021,10,Monday,4,10,5,12,312
862142,41:33.2,25,Government Center - Cambridge St at Court St,1,0,0,2021,10,Monday,4,12,41,33,2493


In [5]:
features = ['station', 'day_of_week', 'day', 'hour', 'seconds_from_hour']

In [6]:
# get only instances where bike left station

#X = df.loc[:, df.columns != 'wait_time']
X = df.loc[:, df.columns != 'decrement']
X = X[X['increment']==1]
X = X[X['wait_time']!=0]
#X =X[X['station']=='Ames St at Main St']

In [7]:
X

Unnamed: 0,time,capacity,station,increment,wait_time,year,month,day_of_week,day,hour,minute,second,seconds_from_hour
42,03:02.3,20,Galileo Galilei Way at Main Street,1,1056,2021,9,Wednesday,1,0,3,2,182
54,03:54.5,15,Lesley University,1,1034,2021,9,Wednesday,1,0,3,54,234
63,04:46.0,21,Beacon St at Massachusetts Ave,1,664,2021,9,Wednesday,1,0,4,46,286
70,05:12.1,24,Roxbury Crossing T Stop - Columbus Ave at Trem...,1,1744,2021,9,Wednesday,1,0,5,12,312
85,05:55.7,16,Cambridge St at Joy St,1,506,2021,9,Wednesday,1,0,5,55,355
...,...,...,...,...,...,...,...,...,...,...,...,...,...
862100,09:39.9,35,South Station - 700 Atlantic Ave,1,5826,2021,10,Friday,1,1,9,39,579
862104,46:46.1,23,South Station - 700 Atlantic Ave,1,125752,2021,10,Friday,1,2,46,46,2806
862105,01:31.3,58,Dudley Square - Bolling Building,1,67858,2021,10,Friday,1,4,1,31,91
862115,52:30.0,60,Dudley Square - Bolling Building,1,29308,2021,10,Friday,1,22,52,30,3150


In [8]:
X['wait_time'][1000:1020]

2167     3420
2168     1647
2171    52250
2173     3443
2174     2801
2175      149
2176        4
2177     4731
2178      135
2179    14368
2181     1743
2182      821
2187      630
2189     7318
2191     1206
2193      987
2194       12
2197      490
2198       30
2201      799
Name: wait_time, dtype: int64

In [9]:
y_cat = X['wait_time'] > 300
y_cat = y_cat.replace(True, 1)
y_cat = y_cat.replace(False, 0)
y_cat

42        1
54        1
63        1
70        1
85        1
         ..
862100    1
862104    1
862105    1
862115    1
862128    1
Name: wait_time, Length: 426307, dtype: int64

In [10]:
sum(y_cat)

248054

In [11]:
y = X['wait_time']

In [12]:
X = X[features]

In [13]:
X_clean = pd.get_dummies(X, columns = ['station', 'day_of_week', 'day', 'hour',], drop_first = True)


In [14]:
X_clean

Unnamed: 0,seconds_from_hour,station_160 Arsenal,station_175 N Harvard St,station_191 Beacon St,station_2 Hummingbird Lane at Olmsted Green,station_30 Dane St,station_359 Broadway - Broadway at Fayette Street,station_645 Summer St,station_699 Mt Auburn St,station_7 Acre Park,...,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23
42,182,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
54,234,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
63,286,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
70,312,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
85,355,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
862100,579,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
862104,2806,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
862105,91,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
862115,3150,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_clean, 
                                                    y_cat, 
                                                    train_size = 0.70, 
                                                    stratify= y_cat,
                                                    random_state = 1)

In [34]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier

# Select the model type
model = RandomForestClassifier()

# Fit the model to our data
model.fit(X_train, y_train)

RandomForestClassifier()

In [48]:
lnr = GridSearchCV(RandomForestClassifier(
    random_seed = 511), 
    n_estimators = [500, 1000, 2000],
    cp = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5])

lnr.fit(lnr, X_train, y_train)

NameError: name 'IAI' is not defined

In [35]:
from sklearn import metrics
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# r2_train = metrics.r2_score(y_train, y_train_pred)
# r2_test = metrics.r2_score(y_test, y_test_pred)

auc_train = metrics.roc_auc_score(y_train, y_train_pred)
auc_test = metrics.roc_auc_score(y_test, y_test_pred)

In [36]:
metrics.accuracy_score(y_test, y_test_pred)

0.6381975557692758

In [37]:
auc_test

0.6321269578021126

In [182]:
r2_train

0.370539164580836

In [183]:
r2_test

0.15192320703581041

In [61]:
y_test_pred

array([ 590.1633099 , 4414.83220943, -448.85237614, ..., 1322.60261489,
         66.30093624, -332.98419694])