In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor,LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import explained_variance_score,r2_score,accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor,RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel,WhiteKernel
%matplotlib inline

### Preprocessing

In [2]:
data = pd.read_csv('Merged_Data.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,Date,HOUR_BIN,Citibike_Y,GreenTaxi_Y,YellowTaxi_Y,MTAExit_Y,MTAEntry_Y,temp,feelslike,...,windspeed,winddir,sealevelpressure,cloudcover,visibility,solarradiation,solarenergy,uvindex,Month,Day of Week
0,0,2018-01-01,0,400,11484.0,109663.0,283254.0,299794.0,-11.575,-16.85,...,9.35,304.25,1026.925,0.0,16.0,0.0,,0.0,1,0
1,1,2018-01-01,4,233,4268.0,32370.0,137862.0,146343.0,-11.975,-16.6,...,7.775,292.75,1027.925,0.05,16.0,16.25,0.15,0.0,1,0
2,2,2018-01-01,8,1070,2275.0,42633.0,151154.0,196377.0,-9.025,-13.35,...,8.35,305.5,1028.5,0.25,15.525,330.25,1.175,3.5,1,0
3,3,2018-01-01,12,1830,4275.0,84638.0,328108.0,415589.0,-5.15,-8.175,...,7.075,320.5,1027.15,0.2,16.0,296.75,1.075,3.25,1,0
4,4,2018-01-01,16,1422,5761.0,81748.0,492717.0,594243.0,-6.35,-9.775,...,7.775,305.75,1027.75,0.2,16.0,2.5,0.0,0.0,1,0


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4380 entries, 0 to 4379
Data columns (total 27 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        4380 non-null   int64  
 1   Date              4380 non-null   object 
 2   HOUR_BIN          4380 non-null   int64  
 3   Citibike_Y        4380 non-null   int64  
 4   GreenTaxi_Y       4380 non-null   float64
 5   YellowTaxi_Y      4380 non-null   float64
 6   MTAExit_Y         4380 non-null   float64
 7   MTAEntry_Y        4380 non-null   float64
 8   temp              4380 non-null   float64
 9   feelslike         4380 non-null   float64
 10  dew               4380 non-null   float64
 11  humidity          4380 non-null   float64
 12  precip            4380 non-null   float64
 13  precipprob        4380 non-null   float64
 14  snow              4380 non-null   float64
 15  snowdepth         4380 non-null   float64
 16  windgust          2332 non-null   float64


In [4]:
data = data.drop(['solarenergy','windgust','Unnamed: 0','uvindex','solarradiation'],axis=1).dropna().reset_index()
data

Unnamed: 0,index,Date,HOUR_BIN,Citibike_Y,GreenTaxi_Y,YellowTaxi_Y,MTAExit_Y,MTAEntry_Y,temp,feelslike,...,precipprob,snow,snowdepth,windspeed,winddir,sealevelpressure,cloudcover,visibility,Month,Day of Week
0,0,2018-01-01,0,400,11484.0,109663.0,283254.0,299794.0,-11.575000,-16.850000,...,0.000000,0.0,0.0,9.350000,304.25,1026.925000,0.000000,16.000,1,0
1,1,2018-01-01,4,233,4268.0,32370.0,137862.0,146343.0,-11.975000,-16.600000,...,0.000000,0.0,0.0,7.775000,292.75,1027.925000,0.050000,16.000,1,0
2,2,2018-01-01,8,1070,2275.0,42633.0,151154.0,196377.0,-9.025000,-13.350000,...,0.000000,0.0,0.0,8.350000,305.50,1028.500000,0.250000,15.525,1,0
3,3,2018-01-01,12,1830,4275.0,84638.0,328108.0,415589.0,-5.150000,-8.175000,...,0.000000,0.0,0.0,7.075000,320.50,1027.150000,0.200000,16.000,1,0
4,4,2018-01-01,16,1422,5761.0,81748.0,492717.0,594243.0,-6.350000,-9.775000,...,0.000000,0.0,0.0,7.775000,305.75,1027.750000,0.200000,16.000,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4375,4375,2019-12-31,4,3150,516.0,12701.0,137058.0,175073.0,4.125000,1.750000,...,0.000000,0.0,0.0,9.925000,265.00,1006.225000,48.625000,12.900,12,1
4376,4376,2019-12-31,8,8079,2153.0,41939.0,588964.0,736485.0,4.625000,3.225000,...,0.000000,0.0,0.0,6.050000,317.00,1005.525000,32.825000,16.000,12,1
4377,4377,2019-12-31,12,12011,3451.0,62170.0,796791.0,997440.0,6.425000,3.400000,...,75.000000,0.0,0.0,15.950000,254.00,1003.450000,24.175000,16.000,12,1
4378,4378,2019-12-31,16,6088,4469.0,64740.0,973974.0,1236863.0,6.575000,4.200000,...,25.000000,0.0,0.0,12.475000,258.50,1003.850000,42.675000,16.000,12,1


In [5]:
def split_X_and_Y(data):
    list_=[]
    for i in data.columns:
        if '_Y' not in i:
            list_.append(i)
    X = data[list_].drop('Date',axis=1)
    Y = data[['MTAExit_Y','MTAEntry_Y','Citibike_Y','GreenTaxi_Y','YellowTaxi_Y']]
    return X,Y
X,Y = split_X_and_Y(data)

In [6]:
X.describe()

Unnamed: 0,index,HOUR_BIN,temp,feelslike,dew,humidity,precip,precipprob,snow,snowdepth,windspeed,winddir,sealevelpressure,cloudcover,visibility,Month,Day of Week
count,4380.0,4380.0,4380.0,4380.0,4380.0,4380.0,4380.0,4380.0,4380.0,4380.0,4380.0,4380.0,4380.0,4380.0,4380.0,4380.0,4380.0
mean,2189.5,10.0,32.767629,31.641096,23.847451,66.411185,0.168929,14.600457,0.009938,0.414431,9.182047,187.238761,1017.640556,23.26188,12.711252,6.526027,2.993151
std,1264.541419,6.83208,28.604967,29.87281,26.569764,17.55204,2.455728,28.3919,0.081623,1.877873,5.693534,99.282969,8.067116,22.528808,3.449083,3.448245,2.001928
min,0.0,0.0,-13.9,-21.575,-20.025,15.27,0.0,0.0,0.0,0.0,0.0,0.0,988.0,0.0,1.15,1.0,0.0
25%,1094.75,4.0,8.625,6.9,1.7,53.334167,0.0,0.0,0.0,0.0,5.15,88.0,1012.4,1.05,9.9,4.0,1.0
50%,2189.5,10.0,22.170833,21.825,15.525,65.510417,0.0,0.0,0.0,0.0,7.925,217.375,1017.75,17.1375,14.3875,7.0,3.0
75%,3284.25,16.0,62.70625,62.70625,50.7,81.0,0.00225,25.0,0.0,0.0,11.88125,268.0,1022.85,44.40625,16.0,10.0,5.0
max,4379.0,20.0,95.75,108.2,75.925,99.84,73.935,100.0,2.6675,19.36,39.1,359.75,1043.725,100.0,16.0,12.0,6.0


In [7]:
Y.describe()

Unnamed: 0,MTAExit_Y,MTAEntry_Y,Citibike_Y,GreenTaxi_Y,YellowTaxi_Y
count,4380.0,4380.0,4380.0,4380.0,4380.0
mean,611718.6,801636.6,8634.94863,4313.719178,66937.084932
std,358182.7,499633.7,7085.599134,2488.694153,31210.733647
min,60531.0,62412.0,0.0,364.0,6707.0
25%,283144.0,303971.8,2238.5,2153.0,38371.75
50%,605673.5,777152.5,7285.5,4135.5,73753.0
75%,928769.2,1229888.0,13489.0,6272.25,91509.5
max,1370140.0,1765634.0,33112.0,13098.0,135165.0


In [8]:
list_ = []
for i in Y.columns:
    list_.append(pd.cut(Y[i],30,labels=range(1,31)))
Y = pd.concat(list_,axis=1)
Y.head()

Unnamed: 0,MTAExit_Y,MTAEntry_Y,Citibike_Y,GreenTaxi_Y,YellowTaxi_Y
0,6,5,1,27,25
1,2,2,1,10,6
2,3,3,1,5,9
3,7,7,2,10,19
4,10,10,2,13,18


In [9]:
Y.describe()

Unnamed: 0,MTAExit_Y,MTAEntry_Y,Citibike_Y,GreenTaxi_Y,YellowTaxi_Y
count,4380,4380,4380,4380,4380
unique,30,30,30,29,30
top,1,1,1,2,20
freq,355,497,725,347,276


In [10]:
categorical = ['HOUR_BIN','Month','Day of Week']

def onehotencoder(X,categorical):
    for i in categorical:
        X = X.drop(i,axis=1).merge(pd.get_dummies(X[i],prefix=i),left_index=True, right_index=True)
    return X
X_encoded = onehotencoder(X,categorical)

In [11]:
X_encoded.columns

Index(['index', 'temp', 'feelslike', 'dew', 'humidity', 'precip', 'precipprob',
       'snow', 'snowdepth', 'windspeed', 'winddir', 'sealevelpressure',
       'cloudcover', 'visibility', 'HOUR_BIN_0', 'HOUR_BIN_4', 'HOUR_BIN_8',
       'HOUR_BIN_12', 'HOUR_BIN_16', 'HOUR_BIN_20', 'Month_1', 'Month_2',
       'Month_3', 'Month_4', 'Month_5', 'Month_6', 'Month_7', 'Month_8',
       'Month_9', 'Month_10', 'Month_11', 'Month_12', 'Day of Week_0',
       'Day of Week_1', 'Day of Week_2', 'Day of Week_3', 'Day of Week_4',
       'Day of Week_5', 'Day of Week_6'],
      dtype='object')

### Creating a metric to evaluate the model

In [12]:
def evaluatemodel(y_pred,y_test,error=10):
    y_test = np.array(y_test).ravel()
    minimum = y_test-(error)
    maximum = y_test+(error)
    temp = (y_pred<=maximum) & (y_pred>=minimum)
    return temp.sum()/len(y_test)

### Finding and removing outliers in data to prevent the model from skewing

In [13]:
def removedensityoutliers(X_encoded,Y,factor=300):
    n = len(Y)//factor
    temp = Y.copy()
    anom = LocalOutlierFactor(n_neighbors=n, novelty=False)
    Y_Out = anom.fit_predict(temp)
    temp['Outlier'] = Y_Out
    indexes = temp[temp['Outlier'] == 1].index
    return X_encoded.iloc[indexes].reset_index(drop=True),Y.iloc[indexes].reset_index(drop=True)
X_Processed,Y_Processed = removedensityoutliers(X_encoded,Y)

  X = check_array(X, **check_params)


In [14]:
X_Processed.shape[0]/X_encoded.shape[0]

0.9659817351598173

In [15]:
Y.describe()

Unnamed: 0,MTAExit_Y,MTAEntry_Y,Citibike_Y,GreenTaxi_Y,YellowTaxi_Y
count,4380,4380,4380,4380,4380
unique,30,30,30,29,30
top,1,1,1,2,20
freq,355,497,725,347,276


In [16]:
#Y = Y[['MTAExit_Y']]

In [17]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

### Building a Model

In [18]:
def Model(Outlier_Factor = 150,error = 1):
    Outlier_Factor = Outlier_Factor
    error = error

    #Testing out different models
    param_grid_lr = {}
    lr = LinearRegression()

    param_grid_svr = {'kernel':['rbf'],'degree':range(30,31),'gamma':['scale','auto']}
    svr = SVR()

    param_grid_sgd = {'penalty':['l1','l2'],'alpha':[1e-6,1e-5,1e-4,1e-3,1e-2],'max_iter':[100000]}
    sgd=SGDRegressor()


    param_grid_dtc = {'random_state':[123],'max_depth':[None],'splitter':['best'],'criterion':['gini', 'entropy'],'max_features':['auto','sqrt','log2',None],'ccp_alpha':[1e-2,1e-3]}
    dtc=DecisionTreeClassifier()


    #param_grid_rfr = {'n_estimators':[1000],'criterion':['squared_error','absolute_error','poisson'],'max_features':['sqrt','log2',None],'ccp_alpha':[1e-2,1e-3]}
    param_grid_rfc = {'ccp_alpha':[1e-2],'criterion': ['entropy'],'max_features': [None],'n_estimators': [1000]}  #Obtained after training
    rfc = RandomForestClassifier()

    param_grid_gbr = {'loss':['squared_error','absolute_error'],'criterion': ['friedman_mse','squared_error'],'learning_rate':[0.01,0.1],'n_estimators':[1000,2000],'ccp_alpha':[1,1e-1]}
    gbr = GradientBoostingRegressor()


    IS = []
    OS = []

    for k in Y.columns:
        X_Processed,Y_Processed = removedensityoutliers(X_encoded,Y,Outlier_Factor)
        Y_array = np.array(Y_Processed[k])
        X_train, X_test, y_train, y_test = train_test_split(X_Processed,Y_array, test_size=0.25, random_state=123)

        #finding the best correlators from training data
        Xs = X_train.columns

        #Using only those features
        X_train = np.array(X_train[Xs])
        X_test = np.array(X_test[Xs])

        #Standardizing the data
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.fit_transform(X_test)
        #Using gridsearch for hyperparamter optimization


        #Linear methods

        #gr=GridSearchCV(lr,param_grid=param_grid_lr,cv=5)
        #gr=GridSearchCV(svr,param_grid=param_grid_svr,cv=5)
        #gr=GridSearchCV(sgd,param_grid=param_grid_sgd,cv=5)
        #gr.fit(X_train,np.log(np.array(y_train)).ravel())


        #Tree and ensemble based

        #gr=GridSearchCV(dtc,param_grid=param_grid_dtc,cv=3)
        gr=GridSearchCV(rfc,param_grid=param_grid_rfc,cv=3,verbose=2)
        #gr =GridSearchCV(gbr,param_grid=param_grid_gbr,cv=5,verbose=1)
        gr.fit(X_train,np.array(y_train).ravel())                  
        #Output the best parameter
        print(gr.best_params_)
        y_pred = gr.predict(X_train)
        IS.append(evaluatemodel(y_pred,y_train,error=error))
        y_pred = gr.predict(X_test)
        OS.append(evaluatemodel(y_pred,y_test,error=error))
    result = pd.DataFrame(index=Y.columns)
    result['IS'] = IS
    result['OS'] = OS  
    return result

In [19]:
Model(Outlier_Factor = 250,error = 1)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END ccp_alpha=0.01, criterion=entropy, max_features=None, n_estimators=1000; total time=  40.3s
[CV] END ccp_alpha=0.01, criterion=entropy, max_features=None, n_estimators=1000; total time=  39.8s
[CV] END ccp_alpha=0.01, criterion=entropy, max_features=None, n_estimators=1000; total time=  40.1s
{'ccp_alpha': 0.01, 'criterion': 'entropy', 'max_features': None, 'n_estimators': 1000}
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END ccp_alpha=0.01, criterion=entropy, max_features=None, n_estimators=1000; total time=  40.8s
[CV] END ccp_alpha=0.01, criterion=entropy, max_features=None, n_estimators=1000; total time=  40.2s
[CV] END ccp_alpha=0.01, criterion=entropy, max_features=None, n_estimators=1000; total time=  40.7s
{'ccp_alpha': 0.01, 'criterion': 'entropy', 'max_features': None, 'n_estimators': 1000}
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END ccp_alpha=0.01, criterion=entrop



[CV] END ccp_alpha=0.01, criterion=entropy, max_features=None, n_estimators=1000; total time=  35.8s
[CV] END ccp_alpha=0.01, criterion=entropy, max_features=None, n_estimators=1000; total time=  35.1s
[CV] END ccp_alpha=0.01, criterion=entropy, max_features=None, n_estimators=1000; total time= 7.1min
{'ccp_alpha': 0.01, 'criterion': 'entropy', 'max_features': None, 'n_estimators': 1000}
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END ccp_alpha=0.01, criterion=entropy, max_features=None, n_estimators=1000; total time=  38.5s
[CV] END ccp_alpha=0.01, criterion=entropy, max_features=None, n_estimators=1000; total time= 2.5min
[CV] END ccp_alpha=0.01, criterion=entropy, max_features=None, n_estimators=1000; total time=  39.1s
{'ccp_alpha': 0.01, 'criterion': 'entropy', 'max_features': None, 'n_estimators': 1000}


Unnamed: 0,IS,OS
MTAExit_Y,0.846012,0.826408
MTAEntry_Y,0.836156,0.811634
Citibike_Y,0.778873,0.699908
GreenTaxi_Y,0.836772,0.796861
YellowTaxi_Y,0.73052,0.694367
