In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

fhvhv_covid_df = pd.read_csv('/Volumes/E/data/preprocessed_data/fhvhv_covid.csv')
fhvhv_covid_df.head()

Unnamed: 0,pickup_date,num_of_SR,trip_count,avg_trip_duration,CASE_COUNT,PROBABLE_CASE_COUNT,CASE_COUNT_7DAY_AVG,ALL_CASE_COUNT_7DAY_AVG,BX_CASE_COUNT,BX_PROBABLE_CASE_COUNT,...,MN_ALL_CASE_COUNT_7DAY_AVG,QN_CASE_COUNT,QN_PROBABLE_CASE_COUNT,QN_CASE_COUNT_7DAY_AVG,QN_ALL_CASE_COUNT_7DAY_AVG,SI_CASE_COUNT,SI_PROBABLE_CASE_COUNT,SI_CASE_COUNT_7DAY_AVG,SI_ALL_CASE_COUNT_7DAY_AVG,INCOMPLETE
0,2020-09-01,34,344879,17.833136,294,34,255,278,52,9,...,48,73,4,58,62,24,1,18,19,0
1,2020-09-02,42,357969,17.71804,271,25,249,272,42,6,...,48,69,3,58,62,17,0,18,19,0
2,2020-09-03,26,391021,18.258425,274,29,250,274,41,5,...,46,90,11,62,67,17,4,19,20,0
3,2020-09-04,6,425570,19.115868,230,19,243,267,44,1,...,43,59,4,63,68,26,2,20,21,0
4,2020-09-05,0,434656,17.152276,156,11,247,269,30,3,...,43,34,3,63,68,24,0,22,23,0


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error,mean_absolute_error 
from sklearn.metrics import r2_score

from numpy import absolute
from numpy import mean
from numpy import std

from statsmodels.formula.api import ols,glm
from statsmodels.api import families

In [8]:
# Select a sample of interest and see if all the predictors are significant
# Use Poisson distribution because trip count is discrete
# Codes are based on Lab 3's materials

fit = glm(
    formula="trip_count ~ num_of_SR + CASE_COUNT + PROBABLE_CASE_COUNT + CASE_COUNT_7DAY_AVG + ALL_CASE_COUNT_7DAY_AVG",
    data=fhvhv_covid_df,family=families.Poisson()).fit()

print(fit.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:             trip_count   No. Observations:                  122
Model:                            GLM   Df Residuals:                      116
Model Family:                 Poisson   Df Model:                            5
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -3.5754e+05
Date:                Thu, 12 Aug 2021   Deviance:                   7.1328e+05
Time:                        21:32:18   Pearson chi2:                 7.24e+05
No. Iterations:                     4                                         
Covariance Type:            nonrobust                                         
                              coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------
Intercept                 

# Ridge regression model

In [18]:
X = fhvhv_covid_df.drop(['pickup_date','trip_count'],axis=1).values
y = fhvhv_covid_df['trip_count'].values
y = y.reshape(-1, 1)

# Standardize data
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)
scaler.fit(y)
y = scaler.transform(y)

# Split train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

rr = Ridge(alpha=0.01)
rr.fit(X_train, y_train)
pred_train = rr.predict(X_train)

# Compute MAE, MSE, RMSE, R2 to evalute the performance of the model
print("MAE on train data: ", mean_absolute_error(y_train, pred_train))
print("MSE on train data: ", mean_squared_error(y_train, pred_train))
print("RMSE on train data: ", np.sqrt(mean_squared_error(y_train,pred_train)))
print("R2 score on train data: ", r2_score(y_train, pred_train))

rr.fit(X_test, y_test)
pred_test = rr.predict(X_test)
print("\nMAE on test data: ", mean_absolute_error(y_test, pred_test))
print("MSE on test data: ", mean_squared_error(y_test, pred_test))
print("RMSE on test data: ",np.sqrt(mean_squared_error(y_test,pred_test)))
print("R2 score on test data: ", r2_score(y_test, pred_test))

MAE on train data:  0.49765430691405615
MSE on train data:  0.4263464166903157
RMSE on train data:  0.6529520784026311
R2 score on train data:  0.5645833008921182

MAE on test data:  0.4605268377162081
MSE on test data:  0.3367727708588516
RMSE on test data:  0.5803212652133743
R2 score on test data:  0.6755462621422497


# Hyperparameter tuning

In [19]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold

# Define model evaluation method
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# Define grid
grid = {'alpha':[0.01, 0.1, 1, 10]}

search = GridSearchCV(rr, grid, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)

results = search.fit(X, y)

print('MAE: %.3f' % results.best_score_)
print('Config: %s' % results.best_params_)

MAE: -0.689
Config: {'alpha': 0.01}
