In [1]:
# Capital Bike Share data - Feature Selection

In [54]:
import numpy as np

import pandas as pd
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

import statsmodels.api as sm 

In [3]:
train = pd.read_csv('train_bikes.csv', index_col=0, parse_dates=True)
test = pd.read_csv('test_bikes.csv', index_col=0, parse_dates=True)

In [4]:
# DataTimeIndex-ifying

In [5]:
# using index column which has datetime info to create a new column 

train['datetime'] = pd.to_datetime(train.index)

In [6]:
# create new columns for time series data
train['year'] = pd.DatetimeIndex(train['datetime']).year
train['month'] = pd.DatetimeIndex(train['datetime']).month
train['day'] = pd.DatetimeIndex(train['datetime']).day
train['hour'] = pd.DatetimeIndex(train['datetime']).hour

In [7]:
# datetime column not required anymore, drop

train = train.drop(['datetime'], axis=1)

In [8]:
# coercing these into categorical datatypes

cat_var_list = ['season', 'holiday', 'workingday', 'year', 'month', 'day', 'hour']

for var in cat_var_list:
    train[var] = train[var].astype('category')

In [9]:
# drop columns as discussed above
train = train.drop(['atemp'], axis=1)
train = train.drop(['casual'], axis=1)
train = train.drop(['registered'], axis=1)

In [10]:
# several outliers all around, rather noticeably in the non-office hours
# removing those that lie 3 or more standard deviations away from the mean 

train = train[np.abs(train['count'] - train['count'].mean()) <= (3 * train['count'].std())]

In [11]:
# Feature Engineering

In [12]:
# creating dummies for categorial calues, and dropping original columns 

season_dummy = pd.get_dummies(train['season'], prefix='season', drop_first=True)
train = pd.concat([train, season_dummy], axis=1)
train = train.drop(['season'], axis=1)

In [13]:
weather_dummy = pd.get_dummies(train['weather'], prefix='weather', drop_first=True)
train = pd.concat([train, weather_dummy], axis=1)
train = train.drop(['weather'], axis=1)

In [14]:
workingday_dummy = pd.get_dummies(train['workingday'], prefix='workingday', drop_first=True)
train = pd.concat([train, workingday_dummy], axis=1)
train = train.drop(['workingday'], axis=1)

In [15]:
holiday_dummy = pd.get_dummies(train['holiday'], prefix='holiday', drop_first=True)
train = pd.concat([train, holiday_dummy], axis=1)
train = train.drop(['holiday'], axis=1)

In [16]:
# MinMax Scaling for numerical columns

scaler = MinMaxScaler()
humidity_scaled = scaler.fit_transform(train[['humidity']])
train['humidity_scaled'] = humidity_scaled
train = train.drop(['humidity'], axis=1)

windspeed_scaled = scaler.fit_transform(train[['windspeed']])
train['windspeed_scaled'] = windspeed_scaled
train = train.drop(['windspeed'], axis=1)

In [17]:
# Binning temp data: into bins of 5 deg (C or F?)

train['binned_temp'] = np.floor(train['temp']) // 5

# creating dummies for binned_temp

binned_temp = pd.get_dummies(train['binned_temp'], prefix='binned_temp', drop_first=True)
train = pd.concat([train, binned_temp], axis=1)
train = train.drop(['binned_temp'], axis=1)

In [18]:
# Binning hour data: into 6 bins of 4 hours each

train['binned_hour'] = pd.cut(train['hour'], bins=6)

# creating dummies for binned_hour

binned_hour = pd.get_dummies(train['binned_hour'], prefix='binned_hour', drop_first=True)
train = pd.concat([train, binned_hour], axis=1)
train = train.drop(['binned_hour'], axis=1)

In [19]:
# creating dummies for year, month, day and hour - and droping original columns

year_dummy = pd.get_dummies(train['year'], prefix='year', drop_first=True)
train = pd.concat([train, year_dummy], axis=1)
train = train.drop(['year'], axis=1)

month_dummy = pd.get_dummies(train['month'], prefix='month', drop_first=True)
train = pd.concat([train, month_dummy], axis=1)
train = train.drop(['month'], axis=1)

# day info different for train and test datasets, cannot be compared, drop
train = train.drop(['day'], axis=1)

hour_dummy = pd.get_dummies(train['hour'], prefix='hour', drop_first=True)
train = pd.concat([train, hour_dummy], axis=1)
train = train.drop(['hour'], axis=1)

In [20]:
train.head()

Unnamed: 0_level_0,temp,count,season_2,season_3,season_4,weather_2,weather_3,weather_4,workingday_1,holiday_1,humidity_scaled,windspeed_scaled,binned_temp_1.0,binned_temp_2.0,binned_temp_3.0,binned_temp_4.0,binned_temp_5.0,binned_temp_6.0,binned_temp_7.0,binned_temp_8.0,"binned_hour_(3.833, 7.667]","binned_hour_(7.667, 11.5]","binned_hour_(11.5, 15.333]","binned_hour_(15.333, 19.167]","binned_hour_(19.167, 23.0]",year_2012,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12,hour_1,hour_2,hour_3,hour_4,hour_5,hour_6,hour_7,hour_8,hour_9,hour_10,hour_11,hour_12,hour_13,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1
2011-01-01 00:00:00,9.84,16,0,0,0,0,0,0,0,0,0.81,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2011-01-01 01:00:00,9.02,40,0,0,0,0,0,0,0,0,0.8,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2011-01-01 02:00:00,9.02,32,0,0,0,0,0,0,0,0,0.8,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2011-01-01 03:00:00,9.84,13,0,0,0,0,0,0,0,0,0.75,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2011-01-01 04:00:00,9.84,1,0,0,0,0,0,0,0,0,0.75,0.0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [21]:
# subjecting test data set to the same horrors

In [22]:
# DataTimeIndex-ifying

test['datetime'] = pd.to_datetime(test.index)

test['year'] = pd.DatetimeIndex(test['datetime']).year
test['month'] = pd.DatetimeIndex(test['datetime']).month
test['day'] = pd.DatetimeIndex(test['datetime']).day
test['hour'] = pd.DatetimeIndex(test['datetime']).hour

test = test.drop(['datetime'], axis=1)

In [23]:
# coercing these into categorical datatypes

cat_var_list = ['season', 'holiday', 'workingday', 'year', 'month', 'day', 'hour']

for var in cat_var_list:
    test[var] = test[var].astype('category')

In [24]:
# drop columns as discussed above

test = test.drop(['atemp'], axis=1)

In [25]:
# outlier removal not possible as test data has no counts

# test = test[np.abs(test['count'] - test['count'].mean()) <= (3 * test['count'].std())]

In [26]:
# Feature Engineering

In [27]:
# creating dummies for categorial calues, and dropping original columns 

season_dummy = pd.get_dummies(test['season'], prefix='season', drop_first=True)
test = pd.concat([test, season_dummy], axis=1)
test = test.drop(['season'], axis=1)

weather_dummy = pd.get_dummies(test['weather'], prefix='weather', drop_first=True)
test = pd.concat([test, weather_dummy], axis=1)
test = test.drop(['weather'], axis=1)

workingday_dummy = pd.get_dummies(test['workingday'], prefix='workingday', drop_first=True)
test = pd.concat([test, workingday_dummy], axis=1)
test = test.drop(['workingday'], axis=1)

holiday_dummy = pd.get_dummies(test['holiday'], prefix='holiday', drop_first=True)
test = pd.concat([test, holiday_dummy], axis=1)
test = test.drop(['holiday'], axis=1)

In [28]:
# MinMax Scaling for numerical columns

scaler = MinMaxScaler()
humidity_scaled = scaler.fit_transform(test[['humidity']])
test['humidity_scaled'] = humidity_scaled
test = test.drop(['humidity'], axis=1)

windspeed_scaled = scaler.fit_transform(test[['windspeed']])
test['windspeed_scaled'] = windspeed_scaled
test = test.drop(['windspeed'], axis=1)

In [29]:
# Binning temp data: into bins of 5 deg (C or F?)

test['binned_temp'] = np.floor(test['temp']) // 5

# creating dummies for binned_temp

binned_temp = pd.get_dummies(test['binned_temp'], prefix='binned_temp', drop_first=True)
test = pd.concat([test, binned_temp], axis=1)
test = test.drop(['binned_temp'], axis=1)

In [30]:
# Binning hour data: into 6 bins of 4 hours each

test['binned_hour'] = pd.cut(test['hour'], bins=6)

# creating dummies for binned_hour

binned_hour = pd.get_dummies(test['binned_hour'], prefix='binned_hour', drop_first=True)
test = pd.concat([test, binned_hour], axis=1)
test = test.drop(['binned_hour'], axis=1)

In [31]:
# creating dummies for year, month, day and hour - and droping original columns

year_dummy = pd.get_dummies(test['year'], prefix='year', drop_first=True)
test = pd.concat([test, year_dummy], axis=1)
test = test.drop(['year'], axis=1)

month_dummy = pd.get_dummies(test['month'], prefix='month', drop_first=True)
test = pd.concat([test, month_dummy], axis=1)
test = test.drop(['month'], axis=1)

# day info different for train and test datasets, cannot be compared, drop
test = test.drop(['day'], axis=1)

hour_dummy = pd.get_dummies(test['hour'], prefix='hour', drop_first=True)
test = pd.concat([test, hour_dummy], axis=1)
test = test.drop(['hour'], axis=1)

In [32]:
test.head()

Unnamed: 0_level_0,temp,season_2,season_3,season_4,weather_2,weather_3,weather_4,workingday_1,holiday_1,humidity_scaled,windspeed_scaled,binned_temp_1.0,binned_temp_2.0,binned_temp_3.0,binned_temp_4.0,binned_temp_5.0,binned_temp_6.0,binned_temp_7.0,binned_temp_8.0,"binned_hour_(3.833, 7.667]","binned_hour_(7.667, 11.5]","binned_hour_(11.5, 15.333]","binned_hour_(15.333, 19.167]","binned_hour_(19.167, 23.0]",year_2012,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12,hour_1,hour_2,hour_3,hour_4,hour_5,hour_6,hour_7,hour_8,hour_9,hour_10,hour_11,hour_12,hour_13,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1
2011-01-20 00:00:00,10.66,0,0,0,0,0,0,1,0,0.47619,0.464346,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2011-01-20 01:00:00,10.66,0,0,0,0,0,0,1,0,0.47619,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2011-01-20 02:00:00,10.66,0,0,0,0,0,0,1,0,0.47619,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2011-01-20 03:00:00,10.66,0,0,0,0,0,0,1,0,0.47619,0.196458,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2011-01-20 04:00:00,10.66,0,0,0,0,0,0,1,0,0.47619,0.196458,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [33]:
# Feature Selection
# will first run a basic linear regression - and use statsmodels to determine which features to select

In [34]:
# first selecting all available features 

#list(train.head())
#list(test.head())

In [35]:
X_train = train[['temp','season_2','season_3','season_4','weather_2','weather_3','weather_4',
'workingday_1','holiday_1','humidity_scaled','windspeed_scaled','binned_temp_1.0','binned_temp_2.0',
'binned_temp_3.0','binned_temp_4.0','binned_temp_5.0','binned_temp_6.0','binned_temp_7.0',
'binned_temp_8.0','binned_hour_(3.833, 7.667]','binned_hour_(7.667, 11.5]','binned_hour_(11.5, 15.333]',
'binned_hour_(15.333, 19.167]','binned_hour_(19.167, 23.0]','year_2012','month_2','month_3','month_4',
'month_5','month_6','month_7','month_8','month_9','month_10','month_11','month_12','hour_1','hour_2','hour_3',
'hour_4','hour_5','hour_6','hour_7','hour_8','hour_9','hour_10','hour_11','hour_12','hour_13','hour_14',
'hour_15','hour_16','hour_17','hour_18','hour_19','hour_20','hour_21','hour_22','hour_23']].to_numpy()

y_train = np.log1p(train['count'])

In [36]:
X_train.shape, y_train.shape

((10739, 59), (10739,))

In [37]:
X_test = test[['temp','season_2','season_3','season_4','weather_2','weather_3','weather_4',
'workingday_1','holiday_1','humidity_scaled','windspeed_scaled','binned_temp_1.0','binned_temp_2.0',
'binned_temp_3.0','binned_temp_4.0','binned_temp_5.0','binned_temp_6.0','binned_temp_7.0',
'binned_temp_8.0','binned_hour_(3.833, 7.667]','binned_hour_(7.667, 11.5]','binned_hour_(11.5, 15.333]',
'binned_hour_(15.333, 19.167]','binned_hour_(19.167, 23.0]','year_2012','month_2','month_3','month_4',
'month_5','month_6','month_7','month_8','month_9','month_10','month_11','month_12','hour_1','hour_2','hour_3','hour_4','hour_5','hour_6','hour_7','hour_8','hour_9',
'hour_10','hour_11','hour_12','hour_13','hour_14','hour_15','hour_16','hour_17','hour_18','hour_19',
'hour_20','hour_21','hour_22','hour_23']].to_numpy()

In [38]:
X_test.shape

(6493, 59)

In [39]:
# cool, so both train and test have same number of features 

In [48]:
linreg = LinearRegression()
linreg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [49]:
print('Linear Regression Training Score: ', linreg.score(X_train, y_train))

Linear Regression Training Score:  0.8297979666044782


In [50]:
# attn in the predicted y for the *training* dataset

y_train_pred_linreg = linreg.predict(X_train)

In [51]:
# MSE for *training* dataset

print('Training Mean Squared Error: ', mean_squared_error(y_train, y_train_pred_linreg))

Training Mean Squared Error:  0.3370277761872355


In [52]:
# Statistical Analysis to determine which of above features are significant

In [60]:
X = sm.add_constant(X_train)
y = y_train
model = sm.OLS(y, X).fit()
predictions = model.predict(X)
results = model.summary()
print(results)

                            OLS Regression Results                            
Dep. Variable:                  count   R-squared:                       0.830
Model:                            OLS   Adj. R-squared:                  0.829
Method:                 Least Squares   F-statistic:                     1022.
Date:                Tue, 20 Oct 2020   Prob (F-statistic):               0.00
Time:                        16:44:56   Log-Likelihood:                -9398.2
No. Observations:               10739   AIC:                         1.890e+04
Df Residuals:                   10687   BIC:                         1.928e+04
Df Model:                          51                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.7476      0.067     40.789      0.0

In [None]:
# R-squared value is 0.83 - assumption of linearity holds true 
# 7 or 8 features do not appear to be important - will remove those from future analysis 

In [None]:
# Selected features for further model optimization

X_train = train[['temp','season_2','season_3','season_4','weather_2','weather_3','workingday_1',
'humidity_scaled','windspeed_scaled','binned_temp_1.0','binned_temp_2.0','binned_temp_3.0','binned_temp_4.0',
'binned_temp_5.0','binned_hour_(3.833, 7.667]','binned_hour_(7.667, 11.5]','binned_hour_(11.5, 15.333]',
'binned_hour_(15.333, 19.167]','binned_hour_(19.167, 23.0]','year_2012','month_2','month_3',
'month_5','month_6','month_7','month_8','month_9','month_10','month_11','month_12','hour_1','hour_2','hour_3',
'hour_4','hour_5','hour_6','hour_7','hour_8','hour_9','hour_11','hour_12','hour_13','hour_14',
'hour_15','hour_16','hour_17','hour_18','hour_19','hour_20','hour_21','hour_22','hour_23']].to_numpy()

y_train = np.log1p(train['count'])