In [1]:
# Capital Bike Share data

In [33]:
import numpy as np

import pandas as pd
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

import statsmodels.api as sm 

In [9]:
train = pd.read_csv('train_bikes.csv', index_col=0, parse_dates=True)
test = pd.read_csv('test_bikes.csv', index_col=0, parse_dates=True)

In [10]:
# DataTimeIndex-ifying

In [11]:
# using index column which has datetime info to create a new column 

train['datetime'] = pd.to_datetime(train.index)

In [12]:
# create new columns for time series data
train['year'] = pd.DatetimeIndex(train['datetime']).year
train['month'] = pd.DatetimeIndex(train['datetime']).month
train['day'] = pd.DatetimeIndex(train['datetime']).day
train['hour'] = pd.DatetimeIndex(train['datetime']).hour

In [13]:
# datetime column not required anymore, drop

train = train.drop(['datetime'], axis=1)

In [14]:
# coercing these into categorical datatypes

cat_var_list = ['season', 'holiday', 'workingday', 'year', 'month', 'day', 'hour']

for var in cat_var_list:
    train[var] = train[var].astype('category')

In [15]:
# drop columns as discussed above
train = train.drop(['atemp'], axis=1)
train = train.drop(['casual'], axis=1)
train = train.drop(['registered'], axis=1)

In [18]:
# several outliers all around, rather noticeably in the non-office hours
# removing those that lie 3 or more standard deviations away from the mean 

train = train[np.abs(train['count'] - train['count'].mean()) <= (3 * train['count'].std())]

In [19]:
# Feature Engineering

In [20]:
# creating dummies for categorial calues, and dropping original columns 

season_dummy = pd.get_dummies(train['season'], prefix='season', drop_first=True)
train = pd.concat([train, season_dummy], axis=1)
train = train.drop(['season'], axis=1)

In [21]:
weather_dummy = pd.get_dummies(train['weather'], prefix='weather', drop_first=True)
train = pd.concat([train, weather_dummy], axis=1)
train = train.drop(['weather'], axis=1)

In [22]:
workingday_dummy = pd.get_dummies(train['workingday'], prefix='workingday', drop_first=True)
train = pd.concat([train, workingday_dummy], axis=1)
train = train.drop(['workingday'], axis=1)

In [23]:
holiday_dummy = pd.get_dummies(train['holiday'], prefix='holiday', drop_first=True)
train = pd.concat([train, holiday_dummy], axis=1)
train = train.drop(['holiday'], axis=1)

In [25]:
# MinMax Scaling for numerical columns

scaler = MinMaxScaler()
humidity_scaled = scaler.fit_transform(train[['humidity']])
train['humidity_scaled'] = humidity_scaled
train = train.drop(['humidity'], axis=1)

windspeed_scaled = scaler.fit_transform(train[['windspeed']])
train['windspeed_scaled'] = windspeed_scaled
train = train.drop(['windspeed'], axis=1)

In [27]:
# Binning temp data: into bins of 5 deg (C or F?)

train['binned_temp'] = np.floor(train['temp']) // 5

# creating dummies for binned_temp

binned_temp = pd.get_dummies(train['binned_temp'], prefix='binned_temp', drop_first=True)
train = pd.concat([train, binned_temp], axis=1)
train = train.drop(['binned_temp'], axis=1)

In [29]:
# Binning hour data: into 6 bins of 4 hours each

train['binned_hour'] = pd.cut(train['hour'], bins=6)

# creating dummies for binned_hour

binned_hour = pd.get_dummies(train['binned_hour'], prefix='binned_hour', drop_first=True)
train = pd.concat([train, binned_hour], axis=1)
train = train.drop(['binned_hour'], axis=1)

In [30]:
# creating dummies for year, month, day and hour - and droping original columns

year_dummy = pd.get_dummies(train['year'], prefix='year', drop_first=True)
train = pd.concat([train, year_dummy], axis=1)
train = train.drop(['year'], axis=1)

month_dummy = pd.get_dummies(train['month'], prefix='month', drop_first=True)
train = pd.concat([train, month_dummy], axis=1)
train = train.drop(['month'], axis=1)

day_dummy = pd.get_dummies(train['day'], prefix='day', drop_first=True)
train = pd.concat([train, day_dummy], axis=1)
train = train.drop(['day'], axis=1)

hour_dummy = pd.get_dummies(train['hour'], prefix='hour', drop_first=True)
train = pd.concat([train, hour_dummy], axis=1)
train = train.drop(['hour'], axis=1)

In [31]:
train.head()

Unnamed: 0_level_0,temp,count,season_2,season_3,season_4,weather_2,weather_3,weather_4,workingday_1,holiday_1,humidity_scaled,windspeed_scaled,binned_temp_1.0,binned_temp_2.0,binned_temp_3.0,binned_temp_4.0,binned_temp_5.0,binned_temp_6.0,binned_temp_7.0,binned_temp_8.0,"binned_hour_(3.833, 7.667]","binned_hour_(7.667, 11.5]","binned_hour_(11.5, 15.333]","binned_hour_(15.333, 19.167]","binned_hour_(19.167, 23.0]",year_2012,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12,day_2,day_3,day_4,day_5,day_6,day_7,day_8,day_9,day_10,day_11,day_12,day_13,day_14,day_15,day_16,day_17,day_18,day_19,hour_1,hour_2,hour_3,hour_4,hour_5,hour_6,hour_7,hour_8,hour_9,hour_10,hour_11,hour_12,hour_13,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1
2011-01-01 00:00:00,9.84,16,0,0,0,0,0,0,0,0,0.81,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2011-01-01 01:00:00,9.02,40,0,0,0,0,0,0,0,0,0.8,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2011-01-01 02:00:00,9.02,32,0,0,0,0,0,0,0,0,0.8,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2011-01-01 03:00:00,9.84,13,0,0,0,0,0,0,0,0,0.75,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2011-01-01 04:00:00,9.84,1,0,0,0,0,0,0,0,0,0.75,0.0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [46]:
# subjecting test data set to the same horrors

In [32]:
# Feature Selection
# will first run a basic linear regression - and use statsmodels to determine which features to select

In [45]:
# first selecting all available features 

#list(train.head())

In [39]:
X_train = train[['temp','count','season_2','season_3','season_4','weather_2','weather_3','weather_4',
'workingday_1','holiday_1','humidity_scaled','windspeed_scaled','binned_temp_1.0','binned_temp_2.0',
'binned_temp_3.0','binned_temp_4.0','binned_temp_5.0','binned_temp_6.0','binned_temp_7.0',
'binned_temp_8.0','binned_hour_(3.833, 7.667]','binned_hour_(7.667, 11.5]','binned_hour_(11.5, 15.333]',
'binned_hour_(15.333, 19.167]','binned_hour_(19.167, 23.0]','year_2012','month_2','month_3','month_4',
'month_5','month_6','month_7','month_8','month_9','month_10','month_11','month_12','day_2','day_3','day_4',
'day_5','day_6','day_7','day_8','day_9','day_10','day_11','day_12','day_13','day_14','day_15','day_16',
'day_17','day_18','day_19','hour_1','hour_2','hour_3','hour_4','hour_5','hour_6','hour_7','hour_8','hour_9',
'hour_10','hour_11','hour_12','hour_13','hour_14','hour_15','hour_16','hour_17','hour_18','hour_19',
'hour_20','hour_21','hour_22','hour_23']]

y_train = np.log1p(train['count'])

In [40]:
X_train.shape, y_train.shape

((10739, 78), (10739,))

In [41]:
linreg = LinearRegression()
linreg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [42]:
print('Linear Regression Training Score: ', linreg.score(X_train, y_train))

Linear Regression Training Score:  0.8933677207551483


In [44]:
y_pred_linreg = linreg.predict(X_test)

NameError: name 'X_test' is not defined