In [539]:
import pandas as pd
import numpy as np
import seaborn as sns

In [540]:
df = pd.read_csv('data/bikeshare.csv')

In [541]:
train = pd.read_csv('data/bikeshare.csv').iloc[:5443]
test  = pd.read_csv('data/bikeshare.csv').iloc[5443:] 

In [542]:
train.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count
0,2011-01-01 00:00:00,Spring,0,0,Clear Skies,9.84,14.395,81,0.0,16
1,2011-01-01 01:00:00,Spring,0,0,Clear Skies,9.02,13.635,80,0.0,40
2,2011-01-01 02:00:00,Spring,0,0,Clear Skies,9.02,13.635,80,0.0,32
3,2011-01-01 03:00:00,Spring,0,0,Clear Skies,9.84,14.395,75,0.0,13
4,2011-01-01 04:00:00,Spring,0,0,Clear Skies,9.84,14.395,75,0.0,1


In [543]:
test.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count
5443,2012-01-01 21:00:00,Spring,0,0,Partly Cloudy,18.04,21.97,67,16.9979,71
5444,2012-01-01 22:00:00,Spring,0,0,Clear Skies,18.86,22.725,55,27.9993,66
5445,2012-01-01 23:00:00,Spring,0,0,Clear Skies,18.04,21.97,51,19.9995,29
5446,2012-01-02 00:00:00,Spring,1,0,Clear Skies,16.4,20.455,40,31.0009,39
5447,2012-01-02 01:00:00,Spring,1,0,Clear Skies,14.76,16.665,43,27.9993,12


# Exploratory Data Analysis

Including: total nulls, index, data types, shape, summary statistics, and the number of unique values for each column

In [544]:
train.isnull().sum()

datetime      0
season        0
holiday       0
workingday    0
weather       0
temp          0
atemp         0
humidity      0
windspeed     0
count         0
dtype: int64

In [545]:
test.isnull().sum()

datetime      0
season        0
holiday       0
workingday    0
weather       0
temp          0
atemp         0
humidity      0
windspeed     0
count         0
dtype: int64

In [546]:
print(np.shape(test))
print(np.shape(train))

(5443, 10)
(5443, 10)


In [547]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5443 entries, 5443 to 10885
Data columns (total 10 columns):
datetime      5443 non-null object
season        5443 non-null object
holiday       5443 non-null int64
workingday    5443 non-null int64
weather       5443 non-null object
temp          5443 non-null float64
atemp         5443 non-null float64
humidity      5443 non-null int64
windspeed     5443 non-null float64
count         5443 non-null int64
dtypes: float64(3), int64(4), object(3)
memory usage: 425.3+ KB


In [548]:
train.info()
# most columns are numeric (int or float)
# datetime is an object - will need to convert this to "datetime"
# Season & weather are strings. These columns are categorical and should
# be converted to numerical

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5443 entries, 0 to 5442
Data columns (total 10 columns):
datetime      5443 non-null object
season        5443 non-null object
holiday       5443 non-null int64
workingday    5443 non-null int64
weather       5443 non-null object
temp          5443 non-null float64
atemp         5443 non-null float64
humidity      5443 non-null int64
windspeed     5443 non-null float64
count         5443 non-null int64
dtypes: float64(3), int64(4), object(3)
memory usage: 425.3+ KB


In [549]:
train.nunique()
#train has 3 values in weather, while test has 4

datetime      5443
season           4
holiday          2
workingday       2
weather          3
temp            45
atemp           54
humidity        87
windspeed       27
count          557
dtype: int64

In [550]:
test.nunique()
#train has 3 values in weather, while test has 4, so will need to concat then
#for category encoding

datetime      5443
season           4
holiday          2
workingday       2
weather          4
temp            49
atemp           60
humidity        81
windspeed       25
count          813
dtype: int64

In [551]:
train['weather'].unique()
test['weather'].unique()

array(['Partly Cloudy', 'Clear Skies', 'Light Storms/Rain',
       'Heavy Storms/Rain'], dtype=object)

In [552]:
# added my own datepart colunmns to increase prediction score
train['datetime'] = pd.to_datetime(train['datetime'])
test['datetime'] = pd.to_datetime(test['datetime'])
#train['hour_of_day'] = train['datetime'].dt.hour()
train.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count
0,2011-01-01 00:00:00,Spring,0,0,Clear Skies,9.84,14.395,81,0.0,16
1,2011-01-01 01:00:00,Spring,0,0,Clear Skies,9.02,13.635,80,0.0,40
2,2011-01-01 02:00:00,Spring,0,0,Clear Skies,9.02,13.635,80,0.0,32
3,2011-01-01 03:00:00,Spring,0,0,Clear Skies,9.84,14.395,75,0.0,13
4,2011-01-01 04:00:00,Spring,0,0,Clear Skies,9.84,14.395,75,0.0,1


In [553]:
train['hour_of_day'] = train['datetime'].dt.hour.astype(str)
test['hour_of_day'] = test['datetime'].dt.hour.astype(str)

In [554]:
train['day_of_week'] = train['datetime'].dt.weekday_name
test['day_of_week'] = test['datetime'].dt.weekday_name

In [555]:
mapping = {'Clear Skies': 3, 
           'Partly Cloudy': 2,
           'Light Storms/Rain': 1,
           'Heavy Storms/Rain': 0
          }

train['weather'] = train['weather'].map(mapping)
test['weather'] = test['weather'].map(mapping)
train.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,hour_of_day,day_of_week
0,2011-01-01 00:00:00,Spring,0,0,3,9.84,14.395,81,0.0,16,0,Saturday
1,2011-01-01 01:00:00,Spring,0,0,3,9.02,13.635,80,0.0,40,1,Saturday
2,2011-01-01 02:00:00,Spring,0,0,3,9.02,13.635,80,0.0,32,2,Saturday
3,2011-01-01 03:00:00,Spring,0,0,3,9.84,14.395,75,0.0,13,3,Saturday
4,2011-01-01 04:00:00,Spring,0,0,3,9.84,14.395,75,0.0,1,4,Saturday


In [556]:
master = pd.concat([train, test])
master = pd.get_dummies(master)

In [557]:
train  = master.iloc[:5443].copy()
test   = master.iloc[5443:].copy()

In [558]:
corr = train.corr()
corr['count_abs'] = abs(corr['count'])
corr.sort_values(by='count_abs',ascending=False).head(10)
#I originially chose highly correlated values (temp, atemp, hour_of_day_17, season_Spring, humidity, hour_of_day_18,hour_of_day_4,hour_of_day_3,hour_of_day_5 )
# but my score was extremely low, so I reran with all values to increase score

Unnamed: 0,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,season_Fall,season_Spring,...,hour_of_day_8,hour_of_day_9,day_of_week_Friday,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday,count_abs
count,0.010837,-0.003955,0.133236,0.451201,0.448814,-0.283322,0.083595,1.0,0.186536,-0.308264,...,0.188122,0.031222,0.024082,0.02403,-0.001751,0.002084,-0.005384,0.001942,-0.04534,1.0
temp,0.004226,0.038313,0.045251,1.0,0.992281,-0.046221,0.00538,0.451201,0.621443,-0.598757,...,-0.041331,-0.016624,-0.020476,0.045753,-0.046498,-0.005578,0.000609,0.021809,0.005924,0.451201
atemp,-0.00273,0.036195,0.046271,0.992281,1.0,-0.025471,-0.03764,0.448814,0.59971,-0.608334,...,-0.040161,-0.018535,-0.022221,0.047783,-0.045911,-0.000183,-0.009705,0.027358,0.004325,0.448814
hour_of_day_17,-0.000333,0.000816,-0.000614,0.059258,0.054897,-0.114612,0.077216,0.328488,-0.000905,0.003087,...,-0.04382,-0.04382,-7e-05,-0.000156,-0.000849,-7e-05,-0.000156,0.001253,6.6e-05,0.328488
season_Spring,-0.03068,0.005368,-0.000912,-0.598757,-0.608334,-0.229741,0.146693,-0.308264,-0.331286,1.0,...,0.001492,0.001492,-0.010714,-0.029659,0.017318,-0.010714,-0.001418,0.008701,0.02623,0.308264
humidity,0.001535,-0.025747,-0.399264,-0.046221,-0.025471,1.0,-0.294701,-0.283322,0.068677,-0.229741,...,0.068347,0.02348,-0.044705,-0.016975,-0.008283,0.041919,-0.104305,0.028507,0.104058,0.283322
hour_of_day_18,-0.000333,0.000816,0.003627,0.044428,0.041691,-0.092063,0.059463,0.278611,-0.000905,0.003087,...,-0.04382,-0.04382,-7e-05,-0.000156,-0.000849,-7e-05,-0.000156,0.001253,6.6e-05,0.278611
hour_of_day_4,0.001674,-0.007565,0.01167,-0.050597,-0.048129,0.122763,-0.051217,-0.211858,0.006147,-0.018191,...,-0.042505,-0.042505,-0.008444,0.002064,0.004265,0.004951,-0.000649,-0.001822,-0.000434,0.211858
hour_of_day_3,0.002315,-0.010242,0.013676,-0.042402,-0.039104,0.106704,-0.06423,-0.202269,0.00621,-0.018382,...,-0.042094,-0.042094,0.001149,-0.001859,0.005899,0.006555,-0.001859,-0.005734,-0.004387,0.202269
hour_of_day_5,0.000273,0.00422,0.005505,-0.068725,-0.066349,0.117262,-0.071143,-0.202057,0.001223,-0.003334,...,-0.043419,-0.043419,0.001445,0.001332,0.000694,-0.006436,0.001332,0.000122,0.001554,0.202057


In [559]:
train.head()

Unnamed: 0,datetime,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,season_Fall,...,hour_of_day_7,hour_of_day_8,hour_of_day_9,day_of_week_Friday,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday
0,2011-01-01 00:00:00,0,0,3,9.84,14.395,81,0.0,16,0,...,0,0,0,0,0,1,0,0,0,0
1,2011-01-01 01:00:00,0,0,3,9.02,13.635,80,0.0,40,0,...,0,0,0,0,0,1,0,0,0,0
2,2011-01-01 02:00:00,0,0,3,9.02,13.635,80,0.0,32,0,...,0,0,0,0,0,1,0,0,0,0
3,2011-01-01 03:00:00,0,0,3,9.84,14.395,75,0.0,13,0,...,0,0,0,0,0,1,0,0,0,0
4,2011-01-01 04:00:00,0,0,3,9.84,14.395,75,0.0,1,0,...,0,0,0,0,0,1,0,0,0,0


In [560]:
y = np.log(train['count'])
train.drop('count', axis=1, inplace=True)
test.drop('count', axis=1, inplace=True)
train.drop('datetime', axis=1, inplace=True)
test.drop('datetime', axis=1, inplace=True)
#train = train[['temp', 'atemp', 'hour_of_day_17', 'season_Spring', 'humidity', 'hour_of_day_18','hour_of_day_4','hour_of_day_3','hour_of_day_5' ]]
#test = train[['temp', 'atemp', 'hour_of_day_17', 'season_Spring', 'humidity', 'hour_of_day_18','hour_of_day_4','hour_of_day_3','hour_of_day_5' ]]
#^ removing narrowing down values to the highly correlated values due to low score

In [561]:
#standardize data
train_means = train.mean()
train_stds  = train.std()

In [562]:
train_std = train - train_means
train_std /= train_stds

In [563]:
test -= train_means
test /= train_stds

In [564]:
#create train and validation set :
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

X_train, X_val, y_train, y_val = train_test_split(train, y, random_state=2020)

In [565]:
# run linear regression, fit on the training data and score on the validation data
lreg = LinearRegression()
lreg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [566]:
lreg.score(X_val, y_val)
#linear regression score was only a 61.. will move on to ridge & lasso as 
# a strategy to increase score

0.8117679637770393

In [567]:
lreg.predict(X_train)

array([4.88156891, 5.47251129, 1.62861633, ..., 4.83310699, 2.9625473 ,
       4.61597443])

In [568]:

train_predictions = train.copy()
train_predictions['predictions'] = pd.DataFrame(prediction)
train_predictions.head()
df.iloc[:5443].head()




Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count
0,2011-01-01 00:00:00,Spring,0,0,Clear Skies,9.84,14.395,81,0.0,16
1,2011-01-01 01:00:00,Spring,0,0,Clear Skies,9.02,13.635,80,0.0,40
2,2011-01-01 02:00:00,Spring,0,0,Clear Skies,9.02,13.635,80,0.0,32
3,2011-01-01 03:00:00,Spring,0,0,Clear Skies,9.84,14.395,75,0.0,13
4,2011-01-01 04:00:00,Spring,0,0,Clear Skies,9.84,14.395,75,0.0,1


In [569]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
lreg = LinearRegression()


In [570]:
scores = cross_val_score(estimator=lreg, X=X_train, y=y_train,cv=10) 
scores

array([0.79813805, 0.80953738, 0.78023513, 0.81415019, 0.77468127,
       0.80917822, 0.82895044, 0.81580367, 0.83210302, 0.80636103])

In [571]:
from sklearn.linear_model import Ridge, Lasso
ridge = Ridge()
lasso = Lasso()

In [572]:
alphas = np.logspace(-3,3,7)
X_train, X_test, y_train, y_test = train_test_split(train, y, test_size =0.2, random_state=2020)
ridge_scores = []

for alpha in alphas:
    scores = cross_val_score(estimator=ridge.set_params(alpha=alpha),X=X_train,y=y_train,cv=10)
    ridge_scores.append((np.mean(scores),np.std(scores),alpha))

In [573]:
max(ridge_scores)

(0.8077713848775028, 0.01778072811937441, 0.1)

In [574]:
ridge.set_params(alpha=0.1)
ridge.fit(X_train, y_train)

Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [575]:
ridge.score(X_test, y_test)

0.8094652736667345

In [576]:
# now doing lasso

In [577]:
lasso.get_params()

{'alpha': 1.0,
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': 1000,
 'normalize': False,
 'positive': False,
 'precompute': False,
 'random_state': None,
 'selection': 'cyclic',
 'tol': 0.0001,
 'warm_start': False}

In [578]:
alphas = np.logspace(-3,3,7)
X_train, X_test, y_train, y_test = train_test_split(train, y, test_size =0.2, random_state=2020)
lasso_scores = []

for alpha in alphas:
    scores = cross_val_score(estimator=lasso.set_params(alpha=alpha),X=X_train,y=y_train,cv=10)
    lasso_scores.append((np.mean(scores),np.std(scores),alpha))

In [579]:
lasso_scores

[(0.8074497848627598, 0.017218214165111937, 0.001),
 (0.7823156427776613, 0.015406501190223746, 0.01),
 (0.24678870765685637, 0.021246338131946256, 0.1),
 (0.23444815739626162, 0.01956357194958417, 1.0),
 (-0.003805011070616304, 0.006807217919471082, 10.0),
 (-0.003805011070616304, 0.006807217919471082, 100.0),
 (-0.003805011070616304, 0.006807217919471082, 1000.0)]

In [580]:
max(lasso_scores)

(0.8074497848627598, 0.017218214165111937, 0.001)

In [581]:
lasso.set_params(alpha=0.001).fit(X_train, y_train)

Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [582]:
lasso.score(X_test, y_test)

0.8086947190772119

In [583]:
from sklearn.pipeline import make_pipeline
ridge_pipe = make_pipeline(ridge)
# only passing in ridge because i did sc,ore,ohe already manually. doing ridge
# because ridge as very slightly better scores than lasso

In [584]:
ridge_pipe.fit(train, y)

Pipeline(memory=None,
     steps=[('ridge', Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))])

In [585]:
ridge_pipe.steps[0][1].set_params(alpha=0.1)

Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [586]:
ridge_preds = np.exp(ridge_pipe.predict(test))
preds = ridge_pipe.predict(test)
preds

array([4.20974077, 3.34361154, 1.34072301, ..., 5.75832782, 4.58672211,
       2.57669917])

In [587]:
train['predictions'] = ridge_preds
train.head()


Unnamed: 0,holiday,workingday,weather,temp,atemp,humidity,windspeed,season_Fall,season_Spring,season_Summer,...,hour_of_day_8,hour_of_day_9,day_of_week_Friday,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday,predictions
0,0,0,3,9.84,14.395,81,0.0,0,1,0,...,0,0,0,0,1,0,0,0,0,67.339081
1,0,0,3,9.02,13.635,80,0.0,0,1,0,...,0,0,0,0,1,0,0,0,0,28.321225
2,0,0,3,9.02,13.635,80,0.0,0,1,0,...,0,0,0,0,1,0,0,0,0,3.821806
3,0,0,3,9.84,14.395,75,0.0,0,1,0,...,0,0,0,0,1,0,0,0,0,0.216071
4,0,0,3,9.84,14.395,75,0.0,0,1,0,...,0,0,0,0,1,0,0,0,0,0.009293
