In this notebook, we'll participate in bike sharing demand competition hosted by Kaggle.
 
Go to https://www.kaggle.com/c/bike-sharing-demand/overview

and download bike-sharing-demand.zip (go to data and download all)

In [87]:
#! ls *.zip

In [88]:
#! unzip bike-sharing-demand.zip

In [89]:
#! cat train.csv

In [90]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [91]:
train_df = pd.read_csv('train.csv')
train_df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [92]:
train_df.isnull().any()

datetime      False
season        False
holiday       False
workingday    False
weather       False
temp          False
atemp         False
humidity      False
windspeed     False
casual        False
registered    False
count         False
dtype: bool

In [93]:
train_df.dtypes

datetime       object
season          int64
holiday         int64
workingday      int64
weather         int64
temp          float64
atemp         float64
humidity        int64
windspeed     float64
casual          int64
registered      int64
count           int64
dtype: object

In [94]:
train_df.datetime = pd.to_datetime(train_df.datetime)
train_df.dtypes

datetime      datetime64[ns]
season                 int64
holiday                int64
workingday             int64
weather                int64
temp                 float64
atemp                float64
humidity               int64
windspeed            float64
casual                 int64
registered             int64
count                  int64
dtype: object

In [96]:
train_df.columns

Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count'],
      dtype='object')

In [97]:
def engineered_time_feature(df):
    df['month'] = df.datetime.dt.month
    df['year'] = df.datetime.dt.year
    df['day'] = df.datetime.dt.day
    df['day_of_the_week'] = df.datetime.dt.day
    df['hour'] = df.datetime.dt.hour   
    
    
    return df


In [98]:
test_df = pd.read_csv('test.csv')


In [99]:
test_df.datetime= pd.to_datetime(test_df.datetime)
test_df.dtypes

datetime      datetime64[ns]
season                 int64
holiday                int64
workingday             int64
weather                int64
temp                 float64
atemp                float64
humidity               int64
windspeed            float64
dtype: object

In [101]:
train_df = engineered_time_feature(train_df)
train_df['count'] = np.log1p(train_df['count'])
train_df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,month,year,day,day_of_the_week,hour
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,2.833213,1,2011,1,1,0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,3.713572,1,2011,1,1,1
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,3.496508,1,2011,1,1,2
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,2.639057,1,2011,1,1,3
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,0.693147,1,2011,1,1,4


In [102]:
train_df.dtypes

datetime           datetime64[ns]
season                      int64
holiday                     int64
workingday                  int64
weather                     int64
temp                      float64
atemp                     float64
humidity                    int64
windspeed                 float64
casual                      int64
registered                  int64
count                     float64
month                       int64
year                        int64
day                         int64
day_of_the_week             int64
hour                        int64
dtype: object

In [104]:
test_df = engineered_time_feature(test_df)
test_df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,month,year,day,day_of_the_week,hour
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,1,2011,20,20,0
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0,1,2011,20,20,1
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0,1,2011,20,20,2
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014,1,2011,20,20,3
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014,1,2011,20,20,4


In [105]:
from sklearn.model_selection import train_test_split

In [106]:
train_set_df, val_set_df = train_test_split(train_df, test_size = .2, random_state= 1)

In [107]:
train_set_df.shape, val_set_df.shape

((8708, 17), (2178, 17))

In [108]:
train_set_df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,month,year,day,day_of_the_week,hour
3224,2011-08-04 06:00:00,3,0,1,2,27.88,31.82,89,19.9995,7,90,4.584967,8,2011,4,4,6
1299,2011-03-19 00:00:00,1,0,0,2,24.6,31.06,53,16.9979,26,50,4.343805,3,2011,19,19,0
3489,2011-08-15 07:00:00,3,0,1,1,24.6,27.275,88,16.9979,10,248,5.556828,8,2011,15,15,7
7649,2012-05-18 02:00:00,2,0,1,1,19.68,23.485,67,8.9981,12,9,3.091042,5,2012,18,18,2
5950,2012-02-04 03:00:00,1,0,0,1,9.84,14.395,75,0.0,1,17,2.944439,2,2012,4,4,3


In [109]:
train_df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,month,year,day,day_of_the_week,hour
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,2.833213,1,2011,1,1,0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,3.713572,1,2011,1,1,1
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,3.496508,1,2011,1,1,2
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,2.639057,1,2011,1,1,3
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,0.693147,1,2011,1,1,4


In [110]:
val_set_df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,month,year,day,day_of_the_week,hour
3709,2011-09-05 11:00:00,3,1,0,2,28.7,33.335,74,11.0014,101,207,5.733341,9,2011,5,5,11
6741,2012-03-18 04:00:00,1,0,0,2,17.22,21.21,94,11.0014,6,8,2.70805,3,2012,18,18,4
9848,2012-10-14 17:00:00,4,0,0,1,26.24,31.06,44,12.998,193,346,6.291569,10,2012,14,14,17
1410,2011-04-04 15:00:00,2,0,1,1,31.16,33.335,23,36.9974,47,96,4.969813,4,2011,4,4,15
10672,2012-12-11 02:00:00,4,0,1,2,16.4,20.455,66,22.0028,0,1,0.693147,12,2012,11,11,2


In [111]:
col = ['count','season',
 'holiday',
 'workingday',
 'weather',
 'temp',
 'atemp',
 'humidity',
 'windspeed',
 'casual',
 'registered',
 'month',
 'year',
 'day',
 'day_of_the_week',
 'hour']

In [112]:
train_set_df.to_csv('ctrain.csv', index=False, header=False, columns=col)

In [113]:
!cat ctrain.csv

4.584967478670572,3,0,1,2,27.88,31.82,89,19.9995,7,90,8,2011,4,4,6
4.343805421853684,1,0,0,2,24.6,31.06,53,16.9979,26,50,3,2011,19,19,0
5.556828061699537,3,0,1,1,24.6,27.275,88,16.9979,10,248,8,2011,15,15,7
3.091042453358316,2,0,1,1,19.68,23.485,67,8.9981,12,9,5,2012,18,18,2
2.9444389791664403,1,0,0,1,9.84,14.395,75,0.0,1,17,2,2012,4,4,3
6.161207321695077,4,0,0,1,18.04,21.97,62,16.9979,142,331,12,2012,2,2,14
6.198478716492308,3,0,0,1,35.26,40.15,47,35.0008,161,330,8,2012,5,5,12
4.007333185232471,1,0,1,2,9.02,12.88,80,6.0032,5,49,2,2011,1,1,14
6.210600077024653,4,1,0,2,17.22,21.21,66,16.9979,81,416,10,2012,8,8,17
2.9444389791664403,4,0,1,2,18.04,21.97,94,8.9981,0,18,12,2012,18,18,0
2.772588722239781,3,0,0,1,27.88,31.82,83,8.9981,6,9,7,2012,15,15,6
5.480638923341991,4,0,1,3,21.32,25.0,83,6.0032,12,227,10,2012,15,15,19
4.962844630259907,2,0,0,1,14.76,15.91,50,32.9975,68,74,4,2011,3,3,9
5.484796933490655,3,0,1,1,33.62,36.365,38,19.9995,74,166,8,2011,10,10,12
4.343805421853684

5.135798437050262,2,0,1,1,26.24,31.06,41,23.9994,30,139,6,2011,14,14,12
6.484635235635252,3,0,0,2,27.88,31.82,34,19.0012,312,342,9,2012,15,15,15
0.6931471805599453,1,0,0,1,9.84,12.88,70,6.0032,0,1,2,2012,18,18,5
5.204006687076795,4,0,1,2,24.6,29.545,73,11.0014,51,130,10,2011,11,11,12
4.584967478670572,4,0,1,3,22.14,25.76,94,11.0014,17,80,10,2011,19,19,14
3.8501476017100584,1,0,1,2,9.84,12.12,65,11.0014,1,45,2,2011,4,4,22
3.1780538303479458,1,0,1,1,16.4,20.455,47,15.0013,0,23,1,2012,10,10,5
6.093569770045136,3,0,0,1,25.42,31.06,41,26.0027,119,323,9,2012,15,15,10
5.517452896464707,2,0,1,1,27.88,31.06,26,16.9979,58,190,6,2011,3,3,13
6.481577129276431,3,0,1,1,30.34,34.85,66,16.9979,88,564,8,2012,1,1,19
3.258096538021482,1,0,0,2,22.96,26.515,43,15.0013,5,20,3,2011,19,19,2
4.499809670330265,1,0,1,1,9.02,11.365,47,11.0014,7,82,1,2011,5,5,20
3.6888794541139363,4,0,0,2,14.76,18.18,100,6.0032,3,36,12,2012,8,8,7
3.6888794541139363,2,0,0,1,12.3,14.395,65,12.998,8,31,4,2011,3,3,0
3.80

In [114]:
val_set_df.to_csv('cval.csv', index=False, header=False, columns=col)

In [115]:
test_df.to_csv('ctest.csv', index=False)

In [116]:
!cat ctrain.csv

4.584967478670572,3,0,1,2,27.88,31.82,89,19.9995,7,90,8,2011,4,4,6
4.343805421853684,1,0,0,2,24.6,31.06,53,16.9979,26,50,3,2011,19,19,0
5.556828061699537,3,0,1,1,24.6,27.275,88,16.9979,10,248,8,2011,15,15,7
3.091042453358316,2,0,1,1,19.68,23.485,67,8.9981,12,9,5,2012,18,18,2
2.9444389791664403,1,0,0,1,9.84,14.395,75,0.0,1,17,2,2012,4,4,3
6.161207321695077,4,0,0,1,18.04,21.97,62,16.9979,142,331,12,2012,2,2,14
6.198478716492308,3,0,0,1,35.26,40.15,47,35.0008,161,330,8,2012,5,5,12
4.007333185232471,1,0,1,2,9.02,12.88,80,6.0032,5,49,2,2011,1,1,14
6.210600077024653,4,1,0,2,17.22,21.21,66,16.9979,81,416,10,2012,8,8,17
2.9444389791664403,4,0,1,2,18.04,21.97,94,8.9981,0,18,12,2012,18,18,0
2.772588722239781,3,0,0,1,27.88,31.82,83,8.9981,6,9,7,2012,15,15,6
5.480638923341991,4,0,1,3,21.32,25.0,83,6.0032,12,227,10,2012,15,15,19
4.962844630259907,2,0,0,1,14.76,15.91,50,32.9975,68,74,4,2011,3,3,9
5.484796933490655,3,0,1,1,33.62,36.365,38,19.9995,74,166,8,2011,10,10,12
4.343805421853684

4.0943445622221,1,0,1,1,5.74,9.09,36,0.0,8,51,2,2011,9,9,10
5.945420608606575,3,0,1,2,28.7,32.575,61,8.9981,82,299,9,2011,2,2,16
4.110873864173311,4,0,1,1,14.76,17.425,53,11.0014,10,50,12,2011,19,19,23
5.579729825986222,3,0,0,1,30.34,34.85,66,11.0014,131,133,7,2011,3,3,10
6.480044561926653,4,0,0,1,21.32,25.0,39,0.0,268,383,11,2012,10,10,13
2.772588722239781,3,0,1,1,28.7,33.335,74,7.0015,4,11,7,2012,16,16,2
6.436150368369428,2,0,0,1,28.7,31.82,42,22.0028,262,361,4,2012,15,15,17
5.998936561946683,4,0,1,2,12.3,14.395,56,15.0013,18,384,12,2012,12,12,19
5.556828061699537,1,0,0,1,20.5,24.24,31,19.0012,55,203,3,2012,11,11,19
4.59511985013459,2,0,0,2,13.94,17.425,87,7.0015,27,71,4,2011,9,9,9
5.771441123130016,4,0,0,1,14.76,16.665,46,16.9979,62,258,11,2012,17,17,10
4.955827057601261,2,0,1,1,22.96,26.515,64,6.0032,18,123,5,2011,11,11,11
4.6913478822291435,1,0,1,2,12.3,15.15,56,11.0014,8,100,2,2012,8,8,14
4.51085950651685,1,0,0,3,9.02,12.12,93,7.0015,5,85,2,2012,11,11,9
6.0776422433