In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse

In [None]:
# Read the train and test CSVs in one pass.
train,test=(
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/bike-sharing-demand/train.csv",
        "../input/bike-sharing-demand/test.csv",
    )
)
# Both frames are kept in one list so later cells can loop the same step over them.
combine=[train,test]

In [None]:
# (rows, columns) of the training frame as loaded.
train.shape

In [None]:
# First ten rows of the training frame.
train.head(10)

In [None]:
# Column dtypes and non-null counts.
train.info()

In [None]:
# Summary statistics of the training frame.
train.describe()

<h3>Observation</h3>
* No NAN value in the dataset
* <font color="Blue"><u>Categorical data</u></font>:- Season,Holiday,WorkingDay,Weather,
* <font color="Blue"><u>Discrete numerical data</u></font>:- Humidity,Casual,Registered,Count
* <font color="Blue"><u>Continuous numerical data</u></font>:- temp,atemp
* Datetime is an object type and is a good candidate for feature engineering

<h3>Creating features  <i><u>Day</u></i>,<i><u>Month</u></i>,<i><u>Year</u></i> and <i><u>Hour</u></i> from the datetime feature</h3>

In [None]:
# Split the datetime text into date and time, then pull out hour/year/month/day.
# All derived values are still strings at this point; "dt" and "ymd" are helper columns.
for frame in combine:
    frame["dt"]=frame["datetime"].str.split()
    date_text=frame["dt"].str[0]
    time_text=frame["dt"].str[1]
    frame["hour"]=time_text.str.split(':').str[0]
    frame["ymd"]=date_text.str.split("-")
    frame["year"]=frame["ymd"].str[0]
    frame["month"]=frame["ymd"].str[1]
    frame["day"]=frame["ymd"].str[2]

In [None]:
# Drop the raw timestamp and the two helper columns from both frames.
helper_cols=["datetime","dt","ymd"]
train,test=(frame.drop(columns=helper_cols) for frame in (train,test))
# drop() returned new frames, so the list has to be rebuilt to point at them.
combine=[train,test]

In [None]:
# Cast the extracted date parts to integers in both frames.
for frame in combine:
    for part in ("year","month","day","hour"):
        frame[part]=frame[part].astype(int)

<h1> Visualisation</h1>

In [None]:
# Distribution of the target `count`.
# sns.distplot is deprecated; histplot with a density-scaled histogram and a KDE
# overlay is its documented replacement and draws the equivalent figure.
sns.histplot(train["count"],bins=20,kde=True,stat="density",kde_kws=dict(cut=3))

<h3>Observation</h3>
Data is highly skewed. Let's see if it is due to the outliers.

In [None]:
# 3x3 grid of box plots: overall count first, then count grouped by each feature.
f,ax=plt.subplots(nrows=3,ncols=3)
f.set_size_inches(20,15)
sns.boxplot(y='count',data=train,ax=ax[0][0])
# Remaining panels are filled row by row; the last two grid cells stay empty.
group_cols=['day','hour','season','weather','workingday','holiday']
for panel,group_col in zip(ax.flat[1:],group_cols):
    sns.boxplot(x=group_col,y='count',data=train,ax=panel)


<h3>Observation</h3>
It is quite visible that the data has a large number of outliers. Let's try to remove these outliers.

In [None]:
# Keep only rows whose count is strictly within three standard deviations of the mean.
count_dev=(train['count']-train['count'].mean()).abs()
count_limit=3*train['count'].std()
train=train[count_dev<count_limit]

In [None]:
# Same 3x3 box-plot grid as before, redrawn on the filtered training frame.
f,ax=plt.subplots(nrows=3,ncols=3)
f.set_size_inches(20,15)
# None means "no grouping" (the overall count box); slots 7 and 8 are left empty.
panel_x=[None,'day','hour','season','weather','workingday','holiday']
for slot,x_col in enumerate(panel_x):
    row,col=divmod(slot,3)
    sns.boxplot(x=x_col,y='count',data=train,ax=ax[row][col])


In [None]:
# Distribution of `count` after the outlier filter.
# sns.distplot is deprecated; histplot with a density-scaled histogram and a KDE
# overlay is its documented replacement and draws the equivalent figure.
sns.histplot(train["count"],bins=20,kde=True,stat="density",kde_kws=dict(cut=3))

In [None]:
# (rows, columns) of the training frame at this point, for comparison with the earlier shape.
train.shape

<h3>Observation</h3>
Removing outliers didn't help much in removing skewness. So we will use square root transformation.

In [None]:
# Mean count per month, lowest first.
train.groupby("month",as_index=False)["count"].mean().sort_values("count")

In [None]:
# Mean count per day of month, lowest first.
train.groupby("day",as_index=False)["count"].mean().sort_values("count")

In [None]:
# Mean count per year, lowest first.
train.groupby("year",as_index=False)["count"].mean().sort_values("count")

In [None]:
# Mean count per hour, indexed by hour and listed in hour order.
train.groupby("hour")[["count"]].mean().sort_index()

In [None]:
# Mean count for working vs. non-working days, lowest first.
train.groupby("workingday",as_index=False)["count"].mean().sort_values("count")

In [None]:
# Mean count for holidays vs. non-holidays, lowest first.
train.groupby("holiday",as_index=False)["count"].mean().sort_values("count")

In [None]:
# Mean count per season code, lowest first.
train.groupby("season",as_index=False)["count"].mean().sort_values("count")

In [None]:
# Mean count per weather code, lowest first.
train.groupby("weather",as_index=False)["count"].mean().sort_values("count")

<h3>Observation</h3>
* January had the lowest count, and bicycle use was lower from Jan-Apr and Nov-Dec
* 2012 saw an increase in the use of the service in comparison to 2011
* Service is used less before 7 A.M. and heavily used between 4 P.M. and 8 P.M. Usage was also lower after 11 P.M.
* Mean count does not vary much with working day or holiday, so these features do not look very important
* Service was used more during the Summer and Fall seasons.

<h3>Conclusion</h3>
These features are correlated with count and should be included in the feature list



In [None]:
# Column names currently in the training frame.
train.columns

In [None]:
# Pairwise correlation of the numeric features (plus hour) with each other and with count.
c_features=['temp','atemp','humidity','windspeed','casual','registered','count','hour']
f,ax= plt.subplots(figsize=(18,18))
# `linewidths` is heatmap's documented cell-border parameter; the previous `linewidth`
# was only reaching matplotlib through **kwargs alongside seaborn's own linewidths=0.
sns.heatmap(train[c_features].corr(),annot=True,fmt='.1g',linewidths=.2,ax=ax)

<h2>Observation</h2>
<p>Windspeed is least correlated with count.</p>
<p>atemp and temp have strong correlation with each other and one of them must be dropped.</p>
<p>Registered and casual must be dropped as it looks like the case of data leakage.</p>

In [None]:
# Count distribution per season, with the two years drawn as the two halves of each violin.
sns.violinplot(
    data=train,
    x="season",
    y="count",
    hue="year",
    split=True,
)

<h3>Conclusion</h3>
<p>There were more  demand in season 2 and 3 as compared to season 1 and 4.</p>
<p>For season 1 much of data is concentrated near the first quartile value.</p>
<p>Season 1 has lowest demand for both years.</p>


In [None]:
# Count distribution per weather code, with the two years drawn as the two halves of each violin.
sns.violinplot(
    data=train,
    x="weather",
    y="count",
    hue="year",
    split=True,
)

In [None]:
# Number of training rows per weather code.
train['weather'].value_counts()

<h3>Conclusion</h3>
<p>Weather 4 occurs only once in the data, which is expected as it denotes heavy rain and thunderstorms.</p>
<p>Weather 1 has the highest demand, which is expected as the day is clear.</p>

In [None]:
# Count distribution for working vs. non-working days, one violin per year.
sns.violinplot(
    data=train,
    x="workingday",
    y="count",
    hue="year",
)

<h3>Conclusion</h3>
<p>For workingday=0, the count distribution peaks near the lower quartile and gradually plateaus towards the upper quartile, with a slight increase near the upper quartile value.</p>
<p>For workingday=1, although the count distribution also peaks near the lower quartile, it rises again near the median for both years.</p>

In [None]:
# Count distribution for holidays vs. non-holidays, one violin per year.
sns.violinplot(
    data=train,
    x='holiday',
    y='count',
    hue='year',
)

<h3>Conclusion</h3>
<p>For holiday=1, the count distribution peaks near the lower quartile and gradually plateaus towards the upper quartile, with a slight increase near the upper quartile value.</p>
<p>For holiday=0, although the count distribution also peaks near the lower quartile, it rises again near the median for both years.</p>

In [None]:
# Bar per year showing the aggregated count (seaborn's default estimator).
sns.barplot(
    data=train,
    x='year',
    y='count',
)

<h3>Conclusion</h3>
<p>2012 saw an increase in the demand as compared to 2011.</p>
<p>Since there are only two values, we should consider representing it with binary values.</p>

In [None]:
# One facet per season: count distribution per weather code, split by year.
# catplot is used instead of FacetGrid.map(sns.violinplot, ...): mapping a categorical
# plot facet by facet lets each facet infer its own category order, so a weather or year
# level that is absent in one season can shift the violins and mislabel them. catplot
# fixes the x and hue order across all facets. height=3 keeps FacetGrid's default size.
g=sns.catplot(
    data=train,
    x='weather',
    y='count',
    hue='year',
    col='season',
    kind='violin',
    split=True,
    height=3,
    aspect=1.5,
)

<h3>Observation</h3>
<p>For seasons 2 and 3, there is very little variation in count values for weather 1 and 2.</p>
<p>Across all weather conditions, bike demand was very low for season 1 in 2011. This was probably due to fewer people using the service in 2011.</p>

<h2>Visualising continuous data</h2>

In [None]:
# Five stacked point plots of count against day of month, each coloured by a different feature.
f,ax=plt.subplots(nrows=5)
f.set_size_inches(20,27)
hue_cols=['year','season','weather','workingday','holiday']
for panel,hue_col in zip(ax,hue_cols):
    sns.pointplot(x="day",y="count",data=train,hue=hue_col,ax=panel)

<h3>Observations</h3>
<p> Across all days, season 1 had  least count.</p>
<p>Across all days, count for weather 1 remained nearly uniform as compared to other weather.</p>
<p>Across all days, count for holiday=0 remained  nearly constant.</p>

In [None]:
# Five stacked point plots of count against hour of day, each coloured by a different feature.
f,ax=plt.subplots(nrows=5)
f.set_size_inches(20,27)
for row,hue_col in enumerate(('year','season','weather','workingday','holiday')):
    sns.pointplot(x="hour",y="count",data=train,hue=hue_col,ax=ax[row])

<h3>Observations</h3>
<p>In each case bike demand peaks at 8h and 17h.</p>
<p>Workingday=0 is an exception to the point above: its graph follows a sinusoidal shape, reaching its maximum between 11h and 17h.</p>
<p>Bike demand for seasons 2, 3 and 4 was nearly the same across all hours.</p>
<p>Bike demand was highest for weather 1 across all hours of the day.</p>
<p>Bike demand was higher during the afternoon hours on holiday=1 and workingday=1.</p>

<h3>Conclusion</h3>
<p>hour should be banded and converted to categorical</p>

In [None]:
# Distribution of humidity.
# sns.distplot is deprecated; histplot with a density-scaled histogram and a KDE
# overlay is its documented replacement and draws the equivalent figure.
sns.histplot(train['humidity'],bins=20,kde=True,stat="density",kde_kws=dict(cut=3))

<h3>Observations</h3>
Humidity is normally distributed and most of the time it is between 40-90

In [None]:
# Distribution of atemp.
# sns.distplot is deprecated; histplot with a density-scaled histogram and a KDE
# overlay is its documented replacement and draws the equivalent figure.
sns.histplot(train['atemp'],bins=20,kde=True,stat="density",kde_kws=dict(cut=3))

<h3>Observations</h3>
Most of time atemp is between 15-30.

In [None]:
# Side-by-side scatter plots with a fitted regression line: count vs. atemp and count vs. humidity.
f,ax=plt.subplots(ncols=2)
f.set_size_inches(12,10)
for panel,feature in zip(ax,("atemp","humidity")):
    sns.regplot(x=feature,y="count",data=train,ax=panel)

<h3>Observations</h3>
<p>Bike demand increases with increase in atemp</p>
<p>Bike demand decreases with increase in humidity</p>
<p>values are not evenly distributed across regression line for both the plots.Count variable needs to be made normally distributed</p>

<h3>Filling missing values (recorded as 0) in windspeed with a random forest</h3>
<p>Before we proceed with that, let's first remove unwanted columns and apply the observations we have made so far.</p>

In [None]:
# Column names currently in the training frame.
train.columns

In [None]:
# Column names currently in the test frame.
test.columns

In [None]:
# `temp` is removed from both frames; `casual` and `registered` are removed from train only.
train=train.drop(columns=['temp','casual','registered'])
test=test.drop(columns=['temp'])
# Rebuild the list so it points at the new frames returned by drop().
combine=[train,test]

In [None]:
# Treat windspeed==0 as missing: fit a random forest on the rows that have a non-zero
# reading and predict a value for the rows that do not.
# The target `count` is deliberately excluded from the predictors. Using it (as before)
# leaks the label into a model input and makes the imputer unusable on unlabelled data.
ws_features=[col for col in train.columns if col not in ('windspeed','count')]
train_ws=train[train['windspeed']!=0]
# .copy() so the predictions are written to an independent frame, not a view of `train`.
train_ws_fill=train.loc[train['windspeed']==0,:].copy()
rfr=RandomForestRegressor(n_estimators=20,random_state=10)
rfr.fit(train_ws[ws_features],train_ws['windspeed'])
# Guard against the no-zero-rows case (e.g. on a re-run): predict() rejects an empty frame.
if not train_ws_fill.empty:
    train_ws_fill['windspeed']=rfr.predict(train_ws_fill[ws_features])

In [None]:
# First rows of the frame holding the predicted windspeed values.
train_ws_fill.head()

In [None]:
# Write the predicted speeds into the rows that had windspeed==0 (assignment aligns on the index).
zero_ws_rows=train['windspeed']==0
train.loc[zero_ws_rows,'windspeed']=train_ws_fill['windspeed']

In [None]:
# First ten rows after the windspeed values were filled in.
train.head(10)

In [None]:
# Scatter of count against windspeed with a fitted regression line.
sns.regplot(
    data=train,
    x='windspeed',
    y='count',
)

<h3>Observation</h3>
<p>windspeed has very little effect on the value of count. It has least feature importance.</p>

<h3><u>Activities to be performed</u></h3>
<p>Normalise the count variable</p>
<p>Band hour variable</p>
<p>converting year to binary</p>

In [None]:
# Replace count with its square root to reduce the skew seen above.
# Note: this overwrites the column, so re-running the cell applies the root again.
train['count']=np.sqrt(train['count'])

In [None]:
# Distribution of the square-root-transformed count.
# sns.distplot is deprecated; histplot with a density-scaled histogram and a KDE
# overlay is its documented replacement and draws the equivalent figure.
sns.histplot(train['count'],bins=20,kde=True,stat="density",kde_kws=dict(cut=3))

<p>This curve looks better i.e more normalized.</p>


<h3>Converting year to binary</h3>

In [None]:
# Encode year as a flag: 1 for 2012, 0 for anything else.
train['year']=(train['year']==2012).astype(int)

<h3>Banding hour variable</h3>

In [None]:
# Exploratory: split hour into five equal-width intervals to see where the band edges fall.
train['hour_band']=pd.cut(train['hour'],5,precision=0)

In [None]:
# The distinct intervals produced by pd.cut.
train['hour_band'].unique()

In [None]:
# Replace hour with a band label 1..5 using right-closed edges:
#   <=5 -> 1, (5,9] -> 2, (9,14] -> 3, (14,18] -> 4, >18 -> 5
# Note: this overwrites `hour`, so re-running the cell collapses every row into band 1.
train=train.copy()
band_edges=[5,9,14,18]
hour_bands=np.digitize(train['hour'],band_edges,right=True)+1
train['hour']=hour_bands.astype(train['hour'].dtype)

In [None]:
# The interval column was only needed to inspect the band edges.
train=train.drop(columns=['hour_band'])

In [None]:
# First rows of the training frame after all transformations.
train.head()

<h2>Model predicting and selection</h2>

In [None]:
# Hold out 20% of the labelled training rows for evaluation.
# A fixed random_state makes the split, and therefore the error reported below,
# reproducible across Restart & Run All.
# NOTE: this rebinds `test`; the frame read from test.csv is no longer reachable under that name.
train,test=train_test_split(train,test_size=0.20,random_state=10)

In [None]:
# (rows, columns) of the training part of the split.
train.shape

In [None]:
# (rows, columns) of the held-out part of the split.
test.shape

In [None]:
# Separate the target column from the predictors.
y_train=train['count']
x_train=train.drop(columns=['count'])

In [None]:
# Fit an ordinary least-squares model and predict on the held-out rows.
x_test=test.drop(columns=['count'])
lr=LinearRegression().fit(x_train,y_train)
lr_pred=lr.predict(x_test)

In [None]:
# Mean squared error of the linear model on the held-out rows.
# NOTE(review): `count` was square-rooted earlier in the notebook, so this error is in
# sqrt(count) units — confirm that is the intended scale for reporting.
mse(y_true=test['count'],y_pred=lr_pred)