### Bike Sharing 

In [None]:
# Importing Libraries
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
import plotly.figure_factory as ff
import sklearn
import statsmodels.api as sm
import statsmodels
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import plotly.io as pio
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import warnings
import calendar
from sklearn.feature_selection import RFE
from sklearn.preprocessing import RobustScaler
%matplotlib inline
pio.templates.default = "plotly_dark"
warnings.filterwarnings('ignore')

### Data understanding, preparation and EDA

In [None]:
# Read the daily bike-sharing records and preview the first ten rows
bike = pd.read_csv(filepath_or_buffer='../input/bike-sharing/day.csv')
bike.head(n=10)

In [None]:
# Dimensions of the raw frame as a (rows, columns) tuple
bike.shape # 730 rows & 16 features

In [None]:
# Per-column count of missing entries (expected: all zeros)
bike.isna().sum()

In [None]:
# Column names, non-null counts and dtypes of the raw frame
bike.info() # Dataframe info

In [None]:
# Datatype count: one bar per dtype, labelled with how many columns use it
dtype_counts = bike.dtypes.value_counts()
dtype_totals = dtype_counts.to_list()

fig = px.bar(
    x=dtype_counts.index.astype(str).to_list(),
    y=dtype_totals,
    color=dtype_totals,
    text=dtype_totals,
)
fig.update_traces(textposition='outside')
fig.show()

In [None]:
# Cast the integer-coded categorical columns to the pandas 'category' dtype
cols = ['season','weathersit','holiday','workingday','mnth','weekday']
bike = bike.astype({name: "category" for name in cols})

bike.dtypes

In [None]:
# Derive readable labels: month name, weekday name and year
bike['month'] = bike['mnth'].apply(lambda month_no: calendar.month_name[month_no]).astype("category")

# Weekday code 0 maps to the last entry of calendar.day_name and codes 1..6 map to
# entries 0..5, i.e. entry p of day_name belongs to code (p + 1) % 7.
weekday_labels = {(pos + 1) % 7: label for pos, label in enumerate(list(calendar.day_name))}
bike['week_day'] = bike['weekday'].map(weekday_labels).astype("category")

# The year is the third '-'-separated field of the date string
bike['year'] = bike['dteday'].apply(lambda date_text: date_text.split('-')[2]).astype("category")
bike.head(20)

In [None]:
# Replace the numeric season codes with readable labels
season_labels = {1: 'spring', 2: 'summer', 3: 'fall', 4: 'winter'}
bike['season'] = bike['season'].map(season_labels)
bike.head()

In [None]:
# Replace the numeric weather-situation codes with readable labels
weather_labels = {1: 'Clear', 2: 'Mist', 3: 'Light_Snow', 4: 'Heavy_Rain'}
bike['weathersit'] = bike['weathersit'].map(weather_labels)
bike.head()

In [None]:
# Applying custom sort on Day & Month
# The category labels come straight from the `calendar` module, the same source the
# 'week_day' and 'month' columns were built from, so labels and data cannot drift
# apart. A hand-typed list is fragile: a misspelt "Febuary" matches no row, so every
# February value silently becomes NaN and an empty "Febuary" category is created.
order1 = list(calendar.day_name)  # Monday .. Sunday
bike['week_day'] = pd.Categorical(bike['week_day'], order1)

order2 = list(calendar.month_name)[1:]  # January .. December (index 0 is an empty string)
bike['month'] = pd.Categorical(bike['month'], order2)

In [None]:
# Box plot of the rental count against each categorical feature, laid out on a 2 x 3 grid
cols = ['season', 'year', 'holiday', 'week_day','workingday', 'weathersit']
grid_width = 3

fig = make_subplots(
    rows=2,
    cols=grid_width,
    subplot_titles=cols,
    horizontal_spacing=0.1,
    vertical_spacing=0.13,
)

# Fill the grid row by row: position p lands at (p // 3 + 1, p % 3 + 1)
for position, feature in enumerate(cols):
    grid_row, grid_col = divmod(position, grid_width)
    fig.add_trace(
        go.Box(x=bike[feature], y=bike.cnt),
        row=grid_row + 1,
        col=grid_col + 1,
    )

fig.update_layout(
    title=dict(text = "Bi-Variate Analysis",x=0.5,y=0.99),
    title_font_size=20,
    showlegend=False,
    height = 1000,
)
fig.show()

__Insight__

- Higher bike rentals in __fall__ and __summer__.
- 2019 saw more demand as compared to 2018.
- High demand of bikes in __Clear__ weather.

#### Multivariate Analysis

In [None]:
# Rental count per season, faceted by weather situation (columns) and weekday (rows)
box_options = dict(
    x='season',
    y='cnt',
    facet_col='weathersit',
    facet_row='week_day',
    color='weathersit',
    boxmode="overlay",
    points='all',
)
fig = px.box(bike, **box_options)
fig.update_layout(height=1600)
fig.show()

__Insights__

- In __winter__ the demand for bikes is high if the climate is __clear__.

In [None]:
# Pairwise scatter plots of the continuous variables, with box plots on the diagonal
continuous_features = ['temp', 'atemp', 'hum', 'windspeed','cnt']
ff.create_scatterplotmatrix(
    bike[continuous_features],
    diag='box',
    size=4,
    height=1300,
    width=1380,
)

__Insights__

- Strong linear relationship between
     - __cnt__ & temp
     - atemp & cnt 
     - temp & atemp

In [None]:
# 'instant' is only a running row index, so it is removed
bike = bike.drop(columns=['instant'])
bike.head()

In [None]:
# Remove the raw date string column "dteday"
bike = bike.drop(columns=['dteday'])
bike.head()

In [None]:
# Remove 'casual' & 'registered' because of their high multi-collinearity
bike = bike.drop(columns=['casual', 'registered'])
bike.head()

In [None]:
# Annotated correlation heatmap of the numeric columns.
# The frame still contains string-labelled categorical columns (e.g. 'season'), which
# DataFrame.corr() cannot convert to float on pandas >= 2.0, so only numeric dtypes
# are selected. The matrix is computed once and reused for the values and both axes.
corr_matrix = bike.select_dtypes(include='number').corr()
corr_labels = corr_matrix.columns.to_list()

fig = ff.create_annotated_heatmap(np.array(corr_matrix).round(2),
                                  x=corr_labels,
                                  y=corr_labels,
                                  colorscale='bluyl',
                                  showscale=True)
fig.update_layout(height=800, margin=dict(l=340, r=340))

In [None]:
# Total rentals per weekday and per month.
# 'cnt' is selected *before* summing: summing the whole frame would also try to add
# up the categorical columns, which raises a TypeError on pandas >= 2.0.
df1 = bike.groupby(['week_day'])['cnt'].sum().reset_index()
df2 = bike.groupby(['month'])['cnt'].sum().reset_index()

# --- Weekday totals -------------------------------------------------------
fig1 = px.line(data_frame=df1,
               x=df1.week_day,
               y=df1.cnt,
               # point labels in thousands, e.g. "487k"
               text=df1.cnt.apply(lambda x: str(round(x/1000))+'k').values
              )

fig1.update_traces(line=dict(color="#f58634", width=5),
                   textposition='top left')

# Axis titles describe what is plotted (weekday vs. summed rentals)
fig1.update_xaxes(title='Day')
fig1.update_yaxes(title='Total Rentals')
fig1.update_layout(
    title=dict(text = "Day Wise Total Rentals",x=0.5,y=0.95),
    title_font_size=20,
    showlegend=False,
)

# --- Month totals ---------------------------------------------------------
fig2 = px.line(data_frame=df2,
               x=df2.month,
               y=df2.cnt,
               text=df2.cnt.apply(lambda x: str(round(x/1000))+'k').values
              )

fig2.update_traces(line=dict(color="#54e346", width=5),
                   textposition='top left'
                  )
fig2.update_xaxes(title='Month')
fig2.update_yaxes(title='Total Rentals')
fig2.update_layout(
    title=dict(text = "Month Wise Total Rentals",x=0.5,y=0.95),
    title_font_size=20,
    showlegend=False
)

fig1.show()
fig2.show()

__Insights__

- Big spike in demand on __Thursday__. Least demand on __Mondays__.
- __February__ turns out to be the worst month for the bike rental business. High demand in __August__, __September__ & __June__.

In [None]:
# Weekday rental totals broken down by season and by weather situation.
# 'cnt' is selected before summing so the remaining categorical columns are never
# aggregated (summing them raises a TypeError on pandas >= 2.0).
df1 = bike.groupby(['week_day','season'])['cnt'].sum().reset_index()
df2 = bike.groupby(['week_day','weathersit'])['cnt'].sum().reset_index()

# One line per season
fig1 = px.line(data_frame=df1,
               x=df1.week_day,
               y=df1.cnt,
               color=df1.season
              )

fig1.update_traces(line=dict(width=5),mode='markers+lines',marker=dict(size=14))
fig1.update_xaxes(title='Day')
fig1.update_yaxes(title='Total Rental Bikes')
fig1.update_layout(
    title=dict(text = "Day Wise Rental Bike count Vs Season",x=0.5,y=0.95),
    title_font_size=20,
    height=600
)

# One line per weather situation
fig2 = px.line(data_frame=df2,
               x=df2.week_day,
               y=df2.cnt,
               color=df2.weathersit
              )

fig2.update_traces(line=dict( width=5),mode='markers+lines',marker=dict(size=14))
fig2.update_xaxes(title='Day')
fig2.update_yaxes(title='Total Rental Bikes')
fig2.update_layout(
    title=dict(text = "Day Wise Rental Bike count Vs Weather",x=0.5,y=0.95),
    title_font_size=20,
    height=600
)

fig1.show()
fig2.show()

In [None]:
# Summary statistics of the columns that remain after the drops above
bike.describe()

In [None]:
# Total rentals for each integer-rounded value of the continuous weather variables.
cols = ['atemp','temp','hum','windspeed']
# 'cnt' is selected before summing so categorical columns are never aggregated
# (summing them raises a TypeError on pandas >= 2.0).
list1 = [
    bike.groupby(bike[name].round().astype(int))['cnt'].sum().reset_index()
    for name in cols
]

#Subplot initialization (titles follow the order of `cols`)
fig = make_subplots(
    rows=2,
    cols=2,
    vertical_spacing=0.13,
    subplot_titles=('Rental Count VS atemp',
                    'Rental Count VS temp',
                    'Rental Count VS humidity',
                    'Rental Count VS windspeed',
                    ),
)

# Adding subplots row by row; go.Scatter replaces the deprecated go.Line alias
for count, totals in enumerate(list1):
    grid_row, grid_col = divmod(count, 2)
    fig.add_trace(go.Scatter(x=totals.iloc[:,0],   # the rounded variable
                             y=totals.cnt,
                             line=dict(width=2),
                             name=cols[count],
                             mode = 'lines+markers',
                             marker=dict(size=8)
                            ),
                  row=grid_row + 1, col=grid_col + 1)

fig.update_layout(
    height = 1100,
)
fig.show()

__Insights__

- At 33 degree temperature bike demand is high
- At 29 degree feeling temperature bike rental demand is quite high.
- People prefer bike rental at 49 & 57 humidity.
- Demand is spiking at windspeed 10.

In [None]:
# Rentals per season for each integer-rounded feeling temperature / windspeed.
# 'cnt' is selected before summing so the categorical columns are never aggregated
# (summing them raises a TypeError on pandas >= 2.0).
df1 = bike.groupby([bike.season, bike['atemp'].round().astype(int)])['cnt'].sum().reset_index()
df1['cnt'] = df1.cnt.fillna(0)  # combinations without data count as zero rentals
fig1 = px.line(
    data_frame=df1,
    x='atemp',
    y='cnt',
    color='season'
)
fig1.show()


df2 = bike.groupby([bike.season, bike['windspeed'].round().astype(int)])['cnt'].sum().reset_index()
df2['cnt'] = df2.cnt.fillna(0)
fig2 = px.line(
    data_frame=df2,
    x='windspeed',
    y='cnt',
    color='season'
)
fig2.show()

__Insights__

- There is a huge demand of bike rentals in __fall__ when __feeling temperature__ is at 33.

In [None]:
# Humidity against feeling temperature; colour = rentals, marker size = windspeed,
# one panel per (weather situation, season) pair
fig = px.scatter(
    data_frame=bike,
    x='atemp',
    y='hum',
    color='cnt',
    size='windspeed',
    facet_col='season',
    facet_row='weathersit',
)
fig.update_layout(height=1000)
fig.show()

In [None]:
# Rentals against feeling temperature; colour = humidity, marker size = windspeed,
# one panel per (weather situation, season) pair
fig = px.scatter(
    data_frame=bike,
    x='atemp',
    y='cnt',
    color='hum',
    size='windspeed',
    facet_col='season',
    facet_row='weathersit',
)
fig.update_layout(height=1000)
fig.show()

In [None]:
# Rentals against humidity; colour = feeling temperature, marker size = windspeed,
# one panel per (weather situation, season) pair
fig = px.scatter(
    data_frame=bike,
    x='hum',
    y='cnt',
    color='atemp',
    size='windspeed',
    facet_col='season',
    facet_row='weathersit',
)
fig.update_layout(height=1000)
fig.show()

In [None]:
# NOTE(review): same arguments as the earlier atemp-vs-hum scatter cell; this looks
# like a duplicate that could be removed.
scatter_options = dict(
    x='atemp',
    y='hum',
    color='cnt',
    size='windspeed',
    facet_col='season',
    facet_row='weathersit',
)
fig = px.scatter(bike, **scatter_options)
fig.update_layout(height=1000)
fig.show()

In [None]:
# Preview the frame before the one-hot encoding steps
bike.head()

In [None]:
# Dummy-encode 'week_day' (first category dropped) and append the indicator columns
weekday_dummies = pd.get_dummies(bike['week_day'], drop_first=True)
bike = pd.concat([bike, weekday_dummies], axis=1)
bike.head()

In [None]:
# Dummy-encode 'month' (first category dropped) and append the indicator columns
month_dummies = pd.get_dummies(bike['month'], drop_first=True)
bike = pd.concat([bike, month_dummies], axis=1)
bike.head()

In [None]:
# Dummy-encode 'season' (first category dropped) and append the indicator columns
season_dummies = pd.get_dummies(bike['season'], drop_first=True)
bike = pd.concat([bike, season_dummies], axis=1)
bike.head()

In [None]:
# Dummy-encode 'weathersit' (first category dropped) and append the indicator columns
weather_dummies = pd.get_dummies(bike['weathersit'], drop_first=True)
bike = pd.concat([bike, weather_dummies], axis=1)
bike.head()

In [None]:
# Remove the source columns of the dummy encoding, plus the derived 'year' label
encoded_sources = ['weekday','mnth','month','week_day','year','weathersit','season']
bike = bike.drop(columns=encoded_sources)
bike.head()

In [None]:
# Correlation matrix restricted to 'temp' and 'atemp' (expected to be highly correlated)
bike.loc[:, ['temp', 'atemp']].corr()

In [None]:
# Dropping temp column (it is highly correlated with 'atemp', which is kept).
# `columns=` states the intent explicitly; the previous `axis=True` only worked
# because True compares equal to 1.
bike = bike.drop(columns=['temp'])

### Train Test Split

In [None]:
# 70 % of the rows go to the training frame, the rest to the test frame;
# random_state is fixed so the split is the same on every run.
df_train,df_test=train_test_split(bike,train_size=0.7,random_state=100) 
# Show the resulting (rows, columns) of both parts
df_train.shape,df_test.shape

In [None]:
# Preview the training split
df_train.head()

In [None]:
# Preview the test split
df_test.head()

### Feature Scaling

In [None]:
scaler = RobustScaler() # Initialize a RobustScaler object

num_cols = ['atemp', 'hum', 'windspeed','cnt'] # Numerical variables to scale (the target 'cnt' is included)

# Show the unscaled values of those columns for reference
df_train[num_cols]

In [None]:
# Learn the scaling parameters from the training rows, then scale those rows
scaler.fit(df_train[num_cols])
df_train[num_cols] = scaler.transform(df_train[num_cols])
df_train.head()

In [None]:
# Summary statistics of the training frame after scaling
df_train.describe()

#### Feature importance 

In [None]:
# Annotated correlation heatmap of the training frame (undefined correlations shown as 0)
train_corr = df_train.corr()
train_corr_labels = train_corr.columns.to_list()

fig = ff.create_annotated_heatmap(
    np.array(train_corr.round(2).fillna(0)),
    x=train_corr_labels,
    y=train_corr_labels,
    colorscale='bluyl',
    showscale=True,
)
fig.update_layout(height=1000)
fig.show()

In [None]:
# Rank the features by the absolute value of their correlation with the target 'cnt'
abs_corr_with_cnt = df_train.corr().loc['cnt', :].abs().sort_values(ascending=False)

fig = px.bar(
    abs_corr_with_cnt,
    title='Feature importance by Correlation Coefficient',
    color=abs_corr_with_cnt,
    text=abs_corr_with_cnt.round(2),
)
fig.update_traces(textposition='outside')
fig.show()

### Train Model

In [None]:
# X Train & Y train
# pop() removes 'cnt' from df_train in place and returns it as the target series
y_train=df_train.pop('cnt')
# X_train is the same object as df_train (no copy), now holding only the predictors
X_train=df_train
X_train.head()

In [None]:
# Preview the (scaled) training target
y_train.head()

#### Feature Selection

In [None]:
# Recursive Feature Elimination with a linear model, keeping 14 features.
lm = LinearRegression()
lm.fit(X_train,y_train)
# n_features_to_select is passed by keyword: the positional form was deprecated and
# is rejected by current scikit-learn releases.
rfe = RFE(lm, n_features_to_select=14)
rfe = rfe.fit(X_train, y_train)

# One row per feature: whether RFE kept it and its elimination rank
df = pd.DataFrame({'Features': X_train.columns,
                   'RFE Support': rfe.support_,
                   'Rank': rfe.ranking_})
# Negate the rank of rejected features so they are drawn below the axis
df.loc[~df['RFE Support'], 'Rank'] *= -1

fig = px.bar(data_frame=df,x='Features',y='Rank',color='Rank',title='Feature Importance using RFE')

fig.update_layout(height=500)
fig.show()

In [None]:
# Rows of the RFE table for the features that were kept
df.loc[df['RFE Support']]

In [None]:
# Names of the features that RFE kept
df.loc[df['RFE Support'], 'Features'].to_list()

In [None]:
# Restrict the training predictors to the RFE-selected features
selected_features = df.loc[df['RFE Support'], 'Features'].to_list()
X_train = X_train[selected_features]
X_train.head()

#### First Model

In [None]:
# First model: OLS on all RFE-selected features plus a constant column
X_train_sm = sm.add_constant(X_train)
lr = sm.OLS(endog=y_train, exog=X_train_sm)
lr_model = lr.fit()

# Fitted coefficients
lr_model.params

In [None]:
# Full regression report of the first model
lr_model.summary()

In [None]:
# Variance inflation factor of every predictor in the first model, highest first
vif = pd.DataFrame({
    'Features': X_train.columns,
    'VIF': [round(variance_inflation_factor(X_train.values, col_idx), 2)
            for col_idx in range(X_train.shape[1])],
}).sort_values(by='VIF', ascending=False)
vif

In [None]:
# Remove 'May' because of its p-value; X becomes the working feature set from here on
X = X_train.drop(columns='May')
X.head()

#### Second Model

In [None]:
# Second model: refit OLS on the reduced feature set plus a constant column
X_train_sm = sm.add_constant(X)
lr = sm.OLS(endog=y_train, exog=X_train_sm)
lr_model = lr.fit()

# Regression report
lr_model.summary()

In [None]:
# Variance inflation factor of every predictor in the second model, highest first
vif = pd.DataFrame({
    'Features': X.columns,
    'VIF': [round(variance_inflation_factor(X.values, col_idx), 2)
            for col_idx in range(X.shape[1])],
}).sort_values(by='VIF', ascending=False)
vif

In [None]:
# 'holiday' is removed because of its high p-value
X = X.drop(columns='holiday')
X.head()

#### Third Model

In [None]:
# Third model: refit OLS after removing 'holiday'
X_train_sm = sm.add_constant(X)
lr = sm.OLS(endog=y_train, exog=X_train_sm)
lr_model = lr.fit()

# Regression report
lr_model.summary()

In [None]:
# Variance inflation factor of every predictor in the third model, highest first
vif = pd.DataFrame({
    'Features': X.columns,
    'VIF': [round(variance_inflation_factor(X.values, col_idx), 2)
            for col_idx in range(X.shape[1])],
}).sort_values(by='VIF', ascending=False)
vif

In [None]:
# 'fall' is removed because of its high VIF value
X = X.drop(columns='fall')
X.head()

#### 4th Model

In [None]:
# 4th model: refit OLS after removing 'fall'
X_train_sm = sm.add_constant(X)
lr = sm.OLS(endog=y_train, exog=X_train_sm)
lr_model = lr.fit()

# Regression report
lr_model.summary()

In [None]:
# Variance inflation factor of every predictor in the 4th model, highest first
vif = pd.DataFrame({
    'Features': X.columns,
    'VIF': [round(variance_inflation_factor(X.values, col_idx), 2)
            for col_idx in range(X.shape[1])],
}).sort_values(by='VIF', ascending=False)
vif

In [None]:
# 'October' is removed because of its high p-value
X = X.drop(columns='October')
X.head()

#### 5th Model

In [None]:
# 5th model: refit OLS after removing 'October'
X_train_sm = sm.add_constant(X)
lr = sm.OLS(endog=y_train, exog=X_train_sm)
lr_model = lr.fit()

# Regression report
lr_model.summary()

In [None]:
# Variance inflation factor of every predictor in the 5th model, highest first
vif = pd.DataFrame({
    'Features': X.columns,
    'VIF': [round(variance_inflation_factor(X.values, col_idx), 2)
            for col_idx in range(X.shape[1])],
}).sort_values(by='VIF', ascending=False)
vif

### Residual Analysis

In [None]:
# Training residuals of the latest fitted model and their distribution.
y_train_pred = lr_model.predict(X_train_sm)
res = y_train - y_train_pred
# histplot with a KDE overlay on a density scale is the supported replacement for
# the deprecated sns.distplot.
sns.histplot(res, kde=True, stat='density')

In [None]:
# R-squared of the model on the training data
r2_score(y_train, y_train_pred)

### Model Evaluation

In [None]:
# Scale the test rows with the already-fitted scaler: transform() only, no refit
num_cols = ['atemp', 'hum', 'windspeed','cnt']
df_test[num_cols]=scaler.transform(df_test[num_cols])
df_test.head()

In [None]:
# Separate the test target, then keep exactly the predictors of the latest feature set X
y_test = df_test.pop('cnt')
final_features = X.columns.to_list()
X_test = df_test.loc[:, final_features]

# Same constant column as used for training
X_test_sm = sm.add_constant(X_test)
X_test_sm.head()

In [None]:
# Preview the (scaled) test target
y_test.head()

In [None]:
# Predict on the test set and report R-squared
y_test_pred = lr_model.predict(X_test_sm)

r2_score(y_test, y_test_pred)

In [None]:
# Residual = actual minus predicted, for both splits
res_train = y_train.sub(y_train_pred)
res_test = y_test.sub(y_test_pred)

In [None]:
# Long-format table of target, prediction and residual, tagged by split
df1 = pd.DataFrame({
    'Target': y_test.values.tolist(),
    'Prediction': y_test_pred.values.tolist(),
    'Residual': res_test.values.tolist(),
}).assign(split='test')

df2 = pd.DataFrame({
    'Target': y_train.values.tolist(),
    'Prediction': y_train_pred.values.tolist(),
    'Residual': res_train.values.tolist(),
}).assign(split='train')

# Test rows first, then training rows (original row indices are kept)
df3 = pd.concat([df1, df2])
df3.head(10)

In [None]:
# Residuals against predictions for both splits, with an OLS trendline per split
# and a violin plot of the residuals on the margin
fig = px.scatter(
    data_frame=df3,
    x='Prediction',
    y='Residual',
    color='split',
    trendline='ols',
    marginal_y='violin',
)
fig.update_layout(
    height=600,
    title_text='Residual Analysis',
    title_x=0.5,
    title_font_size=20,
)
fig.show()

# End