In [None]:
#importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# 1.Reading and Understanding Data

In [None]:
df = pd.read_csv("day.csv")

In [None]:
df.head()

In [None]:
df.info()

Since every column has 730 non-null rows, there are no missing values in the dataset.

In [None]:
df.describe()

In [None]:
# Replace the integer codes of the categorical columns with readable labels.
# Codes that are missing from a lookup table become NaN (Series.map semantics).
label_maps = {
    'weathersit': {1: 'Clear', 2: 'Mist', 3: 'Light Snow/Rain', 4: 'Heavy Snow/Rain'},
    'season': {1: 'spring', 2: 'summer', 3: 'fall', 4: 'winter'},
    'weekday': {
        0: 'Sunday', 1: 'Monday', 2: 'Tuesday', 3: 'Wednesday',
        4: 'Thursday', 5: 'Friday', 6: 'Saturday',
    },
    'mnth': {
        1: 'January', 2: 'February', 3: 'March', 4: 'April',
        5: 'May', 6: 'June', 7: 'July', 8: 'August',
        9: 'September', 10: 'October', 11: 'November', 12: 'December',
    },
}
for column, labels in label_maps.items():
    df[column] = df[column].map(labels)

#### Analysing the Numerical Columns

In [None]:
df_1=df[['cnt','temp','atemp','hum','windspeed']]
sns.pairplot(df_1)

Temp and atemp both have a roughly linear relationship with cnt. Temp and atemp are also highly correlated with each other, which may result in multicollinearity.

#### Analysing Categorical columns

In [None]:
# Box plots of cnt against each categorical predictor, laid out on a 2x3 grid.
# The flag marks the panels whose tick labels are long category names and
# therefore get rotated by 45 degrees.
panels = [
    ('yr', False), ('mnth', True), ('season', True),
    ('holiday', False), ('workingday', False), ('weekday', True),
]
plt.figure(figsize=(15,10))
for position, (column, rotate_ticks) in enumerate(panels, start=1):
    ax = plt.subplot(2, 3, position)
    sns.boxplot(x=column, y='cnt', data=df)
    if rotate_ticks:
        ax.tick_params(labelrotation=45)

Working day seems to be a useful predictor. With a median of around 5000, more than 50% of the bikes were rented on working days.



More rentals were made in the year 2019.


The median count of rentals for each weekday seems to be around 5000

From May to October the rental counts were high; many bookings were made during that period of the year.

Almost 32% of the bike bookings happened in fall, with a median of almost 5000 bookings.

In [None]:
# Mean rental count (with its confidence interval) per weather situation.
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them as
# positional arguments, while older releases accept the keyword form as well.
sns.barplot(x='weathersit', y='cnt', data=df)
plt.show()

More bikes were rented during clear weather

In [None]:
#Removing unwanted col
df=df.drop(['instant','dteday','registered','casual'], axis=1)

# 2.Data Preparation

In [None]:
df.head()

#### Dummy variable creation

In [None]:
# One-hot encode each categorical column, dropping the first (alphabetical)
# level so that it acts as the baseline category.
# dtype=int keeps the dummies numeric: pandas 2.x defaults to bool columns,
# which make the mixed frame an object array once it is handed to statsmodels
# (OLS and variance_inflation_factor both need a purely numeric matrix).
weather=pd.get_dummies(df['weathersit'],drop_first=True,dtype=int)
Season=pd.get_dummies(df['season'],drop_first=True,dtype=int)
day=pd.get_dummies(df['weekday'],drop_first=True,dtype=int)
month=pd.get_dummies(df['mnth'],drop_first=True,dtype=int)

In [None]:
df=pd.concat([df,weather,Season,day,month],axis=1)

In [None]:
#dropping the columns for which dummy variables are created
df=df.drop(["weathersit","season","weekday","mnth"],axis=1)

In [None]:
df.head()

# 3.Initial Steps

#### Splitting Train and Test

In [None]:
df_train,df_test=train_test_split(df,train_size=0.7,random_state=100)

In [None]:
df_train.shape

In [None]:
df_test.shape

#### Re-scaling

In [None]:
scaler=MinMaxScaler()

In [None]:
scale_vars=["temp","hum","windspeed","cnt","atemp"]

In [None]:
df_train[scale_vars]=scaler.fit_transform(df_train[scale_vars])

In [None]:
df_train.head()

# 4.Training the Model

In [None]:
# Annotated correlation heatmap of every column in the training frame.
plt.figure(figsize=(30,25))
ax=sns.heatmap(df_train.corr(),annot=True,cmap="Greens")
# Shift both y-limits outwards by half a unit.
# NOTE(review): this looks like the usual workaround for heatmaps whose first
# and last rows were cut in half by matplotlib 3.1.1 — confirm it is still
# needed with the installed version.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
plt.show()

Temperature seems to be the variable most correlated with count. This heatmap will be used as a reference while building the models.

In [None]:
#Dividing the data into X and y
y_train = df_train.pop('cnt')
X_train = df_train

# 5.Building the Model 

In [None]:
lm = LinearRegression()
lm.fit(X_train, y_train)

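##### Recursive feature elimination (RFE) is used to automatically select features: it repeatedly fits the model and discards the least important feature (the one with the smallest coefficient magnitude)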

In [None]:
# Keep the 15 highest-ranked features.
# n_features_to_select is passed by keyword: it is keyword-only in current
# scikit-learn releases, and older releases accept the keyword form too.
rfe=RFE(lm,n_features_to_select=15)
rfe=rfe.fit(X_train,y_train)

In [None]:
list(zip(X_train.columns,rfe.support_,rfe.ranking_))

In [None]:
col=X_train.columns[rfe.support_]
col

In [None]:
X_train.columns[~rfe.support_]

In [None]:
X_train_rfe=X_train[col]

##### Building the OLS Model 

In [None]:
X_train_rfe=sm.add_constant(X_train_rfe)


In [None]:
lm=sm.OLS(y_train,X_train_rfe).fit()

In [None]:
lm.summary()

##### Checking the VIF 

In [None]:
#Dropping the const column
X_train_rfe = X_train_rfe.drop(['const'], axis=1)

In [None]:
vif=pd.DataFrame()

In [None]:
vif['Features']=X_train_rfe.columns

In [None]:
# One VIF per column: variance_inflation_factor takes the whole design matrix
# plus the index of the column to evaluate. Values are rounded to 2 decimals.
# NOTE(review): 'const' was dropped from X_train_rfe before this cell, so the
# VIFs are computed without an intercept — confirm against the statsmodels
# documentation whether that is the intended definition.
vif['VIF'] = [variance_inflation_factor(X_train_rfe.values, i) for i in range(X_train_rfe.shape[1])]
vif['VIF']=round(vif['VIF'],2)

In [None]:
vif=vif.sort_values(by='VIF',ascending=False)
vif

In [None]:
# Dropping December since it has the highest p-value of all the features
X_train_new1 = X_train_rfe.drop(["December"], axis = 1)


#### Building the model again

In [None]:
X_train_rfe1=sm.add_constant(X_train_new1)
lm1=sm.OLS(y_train,X_train_rfe1).fit()
lm1.summary()

In [None]:
# VIF table for the reduced model; the intercept column is removed first.
X_train_rfe1 = X_train_rfe1.drop(columns=["const"])
design = X_train_rfe1.values
vif = pd.DataFrame({
    'Features': X_train_rfe1.columns,
    'VIF': [variance_inflation_factor(design, idx) for idx in range(design.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by='VIF', ascending=False)
vif

In [None]:
# Dropping November since it has a high p-value and a low VIF
X_train_rfe2 = X_train_rfe1.drop(["November"], axis = 1)

#### Building the next model

In [None]:
X_train_rfe2=sm.add_constant(X_train_rfe2)
lm2=sm.OLS(y_train,X_train_rfe2).fit()
lm2.summary()

In [None]:
# VIF table for model 2; the intercept column is removed first.
X_train_rfe2 = X_train_rfe2.drop(columns=['const'])
design = X_train_rfe2.values
vif = pd.DataFrame({
    'Features': X_train_rfe2.columns,
    'VIF': [variance_inflation_factor(design, idx) for idx in range(design.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by='VIF', ascending=False)
vif

In [None]:
#humidity can be dropped due to high VIF
X_train_rfe3 = X_train_rfe2.drop(["hum"], axis = 1)

#### Building the Model3

In [None]:
X_train_rfe3=sm.add_constant(X_train_rfe3)
lm3=sm.OLS(y_train,X_train_rfe3).fit()
lm3.summary()

In [None]:
# VIF table for model 3; the intercept column is removed first.
X_train_rfe3 = X_train_rfe3.drop(columns=['const'])
design = X_train_rfe3.values
vif = pd.DataFrame({
    'Features': X_train_rfe3.columns,
    'VIF': [variance_inflation_factor(design, idx) for idx in range(design.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by='VIF', ascending=False)
vif

In [None]:
#January has high P-value and low VIF so it can be dropped
X_train_rfe4 = X_train_rfe3.drop(["January"], axis = 1)

#### Building model4

In [None]:
X_train_rfe4=sm.add_constant(X_train_rfe4)
lm4=sm.OLS(y_train,X_train_rfe4).fit()
lm4.summary()

In [None]:
# VIF table for model 4; the intercept column is removed first.
X_train_rfe4 = X_train_rfe4.drop(columns=['const'])
design = X_train_rfe4.values
vif = pd.DataFrame({
    'Features': X_train_rfe4.columns,
    'VIF': [variance_inflation_factor(design, idx) for idx in range(design.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by='VIF', ascending=False)
vif

In [None]:
# Removing July since it has a high p-value and a low VIF
X_train_rfe5 = X_train_rfe4.drop(["July"], axis = 1)

#### Building Model5

In [None]:
X_train_rfe5=sm.add_constant(X_train_rfe5)
lm5=sm.OLS(y_train,X_train_rfe5).fit()
lm5.summary()

In [None]:
# VIF table for model 5; the intercept column is removed first.
X_train_rfe5 = X_train_rfe5.drop(columns=['const'])
design = X_train_rfe5.values
vif = pd.DataFrame({
    'Features': X_train_rfe5.columns,
    'VIF': [variance_inflation_factor(design, idx) for idx in range(design.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by='VIF', ascending=False)
vif

In [None]:
# Removing windspeed since it has a high VIF and a low correlation
X_train_rfe6 = X_train_rfe5.drop(["windspeed"], axis = 1)

#### Building Model6

In [None]:
X_train_rfe6=sm.add_constant(X_train_rfe6)
lm6=sm.OLS(y_train,X_train_rfe6).fit()
lm6.summary()

In [None]:
# VIF table for the final model.
# X_train_rfe6 still carries the 'const' column added for OLS, and
# lm6.predict(X_train_rfe6) needs it later, so the intercept is excluded on a
# copy here. That keeps this table comparable with the VIF tables of the
# earlier models, which were all computed without 'const', and avoids a
# spurious 'const' row. errors='ignore' keeps the cell re-runnable.
vif_features = X_train_rfe6.drop(columns=['const'], errors='ignore')
vif=pd.DataFrame()
vif['Features']=vif_features.columns
vif['VIF'] = [variance_inflation_factor(vif_features.values, i) for i in range(vif_features.shape[1])]
vif['VIF']=round(vif['VIF'],2)
vif=vif.sort_values(by='VIF',ascending=False)
vif

The Model seems to be good now

The co-efficient values of the features are below:
- yr	          0.2332
- holiday	     -0.0991
- temp	          0.4896
- LightSnow/Rain -0.2998
- Mist	         -0.0770
- spring	     -0.0648
- summer	      0.0523
- winter	      0.0957
- September	      0.0954

#### F-Statistics:
The higher the F-statistic, the stronger the evidence that the model as a whole is significant. In our case it is __255.2__

#### P value:
All features have a p-value below 0.005, so every retained coefficient is statistically significant.

#### VIF
All VIF values are below 5, so there is no serious multicollinearity among the retained features.

#### The best fit equation:
cnt = 0.1414 + (yr × 0.2332) − (holiday × 0.0991) + (temp × 0.4896) − (LightSnow/Rain × 0.2998) − (Mist × 0.0770) − (spring × 0.0648) + (summer × 0.0523) + (winter × 0.0957) + (September × 0.0954)

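#### Coefficient interpretation:
Both `cnt` and `temp` were min-max scaled to the range 0–1, so every coefficient is expressed in units of scaled `cnt`, with the other predictors held constant.
- __temp__ : A one-unit increase in scaled temp (i.e. going from the lowest to the highest observed temperature) raises the scaled bike rental count by 0.4896
- __holiday__ : On a holiday the scaled bike rental count is lower by 0.0991
- __year__ : The second year (`yr` = 1) has a scaled rental count higher by 0.2332 than the first year
- __LightSnow/Rain__ : In light snow/rain the scaled rental count decreases by 0.2998 compared with clear weather
- __mist__ : Mist lowers the scaled rental count by 0.0770 compared with clear weather
- __summer__ : Has a positive effect on rentals compared with fall
- __winter__ : Has a positive effect on rentals compared with fall
- __spring__ : Has a negative effect on rentals compared with fall
- __September__ : During this month the scaled rental count is higher by 0.0954 than in the other months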

# 6.Residual Analysis of the train data

In [None]:
#predicting the y value for training set
y_train_pred=lm6.predict(X_train_rfe6)

In [None]:
#Finding error terms
res = y_train - y_train_pred

In [None]:
# Distribution of the training residuals (histogram with a density curve).
# NOTE(review): distplot is deprecated in recent seaborn releases in favour of
# histplot/displot — confirm which seaborn version this notebook targets.
sns.distplot(res)

Error term seems to follow a Normal Distribution.

In [None]:
# Residuals against the fitted values.
# Residuals are correlated with the observed target by construction, so
# plotting them against y_train always shows a trend. Plotting against the
# predictions is the form that reveals non-linearity or non-constant variance.
plt.scatter(y_train_pred,res)
plt.xlabel('Fitted values')
plt.ylabel('Residuals')
plt.show()


The error terms are scattered with no visible pattern, so they appear to be independent of each other. They also seem to have constant variance (homoscedasticity).

In [None]:
#calculating r2 value for train data
r2_train=r2_score(y_train,y_train_pred)
r2_train

In [None]:
#calculating adjusted r2 for train data
# n: number of training observations
n = X_train.shape[0]

# p: number of predictors in the fitted final model (intercept excluded).
# X_train.shape[1] would count every candidate column, including the ones
# removed by RFE and by the manual elimination, and over-penalise the score.
p = int(lm6.df_model)

adjusted_r2_train = 1-(1-r2_train)*(n-1)/(n-p-1)
adjusted_r2_train

# 7.Prediction and Evaluation on test set

In [None]:
scale_vars=["temp","hum","windspeed","cnt","atemp"]

In [None]:
# Scale the test set with the scaler that was fitted on the training data.
# Re-fitting on the test set would apply a different min/max than the one the
# model was trained with and would leak test-set statistics into the evaluation.
df_test[scale_vars]=scaler.transform(df_test[scale_vars])

In [None]:
df_test.head()

In [None]:
y_test=df_test.pop('cnt')
X_test=df_test
X_test.head()

In [None]:
# Select the final model's predictors from the test set.
# 'const' is filtered out on the fly instead of being dropped from
# X_train_rfe6 in place, so the training design matrix stays intact and the
# cell can be re-run without a KeyError.
final_features = [c for c in X_train_rfe6.columns if c != "const"]
X_test_new = X_test[final_features]

In [None]:
# Adding a constant variable 
X_test_new1 = sm.add_constant(X_test_new)
X_test_new1.head()

In [None]:
#predicting y value for test set
y_pred = lm6.predict(X_test_new1)

In [None]:
#calculatin gerror terms for test set
res = y_test - y_pred

In [None]:
# Distribution of the test residuals (histogram with a density curve).
# NOTE(review): distplot is deprecated in recent seaborn releases in favour of
# histplot/displot — confirm which seaborn version this notebook targets.
sns.distplot(res)

Error term seems to follow a Normal Distribution

In [None]:
# Residuals against the predicted values.
# Residuals are correlated with the observed target by construction, so
# plotting them against y_test always shows a trend. Plotting against the
# predictions is the form that reveals non-linearity or non-constant variance.
plt.scatter(y_pred,res)
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
plt.show()

The error terms seem to be random and appear to be homoscedastic.

In [None]:
plt.scatter(y_test,y_pred)

In [None]:
#calculating r2 value for test set

r2_test=r2_score(y_test,y_pred)
r2_test

In [None]:
#calculating adjusted r2 value for test set
# n: number of test observations
n = X_test.shape[0]

# p: number of predictors actually used by the final model. X_test_new holds
# exactly those columns, whereas X_test.shape[1] would count every candidate
# column and over-penalise the score.
p = X_test_new.shape[1]

adjusted_r2_test = 1-(1-r2_test)*(n-1)/(n-p-1)
adjusted_r2_test

In [None]:
#calculating MSE
mean_squared_error(y_test,y_pred)

The MSE is close to 0 (note that it is measured on the min-max scaled `cnt`, not on raw rental counts). Hence we can consider the model acceptable.

## Model Conclusion

- __r2_score_train__ : 0.8212172937848272
- __adj_r2_score_train__ : 0.8104158386176605
- __r2_score_test__  : 0.8052122801477507
- __adj_r2_score_test__  : 0.7754815229071442

This seems to be a really good model that generalizes well: the R² on the test set is close to the R² on the training set.

## Insights
#### Positive Impacts:
- The higher the __temperature__, the more bike rentals there are
- Bike rentals increase from one __year__ to the next
- Bike rentals are higher during the __summer__ and __winter__ seasons
- The month of __September__ has more rentals than other months

#### Negative Impacts:
- On a __holiday__ the number of bikes rented is lower
- __Snow/Rain__ negatively affects rentals
- The __spring__ season has a decreased number of rentals