   # Bike Sharing Assignment

In [None]:
# Importing all the required libraries and packages 
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import RFE

### Loading the data

In [None]:
# Loading the provided day-level CSV into a dataframe
bike=pd.read_csv(r'../input/bikesharing/day.csv')

In [None]:
# Checking that the dataset is loaded into the dataframe as required
bike.head()

### Inspecting the dataframe

In [None]:
# Inspecting the shape of the dataframe
bike.shape

In [None]:
bike.info()

In [None]:
# Inspecting the numerical variables of the dataframe
bike .describe()

In [None]:
# Dropping the column named 'instant' as it is nothing but the index 
bike.drop(['instant'],axis=1,inplace=True)

In [None]:
# Checking for null values in the columns of the dataframe
bike.isnull().sum()

In [None]:
# Verifying if the dates of the month in the dtday column are valid.
bike.dteday.apply(lambda x: int(x.split('-')[0])).describe()

In [None]:
# Verifying if the number of months is also valid
bike.dteday.apply(lambda x: int(x.split('-')[1])).describe()

In [None]:
bike.dteday.apply(lambda x: int(x.split('-')[2])).describe()

In [None]:
# Creating the new column of date and fetching days of the month in that column
bike['Date']=bike.dteday.apply(lambda x: int(x.split('-')[0]))

In [None]:
# Inspecting the new column so formed 
bike.Date.describe()

In [None]:
# Checking the 'mnth' column  
bike.mnth.describe()

In [None]:
# Dropping the 'dteday' column as its contents are well explained by other columns 
bike.drop(['dteday'],axis=1,inplace=True)

In [None]:
# Checking the dataframe again for how it looks
bike.head()

## Exploratory data analysis of the dataframe

In [None]:
# Plotting the barplot for "day of month" vs the target variable "Count"
plt.figure(figsize=(10,6))
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally
sns.barplot(x=bike.Date, y=bike.cnt)
plt.title('Day vs Count \n',fontdict={'fontsize': 18, 'fontweight' : 10, 'color' : 'Blue'})
plt.xlabel("Days of a month ", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.ylabel("Ride Count (In Thousands) ", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.show()

In [None]:
# Dropping the 'Date' column as it seems, it does not have much of significance impact on our target variable
bike.drop('Date',axis=1,inplace=True)

In [None]:
# Reassigning names to the values of column 'season' as provieded in the dictionary
bike['season'].replace({1:"spring",2:"summer",3:"fall",4:"winter"},inplace=True)

In [None]:
# Plotting a barplot 'season' vs 'count'
sns.barplot(bike.season,bike.cnt)
plt.title('Season vs Count \n',fontdict={'fontsize': 18, 'fontweight' : 10, 'color' : 'Blue'})
plt.xlabel("Seasons ", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.ylabel("Ride Count (In Thousands) ", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.show()

In [None]:
# Renaming the column 'yr' as 'Year'
bike=bike.rename(columns={'yr':'Year'})

In [None]:
#Plotting barplot 'Year' vs 'count'
sns.barplot(bike.Year,bike.cnt)
plt.title('Year vs Count \n',fontdict={'fontsize': 18, 'fontweight' : 10, 'color' : 'Blue'})
plt.xlabel("Years ", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.ylabel("Ride Count (In Thousands) ", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.show()

In [None]:
#Plotting barplot Month vs Count
plt.figure(figsize=(10,6))
sns.barplot(bike.mnth,bike.cnt)
plt.title('Months vs Count \n',fontdict={'fontsize': 18, 'fontweight' : 10, 'color' : 'Blue'})
plt.xlabel("Months ", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.ylabel("Ride Count (In Thousands) ", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.show()

In [None]:
# Plotting barplot Holiday vs Count
sns.barplot(bike.holiday,bike.cnt)
plt.title('Holiday vs Count \n',fontdict={'fontsize': 18, 'fontweight' : 10, 'color' : 'Blue'})
plt.xlabel("Holidays ", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.ylabel("Ride Count (In Thousands) ", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.show()

In [None]:
# Plotting barplot weekday vs Count
sns.barplot(bike.weekday,bike.cnt)
plt.title('Weekday vs Count \n',fontdict={'fontsize': 18, 'fontweight' : 10, 'color' : 'Blue'})
plt.xlabel("Weekday ", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.ylabel("Ride Count (In Thousands) ", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.show()

In [None]:
# Plotting barplot of count vs workingday
sns.barplot(bike.workingday,bike.cnt)
plt.title('Workingday vs Count \n',fontdict={'fontsize': 18, 'fontweight' : 10, 'color' : 'Blue'})
plt.xlabel("Workingkday ", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.ylabel("Ride Count (In Thousands) ", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.show()

In [None]:
# Replcing values of the column 'weathersit' as provided in the data dictionary
bike['weathersit'].replace({1:"Clear_Few Clouds",2:"Mist_cloudy",3:"Light rain_Light snow_Thunderstorm",4:'Heavy Rain_Ice Pallets_Thunderstorm_Mist'},inplace=True)

In [None]:
# Plotting barplot of count vs weather
sns.barplot(bike.weathersit,bike.cnt)
plt.title('Weather vs Count \n',fontdict={'fontsize': 18, 'fontweight' : 10, 'color' : 'Blue'})
plt.xlabel("Weather ", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.ylabel("Ride Count (In Thousands) ", fontdict={'fontsize': 12, 'fontweight' : 5, 'color' : 'Brown'})
plt.xticks(rotation=30)
plt.show()

In [None]:
#Checking the names of the columns in the dataframe
bike.columns

In [None]:
# Dropping the columns which are irrelevant as the same information is given by some other column cummulatively
bike.drop(['casual','registered'],axis=1,inplace=True)
# Renaming the columns as felt fit for usage
bike=bike.rename(columns={'mnth':'Month'})
bike=bike.rename(columns={'hum':'Humidity'})
bike=bike.rename(columns={'cnt':'Count'})

In [None]:
# Checking the information of the dataframe after performing all above operations
bike.info()

In [None]:
# Converting the numeric variales to pandas numeric
bike[['temp','atemp','Humidity','windspeed','Count']]=bike[['temp','atemp','Humidity','windspeed','Count']].apply(pd.to_numeric)

In [None]:
# Plotting the pairplot for the numeric variables
sns.pairplot(bike, vars=['temp','atemp','Humidity','windspeed','Count'])
plt.show()

In [None]:
# PLotting the heatmap for checking the correlations of the variables of the dataframe 
plt.figure(figsize=(10,6))
sns.heatmap(bike.corr(), annot=True,cmap='Blues')
plt.show()

In [None]:
# Checking the values of the target variable is normally distributed
sns.distplot(bike.Count)
plt.show()

In [None]:
# Dropping the variable 'temp' as it is too much correlated with the variable 'atemp'
bike.drop('temp',axis=1,inplace=True)

In [None]:
# Converting  the dtypes of certain variables as saw fit 
bike['season']=bike['season'].astype('object')
bike['weathersit']=bike['weathersit'].astype('object')
bike['Month']=bike['Month'].astype('object')
bike['weekday']=bike['weekday'].astype('object')

In [None]:
bike.head()

In [None]:
bike.info()

In [None]:
# Creating the dummies of multiple features that can be useful for the model building
bikedf=pd.get_dummies(bike,drop_first=True)

In [None]:
# Checking the ionfo of new dataframe after creating the dummies 
bikedf.info()

In [None]:
# Checking the dataframe for how it looks
bikedf.head()

In [None]:
## Checking the columns of the new dataframe
bikedf.columns

### Splitting the dataframe into test and train dataframes

In [None]:
# Splitting the dataframe into train (70%) and test (30%) dataframes;
# random_state is fixed so the same rows land in each part on every run
np.random.seed(0)
bikedf_train, bikedf_test= train_test_split(bikedf,train_size=0.7,test_size=0.3,random_state=100)

In [None]:
# Checking the train dataframe for how it looks
bikedf_train.head()

#### Rescaling the features

In [None]:
# Rescaling the numeric variables 
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
num_vars=['Humidity','windspeed','Count','atemp']
bikedf_train[num_vars] = scaler.fit_transform(bikedf_train[num_vars])

In [None]:
# Checking the shape of the train dataframe
bikedf_train.shape

In [None]:
#Checking the shape of test dataframe
bikedf_test.shape

In [None]:
# Checking the train dataframe for how it looks after rescaling the numeric variables
bikedf_train.head()

In [None]:
# Checking the description of the train dataframe after rescaling for reassurance
bikedf_train.describe()

#### Dividing the train dataset into  X and Y  sets for the model building

In [None]:
# Splitting the train dataframe into dependent and independent variables.
# pop() removes 'Count' from bikedf_train in place, so X_train (the same object)
# holds only the predictors afterwards.
y_train=bikedf_train.pop('Count')
X_train=bikedf_train

In [None]:
# Checking all the independent variables
X_train.head()

In [None]:
# Checking the dependent variable
y_train.head()

In [None]:
# Running RFE with the output number of the variable equal to 15
lm = LinearRegression()
lm.fit(X_train, y_train)

rfe = RFE(lm, 15)             # running RFE
rfe = rfe.fit(X_train, y_train)

In [None]:
# Getting the preferance and ranking of the variables as providd by running RFE
list(zip(X_train.columns,rfe.support_,rfe.ranking_))

In [None]:
# Fetching the variables that are to be included in the initial linear model given by RFE
col = X_train.columns[rfe.support_]
col

In [None]:
# Dropping columns which were dropped by RFE
X_train.columns[~rfe.support_]

In [None]:
# Creating X_train dataframe with RFE selected variables
X_train_new = X_train[col]

In [None]:
# Adding a constant variable
X_train_rfe0 = sm.add_constant(X_train_new)

In [None]:
# Running the linear model "lm0"
lm0 = sm.OLS(y_train,X_train_rfe0).fit()

In [None]:
# Seeing the summary of the linear model.
print(lm0.summary())

In [None]:
# Calculating the VIFs
vif = pd.DataFrame()
X = X_train_new
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# Dropping the variable'Humidity' as it has VIF ia 25.72 > 5  
X_train_new=X_train_new.drop(['Humidity'],axis=1)

In [None]:
# Building a new model "lm1"
X_train_rfe1 = sm.add_constant(X_train_new)
lm1 = sm.OLS(y_train,X_train_rfe1).fit()
print(lm1.summary())

In [None]:
# Checking VIFs of variables of new model
vif = pd.DataFrame()
X = X_train_new
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# Dropping variable 'atemp' as its VIF 6.26 > 5
X_train_new=X_train_new.drop('atemp',axis=1)

In [None]:
# Building a new model "lm2"
X_train_rfe2 = sm.add_constant(X_train_new)
lm2=sm.OLS(y_train,X_train_rfe2).fit()
print(lm2.summary())

In [None]:
# Checking VIFs of variables of new model
vif = pd.DataFrame()
X = X_train_new
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# Dropping variable 'season_winter' as its p-value=0.630 which is more than prescribed value of 0.05 
X_train_new=X_train_new.drop('season_winter',axis=1)

In [None]:
# Building a new model "lm3"
X_train_rfe3= sm.add_constant(X_train_new)
lm3=sm.OLS(y_train,X_train_rfe3).fit()
print(lm3.summary())

In [None]:
# Checking VIFs of variables of new model
vif = pd.DataFrame()
X = X_train_new
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# Dropping the variable 'Month_4' as its p-value= 0.619 which is more than 0.05
X_train_new=X_train_new.drop('Month_4',axis=1)

In [None]:
# Building a new model "lm4"
X_train_rfe4=sm.add_constant(X_train_new)
lm4=sm.OLS(y_train,X_train_rfe4).fit()
print(lm4.summary())

In [None]:
# Checking VIFs of variables of new model
vif = pd.DataFrame()
X = X_train_new
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# Adding a new variable to X_train dataframe to check its significance on the target variable 
X_train_new['weekday_1']=X_train['weekday_1'] 

In [None]:
# Building a new model "lm5"
X_train_rfe5=sm.add_constant(X_train_new)
lm5=sm.OLS(y_train,X_train_rfe5).fit()
print(lm5.summary())

In [None]:
# Adding a new variable to X_train dataframe to check its significance on the target variable 
X_train_new['weekday_6']=X_train['weekday_6']

In [None]:
# Building a new model "lm6"
X_train_rfe6 = sm.add_constant(X_train_new)
lm6 = sm.OLS(y_train,X_train_rfe6).fit()
print(lm6.summary())

In [None]:
# Checking VIFs of variables of new model
vif = pd.DataFrame()
X = X_train_new
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# Adding new variable to X_train to check its impact on target variable
X_train_new['workingday']=X_train['workingday']

In [None]:
# Building a new model "lm7"
X_train_rfe7 = sm.add_constant(X_train_new)
lm7 = sm.OLS(y_train,X_train_rfe7).fit()
print(lm7.summary())

In [None]:
# Checking the VIFs of the variable of new model
vif = pd.DataFrame()
X = X_train_new
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# Dropping the variable 'weekday_6' as it p-value is exceeding the limit 
X_train_new=X_train_new.drop('weekday_6',axis=1)

In [None]:
# Building the new model "lm8"
X_train_rfe8 = sm.add_constant(X_train_new)
lm8 = sm.OLS(y_train,X_train_rfe8).fit()
print(lm8.summary())

In [None]:
# Checking the VIFs of the variable of new model
vif = pd.DataFrame()
X = X_train_new
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# Dropping the variable 'workingday' as its p-value exeeds the limit
X_train_new=X_train_new.drop('workingday',axis=1)

In [None]:
# Building the new model  "lm9"
X_train_rfe9 = sm.add_constant(X_train_new)
lm9 = sm.OLS(y_train,X_train_rfe9).fit()
print(lm9.summary())

In [None]:
# Checking the VIFs of the variable of new model
vif = pd.DataFrame()
X = X_train_new
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# Adding the left out month variable 'Month_2'
X_train_new['Month_2']=X_train['Month_2']

In [None]:
#Building the new model "lm10"
X_train_rfe10 = sm.add_constant(X_train_new)
lm10 = sm.OLS(y_train,X_train_rfe10).fit()
print(lm10.summary())

In [None]:
# Checking the VIFs of the variable of new model
vif = pd.DataFrame()
X = X_train_new
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# Dropping the variable 'Month_2' as its p-value exceeds the limit
X_train_new=X_train_new.drop('Month_2',axis=1)

In [None]:
# Adding the left out month variable 'Month_7'
X_train_new['Month_7']=X_train['Month_7']

In [None]:
#Building the new model "lm11"
X_train_rfe11=sm.add_constant(X_train_new)
lm11=sm.OLS(y_train,X_train_rfe11).fit()
print(lm11.summary())

In [None]:
# Checking the VIFs of the variable of new model
vif=pd.DataFrame()
X = X_train_new
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# Adding the left out month variable 'Month_10'
X_train_new['Month_10']=X_train['Month_10']

In [None]:
#Building the new model "lm12"
X_train_rfe12=sm.add_constant(X_train_new)
lm12=sm.OLS(y_train,X_train_rfe12).fit()
print(lm12.summary())

In [None]:
# Checking the VIFs of the variable of new model
vif=pd.DataFrame()
X = X_train_new
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# Adding the left out month variable 'Month_11'
X_train_new['Month_11']=X_train['Month_11']

In [None]:
#Building the new model "lm13"
X_train_rfe13=sm.add_constant(X_train_new)
lm13=sm.OLS(y_train,X_train_rfe13).fit()
print(lm13.summary())

In [None]:
# Checking the VIFs of the variable of new model
vif=pd.DataFrame()
X=X_train_new
vif['Features']=X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
#Dropping the variable 'Month_11' as its p-value exceeds the limit
X_train_new=X_train_new.drop('Month_11',axis=1)

In [None]:
# Adding the left out month variable 'Month_12'
X_train_new['Month_12']=X_train['Month_12']
#Building the new model "lm14"
X_train_rfe14=sm.add_constant(X_train_new)
lm14=sm.OLS(y_train,X_train_rfe14).fit()
print(lm14.summary())

In [None]:
# Checking the VIFs of the variable of new model
vif=pd.DataFrame()
X=X_train_new
vif['Features']=X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
#Dropping the variable 'Month_12' as its p-value exceeds the limit
X_train_new=X_train_new.drop('Month_12',axis=1)

In [None]:
# Adding the left out weekday variable 'weekday_2'
X_train_new['weekday_2']=X_train['weekday_2']
#Builing the new model "lm15"
X_train_rfe15=sm.add_constant(X_train_new)
lm15=sm.OLS(y_train,X_train_rfe15).fit()
print(lm15.summary())

In [None]:
# Checking the VIFs of the variable of new model
vif=pd.DataFrame()
X=X_train_new
vif['Features']=X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# Adding the left out weekday variable 'weekday_3'
X_train_new['weekday_3']=X_train['weekday_3']
#Builing the new model "lm16"
X_train_rfe16=sm.add_constant(X_train_new)
lm16=sm.OLS(y_train,X_train_rfe16).fit()
print(lm16.summary())

In [None]:
# Checking the VIFs of the variable of new model
vif=pd.DataFrame()
X=X_train_new
vif['Features']=X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
#Dropping the variable 'weekday_3' as its p-value exceeds the limit
X_train_new=X_train_new.drop('weekday_3',axis=1)

In [None]:
# Adding the left out weekday variable 'weekday_4'
X_train_new['weekday_4']=X_train['weekday_4']
#Builing the new model "lm17"
X_train_rfe17=sm.add_constant(X_train_new)
lm17=sm.OLS(y_train,X_train_rfe17).fit()
print(lm17.summary())

In [None]:
# Checking the VIFs of the variable of new model
vif=pd.DataFrame()
X=X_train_new
vif['Features']=X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
#Dropping the variable 'weekday_4' as its p-value exceeds the limit
X_train_new=X_train_new.drop('weekday_4',axis=1)

In [None]:
# Adding the left out weekday variable 'weekday_4'
X_train_new['weekday_5']=X_train['weekday_5']
#Builing the new model "lm18"
X_train_rfe18=sm.add_constant(X_train_new)
lm18=sm.OLS(y_train,X_train_rfe18).fit()
print(lm18.summary())

In [None]:
# Checking the VIFs of the variable of new model
vif=pd.DataFrame()
X=X_train_new
vif['Features']=X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
#Dropping the variable 'weekday_5' as its p-value exceeds the limit
X_train_new=X_train_new.drop('weekday_5',axis=1)

In [None]:
#Builing the new model "lm19"
X_train_rfe19=sm.add_constant(X_train_new)
lm19=sm.OLS(y_train,X_train_rfe19).fit()
print(lm19.summary())

In [None]:
# Checking the VIFs of the variable of new mode
vif=pd.DataFrame()
X=X_train_new
vif['Features']=X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# Inspecting the fitted coefficients of the selected model lm19
lm19.params

### Residual Analysis of the train data

In [None]:
# Predicting the values of target variable using the selected model i.e lm19
y_train_Count = lm19.predict(X_train_rfe19)

In [None]:
# Calculating the residual 
residual=y_train-y_train_Count

In [None]:
# Plotting a distplot for checking if the errors are distributed normally.
fig = plt.figure()
plt.figure(figsize=(10,8))
sns.distplot((residual), bins = 20)
fig.suptitle('Error Terms', fontsize = 20)                   
plt.xlabel('Errors', fontsize = 18)  
plt.show()

### Assumption of Error Terms Being Independent

In [None]:
# Plotting the regression graph of "residual" vs "predicted values" of the target variable
# on the train data, to look for any visible pattern in the errors
plt.figure(figsize=(10,8))
sns.regplot(x=y_train_Count, y=residual)
plt.title('Residual Vs. Predicted Values (Pattern Indentification)', fontdict={'fontsize': 20})
plt.xlabel('Predicted Values', fontdict={'fontsize': 15})
plt.ylabel('Residuals', fontdict={'fontsize': 15})
plt.show()

### Homoscedasticity

In [None]:
# Plotting the regression plot of "actual target values" (x-axis) vs
# "predicted target values" (y-axis) on the train data
plt.figure(figsize=(10,8))
sns.regplot(x=y_train, y=y_train_Count)
plt.title('Predicted Points Vs. Actual Points', fontdict={'fontsize': 20})
plt.xlabel('Actual Points', fontdict={'fontsize': 15})
plt.ylabel('Predicted Points', fontdict={'fontsize': 15})
plt.show()

## Model Evaluation 

In [None]:
# Transforming the numeric variables of the test dataframe as required by the model.
# Only transform() is called, so the scaler fitted on the train data is reused as is.
bikedf_test[num_vars]=scaler.transform(bikedf_test[num_vars])

In [None]:
# Checking if the values are transformed as required
bikedf_test.describe()

### Splitting the test dataframe into X and y 

In [None]:
# Separating the target from the test dataframe and keeping only the
# predictors that are present in the final train design (X_train_new)
y_test=bikedf_test.pop('Count')
X_test=bikedf_test[X_train_new.columns]

In [None]:
# Checking if the independent variables are in sync with the train variables
X_test.columns

In [None]:
# Adding the constant variable to the X_test variables
X_test_m4=sm.add_constant(X_test)

In [None]:
# Predicting the values of the target variable using the built model lm19
y_pred_m4 = lm19.predict(X_test_m4)

### Predicting Test Data Using Developed Model

In [None]:
#Plotting the regression plot of y_test vs y_predicted to check how good the model is doing.
plt.figure(figsize=(10,8))
sns.regplot(y_test, y_pred_m4)
plt.title('y_test vs y_pred', fontsize = 20)              # Plot heading 
plt.xlabel('y_test', fontsize = 18)                          # X-label
plt.ylabel('y_pred', fontsize = 16)  
plt.show()

In [None]:
# Plotting scatter plot of y_test vs y_predicted
plt.figure(figsize=(10,8))
plt.scatter(y_test,y_pred_m4)
plt.title('y_test vs y_pred', fontsize = 20)              # Plot heading 
plt.xlabel('y_test', fontsize = 18)                          # X-label
plt.ylabel('y_pred', fontsize = 16)  
plt.show()

In [None]:
# Calculating the R-squared value between the actual and predicted target values of the test dataset
r2_score(y_test,y_pred_m4)

In [None]:
# Calculating the Root Mean Squared Error (square root of the MSE) of the test dataset
np.sqrt(mean_squared_error(y_test, y_pred_m4))

In [None]:
# Reading the adjusted R-squared of the fitted model lm19 (computed on the train data it was fitted on)
lm19.rsquared_adj

### We can see that the equation of our best fitted line is:
$ \text{Count} = 0.458 \times \text{const} + 0.247 \times \text{Year} - 0.077 \times \text{holiday} - 0.141 \times \text{windspeed} - 0.194 \times \text{season spring} + 0.069 \times \text{Month 3} + 0.115 \times \text{Month 5} + 0.143 \times \text{Month 6} + 0.143 \times \text{Month 8} + 0.181 \times \text{Month 9} - 0.305 \times \text{weathersit Light rain Light snow Thunderstorm} - 0.090 \times \text{weathersit Mist cloudy} - 0.033 \times \text{weekday 1} + 0.117 \times \text{Month 7} + 0.109 \times \text{Month 10} - 0.029 \times \text{weekday 2} $