# Importing EDA Libraries

In [136]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE

# Reading and Understanding Data

In [137]:
# Read day.csv into a DataFrame (the relative path is resolved against the current working directory)
bs_data=pd.read_csv(r'day.csv')

In [138]:
# Preview the first 5 rows
bs_data.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,01-01-2018,1,0,1,0,6,0,2,14.110847,18.18125,80.5833,10.749882,331,654,985
1,2,02-01-2018,1,0,1,0,0,0,2,14.902598,17.68695,69.6087,16.652113,131,670,801
2,3,03-01-2018,1,0,1,0,1,1,1,8.050924,9.47025,43.7273,16.636703,120,1229,1349
3,4,04-01-2018,1,0,1,0,2,1,1,8.2,10.6061,59.0435,10.739832,108,1454,1562
4,5,05-01-2018,1,0,1,0,3,1,1,9.305237,11.4635,43.6957,12.5223,82,1518,1600


In [139]:
# Summary statistics (count, mean, std, min, quartiles, max) to check the range of each numeric column
bs_data.describe()

Unnamed: 0,instant,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
count,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0
mean,365.5,2.49863,0.5,6.526027,0.028767,2.99726,0.683562,1.394521,20.319259,23.726322,62.765175,12.76362,849.249315,3658.757534,4508.006849
std,210.877136,1.110184,0.500343,3.450215,0.167266,2.006161,0.465405,0.544807,7.506729,8.150308,14.237589,5.195841,686.479875,1559.758728,1936.011647
min,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,2.424346,3.95348,0.0,1.500244,2.0,20.0,22.0
25%,183.25,2.0,0.0,4.0,0.0,1.0,0.0,1.0,13.811885,16.889713,52.0,9.04165,316.25,2502.25,3169.75
50%,365.5,3.0,0.5,7.0,0.0,3.0,1.0,1.0,20.465826,24.368225,62.625,12.125325,717.0,3664.5,4548.5
75%,547.75,3.0,1.0,10.0,0.0,5.0,1.0,2.0,26.880615,30.445775,72.989575,15.625589,1096.5,4783.25,5966.0
max,730.0,4.0,1.0,12.0,1.0,6.0,1.0,3.0,35.328347,42.0448,97.25,34.000021,3410.0,6946.0,8714.0


In [140]:
# casual + registered = cnt, and cnt is the dependent variable. The two component
# columns are dropped because they are not analysed or modelled separately.
bs_data = bs_data.drop(columns=['casual', 'registered'])

In [141]:
# Of temp and atemp, atemp is considered the more useful column, so temp is dropped.
bs_data = bs_data.drop(columns='temp')

In [142]:
# (rows, columns) of the frame after the drops above
bs_data.shape

(730, 13)

In [143]:
# Data type and non-null count of every column (a quick check for missing values)
bs_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 730 entries, 0 to 729
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     730 non-null    int64  
 1   dteday      730 non-null    object 
 2   season      730 non-null    int64  
 3   yr          730 non-null    int64  
 4   mnth        730 non-null    int64  
 5   holiday     730 non-null    int64  
 6   weekday     730 non-null    int64  
 7   workingday  730 non-null    int64  
 8   weathersit  730 non-null    int64  
 9   atemp       730 non-null    float64
 10  hum         730 non-null    float64
 11  windspeed   730 non-null    float64
 12  cnt         730 non-null    int64  
dtypes: float64(3), int64(9), object(1)
memory usage: 74.3+ KB


In [144]:
# No missing values in any column. All columns are int or float except dteday, which is a string (object dtype).

In [145]:
# Number of distinct values in each column
for column_name, distinct_count in bs_data.nunique().items():
    print(column_name, distinct_count)

instant 730
dteday 730
season 4
yr 2
mnth 12
holiday 2
weekday 7
workingday 2
weathersit 3
atemp 689
hum 594
windspeed 649
cnt 695


In [146]:
# instant and dteday are unique for every row, so they carry no information for the model.
# The year, month and weekday parts of the date are already available as separate columns.
bs_data = bs_data.drop(columns=['instant', 'dteday'])

#### The season column has 4 levels, mnth has 12, weekday has 7 and weathersit has 3. The columns yr, holiday and workingday have 2 levels each. So we have to create dummy variables for the categorical columns that have more than 2 levels.

In [None]:
# Pairwise scatter plots of every column against every other column.
# pairplot is a figure-level function that builds its own figure, so a preceding
# plt.figure() would only leave an empty figure behind (and its figsize is not applied).
sns.pairplot(bs_data)
plt.show()

In [None]:
# Box plots of cnt against each categorical predictor, laid out on a 4x2 grid
plt.figure(figsize=[20,12])
for position, column in enumerate(
    ['season', 'mnth', 'weekday', 'holiday', 'weathersit', 'workingday', 'yr'],
    start=1,
):
    plt.subplot(4, 2, position)
    sns.boxplot(x=column, y='cnt', data=bs_data)
plt.show()

In [None]:
# Annotated correlation heatmap of all remaining columns
plt.figure(figsize=[20,12])
sns.heatmap(bs_data.corr(), annot=True)
# Render explicitly so the cell shows only the figure, not the Axes repr
# (same pattern as the training-set heatmap later in the notebook).
plt.show()

## From the correlation heatmap, cnt is highly correlated with atemp, yr, season and mnth.
## In the pairplot we observed a good relationship of cnt with atemp, hum and windspeed.
## Both visualisations clearly show that the categorical variables have no impact on the dependent variable.

### Start analysing categorical columns and creating dummy variables.

In [None]:
# Start with the season column: number of rows per season
bs_data['season'].value_counts().plot.bar()

In [None]:
# Replace the numeric season codes with readable labels.
# Note: map() turns any value that is not a key of the dict into NaN, so this cell must run only once.
season_labels = {1: 'spring', 2: 'summer', 3: 'fall', 4: 'winter'}
bs_data['season'] = bs_data['season'].map(season_labels)

In [None]:
# One-hot encode season; drop_first=True leaves one level out as the baseline.
# dtype=int keeps the dummies numeric: pandas >= 2.0 otherwise returns bool columns,
# which statsmodels rejects once they are mixed with the float predictors.
seasons = pd.get_dummies(bs_data['season'], drop_first=True, dtype=int)
# Preview only; there is no need to render the whole frame
seasons.head()

In [None]:
# Append the season dummy columns to the right of the main frame
bs_data = pd.concat([bs_data, seasons], axis='columns')
bs_data.head()

In [None]:
# The original season column is now redundant with its dummy columns
bs_data = bs_data.drop(columns='season')

In [None]:
# Confirm the season column is gone and its dummy columns are present
bs_data.head()

In [None]:
# Number of rows per month
bs_data['mnth'].value_counts().plot.bar()

In [None]:
# Replace month numbers 1-12 with three-letter labels.
# Note: map() turns any value that is not a key of the dict into NaN, so this cell must run only once.
month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
bs_data['mnth'] = bs_data['mnth'].map(dict(enumerate(month_labels, start=1)))

In [None]:
# One-hot encode mnth (one level dropped as the baseline) and swap the dummies in for the original column.
# dtype=int keeps the dummies numeric: pandas >= 2.0 otherwise returns bool columns,
# which statsmodels rejects once they are mixed with the float predictors.
mnths = pd.get_dummies(bs_data['mnth'], drop_first=True, dtype=int)
bs_data = pd.concat([bs_data, mnths], axis=1)
bs_data = bs_data.drop('mnth', axis=1)
bs_data.head()

In [None]:
# List the column names after adding the month dummies
bs_data.columns

In [None]:
# Number of rows per weekday code
bs_data['weekday'].value_counts().plot.bar()

In [None]:
# Replace weekday codes 0-6 with day names (0 is mapped to Sunday).
# Note: map() turns any value that is not a key of the dict into NaN, so this cell must run only once.
day_labels = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
bs_data['weekday'] = bs_data['weekday'].map(dict(enumerate(day_labels)))

In [None]:
# One-hot encode weekday (one level dropped as the baseline) and swap the dummies in for the original column.
# dtype=int keeps the dummies numeric: pandas >= 2.0 otherwise returns bool columns,
# which statsmodels rejects once they are mixed with the float predictors.
weekday = pd.get_dummies(bs_data['weekday'], drop_first=True, dtype=int)
bs_data = pd.concat([bs_data, weekday], axis=1)
bs_data = bs_data.drop('weekday', axis=1)
bs_data.head()

In [None]:
# Number of rows per weather-situation code
bs_data['weathersit'].value_counts().plot.bar()

In [None]:
# Replace the weathersit codes with labels. Only codes 1-3 are mapped, and the unique-value
# check earlier in the notebook found exactly 3 levels; any other code would become NaN.
weather_labels = {1: 'Clear', 2: 'Mist', 3: 'Light'}
bs_data['weathersit'] = bs_data['weathersit'].map(weather_labels)

In [None]:
# One-hot encode weathersit (one level dropped as the baseline) and swap the dummies in for the original column.
# dtype=int keeps the dummies numeric: pandas >= 2.0 otherwise returns bool columns,
# which statsmodels rejects once they are mixed with the float predictors.
weathersit = pd.get_dummies(bs_data['weathersit'], drop_first=True, dtype=int)
bs_data = pd.concat([bs_data, weathersit], axis=1)
bs_data = bs_data.drop('weathersit', axis=1)
bs_data.head()

# Splitting into train and test

In [None]:
# 70% of the rows go to training and the remainder to test; random_state is fixed so the split is reproducible
bs_train,bs_test=train_test_split(bs_data,train_size=0.7,random_state=100)

In [None]:
# (rows, columns) of the held-out test split
bs_test.shape

# Let's start preparing the data for the model

In [None]:
# Preview the training split before rescaling
bs_train.head()

## Rescaling of variables

In [None]:
# Min-max scale the continuous columns (the target cnt included); the scaler is fitted on the training split only.
num_vars = ['atemp', 'hum', 'windspeed', 'cnt']
scaler = MinMaxScaler()
# train_test_split hands back a slice of bs_data; assigning into it directly raises
# SettingWithCopyWarning, so work on an explicit copy.
bs_train = bs_train.copy()
bs_train[num_vars] = scaler.fit_transform(bs_train[num_vars])
bs_train.describe()

In [None]:
# Annotated correlation heatmap of the scaled training data
plt.figure(figsize=[20,12])
train_correlations = bs_train.corr()
sns.heatmap(train_correlations, annot=True)
plt.show()

# Training the model

## Using the variable selection method via RFE

In [None]:
# Separate the target from the predictors. pop() removes cnt from bs_train in place,
# so X_train and bs_train are the same object from here on (and this cell cannot be re-run as is).
y_train = bs_train.pop('cnt')
X_train = bs_train
# Recursive feature elimination around a plain linear regression, keeping 20 columns
lm = LinearRegression()
rfe = RFE(estimator=lm, n_features_to_select=20).fit(X_train, y_train)

In [None]:
# (column, selected by RFE?, RFE rank) for every predictor
[
    (column, selected, rank)
    for column, selected, rank in zip(X_train.columns, rfe.support_, rfe.ranking_)
]

In [None]:
# Columns RFE kept (support_ is True) ...
col=X_train.columns[rfe.support_]
# ... and, as the cell output, the columns it eliminated
X_train.columns[~rfe.support_]

In [None]:
# Restrict the training predictors to the RFE-selected columns
X_train_rfe=X_train[col]

In [None]:
# Building the model and checking its summary is an iterative process, so the step lives in one function.
def lr_model_summary(X,y):
    """Fit an OLS model of y on X plus an intercept, print its summary and return it.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictor columns. The constant (intercept) column is added here, so
        callers pass the predictors only.
    y : array-like
        Response values aligned with the rows of X.

    Returns
    -------
    The fitted statsmodels OLS results object.
    """
    # Cast to float first: pandas >= 2.0 get_dummies yields bool columns, and a frame
    # mixing bool and float columns converts to an object array that sm.OLS rejects.
    X = sm.add_constant(X.astype(float))
    lr_model = sm.OLS(y, X).fit()
    print(lr_model.summary())
    return lr_model

In [None]:
# Similarly, a function for calculating the VIF of every predictor.
def get_vif(X):
    """Return the variance inflation factor of each column of X, largest first.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictor columns.

    Returns
    -------
    pandas.DataFrame
        Columns 'Features' and 'VIF' (rounded to 2 decimals), sorted by VIF descending.
    """
    # Convert once, and to float: bool/object dummy columns would otherwise break the
    # regressions run inside variance_inflation_factor.
    # NOTE(review): callers in this notebook pass X without a constant column, so the
    # auxiliary regressions have no intercept - confirm this is intended.
    design = X.to_numpy(dtype=float)
    vif = pd.DataFrame({
        'Features': X.columns,
        'VIF': [variance_inflation_factor(design, i) for i in range(design.shape[1])],
    })
    vif['VIF'] = vif['VIF'].round(2)
    return vif.sort_values(by='VIF', ascending=False)

In [None]:
# Baseline fit on all RFE-selected columns: print the OLS summary, then display the VIF table
lr_model_summary(X_train_rfe,y_train)
get_vif(X_train_rfe)

In [None]:
# Refit without workingday and re-check the VIFs
X_train_rfe = X_train_rfe.drop(columns=['workingday'])
lr_model_summary(X_train_rfe, y_train)
get_vif(X_train_rfe)

In [None]:
# Refit without Saturday and re-check the VIFs
X_train_rfe = X_train_rfe.drop(columns=['Saturday'])
lr_model_summary(X_train_rfe, y_train)
get_vif(X_train_rfe)

In [None]:
# Refit without May and re-check the VIFs
X_train_rfe = X_train_rfe.drop(columns=['May'])
lr_model_summary(X_train_rfe, y_train)
get_vif(X_train_rfe)

In [None]:
# Refit without spring and re-check the VIFs
X_train_rfe = X_train_rfe.drop(columns=['spring'])
lr_model_summary(X_train_rfe, y_train)
get_vif(X_train_rfe)

In [None]:
# Refit without hum and re-check the VIFs
X_train_rfe = X_train_rfe.drop(columns=['hum'])
lr_model_summary(X_train_rfe, y_train)
get_vif(X_train_rfe)

In [None]:
# Refit without atemp and re-check the VIFs
X_train_rfe = X_train_rfe.drop(columns=['atemp'])
lr_model_summary(X_train_rfe, y_train)
get_vif(X_train_rfe)

In [None]:
# atemp is correlated with Sep and Aug, so bring atemp back and drop Aug and Sep instead.
# assign() builds a new frame. A plain `X_train_rfe1 = X_train_rfe` only creates an alias,
# so adding atemp through it would silently add the column to X_train_rfe as well.
X_train_rfe1 = X_train_rfe.assign(atemp=X_train['atemp']).drop(columns=['Sep', 'Aug'])
lr_model_summary(X_train_rfe1, y_train)
get_vif(X_train_rfe1)

In [None]:
# After the previous step the R-squared increased and the VIF of atemp is under 5, so let's look at the other correlated columns.

# spring is correlated with Feb, Jan and Mar, so add spring back and drop Jan and Feb.
X_train_rfe1['spring']=X_train['spring']
X_train_rfe1=X_train_rfe1.drop(['Jan','Feb'],axis=1)
lr_model_summary(X_train_rfe1,y_train)
get_vif(X_train_rfe1)

In [None]:
# NOTE(review): the remark originally here repeated the earlier "R-squared increased, VIF of atemp under 5" comment verbatim - confirm it still holds after the previous refit.

# winter is correlated with Nov, Dec and Oct, so drop Nov and Dec.
# winter itself is not re-added (the assignment below is left disabled); presumably it is still among the selected columns - TODO confirm.
#X_train_rfe1['winter']=X_train['winter']
X_train_rfe1=X_train_rfe1.drop(['Nov','Dec'],axis=1)
lr_model_summary(X_train_rfe1,y_train)
get_vif(X_train_rfe1)