In [None]:
#1. Data Understanding and Exploration

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model
from sklearn.linear_model import LinearRegression

import warnings
warnings.filterwarnings('ignore')

BS_day = pd.read_csv("day.csv")

# Decode the numeric season codes: 1=spring, 2=summer, 3=fall, 4=winter.
# The column is rebuilt in a single pass rather than writing strings into an
# integer column slice by slice through .loc; that incompatible-dtype
# assignment is deprecated in recent pandas and was only hidden here by the
# global warnings filter. Codes missing from the table are passed through
# unchanged, exactly as the .loc version left them.
season_labels = {1: 'spring', 2: 'summer', 3: 'fall', 4: 'winter'}
BS_day['season'] = BS_day['season'].map(lambda code: season_labels.get(code, code))

def object_map(x):
    """Translate month numbers (1-12) in the Series ``x`` into short month names.

    Values that are not one of the twelve month numbers come back as NaN,
    which is how ``Series.map`` treats keys missing from the lookup.
    """
    month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                   'Jul', 'Aug', 'Sept', 'Oct', 'Nov', 'Dec']
    # Month numbers are 1-based, so start the enumeration at 1.
    lookup = dict(enumerate(month_names, start=1))
    return x.map(lookup)

# Replace the numeric month column with its month-name labels
BS_day['mnth'] = object_map(BS_day['mnth'])

def str_map(x):
    """Translate weekday codes (0-6) in the Series ``x`` into short day names.

    Code 0 is labelled 'Tues' and code 6 'Mon'; any other value becomes NaN.
    NOTE(review): this offset presumably matches how the dataset numbers its
    weekdays -- confirm against the dates in day.csv.
    """
    # List position doubles as the weekday code (0 -> 'Tues', ..., 6 -> 'Mon').
    day_names = ['Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun', 'Mon']
    lookup = dict(enumerate(day_names))
    return x.map(lookup)

# Replace the numeric weekday column with its day-name labels
BS_day['weekday'] = str_map(BS_day['weekday'])

# Decode the numeric weather-situation codes into short category labels:
#   A (1) - Clear, Few clouds, Partly cloudy
#   B (2) - Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
#   C (3) - Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
#   D (4) - Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog
# The column is rebuilt in one pass rather than writing strings into an integer
# column through repeated .loc assignments, an incompatible-dtype setitem that
# recent pandas deprecates. Codes missing from the table pass through unchanged.
weathersit_labels = {1: 'A', 2: 'B', 3: 'C', 4: 'D'}
BS_day['weathersit'] = BS_day['weathersit'].map(
    lambda code: weathersit_labels.get(code, code)
)

In [None]:
#2. Data Visualisation

In [None]:
# Distribution of each continuous variable and of the target.
# Every variable gets its own figure: calling distplot repeatedly in a single
# cell draws onto the same axes, overlaying variables whose ranges have
# nothing in common and making the plot unreadable.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is
# kept here so the notebook still runs on the older stack it was written for.
distribution_columns = [
    ('temp', 'temperature'),
    ('hum', 'humidity'),
    ('windspeed', 'wind speed'),
    ('cnt', 'count of total rental bikes (target)'),
]
for column, label in distribution_columns:
    fig, ax = plt.subplots()
    sns.distplot(BS_day[column], ax=ax)
    ax.set_title('Distribution of ' + label)

# Converting date to datetime format.
# pd.to_datetime replaces astype('datetime64'): casting to a unit-less
# datetime64 dtype is rejected by pandas 2.x.
# NOTE(review): if day.csv stores dates day-first (dd-mm-yyyy), pass an explicit
# format= or dayfirst=True here -- the file layout is not visible from this notebook.
BS_day['dteday'] = pd.to_datetime(BS_day['dteday'])

# Cast the count columns to float in one vectorised step
IntVariableList = ["casual","registered","cnt"]
BS_day[IntVariableList] = BS_day[IntVariableList].astype("float")

# heatmap
# `cor` was never defined anywhere in the notebook, so this cell raised a
# NameError on a fresh kernel. It is presumably meant to be the pairwise
# correlation matrix of the numeric columns, which is what is computed here.
cor = BS_day.select_dtypes(include=[np.number]).corr()

# Boolean mask hiding the strict upper triangle; the diagonal and the lower
# triangle stay visible. (An explicit boolean mask avoids reusing the
# correlation values themselves as truthy/falsy mask entries.)
mask = np.triu(np.ones(cor.shape, dtype=bool), k=1)

fig, ax = plt.subplots()
fig.set_size_inches(10, 10)
sns.heatmap(cor, mask=mask, vmax=.8, square=True, annot=True, ax=ax)

In [None]:
# 3. Data Preparation 

In [None]:
# Columns still stored as objects are the categorical ones decoded earlier
BS_day_categorical = BS_day.select_dtypes(include=['object'])

# One-hot encode them, dropping the first level of each to avoid redundant
# columns. dtype=int forces numeric 0/1 columns: pandas 2.x would otherwise
# emit booleans, which turn the design matrix into an object array and break
# the statsmodels OLS / VIF calls further down.
BS_day_dummies = pd.get_dummies(BS_day_categorical, drop_first=True, dtype=int)

# Swap the original categorical columns for their dummy encodings
BS_day = BS_day.drop(list(BS_day_categorical.columns), axis=1)
BS_day = pd.concat([BS_day, BS_day_dummies], axis=1)

# Remove the 'instant' and 'dteday' columns before modelling
BS_day = BS_day.drop(['instant', 'dteday'], axis=1)

In [None]:
# 4. Model Building and Evaluation

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# 70/30 train/test split; random_state pins the shuffle so the split is reproducible
np.random.seed(0)
df_train, df_test = train_test_split(BS_day, train_size=0.7, test_size=0.3, random_state=100)

# Take an explicit copy before assigning scaled columns: the frames returned by
# the split are slices of BS_day, and writing into them raises
# SettingWithCopyWarning (silenced by the global warnings filter).
df_train = df_train.copy()

# Scaling
scaler = MinMaxScaler()

# The scaler is fitted on the training rows only; the same column list is
# reused later to transform the test set.
var = ['temp', 'hum', 'windspeed','casual','registered','cnt']
df_train[var] = scaler.fit_transform(df_train[var])

# Target is 'cnt'; 'casual' and 'registered' are dropped from the predictors
y_train = df_train.pop('cnt')
X_train = df_train.drop(["casual","registered"],axis=1)

In [None]:
# Building a first model with all the features

In [None]:
import statsmodels.api as sm

# statsmodels OLS on every predictor, with an explicit constant column
# prepended to the design matrix
X_train_lm = sm.add_constant(X_train)
lr = sm.OLS(y_train, X_train_lm).fit()

# The same predictors fitted with scikit-learn's LinearRegression
# (fit() returns the estimator itself)
lm = LinearRegression().fit(X_train, y_train)

In [None]:
# Recursive Feature Elimination (RFE)

In [None]:
from sklearn.feature_selection import RFE

# RFE with 15 features.
# n_features_to_select must be passed by keyword: scikit-learn >= 1.0 rejects
# it as a positional argument with a TypeError.
lm = LinearRegression()
rfe1 = RFE(lm, n_features_to_select=15)

# Fit with 15 features
rfe1.fit(X_train, y_train)

# Boolean mask of the selected columns, then the elimination ranking
# (selected features have rank 1)
print(rfe1.support_)
print(rfe1.ranking_)

In [None]:
# Model Building and Evaluation

In [None]:
import statsmodels.api as sm
from sklearn.feature_selection import RFE
from statsmodels.stats.outliers_influence import variance_inflation_factor


def _vif_table(features):
    """Return the variance inflation factor of every column in ``features``.

    The result is a DataFrame with columns 'Features' and 'VIF' (rounded to
    two decimals), sorted with the highest VIF first.
    """
    table = pd.DataFrame()
    table['Features'] = features.columns
    table['VIF'] = [
        variance_inflation_factor(features.values, position)
        for position in range(features.shape[1])
    ]
    table['VIF'] = round(table['VIF'], 2)
    return table.sort_values(by="VIF", ascending=False)


# --- Model 1: OLS on the 15 features kept by rfe1 ---
col1 = X_train.columns[rfe1.support_]
X_train_rfe1 = X_train[col1]
X_train_rfe1 = sm.add_constant(X_train_rfe1)

lm1 = sm.OLS(y_train, X_train_rfe1).fit()

# VIF is computed on the predictors only, without the constant column
a = X_train_rfe1.drop('const', axis=1)
vif = _vif_table(a)

# --- Model 2: rerun RFE keeping only 7 features ---
# n_features_to_select must be a keyword argument on scikit-learn >= 1.0.
lm = LinearRegression()
rfe2 = RFE(lm, n_features_to_select=7)
rfe2.fit(X_train, y_train)

col1 = X_train.columns[rfe2.support_]
X_train_rfe2 = X_train[col1]
X_train_rfe2 = sm.add_constant(X_train_rfe2)

lm2 = sm.OLS(y_train, X_train_rfe2).fit()

b = X_train_rfe2.drop('const', axis=1)
vif = _vif_table(b)

# --- Model 3: model 2 without 'hum' ---
# NOTE(review): 'hum' is presumably removed because of its VIF in the table
# above; this raises KeyError if rfe2 did not select 'hum'.
# The constant column added for model 2 is still present, so it is not re-added.
X_train_rfe2 = X_train_rfe2.drop("hum", axis=1)

lm3 = sm.OLS(y_train, X_train_rfe2).fit()

# `c` holds the final predictor set; the prediction step reuses its columns
c = X_train_rfe2.drop('const', axis=1)
vif = _vif_table(c)

In [None]:
# Residual Analysis

In [None]:
# Fitted values of the last model on the training design matrix
y_train_cnt = lm3.predict(X_train_rfe2)

# Histogram of the training residuals (actual minus fitted)
residuals = y_train - y_train_cnt
fig = plt.figure()
fig.suptitle('Error Terms', fontsize = 20)
sns.distplot(residuals, bins = 20)
plt.xlabel('Errors', fontsize = 18)

In [None]:
# Making Predictions

In [None]:
# Scale the test rows with the scaler fitted on the training data (transform
# only, no refit). Copy first: df_test is a slice returned by the split, and
# assigning into it raises SettingWithCopyWarning.
df_test = df_test.copy()
df_test[var] = scaler.transform(df_test[var])

y_test = df_test.pop('cnt')
X_test = df_test.drop(["casual","registered"],axis=1)

# Keep only the predictors used by the last model (columns of `c`)
col2 = c.columns
X_test_rfe2 = X_test[col2]

# has_constant='add' always prepends the intercept column. With the default
# ('skip'), add_constant silently omits it whenever some test column happens
# to be constant, and predict() then fails on a column mismatch.
X_test_rfe2 = sm.add_constant(X_test_rfe2, has_constant='add')

y_pred = lm3.predict(X_test_rfe2)

# Actual vs predicted values on the test set
fig = plt.figure()
plt.scatter(y_test, y_pred)
fig.suptitle('y_test vs y_pred')
plt.xlabel('y_test')
plt.ylabel('y_pred')

from sklearn.metrics import r2_score