In [None]:
# Importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore') #Supress warnings
pd.set_option('display.max_columns',200)

## Step 1: Reading and Understanding the Data


In [None]:
# import the data frame : Read & Understand data
df = pd.read_csv('../input/boombikes/day.csv')
df.head()

In [None]:
# Check for shape of data
df.shape

In [None]:
# Checking for presence of null values
df.isnull().sum()

In [None]:
# Give the abbreviated raw column names readable equivalents
readable_names = {
    'dteday': 'date',
    'yr': 'year',
    'mnth': 'month',
    'cnt': 'count',
    'hum': 'humidity',
}
df = df.rename(columns=readable_names)

In [None]:
# Check for datatype
df.info()

#### Inference:
    - No null values are observed
    - Variables are converted into required data types

In [None]:
# Describing the data with numeric data
df.describe() 

#### Data check

In [None]:
# Converting the date column from object to datetime, i.e. the required format
# NOTE(review): no explicit format/dayfirst is passed, so pandas infers the
# day/month order on its own — spot-check a few parsed dates against the CSV.
df['date'] = pd.to_datetime(df['date'])

In [None]:
df.info() # re-check for date column dtype

#### Mapping the categorical values
    
    - mnth : month ( 1 to 12)
    - weathersit : 
        -1: Clear, Few clouds, Partly cloudy, Partly cloudy
        -2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
        -3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
        -4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog

In [None]:
import calendar
def season_func(x):
    """Translate a numeric season code into its label.

    Codes 1, 2, 3 and 4 map to 'spring', 'Summer', 'fall' and 'winter'
    respectively; any other value yields None.
    """
    labels = {1: 'spring', 2: 'Summer', 3: 'fall', 4: 'winter'}
    return labels.get(x)
    
def weather(x):
    """Translate a numeric weather-situation code into a short label.

    Codes 1, 2, 3 and 4 map to 'Clear', 'Mist', 'Light Snow' and
    'Heavy Rain' respectively; any other value yields None.
    """
    labels = {1: 'Clear', 2: 'Mist', 3: 'Light Snow', 4: 'Heavy Rain'}
    return labels.get(x)

# Replace the numeric codes with readable labels; codes the helper functions
# do not handle come back as None.
df['season'] = df['season'].apply(season_func)
# calendar.month_name is 1-based (index 0 is an empty string), so the month
# number can be used as the index directly.
df['month'] = df['month'].apply(lambda x: calendar.month_name[x])
df['weathersit'] = df['weathersit'].apply(weather)
# calendar.day_abbr starts at Monday (index 0). With x-1, code 0 wraps around to
# index -1 (the last entry, Sunday) and code 6 lands on index 5 (Saturday).
# NOTE(review): assumes weekday codes run 0-6 with 0 = Sunday — confirm against
# the data dictionary. The names also follow the current locale.
df['weekday'] = df['weekday'].apply(lambda x: calendar.day_abbr[x-1])

In [None]:
df.head()

In [None]:
df.info()

## Step 2: Visualising the Data

**understanding the data**.
- If there is some obvious multicollinearity going on, this is the first place to catch it
- Here's where you'll also identify if some predictors directly have a strong association with the outcome variable

We'll visualise our data using `matplotlib` and `seaborn`.

In [None]:
# Numeric variables
num_vars = ['temp','atemp','windspeed','humidity','casual','registered','count']


In [None]:
#Pair plots for numeric variables
num_vars = list(num_vars)
sns.pairplot(df[num_vars])
plt.show()

In [None]:
# Distribution of every numeric variable, one panel each.
# Subplot positions are 1-based while list indices are 0-based, so enumerate
# from 1 and take the column name from the list itself; indexing num_vars with
# the subplot position skipped the first variable ('temp') entirely.
sns.set_style('darkgrid')
plt.figure(figsize=(14,10))
for position, column in enumerate(num_vars, start=1):
    plt.subplot(3,3,position)
    sns.distplot(df[column])
plt.show()

#### Inference:
    - temp is showing good pattern with cnt in terms of linear regression
    - temp & atemp with respect to all other variables are following almost same pattern
    - Casual bookings i.e., new users versus registered users and total count appears to have good correlation
    - Registered users are highly correlated with count.

#### Regression plot for numerical variables

In [None]:
# Regression of count on windspeed, temp and humidity, one panel per predictor.
# All panels come from a single plt.subplots grid and each plot is drawn on its
# own axes: calling plt.subplot() on a figure created by plt.subplots() leaves
# that figure's original full-size axes behind the panels on Matplotlib
# versions that no longer remove overlapping axes automatically.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15,4))
sns.regplot(data=df,x="windspeed",y="count",color = "Orange",ax=axes[0])
sns.regplot(data=df,x="temp",y="count",color = "Seagreen",ax=axes[1])
sns.regplot(data=df,x="humidity",y="count",ax=axes[2]);

### Inference:
    - By the above plot we can observe that correlation between temp and count is good/positive when compared to other variables
    - Bookings are likely to happen during the days where humidity ~40+ %
    - Bookings are more likely on days when the wind speed is lower

### Data visualization for categorical variables

In [None]:
# Columns whose dtype is not float64, int64 or datetime64 are treated as categorical
non_categorical_dtypes = ['float64','int64','datetime64']
cat_vars = df.select_dtypes(exclude=non_categorical_dtypes).columns.tolist()
cat_vars

In [None]:
# Box plots of count against every categorical predictor on one 3x3 grid with
# a shared y-axis.
# The grid is created once and each plot is drawn on its own axes; mixing a 2x3
# plt.subplots() grid with 3x3 plt.subplot() calls left the unused 2x3 axes
# underneath the panels on Matplotlib versions that no longer remove
# overlapping axes automatically.
box_columns = ["month","season","year","holiday","workingday","weekday","weathersit"]
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(16,16), sharey=True)
flat_axes = axes.ravel()
for ax, column in zip(flat_axes, box_columns):
    sns.boxplot(data=df,x=column,y="count",ax=ax)
# Hide the grid cells that have no plot
for ax in flat_axes[len(box_columns):]:
    ax.set_visible(False)
plt.show()

### Checking the distribution of number of days and bookings across seasons

In [None]:
# sns.barplot draws the mean of `count` per season (its default estimator),
# so the title describes an average rather than a total.
sns.barplot(x='season',y='count',data=df)
plt.title('Average No. of bookings in different seasons');

#### Inference:
    - It is more evident from the plot that there is a significant growth in 2019
    - There is not much variation in the number of bookings across days of the week
    - When we compare the number of days of holidays v/s booking., number of booking/day is more during holidays.
    Hence this can be good predictor on number of bookings
    - Number of bookings are more in season 3:fall
    - Season is one of the good predictor

#### YoY - Monthly bookings comparison

In [None]:
# Month-by-month comparison of the two years: box plot on top, bar plot below
fig, (ax_box, ax_bar) = plt.subplots(nrows=2, ncols=1, figsize=(10,8))
sns.boxplot(data=df, x='month', y='count', hue='year', ax=ax_box)
sns.barplot(data=df, x='month', y='count', hue='year', ax=ax_bar)

#### Inference:
    - Month on month bookings are significantly increased
    - The total number of bookings in 2019 is higher than in 2018 (where legend 0 = year 2018 and 1 = year 2019)

#### Correlation matrix

In [None]:
df_corr = round(df[num_vars].corr(),3)

In [None]:
plt.figure(figsize=(10,10))
sns.heatmap(df_corr,cmap='BuGn',annot=True,square=True)
plt.show()

In [None]:
# Correlation heatmap showing only the lower triangle and the diagonal.
# seaborn hides the cells where the mask is True, so build an explicit boolean
# mask covering the strict upper triangle. Deriving the mask from the
# correlation values themselves would leave any upper-triangle cell whose
# correlation rounds to exactly 0.0 unmasked.
mask = np.triu(np.ones(df_corr.shape, dtype=bool), k=1)
plt.figure(figsize=(10,10))
sns.heatmap(df_corr,mask=mask,annot=True,vmax=0.8,square=True)
plt.show()

#### Inference:
    - Temp & atemp are highly correlated, which may lead to multicollinearity.
    - wind and humidity are appearing to be negatively correlated
    - From the heat maps, the numeric variables temp, atemp, casual and registered show a positive correlation with count

## Step 3: Data Preparation

In [None]:
# Drop the columns that should not be used as predictors:
#   - instant: only a serial number / name tag for the row
#   - date: acts as an index rather than a predicting factor
#   - casual, registered: their sum is `count`, so they are parts of the target
#   - atemp: highly correlated (+0.99) with the predictor `temp`

df1 = df.drop(['date','instant','casual','registered','atemp'],axis=1)
df1.head()

### Create dummy variables - For categorical vars

In [None]:
# Encoding - convert the categorical columns into dummy (indicator) columns.
# drop_first=True drops the first level of each variable, so k levels are
# represented by k-1 columns.
# dtype=int keeps the indicators numeric: recent pandas versions return bool
# columns by default, which turn the design matrix into object dtype when it is
# later handed to statsmodels OLS / variance_inflation_factor.

season_dummies= pd.get_dummies(df1['season'],drop_first=True,dtype=int)
month_dummies = pd.get_dummies(df1['month'],drop_first=True,dtype=int)
weather_dummies = pd.get_dummies(df1['weathersit'],drop_first=True,dtype=int)
weekday_dummies = pd.get_dummies(df1['weekday'],drop_first=True,dtype=int)

In [None]:
# Add the results to the original dataframe
df_boom = pd.concat([df1,season_dummies,month_dummies,weather_dummies,weekday_dummies],axis=1)

In [None]:
df_boom.head() # Checking dummy vars

In [None]:
# Dropping the cat_vars which are converted into dummy vars
df_boom = df_boom.drop(['season','month','weekday','weathersit'],axis=1)
df_boom.head()

## Step 4: Splitting the Data into Training and Testing Sets

In [None]:
# Libraries for linear regression modelling
import sklearn
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import RFE


In [None]:
# Split the data into train and test sets in a 70:30 ratio
# random_state is fixed so that the train and test sets contain the same rows on every run

df_train, df_test = train_test_split(df_boom, train_size = 0.7, test_size = 0.3, random_state = 0)

In [None]:
# Checking for shape of train & test data

print('Shape of train data : {}'.format(df_train.shape))
print('Shape of test data : {}'.format(df_test.shape))

## Step 5: Linear Regression Modelling

In [None]:
# Let's check the correlation coefficients to see which variables are highly correlated

plt.figure(figsize = (16, 10))
sns.heatmap(df_train.corr(), annot = True, cmap="YlGnBu")
plt.show()

In [None]:
corr_table = round(df_train.corr(),3)#['count']#.sort_values(by='count',ascending=False)
corr_table[['count']].sort_values(by='count',ascending=False)

#### Inference:
    - As you might have noticed, `temp` seems to be correlated with `count` the most.
    - We can also observe a pattern: the months `June, July, & Aug` (i.e., summer) are positively correlated with `count`, whereas the correlation turns negative towards winter and spring


#### Scaling & defining of dependent variable and predicting variable

In [None]:
# Min-max scale the continuous columns (the target `count` included).
# The scaler is fitted on the training rows only; the same fitted scaler is
# reused later to transform the test rows.
scaler = MinMaxScaler()
df_train[['temp','humidity','windspeed','count']] = scaler.fit_transform(df_train[['temp','humidity','windspeed','count']])

In [None]:
# Separate the dependent variable from the predictors.
# Selecting / dropping instead of pop() leaves df_train untouched, so this cell
# can be re-run without a KeyError and X_train is an independent frame rather
# than an alias of df_train.

y_train = df_train['count']
X_train = df_train.drop(columns=['count'])



In [None]:
X_train.head()

In [None]:
df_train.describe()

## Step 6: Building our model

This time, we will be using the **LinearRegression function from SciKit Learn** for its compatibility with RFE (which is a utility from sklearn)

### RFE
Recursive feature elimination

In [None]:
# Importing RFE and LinearRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

In [None]:
# Running RFE with the number of variables to keep equal to 15.
# n_features_to_select is passed by keyword: scikit-learn 1.0+ accepts it only
# as a keyword argument, so RFE(lm, 15) raises a TypeError there.
lm = LinearRegression()
lm.fit(X_train, y_train)

rfe = RFE(lm, n_features_to_select=15)             # running RFE
rfe = rfe.fit(X_train, y_train)

In [None]:
list(zip(X_train.columns,rfe.support_,rfe.ranking_))

In [None]:
col = X_train.columns[rfe.support_] # column / variable names in top 15 rank
col

In [None]:
X_train.columns[~rfe.support_] # Removing the false variables

### Building model using statsmodel, for the detailed statistics

In [None]:
# Creating the X_train dataframe restricted to the RFE-selected variables
X_train_rfe = X_train[col]

In [None]:
# Adding a constant variable 
import statsmodels.api as sm  
X_train_rfe = sm.add_constant(X_train_rfe)
lm1 = sm.OLS(y_train,X_train_rfe).fit()   # Running the linear model
lm1.params

In [None]:
#Let's see the summary of our linear model
print(lm1.summary())

In [None]:
X_train_rfe.columns

In [None]:
X_train_new = X_train_rfe.drop(['const'], axis=1)

In [None]:
# Variance inflation factor of every predictor currently in the model,
# rounded to two decimals and listed from highest to lowest
from statsmodels.stats.outliers_influence import variance_inflation_factor

X = X_train_new
vif_values = [variance_inflation_factor(X.values, idx) for idx in range(X.shape[1])]
vif = (
    pd.DataFrame({'Features': X.columns, 'VIF': vif_values})
    .round({'VIF': 2})
    .sort_values(by="VIF", ascending=False)
)
vif

`holiday` has a high p-value (`0.118`), so it can be dropped

In [None]:
#Creating of new X_train data
X_train_new = X_train_new.drop(["holiday"], axis = 1)

In [None]:
X_train_new.columns

In [None]:
# Adding a constant variable 
import statsmodels.api as sm  
X_train_lm = sm.add_constant(X_train_new)
lm2 = sm.OLS(y_train,X_train_lm).fit()   # Running the linear model
lm2.params

In [None]:
#Let's see the summary of our linear model
print(lm2.summary())

In [None]:
X_train_new = X_train_lm.drop(['const'], axis=1)

In [None]:
# Calculate the VIFs for the new model
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif = pd.DataFrame()
X = X_train_new
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

`Sun` is having high `p-value` `0.288` ; can be dropped

In [None]:
X_train_new = X_train_new.drop(['Sun'],axis=1)
X_train_new.columns

In [None]:
# Adding a constant variable 
import statsmodels.api as sm  
X_train_lm = sm.add_constant(X_train_new)
lm3 = sm.OLS(y_train,X_train_lm).fit()   # Running the linear model
lm3.params

In [None]:
#Let's see the summary of our linear model
print(lm3.summary())

In [None]:
X_train_new = X_train_lm.drop(['const'], axis=1)

In [None]:
X_train_new.columns

In [None]:
# Calculate the VIFs for the new model
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif = pd.DataFrame()
X = X_train_new
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

`temp` has a VIF of `10.03`, so it can be dropped

In [None]:
X_train_new = X_train_new.drop(['temp'],axis=1)

In [None]:
# Adding a constant variable 
import statsmodels.api as sm  
X_train_lm = sm.add_constant(X_train_new)
lm4 = sm.OLS(y_train,X_train_lm).fit()   # Running the linear model
lm4.params

In [None]:
#Let's see the summary of our linear model
print(lm4.summary())

In [None]:
X_train_new = X_train_lm.drop(['const'], axis=1)

In [None]:
X_train_new.columns

In [None]:
# Calculate the VIFs for the new model
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif = pd.DataFrame()
X = X_train_new
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
X_train_new.head()

`winter` is having higher p-value `0.829` so it can be dropped

In [None]:
X_train_new = X_train_new.drop(['winter'],axis=1)
X_train_new.columns

In [None]:
import statsmodels.api as sm  
X_train_lm = sm.add_constant(X_train_new)
lm5 = sm.OLS(y_train,X_train_lm).fit()   # Running the linear model
lm5.params

In [None]:
lm5.summary()

In [None]:
X_train_new = X_train_lm.drop(['const'],axis=1)

In [None]:
X_train_new.columns

In [None]:
# Calculate the VIFs for the new model
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif = pd.DataFrame()
X = X_train_new
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

    - `July` is having high p-value `0.265`	 ; so we can drop

In [None]:
X_train_new = X_train_new.drop(['July'],axis=1)
X_train_new.columns

In [None]:
import statsmodels.api as sm  
X_train_lm = sm.add_constant(X_train_new)
lm6 = sm.OLS(y_train,X_train_lm).fit()   # Running the linear model
lm6.params

In [None]:
lm6.summary()

In [None]:
X_train_new = X_train_lm.drop(['const'],axis=1)

In [None]:
X_train_new.columns

In [None]:
# Calculate the VIFs for the new model
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif = pd.DataFrame()
X = X_train_new
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

    `p-value` of holiday is high > 0.05 i.e., 0.114 and low VIF of 1.04

All `p-values` are < 0.05, Prob (F-statistic) = 3.40e-129, which is negligible, and all VIFs are < 5. Hence we can say the model is stable and proceed to residual analysis and testing

## Step 7: Residual Analysis of the train data

So, now to check if the error terms are also normally distributed (which is, in fact, one of the major assumptions of linear regression), let us plot the histogram of the error terms and see what it looks like.

In [None]:
y_train_price = lm6.predict(X_train_lm)

In [None]:
# Importing the required libraries for plots.
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Plot the histogram of the error terms
fig = plt.figure()
sns.distplot((y_train - y_train_price), bins = 20)
fig.suptitle('Error Terms', fontsize = 20)                  # Plot heading 
plt.xlabel('Errors', fontsize = 18)                         # X-label

    - Error terms are distributed normally, so we can proceed further to making predictions using the final model

## Step 8: Making Predictions Using the Final Model

Now that we have fitted the model and checked the normality of the error terms, it's time to go ahead and make predictions using the final model.

#### Applying the scaling on the test sets

In [None]:
df_test.head()

In [None]:
# Apply the scaler fitted on the training data to the same columns of the test
# set (transform only — the scaler is not re-fitted on test rows)
df_test[['temp','humidity','windspeed','count']] = scaler.transform(df_test[['temp','humidity','windspeed','count']])

In [None]:
# Defining dependent and independent variables for the test set.
# Selecting / dropping instead of pop() leaves df_test untouched, so this cell
# can be re-run without a KeyError and X_test is an independent frame rather
# than an alias of df_test.
y_test = df_test['count']
X_test = df_test.drop(columns=['count'])

In [None]:
# Now let's use our model to make predictions.

# Keep only the predictors retained in the final model, in the same column order
X_test_new = X_test[X_train_new.columns]

# Adding a constant variable.
# has_constant='add' forces the intercept column to be added. With the default
# ('skip'), add_constant adds nothing when the data already holds a column with
# one repeated non-zero value, and the prediction would then fail because the
# test matrix has one column fewer than the model expects.
X_test_new = sm.add_constant(X_test_new, has_constant='add')

In [None]:
X_test_new.head()

In [None]:
X_train_new.columns

In [None]:
X_test_new.describe()

In [None]:
# Making predictions
y_test_pred = lm6.predict(X_test_new)


## Step 9: Model Evaluation

In [None]:
# Plotting y_test and y_pred to understand the spread.
fig = plt.figure()
plt.scatter(y_test,y_test_pred)
fig.suptitle('y_test vs y_test_pred', fontsize=20)              # Plot heading 
plt.xlabel('y_test', fontsize=18)                          # X-label
plt.ylabel('y_test_pred', fontsize=16)                          # Y-label

In [None]:
# R-Squared value for test set
r2 = round(r2_score(y_test, y_test_pred),4)
r2

    - The observed R-squared values for the train and test data sets are 74 % and 79.77 % respectively. The difference is just under 6 percentage points, which is small enough to be acceptable

In [None]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, y_test_pred)

    - The lower the mean_squared_error, the closer the model's predictions are to the observed values

In [None]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test, y_test_pred)

    - The lower the mean_absolute_error, the closer the model's predictions are to the observed values

### Equation for `bestfitline` as follows:
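$$
\begin{aligned}
\text{count} ={}& 0.611045 + 0.232469 \times \text{year} + 0.033295 \times \text{workingday} - 0.168256 \times \text{humidity} - 0.183157 \times \text{windspeed} \\
& - 0.286873 \times \text{spring} - 0.102989 \times \text{December} - 0.131633 \times \text{November} + 0.083101 \times \text{September} \\
& - 0.170055 \times \text{Light Snow} + 0.039602 \times \text{Sat}
\end{aligned}
$$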
    
    - Demand increases with year, workingday, September (i.e., the summer season) and Sat (i.e., the weekend)
    - Demand decreases with an increase in humidity or windspeed, and in spring, November, December and Light Snow conditions