In [None]:
# Suppress warnings
# NOTE(review): a blanket 'ignore' filter also hides deprecation warnings raised by
# the libraries used below; consider narrowing it while developing.

import warnings
warnings.filterwarnings('ignore')

In [None]:
# import all required packages

# importing pandas and numpy
import pandas as pd
import numpy as np

# importing sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
# Importing RFE and LinearRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
# r2_score for model evaluation
from sklearn.metrics import r2_score

# importing stastsmodels
import statsmodels.api as sm
# importing VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Importing the required libraries for plots.
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Reading the data and understanding the dataset

In [None]:
# Load day.csv into a DataFrame and preview the first rows
day = pd.read_csv('../input/bike-sharing/day.csv')
day.head()

In [None]:
# Let's look at the column names, dtypes and non-null counts of the dataset
day.info()

In [None]:
# Let's look at the summary statistics of the numeric variables
day.describe()

In [None]:
# Let's see the total number of records and columns as a (rows, columns) tuple
day.shape

In [None]:
# Unpack the (rows, columns) shape of the raw frame; the row count is reused
# below to compute the percentage of missing values.
initialTotalRecords, initialTotalColumns = day.shape

print("Total Rows : ", initialTotalRecords)
print("Total Columns : ", initialTotalColumns)

In [None]:
# Percentage of missing values per column, rounded to two decimals
null_counts = day.isnull().sum()
(null_counts / initialTotalRecords * 100).round(2)

###### Looks like there are no null data in our dataset so we are good to proceed further

In [None]:
# Remove the columns that are not useful for our predictions:
#   instant             - just a row index, carries no information
#   dteday              - the date; redundant because yr and mnth already exist as separate columns
#   casual / registered - redundant because their combined count is already in cnt, and we
#                         model the total count rather than the per-category counts
# The result is stored in a new frame named 'bike'. DataFrame.drop returns a new
# object, so 'day' is left untouched and changes to 'bike' do not affect it.
columns_to_remove = ['instant', 'dteday', 'casual', 'registered']
bike = day.drop(columns=columns_to_remove)
bike.head()

In [None]:
# Shape of the reduced frame as (rows, columns)
updated_total_records, updated_total_columns = bike.shape

print("Updated total rows : ", updated_total_records)
print("Updated total columns : ", updated_total_columns)

# Columns were reduced from 16 to 12

In [None]:
# Let's check how many rows fall into each level of the categorical variables
categorical_columns = ['season', 'mnth', 'weathersit', 'weekday'];
for categorical_col in categorical_columns:
    # Double brackets keep a one-column DataFrame, so the counts are printed per level of that column
    print(bike[[categorical_col]].value_counts(), "\n");

### Visualizing the dataset on original data

In [None]:
# At first glance every column in the data set is numeric, but only
# 'cnt', 'temp', 'atemp', 'hum' and 'windspeed' are truly continuous; the rest are numerically coded categories.
# Those will be converted to dummy variables later, so here we visualize only the continuous variables
# to look for correlation among them and with the target variable 'cnt'.
# If no linear relation is visible between cnt and any variable, a linear regression model would be a poor fit.

numeric_columns = ['cnt', 'temp', 'atemp', 'hum', 'windspeed'];
sns.pairplot(bike[numeric_columns])


#### From the above plots we can observe there is a kind of correlation among
    1. cnt with temp, atemp variables
    2. temp and atemp

In [None]:
# Box plot of the target variable cnt against each categorical feature,
# drawn on a 2 x 3 grid before the features are turned into dummy variables
boxplot_features = ['season', 'mnth', 'weathersit', 'holiday', 'weekday', 'workingday']

plt.figure(figsize=(25, 10))
for position, feature in enumerate(boxplot_features, start=1):
    plt.subplot(2, 3, position)
    sns.boxplot(x=feature, y='cnt', data=bike)
plt.show()

#### We can able to make the following insights from the above box plots wrt to target variable 'cnt'

    1. Working day:
    - Almost 69% of bike bookings happen on working days, where the median count is close to 5000
    - This indicates, workingday can be a good predictor for the dependent variable
    
    2. Weekday:
        - Almost all weekdays, the no.of bike users count was similar and it is around in between 3000-6000
        - Medians of all the weekdays are around in between the 4000-6000 that means, more than 50% of people using bikes in all days of a week irrespective of the day of the week.
        - Difference/distance between the 25% and 75% of box is more for weekdays 3(Wednesday) & 6(Saturday) but not a  significant difference when compared with others [Considering start of week as Sunday]
        
    3. Holiday:
        - Almost 97.6% of the bike booking were happening when it is not a holiday which means this data is clearly biased. This indicates, holiday CANNOT be a good predictor for the dependent variable.
         
    4. Weathersit:
        - Most bike users ride in weathersit 1 (Clear, Few clouds, Partly cloudy), which accounts for almost 67% of users.
        - This is followed by weathersit 2 (Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist) with about 30% of total bookings.
        - Very less no.of bike users are available in weathersit 3(Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds).
        - This indicates, weathersit does show some trend towards the bike bookings can be a good predictor for the dependent variable.
   
    5. Month:
        - Most of the bookings are happening around the months 6, 7, 8, 9 (more than 5000 bookings are happening. Almost 10%)
        - Where as months 1 & 2 are having less bookings (Less than 3000)
        - In almost all months the difference between the 25th and 75th percentiles is similar, but months 3, 4, 9 and 10 show a noticeably larger spread
        - This indicates, mnth has some trend for bookings and can be a good predictor for the dependent variable.
        
    6. Season:
        - For season 3 having more no.of bike users (More than or equal to 5000 users. Almost 32%) followed by season2 & season4 with 27% & 25% (greater than 4000 and near to 5000)
        - Season 1 is having less users which is less than 3000
        - The difference between the 25th and 75th percentiles is similar across almost all seasons.
        - This indicates, season can be a good predictor for the dependent variable.

In [None]:
# Correlation matrix to visualize which columns are correlated with cnt
plt.figure(figsize = (16, 10))
sns.heatmap(bike.corr(), annot = True, cmap="YlGnBu")
plt.show()

#### From the above heatmap it is clearly observed that 'cnt' column is having 
    - More Positive Correlation with the predictor variables  'atemp', 'temp' followed by 'yr'.
    - More Negative Correlation with 'weathersit', 'windspeed'

### Preparing data for model

#### Now lets convert the numeric categorical to the dummy variables
#### To do that we need to first convert the type of those columns and then convert them to dummy variables

### Why to convert the data type from numeric to category?
    Before answering that, lets look at the syntax of get_dummies.
#### Syntax: 
####    `pandas.get_dummies(data, prefix=None, prefix_sep=’_’, dummy_na=False, columns=None, sparse=False, drop_first=False, 
#### dtype=None)`

#### Lets see the description of some attributes which are rerquired for our question:
#### columns: 
        This attribute specifies the columns that need to be converted to dummies. The default is None. If no columns are specified, get_dummies converts all columns with object or category dtype.
        
#### drop_first:
        The default is False, which means we get a dummy column for each of the 'k' levels of a categorical variable. If it is True, we get dummies for 'k-1' levels, which is what our model needs.

In [None]:
# Let's change the data type of the columns that need dummy variables.
columns_to_get_dummy = ['season', 'mnth', 'weathersit', 'weekday'];
# Convert from numeric to category so that get_dummies picks these columns up by default
bike[columns_to_get_dummy] = bike[columns_to_get_dummy].astype('category')

In [None]:
# Let's check whether the columns were converted to the category dtype
bike[columns_to_get_dummy].info()

In [None]:
# Create dummy variables for the categorical columns. drop_first=True keeps
# k-1 indicator columns for a k-level feature.
# dtype=np.uint8 pins the indicators to a numeric type: pandas >= 2.0 defaults
# to bool, and a frame mixing bool and float columns converts to an object
# array, which statsmodels' OLS and variance_inflation_factor reject.
bike = pd.get_dummies(bike, drop_first=True, dtype=np.uint8)
# Preview the first rows only instead of rendering the whole frame
bike.head()

In [None]:
# Now let's check the columns available in our data set after creating the dummies
bike.info()

#### Since, we used drop_first=True, categorical levels of season, month, weekday, weather were reduced by 1 from their actual levels

### Splitting the data set into training and test data set

In [None]:
# Let's split the data set into training and test sets of 70% and 30% respectively.
# We use the standard names df_train and df_test. Since a whole DataFrame is passed in,
# train_test_split returns DataFrames rather than Series.

# random_state: controls the shuffling applied to the data before the split (fixed for reproducibility)
# shuffle: defaults to True, so there is no need to specify it explicitly
df_train, df_test = train_test_split(bike, train_size=0.7, random_state=333);

In [None]:
# Let's check the shapes of df_train and df_test
print("Training data set : ", df_train.shape);
print("Testing data set : ", df_test.shape);

### The data set was split as per our requirement
### 730: Actual total data
### 510: 70% of actual data (Training set)
### 220: 30% of actual data (Testing set)

In [None]:
# Let's see what the training data set looks like
df_train.head()

In [None]:
# Summary statistics of the training set before rescaling
df_train.describe()

##### If we look at the above data, all of them are numerical data.
#####  But, most of the columns are having data either 0/1 except the columns temp, atemp, hum, windspeed, cnt.
##### So, we need to rescale the data. So, that our predictions gets much more reliable and accurate.

#### So, lets rescale the data using `MinMaxScaling/Normalization` method.

### Why use the Normalization method?
#### Answer:
##### Because,
    1. Normalization methods makes the data to be present in between 0-1 which looks similar to the other columns data.
    2. It doesnt create any effect or change in the categorical data or dummy data that we have created already.

### Rescaling the features

In [None]:
### The scaler is fitted on the training data set only, since that is the data used to train the model

# Create scaler object
scaler = MinMaxScaler();

# Columns that need scaling (the continuous ones, including the target cnt)
columns_req_for_scaling = ['temp', 'atemp', 'hum', 'windspeed', 'cnt'];

# Fit the scaler on these training columns and replace them with the scaled values
df_train[columns_req_for_scaling] = scaler.fit_transform(df_train[columns_req_for_scaling]);

In [None]:
# Let's check whether the data is scaled (min should be 0 and max 1 for the scaled columns)
df_train.describe()

##### Now we can observe that all the scaled columns lie between 0 and 1. This can be verified by checking the min and max values of the columns

##### Now, our dataset is ready for the training.

### Training the Model

In [None]:
# Let's create the X and y variables from the training data frame for training the model.
# Note: pop() removes 'cnt' from df_train in place, so this cell cannot be re-run without re-creating df_train.
y_train = df_train.pop('cnt') # Our target variable is cnt, so it becomes y_train and the remaining columns become X_train
X_train = df_train

In [None]:
# Let's see what data is available in X_train
X_train.head()
# All the columns except cnt are available in X_train

In [None]:
# Summary statistics of the predictors after scaling
X_train.describe()

##### Selecting features using RFE

In [None]:
# Create the linear regression estimator that RFE will wrap
lm = LinearRegression()

# Fit the model on all features. RFE clones and re-fits the estimator itself,
# so this fit is not a prerequisite for the selection below.
lm.fit(X_train, y_train)

# Run RFE keeping 15 features (about 50% of the available columns).
# n_features_to_select is passed by keyword: scikit-learn deprecated the
# positional form in 0.23 and rejects it from 1.0 onwards.
rfe = RFE(lm, n_features_to_select=15)

# Fit the selector; support_ and ranking_ are available afterwards
rfe = rfe.fit(X_train, y_train)

In [None]:
# List every column together with RFE's verdict on it
# support_ : True when the column is one of the selected features
# ranking_ : rank of the feature; selected features have rank 1
list(zip(X_train.columns, rfe.support_, rfe.ranking_))

In [None]:
# Columns selected by RFE
cols_supports_model = X_train.columns[rfe.support_]
cols_supports_model

In [None]:
# Columns rejected by RFE
cols_not_supported = X_train.columns[~rfe.support_]
cols_not_supported

In [None]:
# Keep only the columns selected by RFE in the training predictors.
# The rejected columns remain available in cols_not_supported.
X_train = X_train[cols_supports_model];

In [None]:
# Start the running list of removed columns with the ones RFE rejected
Cols_deleted = list(cols_not_supported.values)

In [None]:
# Columns removed so far
Cols_deleted

In [None]:
# Preview the predictors kept after RFE
X_train.head()

In [None]:
### Common helper to print the VIF table after each modelling step
def printVIF(trainingDataSet, constantVariable):
    """Print the variance inflation factor (VIF) of each predictor, highest first.

    Parameters
    ----------
    trainingDataSet : pandas.DataFrame
        Design matrix that contains the column(s) named by ``constantVariable``.
    constantVariable : label or list of labels
        Column(s) dropped from ``trainingDataSet`` before the VIFs are computed.

    Returns
    -------
    None
        The table (columns ``Features`` and ``VIF``, VIF rounded to two
        decimals) is printed, not returned.
    """
    predictors = trainingDataSet.drop(constantVariable, axis=1)
    design = predictors.values
    vif_scores = [variance_inflation_factor(design, idx) for idx in range(predictors.shape[1])]
    report = pd.DataFrame({'Features': predictors.columns, 'VIF': vif_scores})
    report['VIF'] = report['VIF'].round(2)
    print(report.sort_values(by="VIF", ascending=False))

In [None]:
# Name of the constant (intercept) column in the design matrix; printVIF drops
# this column before computing the VIF values
constant_variable = 'const';

##### Building the Model with the variables supported for model

In [None]:
# Add a constant column to X_train so that the fitted line is not forced through the origin,
# and store the result in a new variable, X_train_lm
X_train_lm = sm.add_constant(X_train);

# Fit the linear model
lm = sm.OLS(y_train,X_train_lm).fit();

In [None]:
# Show the coefficients estimated by the model
lm.params

In [None]:
# Let's look at the model summary
lm.summary()

In [None]:
# We cannot decide which feature to remove from the p-values alone,
# so let's also calculate the VIF for the X_train_lm columns
printVIF(X_train_lm, constant_variable);

##### From above model summary and VIF data we can see that 
    High p-value features are `atemp`
    High VIF value feature are `temp`
    
    R-Square value is 84.8%
    
##### We follow the rule of thumb of removing the feature with a high p-value before removing the feature with a high VIF.
##### So, let's remove the column `atemp` from the model and rebuild the model

In [None]:
# Dropping feature 'atemp' from the training design matrix
X_train_lm = X_train_lm.drop(['atemp'], axis=1)
X_train_lm.columns

In [None]:
# Keep track of the dropped column in the Cols_deleted list.
# Note: re-running this cell appends a duplicate entry.
Cols_deleted.append('atemp');
Cols_deleted

In [None]:
# Re-fit the linear model on the reduced feature set
lm = sm.OLS(y_train,X_train_lm).fit();
# Print the summary of the rebuilt model
lm.summary()

In [None]:
# Let's print the VIF for the columns currently in the training design matrix
printVIF(X_train_lm, constant_variable);

##### From the above summary and model we can derive following data
    High VIF value is `temp`
    No features with high p-values (>0.05)
    
    R-Square is almost unchanged: 84.2% (previously 84.8%)
    
##### So, now lets remove the columm `temp` which is having more VIF and rebuild our model again

In [None]:
# Dropping feature 'temp' from the training design matrix
X_train_lm = X_train_lm.drop(['temp'], axis=1)
X_train_lm.columns

In [None]:
# Keep track of the dropped column in the Cols_deleted list.
# Note: re-running this cell appends a duplicate entry.
Cols_deleted.append('temp');
Cols_deleted

In [None]:
# Re-fit the linear model on the reduced feature set
lm = sm.OLS(y_train,X_train_lm).fit();
# Print the summary of the rebuilt model
lm.summary()

In [None]:
# Let's print the VIF for the columns currently in the training design matrix
printVIF(X_train_lm, constant_variable);

##### From the above summary and VIF we can say that:
    High P-value feature is `hum`
    High VIF feature is `hum`
    
    R-Square value was changed from 84.2 to 77.3 percentage
   
##### So, let's remove the column `hum` from the model and rebuild the model

In [None]:
# Dropping feature 'hum' from the training design matrix
X_train_lm = X_train_lm.drop(['hum'], axis=1)
X_train_lm.columns

In [None]:
# Keep track of the dropped column in the Cols_deleted list.
# Note: re-running this cell appends a duplicate entry.
Cols_deleted.append('hum');
Cols_deleted

In [None]:
# Re-fit the linear model on the reduced feature set
lm = sm.OLS(y_train,X_train_lm).fit();
# Print the summary of the rebuilt model
lm.summary()

In [None]:
# Let's print the VIF for the columns currently in the training design matrix
printVIF(X_train_lm, constant_variable);

##### From the above summary and VIF we can say that:
    NO Feature having p-value (> 0.05)
    No High VIF feature
    
    R-Square value was changed slightly(negligable) from 77.3 to 77.2 percentage

#### Since we are not having any High VIF and High P-Values we can stop modeling here and can make this as our best fit model.
#### With 
    1. R-Square : 77.2%
    2. Total Coefficients : 12 + 1 Constant

In [None]:
# Coefficients of the final model
lm.params

### Residual Analysis

In [None]:
# Get the predicted values of y from the final model using the training data set
y_train_pred = lm.predict(X_train_lm)

# Residuals = actual minus predicted
res = (y_train - y_train_pred)

In [None]:
# Plot the distribution of the residuals
# NOTE(review): distplot is deprecated in recent seaborn releases; confirm the installed version still provides it
sns.distplot(res);

#### We can observe that the residuals are centered around a mean of 0 and are approximately normally distributed.

#### Prediction and evaluation on Test data

In [None]:
# Apply the same preprocessing to the test set as to the training set.
# The scaler was fitted on the training data, so the test data is only
# transformed with it. Calling fit_transform here would re-learn min/max from
# the test set, put train and test on different scales and leak test-set
# information into the evaluation.
df_test[columns_req_for_scaling] = scaler.transform(df_test[columns_req_for_scaling])
df_test.head()

In [None]:
# Summary statistics of the test set after rescaling
df_test.describe()

In [None]:
# Now let's create the y_test and X_test data sets for evaluation.
# Note: pop() removes 'cnt' from df_test in place, so this cell cannot be re-run without re-creating df_test.
y_test = df_test.pop('cnt')
X_test = df_test

In [None]:
# Add the constant column to X_test, as was done for the training design matrix
X_test_lm = sm.add_constant(X_test);

In [None]:
# Columns of the test design matrix before removing those absent from the final model
X_test_lm.columns

In [None]:
# Keep exactly the columns of the final training design matrix, in the same
# order, so that the test predictors line up with the fitted coefficients
# (the model was fitted without a formula, so predict() relies on column position).
# Selecting via X_train_lm.columns does not depend on Cols_deleted having been
# appended to exactly once in every elimination cell, and it raises a KeyError
# if the test set is missing a column the model needs.
X_test_lm = X_test_lm[X_train_lm.columns]
X_test_lm.columns

In [None]:
# Now predict the target for the test data set using the final model object
y_test_pred = lm.predict(X_test_lm)

### Assumptions

In [None]:
### Assumption: residuals are normally distributed
# Get the predicted values of y from the final model using the training data set
y_train_pred = lm.predict(X_train_lm)

# Residuals = actual minus predicted
res = (y_train - y_train_pred)
# Plot the distribution of the residuals
sns.distplot(res);

#### We can observe that the residuals are centered around a mean of 0 and are approximately normally distributed.

In [None]:
### Assumption: there is a linear relationship between X and y
# Pair plot of the continuous variables against cnt, with KDE curves on the diagonal
bike_assump = bike[[ 'temp', 'atemp', 'hum', 'windspeed','cnt']]
sns.pairplot(bike_assump, diag_kind='kde')
plt.show()

##### We can see from above graph that there is a linear relationship between X('temp', 'atemp') & y('cnt')

In [None]:
#### Assumption: there is no multicollinearity between the predictor variables in the final model
printVIF(X_train_lm, constant_variable);

##### We can observe that all of the feature variables in the final model are having VIF less than 5. Hence we can say that there is no collinearity between the predictors

### Model Evaluation

In [None]:
# Scatter of actual against predicted test values to understand the spread
fig, ax = plt.subplots()
ax.scatter(y_test, y_test_pred, alpha=.5)
fig.suptitle('y_test vs y_pred', fontsize=20)   # Plot heading
ax.set_xlabel('y_test', fontsize=18)            # X-label
ax.set_ylabel('y_pred', fontsize=16)            # Y-label
plt.show()

#### From the above graph we can say that y_test and y_pred follow a linear relationship, with the points spread closely around a straight line.

### Analyzing the R-Square for Test and Training data Set

In [None]:
# Calculate the R-squared on the training set
train_r2_score = r2_score(y_true = y_train, y_pred = y_train_pred);
train_r2_score

In [None]:
# Calculate the R-squared on the test set, from the predictions made on X_test_lm
test_r2_score = r2_score(y_true = y_test, y_pred = y_test_pred);
test_r2_score

### Conclusion

#### Hence, it is observed that the R-Square values for the training and test data sets are similar.
#### This indicates that our model generalizes well and is a good fit for predicting our data.

### Final MLR line is as follows

### cnt = 0.235851 + (yr * 0.243876) + (workingday * 0.043512) - (windspeed * 0.175637) + (season_2 * 0.268363) + 
###            (season_3 * 0.316649) + (season_4 * 0.199574) + (mnth_3 * 0.056134) + (mnth_9 * 0.090393) +
###            (mnth_10 * 0.105879) + (weekday_6 * 0.045960) - (weathersit_2 * 0.083298) - (weathersit_3 * 0.347539)

#### Above equation interpretations
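    cnt: Total number of bike users; it is the target variable in our model. Note that cnt was min-max scaled before fitting, so the coefficients below are expressed in scaled units of cnt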
    yr: Year in which bike was rented/used. Unit change in year causes the 0.243876 units change in users count
    workingday: Whether the rental day is a working day or not. A unit increase in workingday increases the users count by 0.043512 units
    windspeed: Unit increase in windspeed decreases 0.175637 units in users count
    season_2: Season_2 is summer. Unit increase in season_2 increases 0.268363 units in users count
    season_3: season_3 is fall. Unit increase in season_3 increases 0.316649 units in users count
    season_4: season_4 is winter. Unit increase in season_4 increases 0.199574 units in user count
    mnth_3: mnth_3 is march. Unit increase in mnth_3 increases 0.056134 units in users count
    mnth_9: mnth_9 is september. Unit increase in mnth_9 increases 0.090393 units in users count
    mnth_10: mnth_10 is october. Unit increase in mnth_10 increases 0.105879 units in users count
    weekday_6: weekday_6 is Saturday (the week starts on Sunday as 0). Unit increase in weekday_6 increases 0.045960 units in users count
    weathersit_2: unit increase in weathersit_2 decreases 0.083298 units in users count
    weathersit_3: unit increase in weathersit_3 decreases 0.347539 units in users count

#### From the above equation for the obtained model top 3 predictor variables are 
####  `season_3` , `season_2`,  `yr`  with its coefficients 0.316649, 0.268363, 0.243876 respectively

#### whereas `weathersit_3`, `windspeed` and `weathersit_2` are negatively related to the target variable, with coefficients -0.347539, -0.175637 and -0.083298 respectively