In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Importing Libraries

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Data Loading

In [None]:
# Read the hourly and the daily table of the bike-sharing dataset in one go.
hour, day = map(
    pd.read_csv,
    (
        "/kaggle/input/rental-bike-sharing/hour.csv",
        "/kaggle/input/rental-bike-sharing/day.csv",
    ),
)

In [None]:
# Preview the first rows of the hourly table.
hour.head()

In [None]:
# Preview the first rows of the daily table.
day.head()

In [None]:
# Report (rows, columns) for both tables.
print(f"Shape of hour data: {hour.shape}")
print(f"Shape of Day data: {day.shape}")

In [None]:
# Every column of the hourly table except the total-count column `cnt`.
hour_features = hour.columns.drop("cnt")
hour_features

In [None]:
# Data types of every column in the hourly table
hour.dtypes

In [None]:
# Number of hourly records per year code and per season code.
for col in ("yr", "season"):
    print(hour[col].value_counts())

### Feature Description
- instant: record index (Irrelevant)
- dteday : date 
- season : season (1:winter, 2:spring, 3:summer, 4:fall) 
- yr : year (0: 2011, 1:2012) 
- mnth : month ( 1 to 12) 
- hr : hour (0 to 23) 
- holiday : whether the day is a holiday or not 
- weekday : day of the week 
- workingday : if day is neither weekend nor holiday is 1, otherwise is 0. 
- weathersit : 
    - 1: Clear, Few clouds, Partly cloudy, Partly cloudy 
    - 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist 
    - 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds 
    - 4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog 
- temp : Normalized temperature in Celsius. The values are derived via (t-t_min)/(t_max-t_min), t_min=-8, t_max=+39 (only in hourly scale) 
- atemp: Normalized feeling temperature in Celsius. The values are derived via (t-t_min)/(t_max-t_min), t_min=-16, t_max=+50 (only in hourly scale) 
- hum: Normalized humidity. The values are divided by 100 (max) 
- windspeed: Normalized wind speed. The values are divided by 67 (max) 
- casual: count of casual users 
- registered: count of registered users 
- cnt: count of total rental bikes including both casual and registered 
 

In [None]:
from datetime import datetime
from dateutil import parser

In [None]:
# Missing values: count the NaNs per column and plot the counts.
missing_counts = hour.isna().sum()
plt.figure(figsize=(8, 6))
missing_counts.plot()  # the notebook's finding: no missing values
plt.title("Missing Values")
plt.show()

In [None]:
# Another look at the first rows before planning the feature engineering below.
hour.head()

**Ideas**
1. Divide hour of day into 5 categories 
* Early Morning - 4th hour to 7th hour 
* Morning - 8th hour to 12th hour
* Afternoon - 13th hour to 16th hour
* Evening - 17th hour to 20th hour
* Night - 21st hour to 3rd hour

2. Divide Weekday - Weekday or Weekend
3. Season, Weather Sit - Change them to one-hot encoding (OHE), as they are not ordinal data


## Data Transformation

In [None]:
# Smallest and largest hourly `registered` count, shown as a tuple.
hour["registered"].min(), hour["registered"].max(),

In [None]:
# Let us take a look at the distribution of the registered-count target:
# bucket the counts into bins of width 50, count the rows per bin, and plot a histogram of those bin sizes.
registered_bins = pd.cut(hour["registered"], bins=range(0, 1000, 50))
hour.groupby(registered_bins)["registered"].count().hist()

The least populated bin of registered counts has only 1 occurrence, so we cannot use a stratified sampling approach.
We will use simple random sampling to create the test set.


In [None]:
# Before moving on, let us set aside the test dataset (20% of the rows, fixed seed).
from sklearn.model_selection import train_test_split

train_hour, test_hour = train_test_split(hour, random_state=12, test_size=0.2)

(train_hour.shape, test_hour.shape)

In [None]:
# Only the training split is used for the analysis from here on
train_hour.head()

Early Morning - 4th hour to 7th hour

Morning - 8th hour to 12th hour

Afternoon - 13th hour to 16th hour

Evening - 17th hour to 20th hour

Night - 21st hour to 3rd hour

In [None]:
def hour_transformer(x):
    """Map an hour of the day to its part-of-day label.

    4-7 -> "Early_Morning", 8-12 -> "Morning", 13-16 -> "Afternoon",
    17-20 -> "Evening", and anything >= 21 or <= 3 -> "Night".
    Values that fall between two bands (e.g. 3.5) yield None.
    """
    day_bands = (
        (4, 7, "Early_Morning"),
        (8, 12, "Morning"),
        (13, 16, "Afternoon"),
        (17, 20, "Evening"),
    )
    for first_hour, last_hour, label in day_bands:
        if first_hour <= x <= last_hour:
            return label
    # Night wraps around midnight, so it is the union of two open-ended ranges.
    if x >= 21 or x <= 3:
        return "Night"
    return None


In [None]:
## Transformer
from sklearn.base import BaseEstimator, TransformerMixin


class dataset_transformer(BaseEstimator, TransformerMixin):
    """Add readable / derived categorical columns to the hourly bike-sharing frame.

    All constructor arguments are 0/1 style flags:

    transform_cols      -- master switch; when falsy, ``transform`` returns an untouched copy.
    drop_cols           -- drop the ``instant`` record-index column.
    transform_season    -- add ``season_tr`` holding the season name.
    tranform_weathersit -- add ``weathersit_tr`` holding a short weather label
                           (the misspelt name is kept so existing callers keep working).
    transform_weekday   -- add ``weekday_tr``: 0 when ``weekday`` is 0 or 6, else 1.
    add_hour_of_day     -- add ``hour_of_day_tr`` by applying ``hour_transformer`` to ``hr``.
    drop_originals      -- drop the raw ``season``, ``weathersit``, ``weekday`` and ``hr`` columns.
    """

    # Season codes exactly as listed in the notebook's feature description
    # (1: winter, 2: spring, 3: summer, 4: fall).
    _SEASON_LABELS = {1: 'winter', 2: 'spring', 3: 'summer', 4: 'fall'}
    _WEATHERSIT_LABELS = {1: 'Clear', 2: 'Mist', 3: 'Light_Snow', 4: 'Heavy_Rain'}

    def __init__(self, transform_cols = 1, drop_cols = 1, transform_season = 1, tranform_weathersit = 1, transform_weekday = 1, add_hour_of_day = 1, drop_originals = 0):
        self.transform_cols = transform_cols
        self.drop_cols = drop_cols
        self.transform_season = transform_season
        # Stored under the exact parameter name: BaseEstimator.get_params / clone / repr
        # look up an attribute named after every __init__ argument.
        self.tranform_weathersit = tranform_weathersit
        self.transform_weekday = transform_weekday
        self.add_hour_of_day = add_hour_of_day
        self.drop_originals = drop_originals

    @property
    def transform_weathersit(self):
        """Correctly spelt alias of ``tranform_weathersit`` (the attribute name used before)."""
        return self.tranform_weathersit

    @transform_weathersit.setter
    def transform_weathersit(self, value):
        self.tranform_weathersit = value

    def fit(self, X, y = None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y = None):
        """Return a transformed copy of ``X``; the input frame is never modified."""
        transformed_df = X.copy()
        if not self.transform_cols:
            return transformed_df

        if self.drop_cols:
            transformed_df = transformed_df.drop(columns = "instant")
        if self.transform_season:
            transformed_df["season_tr"] = transformed_df["season"].map(self._SEASON_LABELS)
        if self.tranform_weathersit:
            transformed_df["weathersit_tr"] = transformed_df["weathersit"].map(self._WEATHERSIT_LABELS)
        if self.transform_weekday:
            # 0 and 6 are the two weekend codes; every other value is flagged 1.
            is_weekend = transformed_df["weekday"].isin([0, 6])
            transformed_df["weekday_tr"] = (~is_weekend).astype("int64")
        if self.add_hour_of_day:
            transformed_df["hour_of_day_tr"] = transformed_df["hr"].map(hour_transformer)
        if self.drop_originals:
            transformed_df = transformed_df.drop(columns = ["season", "weathersit", "weekday", "hr"])
        return transformed_df

    
        

In [None]:
# Build the transformer with drop_originals enabled (the raw season / weathersit /
# weekday / hr columns are removed) and apply it to the training split.
col_trans = dataset_transformer(drop_originals = 1)
train_hour_tr = col_trans.transform(train_hour)
train_hour_tr.head()

## EDA

In [None]:
# Looking at the summary stats
# The transformed test frame is only built much later in the notebook, so the raw
# hold-out split is used here; the transformer never touches the `registered` column,
# so the mean is the same.
print("The average registered count of the training dataset %.0f" %(train_hour_tr["registered"].mean()))
print("The average registered count of the Testing dataset %.0f" %(test_hour["registered"].mean()))
# Kept as the last expression so the summary table is actually displayed.
train_hour_tr.describe()

In [None]:
# Cross-tabulate the holiday flag against weekday_tr (0 = weekend, 1 = otherwise)
# to check whether holidays overlap with weekends.
pd.crosstab(train_hour_tr["holiday"],train_hour_tr["weekday_tr"])

Weekends are not flagged as holidays. However, the training set contains 412 hourly records that fall on weekday holidays over the 2 years.

In [None]:
# Mean registered count on weekends vs weekdays.
ax = sns.barplot(data=train_hour_tr, x="weekday_tr", y="registered")
ax.set_title("Registrations on weekdays vs weekends")
ax.set_xticklabels(["Weekend", "Weekday"])  # weekday_tr: 0 = weekend, 1 = weekday
plt.show()

There are more registrations on weekdays than on weekends. One reason could be that people rely on bikes for their daily workplace commute.

In [None]:
# We will use the Mann-Whitney U test to see if the registrations change between weekdays and weekends.
# The test does not expect the dependent variable to be normally distributed, but let us still look at its distribution.
# The legend is built by seaborn from a labelled hue column: labels handed to plt.legend()
# are matched to the drawn curves by position only, which can silently swap the two groups.
day_type = train_hour_tr["weekday_tr"].map({0: "Weekend", 1: "Weekday"})
sns.displot(data = train_hour_tr.assign(day_type = day_type), x = "registered", hue = "day_type", hue_order = ["Weekend", "Weekday"], kind = 'kde')
plt.xlabel("Registrations")
plt.show()

In [None]:
# Checking for skewness of values
from scipy.stats import skewtest, skew

# H0 of skewtest: the skewness of the population the sample was drawn from is the same
# as that of a corresponding normal distribution.
skew_cols = ["temp", "atemp", "registered", "casual"]
for col in skew_cols:
    pvalue = skewtest(train_hour_tr[col]).pvalue
    print(col, ':', pvalue, ['Skewed' if pvalue <= 0.05 else 'Normal'])
print("**********************************************")
# Sample skewness of the same columns.
for col in skew_cols:
    print(col, ':', skew(train_hour_tr[col]))

**The registered count is not normally distributed, as confirmed by skewtest**

Data is not normally distributed. So, let us transform the data.

In [None]:
# Square-root transform of the registered count, kept next to the raw value.
hour_df = train_hour_tr[["registered"]].assign(
    registered_norm=lambda frame: np.sqrt(frame["registered"])
)
hour_df

In [None]:
# Density of the square-root-transformed registered count.
sns.displot(data = hour_df, x = "registered_norm", kind = "kde")
plt.show()

In [None]:
# Attach the transformed target to the training frame (this adds a column to train_hour_tr in place).
train_hour_tr["registered_norm"] = hour_df["registered_norm"].copy()
train_hour_tr

In [None]:
# Density of the transformed target, split by weekend / weekday.
# The legend is built by seaborn from a labelled hue column: labels handed to plt.legend()
# are matched to the drawn curves by position only, which can silently swap the two groups.
day_type_norm = train_hour_tr["weekday_tr"].map({0: "Weekend", 1: "Weekday"})
sns.displot(data = train_hour_tr.assign(day_type = day_type_norm), x = "registered_norm", hue = "day_type", hue_order = ["Weekend", "Weekday"], kind = 'kde')
plt.xlabel("Registrations")
plt.show()

***Even though the dependent variable is not normally distributed, the Mann-Whitney U test does not require the target to be normally distributed.***

### Testing for hypothesis, does the registered counts change with weekdays

In [None]:
from scipy.stats import mannwhitneyu

# The Mann-Whitney U test is a nonparametric test of the null hypothesis that the distribution
# underlying sample x is the same as the distribution underlying sample y. It is often used
# as a test of difference in location between distributions.
is_weekend = train_hour_tr.weekday_tr == 0
is_weekday = train_hour_tr.weekday_tr == 1
_, pval = mannwhitneyu(
    train_hour_tr.loc[is_weekend, "registered"],
    train_hour_tr.loc[is_weekday, "registered"],
)
verdict = 'rejected' if pval <=0.05 else 'not rejected'
print("p-value of test is %.4f. Hence, the null hypo is %s." %(pval, verdict))

Hence, there is indeed a difference in the registered counts between weekdays and weekends

In [None]:
# Preview the transformed training frame.
train_hour_tr.head()

#### Holidays vs Registrations

In [None]:
# Mean registered count by holiday flag, with bars split by the workingday flag.
sns.barplot(data = train_hour_tr, x = "holiday", y = 'registered', hue = 'workingday')
plt.show()

In [None]:
# Number of training rows for each holiday / workingday / weekday_tr combination.
train_hour_tr.groupby(['holiday', 'workingday',"weekday_tr"])["registered"].count()

The non-holiday days have more bookings. This also provides some evidence that the people mostly rely on the bikes for daily commute to workplace. The Weekday/Weekend relationship with the registrations also signified so.

### Registrations by Hour of Day

In [None]:
# Mean registered count per part of day, split by weekday_tr (0 = weekend, 1 = weekday).
sns.barplot(data = train_hour_tr, x = 'hour_of_day_tr', y = 'registered', hue = 'weekday_tr')
plt.title("Registrations by Hour of day")
plt.show()

* There are more registrations in the evening than anytime in the day. 

* Night bookings are fewer than the other times in weekdays. However, in weekends, early morning registrations are fewer.

* Weekday Evenings are booked more often than the mornings. May be people commute to home/nearby areas (from workplace) during the evenings. 

* Weekend afternoons also see more registrations than any other time in the weekends.

### Registrations by temperature

In [None]:
# Two stacked panels on one 20x8 figure.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(20, 8))
# Top: registered count against feeling temperature, with a fitted regression line.
sns.regplot(
    data=train_hour_tr, x="atemp", y="registered", marker="o",
    scatter_kws={"color": "y"}, line_kws={"color": "r"}, ax=ax1,
)
# Bottom: part of day against feeling temperature; colour and marker size encode the registered count.
sns.scatterplot(
    data=train_hour_tr, x="atemp", y="hour_of_day_tr",
    hue="registered", size="registered", ax=ax2,
)
plt.show()

We will confirm this relation by checking the correlation coefficient for continuous variables

The regplot shows that the registrations will increase as the temperature increases.

In [None]:
# Preview the transformed training frame again before looking at the weather column.
train_hour_tr.head()

### Registrations by weather

In [None]:
# Mean registered count for each weather label.
sns.barplot(data = train_hour_tr, x = 'weathersit_tr', y  = 'registered')
plt.show()

In [None]:
# Minimum, maximum and mean registered count per weather label.
train_hour_tr[["weathersit_tr",'registered']].groupby(["weathersit_tr"]).agg(['min',"max","mean"])

Registrations are higher when the weather is clear or misty. However, they are highly unpredictable when it rains.

In [None]:
# Encoding categorical values
# dtype=int keeps the dummy columns numeric: pandas >= 2.0 emits bool dummies by default,
# and a frame mixing bool and float columns converts to an object array, which the
# statsmodels OLS fit in the feature-selection section rejects.
train_hour_tr_encoded = pd.get_dummies(train_hour_tr.drop('dteday', axis = 1), drop_first= True, dtype = int)
train_hour_tr_encoded.head()

In [None]:
# List the column names produced by the one-hot encoding.
train_hour_tr_encoded.columns.to_list()

In [None]:
# Binary (0/1) columns to be tested against the registered count, grouped by origin.
cat_cols = (
    ["holiday", "workingday", "weekday_tr"]
    + ["season_tr_spring", "season_tr_summer", "season_tr_winter"]
    + ["weathersit_tr_Heavy_Rain", "weathersit_tr_Light_Snow", "weathersit_tr_Mist"]
    + [
        "hour_of_day_tr_Early_Morning",
        "hour_of_day_tr_Evening",
        "hour_of_day_tr_Morning",
        "hour_of_day_tr_Night",
    ]
)


In [None]:
# Checking the Mann-Whitney U test p-values for each categorical column:
# compare the registered counts of the rows where the flag is 0 against those where it is 1.
for col in cat_cols:
    flag = train_hour_tr_encoded[col]
    registered_when_0 = train_hour_tr_encoded.loc[flag == 0, "registered"]
    registered_when_1 = train_hour_tr_encoded.loc[flag == 1, "registered"]
    _, pval = mannwhitneyu(registered_when_0, registered_when_1)
    verdict = 'rejected' if pval<=0.05 else 'not rejected'
    print("p-value for field :%s is %.3f, null hyp %s" %(col, pval, verdict))

The default assumption or null hypothesis is that there is no difference between the distributions of the data samples. Rejection of this hypothesis suggests that there is likely some difference between the samples.

Except for heavy rain, all other categorical values seem to have an impact on the registrations.

In [None]:
# Drop the year and month codes. errors="ignore" keeps this cell safe to re-run:
# the frame is reassigned to itself, so on a second run the columns are already gone.
train_hour_tr_encoded = train_hour_tr_encoded.drop(columns = ['yr', 'mnth'], errors = "ignore")
# Continuous columns (plus the target) used for the correlation analysis.
num_cols = ['temp', 'atemp', 'hum', 'windspeed', 'registered']

In [None]:
# Independent copy of just the numeric columns listed in num_cols.
train_hour_tr_num = train_hour_tr_encoded[num_cols].copy()
train_hour_tr_num

In [None]:
# Correlation matrix for the numeric columns.
# Spearman is used since the target variable is not normally distributed.
spearman_corr = train_hour_tr_encoded[num_cols].corr(method="spearman")
plt.figure(figsize=(8, 6))
sns.heatmap(spearman_corr, annot=True, cmap="RdYlGn")
plt.show()

Only 2 of the columns, i.e atemp and humidity are moderately correlated with registrations. 

### Feature Selection

In [None]:
from statsmodels.api import OLS

# Regress the registered count on every remaining column; the other count columns and
# the transformed target are excluded from the predictors.
# NOTE(review): no constant column is added here, so the model is fitted without an
# intercept - confirm that this is intended.
ols_target = train_hour_tr_encoded["registered"]
ols_predictors = train_hour_tr_encoded.drop(columns=["registered", "cnt", "casual", "registered_norm"])
model = OLS(ols_target, ols_predictors)
res = model.fit()
print(res.summary())

In [None]:
# Refit the OLS model on the two selected predictors only.
col_list = ['temp', 'hum']

ols_predictors_sel = train_hour_tr_encoded.drop(columns=["registered", "cnt", "casual", "registered_norm"])[col_list]
model = OLS(train_hour_tr_encoded["registered"], ols_predictors_sel)
res = model.fit()
print(res.summary())

### Using ExtraTreeRegressor to check feature importances    

In [None]:
from sklearn.ensemble import ExtraTreesRegressor

etr = ExtraTreesRegressor(random_state=23)

# Target is the registered count; the predictors exclude all three count columns,
# the transformed target and atemp.
y = train_hour_tr_encoded["registered"]
X = train_hour_tr_encoded.drop(columns=["casual", "registered", "cnt", "registered_norm", "atemp"])

In [None]:
# Fit the extra-trees model and chart its ten largest feature importances.
etr.fit(X, y)
feat_imp = pd.Series(etr.feature_importances_, index=X.columns.tolist())
feat_imp.nlargest(10).plot.bar()
plt.show()

In [None]:
# The same ten largest importances as numbers.
feat_imp.nlargest(10)

### Feature Selection using permutation importances

In [None]:
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Hold out 20% of the training rows for the permutation test. The split is seeded like
# the model and the permutation step, so the reported importances are reproducible.
X_train, X_val, y_train, y_val = train_test_split(X,y, test_size = 0.2, random_state = 23)
model = RandomForestRegressor(random_state  = 23).fit(X_train, y_train)
perm = PermutationImportance(model, random_state = 23).fit(X_val,y_val)

eli5.show_weights(perm,feature_names = X_val.columns.to_list())

In [None]:
# Columns kept as model inputs after the feature-selection steps above.
features = ["hour_of_day_tr_Evening", "temp", "hour_of_day_tr_Night","hour_of_day_tr_Early_Morning", "hum", "workingday", "windspeed"]

In [None]:
# Restrict the predictor frame to the selected columns.
X_sel = X[features]

In [None]:
# Seeded 80/20 train/validation split on the selected features (overwrites the earlier split variables).
X_train, X_val, y_train, y_val = train_test_split(X_sel, y, test_size = 0.2, random_state = 23)

In [None]:
# Linear regression baseline
from sklearn.linear_model import LinearRegression

lr = LinearRegression().fit(X_train, y_train)
# Fitted coefficients (one per selected feature) and intercept.
(lr.coef_, lr.intercept_)

In [None]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error of the linear model on the rows it was fitted on.
y_train_pred = lr.predict(X_train)
train_mse_lr = mean_squared_error(y_train, y_train_pred)
err = np.sqrt(train_mse_lr)
print("Training error : %.3f" % err)

In [None]:
# Distribution of the training residuals (actual minus predicted).
sns.displot((y_train - y_train_pred))
plt.show()

The errors or residuals are normally distributed

In [None]:
# Predicted (x axis) against actual (y axis) registered counts on the training split.
sns.scatterplot(x = y_train_pred, y =  y_train)
plt.show()

In [None]:
# Let us check the error on the validation set (root-mean-squared error).
y_val_pred_lr = lr.predict(X_val)
val_mse_lr = mean_squared_error(y_val, y_val_pred_lr)
err = np.sqrt(val_mse_lr)
print("Validation error : %.3f" % err)

In [None]:
# Distribution of the validation residuals (actual minus predicted).
sns.displot((y_val - y_val_pred_lr))
plt.show()

In [None]:
# Predicted (x axis) against actual (y axis) registered counts on the validation split.
sns.scatterplot(x = y_val_pred_lr, y =  y_val)
plt.show()

The linear regression scores do not vary much between the training and validation sets. Let's explore other models.

In [None]:
## Random Forest Regressor
# Seeded like the other tree models in this notebook so the reported errors are reproducible.
rf = RandomForestRegressor(random_state = 23)
rf.fit(X_train, y_train)

# Root-mean-squared error on the rows the forest was fitted on.
y_train_pred_rf = rf.predict(X_train)
err = np.sqrt(mean_squared_error(y_train_pred_rf, y_train))
print("Training error : %.3f" %(err))

In [None]:
# Distribution of the random-forest training residuals (actual minus predicted).
sns.displot(y_train - y_train_pred_rf)
plt.show()

In [None]:
# Root-mean-squared error of the random forest on the validation split.
y_val_pred_rf = rf.predict(X_val)
val_mse_rf = mean_squared_error(y_val_pred_rf, y_val)
err = np.sqrt(val_mse_rf)
print("Validation error : %.3f" % err)

The Random forest is overfitting the data. However, the validation score is still better than the Linear Regression

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [None]:
# Tuning Random Forest
# Candidate values that the randomized search samples from.
param_grid = [{"max_depth": range(5,45,5),
               "min_samples_split": range(10,210, 10),
               "n_estimators": range(100,1010,10),
              "bootstrap": [True, False]}]

# random_state fixes which parameter combinations are sampled, so the tuning result
# (and everything reported from it below) is reproducible across runs.
rSearch = RandomizedSearchCV(rf, param_grid, cv = 5, scoring = 'neg_mean_squared_error', n_jobs =-1, verbose = 2, random_state = 23)
rSearch.fit(X_train, y_train)

In [None]:
# Print each sampled parameter set next to its cross-validated RMSE
# (the scores are negated MSEs, hence the sign flip before the square root).
cv_results = rSearch.cv_results_
for idx, param in enumerate(cv_results["params"]):
    print(param, np.sqrt(-cv_results["mean_test_score"][idx]))

In [None]:
# Best estimator found by the search and its cross-validated RMSE.
rSearch.best_estimator_, np.sqrt(-rSearch.best_score_)

In [None]:
# Take the best estimator from the search as the final model.
final_model = rSearch.best_estimator_
# NOTE(review): RandomizedSearchCV refits the best estimator by default, so this extra
# fit looks redundant - confirm before removing.
final_model.fit(X_train, y_train)

# Root-mean-squared error of the tuned model on the training split.
y_train_pred = final_model.predict(X_train)
tr_err = np.sqrt(mean_squared_error(y_train, y_train_pred))
print("Training Error after tuning :  %.3f" %(tr_err))

In [None]:
# Root-mean-squared error of the tuned model on the validation split.
y_val_pred_rf = final_model.predict(X_val)
val_err = np.sqrt(mean_squared_error(y_val, y_val_pred_rf))
print("Validation Error after tuning:  %.3f" %(val_err))

In [None]:
# Performance on Test: start from the raw hold-out split created earlier
test_hour.head()

#### Transformation of test dataset

In [None]:
# Apply the same column transformer and one-hot encoding to the hold-out split.
# NOTE(review): get_dummies runs on the test frame on its own, so its dummy columns only
# line up with the training ones if every category level occurs in both splits - confirm.
test_hour_tr = col_trans.transform(test_hour)
test_hour_tr_encoded = pd.get_dummies(test_hour_tr.drop('dteday', axis = 1), drop_first= True)
# Remove the count columns and atemp from the predictors; keep `registered` as the target.
test_hour_fin = test_hour_tr_encoded.drop(["casual", "registered", "cnt","atemp"], axis = 1)
y_test = test_hour_tr_encoded["registered"]

In [None]:
# Keep only the selected feature columns, in the same order used for training.
X_test = test_hour_fin[features]
X_test.head()

In [None]:
# Predict on the hold-out split with the tuned model.
y_test_pred = final_model.predict(X_test)

# Root-mean-squared error on the hold-out split.
test_err = np.sqrt(mean_squared_error(y_test, y_test_pred))

print("Testing error: %.3f" %(test_err))

In [None]:
# Distribution of the hold-out residuals (actual minus predicted).
sns.displot( y_test - y_test_pred)
plt.show()

In [None]:
# Confidence interval for the test RMSE, built from the squared errors
from scipy import stats
conf = 0.95 
# Per-row squared prediction error on the hold-out split.
sq_errs = (y_test  - y_test_pred) ** 2
# Degrees of freedom for the t distribution: sample size minus one.
dof = len(sq_errs)-1
mean_of_sq_errors = sq_errs.mean()
standard_error_of_mean = stats.sem(sq_errs) # Standard_Dev/sq.root of sample size

# t-interval around the mean squared error; the square root of both bounds
# expresses the interval on the RMSE scale.
np.sqrt(stats.t.interval(conf, dof, loc = mean_of_sq_errors, scale = standard_error_of_mean))

### Summary

After the feature selection, the variables below were found to be more important than the others

* hour_of_day_tr_Evening
* temp
* hour_of_day_tr_Night
* hour_of_day_tr_Early_Morning
* hum
* workingday
* windspeed

Though the Linear Regressor was stable between the train and validation samples, the Random Forest Regressor predicted with the least error. The score further improved after hyperparameter tuning using Randomized Search CV, and the best estimator was used to check the score/performance on the test data.

However, the prediction errors can be in the range of 100 to 109.

Next Steps - A more confident result could be obtained if GridSearchCV were tried instead of RandomizedSearchCV in the hyperparameter tuning and if other models were explored.
Similar model should be created for "Casual" counts and then the "Registered" and the "Casual" can be summed up to determine the "Cnt" i.e. the total count.
