**If you like my notebook, please upvote my work!**

**If you use parts of this notebook in your scripts/notebooks, giving some kind of credit for instance link back to this notebook would be very much appreciated. Thanks in advance! :)**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _dirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Any results you write to the current directory are saved as output.

# Importing important libraries

In [None]:
import calendar
import seaborn as sb
import xgboost as xgb
import plotly.express as px
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_log_error,make_scorer
from sklearn.model_selection import train_test_split,GridSearchCV

# Loading the training dataset

In [None]:
# Load the competition's training data into `file`, the DataFrame the rest of
# the notebook explores and engineers features on.
file = pd.read_csv("/kaggle/input/bike-sharing-demand/train.csv")

In [None]:
# Summary statistics of the training data.
file.describe()

# Checking for null values

In [None]:
# Number of missing values in each column.
file.isna().sum()

No data cleaning is required since no Null values are found!

In [None]:
# Column names of the training data.
file.columns

# Data visualization:

## Heatmap of all the continuous values in the file.

In [None]:
# Pairwise correlations between the continuous columns, drawn as an annotated heatmap.
continuous_cols = ['temp', 'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count']
corr = file[continuous_cols].corr()
f, axes = plt.subplots(nrows=1, ncols=1, figsize=(8, 8))
sb.heatmap(
    corr,
    ax=axes,
    annot=True,
    square=True,
    linewidth=.5,
    center=1.4,
)

### The conclusions drawn are:
1. We can see that temp and atemp have a very strong positive correlation therefore we can use only temp as a variable without any loss of information. 

2. We can infer from the correlation matrix that windspeed has almost no correlation with casual, registered or count (the values we wish to predict), so we can remove it.

## Data visualization for non-continuous variables in the data

First we have to separate the individual date and time for each data point into hour,day,month and year.

In [None]:
# Break each timestamp into calendar parts used for plotting and feature engineering.
# The 'datetime' column itself is left untouched.
timestamps = pd.DatetimeIndex(file['datetime'])  # parse once instead of once per column
file['Date'] = timestamps.date
file['Hour'] = timestamps.hour
file['Day'] = timestamps.day
file['Month'] = timestamps.month
file['Year'] = timestamps.year
# `DatetimeIndex.weekday_name` was removed in pandas 1.0; `day_name()` returns
# the English day names ('Monday', ..., 'Sunday') that the later plots expect.
file['Weekday'] = timestamps.day_name()

In [None]:
# Hover label for the interactive plot: "Total Count : <count>" for every row.
file['count_vis'] = 'Total Count : ' + file['count'].astype(str)

In [None]:
# Interactive line plot of the rental count per date, coloured by hour of the day.
fig = px.line(
    data_frame=file,
    x='Date',
    y="count",
    color='Hour',
    range_y=(0, 1150),
    hover_name='count_vis',
    hover_data=['Hour', 'Date', 'casual', 'registered'],
    text=None,
    title='Interactive LinePlot of the whole dataset(Hover for more details)',
    height=670,
    width=980,
)
fig.show()

**The sudden periodic jumps between the different regions are due to missing data. These gaps are the periods for which we have to predict the result.**

## 1. Season

In [None]:
# Bar plots of casual / registered / total rentals grouped by season, one panel per target.
f, axes = plt.subplots(1, 3, figsize=(17, 7))
sb.despine(left=True)
x = 'season'

# One loop instead of three copy-pasted calls; ending the cell on a statement
# also keeps the stray Axes repr out of the cell output.
for ax, target in zip(axes, ('casual', 'registered', 'count')):
    sb.barplot(x=x, y=target, data=file, saturation=1, ax=ax)

## 2. Holiday

In [None]:
# Bar plots of casual / registered / total rentals grouped by the holiday flag.
f, axes = plt.subplots(1, 3, figsize=(17, 7))
sb.despine(left=True)
x = 'holiday'

# One loop instead of three copy-pasted calls; ending the cell on a statement
# also keeps the stray Axes repr out of the cell output.
for ax, target in zip(axes, ('casual', 'registered', 'count')):
    sb.barplot(x=x, y=target, data=file, saturation=1, ax=ax)

## 3. Working day

In [None]:
# Bar plots of casual / registered / total rentals grouped by the working-day flag.
f, axes = plt.subplots(1, 3, figsize=(17, 7))
sb.despine(left=True)
x = 'workingday'

# One loop instead of three copy-pasted calls; ending the cell on a statement
# also keeps the stray Axes repr out of the cell output.
for ax, target in zip(axes, ('casual', 'registered', 'count')):
    sb.barplot(x=x, y=target, data=file, saturation=1, ax=ax)

Combining the graphs of casual and registered into one may make it seem as if holiday and workingday have no influence on count, but we can clearly see that a holiday increases the casual count by up to 40%, and the reverse trend is observed for working days. It is therefore reasonable to build two different models: one for casual and another for registered.

Therefore, I will build two separate models for casual and registered, train them separately, and then add their predictions to get the count.

## 4. Weather

In [None]:
# Bar plots of casual / registered / total rentals grouped by weather category.
f, axes = plt.subplots(1, 3, figsize=(17, 7))
sb.despine(left=True)
x = 'weather'

# One loop instead of three copy-pasted calls; ending the cell on a statement
# also keeps the stray Axes repr out of the cell output.
for ax, target in zip(axes, ('casual', 'registered', 'count')):
    sb.barplot(x=x, y=target, data=file, saturation=1, ax=ax)

## 5. Date and Time

### 5.a. Hour

In [None]:
# Bar plots of casual / registered / total rentals grouped by hour of the day.
f, axes = plt.subplots(1, 3, figsize=(19, 7))
sb.despine(left=True)
x = 'Hour'

# One loop instead of three copy-pasted calls; ending the cell on a statement
# also keeps the stray Axes repr out of the cell output.
for ax, target in zip(axes, ('casual', 'registered', 'count')):
    sb.barplot(x=x, y=target, data=file, saturation=1, ax=ax)

We can see that in the final graph there are two prominent peaks.
1. At 8 a.m
2. At 5 p.m
Where as the trend of casual is mostly the same throughout the day. 

From this we can conclude that registered users are mostly people commuting to and from their jobs, which explains the peaks at the start and end of office hours. Clearly these people have a more definite and predictable schedule and are therefore more likely to be registered. In order to test this, we plot some more graphs.

In [None]:
# Distinct values of the 'Weekday' column (shown as the index of the grouped frame).
file.groupby('Weekday').count().index

In [None]:
# Average registered / total rentals for every (hour, weekday) pair.
# Only the two plotted columns are averaged: calling .mean() on the whole frame
# raises on pandas >= 2.0 because of the non-numeric columns
# ('datetime', 'Date', 'count_vis').
file1 = file.groupby(['Hour', 'Weekday'])[['registered', 'count']].mean().reset_index()

# Fixed weekday order for the legend, and readable axis / hover labels.
dic = {'Weekday':['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']}
dic1 = {'registered':'Average count of registered poeple commuting.','count': 'Average people commuting','Hour':'Hour of the day',
        'Weekday':'Day of the week'}

# `file1` already has a flat index, so it is passed as-is (a second reset_index()
# would only add a meaningless 'index' column).
fig = px.line(x = 'Hour', y = "registered", data_frame = file1, color = 'Weekday',
              title = 'Interactive LinePlot of the registered separated by weekday(Hover for more details)',labels = dic1,
              hover_data = ['count'],category_orders = dic,range_y = [0,550],height = 670,width = 980)
fig.show()

Clearly, on Saturdays and Sundays the general trend more or less follows the trend of casual, whereas on weekdays there is a completely different trend with two peaks at 8 a.m. and 5 p.m., which confirms that those peaks are due to working people commuting.

### 5.b. Day

In [None]:
# Bar plots of casual / registered / total rentals grouped by day of the month.
f, axes = plt.subplots(1, 3, figsize=(19, 7))
sb.despine(left=True)
x = 'Day'

# One loop instead of three copy-pasted calls; ending the cell on a statement
# also keeps the stray Axes repr out of the cell output.
for ax, target in zip(axes, ('casual', 'registered', 'count')):
    sb.barplot(x=x, y=target, data=file, saturation=1, ax=ax)

**From the above graphs we can conclude that the feature day has hardly any influence over the features registered and count.**

### 5.c. Month

In [None]:
# Bar plots of casual / registered / total rentals grouped by month.
f, axes = plt.subplots(1, 3, figsize=(19, 7))
sb.despine(left=True)
x = 'Month'

# One loop instead of three copy-pasted calls (the unused `plot` variable and the
# commented-out month-name list are dropped); ending the cell on a statement
# also keeps the stray Axes repr out of the cell output.
for ax, target in zip(axes, ('casual', 'registered', 'count')):
    sb.barplot(x=x, y=target, data=file, saturation=1, ax=ax)

### 5.d. Year 

In [None]:
# Bar plots of casual / registered / total rentals grouped by year.
f, axes = plt.subplots(1, 3, figsize=(19, 7))
sb.despine(left=True)
x = 'Year'

# One loop instead of three copy-pasted calls; ending the cell on a statement
# also keeps the stray Axes repr out of the cell output.
for ax, target in zip(axes, ('casual', 'registered', 'count')):
    sb.barplot(x=x, y=target, data=file, saturation=1, ax=ax)

In [None]:
# Summary statistics again, now including the derived date/time columns.
file.describe()

In [None]:
# Column names after adding the derived date/time columns.
file.columns

# Adding indicator columns for each categorical column and removing unnecessary ones

## 1. Season

In [None]:
# One 0/1 indicator column per season value, named 's<value>'.
for season_value in sorted(file['season'].unique()):
    file['s' + str(season_value)] = (file['season'] == season_value).astype('int64')
file.sample(5)

## 2. Weather 

In [None]:
# One 0/1 indicator column per weather value, named 'w<value>'.
for weather_value in sorted(file['weather'].unique()):
    file['w' + str(weather_value)] = (file['weather'] == weather_value).astype('int64')
file.sample(5)

## 3. Hour

In [None]:
# One 0/1 indicator column per hour of the day, named 'Hour<value>'.
for hour_value in sorted(file['Hour'].unique()):
    file['Hour' + str(hour_value)] = (file['Hour'] == hour_value).astype('int64')
file.sample(5)

## 4.Month

In [None]:
# One 0/1 indicator column per month, named 'Month<value>'.
for month_value in sorted(file['Month'].unique()):
    file['Month' + str(month_value)] = (file['Month'] == month_value).astype('int64')
file.sample(5)

In [None]:
# Column names after adding the indicator columns.
file.columns

## Removing unnecessary columns

In [None]:
# Keep only the model inputs plus the two targets, in a fixed column order:
# hour indicators, month indicators, year, season indicators, day flags,
# weather indicators, temperature, humidity, then the targets.
hour_cols = ['Hour' + str(h) for h in range(24)]
month_cols = ['Month' + str(m) for m in range(1, 13)]
season_cols = ['s' + str(k) for k in range(1, 5)]
weather_cols = ['w' + str(k) for k in range(1, 5)]
feed = file[
    hour_cols
    + month_cols
    + ['Year']
    + season_cols
    + ['holiday', 'workingday']
    + weather_cols
    + ['temp', 'humidity', 'casual', 'registered']
]

In [None]:
# Summary statistics of the selected columns.
feed.describe()

In [None]:
# Column names of the selected frame.
feed.columns

# Preparing training and testing sets

### 1. Training set

In [None]:
# Feature matrix: everything in `feed` except the two target columns.
df_train_x = feed.drop(columns=['casual', 'registered'])
df_train_x.describe()

### 2. Training targets (registered and casual)

In [None]:
# Target for the "registered" model.
df_reg_train_y = feed['registered']
# `describe` has to be called; the bare attribute only displays the bound-method repr.
df_reg_train_y.describe()

In [None]:
# Target for the "casual" model.
df_cas_train_y = feed['casual']
# `describe` has to be called; the bare attribute only displays the bound-method repr.
df_cas_train_y.describe()

# Machine learning model

### Splitting data into train and test sets

In [None]:
# Hold out 15% of the rows for evaluation.
# x1/y1 is the split for the casual target, x2/y2 the split for the registered target;
# both calls share the same features, test size and seed.
split_options = dict(test_size=0.15, random_state=42)
x1_train, x1_test, y1_train, y1_test = train_test_split(df_train_x, df_cas_train_y, **split_options)
x2_train, x2_test, y2_train, y2_test = train_test_split(df_train_x, df_reg_train_y, **split_options)

### Using polynomial on the dataset

In [None]:
# Degree-2 polynomial expansion of the features for the polynomial linear models.
# The transformer is fitted on training data only and then applied to the held-out
# data with transform(); a preprocessing step should never be re-fitted on test data.
poly = PolynomialFeatures(degree=2)
poly_x1_train = poly.fit_transform(x1_train)
poly_x1_test = poly.transform(x1_test)
poly_x2_train = poly.fit_transform(x2_train)
poly_x2_test = poly.transform(x2_test)

### Trying different models to see which one works best for the given data.

In [None]:
# Base estimator for the grid search; seeded so repeated runs give the same forests.
rf = RandomForestRegressor(random_state=42)

# Hyper-parameter grid explored by GridSearchCV.
parameters = {'n_estimators':[50,100,150,200,250],
              'min_impurity_decrease':[0.0,0.001,0.01],
              'max_depth':[20,40,60,80,100]}

# Labels printed next to each score. They must line up, in order, with the
# prediction lists built below: plain linear regression, linear regression on
# polynomial features, and the tuned random forest.
models = ['Normal Linear Regression: ','Linear Regression over polynomial: ',
          'Random Forest Regressor: ']

### Defining a custom scorer function for the models

In [None]:
def custom_scorer(y_true, y_pred):
    """Root mean squared logarithmic error with negative predictions replaced by 1.

    `mean_squared_log_error` rejects negative values, so every negative
    prediction is replaced by 1 before the error is computed.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values. The input is not modified.

    Returns
    -------
    float
        Square root of the mean squared logarithmic error.
    """
    # Work on a new array: the caller's predictions stay untouched, and a pandas
    # Series with a non-positional index no longer breaks element access.
    predictions = np.asarray(y_pred)
    non_negative = np.where(predictions < 0, 1, predictions)
    return np.sqrt(mean_squared_log_error(y_true, non_negative))


# greater_is_better=False tells scikit-learn this is a loss (lower is better).
scorer = make_scorer(custom_scorer, greater_is_better=False)

Here I have defined a scorer function as we are using mean squared log loss which does not work on negative values and the models were sometimes predicting negative values which messes with the scores and since we know that these values will always be positive we just replace them with the minimum value in the train set which is 1.

In [None]:
# Candidate models fitted on the x1/y1 split. NOTE: x1/y1 was built from the
# *casual* target, despite the `reg` in these variable names.
reg = LinearRegression().fit(x1_train, y1_train)
reg_poly = LinearRegression().fit(poly_x1_train, y1_train)
rf_reg = GridSearchCV(rf, parameters, scoring=scorer, cv=5, n_jobs=-1, verbose=2)
rf_reg.fit(x1_train, y1_train)

# Held-out predictions of each candidate.
pre_reg = reg.predict(x1_test)
pre_reg_poly = reg_poly.predict(poly_x1_test)
pre_rf_reg = rf_reg.predict(x1_test)

# Same order as the labels in `models`.
predict = [pre_reg, pre_reg_poly, pre_rf_reg]

In [None]:
# RMSLE of each candidate on the x1/y1 hold-out set; predictions below 1 are raised to 1.
for label, preds in zip(models, predict):
    floored = [1 if p < 1 else p for p in preds]
    print(label + str(np.sqrt(mean_squared_log_error(y1_test, floored))))

In [None]:
# Candidate models fitted on the x2/y2 split. NOTE: x2/y2 was built from the
# *registered* target, despite the `cas` in these variable names.
cas = LinearRegression().fit(x2_train, y2_train)
cas_poly = LinearRegression().fit(poly_x2_train, y2_train)
rf_cas = GridSearchCV(rf, parameters, scoring=scorer, cv=5, n_jobs=-1, verbose=2)
rf_cas.fit(x2_train, y2_train)

# Held-out predictions of each candidate.
pre_cas = cas.predict(x2_test)
pre_cas_poly = cas_poly.predict(poly_x2_test)
pre_rf_cas = rf_cas.predict(x2_test)

# Same order as the labels in `models`.
predict = [pre_cas, pre_cas_poly, pre_rf_cas]

In [None]:
# RMSLE of each candidate on the x2/y2 hold-out set; predictions below 1 are raised to 1.
for label, preds in zip(models, predict):
    floored = [1 if p < 1 else p for p in preds]
    print(label + str(np.sqrt(mean_squared_log_error(y2_test, floored))))

In [None]:
# Report the tuned hyper-parameters per target.
# `rf_cas` was fitted on the x2/y2 split (registered target) and `rf_reg` on the
# x1/y1 split (casual target), so each label is paired with the search that was
# actually run for that target, not with the variable's misleading name.
print("For Random Forest Model: ")
print("\t Best Parametres for registered are: ",end='')
print(rf_cas.best_params_)
print("\t Best Parametres for casual are: ",end = '')
print(rf_reg.best_params_)

In [None]:
# Final comparison on the x1/y1 (casual) split, using fixed random-forest settings.
reg1 = LinearRegression().fit(x1_train, y1_train)
reg1_poly = LinearRegression().fit(poly_x1_train, y1_train)
rf1 = RandomForestRegressor(n_estimators=200, max_depth=80, min_impurity_decrease=0.001).fit(x1_train, y1_train)

# Held-out predictions, with everything below 1 raised to 1.
pre_reg1 = np.maximum(reg1.predict(x1_test), 1)
pre_reg1_poly = np.maximum(reg1_poly.predict(poly_x1_test), 1)
pre_rf1 = np.maximum(rf1.predict(x1_test), 1)

predict1 = [pre_reg1, pre_reg1_poly, pre_rf1]

# Hold-out features together with the true value, each model's prediction and
# the residuals (true - predicted) of the two linear models.
x1_final = x1_test.assign(
    Output=y1_test,
    Lin_reg=pre_reg1,
    Lin_reg_poly=pre_reg1_poly,
    RF_reg=pre_rf1,
    Resid=y1_test - pre_reg1,
    Resid_poly=y1_test - pre_reg1_poly,
)

# RMSLE per model, in the order: linear, polynomial linear, random forest.
for preds in predict1:
    print(np.sqrt(mean_squared_log_error(y1_test, preds)))

Since we know that the output is never less than 1, we replace all predictions below 1 (including negative ones) with 1 before calculating the error.

In [None]:
# Final comparison on the x2/y2 (registered) split, using fixed random-forest settings.
reg2 = LinearRegression().fit(x2_train, y2_train)
reg2_poly = LinearRegression().fit(poly_x2_train, y2_train)
rf2 = RandomForestRegressor(n_estimators=150, max_depth=60, min_impurity_decrease=0.0).fit(x2_train, y2_train)

# Held-out predictions, with everything below 1 raised to 1.
pre_reg2 = np.maximum(reg2.predict(x2_test), 1)
pre_reg2_poly = np.maximum(reg2_poly.predict(poly_x2_test), 1)
pre_rf2 = np.maximum(rf2.predict(x2_test), 1)

predict2 = [pre_reg2, pre_reg2_poly, pre_rf2]

# Hold-out features together with the true value, each model's prediction and
# the residuals (true - predicted) of the two linear models.
x2_final = x2_test.assign(
    Output=y2_test,
    Lin_reg=pre_reg2,
    Lin_reg_poly=pre_reg2_poly,
    RF_reg=pre_rf2,
    Resid=y2_test - pre_reg2,
    Resid_poly=y2_test - pre_reg2_poly,
)

# RMSLE per model, in the order: linear, polynomial linear, random forest.
for preds in predict2:
    print(np.sqrt(mean_squared_log_error(y2_test, preds)))

## Plotting the residual plots

In [None]:
# Residual plots (true - predicted, against the predicted value) for the two
# linear models on each target.
# Hover names are sized from the frames themselves instead of a hard-coded 1633,
# so the cell keeps working if the data or the hold-out fraction changes.
# (`make_subplots` is already imported in the import cell at the top.)
name1  = ['Residual for casual without polynomial features'] * len(x1_final)
name2  = ['Residual for casual with polynomial features'] * len(x1_final)
name3  = ['Residual for registered without polynomial features'] * len(x2_final)
name4  = ['Residual for registered with polynomial features'] * len(x2_final)
dic = {'Lin_reg': 'Predicted Output','Resid':'Deviation from predicted','Output':'Expected Output','Lin_reg_poly': 'Predicted Output',
       'Resid_poly':'Deviation from predicted'}
fig1 = px.scatter(data_frame = x1_final,x = 'Lin_reg', y = 'Resid',hover_data = ['Output'],labels = dic,hover_name = name1,
                  color_discrete_sequence = ['red'])
fig2 = px.scatter(data_frame = x1_final,x = 'Lin_reg_poly', y = 'Resid_poly',hover_data = ['Output'],labels = dic,hover_name = name2,
                  color_discrete_sequence = ['blue'])
fig3 = px.scatter(data_frame = x2_final,x = 'Lin_reg', y = 'Resid',hover_data = ['Output'],labels = dic,hover_name = name3,
                  color_discrete_sequence = ['darkgreen'])
fig4 = px.scatter(data_frame = x2_final,x = 'Lin_reg_poly', y = 'Resid_poly',hover_data = ['Output'],labels = dic,hover_name = name4,
                  color_discrete_sequence = ['gold'])
trace1 = fig1['data'][0]
trace2 = fig2['data'][0]
trace3 = fig3['data'][0]
trace4 = fig4['data'][0]

# Titles follow the trace placement below: each row is one target
# (casual, then registered) and each column one feature set
# (plain features, then polynomial features).
fig = make_subplots(rows=2, cols=2,horizontal_spacing =0.1,vertical_spacing  = 0.2,
                    row_titles = ['Casual','Registered'],column_titles = ['Without Polynomial','Using Polynomial'],
                    x_title = 'Residual plots for Registered and Casual under different models (Hover for more details)')

fig.add_trace(trace1, row=1, col=1)  # casual, plain features
fig.add_trace(trace2, row=1, col=2)  # casual, polynomial features
fig.add_trace(trace3, row=2, col=1)  # registered, plain features
fig.add_trace(trace4, row=2, col=2)  # registered, polynomial features
fig.show()

**Since the residual plots show a conical divergence, we can conclude that linear regression is not a suitable model for predicting this distribution of data.**

Clearly the random forest model works best in this situation.

### Retraining the random forests over the whole training dataset for submission.

In [None]:
# Refit both forests on every labelled row: rf1 predicts casual, rf2 predicts registered.
rf1 = RandomForestRegressor(n_estimators=200, max_depth=80, min_impurity_decrease=0.001)
rf1.fit(df_train_x, df_cas_train_y)

rf2 = RandomForestRegressor(n_estimators=150, max_depth=60, min_impurity_decrease=0.0)
rf2.fit(df_train_x, df_reg_train_y)

### Reading the test file

In [None]:
# Load the competition's test data (rows to predict).
test_file = pd.read_csv('/kaggle/input/bike-sharing-demand/test.csv')

In [None]:
# Work on a copy so the feature engineering below does not silently add columns
# to the raw `test_file` frame (a plain assignment would only alias it).
test = test_file.copy()
test.describe()

## Processing of the test file

In [None]:
# Derive the calendar parts of each timestamp (parsed once).
timestamps = pd.DatetimeIndex(test['datetime'])
test['mth'] = timestamps.month
test['yr'] = timestamps.year
test['dy'] = timestamps.day
test['hr'] = timestamps.hour

# 0/1 indicator columns. They are generated for the full, fixed range of each
# category rather than for the values that happen to occur in the test data, so
# every column selected in the next cell exists even if a category is absent.
# Each entry: (source column, indicator-name prefix, category values).
indicator_specs = [
    ('season', 's', range(1, 5)),
    ('weather', 'w', range(1, 5)),
    ('hr', 'hr', range(24)),
    ('mth', 'm', range(1, 13)),
]
for source_col, prefix, values in indicator_specs:
    for value in values:
        test[prefix + str(value)] = (test[source_col] == value).astype('int64')
test.sample(10)

In [None]:
# Keep only the model inputs, in the same order as the training features:
# hour indicators, month indicators, year, season indicators, day flags,
# weather indicators, temperature, humidity.
test = test[['hr0','hr1','hr2','hr3','hr4','hr5','hr6','hr7','hr8','hr9','hr10','hr11','hr12','hr13','hr14','hr15','hr16','hr17','hr18',
                 'hr19','hr20','hr21','hr22','hr23','m1','m2','m3','m4','m5','m6','m7','m8','m9','m10','m11','m12','yr',
                 's1','s2','s3','s4','holiday','workingday','w1','w2','w3','w4','temp','humidity']]
# `describe` has to be called; the bare attribute only displays the bound-method repr.
test.describe()

## Predicting the output over test set

In [None]:
# The forests were fitted on a DataFrame whose columns are named Hour0..Hour23,
# Month1..Month12 and Year, while the test frame uses hr*, m* and yr. Recent
# scikit-learn versions reject a predict() input whose feature names differ from
# those seen in fit(), so the test columns are renamed to the training names first
# (the column order is already the same).
to_training_name = {'yr': 'Year'}
to_training_name.update({'hr' + str(h): 'Hour' + str(h) for h in range(24)})
to_training_name.update({'m' + str(m): 'Month' + str(m) for m in range(1, 13)})
test_features = test.rename(columns=to_training_name)

# rf1 predicts casual rentals and rf2 registered rentals; their sum is the total.
pre_cas = rf1.predict(test_features)
pre_reg = rf2.predict(test_features)
final_predictions = pd.DataFrame(pre_cas+pre_reg,columns = ['cout'])

# `describe` has to be called; the bare attribute only displays the bound-method repr.
final_predictions.describe()

In [None]:
# Final 'count' column: the raw prediction, raised to 1 wherever it is below 1.
final_predictions['count'] = [1 if int(raw) < 1 else raw for raw in final_predictions.cout]

**Since we know that the output is never less than 1, we replace all predictions below 1 (including negative ones) with 1.**

In [None]:
# `describe` has to be called; the bare attribute only displays the bound-method repr.
final_predictions.describe()

In [None]:
# Attach the test timestamps and keep only the two columns written to the submission file.
final_predictions = final_predictions.assign(datetime=test_file['datetime'])[['datetime', 'count']]

In [None]:
# Summary statistics of the submission frame as a final sanity check.
final_predictions.describe()

## Exporting output to csv

In [None]:
# Write the submission file to the working directory, without the row index.
final_predictions.to_csv('submission.csv',index=False)