In [29]:
import numpy as np
import pandas as pd
import math

import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [30]:
pd.set_option("display.max_columns", None)
plt.style.use('seaborn-v0_8')
plt.rcParams["font.weight"] = "bold"
plt.rcParams["axes.labelweight"] = "bold"
plt.rcParams["axes.titlesize"] = 25
plt.rcParams["axes.titleweight"] = 'bold'
plt.rcParams['xtick.labelsize']=15
plt.rcParams['ytick.labelsize']=15
plt.rcParams["axes.labelsize"] = 20
plt.rcParams["legend.fontsize"] = 15
plt.rcParams["legend.title_fontsize"] = 15

In [31]:
data_path = r"D:\RideWise-Predicting-Bike-sharing-Demand\Data\day.csv"

In [32]:
df = pd.read_csv(r"D:\RideWise-Predicting-Bike-sharing-Demand\Data\day.csv", encoding= 'unicode_escape')

In [33]:
df.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


In [34]:
# Drop unwanted columns
columns_to_drop = ['instant', 'casual', 'registered','dteday','atemp']
df = df.drop(columns=columns_to_drop)
print(f"Unwanted columns are: {columns_to_drop}")

Unwanted columns are: ['instant', 'casual', 'registered', 'dteday', 'atemp']


In [35]:
# Create weekend column (1 if weekend, 0 if not weekend)
df['weekend'] = ((df['weekday'] == 0) | (df['weekday'] == 6)).astype(int)
print("Created weekend column (1 for weekend, 0 for weekday)")

Created weekend column (1 for weekend, 0 for weekday)


In [37]:
df.head()

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,hum,windspeed,cnt,weekend
0,1,0,1,0,6,0,2,0.344167,0.805833,0.160446,985,1
1,1,0,1,0,0,0,2,0.363478,0.696087,0.248539,801,1
2,1,0,1,0,1,1,1,0.196364,0.437273,0.248309,1349,0
3,1,0,1,0,2,1,1,0.2,0.590435,0.160296,1562,0
4,1,0,1,0,3,1,1,0.226957,0.436957,0.1869,1600,0


## In All Model testing we have observed , that the accuarcy of Random Forest Regressor ,Gardient boosting Regression and Gradient Boosting Regression(GridSearchCV) was better than the remaining models . So now we will try to Increase their accuracy by the implementation of Interactive features in our data and Retrain our model on that new data .

In [38]:
# Creation of Interactive features :- 

df['temp_hum'] = df['temp'] * df['hum']
df['temp_windspeed'] = df['temp'] * df['windspeed']

print("Created Interactive features 'temp_hum' ,'temp_windspeed'")

Created Interactive features 'temp_hum' ,'temp_windspeed'


In [39]:
df.head()

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,hum,windspeed,cnt,weekend,temp_hum,temp_windspeed
0,1,0,1,0,6,0,2,0.344167,0.805833,0.160446,985,1,0.277341,0.05522
1,1,0,1,0,0,0,2,0.363478,0.696087,0.248539,801,1,0.253012,0.090338
2,1,0,1,0,1,1,1,0.196364,0.437273,0.248309,1349,0,0.085865,0.048759
3,1,0,1,0,2,1,1,0.2,0.590435,0.160296,1562,0,0.118087,0.032059
4,1,0,1,0,3,1,1,0.226957,0.436957,0.1869,1600,0,0.09917,0.042418


In [40]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV 
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [41]:
# Creating a dependent variable for rented bike count
dependent_variable = 'cnt'

In [42]:
# Create a list of independent variables
independent_variables = list(set(df.columns.tolist()) - {dependent_variable})
independent_variables

['windspeed',
 'yr',
 'season',
 'temp_windspeed',
 'temp_hum',
 'temp',
 'weekend',
 'weekday',
 'workingday',
 'weathersit',
 'holiday',
 'hum',
 'mnth']

In [43]:
# Create the dependent variable data
y = np.sqrt(df['cnt'])

# Create the data of independent variables
X = df.drop('cnt',axis=1)

In [44]:
# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [45]:
X_train[0:10]

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,hum,windspeed,weekend,temp_hum,temp_windspeed
97,2,0,4,0,5,1,2,0.335833,0.83625,0.226992,0,0.28084,0.076231
503,2,1,5,0,5,1,1,0.564167,0.523333,0.136817,0,0.295247,0.077188
642,4,1,10,0,4,1,2,0.6575,0.722917,0.117546,0,0.475318,0.077286
498,2,1,5,0,0,0,1,0.6125,0.57625,0.225117,1,0.352953,0.137884
303,4,0,10,0,1,1,1,0.34,0.703333,0.10635,0,0.239133,0.036159
570,3,1,7,0,2,1,1,0.750833,0.655,0.211454,0,0.491796,0.158767
250,3,0,9,0,4,1,3,0.633913,0.939565,0.192748,0,0.595602,0.122185
17,1,0,1,0,2,1,2,0.216667,0.861667,0.146775,0,0.186695,0.031801
595,3,1,8,0,6,0,1,0.678333,0.603333,0.177867,1,0.409261,0.120653
240,3,0,8,0,1,1,1,0.636667,0.554583,0.159825,0,0.353085,0.101755


In [46]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [47]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(584, 13)
(584,)
(147, 13)
(147,)


In [48]:
X_train[0:10]

array([[0.33333333, 0.        , 0.27272727, 0.        , 0.83333333,
        1.        , 0.5       , 0.35024023, 0.80442593, 0.48605788,
        0.        , 0.44966876, 0.26936801],
       [0.33333333, 1.        , 0.36363636, 0.        , 0.83333333,
        1.        , 0.        , 0.63925722, 0.35526299, 0.24873701,
        0.        , 0.47485777, 0.27326701],
       [1.        , 1.        , 0.81818182, 0.        , 0.66666667,
        1.        , 0.5       , 0.75739478, 0.64174706, 0.19801995,
        0.        , 0.78969407, 0.2736701 ],
       [0.33333333, 1.        , 0.36363636, 0.        , 0.        ,
        0.        , 0.        , 0.7004354 , 0.43122037, 0.48112329,
        1.        , 0.57575101, 0.52075448],
       [1.        , 0.        , 0.81818182, 0.        , 0.16666667,
        1.        , 0.        , 0.35551467, 0.61363607, 0.16855453,
        0.        , 0.37674787, 0.10597451],
       [0.66666667, 1.        , 0.54545455, 0.        , 0.33333333,
        1.        , 0.     

### Random Forest Regressor

In [49]:
rf = RandomForestRegressor()
rf.fit(X_train,y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [52]:
rf.score(X_test,y_test)

0.8715056600548468

In [55]:
y_pred_train = rf.predict(X_train)
y_pred = rf.predict(X_test)

In [56]:
MAE = mean_absolute_error(y_test,y_pred)
print(f"The Mean Absolute Error (MAE) is {MAE}.")

MSE = mean_squared_error(y_test,y_pred)
print(f"The Mean Squred Error(MSE) is {MSE}.")

RMSE = np.sqrt(MSE)
print(f"The Root Mean Squared Error(RMSE) is {RMSE}.")

R2 = r2_score(y_test,y_pred)
print(f"The R2 Score is {R2}.")

adj_r2 = 1-(1-r2_score(y_test,y_pred))*((X_test.shape[0]-1)/(X_test.shape[0]-X_test.shape[1]-1))
print(f"Adjusted R2 is {adj_r2}.")

The Mean Absolute Error (MAE) is 4.164387571319309.
The Mean Squred Error(MSE) is 36.49959908780042.
The Root Mean Squared Error(RMSE) is 6.041489806976457.
The R2 Score is 0.8715056600548468.
Adjusted R2 is 0.8589460629173505.


In [57]:
test_dict = {'Model':"Random Forest Regressor",
              'MAE':round(MAE,4),
              'MSE':round(MSE,4),
              'RMSE':round(RMSE,4),
              'R2_score':round(R2,4),
              'Adjusted_R2':round(adj_r2,4)}

In [60]:
test_results_df = pd.DataFrame(test_dict,index=[0])
test_results_df

Unnamed: 0,Model,MAE,MSE,RMSE,R2_score,Adjusted_R2
0,Random Forest Regressor,4.1644,36.4996,6.0415,0.8715,0.8589


### Gradient Boosting

In [61]:
gradboo_reg = GradientBoostingRegressor()
gradboo_reg.fit(X_train,y_train)

0,1,2
,loss,'squared_error'
,learning_rate,0.1
,n_estimators,100
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


In [62]:
gradboo_reg.score(X_test,y_test)

0.8820788140641467

In [63]:
y_pred_train = gradboo_reg.predict(X_train)
y_pred = gradboo_reg.predict(X_test)

In [64]:
MAE = mean_absolute_error(y_test,y_pred)
print(f"The Mean Absolute Error (MAE) is {MAE}.")

MSE = mean_squared_error(y_test,y_pred)
print(f"The Mean Squred Error(MSE) is {MSE}.")

RMSE = np.sqrt(MSE)
print(f"The Root Mean Squared Error(RMSE) is {RMSE}.")

R2 = r2_score(y_test,y_pred)
print(f"The R2 Score is {R2}.")

adj_r2 = 1-(1-r2_score(y_test,y_pred))*((X_test.shape[0]-1)/(X_test.shape[0]-X_test.shape[1]-1))
print(f"Adjusted R2 is {adj_r2}.")

The Mean Absolute Error (MAE) is 4.153416891597129.
The Mean Squred Error(MSE) is 33.49623035889189.
The Root Mean Squared Error(RMSE) is 5.787592794840692.
The R2 Score is 0.8820788140641467.
Adjusted R2 is 0.8705526831080106.


In [65]:
test_dict = {'Model':"Gardient boosting Regression",
              'MAE':round(MAE,4),
              'MSE':round(MSE,4),
              'RMSE':round(RMSE,4),
              'R2_score':round(R2,4),
              'Adjusted_R2':round(adj_r2,4)}

In [66]:
import pandas as pd

test_results_df = pd.concat([test_results_df, pd.DataFrame([test_dict])], ignore_index=True)

test_results_df


Unnamed: 0,Model,MAE,MSE,RMSE,R2_score,Adjusted_R2
0,Random Forest Regressor,4.1644,36.4996,6.0415,0.8715,0.8589
1,Gardient boosting Regression,4.1534,33.4962,5.7876,0.8821,0.8706


### GridSearchCV on Gradient Boosting Regressor

In [67]:
gbr = GradientBoostingRegressor()

In [68]:
parameters = {'learning_rate': [0.01,0.02,0.03,0.04],
              'subsample'    : [0.9, 0.5, 0.2, 0.1],
              'n_estimators' : [50,80,100,150],
               'max_depth'    : [4,6,8,10]}

In [69]:
grad_boo_tuning = GridSearchCV(gbr,param_grid=parameters,cv=4,n_jobs=1)
grad_boo_tuning.fit(X_train,y_train)

0,1,2
,estimator,GradientBoostingRegressor()
,param_grid,"{'learning_rate': [0.01, 0.02, ...], 'max_depth': [4, 6, ...], 'n_estimators': [50, 80, ...], 'subsample': [0.9, 0.5, ...]}"
,scoring,
,n_jobs,1
,refit,True
,cv,4
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,loss,'squared_error'
,learning_rate,0.04
,n_estimators,150
,subsample,0.2
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,10
,min_impurity_decrease,0.0


In [70]:
print(f'The best parameter is {grad_boo_tuning.best_params_}')
print(f'The best negative mean squred error value is {grad_boo_tuning.best_score_}')

The best parameter is {'learning_rate': 0.04, 'max_depth': 10, 'n_estimators': 150, 'subsample': 0.2}
The best negative mean squred error value is 0.8702252886763573


In [71]:
grad_tuned = GradientBoostingRegressor(learning_rate=0.04,max_depth=10,n_estimators=150,subsample=0.5)
grad_tuned.fit(X_train,y_train)

0,1,2
,loss,'squared_error'
,learning_rate,0.04
,n_estimators,150
,subsample,0.5
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,10
,min_impurity_decrease,0.0


In [72]:
grad_tuned.score(X_test,y_test)

0.8811886560665059

In [73]:
y_pred_train = grad_tuned.predict(X_train)
y_pred = grad_tuned.predict(X_test)

In [74]:
MAE = mean_absolute_error(y_test,y_pred)
print(f"The Mean Absolute Error (MAE) is {MAE}.")

MSE = mean_squared_error(y_test,y_pred)
print(f"The Mean Squred Error(MSE) is {MSE}.")

RMSE = np.sqrt(MSE)
print(f"The Root Mean Squared Error(RMSE) is {RMSE}.")

R2 = r2_score(y_test,y_pred)
print(f"The R2 Score is {R2}.")

adj_r2 = 1-(1-r2_score(y_test,y_pred))*((X_test.shape[0]-1)/(X_test.shape[0]-X_test.shape[1]-1))
print(f"Adjusted R2 is {adj_r2}.")

The Mean Absolute Error (MAE) is 3.8864069697895913.
The Mean Squred Error(MSE) is 33.749085154305874.
The Root Mean Squared Error(RMSE) is 5.8093962813967055.
The R2 Score is 0.8811886560665059.
Adjusted R2 is 0.8695755171857884.


In [75]:
test_dict = {'Model':"Gradient Boosting Regression(GridSearchCV)",
              'MAE':round(MAE,4),
              'MSE':round(MSE,4),
              'RMSE':round(RMSE,4),
              'R2_score':round(R2,4),
              'Adjusted_R2':round(adj_r2,4)}

In [76]:
import pandas as pd

test_results_df = pd.concat([test_results_df, pd.DataFrame([test_dict])], ignore_index=True)

test_results_df


Unnamed: 0,Model,MAE,MSE,RMSE,R2_score,Adjusted_R2
0,Random Forest Regressor,4.1644,36.4996,6.0415,0.8715,0.8589
1,Gardient boosting Regression,4.1534,33.4962,5.7876,0.8821,0.8706
2,Gradient Boosting Regression(GridSearchCV),3.8864,33.7491,5.8094,0.8812,0.8696
