In [16]:
#Importing required libray
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import datetime as dt
import math
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.metrics import mean_absolute_error
import pickle

In [2]:
#Mouting the drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
#readinf the csv file
df=pd.read_csv('/content/drive/MyDrive/flask_delpoyment/SeoulBikeData.csv',encoding='latin')

In [4]:
# Preprocessing the data
df['Date'] = df['Date'].apply(lambda x: dt.datetime.strptime(x, "%d/%m/%Y"))
df['year'] = df['Date'].apply(lambda x: x.year)
df['month'] = df['Date'].apply(lambda x: x.month)
df['day'] = df['Date'].apply(lambda x: x.day)
df = df.drop(columns=['Date'], axis=1)
df['Rainfall(mm)'] = df['Rainfall(mm)'].fillna(df['Rainfall(mm)'].median())
df['Wind speed (m/s)'] = df['Wind speed (m/s)'].fillna(df['Wind speed (m/s)'].median())
df['Snowfall (cm)'] = df['Snowfall (cm)'].fillna(df['Snowfall (cm)'].median())
df['Solar Radiation (MJ/m2)'] = df['Solar Radiation (MJ/m2)'].fillna(df['Solar Radiation (MJ/m2)'].median())

In [5]:
# Encoding for Seasons column
df = pd.get_dummies(df, columns=['Seasons'], drop_first=True)

In [6]:
# Encoding for Functioning Day column
df['Functioning Day'] = df['Functioning Day'].map({'Yes': 1, 'No': 0})

In [7]:
# Encoding for Holiday column
df['Holiday'] = df['Holiday'].map({'Holiday': 1, 'No Holiday': 0})

In [8]:
X = df.drop(columns=['Rented Bike Count'], axis=1)
y = df['Rented Bike Count']

In [9]:
X

Unnamed: 0,Hour,Temperature(è),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(è),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Holiday,Functioning Day,year,month,day,Seasons_Spring,Seasons_Summer,Seasons_Winter
0,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,0,1,2017,12,1,0,0,1
1,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,0,1,2017,12,1,0,0,1
2,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,0,1,2017,12,1,0,0,1
3,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,0,1,2017,12,1,0,0,1
4,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,0,1,2017,12,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,19,4.2,34,2.6,1894,-10.3,0.0,0.0,0.0,0,1,2018,11,30,0,0,0
8756,20,3.4,37,2.3,2000,-9.9,0.0,0.0,0.0,0,1,2018,11,30,0,0,0
8757,21,2.6,39,0.3,1968,-9.9,0.0,0.0,0.0,0,1,2018,11,30,0,0,0
8758,22,2.1,41,1.0,1859,-9.8,0.0,0.0,0.0,0,1,2018,11,30,0,0,0


In [10]:
X.shape

(8760, 17)

In [11]:
# Create train and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [18]:
# #creating paramdict to check random forest with diffirent value of parameter through gridsearch
rf_model= RandomForestRegressor()
n_estimators=[60,80,100]
max_depth=[15,20]
max_leaf_nodes=[40,60,80]
params = {'n_estimators':n_estimators,'max_depth':max_depth ,'max_leaf_nodes':max_leaf_nodes}

In [19]:
#Fit Model
rf_regressor= GridSearchCV(rf_model,param_grid=params,verbose=0)
rf_regressor=rf_regressor.fit(X_train,y_train)

In [20]:
optimal_rf_model=rf_regressor.best_estimator_

In [21]:
#Predict train & test values
y_pred_train_rf = optimal_rf_model.predict(X_train)
y_pred_test_rf = optimal_rf_model.predict(X_test)

In [22]:
#checking the score
print("The train score is :", round(optimal_rf_model.score(X_train, y_train),2))
print("The test score is :", round(optimal_rf_model.score(X_test, y_test),2))

The train score is : 0.89
The test score is : 0.85


In [23]:
print('Evaluation metrics for training data\n')
#calculate MSE
MSE_rf_train= mean_squared_error(y_train, y_pred_train_rf)
print("MSE :",MSE_rf_train)

#calculate RMSE
RMSE_rf_train=np.sqrt(MSE_rf_train)
print("RMSE :",RMSE_rf_train)

#calculate MAE
MAE_rf_train= mean_absolute_error(y_train, y_pred_train_rf)
print("MAE :",MAE_rf_train)

#calculate r2 and adjusted r2
R2_rf_train= r2_score(y_train,y_pred_train_rf)
print("R2 :",R2_rf_train)
Adjusted_R2_rf_train = (1-(1-r2_score(y_train, y_pred_train_rf))*((X_test.shape[0]-1)/(X_test.shape[0]-X_test.shape[1]-1)) )
print("Adjusted R2 :",Adjusted_R2_rf_train)

print('--'*80)

print('Evaluation metrics for testing data\n')

#calculate MSE
MSE_rf_test= mean_squared_error(y_test, y_pred_test_rf)
print("MSE :",MSE_rf_test)

#calculate RMSE
RMSE_rf_test=np.sqrt(MSE_rf_test)
print("RMSE :",RMSE_rf_test)

#calculate MAE
MAE_rf_test= mean_absolute_error(y_test, y_pred_test_rf)
print("MAE :",MAE_rf_test)

#calculate r2 and adjusted r2
R2_rf_test= r2_score(y_test, y_pred_test_rf)
print("R2 :",R2_rf_test)
Adjusted_R2_rf_test = (1-(1-r2_score(y_test, y_pred_test_rf))*((X_test.shape[0]-1)/(X_test.shape[0]-X_test.shape[1]-1)))
print("Adjusted R2 :",Adjusted_R2_rf_test)

Evaluation metrics for training data

MSE : 44154.477493160084
RMSE : 210.12966828403856
MAE : 148.6484642847124
R2 : 0.8935792597244314
Adjusted R2 : 0.892746316545479
----------------------------------------------------------------------------------------------------------------------------------------------------------------
Evaluation metrics for testing data

MSE : 64820.116435406344
RMSE : 254.5979505718896
MAE : 171.15216879361486
R2 : 0.8453277977011151
Adjusted R2 : 0.8441171957494203


In [25]:
# Saving model
pickle.dump(rf_regressor, open('model.pkl', 'wb'))