In [1]:
# Use the %pip magic so the package is installed into the environment of the
# running kernel rather than whichever `pip` happens to be first on PATH.
%pip install jovian --upgrade --quiet

<br><font size='4'>Importing required libraries.</font>

In [2]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,mean_squared_error
import pickle

<br><font size='4'>Reading the dataset using pandas function ```read_csv()```.</font>

In [3]:
# Load the raw dataset from the CSV file in the current working directory.
df = pd.read_csv(filepath_or_buffer="output.csv")

In [4]:
# Display the column labels of the loaded frame.
df.columns

Index(['Station Name', 'Day', 'Time', 'Population', 'Weather'], dtype='object')

In [5]:
# Show the first four rows, transposed so each original column becomes a row.
df.head(n=4).transpose()

Unnamed: 0,0,1,2,3
Station Name,Adarsh Nagar,Adarsh Nagar,Adarsh Nagar,Adarsh Nagar
Day,MONDAY,MONDAY,MONDAY,MONDAY
Time,10:00,11:00,12:00,1:00
Population,1193,1163,812,668
Weather,Rainy,Rainy,Rainy,ThunderStorm


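<br><font size='4'>Using ```pd.Categorical```, we convert the text columns ```Weather```, ```Day``` and ```Station Name``` into integer category codes. ```Time``` is reduced to its hour and ```Population``` is parsed as an integer.</font>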

In [6]:
def try_int(x):
    """Convert ``x`` to an ``int``, returning ``None`` when that is not possible.

    Parameters
    ----------
    x : object
        Value to convert, e.g. a numeric string such as ``"1193"``.

    Returns
    -------
    int or None
        ``int(x)`` on success, otherwise ``None``.
    """
    try:
        return int(x)
    except (ValueError, TypeError, OverflowError):
        # Best-effort conversion: ValueError covers non-numeric text and NaN,
        # TypeError covers None / unsupported types, OverflowError covers
        # infinities. All of them mean "no usable integer".
        return None

# Replace each categorical text column by the integer codes pd.Categorical
# assigns to its distinct values.
df["Weather"] = pd.Categorical(df["Weather"]).codes
df["Day"] = pd.Categorical(df["Day"]).codes
df["Station Name"] = pd.Categorical(df["Station Name"]).codes
# Keep only the hour part of an "H:MM" string. Non-string values (a missing
# NaN entry, or an hour already converted by a previous run of this cell) are
# handed to try_int directly instead of raising AttributeError on .split().
df["Time"] = df["Time"].map(
    lambda x: try_int(x.split(":")[0] if isinstance(x, str) else x)
)
# Parse the population counts; values try_int cannot convert become missing.
df["Population"] = df["Population"].map(try_int)

<br><font size='4'>Now dividing the dataset into independent variables, also known as ```Features```, and dependent variables, known as ```Targets```.</font>

In [7]:
# Feature matrix: the four encoded predictor columns, one row per observation.
x = df.loc[:, ["Station Name", "Day", "Time", "Weather"]].to_numpy()
# Target: selected with a one-element list so the array stays two-dimensional.
y = df.loc[:, ["Population"]].to_numpy()
print(x, y)

[[ 1  1 10  1]
 [ 1  1 11  1]
 [ 1  1 12  1]
 ...
 [70  3  6  2]
 [70  3  7  0]
 [70  3  8  0]] [[1193]
 [1163]
 [ 812]
 ...
 [ 477]
 [ 661]
 [1611]]


<br><font size='4'>Replacing all NaN (Not a Number) and infinite values in the dataset with finite numbers: ```np.nan_to_num()``` turns NaN into 0 and ±infinity into very large finite values.</font>

In [8]:
# np.nan_to_num substitutes NaN with 0 and +/-inf with very large finite numbers.
x, y = np.nan_to_num(x), np.nan_to_num(y)
# Sanity check on the features only: True would mean a NaN survived.
np.isnan(x).any()

False

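<br><font size='4'>```preprocessing.scale()``` standardizes the data along axis 0, i.e. column by column: each feature is centered to zero mean and scaled to unit variance. Note that the model below is trained on the unscaled ```x``` and ```y```; ```norm_x``` and ```norm_y``` are not used afterwards.</font>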

In [9]:
# Standardised copies of the features and the target.
# NOTE(review): norm_x / norm_y are not referenced by any later cell in this
# notebook; the split below uses the unscaled x and y.
norm_x, norm_y = preprocessing.scale(x), preprocessing.scale(y)

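<br><font size='4'>Dividing our dataset into a training set and a testing set. By specifying ```test_size=0.30``` we are using 30% of the dataset as the testing set and the remaining 70% as the training set. If neither ```test_size``` nor ```train_size``` is given, ```test_size``` defaults to 0.25.</font>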

In [10]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
(x_train, x_test, y_train, y_test) = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=69,
)

<br><font size='4'>A random forest regressor is a meta estimator that fits a number of decision tree regressors on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting. A few of its hyper-parameters are ```n_estimators```, which specifies the number of trees in the forest, and ```max_depth```, which specifies the maximum depth of each tree.</font>

In [11]:
# Forest of ten trees with a fixed seed.
RandomForesRegModel = RandomForestRegressor(n_estimators=10, random_state=69)
# Features are passed as (n_samples, 4); the target column is flattened to 1-D.
RandomForesRegModel.fit(
    x_train.reshape((-1, 4)),
    y_train.reshape(-1),
)

RandomForestRegressor(n_estimators=10, random_state=69)

<br><font size='4'>To make predictions, the ```predict``` method is used.</font>

In [12]:
# Predict the population for every held-out row (features shaped (n_samples, 4)).
y_pred = RandomForesRegModel.predict(x_test.reshape((-1, 4)))

In [13]:
# Show predictions at positions 1..9 (position 0 is skipped) as a column.
y_pred[1:10].reshape((-1, 1))

array([[ 158.62717949],
       [ 988.37661981],
       [ 828.00023919],
       [ 356.21379388],
       [ 271.26591052],
       [  64.28988866],
       [ 103.0854931 ],
       [ 444.38896856],
       [1615.81361305]])

In [14]:
# Show the true values at the same positions 1..9 for side-by-side comparison.
y_test[1:10].reshape((-1, 1))

array([[ 166],
       [ 884],
       [ 869],
       [ 364],
       [ 289],
       [  77],
       [  56],
       [ 459],
       [1616]])

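<br><font size='4'>Evaluating the model with the coefficient of determination R² (```r2_score```, printed below as a percentage under the label "Accuracy") and the Mean Squared Error (```mean_squared_error```).</font>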

In [15]:
# The figure printed under the "Accuracy" label is the R^2 score times 100.
# Predictions are reshaped to a column so they line up with y_test.
print('Accuracy: ', 100 * r2_score(y_true=y_test, y_pred=y_pred.reshape((-1, 1))))
print('Mean squared error: ', mean_squared_error(y_true=y_test, y_pred=y_pred.reshape((-1, 1))))

Accuracy:  98.36008534342926
Mean squared error:  3073.368723654206


<br><font size='4'>Saving the model with ```pickle```, then loading it back to check that it still works.</font>

In [16]:
# Path (relative to the working directory) the fitted model is written to.
PKL_filename = "RFR_Model.pkl"

# Serialise the model to bytes and write them out; binary mode is required
# for pickle data, and the with-block closes the file afterwards.
with open(PKL_filename, mode='wb') as file:
    file.write(pickle.dumps(RandomForesRegModel))

In [17]:
# Read the pickled bytes back and rebuild the model object.
# Unpickling can execute arbitrary code, so only load files you trust; this
# one was written by the previous cell.
with open(PKL_filename, mode='rb') as file:
    Loaded_Model = pickle.loads(file.read())
Loaded_Model

RandomForestRegressor(n_estimators=10, random_state=69)

In [18]:
# Score the reloaded model on the held-out data; matching the metric printed
# earlier confirms the save/load round trip preserved the model.
score = Loaded_Model.score(X=x_test.reshape((-1, 4)), y=y_test)
score

0.9836008534342926

In [None]:
# jovian is installed by the first cell but never imported in this notebook,
# so on a fresh kernel the call below would raise NameError. Import it here.
import jovian

jovian.commit()

<IPython.core.display.Javascript object>

[jovian] Attempting to save notebook..[0m
