In [1]:
!pip install jovian --upgrade --quiet

<br><font size='4'>Importing required libraries.</font>

In [2]:
import math
import numpy as np 
import pandas as pd
import matplotlib as mpl
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import r2_score,mean_squared_error

%matplotlib inline

<br><font size='4'>Reading the dataset using pandas function ```read_csv()```.</font>

In [3]:
from sklearn.utils import shuffle  # NOTE(review): not used in the cells visible here — confirm before removing

# Load dataset
data = pd.read_csv("output.csv")
# Shuffle all rows (frac=1 keeps every row). A fixed seed makes the row order,
# and therefore everything downstream, reproducible on Restart & Run All.
data = data.sample(frac=1, random_state=69)
data

Unnamed: 0,Station Name,Day,Time,Population,Weather
335581,Surajmal Stadium,TUESDAY,2:00,150,Cool
355649,Vaishali,SATURDAY,6:00,132,Cool
369547,Udyog Bhawan,WEDNESDAY,12:00,89,Cool
226062,Model Town,SUNDAY,11:00,52,ThunderStorm
205780,Mayur Vihar -I,THURSDAY,1:00,70,Sunny
...,...,...,...,...,...
9621,AIIMS,SUNDAY,5:00,699,Cool
355630,Vaishali,FRIDAY,10:00,309,Sunny
290510,Preet Vihar,SUNDAY,10:00,53,Sunny
234013,Netaji Subhash Place,MONDAY,8:00,559,Cool


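<br><font size='4'>Using ```pd.Categorical```, we convert the text columns (```Weather```, ```Day```, ```Station Name```) into integer category codes. We also keep only the hour part of ```Time``` and convert ```Population``` to an integer.</font>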

In [4]:
def try_int(x):
    """Convert ``x`` to an ``int``, returning ``None`` when that is not possible.

    Parameters
    ----------
    x : object
        Value to convert (e.g. a numeric string, an int or a float).

    Returns
    -------
    int or None
        ``int(x)`` on success; ``None`` if ``x`` cannot be interpreted as an
        integer (non-numeric text, ``None``, NaN or infinity).
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported types such as None.
        # ValueError: non-numeric strings and NaN.
        # OverflowError: +/- infinity.
        return None

# Encode the text columns as integer category codes.
data["Weather"] = pd.Categorical(data["Weather"]).codes
data["Day"] = pd.Categorical(data["Day"]).codes
data["Station Name"] = pd.Categorical(data["Station Name"]).codes

# Keep only the hour part of "H:MM". Going through str() means a non-string
# entry (a missing value, or an hour that is already an int because this cell
# was re-run) is handled by try_int instead of raising AttributeError on .split.
data["Time"] = data["Time"].map(lambda x: try_int(str(x).split(":")[0]))
data["Population"] = data["Population"].map(try_int)

# Summary statistics of the now fully numeric frame.
data.describe()

Unnamed: 0,Station Name,Day,Time,Population,Weather
count,382690.0,382690.0,382690.0,382690.0,382690.0
mean,35.0,3.0,6.272727,468.745248,1.024027
std,20.493928,2.000003,3.518551,433.622396,1.014112
min,0.0,0.0,1.0,15.0,0.0
25%,17.0,1.0,3.0,108.0,0.0
50%,35.0,3.0,6.0,311.0,1.0
75%,53.0,5.0,10.0,725.0,2.0
max,70.0,6.0,12.0,1630.0,3.0


<br><font size='4'>Now dividing the dataset into independent variables, also known as ```Features```, and dependent variables, known as ```Targets```.</font>

In [5]:
# Features: the four encoded predictor columns as a 2-D array.
X = data.loc[:, ["Station Name", "Day", "Time", "Weather"]].to_numpy()
# Targets: selecting with a list keeps the result 2-D (one column).
y = data.loc[:, ["Population"]].to_numpy()

<br><font size='4'>Replacing all the NaN (Not a Number) values in the dataset with zero, and all infinite values with very large finite numbers.</font>

In [6]:
# np.nan_to_num returns a copy in which NaN becomes 0 and +/-inf become the
# largest/smallest finite values of the array's dtype.
X, y = np.nan_to_num(X), np.nan_to_num(y)

<br><font size='4'>```preprocessing.scale()``` is used for standardizing the data along axis 0, i.e. each column (feature) independently. It centers every column to zero mean and scales it to unit variance.</font>

In [7]:
# Standardize the features and the target separately, column by column.
norm_x, norm_y = preprocessing.scale(X), preprocessing.scale(y)
norm_x

array([[ 1.3662601 ,  1.        , -1.21434461, -1.00977882],
       [ 1.51264511, -0.5       , -0.07751136, -1.00977882],
       [ 1.46385011,  1.5       ,  1.62773853, -1.00977882],
       ...,
       [ 0.87831007,  0.        ,  1.0593219 ,  0.9623929 ],
       [ 0.39036003, -1.        ,  0.49090527, -1.00977882],
       [ 0.24397502,  1.        , -1.21434461,  0.9623929 ]])

<br><font size='4'>Dividing our dataset into a training set and a testing set. By specifying ```test_size=0.3``` we are using 30% of the dataset as the testing set and the remaining 70% as the training set. The default ```test_size``` is 0.25.</font>

In [8]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    norm_x,
    norm_y,
    test_size=0.3,
    random_state=69,
)
# Show the sizes of both partitions, one per line.
print(X_train.shape, X_test.shape, sep="\n")

(267883, 4)
(114807, 4)


<br><font size='4'>Finally training the decision tree algorithm on our dataset. Scikit-Learn contains the ```tree``` library, which contains built-in classes/methods for various decision tree algorithms.</font>

In [9]:
from sklearn.tree import DecisionTreeRegressor

# Creating the decision tree regressor object. Despite the variable name this
# is a regressor; the name is kept because later cells refer to it.
# random_state is fixed (same seed as the train/test split) because the tree
# builder permutes features at each split, so equally good splits can otherwise
# be chosen differently from run to run.
classifier = DecisionTreeRegressor(random_state=69)

# Training our dataset using decision tree object.
# X_train is already 2-D (n_samples, n_features), so no reshape is needed;
# ravel() flattens the single-column target to the 1-D shape fit() expects.
classifier.fit(X_train, y_train.ravel())

DecisionTreeRegressor()

<br><font size='4'>To make predictions, the ```predict``` method of the DecisionTreeRegressor class is used.</font>

In [10]:
# Predict the (standardized) target for every held-out row.
y_pred = classifier.predict(np.reshape(X_test, (-1, 4)))

<br><font size='4'>Finally evaluating how well our model fits the test data using the R² score (coefficient of determination), computed with ```r2_score``` from ```metrics```.</font>

In [11]:
# r2_score is the coefficient of determination (R^2), a goodness-of-fit measure
# for regression, not a classification accuracy, so the output is labelled so.
# y_pred is reshaped to a single column to match the 2-D y_test.
print("R^2 score:", r2_score(y_test, y_pred.reshape(-1, 1)))

Accuracy: 0.9838475141432419


In [None]:
# jovian is installed in the first cell but never imported in the notebook;
# import it here so this cell also works on a freshly restarted kernel.
import jovian

jovian.commit()

<IPython.core.display.Javascript object>

[jovian] Attempting to save notebook..[0m
