In [None]:
# Importing important libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.tree import DecisionTreeRegressor
import warnings
# NOTE: this silences every warning for the rest of the session. It keeps the
# cell outputs tidy, but it also hides deprecation and data-quality warnings
# raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Read boston.csv into a DataFrame, using the file's first column as the row index
csv_path = 'boston.csv'
df = pd.read_csv(csv_path, index_col=0)

# Preview the first five rows
df.head(n=5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [None]:
# Separate the target column (MEDV) from the remaining predictor columns
target_col = 'MEDV'
y = df[target_col]
X = df.drop(columns=[target_col])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=5
)

In [None]:
# Baseline: a decision tree regressor with default (unconstrained) settings.
# random_state is pinned because scikit-learn randomly permutes the features at
# each split, so an unseeded tree can differ (and score differently) between
# runs of this notebook.
model = DecisionTreeRegressor(random_state = 5)

# Fit on the training split only; the test split stays unseen for evaluation
model.fit(X_train, y_train)

In [None]:
# Taking predictions from the model on both splits, so that the training and
# testing scores can be compared in the next cell
y_train_pred = model.predict(X_train)
y_pred = model.predict(X_test)

In [None]:
# Model evaluation: R2 score (rounded to 2 decimals) on the training data,
# then on the testing data
for split_name, actual, predicted in (
    ('Training', y_train, y_train_pred),
    ('Testing', y_test, y_pred),
):
    r2 = metrics.r2_score(actual, predicted)
    print(split_name + ' R2 Score :', np.round(r2, 2))

Training R2 Score : 1.0
Testing R2 Score : 0.61


In [None]:
# This is clearly an overfitted model: it fits the training data perfectly but scores much lower on the testing data

In [None]:
# Performing Hyper Parameter Optimization

In [None]:
# GridSearchCV()
from sklearn.model_selection import GridSearchCV
# Search space for the decision tree: each key is a DecisionTreeRegressor
# hyper-parameter and each value is the list of settings to try.
split_criteria = ['squared_error', 'friedman_mse', 'absolute_error', 'poisson']
params = dict(
    criterion=split_criteria,
    max_depth=list(range(1, 11)),          # 1..10
    min_samples_split=list(range(2, 11)),  # 2..10
    min_samples_leaf=list(range(1, 11)),   # 1..10
)

In [None]:
# Number of hyper-parameter combinations in the grid: the product of the number
# of values listed for each key of `params`. It is derived from `params` rather
# than typed by hand so it stays correct if the grid is edited.
n_candidates = 1
for values in params.values():
    n_candidates *= len(values)
n_candidates

3600

In [None]:
# Using GridSearchCV: every combination in `params` is scored with 5-fold
# cross-validation on the training split.
# - random_state pins the tree's internal feature shuffling, so the winning
#   combination is the same on every run of the notebook.
# - n_jobs = -1 spreads the (candidates x folds) fits over all CPU cores; it
#   changes only the wall-clock time, not the result.
model = DecisionTreeRegressor(random_state = 5)
grid_search = GridSearchCV(estimator = model, param_grid = params, cv = 5,
                           verbose = 1, n_jobs = -1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 3600 candidates, totalling 18000 fits


In [None]:
# Getting the best parameter combination found by the grid search
best_grid_params = grid_search.best_params_
print(best_grid_params)

{'criterion': 'poisson', 'max_depth': 6, 'min_samples_leaf': 1, 'min_samples_split': 4}


In [None]:
# Creating the Optimal Model
# The hyper-parameters are taken directly from the grid-search result instead
# of being copied by hand from the printed output, so this cell cannot drift
# out of sync when the search is re-run. random_state keeps the refit tree
# reproducible.
model = DecisionTreeRegressor(random_state = 5, **grid_search.best_params_)
model.fit(X_train, y_train)

In [None]:
# Taking predictions from the tuned model on both splits, so that the training
# and testing scores can be compared in the next cell
y_train_pred = model.predict(X_train)
y_pred = model.predict(X_test)

In [None]:
# Model evaluation on training data (R2, rounded to 2 decimals)
train_r2 = metrics.r2_score(y_train, y_train_pred)
print('Training R2 Score :', np.round(train_r2, 2))

# Model evaluation on testing data (R2, rounded to 2 decimals)
test_r2 = metrics.r2_score(y_test, y_pred)
print('Testing R2 Score :', np.round(test_r2, 2))

Training R2 Score : 0.96
Testing R2 Score : 0.8


In [None]:
# The model is still overfitted, but the gap between training and testing scores is smaller than without Hyper-Parameter Optimization

In [None]:
# RandomizedSearchCV()
from sklearn.model_selection import RandomizedSearchCV
# Candidate values for the randomized search. Each key is a
# DecisionTreeRegressor hyper-parameter; each value lists the settings that
# may be sampled for it.
params = {}
params['criterion'] = ['squared_error', 'friedman_mse', 'absolute_error', 'poisson']
params['max_depth'] = [depth for depth in range(1, 11)]
params['min_samples_split'] = [n_split for n_split in range(2, 11)]
params['min_samples_leaf'] = [n_leaf for n_leaf in range(1, 11)]

In [None]:
# Total number of combinations the randomized search can sample from: the
# product of the number of values listed for each key of `params`. It is
# derived from `params` rather than typed by hand so it stays correct if the
# search space is edited.
n_candidates = 1
for values in params.values():
    n_candidates *= len(values)
n_candidates

3600

In [None]:
# Fitting the RandomizedSearchCV(): instead of trying every combination, it
# samples n_iter combinations from `params` and scores each with 5-fold
# cross-validation on the training split.
# - n_iter = 10 is the library default, written out so the search budget is visible.
# - random_state on the search fixes WHICH combinations are sampled;
#   random_state on the tree fixes its internal feature shuffling. Without both,
#   the reported best parameters change from run to run.
model = DecisionTreeRegressor(random_state = 5)
random_search = RandomizedSearchCV(estimator = model, param_distributions = params,
                                   n_iter = 10, cv = 5, verbose = 1, random_state = 5)
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [None]:
# Getting the best parameters found by the randomized search
best_random_params = random_search.best_params_
print(best_random_params)

{'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 10, 'criterion': 'poisson'}


In [None]:
# Creating the Optimal Model
# The hyper-parameters are taken directly from the randomized-search result
# instead of being copied by hand from the printed output, so this cell cannot
# drift out of sync when the search is re-run. random_state keeps the refit
# tree reproducible.
model = DecisionTreeRegressor(random_state = 5, **random_search.best_params_)
model.fit(X_train, y_train)

In [None]:
# Taking predictions from the tuned model on both splits, so that the training
# and testing scores can be compared in the next cell
y_train_pred = model.predict(X_train)
y_pred = model.predict(X_test)

In [None]:
# Model evaluation: R2 score (rounded to 2 decimals) on the training data,
# then on the testing data
for split_name, actual, predicted in (
    ('Training', y_train, y_train_pred),
    ('Testing', y_test, y_pred),
):
    r2 = metrics.r2_score(actual, predicted)
    print(split_name + ' R2 Score :', np.round(r2, 2))

Training R2 Score : 0.99
Testing R2 Score : 0.78


In [None]:
# The model is still overfitted but better than the one without Hyper Parameter Optimization