In [None]:
# Importing important libraries
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings('ignore')

In [None]:
# Loading the data
# load_iris() returns a dict-like container; list the fields it exposes
iris = load_iris()
dataset_keys = iris.keys()
print('Keys :\n', dataset_keys)

Keys :
 dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])


In [None]:
# DESCR
# Print the human-readable description that ships with the dataset
dataset_description = iris.DESCR
print(dataset_description)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

:Number of Instances: 150 (50 in each of three classes)
:Number of Attributes: 4 numeric, predictive attributes and the class
:Attribute Information:
    - sepal length in cm
    - sepal width in cm
    - petal length in cm
    - petal width in cm
    - class:
            - Iris-Setosa
            - Iris-Versicolour
            - Iris-Virginica

:Summary Statistics:

                Min  Max   Mean    SD   Class Correlation
sepal length:   4.3  7.9   5.84   0.83    0.7826
sepal width:    2.0  4.4   3.05   0.43   -0.4194
petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

:Missing Attribute Values: None
:Class Distribution: 33.3% for each of 3 classes.
:Creator: R.A. Fisher
:Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
:Date: July, 1988

The famous Iris database, first used by Sir R.A. Fisher. The dataset is taken
from Fis

In [None]:
# Separating input features and target variable
X, y = iris.data, iris.target

# Performing train test split: 20% of the rows are held out for testing,
# and the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 5,
)

In [None]:
# Creating a Decision Tree Classifier
# A fixed random_state makes the fitted tree, and therefore every score
# reported below, reproducible across kernel restarts.
clf = DecisionTreeClassifier(random_state = 5)

# Fitting the model
clf.fit(X_train, y_train)

In [None]:
# Taking predictions from the model for both the training and the testing split
y_train_pred, y_pred = clf.predict(X_train), clf.predict(X_test)

In [None]:
# Model Evaluation on training data
# (precision, recall and F1 use the 'weighted' class average)
training_scores = {
    'Training Accuracy :': metrics.accuracy_score(y_train, y_train_pred),
    'Training Precision :': metrics.precision_score(y_train, y_train_pred, average = 'weighted'),
    'Training Recall :': metrics.recall_score(y_train, y_train_pred, average = 'weighted'),
    'Training F1 Score :': metrics.f1_score(y_train, y_train_pred, average = 'weighted'),
}

# Model Evaluation on testing data
# (the leading '\n' on the first label prints a blank line between the two reports)
testing_scores = {
    '\nTesting Accuracy :': metrics.accuracy_score(y_test, y_pred),
    'Testing Precision :': metrics.precision_score(y_test, y_pred, average = 'weighted'),
    'Testing Recall :': metrics.recall_score(y_test, y_pred, average = 'weighted'),
    'Testing F1 Score :': metrics.f1_score(y_test, y_pred, average = 'weighted'),
}

# Print every score rounded to two decimals, training report first
for label, score in {**training_scores, **testing_scores}.items():
    print(label, np.round(score, 2))

Training Accuracy : 1.0
Training Precision : 1.0
Training Recall : 1.0
Training F1 Score : 1.0

Testing Accuracy : 0.93
Testing Precision : 0.94
Testing Recall : 0.93
Testing F1 Score : 0.93


In [None]:
'''
From the output of the above code, we can see that the model is slightly overfitted:
every training score is higher than the corresponding testing score.
'''

In [None]:
# Performing Hyperparameter Optimization

In [None]:
# GridSearchCV()
# Candidate values for every hyperparameter explored by the exhaustive grid search.
# Consecutive integer ranges are generated instead of typed out by hand.
params = {
    'criterion' : ['gini', 'entropy'],
    'max_depth' : list(range(2, 11)),           # 2 .. 10
    'min_samples_split' : list(range(2, 11)),   # 2 .. 10
    'min_samples_leaf' : list(range(1, 11))     # 1 .. 10
}

In [None]:
# Number of parameter combinations in the grid, derived from `params` itself
# so the count cannot drift from the grid definition.
n_grid_candidates = 1
for candidate_values in params.values():
    n_grid_candidates *= len(candidate_values)
n_grid_candidates

1620

In [None]:
# Using GridSearchCV
# The estimator is seeded so every candidate tree is built deterministically and
# the selected parameter combination does not change from run to run.
clf = DecisionTreeClassifier(random_state = 5)
grid_search = GridSearchCV(estimator = clf, param_grid = params, cv = 5, verbose = 1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1620 candidates, totalling 8100 fits


In [None]:
# Getting the best parameter combination found by the grid search
best_grid_params = grid_search.best_params_
print(best_grid_params)

{'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [None]:
# Creating the Optimal Model
# Build the tree from the parameters the grid search actually selected instead of
# re-typing them by hand, so this cell stays correct if the search result changes.
# The fixed random_state keeps the refitted tree reproducible.
clf = DecisionTreeClassifier(**grid_search.best_params_, random_state = 5)
clf.fit(X_train, y_train)

In [None]:
# Taking predictions from the tuned model: training split first, then testing split
y_train_pred = clf.predict(X_train)
y_pred = clf.predict(X_test)

In [None]:
# Model Evaluation on training data
# (precision, recall and F1 use the 'weighted' class average)
training_report = [
    ('Training Accuracy :', metrics.accuracy_score(y_train, y_train_pred)),
    ('Training Precision :', metrics.precision_score(y_train, y_train_pred, average = 'weighted')),
    ('Training Recall :', metrics.recall_score(y_train, y_train_pred, average = 'weighted')),
    ('Training F1 Score :', metrics.f1_score(y_train, y_train_pred, average = 'weighted')),
]

# Model Evaluation on testing data
# (the leading '\n' on the first label prints a blank line between the two reports)
testing_report = [
    ('\nTesting Accuracy :', metrics.accuracy_score(y_test, y_pred)),
    ('Testing Precision :', metrics.precision_score(y_test, y_pred, average = 'weighted')),
    ('Testing Recall :', metrics.recall_score(y_test, y_pred, average = 'weighted')),
    ('Testing F1 Score :', metrics.f1_score(y_test, y_pred, average = 'weighted')),
]

# Print every score rounded to two decimals, training report first
for label, score in training_report + testing_report:
    print(label, np.round(score, 2))

Training Accuracy : 0.98
Training Precision : 0.98
Training Recall : 0.98
Training F1 Score : 0.97

Testing Accuracy : 0.9
Testing Precision : 0.9
Testing Recall : 0.9
Testing F1 Score : 0.9


In [None]:
# We are still getting an overfitted model: the training scores remain higher than the testing scores

In [None]:
# RandomizedSearchCV()

# Creating a parameter dictionary
# Candidate values the randomized search samples from; consecutive integer
# ranges are generated instead of typed out by hand.
params = {
    'criterion' : ['gini', 'entropy'],
    'max_depth' : list(range(1, 6)),           # 1 .. 5
    'min_samples_split' : list(range(2, 6)),   # 2 .. 5
    'min_samples_leaf' : list(range(1, 6))     # 1 .. 5
}

In [None]:
# Size of the full parameter space the randomized search samples from,
# derived from `params` itself so the count cannot drift from the definition.
n_random_candidates = 1
for candidate_values in params.values():
    n_random_candidates *= len(candidate_values)
n_random_candidates

200

In [None]:
# Fitting the RandomizedSearchCV()
# Seed both the tree and the search: RandomizedSearchCV only tries a random sample
# of the parameter combinations, so without random_state a different subset is
# evaluated (and possibly a different winner chosen) on every run.
clf = DecisionTreeClassifier(random_state = 5)
random_search = RandomizedSearchCV(estimator = clf, param_distributions = params, cv = 5, verbose = 1, random_state = 5)
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [None]:
# Getting the best parameters found by the randomized search
best_random_params = random_search.best_params_
print(best_random_params)

{'min_samples_split': 3, 'min_samples_leaf': 4, 'max_depth': 2, 'criterion': 'gini'}


In [None]:
# Creating the Optimal Model
# Build the tree from the parameters the randomized search actually selected
# instead of re-typing them by hand; the search samples candidates at random,
# so hard-coded values can easily go stale. The fixed random_state keeps the
# refitted tree reproducible.
clf = DecisionTreeClassifier(**random_search.best_params_, random_state = 5)
clf.fit(X_train, y_train)

In [None]:
# Taking the predictions from the model for the training and testing splits
y_train_pred, y_pred = (clf.predict(split) for split in (X_train, X_test))

In [None]:
# Model Evaluation on training data
# (precision, recall and F1 use the 'weighted' class average)
training_results = {
    'Training Accuracy :': metrics.accuracy_score(y_train, y_train_pred),
    'Training Precision :': metrics.precision_score(y_train, y_train_pred, average = 'weighted'),
    'Training Recall :': metrics.recall_score(y_train, y_train_pred, average = 'weighted'),
    'Training F1 Score :': metrics.f1_score(y_train, y_train_pred, average = 'weighted'),
}

# Model Evaluation on testing data
# (the leading '\n' on the first label prints a blank line between the two reports)
testing_results = {
    '\nTesting Accuracy :': metrics.accuracy_score(y_test, y_pred),
    'Testing Precision :': metrics.precision_score(y_test, y_pred, average = 'weighted'),
    'Testing Recall :': metrics.recall_score(y_test, y_pred, average = 'weighted'),
    'Testing F1 Score :': metrics.f1_score(y_test, y_pred, average = 'weighted'),
}

# Print the training report, then the testing report, rounded to two decimals
for results in (training_results, testing_results):
    for label, score in results.items():
        print(label, np.round(score, 2))

Training Accuracy : 0.98
Training Precision : 0.98
Training Recall : 0.98
Training F1 Score : 0.97

Testing Accuracy : 0.9
Testing Precision : 0.9
Testing Recall : 0.9
Testing F1 Score : 0.9


In [None]:
# As we can see, the model is still overfitted.