In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
import warnings
warnings.filterwarnings('ignore')

import random
random.seed(10)

In [2]:
iris_data = pd.read_csv('../data/Iris.csv')

In [3]:
iris_data.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


<img height="600" width="750" src="./img/classification/iris_flowers.png">

### Scikit-Learn uses CART (Classification And Regression Tree) algorithm, which will generate binary tree.
  * The algorithm first splits the train set into two subsets using single feature K, threshold $t_k$ (example "petal lenght < 2.45 cm").
  * The algorithm searches for $(K, t_k)$ the purest subset, the cost function is privided by hyper-parameter "criterion" 
    * **For Classification** this can be "gini" for the **Gini impurity** and "entropy" for the **information gain**).
    * **For Regression** this can be "mse" or "mae"
    * It stops recursing once it reaches the maximum depth ("max_depth" hyper parameter), or if it cannot find a split that can reduce impurity.
    * 

<img height="600" width="550" src="./img/classification/dtree_iris_sample.png">


In [4]:
X, y = iris_data.iloc[:,1:-1], iris_data.iloc[:, -1]

In [30]:
y.shape

(150,)

In [6]:
X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size=0.2, random_state=42)

## DecisionTree Classification

In [7]:
from sklearn.tree import DecisionTreeClassifier
tree_clf = DecisionTreeClassifier(max_depth=2, random_state=42)
tree_clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=42, splitter='best')

In [8]:
tree_clf.predict_proba(X_train)[:5, :]

array([[ 1.        ,  0.        ,  0.        ],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.97297297,  0.02702703],
       [ 1.        ,  0.        ,  0.        ],
       [ 1.        ,  0.        ,  0.        ]])

In [9]:
pred_y_train = tree_clf.predict(X_train)

In [10]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score

def printAccuracyScores(pred_y, y):
    conf_matrix = confusion_matrix(pred_y, y)
    print("conf_matrix : ", conf_matrix)
    prec_score = precision_score(pred_y, y, average='macro')
    print("precision_score : ", prec_score)
    rec_score = recall_score(pred_y, y, average='macro')
    print("recall_score : ", rec_score)
    fl_score = f1_score(pred_y, y, average='macro')
    print("fl_score : ", fl_score)

In [11]:
printAccuracyScores(pred_y_train, y_train)

conf_matrix :  [[40  0  0]
 [ 0 36  1]
 [ 0  5 38]]
precision_score :  0.950802584949
recall_score :  0.952231301069
fl_score :  0.949968730457


In [12]:
from sklearn.model_selection import cross_val_score
cv_score = cross_val_score(tree_clf, X_train, y_train, cv=10, scoring="f1_macro")
print("cv_score : ", cv_score)

cv_score :  [ 0.92207792  1.          1.          1.          0.66666667  0.82222222
  1.          0.91534392  0.91534392  0.8962963 ]


### This looks good lets try GridSearchCV and try to find best parameters.

In [18]:
param_grid = [
    {'max_leaf_nodes':[2,4,5,6,8,10,12],
     'max_depth' : [2,3,4,5,6,7,8]}
  ]

grd_tree_clf = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(grd_tree_clf, param_grid, cv=5,
                           scoring='f1_macro')
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=42, splitter='best'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'max_leaf_nodes': [2, 4, 5, 6, 8, 10, 12], 'max_depth': [2, 3, 4, 5, 6, 7, 8]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='f1_macro', verbose=0)

In [19]:
grid_search.best_params_

{'max_depth': 5, 'max_leaf_nodes': 8}

In [20]:
grid_search.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=8, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')

In [16]:
best_tree_clf = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=8, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')
best_tree_clf.fit(X_train, y_train)
pred_y_train = best_tree_clf.predict(X_train)

printAccuracyScores(pred_y_train, y_train)

conf_matrix :  [[40  0  0]
 [ 0 40  0]
 [ 0  1 39]]
precision_score :  0.991869918699
recall_score :  0.991666666667
fl_score :  0.99166536438


In [17]:
pred_y_test = best_tree_clf.predict(X_test)
printAccuracyScores(pred_y_test, y_test)

conf_matrix :  [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
precision_score :  1.0
recall_score :  1.0
fl_score :  1.0


In [53]:
iris_data.Species.unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [57]:
from sklearn.tree import export_graphviz
import pydot
X_features  = [feature for feature in iris_data.columns if feature not in ['Id']]
export_graphviz(
        best_tree_clf,
        out_file="./iris_classification_tree.dot",
        feature_names=X_features,
        class_names=iris_data.Species.unique(),
        rounded=True,
        filled=True
    )      

In [58]:
# To convert dot file to png you may have to install graphviz on OS (Linux/Windows)
(graph,) = pydot.graph_from_dot_file('./iris_classification_tree.dot')
graph.write_png('./iris_classification_tree.png')

True