In [1]:
# !conda install python-graphviz

In [2]:
# !conda install pydotplus

In [3]:
# !conda install export_graphviz

In [4]:
# !pip install --upgrade scikit-learn==0.20.3 --user

In [5]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import GridSearchCV
import graphviz
import pydotplus
from sklearn.externals.six import StringIO  
from IPython.display import Image  

import warnings
warnings.filterwarnings('ignore')

In [6]:
df = pd.read_csv('Iris.csv')

In [7]:
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [8]:
df.drop(labels='Id', axis=1, inplace=True)  # Drop Id 

In [9]:
df.shape

(150, 5)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
SepalLengthCm    150 non-null float64
SepalWidthCm     150 non-null float64
PetalLengthCm    150 non-null float64
PetalWidthCm     150 non-null float64
Species          150 non-null object
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [11]:
df.describe()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [12]:
# Label Encode 'Species'

le = LabelEncoder()
df['Species'] = le.fit_transform(df['Species'])

In [13]:
# Independent and Dependent variables

X = df.iloc[:, :-1]
Y = df['Species']

In [14]:
# Train Test Split

x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.75, test_size=0.25)

In [15]:
# Build Decision Tree Model

classifier = DecisionTreeClassifier()
classifier.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [16]:
# Hyperparamter Tuning

parameters = {'max_leaf_nodes': list(range(2, 100)), 
              'min_samples_split': [2, 3, 4]}


grid_search = GridSearchCV(estimator = classifier,
                           param_grid  = parameters,
                           scoring = 'accuracy', 
                           cv = 5,
                           n_jobs=1)

grid_search = grid_search.fit(x_train, y_train)

best_accuracy = grid_search.best_score_   
best_parameters = grid_search.best_params_  

print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

Best Accuracy: 94.64 %
Best Parameters: {'max_leaf_nodes': 6, 'min_samples_split': 2}


In [18]:
# Make new DecisionTreeClassifier model with best parameters

final_classifier = DecisionTreeClassifier(max_leaf_nodes=6, min_samples_split=2)
final_classifier.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=6, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [19]:
# Predict test set values

predicted_values = final_classifier.predict(x_test)

In [20]:
compare_results_df = pd.DataFrame(data=zip(y_test, predicted_values), columns=['Real Values', 'Predicted Values'])
compare_results_df.head()

Unnamed: 0,Real Values,Predicted Values
0,0,0
1,2,2
2,1,1
3,2,2
4,1,1


In [21]:
dot_data = StringIO()
export_graphviz(final_classifier, out_file=dot_data, feature_names=X.columns,  
                filled=True, rounded=True,
                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
# Image(graph)
Image(graph.create_png())

InvocationException: GraphViz's executables not found

In [None]:
# dot_data = export_graphviz(final_classifier)
# graph = graphviz.Source(dot_data)
# graph