In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn import metrics

import graphviz

import is4151is5451 as iot

In [2]:
# Apply the pandas options set by the is4151is5451 helper module.
# NOTE(review): the helper's body is not visible here — confirm which options it sets.
iot.set_default_pandas_options()

In [3]:
# data prep
# Fixed seed so the train/test split and the fitted tree are identical on every
# Restart & Run All; change it here to draw a different split.
RANDOM_SEED = 42

# Class names in alphabetical order; passed as `labels` to the confusion matrices.
class_labels = ['amphibian', 'bird', 'fish', 'insect', 'invertebrate', 'mammal', 'reptile']
df = pd.read_csv('../data/zoo.csv', index_col=0)

# Separate the feature columns from the 'type' target once and reuse the result.
features = df.drop('type', axis=1)
X = features.values
y = df['type'].values
X_headers = features.columns.values

# 70/30 shuffled split, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    train_size=0.7, test_size=0.3,
    shuffle=True, stratify=y,
    random_state=RANDOM_SEED,
)

# modelling
# random_state is needed even with splitter='best': scikit-learn permutes the
# candidate features at each split, so equally good splits can otherwise be
# chosen differently from run to run.
clf = tree.DecisionTreeClassifier(criterion='gini', splitter='best', random_state=RANDOM_SEED)

clf = clf.fit(X_train, y_train)
y_train_pred = clf.predict(X_train)

print('Training Accuracy = {}'.format(metrics.accuracy_score(y_train, y_train_pred)))

Training Accuracy = 1.0


In [4]:
# Confusion matrix on the training split, with classes ordered as in class_labels.
train_confusion = metrics.confusion_matrix(y_train, y_train_pred, labels=class_labels)
print('Training Confusion = \n{}'.format(train_confusion))

Training Confusion = 
[[ 3  0  0  0  0  0  0]
 [ 0 14  0  0  0  0  0]
 [ 0  0  9  0  0  0  0]
 [ 0  0  0  6  0  0  0]
 [ 0  0  0  0  7  0  0]
 [ 0  0  0  0  0 28  0]
 [ 0  0  0  0  0  0  3]]


In [5]:
# Predict the test split with the fitted tree and report its accuracy.
y_test_pred = clf.predict(X_test)

test_accuracy = metrics.accuracy_score(y_test, y_test_pred)
print('Testing Accuracy = {}'.format(test_accuracy))

Testing Accuracy = 0.9032258064516129


In [6]:
# Confusion matrix on the test split, with classes ordered as in class_labels.
test_confusion = metrics.confusion_matrix(y_test, y_test_pred, labels=class_labels)
print('Testing Confusion = \n{}'.format(test_confusion))

Testing Confusion = 
[[ 1  0  0  0  0  0  0]
 [ 0  6  0  0  0  0  0]
 [ 0  0  4  0  0  0  0]
 [ 0  0  0  2  0  0  0]
 [ 0  0  0  1  2  0  0]
 [ 0  0  0  0  0 13  0]
 [ 1  0  0  1  0  0  0]]


In [7]:
# export test dataset predicted result

# One row per test sample: the feature columns first, then the actual class
# ('type') and the class predicted by the tree ('pred_type').
result = pd.DataFrame(X_test, columns=X_headers)
result['type'] = y_test
result['pred_type'] = y_test_pred

result.to_csv('../data/zoo-predict.csv')

In [8]:
# export decision tree from training dataset

# Take the class names from the fitted model instead of the hand-written
# class_labels list: export_graphviz pairs names with classes by position, so
# clf.classes_ keeps the node labels correct even if a class is absent from the
# training split or the list order drifts from the sorted labels.
tree_class_names = [str(label) for label in clf.classes_]

dot_data = tree.export_graphviz(clf, out_file=None, feature_names=X_headers,
                                class_names=tree_class_names,
                                filled=True, rounded=True, special_characters=True)
graph = graphviz.Source(dot_data)
# Kept as the last expression so the rendered file's path is shown as the cell result.
graph.render("../data/zoo-model")

'../data/zoo-model.pdf'