In [1]:
# Stylianos Topalidis
# AEM: 9613
# email: styltopa@ece.auth.gr 

In [4]:
from sklearn import datasets, tree
from pprint import pprint
import numpy as np
import matplotlib.pyplot as plt


In [8]:
# Load the iris dataset and build a per-class (stratified) train/test split
# using only the first two attributes.
iris = datasets.load_iris()


# Keep only the first two attributes:
#   column 0: sepal length (cm)
#   column 1: sepal width (cm)
irisData = iris.data[:, 0:2]

# Fields of iris data object
# ['DESCR', 'data', 'data_module', 'feature_names', \
#  'filename', 'frame', 'target', 'target_names']

# Fraction of the samples of every class that is used for training
trainPercent = 0.5


# Distinct target labels without duplicates ({0, 1, 2} for iris).
# np.unique returns them sorted; the list and set forms are kept because
# other cells may refer to them.
targetArr = np.unique(iris.target)
targetList = list(targetArr)
targetSet = set(targetList)


# Per-class index arrays of the training and testing samples; they are
# concatenated into a single index array each after the loop.
trainingIndsPerKind = []
testingIndsPerKind = []

# For each of the flower kinds (iterate over the label values themselves,
# not over their positions, so the split does not rely on labels being 0..n-1)
for targetValue in targetArr:
    # Indices (into the original dataset) of the samples with this label.
    # Note: np.where(condition) would return a tuple holding one index array
    # per dimension; np.flatnonzero yields the index array directly.
    indsPerKind = np.flatnonzero(iris.target == targetValue)

    # number of training data derived from the training percentage selected
    # per group identifier (0, 1, 2); the remainder is used for testing
    numOfTrainingDataPerIdentifier = round(trainPercent*len(indsPerKind))
    numOfTestingDataPerIdentifier = len(indsPerKind) - numOfTrainingDataPerIdentifier

    # the first part of every class goes to training, the rest to testing
    trainingIndsPerKind.append(indsPerKind[:numOfTrainingDataPerIdentifier])
    testingIndsPerKind.append(indsPerKind[numOfTrainingDataPerIdentifier:])


# list of per-class index arrays -> one integer index array
# (concatenating integer arrays keeps an integer dtype even for empty splits)
trainingInds = np.concatenate(trainingIndsPerKind)
testingInds = np.concatenate(testingIndsPerKind)

# Sanity check: every sample ends up in exactly one of the two splits
assert len(trainingInds) + len(testingInds) == len(iris.target)
assert len(np.intersect1d(trainingInds, testingInds)) == 0

# The training indices are grouped by class (all setosa first, then all
# versicolor and finally all virginica), so they are shuffled.
# The seed is fixed to make the permutation reproducible.
np.random.seed(0)
trainingIndsPerm = np.random.permutation(trainingInds)

# Training data and target values
trainingArr = irisData[trainingIndsPerm]
targetArrTraining = iris.target[trainingIndsPerm]

# Testing data and target values
testingArr = irisData[testingInds]
targetArrTesting = iris.target[testingInds]

# Train one decision tree per maximum depth and report its classification
# accuracy on the testing data.
print('Classification accuracy')
# tree depths
treeDepths = np.array([3, 4, 5])

# The accuracy is measured on the testing set, so both the comparison and the
# denominator must use the number of TESTING samples. (Using the number of
# training samples is only correct when the split is exactly 50/50.)
numOfTestingData = len(targetArrTesting)

for depthCount in treeDepths:
    # Classifier training
    clf = tree.DecisionTreeClassifier(max_depth=depthCount)
    clf = clf.fit(trainingArr, targetArrTraining)
    # Classifier predictions of the targets
    targetArrPred = clf.predict(testingArr)

    # number of correctly predicted target values (element-wise comparison
    # of the predictions with the true targets of the testing data)
    correctlyPredicted = int(np.count_nonzero(targetArrPred == targetArrTesting))

    accuracy = correctlyPredicted/numOfTestingData
    print('For decision tree depth = ', depthCount, ': ', accuracy*100, '%')

Classification accuracy
For decision tree depth =  3 :  69.33333333333334 %
For decision tree depth =  4 :  80.0 %
For decision tree depth =  5 :  78.66666666666666 %
For decision tree depth =  6 :  80.0 %
For decision tree depth =  7 :  68.0 %
For decision tree depth =  8 :  70.66666666666667 %


In [None]:
# 1) Find the decision boundaries of the classifier; write a function that extracts them
# from the branches of the tree.
# 2) Plot the partition of the feature space and the corresponding prediction of the target variable.
# 3) Plot the data points.



# fig, ax = plt.subplots()
# ax.scatter()
