# Decision Trees - Practical Example

In [None]:
# import the relevant packages
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, plot_tree

### The dataset

In [None]:
# The Iris dataset ships with sklearn, so no download or file path is needed
iris = load_iris()
# Keep the inputs and the targets under their own names
X = iris.data
y = iris.target

In [None]:
# Check what the input looks like
# Only the first rows are shown: echoing X itself would print every row of the
# array, which buries the rest of the notebook without adding information
X[:5]

In [None]:
# Check what the target looks like
# y is iris.target, unpacked together with X from the same load_iris() result;
# its values are read as 0 = Setosa, 1 = Versicolour and 2 = Virginica
y

In [None]:
# Dimensions of the input: the result shows 150 samples with 4 features each
# Feature order: Sepal length, Sepal width, Petal length, Petal width
X.shape

# Target encoding (y): 0 = Setosa, 1 = Versicolour, 2 = Virginica

### Creating the decision tree

In [None]:
# First, we need to define the decision tree and its parameters, if any
# Then, we need to train/create the tree based on the data
# Both are easily achieved through sklearn, with 2 simple commands

In [None]:
# Defining the tree classifier
# random_state is fixed so that every run builds the same tree: sklearn permutes
# the candidate features at each split, so ties between equally good splits
# could otherwise be broken differently from one run to the next
clf = DecisionTreeClassifier(random_state=42)

In [None]:
# Grow the tree from the full dataset (inputs X, labels y)
# fit() trains the estimator in place and returns that same object, so there is
# nothing to re-assign; the trailing semicolon keeps the notebook from echoing it
clf.fit(X, y);

In [None]:
# At this point, we have created a fully working decision tree for the Iris dataset

In [None]:
# Ask the trained tree to classify a single flower
# An output of 1 means the flower is predicted to be Versicolour
clf.predict(
    [
        # sepal length, sepal width, petal length, petal width
        [6.1, 2.7, 3.9, 1.2],
    ]
)

### Visualizing the tree

In [None]:
# With sklearn, we also have capabilities to plot the tree

In [None]:
# plot_tree with its default settings: the cell output is the list of text
# annotations it returns, followed by a small drawing of the tree
plot_tree(decision_tree=clf)

In [None]:
# Draw the same tree on a larger matplotlib figure so the nodes are readable
# filled=True asks plot_tree to colour the node boxes
canvas_size = (15, 12)  # (width, height) handed to plt.figure
plt.figure(figsize=canvas_size)
plot_tree(decision_tree=clf, filled=True)
plt.show()

In [None]:
# As can be seen from the image above, though, the features and classes don't have names
# It is not informative, as we don't know what feature X[2] corresponds to
# So, we can add a list of feature and class names to the function

In [None]:
# Same figure again, this time with readable labels on the nodes
# Feature labels follow the column order of X; class labels follow the
# target encoding 0, 1, 2
feature_labels = ["Sepal Length", "Sepal Width", "Petal Length", "Petal Width"]
class_labels = ["Setosa", "Versicolour", "Virginica"]

plt.figure(figsize=(15, 12))
plot_tree(
    clf,
    feature_names=feature_labels,
    class_names=class_labels,
    filled=True,
)
plt.show()

# See if you can interpret the tree. Remember the predicting values were [6.1, 2.7, 3.9, 1.2]

### How to interpret

* Nodes are the questions, with 4 pieces of information:
    1. the question itself, ex: Petal Width <= 0.8
    2. gini impurity - measures how mixed the classes are among the data points at the node. Zero means that all of them belong to a single class (no misclassification), so its value should be as close to zero as possible.
    3. the number of data points (samples) that reach the node
    4. the number of data points per class or target (value)
* Branches are the answers:
    1. left branch is True
    2. right branch is False
* Leaves are the outcomes

### Exercise: Playing with _pruning_

Add the pruning hyperparameter (_ccp_alpha_) to the classifier and try different values to see the different outcomes of the tree.

#### Exercise Solution: Playing with _pruning_

In [None]:
# Defining the tree classifier, this time with cost-complexity pruning enabled
# ccp_alpha is the value to experiment with (larger values prune more of the tree);
# random_state is fixed so that re-running the cell with the same ccp_alpha
# always produces the same tree
clf = DecisionTreeClassifier(ccp_alpha=0.1, random_state=42)
# Training/creating the decision tree
clf = clf.fit(X, y)
# We can now give this tree an input, and it will predict the class of the flower
# (an output of 1 means Versicolour)
# The prediction is printed explicitly: it is not the last expression of the
# cell, so the notebook would otherwise silently discard it
print(clf.predict([[6.1, 2.7, 3.9, 1.2]]))
# Plot the tree
plt.figure(figsize=(15, 12))
plot_tree(clf, filled=True,
          feature_names=["Sepal Length", "Sepal Width", "Petal Length", "Petal Width"],
          class_names=["Setosa", "Versicolour", "Virginica"])
plt.show()