# Introduction to Decision Trees

<img src="../images/tree_graphic.jpg" width="800">

# Let's Go!

### Import Libraries

In [None]:
import pandas as pd
import seaborn as sns
from sklearn import datasets

# Default (width, height) in inches for every figure drawn in this notebook.
figure_size = (7.5, 5)
sns.set(rc={'figure.figsize': figure_size})

### Load Iris Dataset

In [None]:
data = datasets.load_iris()

# Lookup from integer class id to species name, in the order given by the dataset.
species_by_target = dict(enumerate(data['target_names']))

# One row per flower: the measurement columns, then the class id and its name.
iris = pd.DataFrame(data['data'], columns=data['feature_names'])
iris['target'] = data['target']
iris['species'] = iris['target'].map(species_by_target)

# Show the first 5 rows of a shuffled copy (iris itself keeps its original order)
iris.sample(frac=1, random_state=32).head()

In [None]:
# Median of every measurement column for each (class id, species name) pair.
group_keys = ['target', 'species']
iris.groupby(group_keys).median()

### Plot Sepal and Petal Shapes

<img src="../images/iris.png" width="800">

In [None]:
# Sepal length against sepal width, one colour per species.
sns.scatterplot(
    data=iris,
    x='sepal length (cm)',
    y='sepal width (cm)',
    hue='species',
)

In [None]:
# Petal length against petal width, one colour per species.
sns.scatterplot(
    data=iris,
    x='petal length (cm)',
    y='petal width (cm)',
    hue='species',
)

### Decision Trees are Cool BUT...
They quickly overfit if we don't know what we're doing

In [None]:
# Two sepal measurements as the feature matrix, shape (n_samples, 2).
X = iris[['sepal length (cm)', 'sepal width (cm)']].to_numpy()
# Integer class ids as a 1-D array, shape (n_samples,). Selecting the single
# column by name (not by a one-element list) avoids an (n_samples, 1) column
# vector, which is not the shape scikit-learn expects for single-output labels.
y = iris['target'].to_numpy()

In [None]:
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap


def plot_surface(X, y, max_depth=None, max_leaf_nodes=None, min_impurity_decrease=0, random_state=None):
    """Fit a decision tree on two features and plot its decision regions.

    A ``DecisionTreeClassifier`` is trained on ``X``/``y`` with the given
    pruning parameters and evaluated on a dense grid covering the data. The
    predicted class of every grid point is drawn as a translucent background
    behind a scatter plot of the training points.

    Args:
        X: Array of shape (n_samples, 2); column 0 is drawn on the x-axis and
            column 1 on the y-axis.
        y: Class labels, either 1-D or a single column. Labels are assumed to
            be the integers 0, 1 and 2 so that they line up with the three
            plot colours (blue, red, green).
        max_depth: Forwarded to ``DecisionTreeClassifier``.
        max_leaf_nodes: Forwarded to ``DecisionTreeClassifier``.
        min_impurity_decrease: Forwarded to ``DecisionTreeClassifier``.
        random_state: Forwarded to ``DecisionTreeClassifier``. The default
            ``None`` leaves the tree unseeded.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    class_colors = ["blue", "red", "green"]
    cmap = ListedColormap(class_colors)
    # One contour band per class id, centred on 0, 1, 2. Fixing the bands
    # keeps "class k -> colour k" true even when a heavily pruned tree
    # predicts fewer than three classes; automatic levels would rescale the
    # colours to whatever values happen to be predicted.
    levels = np.arange(len(class_colors) + 1) - 0.5

    # Accept a column vector as well as a 1-D label array.
    y = np.ravel(y)

    # Grid spanning the data with a 0.1 margin, sampled every 0.01.
    x_min, x_max = X[:, 0].min() - 0.1, X[:, 0].max() + 0.1
    y_min, y_max = X[:, 1].min() - 0.1, X[:, 1].max() + 0.1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.01), np.arange(y_min, y_max, 0.01))
    X_grid = np.c_[xx.ravel(), yy.ravel()]

    # Predict
    model = DecisionTreeClassifier(
        max_depth=max_depth,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        random_state=random_state,
    )
    model.fit(X, y)
    z = model.predict(X_grid).reshape(xx.shape)

    # Plot
    plt.figure()
    plt.contourf(xx, yy, z, levels=levels, cmap=cmap, alpha=.2)
    # Same colour scale as the background so points and regions agree.
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap, vmin=levels[0], vmax=levels[-1], s=20)

    plt.title(f"Max Depth: {max_depth}, Max Leaves: {max_leaf_nodes}, Min Impurity Decrease: {min_impurity_decrease}")
    plt.show()

In [None]:
# Unpruned tree: no depth limit, no leaf limit, no minimum impurity decrease.
plot_surface(X=X, y=y)

### Pruning

#### Max Depth

This parameter sets the maximum depth of the tree, i.e. the largest number of decisions made along any path from the root to a leaf. The lower the number, the fewer decision boundaries you will have.

In [None]:
# One decision surface per depth limit, shallowest first.
depth_limits = (1, 3, 5, 7)
for depth in depth_limits:
    plot_surface(X, y, max_depth=depth)

#### Max Leaves

This parameter sets the maximum number of leaf nodes (the final regions where a prediction is made). The fewer leaf nodes allowed, the fewer decision boundaries your tree will create.

In [None]:
# One decision surface per leaf-count limit, smallest first.
leaf_limits = (2, 3, 5, 7)
for nodes in leaf_limits:
    plot_surface(X, y, max_leaf_nodes=nodes)

#### Minimum Impurity Decrease

The definition of impurity is out of scope for this quick course, but this parameter forces the model to only create a new decision boundary when doing so decreases impurity by at least the given amount. The higher the number, the fewer decision boundaries will be created.

In [None]:
# One decision surface per impurity-decrease threshold, smallest first.
impurity_thresholds = (0, .01, .1, .25)
for min_decrease in impurity_thresholds:
    plot_surface(X, y, min_impurity_decrease=min_decrease)

### Cross Validation: What's the best tree?

In [None]:
from sklearn.model_selection import GridSearchCV

# Seed the tree so repeated runs of the search select the same parameters;
# an unseeded tree permutes features randomly and can break ties differently.
model = DecisionTreeClassifier(random_state=0)

# Candidate values for each pruning parameter explored by the search.
cv_params = {
    'max_depth': [1, 3, 5, 7],
    'max_leaf_nodes': [2, 3, 5, 7],
    'min_impurity_decrease': [0, .01, .1, .25],
}

In [None]:
# Search over cv_params with GridSearchCV's default cross-validation settings.
grid_search = GridSearchCV(estimator=model, param_grid=cv_params)

grid_search.fit(X, y)

In [None]:
# Parameter combination selected by the grid search.
best_params = grid_search.best_params_
best_params

In [None]:
# Draw the decision surface for the parameters picked by the grid search.
selected = grid_search.best_params_
pruning_kwargs = {
    name: selected[name]
    for name in ('max_depth', 'max_leaf_nodes', 'min_impurity_decrease')
}
plot_surface(X, y, **pruning_kwargs)