<a href="https://colab.research.google.com/github/ramonVDAKKER/teaching-data-science-emas/blob/main/notebooks/demo_decision_trees_splitting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# The basics of decision trees

This notebook contains snippets to facilitate a demo and discussion during a lecture. We focus on the technique, not on understanding the empirical application associated with the dataset we use.

## 0. Import packages

In [None]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.inspection import plot_partial_dependence
from sklearn.datasets import load_breast_cancer
import matplotlib.pyplot as plt
import seaborn as sns
import keras
from keras.layers import Dense
from keras.models import Sequential

In [None]:
%%capture
# Install/upgrade pandas-profiling with the interpreter that runs this kernel
# (sys.executable) and enable the widgets notebook extension. The %%capture
# magic on the first line hides all output produced by this cell.
import sys
!{sys.executable} -m pip install -U pandas-profiling[notebook]
!jupyter nbextension enable --py widgetsnbextension
# NOTE(review): pandas_profiling is not used in the visible part of this notebook; confirm it is still needed.
import pandas_profiling

In [None]:
%matplotlib inline

## 1. Load dataset

In [None]:
# Load the breast-cancer dataset via scikit-learn and wrap target and features in DataFrames.
data = load_breast_cancer()
y_df = pd.DataFrame(data.target, columns=["target"])
X_df = pd.DataFrame(data.data, columns=data.feature_names)
# Hold out 30% of the rows as test set; the fixed random_state makes the split reproducible.
X_train_df, X_test_df, y_train_df, y_test_df = train_test_split(X_df, y_df, test_size=0.3, random_state=123)
# Training frame with the target as first column, followed by all features (used for the plots below).
data_train_df = pd.concat([y_train_df, X_train_df], axis=1)

In [None]:
# Report the size of the training frame and show its first five rows.
print(f"The dataset, available for training, has {len(data_train_df)} rows and {data_train_df.shape[1]} columns.")
print("Some rows of the dataset:")
display(data_train_df.head(5))

## 2. Splitting mechanism illustrated

In [None]:
def gini_impurity(y_left, y_right):
  """Gini impurity of a binary split for a 0/1 target.

  For each child the impurity is 2 * p * (1 - p), with p the fraction of
  ones in that child. The impurity of the split is the average of both
  child impurities, weighted by the number of observations in each child.

  Parameters
  ----------
  y_left, y_right : array-like of 0/1 labels
      Target values that end up in the left and right child. Anything that
      supports len() and can be converted by np.asarray is accepted
      (Series, single-column DataFrame, ndarray, list).

  Returns
  -------
  tuple (gini, gini_left, gini_right)
      Weighted impurity of the split and the impurity of each child. An
      empty child gets impurity 0.0 and weight 0, so the result never
      becomes NaN just because one side of the split is empty.

  Raises
  ------
  ValueError
      If both children are empty.
  """
  def _child_gini(labels):
    # Impurity of one child; an empty child is treated as pure (0.0).
    values = np.asarray(labels, dtype=float)
    if values.size == 0:
      return 0.0
    p = values.mean()
    return 2 * p * (1 - p)

  n_left, n_right = len(y_left), len(y_right)
  n_total = n_left + n_right
  if n_total == 0:
    raise ValueError("gini_impurity needs at least one observation.")
  gini_left = _child_gini(y_left)
  gini_right = _child_gini(y_right)
  weight_right = n_right / n_total
  gini = weight_right * gini_right + (1 - weight_right) * gini_left
  return gini, gini_left, gini_right
def determine_gini_impurity_for_feature_and_split_ext(y, X_df, name_feature, threshold):
  """Split the targets on `name_feature <= threshold` and score the split.

  Rows whose feature value is at most `threshold` go to the left child, all
  other rows to the right child. Returns whatever `gini_impurity` returns
  for these two parts: (weighted Gini, Gini left, Gini right).
  """
  goes_left = X_df[name_feature].le(threshold)
  return gini_impurity(y[goes_left], y[~goes_left])
def determine_gini_impurity_for_feature_and_split(y, X_df, name_feature, threshold):
  """Return only the weighted Gini of the split `name_feature <= threshold`.

  Thin wrapper around `determine_gini_impurity_for_feature_and_split_ext`
  that discards the per-child impurities.
  """
  return determine_gini_impurity_for_feature_and_split_ext(y, X_df, name_feature, threshold)[0]
# Element-wise version of determine_gini_impurity_for_feature_and_split: it is
# evaluated once per threshold, while `y` and `X_df` are excluded from the
# broadcasting and passed through unchanged (they must be given as keywords).
# Declaring otypes avoids numpy's extra probing call on the first element and
# allows an empty array of thresholds.
vector_determine_gini_impurity_for_feature_and_split = np.vectorize(determine_gini_impurity_for_feature_and_split, otypes=[float], excluded=["y", "X_df"])
def analyze_feature(X_df, name_feature, y):
  """Plot the weighted Gini for every candidate threshold of one feature.

  Candidate thresholds are the observed values of `name_feature`, sorted,
  without the smallest and the largest observation. Values tied with the
  maximum are dropped as well: splitting on `feature <= max` sends every row
  to the left child, so it is not a split at all.

  Parameters
  ----------
  X_df : DataFrame with the features.
  name_feature : name of the column of `X_df` to analyze.
  y : 0/1 targets belonging to the rows of `X_df`.

  The function draws on the current matplotlib axes and prints the threshold
  with the lowest weighted Gini together with that Gini. If the feature
  offers no candidate threshold, a message is printed and nothing is drawn.
  """
  sorted_values = X_df[name_feature].sort_values(ascending=True)
  # Positional slice: drop the first and last observation of the sorted values.
  feature_values = sorted_values.iloc[1:-1]
  # Keep only thresholds that leave at least one row in the right child.
  feature_values = feature_values[feature_values < sorted_values.max()]
  if feature_values.empty:
    print(f"No valid split values for feature {name_feature}.")
    return
  gini = vector_determine_gini_impurity_for_feature_and_split(y=y, X_df=X_df, name_feature=name_feature, threshold=feature_values)
  plt.plot(feature_values, gini)
  plt.ylabel("Gini")
  plt.xlabel("Splitvalue")
  plt.title(f"Gini as function of possible split values for feature {name_feature}.")
  print(f"Split value: {feature_values.iloc[gini.argmin()]}")
  print(f"Gini: {gini.min()}")

Let us analyze a feature and evaluate how we should split for this feature:

In [None]:
# Histogram of the chosen feature in the training data, stacked by target class.
name_feature = "mean perimeter"
sns.histplot(data=data_train_df, x=name_feature, hue="target", multiple="stack")
plt.show()

The following graph shows how the Gini varies with the threshold:

In [None]:
# Plot the Gini per candidate threshold for the feature chosen above and print the best split.
analyze_feature(X_train_df, name_feature, y_train_df)

In [None]:
def determine_best_split(y, X_df):
  """Find, for every feature, the threshold with the lowest weighted Gini.

  Candidate thresholds per feature are its sorted observed values without
  the smallest and the largest observation. Values tied with the maximum
  are skipped too, because `feature <= max` leaves the right child empty.

  Parameters
  ----------
  y : 0/1 targets belonging to the rows of `X_df`.
  X_df : DataFrame with the features.

  Returns
  -------
  DataFrame indexed by feature name with float columns "Gini" (lowest
  weighted Gini found) and "Split" (the threshold achieving it). A feature
  without any candidate threshold gets NaN in both columns.
  """
  names, ginis, splits = [], [], []
  for name_feature in X_df.columns:
    sorted_values = X_df[name_feature].sort_values(ascending=True)
    # Positional slice, then drop thresholds that would not split the data.
    feature_values = sorted_values.iloc[1:-1]
    feature_values = feature_values[feature_values < sorted_values.max()]
    names.append(name_feature)
    if feature_values.empty:
      ginis.append(np.nan)
      splits.append(np.nan)
      continue
    gini = vector_determine_gini_impurity_for_feature_and_split(y=y, X_df=X_df, name_feature=name_feature, threshold=feature_values)
    best = gini.argmin()
    ginis.append(gini[best])
    splits.append(feature_values.iloc[best])
  # Build the frame once instead of enlarging it row by row.
  return pd.DataFrame({"Gini": ginis, "Split": splits}, index=names)
# Rank the features by the Gini of their best split, lowest (best) first.
display(determine_best_split(y_train_df, X_train_df).sort_values(by="Gini"))

## 3. Train a decision tree using scikit-learn

Let us check that scikit-learn finds the same answer for the first split. To this end, we train a decision tree with a maximum depth of 1.

In [None]:
# A tree of depth 1 consists of the root split only. The fixed random_state
# makes the fit reproducible (scikit-learn breaks ties between equally good
# splits at random).
dtree = tree.DecisionTreeClassifier(max_depth=1, random_state=123)
dtree = dtree.fit(X_train_df, y_train_df)
# Label the nodes with the column names; the trailing semicolon hides the repr of the returned artists.
tree.plot_tree(dtree, feature_names=list(X_train_df.columns));

In [None]:
# Look up the feature used by the root split (node 0) of the tree fitted above
# instead of hard-coding its column position.
name_feature = X_train_df.columns[dtree.tree_.feature[0]]
sns.histplot(data=data_train_df, x=name_feature, hue="target", multiple="stack")
plt.show()

Let us check if our ad hoc code yields the same Gini for both leaves:

In [None]:
# Impurity of the unsplit training set: 2 * p * (1 - p) with p the fraction of ones.
p_train = np.mean(y_train_df.values)
print(f"Gini before splitting:  {(2 * p_train * (1 - p_train)):.03f}")
# Take feature and threshold of the root split (node 0) from the depth-1 tree
# fitted above, so both always match what scikit-learn actually selected.
root_feature = X_train_df.columns[dtree.tree_.feature[0]]
split_value = dtree.tree_.threshold[0]
gini, gini_left, gini_right = determine_gini_impurity_for_feature_and_split_ext(y_train_df, X_train_df, root_feature, split_value)
print(f"Gini left: {gini_left:.03f}.")
print(f"Gini right: {gini_right:.03f}.")
# `gini` already is the size-weighted average of the two child impurities.
print(f"Gini after splitting: {gini:.03f}")

Now that we understand how the decision tree arrives at its splits, we train a "full" tree, i.e. a tree without a restriction on its depth.

In [None]:
# Tree without depth restriction. The fixed random_state makes the fit
# reproducible (ties between equally good splits are broken at random).
dtree = tree.DecisionTreeClassifier(random_state=123)
dtree = dtree.fit(X_train_df, y_train_df)
# Label the nodes with the column names; the trailing semicolon hides the repr of the returned artists.
tree.plot_tree(dtree, feature_names=list(X_train_df.columns));

Use the tree to predict $Y$ for an observation $X$:

In [None]:
# Select the first training row by position. The training index is a shuffled
# subset of the original row labels, so a label lookup such as .loc[0] fails
# whenever row 0 ended up in the test set.
x_df = X_train_df.iloc[[0]]
x = x_df.iloc[0]
print("For\n\n")
print(x)
# Predict on the one-row DataFrame so the model receives the column names it was fitted with.
print(f"\n\nthe tree yields Y_hat={dtree.predict(x_df)[0]}.")
print(f"The true outcome is {y_train_df.iloc[0, 0]}.")

Now we "score" the full training dataset:

In [None]:
# Predicted class for every row of the training set.
y_train_pred = dtree.predict(X_train_df)

In [None]:
# Show the raw predictions.
print(y_train_pred)

Let us determine the confusion matrix:

In [None]:
# NOTE(review): class_names is not used in the visible code; confirm whether it is still needed.
class_names = [0, 1]
def confusion_matrix(model_object, X_df, y_df):
    """Plot the confusion matrix of a fitted classifier and print its accuracy.

    Parameters
    ----------
    model_object : fitted scikit-learn classifier.
    X_df : features to score.
    y_df : true labels belonging to the rows of `X_df`.

    The matrix is drawn with `ConfusionMatrixDisplay.from_estimator` when the
    installed scikit-learn provides it; otherwise the older
    `plot_confusion_matrix` helper imported at the top of the notebook is used.
    """
    # Local import keeps this cell independent of the import cell.
    from sklearn.metrics import ConfusionMatrixDisplay
    if hasattr(ConfusionMatrixDisplay, "from_estimator"):
        disp = ConfusionMatrixDisplay.from_estimator(model_object, X_df, y_df)
    else:
        disp = plot_confusion_matrix(model_object, X_df, y_df)
    disp.ax_.set_title("Confusion matrix")
    plt.show()
    y_hat = model_object.predict(X_df)
    accuracy = accuracy_score(y_df, y_hat, normalize=True)
    print(f"The accuracy is {100 * accuracy:0.2f}%.")
# Performance of the tree on the data it was trained on.
confusion_matrix(dtree, X_train_df, y_train_df)

Next we evaluate the model on the test data:

In [None]:
# Same evaluation on the held-out test rows.
confusion_matrix(dtree, X_test_df, y_test_df)

Let us also fit a decision tree with restricted depth and evaluate its performance on the train and test set:

In [None]:
# Tree restricted to depth 3. The fixed random_state makes the fit
# reproducible (ties between equally good splits are broken at random).
dtree = tree.DecisionTreeClassifier(max_depth=3, random_state=123)
dtree = dtree.fit(X_train_df, y_train_df)
# Evaluate on the training rows first, then on the held-out test rows.
confusion_matrix(dtree, X_train_df, y_train_df)
confusion_matrix(dtree, X_test_df, y_test_df)