<a href="https://colab.research.google.com/github/isa-ulisboa/greends-pml/blob/main/notebooks/decision_tree_validation_curve.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.datasets import make_classification
from sklearn.model_selection import validation_curve
from sklearn import tree
import numpy as np
import matplotlib.pyplot as plt

# Validation curve for a decision tree: sweep max_depth, compare training and
# cross-validated accuracy, then print two summary diagnostics per depth.

# One seed shared by the data generator and the tree so reruns reproduce the
# same curves.
RANDOM_STATE = 42

# generate a toy binary-classification dataset
X, y = make_classification(
    n_samples=1000, n_features=10, n_classes=2, random_state=RANDOM_STATE)

# define the model; max_depth is left unset here because validation_curve
# sweeps it below.
# random_state is fixed because scikit-learn permutes the features at every
# split, so an unseeded tree can give different scores from run to run even
# on identical data.
model = tree.DecisionTreeClassifier(
    criterion='entropy', min_samples_leaf=4, random_state=RANDOM_STATE)

# define the range of hyperparameters to test (max_depth = 4, 5, ..., 9)
param_range = np.arange(4, 10)

# use validation_curve to compute training and validation scores for each
# max_depth value with 5-fold cross-validation; each returned array has one
# row per max_depth value and one column per fold
train_scores, test_scores = validation_curve(
    model, X, y,
    param_name="max_depth", param_range=param_range,
    cv=5,
    scoring="accuracy")

# collapse the fold axis: mean and standard deviation of the training and
# validation scores for each max_depth value
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

# plot the validation curves; the shaded bands are +/- one standard deviation
# across folds
plt.plot(param_range, train_mean, label="Training score", color="darkorange")
plt.fill_between(param_range, train_mean - train_std, train_mean + train_std,
                 alpha=0.2, color="darkorange")
plt.plot(param_range, test_mean, label="Cross-validation score", color="navy")
plt.fill_between(param_range, test_mean - test_std, test_mean + test_std,
                 alpha=0.2, color="navy")
plt.legend(loc="best")
plt.xlabel("max_depth")
plt.ylabel("Accuracy")
plt.show()

# calculate rough bias and variance proxies, one value per max_depth
# NOTE: these are heuristics, not the formal bias-variance decomposition:
#   "bias"     = squared mean cross-validation error, (1 - mean accuracy)^2
#   "variance" = variance of the accuracy across the CV folds
bias = (1 - test_mean) ** 2
variance = test_std ** 2

print("Bias:", bias)
print("Variance:", variance)
