# ML500 Exercises: Trees, Ensembles & Boosting

These exercises cover decision trees, random forests, gradient boosting,
histogram-based gradient boosting with early stopping, and feature importance.

**Difficulty increases with each exercise.**

In [None]:
# ============================================================
# Setup: Run this cell first
# ============================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier,
    HistGradientBoostingClassifier
)
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.inspection import permutation_importance
from sklearn.datasets import load_iris, load_breast_cancer, make_classification

# Seed NumPy's global random generator from a single named constant so that
# results are repeatable across runs.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

print("Setup complete.")

---
## Exercise 1: Decision Tree Visualization with Different Depths

**Goal:** Train decision trees on the Iris dataset with different `max_depth`
values and visualize them using `plot_tree`.

**Tasks:**
1. Load the Iris dataset and split 80/20.
2. Train decision trees with `max_depth` = 2, 4, and None (unlimited).
3. Report test accuracy for each depth.
4. Visualize all three trees side by side using `plot_tree`.
5. Discuss the tradeoff between depth and generalization.

In [None]:
# Exercise 1 - Starter Code

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load Iris, then hold out 20% of the samples as a test set.
iris = load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Depth limits to compare; None places no cap on the depth.
depths = [2, 4, None]

# TODO 1: Train trees with different depths and record accuracies
# models = {}
# for depth in depths:
#     dt = DecisionTreeClassifier(max_depth=depth, random_state=42)
#     dt.fit(X_train, y_train)
#     acc = accuracy_score(y_test, dt.predict(X_test))
#     models[depth] = dt
#     print(f"max_depth={str(depth):>4s}  Test accuracy: {acc:.4f}  Tree depth: {dt.get_depth()}")

# TODO 2: Visualize all three trees
# fig, axes = plt.subplots(1, 3, figsize=(24, 8))
# for idx, depth in enumerate(depths):
#     plot_tree(
#         models[depth], ax=axes[idx],
#         feature_names=iris.feature_names,
#         class_names=iris.target_names,
#         filled=True, rounded=True, fontsize=8
#     )
#     axes[idx].set_title(f'max_depth={depth}')
# plt.tight_layout()
# plt.show()

### Solution

<details><summary>Click to reveal solution</summary>

```python
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load Iris, then hold out 20% of the samples as a test set.
iris = load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Depth limits to compare; None places no cap on the depth.
depths = [2, 4, None]

# 1. Fit one tree per depth limit and report how each does on the test set
models = {}
for depth in depths:
    dt = DecisionTreeClassifier(max_depth=depth, random_state=42).fit(X_train, y_train)
    models[depth] = dt
    acc = accuracy_score(y_test, dt.predict(X_test))
    # get_depth() is the depth the tree actually reached, which can be
    # smaller than the requested limit.
    print(f"max_depth={str(depth):>4s}  Test accuracy: {acc:.4f}  Tree depth: {dt.get_depth()}")

# 2. Draw the fitted trees next to each other, one panel per depth limit
fig, axes = plt.subplots(1, 3, figsize=(24, 8))
tree_style = dict(filled=True, rounded=True, fontsize=8)
for ax, (depth, tree) in zip(axes, models.items()):
    plot_tree(
        tree, ax=ax,
        feature_names=iris.feature_names,
        class_names=iris.target_names,
        **tree_style
    )
    ax.set_title(f'max_depth={depth}')
plt.tight_layout()
plt.show()

# Discussion:
# - A shallow tree (depth=2) is easy to read but may be too simple (underfit).
# - A tree with no depth limit can fit every training pattern and may overfit.
# - A moderate depth (e.g., 4) is often the best bias-variance compromise.
```

</details>

---
## Exercise 2: Decision Tree vs. Random Forest

**Goal:** Compare a single decision tree against a random forest (100 trees) on
the breast cancer dataset using 5-fold cross-validation.

**Tasks:**
1. Load the breast cancer dataset.
2. Create a `DecisionTreeClassifier` (`random_state=42`).
3. Create a `RandomForestClassifier` (100 trees, `random_state=42`).
4. Evaluate both with 5-fold cross-validation.
5. Print and compare the results.

In [None]:
# Exercise 2 - Starter Code

from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Load the breast cancer data: X holds the features, y the class labels.
data = load_breast_cancer()
X = data.data
y = data.target

# TODO 1: Create models
# dt = DecisionTreeClassifier(random_state=42)
# rf = RandomForestClassifier(n_estimators=100, random_state=42)

# TODO 2: 5-fold cross-validation for both
# dt_scores = cross_val_score(dt, X, y, cv=5, scoring='accuracy')
# rf_scores = cross_val_score(rf, X, y, cv=5, scoring='accuracy')

# TODO 3: Print comparison
# print(f"Decision Tree:  {dt_scores.mean():.4f} +/- {dt_scores.std():.4f}")
# print(f"Random Forest:  {rf_scores.mean():.4f} +/- {rf_scores.std():.4f}")
# print(f"\nImprovement: {(rf_scores.mean() - dt_scores.mean()):.4f}")

### Solution

<details><summary>Click to reveal solution</summary>

```python
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Load the breast cancer data: X holds the features, y the class labels
data = load_breast_cancer()
X = data.data
y = data.target

# 1. The two models under comparison
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# 2. Score both models with the same 5-fold cross-validation settings
cv_settings = dict(cv=5, scoring='accuracy')
dt_scores = cross_val_score(dt, X, y, **cv_settings)
rf_scores = cross_val_score(rf, X, y, **cv_settings)

# 3. Mean +/- standard deviation per model, then the gap between the means
for label, scores in (("Decision Tree", dt_scores), ("Random Forest", rf_scores)):
    print(f"{label}:  {scores.mean():.4f} +/- {scores.std():.4f}")
improvement = rf_scores.mean() - dt_scores.mean()
print(f"\nImprovement: {improvement:.4f}")
print("\nRandom Forest typically wins because it averages many trees,")
print("reducing variance while maintaining low bias.")
```

</details>

---
## Exercise 3: Gradient Boosting -- Learning Rate Comparison

**Goal:** Train `GradientBoostingClassifier` with different learning rates and
plot training vs. validation error across boosting iterations.

**Tasks:**
1. Load the breast cancer dataset and split 80/20.
2. Train GradientBoosting models with `learning_rate` = 0.01, 0.1, 0.5
   (use `n_estimators=200`).
3. For each model, use `staged_predict` to compute the training and test error after every boosting iteration.
4. Plot training and test error vs. number of iterations for all learning rates.
5. Discuss the effect of learning rate on convergence.

In [None]:
# Exercise 3 - Starter Code

from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the breast cancer data and hold out 20% of the samples for testing.
data = load_breast_cancer()
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Learning rates to compare, each trained for the same number of boosting stages.
n_estimators = 200
learning_rates = [0.01, 0.1, 0.5]

# TODO 1: Train models and collect staged scores
# fig, axes = plt.subplots(1, 3, figsize=(18, 5))
#
# for idx, lr in enumerate(learning_rates):
#     gb = GradientBoostingClassifier(
#         n_estimators=n_estimators, learning_rate=lr, random_state=42
#     )
#     gb.fit(X_train, y_train)
#
#     # Staged predictions: error at each boosting iteration
#     train_errors = []
#     test_errors = []
#     for y_train_pred in gb.staged_predict(X_train):
#         train_errors.append(1 - accuracy_score(y_train, y_train_pred))
#     for y_test_pred in gb.staged_predict(X_test):
#         test_errors.append(1 - accuracy_score(y_test, y_test_pred))
#
#     # Plot
#     axes[idx].plot(range(1, n_estimators + 1), train_errors, label='Train Error')
#     axes[idx].plot(range(1, n_estimators + 1), test_errors, label='Test Error')
#     axes[idx].set_xlabel('Number of Iterations')
#     axes[idx].set_ylabel('Error Rate')
#     axes[idx].set_title(f'Learning Rate = {lr}')
#     axes[idx].legend()
#
# plt.tight_layout()
# plt.show()

### Solution

<details><summary>Click to reveal solution</summary>

```python
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the breast cancer data and hold out 20% of the samples for testing.
data = load_breast_cancer()
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

learning_rates = [0.01, 0.1, 0.5]
n_estimators = 200

# One panel per learning rate. The grid is sized from the list instead of a
# literal 3, so adding or removing a learning rate neither raises an
# IndexError nor leaves empty panels. squeeze=False always returns a 2-D axes
# array (even for a single panel); its only row is then taken so `axes` stays
# a 1-D array of panels.
n_panels = len(learning_rates)
fig, axes = plt.subplots(1, n_panels, figsize=(6 * n_panels, 5), squeeze=False)
axes = axes[0]

for ax, lr in zip(axes, learning_rates):
    gb = GradientBoostingClassifier(
        n_estimators=n_estimators, learning_rate=lr, random_state=42
    )
    gb.fit(X_train, y_train)

    # staged_predict yields the ensemble's predictions after each boosting
    # iteration, so each list entry is the error rate at that iteration.
    train_errors = [
        1 - accuracy_score(y_train, y_pred)
        for y_pred in gb.staged_predict(X_train)
    ]
    test_errors = [
        1 - accuracy_score(y_test, y_pred)
        for y_pred in gb.staged_predict(X_test)
    ]

    # Build the x-axis from the number of stages actually produced rather
    # than assuming it equals n_estimators; the two lengths always match.
    iterations = range(1, len(train_errors) + 1)
    ax.plot(iterations, train_errors, label='Train Error')
    ax.plot(iterations, test_errors, label='Test Error')
    ax.set_xlabel('Number of Iterations')
    ax.set_ylabel('Error Rate')
    ax.set_title(f'Learning Rate = {lr}')
    ax.legend()

    final_acc = accuracy_score(y_test, gb.predict(X_test))
    print(f"LR={lr}: Final test accuracy = {final_acc:.4f}")

plt.tight_layout()
plt.show()

# Discussion:
# - Low learning rate (0.01): Slow convergence, may need more iterations.
# - Medium learning rate (0.1): Good balance of speed and accuracy.
# - High learning rate (0.5): Fast convergence but may overfit or oscillate.
```

</details>

---
## Exercise 4: HistGradientBoosting with Early Stopping

**Goal:** Use `HistGradientBoostingClassifier` with early stopping to
automatically determine the optimal number of boosting iterations.

**Tasks:**
1. Load the breast cancer dataset and split 80/20.
2. Train `HistGradientBoostingClassifier` with `early_stopping=True`,
   `max_iter=500`, `validation_fraction=0.15`, `n_iter_no_change=10`.
3. Report the best iteration (the one with the highest validation score) and the number of iterations actually run (`n_iter_`).
4. Report the test accuracy.
5. Compare against a model without early stopping (`max_iter=500`).

In [None]:
# Exercise 4 - Starter Code

from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the breast cancer data and hold out 20% of the samples for testing.
data = load_breast_cancer()
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# TODO 1: Train with early stopping
# hgb_early = HistGradientBoostingClassifier(
#     max_iter=500,
#     early_stopping=True,
#     validation_fraction=0.15,
#     n_iter_no_change=10,
#     random_state=42
# )
# hgb_early.fit(X_train, y_train)

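# TODO 2: Report the best iteration, the iterations used, and the test accuracy
# best_iter = int(np.argmax(hgb_early.validation_score_))  # entry 0 = before any boosting
# print(f"Best iteration: {best_iter}")
# print(f"Number of iterations used: {hgb_early.n_iter_}")
# print(f"Test accuracy (early stop): {accuracy_score(y_test, hgb_early.predict(X_test)):.4f}")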

# TODO 3: Train without early stopping for comparison
# hgb_full = HistGradientBoostingClassifier(
#     max_iter=500, early_stopping=False, random_state=42
# )
# hgb_full.fit(X_train, y_train)
# print(f"\nWithout early stopping:")
# print(f"Iterations used: 500")
# print(f"Test accuracy (no early stop): {accuracy_score(y_test, hgb_full.predict(X_test)):.4f}")

# TODO 4: Compare
# print(f"\nEarly stopping saved {500 - hgb_early.n_iter_} iterations.")

### Solution

<details><summary>Click to reveal solution</summary>

```python
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the breast cancer data and hold out 20% of the samples for testing.
data = load_breast_cancer()
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Single iteration budget shared by both models and the final comparison, so
# the numbers cannot drift apart.
max_iter = 500

# 1. With early stopping: 15% of the training data is held out internally and
#    boosting stops after 10 iterations without improvement on it.
hgb_early = HistGradientBoostingClassifier(
    max_iter=max_iter,
    early_stopping=True,
    validation_fraction=0.15,
    n_iter_no_change=10,
    random_state=42
)
hgb_early.fit(X_train, y_train)

# 2. Report
# Per the scikit-learn docs, validation_score_ has n_iter_ + 1 entries: entry 0
# is the ensemble before any boosting iteration, entry i the score after
# iteration i. Scores are higher-is-better, so the argmax index is the
# iteration count with the best validation score.
# NOTE(review): n_iter_ is expected to exceed best_iter because it also counts
# the n_iter_no_change patience rounds -- confirm on the printed output.
best_iter = int(np.argmax(hgb_early.validation_score_))
early_acc = accuracy_score(y_test, hgb_early.predict(X_test))
print("=== With Early Stopping ===")
print(f"Best iteration: {best_iter}")
print(f"Iterations used: {hgb_early.n_iter_}")
print(f"Test accuracy: {early_acc:.4f}")

# 3. Without early stopping: the full iteration budget is spent
hgb_full = HistGradientBoostingClassifier(
    max_iter=max_iter, early_stopping=False, random_state=42
)
hgb_full.fit(X_train, y_train)
full_acc = accuracy_score(y_test, hgb_full.predict(X_test))
print("\n=== Without Early Stopping ===")
# Read the count from the fitted model instead of repeating the literal.
print(f"Iterations used: {hgb_full.n_iter_}")
print(f"Test accuracy: {full_acc:.4f}")

# 4. Compare
print(f"\nEarly stopping saved {max_iter - hgb_early.n_iter_} iterations.")
print(f"Accuracy difference: {early_acc - full_acc:.4f}")
```

</details>

---
## Exercise 5: Permutation Importance vs. Built-in Feature Importance

**Goal:** Compare permutation importance (model-agnostic) with the built-in
`feature_importances_` of a Random Forest, and discuss differences.

**Tasks:**
1. Load the breast cancer dataset and split 80/20.
2. Train a `RandomForestClassifier` (100 trees).
3. Compute the built-in `feature_importances_`.
4. Compute permutation importance on the test set.
5. Plot both importance rankings side by side (top 10 features).

In [None]:
# Exercise 5 - Starter Code

from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance

# Load the breast cancer data; feature_names labels the columns of X.
data = load_breast_cancer()
X = data.data
y = data.target
feature_names = data.feature_names

# Hold out 20% of the samples for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# TODO 1: Train Random Forest
# rf = RandomForestClassifier(n_estimators=100, random_state=42)
# rf.fit(X_train, y_train)
# print(f"Test accuracy: {rf.score(X_test, y_test):.4f}")

# TODO 2: Get built-in feature importances
# builtin_imp = rf.feature_importances_

# TODO 3: Compute permutation importance
# perm_imp = permutation_importance(
#     rf, X_test, y_test, n_repeats=10, random_state=42
# )

# TODO 4: Get top 10 features for each method
# builtin_sorted_idx = np.argsort(builtin_imp)[::-1][:10]
# perm_sorted_idx = np.argsort(perm_imp.importances_mean)[::-1][:10]

# TODO 5: Plot side by side
# fig, axes = plt.subplots(1, 2, figsize=(16, 6))
#
# axes[0].barh(range(10), builtin_imp[builtin_sorted_idx])
# axes[0].set_yticks(range(10))
# axes[0].set_yticklabels(feature_names[builtin_sorted_idx])
# axes[0].set_xlabel('Importance')
# axes[0].set_title('Built-in Feature Importance (Gini)')
# axes[0].invert_yaxis()
#
# axes[1].barh(range(10), perm_imp.importances_mean[perm_sorted_idx])
# axes[1].set_yticks(range(10))
# axes[1].set_yticklabels(feature_names[perm_sorted_idx])
# axes[1].set_xlabel('Mean Accuracy Decrease')
# axes[1].set_title('Permutation Importance')
# axes[1].invert_yaxis()
#
# plt.tight_layout()
# plt.show()

### Solution

<details><summary>Click to reveal solution</summary>

```python
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance

# Load the breast cancer data; feature_names labels the columns of X.
data = load_breast_cancer()
X, y = data.data, data.target
feature_names = data.feature_names

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Number of features shown per ranking; defined once instead of repeating the
# literal in every slice and plotting call.
top_k = 10

# 1. Train
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
print(f"Test accuracy: {rf.score(X_test, y_test):.4f}")

# 2. Built-in importance
builtin_imp = rf.feature_importances_

# 3. Permutation importance, measured on the held-out test set
perm_imp = permutation_importance(
    rf, X_test, y_test, n_repeats=10, random_state=42
)

# 4. Indices of the top_k features per method, most important first
builtin_sorted_idx = np.argsort(builtin_imp)[::-1][:top_k]
perm_sorted_idx = np.argsort(perm_imp.importances_mean)[::-1][:top_k]

# 5. Plot both rankings side by side
panels = [
    (builtin_imp, builtin_sorted_idx,
     'Importance', 'Built-in Feature Importance (Gini)'),
    (perm_imp.importances_mean, perm_sorted_idx,
     'Mean Accuracy Decrease', 'Permutation Importance'),
]
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
for ax, (scores, order, xlabel, title) in zip(axes, panels):
    # Positions follow the number of selected features, so the plot stays
    # valid even if fewer than top_k features exist.
    positions = range(len(order))
    ax.barh(positions, scores[order])
    ax.set_yticks(positions)
    ax.set_yticklabels(feature_names[order])
    ax.set_xlabel(xlabel)
    ax.set_title(title)
    # Put the most important feature at the top of the chart.
    ax.invert_yaxis()

plt.tight_layout()
plt.show()

# Discussion:
# - Built-in importance is based on Gini impurity decrease (training data).
# - Permutation importance measures accuracy drop when a feature is shuffled (test data).
# - Permutation importance is more reliable as it uses the test set and is model-agnostic.
# - Built-in importance can be biased toward high-cardinality features.
# .tolist() converts NumPy string scalars to plain Python str, so the output
# reads ['name', ...] rather than [np.str_('name'), ...] under NumPy 2.
print("\nTop 5 features (built-in):", feature_names[builtin_sorted_idx[:5]].tolist())
print("Top 5 features (permutation):", feature_names[perm_sorted_idx[:5]].tolist())
```

</details>