In [2]:
"""
Decision tree max_leaf_nodes sweep

This function evaluates a range of values for DecisionTreeClassifier's max_leaf_nodes,
scores the classifier for each value using cross-validation, and returns two things:
a pandas DataFrame with the mean and std cross-validation scores for every value tested,
and a model refit on the full data using the best-scoring value.

"""


from typing import Iterable, Optional, Dict, Any, Tuple
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.base import clone

def sweep_max_leaf_nodes(
    X,
    y,
    max_leaf_nodes_range: Iterable[int],
    *,
    clf_params: Optional[Dict[str, Any]] = None,
    cv: int = 5,
    scoring: Optional[str] = None,
    verbose: bool = False
) -> "Tuple[pd.DataFrame, DecisionTreeClassifier]":
    """
    Evaluate DecisionTreeClassifier over a range of max_leaf_nodes.

    Parameters
    - X, y: training data (array-like / DataFrame / Series).
    - max_leaf_nodes_range: iterable of integers to test. Values are converted with
      int(), de-duplicated and sorted; values below 2 are skipped because
      DecisionTreeClassifier only accepts max_leaf_nodes >= 2.
    - clf_params: optional dict of extra DecisionTreeClassifier parameters. The dict is
      not modified. A 'max_leaf_nodes' entry, if present, is ignored since that is the
      parameter being swept.
    - cv: cross-validation folds (default 5).
    - scoring: scoring string passed to cross_val_score (default None -> estimator.score).
    - verbose: if True prints progress.

    Returns
    - results_df: pandas DataFrame with columns ['max_leaf_nodes', 'mean_score', 'std_score'] sorted by max_leaf_nodes.
    - best_model: DecisionTreeClassifier fitted on the full X,y with the best max_leaf_nodes (highest mean_score).
      Ties are resolved in favour of the smallest max_leaf_nodes.

    Raises
    - ValueError: if no value >= 2 remains after filtering max_leaf_nodes_range, or if
      every candidate's mean cross-validation score is NaN.
    """
    # Work on a private copy so the caller's dict is never mutated, and drop any
    # max_leaf_nodes entry that would otherwise collide with the swept keyword.
    base_params = dict(clf_params) if clf_params else {}
    base_params.pop("max_leaf_nodes", None)

    # Convert each value once; the set removes duplicates, sorted() fixes the order.
    candidates = sorted({m for m in map(int, max_leaf_nodes_range) if m >= 2})
    if not candidates:
        raise ValueError(
            "max_leaf_nodes_range must contain at least one integer >= 2"
        )

    rows = []
    for m in candidates:
        clf = DecisionTreeClassifier(max_leaf_nodes=m, **base_params)
        scores = cross_val_score(clf, X, y, cv=cv, scoring=scoring)
        # np.std defaults to the population standard deviation (ddof=0).
        mean, std = float(np.mean(scores)), float(np.std(scores))
        rows.append({"max_leaf_nodes": m, "mean_score": mean, "std_score": std})
        if verbose:
            print(f"max_leaf_nodes={m}  mean_score={mean:.4f}  std={std:.4f}")

    # candidates is already sorted and unique, so the frame comes out ordered by
    # max_leaf_nodes with a clean 0..n-1 index.
    results_df = pd.DataFrame(rows, columns=["max_leaf_nodes", "mean_score", "std_score"])

    if results_df["mean_score"].isna().all():
        raise ValueError(
            "all cross-validation scores are NaN; cannot select a best max_leaf_nodes"
        )

    # idxmax skips NaN and returns the first maximum, i.e. the smallest tree among ties.
    best_m = int(results_df.loc[results_df["mean_score"].idxmax(), "max_leaf_nodes"])
    best_model = DecisionTreeClassifier(max_leaf_nodes=best_m, **base_params)
    best_model.fit(X, y)

    return results_df, best_model

In [3]:
#
# Example usage
#

from sklearn.datasets import load_iris

# Load the iris data directly as a (features, labels) pair.
X, y = load_iris(return_X_y=True)

# Candidate leaf-node budgets: 2 through 30 inclusive.
range_to_test = range(2, 31)

# Fixed random_state keeps the trees reproducible between runs;
# verbose=True prints one progress line per candidate.
results, best_clf = sweep_max_leaf_nodes(
    X,
    y,
    range_to_test,
    clf_params={"random_state": 42},
    cv=5,
    scoring="accuracy",
    verbose=True,
)

# Show the first few rows of the score table, then the winning setting.
print(results.head())
best_params = best_clf.get_params()
print("Best max_leaf_nodes:", best_params["max_leaf_nodes"])


"""
Output explanation
- results is a pandas DataFrame with max_leaf_nodes, mean_score, and std_score for each tested value.
- best_clf is the DecisionTreeClassifier fitted on the full dataset with the max_leaf_nodes that achieved the highest mean cross-validation score.

"""

max_leaf_nodes=2  mean_score=0.6667  std=0.0000
max_leaf_nodes=3  mean_score=0.9333  std=0.0471
max_leaf_nodes=4  mean_score=0.9600  std=0.0249
max_leaf_nodes=5  mean_score=0.9667  std=0.0365
max_leaf_nodes=6  mean_score=0.9600  std=0.0327
max_leaf_nodes=7  mean_score=0.9667  std=0.0365
max_leaf_nodes=8  mean_score=0.9600  std=0.0327
max_leaf_nodes=9  mean_score=0.9600  std=0.0327
max_leaf_nodes=10  mean_score=0.9600  std=0.0327
max_leaf_nodes=11  mean_score=0.9600  std=0.0327
max_leaf_nodes=12  mean_score=0.9600  std=0.0327
max_leaf_nodes=13  mean_score=0.9600  std=0.0327
max_leaf_nodes=14  mean_score=0.9600  std=0.0327
max_leaf_nodes=15  mean_score=0.9600  std=0.0327
max_leaf_nodes=16  mean_score=0.9600  std=0.0327
max_leaf_nodes=17  mean_score=0.9600  std=0.0327
max_leaf_nodes=18  mean_score=0.9600  std=0.0327
max_leaf_nodes=19  mean_score=0.9600  std=0.0327
max_leaf_nodes=20  mean_score=0.9600  std=0.0327
max_leaf_nodes=21  mean_score=0.9600  std=0.0327
max_leaf_nodes=22  mean_scor