# OpenML Regression Benchmark 2025

This notebook benchmarks a **custom Decision Trees → MLP architecture** against several standard scikit‑learn regressors on every dataset in the **`New_OpenML_Suite_2025_regression`**.

---

### Notebook outline

1. **Setup** – install/import libraries
2. **Configuration** – constants & hyper‑parameters
3. **Custom Model Definition** – placeholder implementation
4. **Helper Functions** – preprocessing & evaluation utilities
5. **Benchmark Loop** – iterate over datasets and models, log to Weights & Biases
6. **Results Summary** – aggregate and visualise results

> **Tip:** Sections flagged with `TODO` are hooks for deeper customisation.


In [1]:
# ---- Setup --------------------------------------------------------------
%pip install openml wandb scikit-learn pandas nbformat numpy --quiet

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import openml
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import cross_validate, KFold, train_test_split
from sklearn.metrics import (make_scorer, r2_score, mean_absolute_error, mean_squared_error, mean_absolute_percentage_error)
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from IPython.display import display

import wandb

# Authenticate W&B (expects environment variable or local API key)
wandb.login()
import copy
import joblib

Note: you may need to restart the kernel to use updated packages.


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mspicecat[0m ([33mspicecat-club[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
suites = openml.study.list_suites(output_format="dataframe", status="all")
display(suites)

Unnamed: 0,id,alias,main_entity_type,name,status,creation_date,creator
14,14,OpenML100,task,"Collaborative, reproducible benchmarking and a...",in_preparation,2019-02-21 18:40:13,1
99,99,OpenML-CC18,task,OpenML-CC18 Curated Classification benchmark,active,2019-02-21 18:47:13,1
218,218,AutoML-Benchmark,task,AutoML Benchmark,in_preparation,2019-05-02 13:35:08,869
219,219,FOREX,task,Forex,in_preparation,2019-06-04 00:45:17,1
225,225,OpenML-friendly,task,OpenML100-friendly,active,2019-09-16 19:41:46,1
...,...,...,...,...,...,...,...
451,451,f1f014076bfc49aaaa655f22efd10c18,task,Benchmark,in_preparation,2025-03-12 07:47:56,48020
452,452,db9adbe551614b1480abda01689ca6fd,task,Benchmark,in_preparation,2025-03-12 07:47:57,48020
453,453,ffbe01c051554f0ab15fede1dec6f63f,task,Meta AutoML Benchmark Suite,in_preparation,2025-03-14 19:30:38,38009
454,454,classification_2025_March_1,task,New_OpenML_Suite_2025_classification,in_preparation,2025-03-17 11:27:55,25914


In [3]:
# ---- Configuration ------------------------------------------------------
SUITE_NAME = (
    "8f0ea660163b436bbd4abd49665c7b1d"  # OpenML-CTR23 - A curated tabular regression benchmarking suite
)
WANDB_PROJECT = "OpenML_Regression_Benchmark_2025"
CV_FOLDS = 5
RANDOM_STATE = 42

# Scoring dictionary (positive = higher is better)
scoring = {
    "r2": make_scorer(r2_score),
    "mae": make_scorer(mean_absolute_error, greater_is_better=False),
    "rmse": make_scorer(
        lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)),
        greater_is_better=False,
    ),
}

# Baseline models
baseline_models = {
    "LinearRegression": LinearRegression(),
    # "Ridge": Ridge(random_state=RANDOM_STATE),
    # "Lasso": Lasso(random_state=RANDOM_STATE),
    # "SVR": SVR(),
    # "RandomForestRegressor": RandomForestRegressor(random_state=RANDOM_STATE),
    # "GradientBoostingRegressor": GradientBoostingRegressor(random_state=RANDOM_STATE),
    "MLPRegressor": MLPRegressor(random_state=RANDOM_STATE),
}

# Placeholder for our custom architecture (defined next)
from typing import Optional, Iterable

In [4]:
# ---- Custom DecisionTree→MLP Model -------------------------------------
from sklearn.base import BaseEstimator, RegressorMixin, clone
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils.validation import check_is_fitted
from scipy import sparse


class TreeMLPRegressor(BaseEstimator, RegressorMixin):
    """A simple tree feature extractor followed by an MLP regressor.

    Stage 1: Train `n_trees` individual `DecisionTreeRegressor`s (or a RandomForest).
    Stage 2: For each sample, produce one‑hot encoded leaf indices per tree.
    Stage 3: Concatenate original X with tree‑encoded features → feed to an MLP.

    Note: This is a minimal, *illustrative* implementation. Optimizations such
    as sparse handling, batching, or GPU acceleration are omitted for clarity.
    """

    def __init__(
        self,
        n_trees: int = 10,
        tree_max_depth: Optional[int] = None,
        mlp_hidden_layer_sizes: Iterable[int] = (100,),
        mlp_max_iter: int = 200,
        random_state: Optional[int] = None,
    ):
        self.n_trees = n_trees
        self.tree_max_depth = tree_max_depth
        self.mlp_hidden_layer_sizes = tuple(mlp_hidden_layer_sizes)
        self.mlp_max_iter = mlp_max_iter
        self.random_state = random_state

    # --------------------------- Fit ------------------------------------
    def fit(self, X, y):
        rng = np.random.RandomState(self.random_state)

        # Stage 1 – train independent Decision Trees
        self.trees_ = []
        for i in range(self.n_trees):
            tree = DecisionTreeRegressor(
                max_depth=self.tree_max_depth,
                random_state=rng.randint(0, 1_000_000),
            )
            tree.fit(X, y)
            self.trees_.append(tree)

        # Stage 2 – compute leaf indices for training data
        leaf_indices = [tree.apply(X).reshape(-1, 1) for tree in self.trees_]
        leaf_indices = np.hstack(leaf_indices)

        # One‑hot encode leaf indices
        self.ohe_ = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
        leaf_ohe = self.ohe_.fit_transform(leaf_indices)

        # Combine original features with tree features
        if sparse.issparse(leaf_ohe):
            X_aug = sparse.hstack([sparse.csr_matrix(X), leaf_ohe]).tocsr()
        else:
            X_aug = np.hstack([X, leaf_ohe])

        # Stage 3 – train the downstream MLP
        self.mlp_ = MLPRegressor(
            hidden_layer_sizes=self.mlp_hidden_layer_sizes,
            max_iter=self.mlp_max_iter,
            random_state=self.random_state,
        )
        self.mlp_.fit(X_aug, y)
        return self

    # ------------------------ Predict -----------------------------------
    def predict(self, X):
        check_is_fitted(self, ["trees_", "ohe_", "mlp_"])
        leaf_indices = np.hstack([tree.apply(X).reshape(-1, 1) for tree in self.trees_])
        leaf_ohe = self.ohe_.transform(leaf_indices)

        if sparse.issparse(leaf_ohe):
            X_aug = sparse.hstack([sparse.csr_matrix(X), leaf_ohe]).tocsr()
        else:
            X_aug = np.hstack([X, leaf_ohe])

        return self.mlp_.predict(X_aug)

In [9]:
# ---- Helper Utilities ---------------------------------------------------


def make_preprocessing_pipeline(X: pd.DataFrame):
    """Return a ColumnTransformer that imputes and scales/encodes appropriately."""
    numeric_features = X.select_dtypes(include=["int", "float"]).columns.tolist()
    categorical_features = X.select_dtypes(exclude=["int", "float"]).columns.tolist()

    numeric_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
        ]
    )

    categorical_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("encoder", OneHotEncoder(handle_unknown="ignore")),
        ]
    )

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ]
    )
    return preprocessor


def evaluate_model(model_name, model, X, y):
    from sklearn.model_selection import cross_val_predict
    """Create pipeline, perform cross‑validation, return metric means."""
    pre = make_preprocessing_pipeline(X)
    pipe = Pipeline(steps=[("pre", pre), ("model", model)])

    cv = KFold(n_splits=CV_FOLDS, shuffle=True, random_state=RANDOM_STATE)
    cv_results = cross_validate(
        pipe, X, y, cv=cv, scoring=scoring, return_train_score=False, n_jobs=-1
    )
    # Flip sign on MAE/RMSE (they were negated)
    metrics = {
        "r2": np.mean(cv_results["test_r2"]),
        "mae": -np.mean(cv_results["test_mae"]),
        "rmse": -np.mean(cv_results["test_rmse"]),
    }
    
    # Fit once on a train/test split for W&B visualizations
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
    pipe_fit = Pipeline(steps=[('pre', pre), ('model', copy.deepcopy(model))])
    pipe_fit.fit(X_train, y_train)
    wandb.sklearn.plot_regressor(pipe_fit, X_train, X_test, y_train, y_test, model_name=model_name)

    return metrics

In [10]:
# ---- Main Benchmark Loop -----------------------------------------------
results = []

suite = openml.study.get_suite(SUITE_NAME)
print(f"Evaluating {len(suite.tasks)} tasks from suite '{SUITE_NAME}'")

# Add custom model to list
models = baseline_models.copy()
# models["TreeMLPRegressor"] = TreeMLPRegressor(random_state=RANDOM_STATE)

for task_id in suite.tasks[:2]:
    try:
        task = openml.tasks.get_task(task_id)
        dataset = task.get_dataset()
        X, y, categorical_indicator, attr_names = dataset.get_data(
            target=dataset.default_target_attribute, dataset_format="dataframe"
        )

        print(f"\nDataset: {dataset.name} (task_id={task_id}) | shape={X.shape}")

        for name, model in models.items():
            run = wandb.init(
                project=WANDB_PROJECT,
                name=f"{dataset.name}-{name}",
                config={
                    "dataset_name": dataset.name,
                    "task_id": task_id,
                    "model": name,
                    "cv_folds": CV_FOLDS,
                    "random_state": RANDOM_STATE,
                    "model_params": (
                        model.get_params() if hasattr(model, "get_params") else {}
                    ),
                },
                reinit="finish_previous",
            )

            try:
                metrics = evaluate_model(name, model, X, y)
                wandb.log(metrics)
                # Store for local summary
                record = {
                    "dataset": dataset.name,
                    "task_id": task_id,
                    "model": name,
                    **metrics,
                }
                results.append(record)
            except Exception as model_exc:
                print(
                    f"⚠️  Error evaluating model '{name}' on '{dataset.name}': {model_exc}"
                )
                wandb.log({"error": str(model_exc)})
            finally:
                run.finish()
    except Exception as ds_exc:
        print(f"⚠️  Skipping task_id={task_id} due to error: {ds_exc}")

Evaluating 35 tasks from suite '8f0ea660163b436bbd4abd49665c7b1d'

Dataset: abalone (task_id=361234) | shape=(4177, 8)


[34m[1mwandb[0m: 
[34m[1mwandb[0m: Plotting LinearRegression.
[34m[1mwandb[0m: [32m[41mERROR[0m X contains values that are not numbers. Please vectorize, label encode or one hot encode X and call the plotting function again.
[34m[1mwandb[0m: [32m[41mERROR[0m X_test contains values that are not numbers. Please vectorize, label encode or one hot encode X_test and call the plotting function again.
[34m[1mwandb[0m: Logged summary metrics.
[34m[1mwandb[0m: [32m[41mERROR[0m X contains values that are not numbers. Please vectorize, label encode or one hot encode X and call the plotting function again.
[34m[1mwandb[0m: Logged learning curve.
[34m[1mwandb[0m: [32m[41mERROR[0m X contains values that are not numbers. Please vectorize, label encode or one hot encode X and call the plotting function again.
[34m[1mwandb[0m: Logged outlier candidates.
[34m[1mwandb[0m: [32m[41mERROR[0m X contains values that are not numbers. Please vectorize, label encode o

0,1
mae,▁
r2,▁
rmse,▁

0,1
mae,1.58555
r2,0.52571
rmse,2.21135


[34m[1mwandb[0m: 
[34m[1mwandb[0m: Plotting MLPRegressor.
[34m[1mwandb[0m: [32m[41mERROR[0m X contains values that are not numbers. Please vectorize, label encode or one hot encode X and call the plotting function again.
[34m[1mwandb[0m: [32m[41mERROR[0m X_test contains values that are not numbers. Please vectorize, label encode or one hot encode X_test and call the plotting function again.
[34m[1mwandb[0m: Logged summary metrics.
[34m[1mwandb[0m: [32m[41mERROR[0m X contains values that are not numbers. Please vectorize, label encode or one hot encode X and call the plotting function again.
[34m[1mwandb[0m: Logged learning curve.
[34m[1mwandb[0m: [32m[41mERROR[0m X contains values that are not numbers. Please vectorize, label encode or one hot encode X and call the plotting function again.
[34m[1mwandb[0m: Logged outlier candidates.
[34m[1mwandb[0m: [32m[41mERROR[0m X contains values that are not numbers. Please vectorize, label encode or on

0,1
mae,▁
r2,▁
rmse,▁

0,1
mae,1.50507
r2,0.56262
rmse,2.12482



Dataset: airfoil_self_noise (task_id=361235) | shape=(1503, 5)


[34m[1mwandb[0m: 
[34m[1mwandb[0m: Plotting LinearRegression.
[34m[1mwandb[0m: Logged summary metrics.
[34m[1mwandb[0m: Logged learning curve.
[34m[1mwandb[0m: Logged outlier candidates.
[34m[1mwandb[0m: Logged residuals.


0,1
mae,▁
r2,▁
rmse,▁

0,1
mae,3.7542
r2,0.50514
rmse,4.83142


[34m[1mwandb[0m: 
[34m[1mwandb[0m: Plotting MLPRegressor.
[34m[1mwandb[0m: Logged summary metrics.
[34m[1mwandb[0m: Logged learning curve.
[34m[1mwandb[0m: Logged outlier candidates.
[34m[1mwandb[0m: Logged residuals.


0,1
mae,▁
r2,▁
rmse,▁

0,1
mae,14.20294
r2,-6.01698
rmse,18.17611


In [7]:
# ---- Results Summary ----------------------------------------------------
results_df = pd.DataFrame(results)
display(results_df)

# Log aggregated table to W&B
if len(results_df):
    table = wandb.Table(dataframe=results_df)
    wandb.log({"benchmark_results": table})

In [8]:
# ---- Appendix -----------------------------------------------------------
import sys, sklearn, platform, openml

print("Python:", sys.version.split()[0])
print("Platform:", platform.platform())
print("scikit-learn:", sklearn.__version__)
print("OpenML:", openml.__version__)

Python: 3.13.3
Platform: Linux-5.15.0-135-generic-x86_64-with-glibc2.35
scikit-learn: 1.6.1
OpenML: 0.15.1


## W&B Sweeps
Set up hyper‑parameter sweeps with Weights & Biases to optimize model performance.

In [None]:

# ---- Sweep helper --------------------------------------------------------
def make_model_from_sweep(cfg):
    if cfg.model == "RandomForest":
        return RandomForestRegressor(
            n_estimators=cfg.rf_n_estimators,
            max_depth=cfg.rf_max_depth,
            random_state=RANDOM_STATE,
        )
    if cfg.model == "GradientBoosting":
        return GradientBoostingRegressor(
            n_estimators=cfg.gb_n_estimators,
            learning_rate=cfg.gb_lr,
            random_state=RANDOM_STATE,
        )
    if cfg.model == "MLP":
        return MLPRegressor(
            hidden_layer_sizes=tuple(cfg.mlp_layers),
            learning_rate_init=cfg.mlp_lr,
            max_iter=cfg.mlp_max_iter,
            random_state=RANDOM_STATE,
        )
    raise ValueError(cfg.model)


In [None]:

# ---- Define and run a W&B sweep ------------------------------------------
sweep_config = {
    "method": "bayes",
    "metric": {"name": "rmse", "goal": "minimize"},
    "parameters": {
        "dataset_idx": {"values": [0,1,2]},
            "model": {"values": ["RandomForest", "GradientBoosting", "MLP"]},
        "rf_n_estimators": {"values": [100, 200, 400]},
        "rf_max_depth": {"values": [None, 10, 20]},
        "gb_n_estimators": {"values": [100, 300]},
        "gb_lr": {"values": [0.05, 0.1, 0.2]},
        "mlp_layers": {"values": [(64,), (128, 64)]},
        "mlp_lr": {"values": [0.001, 0.01]},
        "mlp_max_iter": {"values": [200, 400]},
    },
}

sweep_id = wandb.sweep(sweep_config, project=WANDB_PROJECT)

# Example: run sweep on first dataset in the suite
suite = openml.study.get_suite(SUITE_NAME)
task_selected = openml.tasks.get_task(suite.tasks[cfg.dataset_idx])
ds = task_selected.get_dataset()
X_sweep, y_sweep, *_ = ds.get_data(
    target=ds.default_target_attribute, dataset_format="dataframe"
)

def sweep_train():
    with wandb.init() as run:
        cfg = wandb.config
        model = make_model_from_sweep(cfg)
        metrics = evaluate_model(cfg.model, model, X_sweep, y_sweep)
        wandb.log(metrics)

# Launch trials (adjust count or run multiple agents)
wandb.agent(sweep_id, function=sweep_train, count=20)


In [None]:

# ---- Reproducibility helpers ---------------------------------------------
def log_dataset_artifact(run, dataset):
    """Log OpenML dataset as W&B artifact and attach to run."""
    art = wandb.Artifact(dataset.name.replace(" ", "_"), type="dataset")
    if getattr(dataset, "url", None):
        art.add_reference(dataset.url)
    run.use_artifact(art)

def save_model_artifact(run, pipe_fitted, dataset_name, model_name):
    """Save fitted model pipeline and log to W&B."""
    out_path = f"/tmp/{dataset_name.replace(' ', '_')}_{model_name}_{uuid.uuid4().hex}.joblib"
    joblib.dump(pipe_fitted, out_path)
    art = wandb.Artifact(f"{dataset_name}-{model_name}", type="model")
    art.add_file(out_path)
    run.log_artifact(art)


In [None]:

# ---- Feature importance helper ------------------------------------------
from sklearn.inspection import permutation_importance

def log_permutation_importance(pipe_fitted, X_val, y_val, model_name):
    """Compute and log permutation importances for tree models."""
    try:
        pre = pipe_fitted.named_steps.get("pre")
        X_val_enc = pre.transform(X_val) if pre else X_val
        res = permutation_importance(pipe_fitted, X_val_enc, y_val, n_repeats=5, random_state=RANDOM_STATE, n_jobs=-1)
        imp_table = {f"perm_imp_{model_name}_{i}": val for i, val in enumerate(res.importances_mean)}
        wandb.log(imp_table)
    except Exception as exc:
        print(f"[perm importance skipped] {exc}")
