# OpenML-CTR23 Regression Benchmark

This notebook benchmarks a variety of regression models on the OpenML-CTR23 suite (`OpenML-CTR23 - A curated tabular regression benchmarking suite`).


## Setup

In [1]:
%pip install -Uq pip black blackcellmagic scikit-learn openml optuna pandas joblib jupyter ipywidgets nbformat setuptools papermill
%load_ext blackcellmagic

Note: you may need to restart the kernel to use updated packages.


In [None]:
# Run-wide parameters; every later cell reads these instead of hard-coding values.
# Seed reused for estimator `random_state` arguments and the Optuna sampler.
SEED = 42
# Number of Optuna trials per hyperparameter search (forwarded as `n_trials`).
N_TRIALS = 1
# `cv` value used by the inner cross-validation of the HPO objective.
OBJECTIVE_CV = 2
# Upper bound on how many suite tasks are downloaded (slice bound on `suite.tasks`).
N_TASKS = 35
# Number of predefined task splits kept for the outer evaluation.
CV = 2

## Model Definitions

In [3]:
# Control Models
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor

# Registry of baseline ("control") estimators, keyed by the short name that
# later cells use to look up search spaces and label results.
models = dict(
    [
        ("Dummy", DummyRegressor(strategy="mean")),
        ("RF", RandomForestRegressor(max_depth=2, random_state=SEED)),
        ("LR", LinearRegression()),
        ("MLP", MLPRegressor(solver="sgd", random_state=SEED)),
    ]
)

In [4]:
# Test Models
from sklearn.base import BaseEstimator, RegressorMixin, clone
from sklearn.ensemble import StackingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor


class ForestMLPRegressor(RegressorMixin, BaseEstimator):
    """Stack several depth-limited decision trees under a final estimator.

    ``fit`` builds a ``StackingRegressor`` with ``n_trees`` base
    ``DecisionTreeRegressor`` instances and ``passthrough=True``, then fits it.
    The final estimator is a clone of ``final_estimator`` whose
    ``hidden_layer_sizes`` is overridden with
    ``final_estimator__hidden_layer_sizes``.

    Parameters
    ----------
    n_trees : int, default=5
        Number of base decision trees.
    trees_max_depth : int, default=3
        ``max_depth`` given to every base tree.
    final_estimator : estimator, default=MLPRegressor()
        Template for the stacking final estimator. It is cloned before use, so
        the object passed in (including the shared default) is never fitted.
    final_estimator__hidden_layer_sizes : int, default=100
        Value written to ``hidden_layer_sizes`` on the cloned final estimator.
    random_state : int or None, default=0
        Base seed; tree ``i`` receives ``random_state + i``. ``None`` is passed
        through unchanged to every tree.

    Attributes
    ----------
    model_ : StackingRegressor
        The fitted stacking ensemble (set by ``fit``).
    """

    # This constructor parameter contains "__", which scikit-learn's default
    # ``set_params`` interprets as "parameter ``hidden_layer_sizes`` of the nested
    # ``final_estimator``". Without special handling the attribute below would
    # never be updated by ``set_params`` and tuned values would be ignored.
    _HIDDEN_SIZES_PARAM = "final_estimator__hidden_layer_sizes"

    def __init__(
        self,
        n_trees=5,
        trees_max_depth=3,
        final_estimator=MLPRegressor(),
        final_estimator__hidden_layer_sizes=100,
        random_state=0,
    ):
        self.n_trees = n_trees
        self.trees_max_depth = trees_max_depth
        self.final_estimator = final_estimator
        self.final_estimator__hidden_layer_sizes = final_estimator__hidden_layer_sizes
        self.random_state = random_state

    def get_params(self, deep=True):
        """Return parameters, always reporting this estimator's own hidden-size value.

        With ``deep=True`` the base implementation would overwrite the
        ``final_estimator__hidden_layer_sizes`` entry with the nested
        estimator's ``hidden_layer_sizes``; restore the value actually used.
        """
        params = super().get_params(deep=deep)
        params[self._HIDDEN_SIZES_PARAM] = getattr(self, self._HIDDEN_SIZES_PARAM)
        return params

    def set_params(self, **params):
        """Set parameters, storing the hidden-size parameter on this estimator.

        ``final_estimator__hidden_layer_sizes`` is intercepted and assigned to
        the attribute of the same name; everything else is delegated to the
        base implementation.
        """
        remaining = dict(params)
        if self._HIDDEN_SIZES_PARAM in remaining:
            setattr(
                self,
                self._HIDDEN_SIZES_PARAM,
                remaining.pop(self._HIDDEN_SIZES_PARAM),
            )
        return super().set_params(**remaining)

    def _tree_seed(self, index):
        """Return the seed for tree ``index`` (``None`` stays ``None``)."""
        if self.random_state is None:
            return None
        return self.random_state + index

    def _make_stacking(self):
        """Build the unfitted ``StackingRegressor`` from the current parameters."""
        return StackingRegressor(
            estimators=[
                (
                    f"tree_{i}",
                    DecisionTreeRegressor(
                        max_depth=self.trees_max_depth,
                        random_state=self._tree_seed(i),
                    ),
                )
                for i in range(self.n_trees)
            ],
            # Clone first so the template estimator is never mutated.
            final_estimator=clone(self.final_estimator).set_params(
                hidden_layer_sizes=self.final_estimator__hidden_layer_sizes,
            ),
            passthrough=True,
        )

    def fit(self, X, y):
        """Build and fit the stacking ensemble on ``(X, y)``; returns ``self``."""
        self.model_ = self._make_stacking()
        self.model_.fit(X, y)
        return self

    def predict(self, X):
        """Predict with the ensemble fitted by ``fit``."""
        return self.model_.predict(X)


# Candidate ("test") estimators. Both reuse the control MLP as the stacking
# final estimator and pass the raw features through alongside tree predictions.
test_models = dict(
    TreeMLP=StackingRegressor(
        estimators=[("tree", DecisionTreeRegressor(random_state=SEED))],
        final_estimator=models["MLP"],
        passthrough=True,
    ),
    ForestMLP=ForestMLPRegressor(
        final_estimator=models["MLP"],
        random_state=SEED,
    ),
)

# Fold the candidates into the main registry so later cells treat all models alike.
models |= test_models

### Data Preprocessing


In [5]:
# Data Preprocessing
import numpy as np
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode into a dense array (categories unseen during fit are ignored).
categorical_transformer = Pipeline(
    steps=[
        ("categorical_imputer", SimpleImputer(strategy="most_frequent")),
        (
            "categorical_encoder",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
        ),
    ]
)

# Numeric columns: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline(
    steps=[
        ("numeric_imputer", SimpleImputer(strategy="median")),
        ("numeric_scaler", StandardScaler()),
    ]
)

# Route each column to its transformer by dtype; numeric block comes first.
preprocessor = make_column_transformer(
    (numeric_transformer, make_column_selector(dtype_include=np.number)),  # type: ignore
    (categorical_transformer, make_column_selector(dtype_include=["object", "category"])),  # type: ignore
    sparse_threshold=0.0,
)

# Wrap every registered estimator as preprocessor -> model.
# NOTE: this rebinds `models`; re-running the cell without re-running the
# model-definition cells nests pipelines inside pipelines.
models = dict(
    (name, Pipeline(steps=[("preprocessor", preprocessor), ("model", estimator)]))
    for name, estimator in models.items()
)

### Hyperparameter Optimization Configuration and Space


In [6]:
# Hyperparameter Optimization Space
def _int_range(low, high):
    """Describe an integer search dimension as a ``(kind, kwargs)`` pair."""
    return ("int", {"low": low, "high": high})


# Search space per model name. Keys are parameter paths relative to the
# pipeline's "model" step; an empty dict means nothing is tuned.
hpo_spaces = {
    "Dummy": {},
    "RF": {
        "n_estimators": _int_range(5, 100),
        "max_depth": _int_range(1, 20),
    },
    "LR": {},
    "MLP": {
        "hidden_layer_sizes": _int_range(5, 100),
    },
    "TreeMLP": {
        "tree__max_depth": _int_range(1, 10),
        "final_estimator__hidden_layer_sizes": _int_range(5, 100),
    },
    "ForestMLP": {
        "n_trees": _int_range(2, 10),
        "trees_max_depth": _int_range(1, 10),
        "final_estimator__hidden_layer_sizes": _int_range(5, 100),
    },
}

# Pair each pipeline with its search space: name -> (pipeline, hpo_space).
# NOTE: rebinds `models`; not safe to re-run in isolation.
models = dict(
    (name, (pipeline, hpo_spaces[name])) for name, pipeline in models.items()
)

In [7]:
# Hyperparameter Optimization Configuration
import optuna
from sklearn.base import BaseEstimator, RegressorMixin, clone
from sklearn.model_selection import cross_val_score

# Study configuration
hpo_config = dict(
    n_trials=N_TRIALS,
    timeout=None,
    show_progress_bar=False,
)

# Keyword arguments forwarded to `cross_val_score` inside the HPO objective
objective_cv_params = dict(cv=OBJECTIVE_CV, scoring="neg_mean_squared_error")


def create_objective(model, hpo_space, X, y):
    """Build an Optuna objective that cross-validates ``model`` on ``(X, y)``.

    Parameters
    ----------
    model : estimator
        Estimator whose parameters are set in place on every trial. Parameter
        names from ``hpo_space`` are prefixed with ``model__``.
    hpo_space : dict
        Maps a parameter name to ``(kind, kwargs)`` where ``kind`` is one of
        ``"int"``, ``"float"`` or ``"categorical"`` and ``kwargs`` is forwarded
        to the matching ``trial.suggest_*`` method.
    X, y
        Data handed to ``cross_val_score``.

    Returns
    -------
    callable
        ``objective(trial)`` returning the mean cross-validation score computed
        with the module-level ``objective_cv_params``.
    """

    def objective(trial: optuna.Trial) -> float:
        suggesters = {
            "int": trial.suggest_int,
            "float": trial.suggest_float,
            "categorical": trial.suggest_categorical,
        }
        sampled = {}
        for param, (kind, kwargs) in hpo_space.items():
            # The trial parameter name doubles as the set_params key, so
            # `study.best_params` can later be applied to the model directly.
            qualified = f"model__{param}"
            sampled[qualified] = suggesters[kind](qualified, **kwargs)
        model.set_params(**sampled)
        fold_scores = cross_val_score(model, X, y, **objective_cv_params)
        return fold_scores.mean()

    return objective


class TemplateRegressor(RegressorMixin, BaseEstimator):
    """Regressor that tunes ``model`` with Optuna before fitting it.

    Parameters
    ----------
    model : estimator
        Template estimator; it is cloned in ``fit`` and never modified.
    hpo_space : dict
        Search space passed to ``create_objective``. An empty dict means the
        model is fitted with its current parameters.

    Attributes
    ----------
    fitted_model_ : estimator
        Clone of ``model`` with the best parameters found, fitted on the full
        data passed to ``fit``.
    """

    def __init__(self, model, hpo_space):
        self.model = model
        self.hpo_space = hpo_space

    def fit(self, X, y):
        """Run the hyperparameter search on ``(X, y)``, then refit on all of it."""
        model = clone(self.model)
        optuna.logging.set_verbosity(optuna.logging.WARNING)
        # The objective returns a mean sklearn score (e.g. negated MSE), where
        # larger is better. Optuna minimises by default, which would select the
        # *worst* trial, so the direction must be set explicitly.
        study = optuna.create_study(
            direction="maximize",
            sampler=optuna.samplers.TPESampler(seed=SEED),
        )
        study.optimize(
            create_objective(model, self.hpo_space, X, y),
            **hpo_config,
        )
        # Trial parameter names are already valid `set_params` keys.
        model.set_params(**study.best_params)
        self.fitted_model_ = model.fit(X, y)
        return self

    def predict(self, X):
        """Predict with the tuned model fitted by ``fit``."""
        return self.fitted_model_.predict(X)

## Define Benchmark Suite and Tasks


In [8]:
# Define Benchmark Suite
import openml
from IPython.display import display

# Identifier handed to OpenML to look up the benchmark suite; the recorded
# output of this cell shows it resolving to suite ID 353.
SUITE_ID = "8f0ea660163b436bbd4abd49665c7b1d"  # OpenML-CTR23
suite = openml.study.get_suite(SUITE_ID)
# Show the suite summary, then print its free-text description.
display(suite)
print(suite.description)

OpenML Benchmark Suite
ID..............: 353
Name............: OpenML-CTR23 - A curated tabular regression benchmarking suite
Status..........: active
Main Entity Type: task
Study URL.......: https://www.openml.org/s/353
# of Data.......: 35
# of Tasks......: 35
Creator.........: https://www.openml.org/u/30127
Upload Time.....: 2023-05-31 16:39:49

Inclusion Criteria:  

* There are between 500 and 100000 observations.
* There are less than 5000 features after one-hot encoding all categorical features.
* The dataset is not in a sparse format.
* The observations are i.i.d., which means that we exclude datasets that have time dependencies or require grouped data splits.
* The dataset comes with a source or reference that clearly describes it.
* We did not consider the dataset to be artificial, but allowed simulated datasets.
* The data is not a subset of a larger dataset.
* There is a numeric target variable with at least 5 different values.
* The dataset is not trivially solvable by a linear model, i.e. the training error of a linear model fitted to the whole data has an R2 of less than 1.
* The dataset does not have ethical concerns.
* The use of the dataset for benchmarking is not forbidden.

In addition to the datasets, the OpenML tasks also contain resampling splits, which were determined according to the following rule: If th

In [9]:
# Download tasks from the suite
# `suite.tasks or []` falls back to an empty list when the suite carries no
# task ids; only the first N_TASKS ids are requested.
tasks = openml.tasks.get_tasks(
    (suite.tasks or [])[:N_TASKS], download_data=True, download_qualities=True
)
display(tasks)

[OpenML Regression Task
 Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_REGRESSION
 Task ID..............: 361234
 Task URL.............: https://www.openml.org/t/361234
 Estimation Procedure.: crossvalidation
 Target Feature.......: rings,
 OpenML Regression Task
 Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_REGRESSION
 Task ID..............: 361235
 Task URL.............: https://www.openml.org/t/361235
 Estimation Procedure.: crossvalidation
 Target Feature.......: sound_pressure,
 OpenML Regression Task
 Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_REGRESSION
 Task ID..............: 361236
 Task URL.............: https://www.openml.org/t/361236
 Estimation Procedure.: crossvalidation
 Target Feature.......: verification.time,
 OpenML Regression Task
 Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_REGRESSION
 Task ID..............: 361237
 Task URL.............: https://www.openml.org/t/36123

In [10]:
import warnings

import pandas as pd
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import cross_validate

# Keyword arguments shared by every outer `cross_validate` call. The first
# entry of "scoring" is the metric later used for ranking.
cv_params = dict(
    scoring=["neg_mean_squared_error", "r2"],
    return_train_score=True,
)


def run_model_on_task(model_name, model, X, y, cv):
    """Cross-validate one tuned model on one task and average the fold results.

    Parameters
    ----------
    model_name : str
        Label returned unchanged alongside the results.
    model : iterable
        Positional arguments for ``TemplateRegressor`` (pipeline, search space).
    X, y
        Task data passed to ``cross_validate``.
    cv
        Cross-validation specification passed to ``cross_validate``.

    Returns
    -------
    tuple
        ``(model_name, series)`` where ``series`` holds the per-column mean of
        the ``cross_validate`` result.
    """
    # print(f"Running {model_name=}")
    # Installed inside the function so the filter is also applied when this
    # runs in a joblib worker.
    warnings.filterwarnings("ignore", category=ConvergenceWarning)
    estimator = TemplateRegressor(*model)
    fold_results = cross_validate(estimator, X, y, cv=cv, **cv_params)
    fold_means = pd.DataFrame(fold_results).mean(axis=0)
    return model_name, fold_means

### Benchmarking Execution


In [11]:
from joblib import Parallel, delayed

# Choose which registered models to benchmark; swap the two lines below to run
# every model instead of the hand-picked subset.
# selected_models = models
selected_models = {m: models[m] for m in ["MLP"]}

# runs[task_id][model_name] -> Series of mean cross-validation results.
runs = {}
for task in tasks:
    splits = task.download_split().split
    X, y = task.get_X_and_y(dataset_format="dataframe")  # type: ignore
    # Keep the first CV predefined splits of the task.
    # NOTE(review): presumably `splits[0]` is repeat 0 and `s[0]` is sample 0 of
    # each fold, yielding (train, test) index pairs — confirm against openml docs.
    cv = [s[0] for s in splits[0].values()][:CV]
    # Models are evaluated in parallel within a task; tasks run sequentially.
    results = Parallel(n_jobs=-1)(
        delayed(run_model_on_task)(model_name, model, X, y, cv)
        for model_name, model in selected_models.items()
    )
    runs[task.id] = dict(results)  # type: ignore

  ret = a @ b
  ret = a @ b
  return ((y_true - y_pred) ** 2).mean() / 2
  ret = a @ b
  ret = a @ b
[W 2025-05-21 16:13:17,421] Trial 0 failed with parameters: {'model__hidden_layer_sizes': 40} because of the following error: ValueError('\nAll the 2 fits failed.\nIt is very likely that your model is misconfigured.\nYou can try to debug the error by setting error_score=\'raise\'.\n\nBelow are more details about the failures:\n--------------------------------------------------------------------------------\n2 fits failed with the following error:\nTraceback (most recent call last):\n  File "/home/ahteh/Developer/research/.venv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score\n    estimator.fit(X_train, y_train, **fit_params)\n    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File "/home/ahteh/Developer/research/.venv/lib/python3.13/site-packages/sklearn/base.py", line 1389, in wrapper\n    return fit_method(estimator, *args, **kwargs)\

ValueError: 
All the 2 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2 fits failed with the following error:
Traceback (most recent call last):
  File "/home/ahteh/Developer/research/.venv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_1575843/1907628840.py", line 43, in fit
  File "/home/ahteh/Developer/research/.venv/lib/python3.13/site-packages/optuna/study/study.py", line 475, in optimize
    _optimize(
    ~~~~~~~~~^
        study=self,
        ^^^^^^^^^^^
    ...<7 lines>...
        show_progress_bar=show_progress_bar,
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "/home/ahteh/Developer/research/.venv/lib/python3.13/site-packages/optuna/study/_optimize.py", line 63, in _optimize
    _optimize_sequential(
    ~~~~~~~~~~~~~~~~~~~~^
        study,
        ^^^^^^
    ...<8 lines>...
        progress_bar=progress_bar,
        ^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "/home/ahteh/Developer/research/.venv/lib/python3.13/site-packages/optuna/study/_optimize.py", line 160, in _optimize_sequential
    frozen_trial = _run_trial(study, func, catch)
  File "/home/ahteh/Developer/research/.venv/lib/python3.13/site-packages/optuna/study/_optimize.py", line 248, in _run_trial
    raise func_err
  File "/home/ahteh/Developer/research/.venv/lib/python3.13/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_1575843/1907628840.py", line 29, in objective
  File "/home/ahteh/Developer/research/.venv/lib/python3.13/site-packages/sklearn/utils/_param_validation.py", line 216, in wrapper
    return func(*args, **kwargs)
  File "/home/ahteh/Developer/research/.venv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 684, in cross_val_score
    cv_results = cross_validate(
        estimator=estimator,
    ...<9 lines>...
        error_score=error_score,
    )
  File "/home/ahteh/Developer/research/.venv/lib/python3.13/site-packages/sklearn/utils/_param_validation.py", line 216, in wrapper
    return func(*args, **kwargs)
  File "/home/ahteh/Developer/research/.venv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 431, in cross_validate
    _warn_or_raise_about_fit_failures(results, error_score)
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ahteh/Developer/research/.venv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 517, in _warn_or_raise_about_fit_failures
    raise ValueError(all_fits_failed_message)
ValueError: 
All the 2 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2 fits failed with the following error:
Traceback (most recent call last):
  File "/home/ahteh/Developer/research/.venv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ahteh/Developer/research/.venv/lib/python3.13/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/ahteh/Developer/research/.venv/lib/python3.13/site-packages/sklearn/pipeline.py", line 662, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
    ~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ahteh/Developer/research/.venv/lib/python3.13/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/ahteh/Developer/research/.venv/lib/python3.13/site-packages/sklearn/neural_network/_multilayer_perceptron.py", line 754, in fit
    return self._fit(X, y, incremental=False)
           ~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ahteh/Developer/research/.venv/lib/python3.13/site-packages/sklearn/neural_network/_multilayer_perceptron.py", line 496, in _fit
    raise ValueError(
    ...<2 lines>...
    )
ValueError: Solver produced non-finite parameter weights. The input data may contain large values and need to be preprocessed.



### Aggregate Results


In [None]:
# %%capture output
# Aggregate Results
import pandas as pd

# Rows are model names, columns are task ids; each cell is the Series of mean
# cross-validation results returned by `run_model_on_task`.
runs_df = pd.DataFrame(runs)
metrics = cv_params["scoring"]

# metric name -> (test scores, train scores), each shaped like `runs_df`.
metrics_df: dict[str, tuple[pd.DataFrame, pd.DataFrame]] = {}
for metric in metrics:
    metrics_df[metric] = (
        runs_df.map(lambda r: r[f"test_{metric}"]),
        runs_df.map(lambda r: r[f"train_{metric}"]),
    )


def _rank_with_summary(scores: pd.DataFrame) -> pd.DataFrame:
    """Rank models within each task (1 = highest score) and append summaries.

    ``avg_rank`` and ``std_rank`` are both computed from the per-task rank
    columns only. Computing ``std_rank`` after ``avg_rank`` has been added as a
    column would fold the average into the standard deviation.
    """
    ranks = scores.rank(axis=0, ascending=False)
    return ranks.assign(
        avg_rank=ranks.mean(axis=1),
        std_rank=ranks.std(axis=1),
    )


# Use first metric for ranking
test_ranks_df = _rank_with_summary(metrics_df[metrics[0]][0])
train_ranks_df = _rank_with_summary(metrics_df[metrics[0]][1])

metrics_df["rank"] = (test_ranks_df, train_ranks_df)

# Display all metrics and ranks
for metric, (test, train) in metrics_df.items():
    display(f"test_{metric}", test, f"train_{metric}", train)

'test_neg_mean_squared_error'

Unnamed: 0,361234,361235
MLP,-5.189078,-15.938179


'train_neg_mean_squared_error'

Unnamed: 0,361234,361235
MLP,-4.219045,-16.880282


'test_r2'

Unnamed: 0,361234,361235
MLP,0.503856,0.653468


'train_r2'

Unnamed: 0,361234,361235
MLP,0.593732,0.645399


'test_rank'

Unnamed: 0,361234,361235,avg_rank,std_rank
MLP,1.0,1.0,1.0,0.0


'train_rank'

Unnamed: 0,361234,361235,avg_rank,std_rank
MLP,1.0,1.0,1.0,0.0


In [None]:
# output.show()

In [None]:
# # Single model evaluation
# from sklearn.model_selection import cross_validate

# task = tasks[0]
# splits = task.download_split().split
# X, y = task.get_X_and_y(dataset_format="dataframe")  # type: ignore
# cv = [s[0] for s in splits[0].values()][:3]  # Using pre-defined OpenML splits
# model = TemplateRegressor(*models["ForestMLP"])
# run = cross_validate(
#     model,
#     X,
#     y,
#     cv=cv,
#     **cv_params,
# )
# display(pd.DataFrame(run).mean(axis=0))