# OpenML-CTR23 Regression Benchmark

This notebook benchmarks several regression models on the tasks of the OpenML benchmark suite `OpenML-CTR23 - A curated tabular regression benchmarking suite` (https://www.openml.org/s/353).


In [1]:
# %pip install -q pip openml scikit-learn pandas nbformat setuptools

In [2]:
# Setup
import openml
from IPython.display import display

In [None]:
# Data Preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer


def make_preprocessor(dataset: openml.OpenMLDataset) -> ColumnTransformer:
    """Builds the column-wise preprocessing step for an OpenML dataset.

    Numeric columns are median-imputed and then standardized. Nominal columns
    are one-hot encoded into a dense array (``handle_unknown="ignore"``). The
    dataset's default target attribute is excluded when the column lists are
    looked up.

    Args:
        dataset: The OpenML dataset whose feature types select the columns.

    Returns:
        An unfitted ColumnTransformer with a "numeric_preprocessor" branch and
        a "nominal_preprocessor" branch.
    """
    # NOTE(review): the column lists are computed with the dataset's *default*
    # target excluded; this assumes the feature matrix later passed to the
    # transformer has exactly that column removed -- confirm for tasks whose
    # target differs from the dataset default.
    target_name = dataset.default_target_attribute

    numeric_columns = dataset.get_features_by_type("numeric", exclude=[target_name])
    nominal_columns = dataset.get_features_by_type("nominal", exclude=[target_name])

    numeric_steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
    nominal_steps = [
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]

    # Branch and step names are kept stable: they show up in the flow name
    # reported for each run.
    branches = [
        ("numeric_preprocessor", Pipeline(steps=numeric_steps), numeric_columns),
        ("nominal_preprocessor", Pipeline(steps=nominal_steps), nominal_columns),
    ]
    return ColumnTransformer(transformers=branches, sparse_threshold=0.0)

In [4]:
# Baseline Models
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor

# Registry of estimators to benchmark. The key is the name used to look up a
# grid in `hpo_grid` and to label each run in the output. The entries that are
# commented out are the only ones with a grid in `hpo_grid`, so with the
# current selection every model runs as a plain pipeline without grid search.
models = dict(
    DummyRegressor=DummyRegressor(strategy="mean"),
    # RandomForestRegressor=RandomForestRegressor(),
    LinearRegression=LinearRegression(),
    # MLPRegressor=MLPRegressor(),
)

In [5]:
# Hyperparameter Optimization
from sklearn.base import clone
from sklearn.model_selection import GridSearchCV, KFold

# General search settings.
# NOTE(review): no cell visible in this notebook reads `search_params`; the
# grid search below is configured independently (4-fold KFold vs. `cv=5` here).
# `n_trials`, `timeout` and `random_state` are not GridSearchCV arguments, so
# this dict cannot be splatted into GridSearchCV as-is -- confirm the intended
# search backend before wiring it in.
search_params = dict(
    cv=5,
    n_jobs=-1,
    n_trials=4,
    random_state=42,
    return_train_score=False,
    # scoring="neg_mean_squared_error",
    timeout=None,
    verbose=0,
)

# Hyperparameter grids per model, keyed by the same names as `models`.
# Parameter names are written without the pipeline step prefix; the
# "model__" prefix is added when the grid search is built. A model without an
# entry here is run as a plain pipeline.
hpo_grid = dict(
    RandomForestRegressor=dict(
        n_estimators=[5, 10],
        max_depth=[2, 5],
    ),
    MLPRegressor=dict(
        hidden_layer_sizes=[(5,), (10,)],
        learning_rate_init=[1e-3, 1e-1],
        max_iter=[100, 200],
    ),
)


def make_hpo_model(
    preprocessor: ColumnTransformer, model_name: str
) -> GridSearchCV | Pipeline:
    """Builds the estimator to benchmark for ``model_name``.

    The preprocessor and a fresh copy of the registered model are chained in a
    Pipeline with the steps "preprocessor" and "model". If ``hpo_grid`` has an
    entry for ``model_name``, the pipeline is wrapped in a GridSearchCV over
    that grid using a shuffled 4-fold KFold; otherwise the plain pipeline is
    returned.

    Args:
        preprocessor: Transformer used as the first pipeline step.
        model_name: Key into ``models`` (and, optionally, ``hpo_grid``).

    Returns:
        A GridSearchCV wrapping the pipeline when a grid is registered for the
        model, else the Pipeline itself.

    Raises:
        KeyError: If ``model_name`` is not a key of ``models``.
    """
    # Clone the registered estimator so each call gets its own unfitted copy.
    # Embedding the shared instance from `models` directly would let any
    # in-place parameter change made downstream (e.g. seeding via set_params)
    # leak into every other pipeline built from the same registry entry.
    estimator = clone(models[model_name])
    pipe = Pipeline([("preprocessor", preprocessor), ("model", estimator)])

    if model_name not in hpo_grid:
        return pipe

    # Grid keys are declared per model; prefix them so they address the
    # "model" step of the pipeline.
    param_grid = {f"model__{k}": v for k, v in hpo_grid[model_name].items()}

    # NOTE(review): random_state is left unset on the KFold here; presumably
    # the runner seeds it from its `seed` argument -- confirm.
    return GridSearchCV(pipe, param_grid, cv=KFold(n_splits=4, shuffle=True))

In [6]:
def evaluate_model(task: openml.OpenMLTask, model_name: str, seed: int = 42):
    """Runs one registered model on one OpenML task.

    A preprocessor is derived from the task's dataset, combined with the model
    registered under ``model_name``, and the resulting estimator is executed
    on the task through ``openml.runs.run_model_on_task``.

    Args:
        task: The OpenML task to run on.
        model_name: Name of the model to evaluate, as understood by
            ``make_hpo_model``.
        seed: Seed forwarded to ``run_model_on_task``.

    Returns:
        The run object returned by ``openml.runs.run_model_on_task``.
    """
    estimator = make_hpo_model(make_preprocessor(task.get_dataset()), model_name)
    return openml.runs.run_model_on_task(estimator, task, seed=seed, n_jobs=-1)

In [7]:
# Configuration
# Identifier of the benchmark suite to load. In the recorded output below it
# resolves to OpenML suite 353, which lists 35 tasks.
SUITE_ID = "8f0ea660163b436bbd4abd49665c7b1d"  # OpenML-CTR23 - A curated tabular regression benchmarking suite
# Look the suite up through the OpenML client and show its summary.
suite = openml.study.get_suite(SUITE_ID)
display(suite)

OpenML Benchmark Suite
ID..............: 353
Name............: OpenML-CTR23 - A curated tabular regression benchmarking suite
Status..........: active
Main Entity Type: task
Study URL.......: https://www.openml.org/s/353
# of Data.......: 35
# of Tasks......: 35
Creator.........: https://www.openml.org/u/30127
Upload Time.....: 2023-05-31 16:39:49

In [8]:
# Run Models
# Number of suite tasks to evaluate: only the first N_TASKS task ids of the
# suite are used (the suite summary above reports 35 tasks in total).
N_TASKS = 3

for task_id in suite.tasks[:N_TASKS]:
    # Resolve the task id to a task object and show its summary.
    task = openml.tasks.get_task(task_id)
    display(task)
    # Evaluate every registered model on this task and display the model name
    # followed by the resulting run summary.
    for model_name in models.keys():
        run = evaluate_model(task, model_name)
        display(model_name, run)

OpenML Regression Task
Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_REGRESSION
Task ID..............: 361234
Task URL.............: https://www.openml.org/t/361234
Estimation Procedure.: crossvalidation
Target Feature.......: rings

'DummyRegressor'

OpenML Run
Uploader Name..............: None
Metric.....................: None
Local Result - MAE (+- STD): 2.3628 +- 0.1077
Local Runtime - ms (+- STD): 18.0556 +- 3.0808
Run ID.....................: None
Task ID....................: 361234
Task Type..................: None
Task URL...................: https://www.openml.org/t/361234
Flow ID....................: None
Flow Name..................: sklearn.pipeline.Pipeline(preprocessor=sklearn.compose._column_transformer.ColumnTransformer(numeric_preprocessor=sklearn.pipeline.Pipeline(imputer=sklearn.impute._base.SimpleImputer,scaler=sklearn.preprocessing._data.StandardScaler),nominal_preprocessor=sklearn.pipeline.Pipeline(onehot=sklearn.preprocessing._encoders.OneHotEncoder)),model=sklearn.dummy.DummyRegressor)
Flow URL...................: None
Setup ID...................: None
Setup String...............: Python_3.13.3. Sklearn_1.6.1. NumPy_2.2.5. SciPy_1.15.2.
Dataset ID.................: 44956
Dataset URL................: https://ww



'LinearRegression'

OpenML Run
Uploader Name..............: None
Metric.....................: None
Local Result - MAE (+- STD): 1.5850 +- 0.0716
Local Runtime - ms (+- STD): 21.3776 +- 5.1716
Run ID.....................: None
Task ID....................: 361234
Task Type..................: None
Task URL...................: https://www.openml.org/t/361234
Flow ID....................: None
Flow Name..................: sklearn.pipeline.Pipeline(preprocessor=sklearn.compose._column_transformer.ColumnTransformer(numeric_preprocessor=sklearn.pipeline.Pipeline(imputer=sklearn.impute._base.SimpleImputer,scaler=sklearn.preprocessing._data.StandardScaler),nominal_preprocessor=sklearn.pipeline.Pipeline(onehot=sklearn.preprocessing._encoders.OneHotEncoder)),model=sklearn.linear_model._base.LinearRegression)
Flow URL...................: None
Setup ID...................: None
Setup String...............: Python_3.13.3. Sklearn_1.6.1. NumPy_2.2.5. SciPy_1.15.2.
Dataset ID.................: 44956
Dataset URL.............

OpenML Regression Task
Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_REGRESSION
Task ID..............: 361235
Task URL.............: https://www.openml.org/t/361235
Estimation Procedure.: crossvalidation
Target Feature.......: sound_pressure

'DummyRegressor'

OpenML Run
Uploader Name..............: None
Metric.....................: None
Local Result - MAE (+- STD): 5.6380 +- 0.2973
Local Runtime - ms (+- STD): 11.5454 +- 2.1022
Run ID.....................: None
Task ID....................: 361235
Task Type..................: None
Task URL...................: https://www.openml.org/t/361235
Flow ID....................: None
Flow Name..................: sklearn.pipeline.Pipeline(preprocessor=sklearn.compose._column_transformer.ColumnTransformer(numeric_preprocessor=sklearn.pipeline.Pipeline(imputer=sklearn.impute._base.SimpleImputer,scaler=sklearn.preprocessing._data.StandardScaler),nominal_preprocessor=sklearn.pipeline.Pipeline(onehot=sklearn.preprocessing._encoders.OneHotEncoder)),model=sklearn.dummy.DummyRegressor)
Flow URL...................: None
Setup ID...................: None
Setup String...............: Python_3.13.3. Sklearn_1.6.1. NumPy_2.2.5. SciPy_1.15.2.
Dataset ID.................: 44957
Dataset URL................: https://ww



'LinearRegression'

OpenML Run
Uploader Name..............: None
Metric.....................: None
Local Result - MAE (+- STD): 3.7512 +- 0.2356
Local Runtime - ms (+- STD): 15.3360 +- 4.5533
Run ID.....................: None
Task ID....................: 361235
Task Type..................: None
Task URL...................: https://www.openml.org/t/361235
Flow ID....................: None
Flow Name..................: sklearn.pipeline.Pipeline(preprocessor=sklearn.compose._column_transformer.ColumnTransformer(numeric_preprocessor=sklearn.pipeline.Pipeline(imputer=sklearn.impute._base.SimpleImputer,scaler=sklearn.preprocessing._data.StandardScaler),nominal_preprocessor=sklearn.pipeline.Pipeline(onehot=sklearn.preprocessing._encoders.OneHotEncoder)),model=sklearn.linear_model._base.LinearRegression)
Flow URL...................: None
Setup ID...................: None
Setup String...............: Python_3.13.3. Sklearn_1.6.1. NumPy_2.2.5. SciPy_1.15.2.
Dataset ID.................: 44957
Dataset URL.............

OpenML Regression Task
Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_REGRESSION
Task ID..............: 361236
Task URL.............: https://www.openml.org/t/361236
Estimation Procedure.: crossvalidation
Target Feature.......: verification.time

'DummyRegressor'

OpenML Run
Uploader Name..............: None
Metric.....................: None
Local Result - MAE (+- STD): 8262.8385 +- 323.8520
Local Runtime - ms (+- STD): 25.6880 +- 9.4362
Run ID.....................: None
Task ID....................: 361236
Task Type..................: None
Task URL...................: https://www.openml.org/t/361236
Flow ID....................: None
Flow Name..................: sklearn.pipeline.Pipeline(preprocessor=sklearn.compose._column_transformer.ColumnTransformer(numeric_preprocessor=sklearn.pipeline.Pipeline(imputer=sklearn.impute._base.SimpleImputer,scaler=sklearn.preprocessing._data.StandardScaler),nominal_preprocessor=sklearn.pipeline.Pipeline(onehot=sklearn.preprocessing._encoders.OneHotEncoder)),model=sklearn.dummy.DummyRegressor)
Flow URL...................: None
Setup ID...................: None
Setup String...............: Python_3.13.3. Sklearn_1.6.1. NumPy_2.2.5. SciPy_1.15.2.
Dataset ID.................: 44958
Dataset URL................: https



'LinearRegression'

OpenML Run
Uploader Name..............: None
Metric.....................: None
Local Result - MAE (+- STD): 4902.5903 +- 302.2368
Local Runtime - ms (+- STD): 29.5565 +- 11.5432
Run ID.....................: None
Task ID....................: 361236
Task Type..................: None
Task URL...................: https://www.openml.org/t/361236
Flow ID....................: None
Flow Name..................: sklearn.pipeline.Pipeline(preprocessor=sklearn.compose._column_transformer.ColumnTransformer(numeric_preprocessor=sklearn.pipeline.Pipeline(imputer=sklearn.impute._base.SimpleImputer,scaler=sklearn.preprocessing._data.StandardScaler),nominal_preprocessor=sklearn.pipeline.Pipeline(onehot=sklearn.preprocessing._encoders.OneHotEncoder)),model=sklearn.linear_model._base.LinearRegression)
Flow URL...................: None
Setup ID...................: None
Setup String...............: Python_3.13.3. Sklearn_1.6.1. NumPy_2.2.5. SciPy_1.15.2.
Dataset ID.................: 44958
Dataset URL.......