# OpenML-CTR23 Regression Benchmark

This notebook benchmarks a variety of regression models on the `OpenML-CTR23 - A curated tabular regression benchmarking suite` benchmark suite.


# Setup Environment

In [1]:
# Install required packages if not already present
# %pip install -Uq pip scikit-learn openml optuna pandas jupyter ipywidgets nbformat setuptools

In [2]:
# Baseline Models
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor

models = {
    "Dummy": DummyRegressor(strategy="mean"),
    "RF": RandomForestRegressor(),
    # "LR": LinearRegression(),
    # "MLP": MLPRegressor(solver="sgd"),
}

In [3]:
# Data Preprocessing
import numpy as np
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

numeric_transformer = Pipeline(
    [
        ("numeric_imputer", SimpleImputer(strategy="median")),
        ("numierc_scaler", StandardScaler()),
    ]
)
categorical_transformer = Pipeline(
    [
        ("categorical_imputer", SimpleImputer(strategy="most_frequent")),
        (
            "categorical_encoder",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
        ),
    ]
)
preprocessor = make_column_transformer(
    (numeric_transformer, make_column_selector(dtype_include=np.number)),  # type: ignore
    (categorical_transformer, make_column_selector(dtype_include=["object", "category"])),  # type: ignore
    sparse_threshold=0.0,
)

models = {
    model_name: Pipeline([("preprocessor", preprocessor), ("model", model)])
    for model_name, model in models.items()
}

## Hyperparameter Optimization (HPO) Objective Functions

The following Python cell defines functions that create 'objectives' for Optuna. Each objective function encapsulates the logic for:
1. Defining the hyperparameter search space for a specific regression model.
2. Evaluating a set of hyperparameters using cross-validation on the training data.

These objective functions are then used by Optuna to find the best hyperparameter combination for each model.

In [4]:
# Hyperparameter Optimization Space
hpo_spaces = {
    "Dummy": {},
    "RF": {
        "n_estimators": ("int", {"low": 5, "high": 100}),
        "max_depth": ("int", {"low": 1, "high": 20}),
    },
    # "LR": {},
    # "MLP": {
    #     "hidden_layer_sizes": ("int", {"low": 5, "high": 100}),
    #     "alpha": ("float", {"low": 1e-5, "high": 1e-1, "log": True}),
    #     "learning_rate_init": ("float", {"low": 1e-5, "high": 1e-1, "log": True}),
    #     "max_iter": ("int", {"low": 100, "high": 1000}),
    # }
}

models = {
    model_name: (models[model_name], hpo_space)
    for model_name, hpo_space in hpo_spaces.items()
}

In [5]:
# Hyperparameter Optimization Configuration
import optuna
from sklearn.base import BaseEstimator, RegressorMixin, clone
from sklearn.model_selection import cross_val_score

# Study configuration
hpo_config = {
    "n_trials": 10,
    "timeout": None,
    "n_jobs": -1,
    "show_progress_bar": True,
}

# Cross-validation parameters for HPO objective function
objective_cv_params = {"cv": 5, "scoring": "neg_mean_squared_error", "n_jobs": -1}


def create_objective(model, hpo_space, X, y):
    def objective(trial: optuna.Trial) -> float:
        param_type_map = {
            "int": trial.suggest_int,
            "float": trial.suggest_float,
            "categorical": trial.suggest_categorical,
        }
        params = {
            f"model__{p}": param_type_map[typ](f"model__{p}", **kw)
            for p, (typ, kw) in hpo_space.items()
        }
        model.set_params(**params)
        return cross_val_score(model, X, y, **objective_cv_params).mean()

    return objective


class TemplateRegressor(RegressorMixin, BaseEstimator):
    def __init__(self, model, hpo_space):
        self.model = model
        self.hpo_space = hpo_space

    def fit(self, X, y):
        model = clone(self.model)
        optuna.logging.set_verbosity(optuna.logging.WARNING)
        study = optuna.create_study()
        study.optimize(
            create_objective(model, self.hpo_space, X, y),
            **hpo_config,
        )
        model.set_params(**study.best_params)
        self.fitted_model_ = model.fit(X, y)
        return self

    def predict(self, X):
        return self.fitted_model_.predict(X)

In [6]:
# Define Benchmark Suite
import openml
from IPython.display import display

SUITE_ID = "8f0ea660163b436bbd4abd49665c7b1d"  # OpenML-CTR23
suite = openml.study.get_suite(SUITE_ID)
display(suite)
print(suite.description)

OpenML Benchmark Suite
ID..............: 353
Name............: OpenML-CTR23 - A curated tabular regression benchmarking suite
Status..........: active
Main Entity Type: task
Study URL.......: https://www.openml.org/s/353
# of Data.......: 35
# of Tasks......: 35
Creator.........: https://www.openml.org/u/30127
Upload Time.....: 2023-05-31 16:39:49

Inclusion Criteria:  

* There are between 500 and 100000 observations.
* There are less than 5000 features after one-hot encoding all categorical features.
* The dataset is not in a sparse format.
* The observations are i.i.d., which means that we exclude datasets that have time dependencies or require grouped data splits.
* The dataset comes with a source or reference that clearly describes it.
* We did not consider the dataset to be artificial, but allowed simulated datasets.
* The data is not a subset of a larger dataset.
* There is a numeric target variable with at least 5 different values.
* The dataset is not trivially solvable by a linear model, i.e. the training error of a linear model fitted to the whole data has an R2 of less than 1.
* The dataset does not have ethical concerns.
* The use of the dataset for benchmarking is not forbidden.

In addition to the datasets, the OpenML tasks also contain resampling splits, which were determined according to the following rule: If th

In [7]:
# Download tasks from the suite
N_TASKS = 2
tasks = openml.tasks.get_tasks(
    (suite.tasks or [])[:N_TASKS], download_data=True, download_qualities=True
)
display(tasks)

[OpenML Regression Task
 Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_REGRESSION
 Task ID..............: 361234
 Task URL.............: https://www.openml.org/t/361234
 Estimation Procedure.: crossvalidation
 Target Feature.......: rings,
 OpenML Regression Task
 Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_REGRESSION
 Task ID..............: 361235
 Task URL.............: https://www.openml.org/t/361235
 Estimation Procedure.: crossvalidation
 Target Feature.......: sound_pressure]

In [None]:
from sklearn.model_selection import cross_validate

cv_params = {
    "scoring": ["neg_mean_squared_error", "r2"],
    "n_jobs": -1,
    "return_train_score": True,
}

runs = {}

for task in tasks:
    model_results = {}
    splits = task.download_split().split
    X, y = task.get_X_and_y(dataset_format="dataframe")  # type: ignore

    for model_name, model in models.items():
        print(f"Running {model_name=} on {task.id=}...")
        run = cross_validate(
            TemplateRegressor(*model),
            X,
            y,
            cv=[s[0] for s in splits[0].values()],  # Using pre-defined OpenML splits
            **cv_params,
        )
        model_results[model_name] = run
    runs[task.id] = model_results

Running model_name='Dummy' on task.id=361234...


Best trial: 6. Best value: -10.4063: 100%|██████████| 10/10 [00:27<00:00,  2.71s/it]
Best trial: 0. Best value: -10.378: 100%|██████████| 10/10 [00:27<00:00,  2.73s/it]
Best trial: 9. Best value: -10.3641: 100%|██████████| 10/10 [00:27<00:00,  2.73s/it]
Best trial: 5. Best value: -10.342: 100%|██████████| 10/10 [00:27<00:00,  2.74s/it]
Best trial: 7. Best value: -10.5371: 100%|██████████| 10/10 [00:27<00:00,  2.75s/it]
Best trial: 5. Best value: -10.3806: 100%|██████████| 10/10 [00:27<00:00,  2.75s/it]
Best trial: 5. Best value: -10.1761: 100%|██████████| 10/10 [00:27<00:00,  2.74s/it]
Best trial: 3. Best value: -10.507: 100%|██████████| 10/10 [00:27<00:00,  2.74s/it]
Best trial: 7. Best value: -10.4749: 100%|██████████| 10/10 [00:27<00:00,  2.75s/it]
Best trial: 7. Best value: -10.3881: 100%|██████████| 10/10 [00:27<00:00,  2.78s/it]


Running model_name='RF' on task.id=361234...


  0%|          | 0/10 [00:00<?, ?it/s]

In [10]:
# Aggregate Results
import numpy as np
import pandas as pd

runs_df = pd.DataFrame(runs)
metrics = cv_params["scoring"]
metrics_df = {}
for metric in metrics:
    metrics_df[metric] = (
        runs_df.map(lambda r: np.mean(r[f"test_{metric}"])),
        runs_df.map(lambda r: np.mean(r[f"train_{metric}"])),
    )

# Use first metric for ranking
metrics_df["rank"] = tuple(df.rank(axis=0, ascending=False) for df in metrics_df[metrics[0]])
metrics_df["avg_rank"] = tuple(df.mean(axis=1) for df in metrics_df["rank"])

# Display all metrics and ranks
for metric, (test, train) in metrics_df.items():
    display(f"test_{metric}", test, f"train_{metric}", train)
# RF	-6.333056	-39.548363

'test_neg_mean_squared_error'

Unnamed: 0,361234,361235
Dummy,-10.39585,-47.632423
RF,-4.739057,-9.48314


'train_neg_mean_squared_error'

Unnamed: 0,361234,361235
Dummy,-10.392585,-47.556044
RF,-2.030896,-7.350572


'test_r2'

Unnamed: 0,361234,361235
Dummy,-0.001675,-0.007684
RF,0.542955,0.797799


'train_r2'

Unnamed: 0,361234,361235
Dummy,0.0,0.0
RF,0.804768,0.845665


'test_rank'

Unnamed: 0,361234,361235
Dummy,2.0,2.0
RF,1.0,1.0


'train_rank'

Unnamed: 0,361234,361235
Dummy,2.0,2.0
RF,1.0,1.0


'test_avg_rank'

Dummy    2.0
RF       1.0
dtype: float64

'train_avg_rank'

Dummy    2.0
RF       1.0
dtype: float64

In [None]:
from sklearn.ensemble import StackingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
import numpy as np
import random

SEED = 42
np.random.seed(SEED)
random.seed(SEED)

tid = suite.tasks[0]
task = openml.tasks.get_task(tid)
ds = task.get_dataset()

mlp = MLPRegressor(
    hidden_layer_sizes=(10,),
    max_iter=200,
)
tree_mlp = StackingRegressor(
    estimators=[("Tree", DecisionTreeRegressor())],
    final_estimator=mlp,
    passthrough=True,
)
forest_mlp = StackingRegressor(
    estimators=[(f"Tree {i}", DecisionTreeRegressor(random_state=i)) for i in range(5)],
    final_estimator=mlp,
    passthrough=True,
)

mlp_pipe = Pipeline([("preprocessor", preprocessor), ("model", mlp)])
tree_mlp_pipe = Pipeline([("preprocessor", preprocessor), ("model", tree_mlp)])
forest_mlp_pipe = Pipeline([("preprocessor", preprocessor), ("model", forest_mlp)])

# mlp_run = openml.runs.run_model_on_task(mlp_pipe, tsk, n_jobs=-1)
# tree_mlp_run = openml.runs.run_model_on_task(tree_mlp_pipe, tsk, n_jobs=-1)
forest_mlp_run = openml.runs.run_model_on_task(forest_mlp_pipe, task, n_jobs=-1)

In [None]:
import numpy as np
import random
SEED = 42
np.random.seed(SEED)
random.seed(SEED)
display(mlp_run, tree_mlp_run)

In [None]:
from sklearn.datasets import fetch_openml
import numpy as np
import random

SEED = 42
np.random.seed(SEED)
random.seed(SEED)

ds = fetch_openml(data_id=44956)
display(ds)
[openml.tasks.get_task(t).dataset_id for t in suite.tasks]