<a href="https://colab.research.google.com/github/spicecat/research/blob/main/regression_benchmark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# OpenML-CTR23 Regression Benchmark

This notebook benchmarks a variety of regression models on the OpenML-CTR23 suite (`OpenML-CTR23 - A curated tabular regression benchmarking suite`).


## Setup

In [1]:
# Install the notebook's dependencies into the kernel's environment (-q keeps pip quiet).
# NOTE(review): no versions are pinned, so a later run may resolve different releases.
%pip install -q pip black blackcellmagic scikit-learn openml optuna optuna-dashboard joblib
# Load the blackcellmagic IPython extension installed above.
%load_ext blackcellmagic

# Mount Google Drive at /content/drive; the last cell copies the Optuna journal
# file into /content/drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
SEED = 42  # random_state for the models and seed of the Optuna TPE sampler
SUITE_ID = "8f0ea660163b436bbd4abd49665c7b1d"  # passed to openml.study.get_suite (OpenML-CTR23)
N_TRIALS = 15  # Optuna trials per study (hpo_config["n_trials"])
OBJECTIVE_CV = 3  # `cv` of the inner cross_val_score that scores each trial
N_TASKS = 10  # number of tasks kept after sorting by NumberOfInstances
CV = 10  # maximum number of predefined task splits used for the outer evaluation

## Models

### Model Definition

In [3]:
# Control Models
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor

# Baseline ("control") regressors, keyed by the short name used in the rest of
# the notebook (HPO spaces, study names, result tables).
models = dict(
    Dummy=DummyRegressor(strategy="mean"),
    RF=RandomForestRegressor(random_state=SEED, max_depth=2),
    # LR=LinearRegression(),
    MLP=MLPRegressor(random_state=SEED, solver="adam"),
)

In [4]:
# Test Models
from sklearn.base import BaseEstimator, RegressorMixin, clone
from sklearn.ensemble import StackingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor


class ForestMLPRegressor(RegressorMixin, BaseEstimator):
    """Stacking ensemble: several decision trees feeding a final regressor.

    ``fit`` builds a ``StackingRegressor`` made of ``n_trees`` decision trees
    (each limited to ``trees_max_depth``) and a clone of ``final_estimator``
    whose ``hidden_layer_sizes`` is set to
    ``final_estimator__hidden_layer_sizes``. The stack is created with
    ``passthrough=True``.

    The mixin is listed before ``BaseEstimator``, which is the base-class
    order scikit-learn recommends for custom estimators.

    Parameters
    ----------
    n_trees : int, default=5
        Number of decision trees in the first layer.
    trees_max_depth : int, default=2
        ``max_depth`` given to every tree.
    final_estimator : estimator, default=MLPRegressor()
        Template for the final estimator; it is cloned on every fit and must
        accept a ``hidden_layer_sizes`` parameter. The default instance is
        created once, when the class is defined, and is shared by every
        instance that does not pass its own.
    final_estimator__hidden_layer_sizes : default=100
        Value written to the cloned final estimator's ``hidden_layer_sizes``.
    random_state : int, default=0
        Tree ``i`` is seeded with ``random_state + i``.
    """

    # Constructor parameter whose name collides with scikit-learn's
    # ``<component>__<parameter>`` routing syntax (see ``set_params``).
    _HIDDEN_SIZES_PARAM = "final_estimator__hidden_layer_sizes"

    def __init__(
        self,
        n_trees=5,
        trees_max_depth=2,
        final_estimator=MLPRegressor(),
        final_estimator__hidden_layer_sizes=100,
        random_state=0,
    ):
        self.n_trees = n_trees
        self.trees_max_depth = trees_max_depth
        self.final_estimator = final_estimator
        self.final_estimator__hidden_layer_sizes = final_estimator__hidden_layer_sizes
        self.random_state = random_state

    def set_params(self, **params):
        """Set parameters, keeping ``final_estimator__hidden_layer_sizes`` local.

        ``BaseEstimator.set_params`` splits keys on ``"__"`` and would forward
        ``final_estimator__hidden_layer_sizes`` to the nested
        ``final_estimator`` object. ``_make_stacking`` reads the attribute of
        the same name on *this* object, so the forwarded value would be
        overwritten and the setting silently lost. The key is therefore
        stored on this estimator directly; every other key is handled by the
        inherited implementation.

        Returns
        -------
        self
        """
        remaining = dict(params)
        if self._HIDDEN_SIZES_PARAM in remaining:
            setattr(
                self,
                self._HIDDEN_SIZES_PARAM,
                remaining.pop(self._HIDDEN_SIZES_PARAM),
            )
        super().set_params(**remaining)
        return self

    def _make_stacking(self):
        """Build an unfitted ``StackingRegressor`` from the current parameters."""
        trees = [
            (
                f"tree_{i}",
                DecisionTreeRegressor(
                    max_depth=self.trees_max_depth,
                    # A distinct seed per tree, derived from the base seed.
                    random_state=self.random_state + i,
                ),
            )
            for i in range(self.n_trees)
        ]
        # Work on a clone so the template estimator is never modified here.
        final_estimator = clone(self.final_estimator).set_params(
            hidden_layer_sizes=self.final_estimator__hidden_layer_sizes,
        )
        return StackingRegressor(
            estimators=trees,
            final_estimator=final_estimator,
            passthrough=True,
        )

    def fit(self, X, y):
        """Build a fresh stack, fit it on ``(X, y)`` and return ``self``."""
        self.model_ = self._make_stacking()
        self.model_.fit(X, y)
        return self

    def predict(self, X):
        """Return the fitted stack's predictions for ``X`` (requires ``fit``)."""
        return self.model_.predict(X)


# Candidate ("test") models. "TreeMLP" stacks one decision tree under the MLP
# baseline defined in `models`.
test_models = dict(
    TreeMLP=StackingRegressor(
        estimators=[("tree", DecisionTreeRegressor(random_state=SEED))],
        passthrough=True,
        final_estimator=models["MLP"],
    ),
    # ForestMLP=ForestMLPRegressor(final_estimator=models["MLP"], random_state=SEED),
)

# Add the candidates to the registry next to the baselines.
models |= test_models

### Data Preprocessing


In [5]:
# Data Preprocessing
import numpy as np
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric columns: median imputation followed by standard scaling.
numeric_transformer = Pipeline(
    steps=[
        ("numeric_imputer", SimpleImputer(strategy="median")),
        ("numeric_scaler", StandardScaler()),
    ]
)

# Categorical columns: most-frequent imputation followed by dense one-hot
# encoding; unknown categories are ignored rather than raising.
categorical_transformer = Pipeline(
    steps=[
        ("categorical_imputer", SimpleImputer(strategy="most_frequent")),
        (
            "categorical_encoder",
            OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
        ),
    ]
)

# Route columns to a transformer by dtype; sparse_threshold=0.0 is passed so
# the combined output is dense.
preprocessor = make_column_transformer(
    (numeric_transformer, make_column_selector(dtype_include=np.number)),  # type: ignore
    (categorical_transformer, make_column_selector(dtype_include=["object", "category"])),  # type: ignore
    sparse_threshold=0.0,
)

# Put the preprocessing step in front of every estimator. The steps are named
# "preprocessor" and "model"; the HPO code relies on the "model" prefix.
# NOTE: this rebinds `models`, so re-running only this cell wraps the
# pipelines a second time.
models = dict(
    (name, Pipeline(steps=[("preprocessor", preprocessor), ("model", estimator)]))
    for name, estimator in models.items()
)

### Hyperparameter Optimization

In [6]:
# Hyperparameter Optimization Space
# Search space per model name. Each inner entry maps a parameter name (relative
# to the pipeline's "model" step) to `(kind, kwargs)`: `kind` selects the
# `trial.suggest_*` method in `create_objective` and `kwargs` is forwarded to it.
# "LR" and "ForestMLP" have entries although those models are currently
# commented out of the model registry.
hpo_spaces = {
    "Dummy": {},  # nothing to tune
    "RF": {
        "n_estimators": ("int", {"low": 5, "high": 20}),
        "max_depth": ("int", {"low": 1, "high": 3}),
    },
    "LR": {},  # nothing to tune
    "MLP": {
        "hidden_layer_sizes": ("int", {"low": 5, "high": 20}),
    },
    "TreeMLP": {
        # "tree" is the name of the single base estimator in the TreeMLP stack.
        "tree__max_depth": ("int", {"low": 1, "high": 3}),
        "final_estimator__hidden_layer_sizes": ("int", {"low": 5, "high": 20}),
    },
    "ForestMLP": {
        "n_trees": ("int", {"low": 2, "high": 3}),
        "trees_max_depth": ("int", {"low": 1, "high": 3}),
        "final_estimator__hidden_layer_sizes": ("int", {"low": 5, "high": 20}),
    },
}

# Pair every pipeline with its search space: name -> (pipeline, hpo_space).
# A model without an entry in `hpo_spaces` raises KeyError here.
models = dict(
    (name, (pipeline, hpo_spaces[name])) for name, pipeline in models.items()
)

In [7]:
# Hyperparameter Optimization Configuration
import optuna
from sklearn.base import BaseEstimator, RegressorMixin, clone
from sklearn.model_selection import cross_val_score

# Keyword arguments forwarded to `study.optimize` for every study
hpo_config = dict(n_trials=N_TRIALS, timeout=None, show_progress_bar=True)

# Keyword arguments of the cross-validation that scores a single trial
objective_cv_params = dict(cv=OBJECTIVE_CV, scoring="neg_mean_squared_error")


def create_objective(model, hpo_space, X, y):
    """Return an Optuna objective that tunes ``model`` on ``(X, y)``.

    Parameters
    ----------
    model : estimator
        Estimator whose parameters are addressed as ``model__<name>``. It is
        modified in place by every trial.
    hpo_space : dict
        Maps a parameter name to ``(kind, kwargs)`` where ``kind`` is
        ``"int"``, ``"float"`` or ``"categorical"`` and ``kwargs`` is passed
        to the matching ``trial.suggest_*`` call.
    X, y
        Data handed to ``cross_val_score``.

    Returns
    -------
    callable
        ``objective(trial)`` returning the mean cross-validated score
        obtained with ``objective_cv_params``.
    """

    def objective(trial: optuna.Trial) -> float:
        suggesters = {
            "int": trial.suggest_int,
            "float": trial.suggest_float,
            "categorical": trial.suggest_categorical,
        }
        sampled = {}
        for name, (kind, kwargs) in hpo_space.items():
            # The same qualified name serves as the Optuna parameter name and
            # as the key understood by ``set_params``.
            qualified = f"model__{name}"
            sampled[qualified] = suggesters[kind](qualified, **kwargs)
        model.set_params(**sampled)
        fold_scores = cross_val_score(model, X, y, **objective_cv_params)
        return fold_scores.mean()

    return objective

In [8]:
# Journal-file Optuna storage used by every study created in
# `TemplateRegressor.fit` and served by the dashboard cell. The log file path is
# relative, so it is created in the current working directory.
storage = optuna.storages.JournalStorage(
    optuna.storages.journal.JournalFileBackend("optuna_journal_storage.log")  # type: ignore
)


class TemplateRegressor(RegressorMixin, BaseEstimator):
    """Regressor that tunes a wrapped model with Optuna every time it is fitted.

    Each ``fit`` call clones ``model``, runs an Optuna study on the training
    data it receives, applies the study's best parameters to the clone and
    fits it. Successive fits under the same ``study_name`` use studies named
    ``<study_name>_0``, ``<study_name>_1``, ...

    Parameters
    ----------
    study_name : str
        Prefix of the Optuna study names.
    model : estimator
        Estimator to tune; its tunable parameters are addressed as
        ``model__<name>``.
    hpo_space : dict
        Search space in the format expected by ``create_objective``.
    """

    # study_name -> number of fits started so far (per Python process).
    _split_counters = {}

    def __init__(self, study_name, model, hpo_space):
        self.model = model
        self.hpo_space = hpo_space
        self.study_name = study_name
        # Register the name so the first fit is numbered 0.
        TemplateRegressor._split_counters.setdefault(study_name, 0)

    def fit(self, X, y):
        """Tune the wrapped model on ``(X, y)``, refit it and return ``self``."""
        model = clone(self.model)
        optuna.logging.set_verbosity(optuna.logging.WARNING)
        sampler = optuna.samplers.TPESampler(seed=SEED)

        # Tolerate a missing counter entry: an instance can reach ``fit`` in a
        # process where ``__init__`` never ran for this name (for example
        # after being unpickled), which previously raised KeyError.
        counters = TemplateRegressor._split_counters
        split_index = counters.get(self.study_name, 0)
        counters[self.study_name] = split_index + 1
        study_name = f"{self.study_name}_{split_index}"

        study = optuna.create_study(
            storage=storage,
            sampler=sampler,
            study_name=study_name,
            # The objective is a mean of neg_mean_squared_error scores.
            direction="maximize",
            # Reuse a study of the same name if the storage already has one.
            load_if_exists=True,
        )
        study.optimize(
            create_objective(model, self.hpo_space, X, y),
            **hpo_config,
        )
        # The parameter names were suggested as "model__<name>", so they can
        # be passed to ``set_params`` unchanged.
        model.set_params(**study.best_params)
        self.fitted_model_ = model.fit(X, y)
        return self

    def predict(self, X):
        """Predict with the tuned, fitted model (requires ``fit``)."""
        return self.fitted_model_.predict(X)

## Benchmark


### Define Benchmark Suite

In [9]:
# Define Benchmark Suite
import openml
from IPython.display import display

# SUITE_ID = "8f0ea660163b436bbd4abd49665c7b1d"  # OpenML-CTR23
# Fetch the benchmark suite identified by SUITE_ID from OpenML, then show its
# summary and its description text.
suite = openml.study.get_suite(SUITE_ID)
display(suite)
print(suite.description)

OpenML Benchmark Suite
ID..............: 353
Name............: OpenML-CTR23 - A curated tabular regression benchmarking suite
Status..........: active
Main Entity Type: task
Study URL.......: https://www.openml.org/s/353
# of Data.......: 35
# of Tasks......: 35
Creator.........: https://www.openml.org/u/30127
Upload Time.....: 2023-05-31 16:39:49

Inclusion Criteria:  

* There are between 500 and 100000 observations.
* There are less than 5000 features after one-hot encoding all categorical features.
* The dataset is not in a sparse format.
* The observations are i.i.d., which means that we exclude datasets that have time dependencies or require grouped data splits.
* The dataset comes with a source or reference that clearly describes it.
* We did not consider the dataset to be artificial, but allowed simulated datasets.
* The data is not a subset of a larger dataset.
* There is a numeric target variable with at least 5 different values.
* The dataset is not trivially solvable by a linear model, i.e. the training error of a linear model fitted to the whole data has an R2 of less than 1.
* The dataset does not have ethical concerns.
* The use of the dataset for benchmarking is not forbidden.

In addition to the datasets, the OpenML tasks also contain resampling splits, which were determined according to the following rule: If th

In [10]:
# Download tasks from the suite
# N_TASKS = 2
# Download the suite's tasks, order them by the dataset's "NumberOfInstances"
# quality (0 when the quality is absent) and keep the first N_TASKS.
tasks = sorted(
    openml.tasks.get_tasks(
        (suite.tasks or []), download_data=True, download_qualities=True
    ),
    key=lambda task: task.get_dataset().qualities.get("NumberOfInstances", 0),  # type: ignore
)[:N_TASKS]
display(tasks)

[OpenML Regression Task
 Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_REGRESSION
 Task ID..............: 361618
 Task URL.............: https://www.openml.org/t/361618
 Estimation Procedure.: crossvalidation
 Target Feature.......: area,
 OpenML Regression Task
 Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_REGRESSION
 Task ID..............: 361619
 Task URL.............: https://www.openml.org/t/361619
 Estimation Procedure.: crossvalidation
 Target Feature.......: G3,
 OpenML Regression Task
 Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_REGRESSION
 Task ID..............: 361617
 Task URL.............: https://www.openml.org/t/361617
 Estimation Procedure.: crossvalidation
 Target Feature.......: heating_load,
 OpenML Regression Task
 Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_REGRESSION
 Task ID..............: 361622
 Task URL.............: https://www.openml.org/t/361622
 Estimation Proc

### Benchmark Execution


In [11]:
# Optuna Dashboard
from google.colab import output
from optuna_dashboard import run_server
import threading

# Serve the Optuna dashboard for `storage` from a background thread on port
# 8787 and embed that port in the notebook output.
dashboard_thread = threading.Thread(
    target=run_server, args=(storage,), kwargs={"port": 8787}
)
dashboard_thread.start()
output.serve_kernel_port_as_iframe(8787, path="/dashboard")

<IPython.core.display.Javascript object>

Bottle v0.13.3 server starting up (using WSGIRefServer())...
Listening on http://localhost:8787/
Hit Ctrl-C to quit.



In [12]:
# Run Model on Task
import warnings

import pandas as pd
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import cross_validate

# Keyword arguments of the outer cross_validate call; the first scoring entry
# is the one used for ranking in the aggregation cell.
cv_params = dict(
    scoring=["r2", "neg_mean_squared_error"],
    return_train_score=True,
)

def run_model_on_task(model_name, target_name, model, X, y, cv):
    """Cross-validate one tuned model on one task.

    Parameters
    ----------
    model_name, target_name : str
        Identify the job; together they form the Optuna study-name prefix
        ``"<target_name>_<model_name>"``.
    model : tuple
        Unpacked into the remaining ``TemplateRegressor`` arguments
        (the estimator and its search space).
    X, y
        Task data.
    cv
        Passed to ``cross_validate`` as ``cv``.

    Returns
    -------
    tuple
        ``((target_name, model_name), means)`` where ``means`` holds the
        column-wise mean of the ``cross_validate`` results.
    """
    print(f"Running {model_name=}")
    # Silence convergence warnings raised by the estimators.
    warnings.filterwarnings("ignore", category=ConvergenceWarning)

    study_prefix = f"{target_name}_{model_name}"
    tuned_estimator = TemplateRegressor(study_prefix, *model)
    fold_results = cross_validate(tuned_estimator, X, y, cv=cv, **cv_params)
    fold_means = pd.DataFrame(fold_results).mean(axis=0)
    return (target_name, model_name), fold_means

In [None]:
# Run Tasks
from joblib import Parallel, delayed
from sklearn.model_selection import cross_validate

# Build one job per (task, model) pair; each job carries the task's data and splits.
jobs = []
for task in tasks:
    target_name = task.target_name  # type: ignore
    splits = task.download_split().split
    X, y = task.get_X_and_y(dataset_format="dataframe")  # type: ignore
    # Entry 0 of every item under splits[0], truncated to at most CV items and
    # later passed as `cv` to cross_validate. Presumably these are the task's
    # predefined (train, test) index pairs for repeat 0 — TODO confirm.
    cv = [s[0] for s in splits[0].values()][:CV]
    for model_name, model in models.items():
        jobs.append((model_name, target_name, model, X, y, cv))  # type: ignore

# Run every job through joblib; n_jobs=-1 asks joblib to use all available CPUs.
n_jobs = -1
results = Parallel(n_jobs)(delayed(run_model_on_task)(*job) for job in jobs)  # type: ignore

# Regroup the results as runs[target_name][model_name].
# NOTE(review): entries are keyed by target name, so two tasks sharing a target
# name would overwrite each other — confirm target names are unique.
runs = {task.target_name: {} for task in tasks}  # type: ignore
for (target_name, model_name), run in results:
    runs[target_name][model_name] = run

127.0.0.1 - - [02/Jun/2025 04:03:00] "GET /dashboard HTTP/1.1" 200 4145
127.0.0.1 - - [02/Jun/2025 04:03:01] "GET /static/bundle.js HTTP/1.1" 200 4140872
127.0.0.1 - - [02/Jun/2025 04:03:03] "GET /api/studies HTTP/1.1" 200 23
127.0.0.1 - - [02/Jun/2025 04:08:07] "GET /api/studies HTTP/1.1" 200 7695
127.0.0.1 - - [02/Jun/2025 04:39:08] "GET /api/studies HTTP/1.1" 200 38198


## Aggregate Results
Optuna Dashboard: [https://optuna.github.io/optuna-dashboard/](https://optuna.github.io/optuna-dashboard/)

In [None]:
# Aggregate Results
import pandas as pd

# One column per task (target name), one row per model; every cell holds the
# Series of mean cross-validation results for that (model, task) pair.
runs_df = pd.DataFrame(runs)
metrics = cv_params["scoring"]

# metric name -> (test-score frame, train-score frame)
metrics_df: dict[str, tuple[pd.DataFrame, pd.DataFrame]] = {}
for metric in metrics:
    metrics_df[metric] = (
        runs_df.map(lambda r: r[f"test_{metric}"]),
        runs_df.map(lambda r: r[f"train_{metric}"]),
    )


def _rank_models(scores_df: pd.DataFrame) -> pd.DataFrame:
    """Rank the models within each task and summarise the ranks per model.

    Ranks are computed per column with the highest score ranked 1. The
    ``avg_rank`` and ``std_rank`` columns are both derived from the per-task
    ranks only, so the standard deviation is not distorted by the average
    column (which the previous in-place version included in its ``std``).
    """
    ranks = scores_df.rank(axis=0, ascending=False)
    return ranks.assign(
        avg_rank=ranks.mean(axis=1),
        std_rank=ranks.std(axis=1),
    )


# Use first metric for ranking
test_ranks_df = _rank_models(metrics_df[metrics[0]][0])
train_ranks_df = _rank_models(metrics_df[metrics[0]][1])

metrics_df["rank"] = (test_ranks_df, train_ranks_df)

# Display all metrics and ranks
for metric, (test, train) in metrics_df.items():
    display(f"test_{metric}", test, f"train_{metric}", train)

In [None]:
# Copy the Optuna journal file into the mounted Google Drive folder.
!cp optuna_journal_storage.log /content/drive/MyDrive/