# OpenML-CTR23 Regression Benchmark

This notebook benchmarks a variety of regression models on the `OpenML-CTR23 - A curated tabular regression benchmarking suite` benchmark suite.


## Setup

In [1]:
%pip install -Uq pip black blackcellmagic scikit-learn openml optuna pandas joblib jupyter ipywidgets nbformat setuptools
%load_ext blackcellmagic

Note: you may need to restart the kernel to use updated packages.


## Model Definitions

In [2]:
# Control Models
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor

SEED = 42

models = {
    "Dummy": DummyRegressor(strategy="mean"),
    "RF": RandomForestRegressor(max_depth=2, random_state=SEED),
    "LR": LinearRegression(),
    "MLP": MLPRegressor(solver="sgd", random_state=SEED),
}

In [3]:
# Test Models
from sklearn.base import BaseEstimator, RegressorMixin, clone
from sklearn.ensemble import StackingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor


class ForestMLPRegressor(BaseEstimator, RegressorMixin):
    def __init__(
        self,
        n_trees=5,
        trees_max_depth=3,
        final_estimator=MLPRegressor(),
        final_estimator__hidden_layer_sizes=100,
        random_state=0,
    ):
        self.n_trees = n_trees
        self.trees_max_depth = trees_max_depth
        self.final_estimator = final_estimator
        self.final_estimator__hidden_layer_sizes = final_estimator__hidden_layer_sizes
        self.random_state = random_state

    def _make_stacking(self):
        return StackingRegressor(
            estimators=[
                (
                    f"tree_{i}",
                    DecisionTreeRegressor(
                        max_depth=self.trees_max_depth,
                        random_state=self.random_state + i,
                    ),
                )
                for i in range(self.n_trees)
            ],
            final_estimator=clone(self.final_estimator).set_params(
                hidden_layer_sizes=self.final_estimator__hidden_layer_sizes,
            ),
            passthrough=True,
        )

    def fit(self, X, y):
        self.model_ = self._make_stacking()
        self.model_.fit(X, y)
        return self

    def predict(self, X):
        return self.model_.predict(X)


test_models = {
    "TreeMLP": StackingRegressor(
        estimators=[("tree", DecisionTreeRegressor(random_state=SEED))],
        final_estimator=models["MLP"],
        passthrough=True,
    ),
    "ForestMLP": ForestMLPRegressor(final_estimator=models["MLP"], random_state=SEED),
}

models.update(test_models)

### Data Preprocessing


In [4]:
# Data Preprocessing
import numpy as np
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

numeric_transformer = Pipeline(
    [
        ("numeric_imputer", SimpleImputer(strategy="median")),
        ("numeric_scaler", StandardScaler()),
    ]
)
categorical_transformer = Pipeline(
    [
        ("categorical_imputer", SimpleImputer(strategy="most_frequent")),
        (
            "categorical_encoder",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
        ),
    ]
)
preprocessor = make_column_transformer(
    (numeric_transformer, make_column_selector(dtype_include=np.number)),  # type: ignore
    (categorical_transformer, make_column_selector(dtype_include=["object", "category"])),  # type: ignore
    sparse_threshold=0.0,
)

models = {
    model_name: Pipeline([("preprocessor", preprocessor), ("model", model)])
    for model_name, model in models.items()
}

### Hyperparameter Optimization Configuration and Space


In [5]:
# Hyperparameter Optimization Space
hpo_spaces = {
    "Dummy": {},
    "RF": {
        "n_estimators": ("int", {"low": 5, "high": 100}),
        "max_depth": ("int", {"low": 1, "high": 20}),
    },
    "LR": {},
    "MLP": {
        "hidden_layer_sizes": ("int", {"low": 5, "high": 100}),
    },
    "TreeMLP": {
        "tree__max_depth": ("int", {"low": 1, "high": 10}),
        "final_estimator__hidden_layer_sizes": ("int", {"low": 5, "high": 100}),
    },
    "ForestMLP": {
        "n_trees": ("int", {"low": 2, "high": 10}),
        "trees_max_depth": ("int", {"low": 1, "high": 10}),
        "final_estimator__hidden_layer_sizes": ("int", {"low": 5, "high": 100}),
    },
}

models = {
    model_name: (model, hpo_spaces[model_name]) for model_name, model in models.items()
}

In [6]:
# Hyperparameter Optimization Configuration
import optuna
from sklearn.base import BaseEstimator, RegressorMixin, clone
from sklearn.model_selection import cross_val_score

# Study configuration
hpo_config = {
    "n_trials": 10,
    "timeout": None,
    "show_progress_bar": True,
}

# Cross-validation parameters for HPO objective function
objective_cv_params = {"cv": 5, "scoring": "neg_mean_squared_error"}


def create_objective(model, hpo_space, X, y):
    def objective(trial: optuna.Trial) -> float:
        param_type_map = {
            "int": trial.suggest_int,
            "float": trial.suggest_float,
            "categorical": trial.suggest_categorical,
        }
        params = {
            f"model__{p}": param_type_map[typ](f"model__{p}", **kw)
            for p, (typ, kw) in hpo_space.items()
        }
        model.set_params(**params)
        return cross_val_score(model, X, y, **objective_cv_params).mean()

    return objective


class TemplateRegressor(RegressorMixin, BaseEstimator):
    def __init__(self, model, hpo_space):
        self.model = model
        self.hpo_space = hpo_space

    def fit(self, X, y):
        model = clone(self.model)
        optuna.logging.set_verbosity(optuna.logging.WARNING)
        study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=SEED))
        study.optimize(
            create_objective(model, self.hpo_space, X, y),
            **hpo_config,
        )
        model.set_params(**study.best_params)
        self.fitted_model_ = model.fit(X, y)
        return self

    def predict(self, X):
        return self.fitted_model_.predict(X)

## Define Benchmark Suite and Tasks


In [7]:
# Define Benchmark Suite
import openml
from IPython.display import display

SUITE_ID = "8f0ea660163b436bbd4abd49665c7b1d"  # OpenML-CTR23
suite = openml.study.get_suite(SUITE_ID)
display(suite)
print(suite.description)

OpenML Benchmark Suite
ID..............: 353
Name............: OpenML-CTR23 - A curated tabular regression benchmarking suite
Status..........: active
Main Entity Type: task
Study URL.......: https://www.openml.org/s/353
# of Data.......: 35
# of Tasks......: 35
Creator.........: https://www.openml.org/u/30127
Upload Time.....: 2023-05-31 16:39:49

Inclusion Criteria:  

* There are between 500 and 100000 observations.
* There are less than 5000 features after one-hot encoding all categorical features.
* The dataset is not in a sparse format.
* The observations are i.i.d., which means that we exclude datasets that have time dependencies or require grouped data splits.
* The dataset comes with a source or reference that clearly describes it.
* We did not consider the dataset to be artificial, but allowed simulated datasets.
* The data is not a subset of a larger dataset.
* There is a numeric target variable with at least 5 different values.
* The dataset is not trivially solvable by a linear model, i.e. the training error of a linear model fitted to the whole data has an R2 of less than 1.
* The dataset does not have ethical concerns.
* The use of the dataset for benchmarking is not forbidden.

In addition to the datasets, the OpenML tasks also contain resampling splits, which were determined according to the following rule: If th

In [8]:
# Download tasks from the suite
N_TASKS = 1
# N_TASKS = len(suite.tasks or [])
tasks = openml.tasks.get_tasks(
    (suite.tasks or [])[:N_TASKS], download_data=True, download_qualities=True
)
display(tasks)

[OpenML Regression Task
 Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_REGRESSION
 Task ID..............: 361234
 Task URL.............: https://www.openml.org/t/361234
 Estimation Procedure.: crossvalidation
 Target Feature.......: rings]

In [9]:
import warnings

import pandas as pd
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import cross_validate

cv_params = {
    "scoring": ["neg_mean_squared_error", "r2"],
    "return_train_score": True,
}


def run_model_on_task(model_name, model, X, y, cv):
    print(f"Running {model_name=}")
    warnings.filterwarnings("ignore", category=ConvergenceWarning)
    run = cross_validate(
        TemplateRegressor(*model),
        X,
        y,
        cv=cv,
        **cv_params,
    )
    return model_name, pd.DataFrame(run).mean(axis=0)

### Benchmarking Execution


In [10]:
from joblib import Parallel, delayed

# selected_models = models
selected_models = {m: models[m] for m in ["Dummy", "RF"]}

runs = {}
for task in tasks:
    splits = task.download_split().split
    X, y = task.get_X_and_y(dataset_format="dataframe")  # type: ignore
    cv = [s[0] for s in splits[0].values()][:3]
    results = Parallel(n_jobs=-1)(
        delayed(run_model_on_task)(model_name, model, X, y, cv)
        for model_name, model in selected_models.items()
    )
    runs[task.id] = dict(results)  # type: ignore

Running model_name='Dummy'
Running model_name='RF'


  0%|          | 0/10 [00:00<?, ?it/s]  0%|          | 0/10 [00:00<?, ?it/s]Best trial: 0. Best value: -10.4063:   0%|          | 0/10 [00:00<?, ?it/s]Best trial: 0. Best value: -10.4063:  10%|█         | 1/10 [00:00<00:01,  8.11it/s]

Best trial: 0. Best value: -10.4063:  10%|█         | 1/10 [00:00<00:01,  8.11it/s]Best trial: 0. Best value: -10.4063:  20%|██        | 2/10 [00:00<00:00,  8.46it/s]Best trial: 0. Best value: -10.4063:  20%|██        | 2/10 [00:00<00:00,  8.46it/s]Best trial: 0. Best value: -10.4063:  30%|███       | 3/10 [00:00<00:00,  7.81it/s]

Best trial: 0. Best value: -10.4063:  30%|███       | 3/10 [00:00<00:00,  7.81it/s]Best trial: 0. Best value: -10.4063:  40%|████      | 4/10 [00:00<00:00,  8.15it/s]Best trial: 0. Best value: -10.4063:  40%|████      | 4/10 [00:00<00:00,  8.15it/s]Best trial: 0. Best value: -10.4063:  50%|█████     | 5/10 [00:00<00:00,  8.38it/s]

Best trial: 0. Best value: -10.4063:  50%|█████     | 5/10 [00:00<00:00,  8.38it/s]Best trial: 0. Best value: -10.4063:  60%|██████    | 6/10 [00:00<00:00,  6.35it/s]Best trial: 0. Best value: -10.4063:  60%|██████    | 6/10 [00:00<00:00,  6.35it/s]Best trial: 0. Best value: -10.4063:  70%|███████   | 7/10 [00:00<00:00,  7.00it/s]

Best trial: 0. Best value: -10.4063:  70%|███████   | 7/10 [00:01<00:00,  7.00it/s]Best trial: 0. Best value: -10.4063:  80%|████████  | 8/10 [00:01<00:00,  7.50it/s]Best trial: 0. Best value: -10.4063:  80%|████████  | 8/10 [00:01<00:00,  7.50it/s]Best trial: 0. Best value: -10.4063:  90%|█████████ | 9/10 [00:01<00:00,  7.87it/s]

Best trial: 0. Best value: -10.4063:  90%|█████████ | 9/10 [00:01<00:00,  7.87it/s]Best trial: 0. Best value: -10.4063: 100%|██████████| 10/10 [00:01<00:00,  8.16it/s]Best trial: 0. Best value: -10.4063: 100%|██████████| 10/10 [00:01<00:00,  7.76it/s]
  0%|          | 0/10 [00:00<?, ?it/s]Best trial: 0. Best value: -10.378:   0%|          | 0/10 [00:00<?, ?it/s]Best trial: 0. Best value: -10.378:  10%|█         | 1/10 [00:00<00:01,  8.85it/s]

Best trial: 0. Best value: -10.378:  10%|█         | 1/10 [00:00<00:01,  8.85it/s]Best trial: 0. Best value: -10.378:  20%|██        | 2/10 [00:00<00:00,  8.81it/s]Best trial: 0. Best value: -10.378:  20%|██        | 2/10 [00:00<00:00,  8.81it/s]Best trial: 0. Best value: -10.378:  30%|███       | 3/10 [00:00<00:00,  8.83it/s]

Best trial: 0. Best value: -10.378:  30%|███       | 3/10 [00:00<00:00,  8.83it/s]Best trial: 0. Best value: -10.378:  40%|████      | 4/10 [00:00<00:00,  8.83it/s]Best trial: 0. Best value: -10.378:  40%|████      | 4/10 [00:00<00:00,  8.83it/s]Best trial: 0. Best value: -10.378:  50%|█████     | 5/10 [00:00<00:00,  8.80it/s]

Best trial: 0. Best value: -10.378:  50%|█████     | 5/10 [00:00<00:00,  8.80it/s]Best trial: 0. Best value: -10.378:  60%|██████    | 6/10 [00:00<00:00,  8.83it/s]Best trial: 0. Best value: -10.378:  60%|██████    | 6/10 [00:00<00:00,  8.83it/s]Best trial: 0. Best value: -10.378:  70%|███████   | 7/10 [00:00<00:00,  8.86it/s]

Best trial: 0. Best value: -10.378:  70%|███████   | 7/10 [00:00<00:00,  8.86it/s]Best trial: 0. Best value: -10.378:  80%|████████  | 8/10 [00:00<00:00,  8.85it/s]Best trial: 0. Best value: -10.378:  80%|████████  | 8/10 [00:01<00:00,  8.85it/s]Best trial: 0. Best value: -10.378:  90%|█████████ | 9/10 [00:01<00:00,  8.86it/s]

Best trial: 0. Best value: -10.378:  90%|█████████ | 9/10 [00:01<00:00,  8.86it/s]Best trial: 0. Best value: -10.378: 100%|██████████| 10/10 [00:01<00:00,  8.71it/s]Best trial: 0. Best value: -10.378: 100%|██████████| 10/10 [00:01<00:00,  8.79it/s]
  0%|          | 0/10 [00:00<?, ?it/s]Best trial: 0. Best value: -10.3806:   0%|          | 0/10 [00:00<?, ?it/s]Best trial: 0. Best value: -10.3806:  10%|█         | 1/10 [00:00<00:01,  8.76it/s]

Best trial: 0. Best value: -10.3806:  10%|█         | 1/10 [00:00<00:01,  8.76it/s]Best trial: 0. Best value: -10.3806:  20%|██        | 2/10 [00:00<00:00,  8.83it/s]Best trial: 0. Best value: -10.3806:  20%|██        | 2/10 [00:00<00:00,  8.83it/s]Best trial: 0. Best value: -10.3806:  30%|███       | 3/10 [00:00<00:00,  8.87it/s]

Best trial: 0. Best value: -10.3806:  30%|███       | 3/10 [00:00<00:00,  8.87it/s]Best trial: 0. Best value: -10.3806:  40%|████      | 4/10 [00:00<00:00,  8.86it/s]Best trial: 0. Best value: -10.3806:  40%|████      | 4/10 [00:00<00:00,  8.86it/s]Best trial: 0. Best value: -10.3806:  50%|█████     | 5/10 [00:00<00:00,  8.85it/s]

Best trial: 0. Best value: -10.3806:  50%|█████     | 5/10 [00:00<00:00,  8.85it/s]Best trial: 0. Best value: -10.3806:  60%|██████    | 6/10 [00:00<00:00,  8.86it/s]Best trial: 0. Best value: -10.3806:  60%|██████    | 6/10 [00:00<00:00,  8.86it/s]Best trial: 0. Best value: -10.3806:  70%|███████   | 7/10 [00:00<00:00,  8.86it/s]

Best trial: 0. Best value: -10.3806:  70%|███████   | 7/10 [00:00<00:00,  8.86it/s]Best trial: 0. Best value: -10.3806:  80%|████████  | 8/10 [00:00<00:00,  8.87it/s]Best trial: 0. Best value: -10.3806:  80%|████████  | 8/10 [00:01<00:00,  8.87it/s]Best trial: 0. Best value: -10.3806:  90%|█████████ | 9/10 [00:01<00:00,  8.87it/s]Best trial: 0. Best value: -4.95303:   0%|          | 0/10 [00:03<?, ?it/s]Best trial: 0. Best value: -4.95303:  10%|█         | 1/10 [00:03<00:31,  3.55s/it]

Best trial: 0. Best value: -10.3806:  90%|█████████ | 9/10 [00:01<00:00,  8.87it/s]Best trial: 0. Best value: -10.3806: 100%|██████████| 10/10 [00:01<00:00,  8.84it/s]Best trial: 0. Best value: -10.3806: 100%|██████████| 10/10 [00:01<00:00,  8.85it/s]


Best trial: 0. Best value: -4.95303:  10%|█         | 1/10 [00:08<00:31,  3.55s/it]Best trial: 0. Best value: -4.95303:  20%|██        | 2/10 [00:08<00:36,  4.56s/it]

Best trial: 2. Best value: -5.191:  20%|██        | 2/10 [00:09<00:36,  4.56s/it]  Best trial: 2. Best value: -5.191:  30%|███       | 3/10 [00:09<00:19,  2.76s/it]

Best trial: 2. Best value: -5.191:  30%|███       | 3/10 [00:10<00:19,  2.76s/it]Best trial: 2. Best value: -5.191:  40%|████      | 4/10 [00:10<00:12,  2.04s/it]

Best trial: 2. Best value: -5.191:  40%|████      | 4/10 [00:15<00:12,  2.04s/it]Best trial: 2. Best value: -5.191:  50%|█████     | 5/10 [00:15<00:15,  3.08s/it]

Best trial: 5. Best value: -5.5079:  50%|█████     | 5/10 [00:15<00:15,  3.08s/it]Best trial: 5. Best value: -5.5079:  60%|██████    | 6/10 [00:15<00:08,  2.25s/it]

Best trial: 5. Best value: -5.5079:  60%|██████    | 6/10 [00:18<00:08,  2.25s/it]Best trial: 5. Best value: -5.5079:  70%|███████   | 7/10 [00:18<00:07,  2.41s/it]

Best trial: 5. Best value: -5.5079:  70%|███████   | 7/10 [00:19<00:07,  2.41s/it]Best trial: 5. Best value: -5.5079:  80%|████████  | 8/10 [00:19<00:03,  1.86s/it]

Best trial: 5. Best value: -5.5079:  80%|████████  | 8/10 [00:21<00:03,  1.86s/it]Best trial: 5. Best value: -5.5079:  90%|█████████ | 9/10 [00:21<00:02,  2.00s/it]

Best trial: 5. Best value: -5.5079:  90%|█████████ | 9/10 [00:23<00:02,  2.00s/it]Best trial: 5. Best value: -5.5079: 100%|██████████| 10/10 [00:23<00:00,  1.96s/it]Best trial: 5. Best value: -5.5079: 100%|██████████| 10/10 [00:23<00:00,  2.35s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

Best trial: 0. Best value: -4.88801:   0%|          | 0/10 [00:03<?, ?it/s]Best trial: 0. Best value: -4.88801:  10%|█         | 1/10 [00:03<00:31,  3.56s/it]

Best trial: 0. Best value: -4.88801:  10%|█         | 1/10 [00:09<00:31,  3.56s/it]Best trial: 0. Best value: -4.88801:  20%|██        | 2/10 [00:09<00:37,  4.68s/it]

Best trial: 2. Best value: -5.17973:  20%|██        | 2/10 [00:09<00:37,  4.68s/it]Best trial: 2. Best value: -5.17973:  30%|███       | 3/10 [00:09<00:19,  2.84s/it]

Best trial: 3. Best value: -5.21983:  30%|███       | 3/10 [00:10<00:19,  2.84s/it]Best trial: 3. Best value: -5.21983:  40%|████      | 4/10 [00:10<00:12,  2.10s/it]

Best trial: 3. Best value: -5.21983:  40%|████      | 4/10 [00:15<00:12,  2.10s/it]Best trial: 3. Best value: -5.21983:  50%|█████     | 5/10 [00:15<00:16,  3.23s/it]

Best trial: 5. Best value: -5.5861:  50%|█████     | 5/10 [00:16<00:16,  3.23s/it] Best trial: 5. Best value: -5.5861:  60%|██████    | 6/10 [00:16<00:09,  2.35s/it]

Best trial: 5. Best value: -5.5861:  60%|██████    | 6/10 [00:19<00:09,  2.35s/it]Best trial: 5. Best value: -5.5861:  70%|███████   | 7/10 [00:19<00:07,  2.53s/it]

Best trial: 5. Best value: -5.5861:  70%|███████   | 7/10 [00:20<00:07,  2.53s/it]Best trial: 5. Best value: -5.5861:  80%|████████  | 8/10 [00:20<00:03,  1.95s/it]

Best trial: 5. Best value: -5.5861:  80%|████████  | 8/10 [00:22<00:03,  1.95s/it]Best trial: 5. Best value: -5.5861:  90%|█████████ | 9/10 [00:22<00:02,  2.09s/it]

Best trial: 5. Best value: -5.5861:  90%|█████████ | 9/10 [00:24<00:02,  2.09s/it]Best trial: 5. Best value: -5.5861: 100%|██████████| 10/10 [00:24<00:00,  2.03s/it]Best trial: 5. Best value: -5.5861: 100%|██████████| 10/10 [00:24<00:00,  2.44s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

Best trial: 0. Best value: -4.89426:   0%|          | 0/10 [00:03<?, ?it/s]Best trial: 0. Best value: -4.89426:  10%|█         | 1/10 [00:03<00:32,  3.59s/it]

Best trial: 0. Best value: -4.89426:  10%|█         | 1/10 [00:09<00:32,  3.59s/it]Best trial: 0. Best value: -4.89426:  20%|██        | 2/10 [00:09<00:37,  4.72s/it]

Best trial: 2. Best value: -5.17916:  20%|██        | 2/10 [00:09<00:37,  4.72s/it]Best trial: 2. Best value: -5.17916:  30%|███       | 3/10 [00:09<00:20,  2.86s/it]

Best trial: 3. Best value: -5.36247:  30%|███       | 3/10 [00:10<00:20,  2.86s/it]Best trial: 3. Best value: -5.36247:  40%|████      | 4/10 [00:10<00:12,  2.12s/it]

Best trial: 3. Best value: -5.36247:  40%|████      | 4/10 [00:15<00:12,  2.12s/it]Best trial: 3. Best value: -5.36247:  50%|█████     | 5/10 [00:15<00:16,  3.21s/it]

Best trial: 5. Best value: -5.55537:  50%|█████     | 5/10 [00:16<00:16,  3.21s/it]Best trial: 5. Best value: -5.55537:  60%|██████    | 6/10 [00:16<00:09,  2.34s/it]

Best trial: 5. Best value: -5.55537:  60%|██████    | 6/10 [00:19<00:09,  2.34s/it]Best trial: 5. Best value: -5.55537:  70%|███████   | 7/10 [00:19<00:07,  2.51s/it]

Best trial: 5. Best value: -5.55537:  70%|███████   | 7/10 [00:20<00:07,  2.51s/it]Best trial: 5. Best value: -5.55537:  80%|████████  | 8/10 [00:20<00:03,  1.94s/it]

Best trial: 5. Best value: -5.55537:  80%|████████  | 8/10 [00:22<00:03,  1.94s/it]Best trial: 5. Best value: -5.55537:  90%|█████████ | 9/10 [00:22<00:02,  2.09s/it]

Best trial: 5. Best value: -5.55537:  90%|█████████ | 9/10 [00:24<00:02,  2.09s/it]Best trial: 5. Best value: -5.55537: 100%|██████████| 10/10 [00:24<00:00,  2.02s/it]Best trial: 5. Best value: -5.55537: 100%|██████████| 10/10 [00:24<00:00,  2.44s/it]


### Aggregate Results


In [11]:
# %%capture output
# Aggregate Results
import pandas as pd

runs_df = pd.DataFrame(runs)
metrics = cv_params["scoring"]
metrics_df: dict[str, tuple[pd.DataFrame, pd.DataFrame]] = {}
for metric in metrics:
    metrics_df[metric] = (
        runs_df.map(lambda r: r[f"test_{metric}"]),
        runs_df.map(lambda r: r[f"train_{metric}"]),
    )

    # Use first metric for ranking
test_ranks_df = metrics_df[metrics[0]][0].rank(axis=0, ascending=False)
test_ranks_df["avg_rank"] = test_ranks_df.mean(axis=1)
test_ranks_df["std_rank"] = test_ranks_df.std(axis=1)

train_ranks_df = metrics_df[metrics[0]][1].rank(axis=0, ascending=False)
train_ranks_df["avg_rank"] = train_ranks_df.mean(axis=1)
train_ranks_df["std_rank"] = train_ranks_df.std(axis=1)

metrics_df["rank"] = (test_ranks_df, train_ranks_df)

# Display all metrics and ranks
for metric, (test, train) in metrics_df.items():
    display(f"test_{metric}", test, f"train_{metric}", train)

'test_neg_mean_squared_error'

Unnamed: 0,361234
Dummy,-10.464317
RF,-5.61449


'train_neg_mean_squared_error'

Unnamed: 0,361234
Dummy,-10.384931
RF,-1.080574


'test_r2'

Unnamed: 0,361234
Dummy,-0.000932
RF,0.462583


'train_r2'

Unnamed: 0,361234
Dummy,0.0
RF,0.895948


'test_rank'

Unnamed: 0,361234,avg_rank,std_rank
Dummy,2.0,2.0,0.0
RF,1.0,1.0,0.0


'train_rank'

Unnamed: 0,361234,avg_rank,std_rank
Dummy,2.0,2.0,0.0
RF,1.0,1.0,0.0


In [12]:
# output.show()

In [13]:
# # Single model evaluation
# from sklearn.model_selection import cross_validate

# task = tasks[0]
# splits = task.download_split().split
# X, y = task.get_X_and_y(dataset_format="dataframe")  # type: ignore
# cv = [s[0] for s in splits[0].values()][:3]  # Using pre-defined OpenML splits
# model = TemplateRegressor(*models["ForestMLP"])
# run = cross_validate(
#     model,
#     X,
#     y,
#     cv=cv
#     **cv_params,
# )
# display(pd.DataFrame(run).mean(axis=0))