# OpenML-CTR23 Regression Benchmark

This notebook benchmarks a variety of regression models on the tasks of `OpenML-CTR23 - A curated tabular regression benchmarking suite` (https://www.openml.org/s/353).


In [54]:
# %pip install -q pip openml scikit-learn pandas nbformat setuptools

In [55]:
# Setup
import openml
import pandas as pd
from IPython.display import display

In [56]:
# Data Preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer


def make_preprocessor(dataset: openml.OpenMLDataset) -> ColumnTransformer:
    """Build the feature preprocessor for one OpenML dataset.

    Numeric features are median-imputed and then standardized; nominal features
    are one-hot encoded, with categories unseen during fitting ignored at
    transform time. The dataset's default target attribute is excluded from both
    feature groups.

    Parameters
    ----------
    dataset : openml.OpenMLDataset
        Dataset whose feature types decide which columns go to which branch.

    Returns
    -------
    ColumnTransformer
        Unfitted transformer configured to always produce dense output.
    """
    target_name = dataset.default_target_attribute
    # Column selectors exactly as reported by OpenML for each feature type.
    numeric_cols, nominal_cols = (
        dataset.get_features_by_type(feature_type, exclude=[target_name])
        for feature_type in ("numeric", "nominal")
    )

    impute_then_scale = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
        ]
    )
    one_hot = Pipeline(
        steps=[
            ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
        ]
    )

    transformers = [
        ("numeric_preprocessor", impute_then_scale, numeric_cols),
        ("nominal_preprocessor", one_hot, nominal_cols),
    ]
    # A threshold of 0.0 ensures dense output.
    return ColumnTransformer(transformers, sparse_threshold=0.0)

In [57]:
# Baseline Models
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor

# Baseline models, keyed by estimator class name. The same names are used as
# keys of `hpo_grid` for the models that get a hyperparameter search.
models = {
    type(estimator).__name__: estimator
    for estimator in (
        DummyRegressor(strategy="mean"),
        RandomForestRegressor(),
        LinearRegression(),
        MLPRegressor(),
    )
}

In [None]:
# Hyperparameter Optimization
from scipy.stats import uniform
from sklearn.base import clone
from sklearn.model_selection import KFold, ParameterGrid, RandomizedSearchCV

# Settings for a hyperparameter-search object.
# NOTE(review): no cell shown here reads this dict (make_hpo_model configures its
# RandomizedSearchCV on its own), and `n_trials` / `timeout` are not
# RandomizedSearchCV arguments -- confirm the intended consumer.
search_params = dict(
    cv=5,
    n_jobs=-1,
    n_trials=20,
    return_train_score=False,
    # scoring="neg_mean_squared_error",
    timeout=None,
    verbose=0,
)

# Candidate hyperparameter values per model. Outer keys are model names from
# `models`; inner keys are estimator parameter names, which make_hpo_model
# prefixes with "model__" to address the pipeline's "model" step.
hpo_grid = {
    "RandomForestRegressor": dict(
        n_estimators=[5, 10],
        max_depth=[2, 5],
    ),
    "MLPRegressor": dict(
        hidden_layer_sizes=[(5,), (10,)],
        learning_rate_init=[1e-3, 1e-1],
        max_iter=[100, 200],
    ),
}


def make_hpo_model(
    preprocessor: ColumnTransformer, model_name: str, n_iter: int = 10
) -> RandomizedSearchCV | Pipeline:
    """Build the estimator to benchmark for ``model_name``.

    The preprocessor and a fresh copy of ``models[model_name]`` are chained into
    a Pipeline. If ``model_name`` has an entry in ``hpo_grid``, that pipeline is
    wrapped in a RandomizedSearchCV over the entry (4-fold shuffled
    cross-validation); otherwise the plain pipeline is returned.

    Parameters
    ----------
    preprocessor : ColumnTransformer
        Unfitted feature preprocessor used as the pipeline's first step.
    model_name : str
        Key into ``models`` and, optionally, ``hpo_grid``.
    n_iter : int, default=10
        Maximum number of configurations to sample. When every candidate in the
        grid is a plain list, the value is capped at the number of distinct
        combinations.

    Returns
    -------
    RandomizedSearchCV | Pipeline
        Unfitted search object if the model has a grid, else the unfitted pipeline.

    Raises
    ------
    KeyError
        If ``model_name`` is not a key of ``models``.
    """
    # Clone so the shared instance stored in `models` is never touched: anything
    # that later sets parameters on, or fits, the returned estimator in place
    # would otherwise leak that state into every subsequent call.
    pipe = Pipeline(
        [("preprocessor", preprocessor), ("model", clone(models[model_name]))]
    )
    if model_name not in hpo_grid:
        return pipe

    # Route each hyperparameter to the pipeline's "model" step.
    param_distributions = {
        f"model__{name}": candidates
        for name, candidates in hpo_grid[model_name].items()
    }

    # RandomizedSearchCV warns on every fit when n_iter exceeds a purely discrete
    # search space and then falls back to the full grid anyway. Capping n_iter up
    # front selects the same configurations without the warning. Grids holding
    # distributions (objects with `.rvs`) have no finite size and are left alone.
    is_discrete = all(
        not hasattr(candidates, "rvs") for candidates in param_distributions.values()
    )
    if is_discrete:
        n_iter = min(n_iter, len(ParameterGrid(param_distributions)))

    return RandomizedSearchCV(
        pipe,
        param_distributions,
        n_iter=n_iter,
        cv=KFold(n_splits=4, shuffle=True),
    )

In [59]:
# Evaluate Model
def evaluate_model(task: openml.OpenMLTask, model_name: str, seed: int = 42):
    """Run one named model on an OpenML task and return the resulting run.

    Parameters
    ----------
    task : openml.OpenMLTask
        Task to evaluate on; its dataset determines the preprocessing.
    model_name : str
        Name of the model, as understood by ``make_hpo_model``.
    seed : int, default=42
        Forwarded as ``seed`` to ``openml.runs.run_model_on_task``.

    Returns
    -------
    The object returned by ``openml.runs.run_model_on_task``.
    """
    estimator = make_hpo_model(make_preprocessor(task.get_dataset()), model_name)
    return openml.runs.run_model_on_task(estimator, task, seed=seed, n_jobs=-1)

In [60]:
# Define Benchmark Suite
# Identifier of "OpenML-CTR23 - A curated tabular regression benchmarking suite".
SUITE_ID = "8f0ea660163b436bbd4abd49665c7b1d"
# Fetch the suite description; `suite.tasks` supplies the task ids used by the
# benchmark cells below.
suite = openml.study.get_suite(SUITE_ID)
display(suite)

OpenML Benchmark Suite
ID..............: 353
Name............: OpenML-CTR23 - A curated tabular regression benchmarking suite
Status..........: active
Main Entity Type: task
Study URL.......: https://www.openml.org/s/353
# of Data.......: 35
# of Tasks......: 35
Creator.........: https://www.openml.org/u/30127
Upload Time.....: 2023-05-31 16:39:49

In [None]:
# Run Models
# Upper bound on how many suite tasks to run; lower it for a quick smoke test.
N_TASKS = 999
# task id -> {model name -> run}
task_results = {}

for task_id in suite.tasks[:N_TASKS]:
    task = openml.tasks.get_task(task_id)
    runs_by_model = {}
    for model_name in models:
        print(f"Running {model_name} on task {task.target_name} id {task_id}...")
        runs_by_model[model_name] = evaluate_model(task, model_name)
    # Record a task only once every model has finished on it.
    task_results[task_id] = runs_by_model

In [None]:
# Aggregate Results
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score

def _mean_score(metric):
    """Return a function that maps a run to the mean of its ``metric`` scores."""
    # presumably get_metric_fn yields one score per evaluation fold -- verify.
    return lambda run: run.get_metric_fn(metric).mean()


# Rows are model names, columns are task ids.
results_df = pd.DataFrame(task_results)
mse_df = results_df.map(_mean_score(mean_squared_error))
r2_df = results_df.map(_mean_score(r2_score))
# Rank the models within each task (1 = lowest MSE), then average each model's
# rank over all tasks.
rank_df = mse_df.rank(axis=0, method="min", ascending=True)
average_rank = rank_df.mean(axis=1)
display(mse_df, r2_df, rank_df, average_rank)

In [61]:
# Single Model Evaluation
# Performs the same steps as evaluate_model for one task and one model, leaving
# the intermediate objects (dataset, preprocessor, model) as notebook variables.
task_id = suite.tasks[0]  # first task listed in the suite
task = openml.tasks.get_task(task_id)
dataset = task.get_dataset()
preprocessor = make_preprocessor(dataset)

# "LinearRegression" has no entry in hpo_grid, so make_hpo_model returns the
# plain preprocessing + model pipeline (no hyperparameter search).
model_name = "LinearRegression"
model = make_hpo_model(preprocessor, model_name)
# NOTE: unlike evaluate_model, no `seed` is passed to run_model_on_task here.
run = openml.runs.run_model_on_task(model, task, n_jobs=-1)
display(run)



OpenML Run
Uploader Name..............: None
Metric.....................: None
Local Result - MAE (+- STD): 1.5850 +- 0.0716
Local Runtime - ms (+- STD): 24.1282 +- 7.6310
Run ID.....................: None
Task ID....................: 361234
Task Type..................: None
Task URL...................: https://www.openml.org/t/361234
Flow ID....................: None
Flow Name..................: sklearn.pipeline.Pipeline(preprocessor=sklearn.compose._column_transformer.ColumnTransformer(numeric_preprocessor=sklearn.pipeline.Pipeline(imputer=sklearn.impute._base.SimpleImputer,scaler=sklearn.preprocessing._data.StandardScaler),nominal_preprocessor=sklearn.pipeline.Pipeline(onehot=sklearn.preprocessing._encoders.OneHotEncoder)),model=sklearn.linear_model._base.LinearRegression)
Flow URL...................: None
Setup ID...................: None
Setup String...............: Python_3.13.3. Sklearn_1.6.1. NumPy_2.2.5. SciPy_1.15.2.
Dataset ID.................: 44956
Dataset URL.............