# OpenML-CTR23 Regression Benchmark

This notebook benchmarks a variety of regression models on the `OpenML-CTR23 - A curated tabular regression benchmarking suite` benchmark suite.


In [None]:
# %pip install -q pip openml scikit-learn pandas nbformat setuptools

In [1]:
# Baseline Models
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor

# Baseline models
models = {
    "DummyRegressor": DummyRegressor(strategy="mean"),
    "RandomForestRegressor": RandomForestRegressor(),
    # "LinearRegression": LinearRegression(),
    # "MLPRegressor": MLPRegressor(),
}

In [2]:
# Hyperparameter Optimization
from scipy.stats import loguniform, randint
from sklearn.model_selection import KFold, RandomizedSearchCV

search_params = {
    "cv": 5,
    "n_jobs": -1,
    "n_trials": 10,
    "return_train_score": False,
    "scoring": "neg_mean_squared_error",
    "timeout": None,
    "verbose": 0,
}

hpo_grid = {
    "RandomForestRegressor": {
        "n_estimators": randint(5, 100),
        "max_depth": [2, 5, 10],
    },
    "MLPRegressor": {
        "hidden_layer_sizes": [(10,), (50,), (10, 10), (50, 50)],
        "alpha": loguniform(1e-5, 1e-1),
        "learning_rate_init": loguniform(1e-4, 1e-1),
        "max_iter": [100, 400, 1000],
    },
}

In [33]:
# Define Benchmark Suite
import openml
from IPython.display import display

SUITE_ID = "8f0ea660163b436bbd4abd49665c7b1d"  # OpenML-CTR23 - A curated tabular regression benchmarking suite
suite = openml.study.get_suite(SUITE_ID)
display(suite)

OpenML Benchmark Suite
ID..............: 353
Name............: OpenML-CTR23 - A curated tabular regression benchmarking suite
Status..........: active
Main Entity Type: task
Study URL.......: https://www.openml.org/s/353
# of Data.......: 35
# of Tasks......: 35
Creator.........: https://www.openml.org/u/30127
Upload Time.....: 2023-05-31 16:39:49

In [4]:
# Data Preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer


def make_preprocessor(dataset: openml.OpenMLDataset) -> ColumnTransformer:
    target = dataset.default_target_attribute
    numeric_features = dataset.get_features_by_type("numeric", exclude=[target])
    nominal_features = dataset.get_features_by_type("nominal", exclude=[target])

    numeric_transformer = Pipeline(
        [("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
    )
    nominal_transformer = Pipeline(
        [("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))]
    )

    return ColumnTransformer(
        [
            ("numeric_preprocessor", numeric_transformer, numeric_features),
            ("nominal_preprocessor", nominal_transformer, nominal_features),
        ],
        sparse_threshold=0.0,  # Ensure dense output
    )

In [29]:
# Run Models
SEED = 42
N_TASKS = 2
task_runs = {}

for task_id in suite.tasks[:N_TASKS]:
    model_results = {}
    task = openml.tasks.get_task(task_id)
    dataset = task.get_dataset()
    preprocessor = make_preprocessor(dataset)
    for model_name in models.keys():
        print(f"Running {model_name=} on {dataset.name=}...")
        pipe = Pipeline([("preprocessor", preprocessor), ("model", models[model_name])])
        model = (
            RandomizedSearchCV(
                pipe,
                {f"model__{k}": v for k, v in hpo_grid[model_name].items()},
                cv=KFold(n_splits=4, shuffle=True),
            )
            if model_name in hpo_grid
            else pipe
        )
        # run = openml.runs.run_model_on_task(model, task, seed=SEED, n_jobs=-1)
        # model_results[model_name] = run
    task_runs[task_id] = model_results

Running model_name='DummyRegressor' on dataset.name='abalone'...
Running model_name='RandomForestRegressor' on dataset.name='abalone'...
Running model_name='DummyRegressor' on dataset.name='airfoil_self_noise'...
Running model_name='RandomForestRegressor' on dataset.name='airfoil_self_noise'...


In [8]:
# Aggregate Results
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score

runs_df = pd.DataFrame(task_runs)
mse_df = runs_df.map(lambda run: run.get_metric_fn(mean_squared_error).mean())
r2_df = runs_df.map(lambda run: run.get_metric_fn(r2_score).mean())
rank_df = mse_df.rank(axis=0, method="min", ascending=True)
average_rank = rank_df.mean(axis=1)
display(mse_df, r2_df, rank_df, average_rank)

Unnamed: 0,361234,361235
DummyRegressor,10.39585,47.632423
RandomForestRegressor,4.622136,3.688402


Unnamed: 0,361234,361235
DummyRegressor,-0.001675,-0.007684
RandomForestRegressor,0.554442,0.921467


Unnamed: 0,361234,361235
DummyRegressor,2.0,2.0
RandomForestRegressor,1.0,1.0


DummyRegressor           2.0
RandomForestRegressor    1.0
dtype: float64

In [10]:
# Single Model Evaluation
task_id = suite.tasks[4]
task = openml.tasks.get_task(task_id)
dataset = task.get_dataset()
preprocessor = make_preprocessor(dataset)

model_name = "RandomForestRegressor"
pipe = Pipeline([("preprocessor", preprocessor), ("model", models[model_name])])
model = pipe

run = openml.runs.run_model_on_task(model, task, n_jobs=-1)
display(run)

OpenML Run
Uploader Name..............: None
Metric.....................: None
Local Result - MAE (+- STD): 2.3273 +- 0.0312
Local Runtime - ms (+- STD): 60488.1172 +- 735.7695
Run ID.....................: None
Task ID....................: 361241
Task Type..................: None
Task URL...................: https://www.openml.org/t/361241
Flow ID....................: None
Flow Name..................: sklearn.pipeline.Pipeline(preprocessor=sklearn.compose._column_transformer.ColumnTransformer(numeric_preprocessor=sklearn.pipeline.Pipeline(imputer=sklearn.impute._base.SimpleImputer,scaler=sklearn.preprocessing._data.StandardScaler),nominal_preprocessor=sklearn.pipeline.Pipeline(onehot=sklearn.preprocessing._encoders.OneHotEncoder)),model=sklearn.ensemble._forest.RandomForestRegressor)
Flow URL...................: None
Setup ID...................: None
Setup String...............: Python_3.13.3. Sklearn_1.6.1. NumPy_2.2.5. SciPy_1.15.2.
Dataset ID.................: 44963
Dataset URL.....

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor


class ForestMLP(MLPRegressor):
    def __init__(self, forest_args, mlp_args):
        self.forest_ = RandomForestRegressor(**forest_args)
        super().__init__(**mlp_args)

    def fit(self, X, y):
        self.forest_.fit(X, y)
        super().fit(X, y)
        # return self


task_id = suite.tasks[0]
task = openml.tasks.get_task(task_id)
dataset = task.get_dataset()
preprocessor = make_preprocessor(dataset)
X, y, _, _ = dataset.get_data(target=dataset.default_target_attribute)
X = preprocessor.fit_transform(X)

# mlp = MLPRegressor(
#     hidden_layer_sizes=(10,),
#     max_iter=200,
# )
# mlp.fit(X, y)
# display(mlp.score(X, y))

forest_mlp = ForestMLP(
    forest_args={
        "n_estimators": 100,
        "max_depth": 2,
    },
    mlp_args={
        "hidden_layer_sizes": (10,),
        "max_iter": 200,
    },
)
# forest_mlp.forest_.fit(X, y)
forest_mlp.fit(X, y)

AttributeError: 'ForestMLP' object has no attribute 'forest_args'