# OpenML-CTR23 Regression Benchmark

This notebook benchmarks a variety of regression models on the `OpenML-CTR23 - A curated tabular regression benchmarking suite` benchmark suite.


In [1]:
# %pip install -q pip openml scikit-learn pandas nbformat setuptools

In [None]:
# Baseline Models
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor

# Baseline models
models = {
    "DummyRegressor": DummyRegressor(strategy="mean"),
    "RandomForestRegressor": RandomForestRegressor(),
    # "LinearRegression": LinearRegression(),
    "MLPRegressor": MLPRegressor(solver="sgd"),
}

In [3]:
# Hyperparameter Optimization
from scipy.stats import loguniform, randint
from sklearn.model_selection import KFold, RandomizedSearchCV

search_params = {
    "cv": 5,
    "n_jobs": -1,
    "n_trials": 10,
    "return_train_score": False,
    "scoring": "neg_mean_squared_error",
    "timeout": None,
    "verbose": 0,
}

hpo_grid = {
    "RandomForestRegressor": {
        "n_estimators": randint(5, 100),
        "max_depth": [2, 5, 10],
    },
    "MLPRegressor": {
        "hidden_layer_sizes": [(10,), (50,), (10, 10), (50, 50)],
        "alpha": loguniform(1e-5, 1e-1),
        "learning_rate_init": loguniform(1e-4, 1e-1),
        "max_iter": [100, 400, 1000],
    },
}

In [4]:
# Define Benchmark Suite
import openml
from IPython.display import display

SUITE_ID = "8f0ea660163b436bbd4abd49665c7b1d"  # OpenML-CTR23 - A curated tabular regression benchmarking suite
suite = openml.study.get_suite(SUITE_ID)
display(suite)

OpenML Benchmark Suite
ID..............: 353
Name............: OpenML-CTR23 - A curated tabular regression benchmarking suite
Status..........: active
Main Entity Type: task
Study URL.......: https://www.openml.org/s/353
# of Data.......: 35
# of Tasks......: 35
Creator.........: https://www.openml.org/u/30127
Upload Time.....: 2023-05-31 16:39:49

In [5]:
# Data Preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer


def make_preprocessor(dataset: openml.OpenMLDataset) -> ColumnTransformer:
    target = dataset.default_target_attribute
    numeric_features = dataset.get_features_by_type("numeric", exclude=[target])
    nominal_features = dataset.get_features_by_type("nominal", exclude=[target])

    numeric_transformer = Pipeline(
        [("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
    )
    nominal_transformer = Pipeline(
        [("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))]
    )

    return ColumnTransformer(
        [
            ("numeric_preprocessor", numeric_transformer, numeric_features),
            ("nominal_preprocessor", nominal_transformer, nominal_features),
        ],
        sparse_threshold=0.0,  # Ensure dense output
    )

In [None]:
# Run Models
SEED = 42
N_TASKS = 2
task_runs = {}

for task_id in suite.tasks[:N_TASKS]:
    model_results = {}
    task = openml.tasks.get_task(task_id)
    dataset = task.get_dataset()
    preprocessor = make_preprocessor(dataset)
    for model_name in models.keys():
        print(f"Running {model_name=} on {dataset.name=}...")
        pipe = Pipeline([("preprocessor", preprocessor), ("model", models[model_name])])
        model = (
            RandomizedSearchCV(
                pipe,
                {f"model__{k}": v for k, v in hpo_grid[model_name].items()},
                cv=KFold(n_splits=4, shuffle=True),
            )
            if model_name in hpo_grid
            else pipe
        )
        # run = openml.runs.run_model_on_task(model, task, seed=SEED, n_jobs=-1)
        # model_results[model_name] = run
    task_runs[task_id] = model_results

In [None]:
# Aggregate Results
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score

runs_df = pd.DataFrame(task_runs)
mse_df = runs_df.map(lambda run: run.get_metric_fn(mean_squared_error).mean())
r2_df = runs_df.map(lambda run: run.get_metric_fn(r2_score).mean())
rank_df = mse_df.rank(axis=0, method="min", ascending=True)
average_rank = rank_df.mean(axis=1)
display(mse_df, r2_df, rank_df, average_rank)

In [None]:
# Single Model Evaluation
task_id = suite.tasks[4]
task = openml.tasks.get_task(task_id)
dataset = task.get_dataset()
preprocessor = make_preprocessor(dataset)

model_name = "RandomForestRegressor"
pipe = Pipeline([("preprocessor", preprocessor), ("model", models[model_name])])
model = pipe

run = openml.runs.run_model_on_task(model, task, n_jobs=-1)
display(run)

In [12]:
from sklearn.ensemble import StackingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor

task_id = suite.tasks[0]
task = openml.tasks.get_task(task_id)
dataset = task.get_dataset()
preprocessor = make_preprocessor(dataset)

mlp = MLPRegressor(
    hidden_layer_sizes=(10,),
    max_iter=200,
)
tree_mlp = StackingRegressor(
    estimators=[("Tree", DecisionTreeRegressor())],
    final_estimator=mlp,
    passthrough=True,
)
forest_mlp = StackingRegressor(
    estimators=[(f"Tree {i}", DecisionTreeRegressor(random_state=i)) for i in range(5)],
    final_estimator=mlp,
    passthrough=True,
)

mlp_pipe = Pipeline([("preprocessor", preprocessor), ("model", mlp)])
tree_mlp_pipe = Pipeline([("preprocessor", preprocessor), ("model", tree_mlp)])
forest_mlp_pipe = Pipeline([("preprocessor", preprocessor), ("model", forest_mlp)])

# mlp_run = openml.runs.run_model_on_task(mlp_pipe, task, n_jobs=-1)
# tree_mlp_run = openml.runs.run_model_on_task(tree_mlp_pipe, task, n_jobs=-1)
forest_mlp_run = openml.runs.run_model_on_task(forest_mlp_pipe, task, n_jobs=-1)



ValueError: Found a second occurence of component sklearn.tree._classes.DecisionTreeRegressor when trying to serialize StackingRegressor(estimators=[('Tree 0', DecisionTreeRegressor(random_state=0)),
                              ('Tree 1', DecisionTreeRegressor(random_state=1)),
                              ('Tree 2', DecisionTreeRegressor(random_state=2)),
                              ('Tree 3', DecisionTreeRegressor(random_state=3)),
                              ('Tree 4',
                               DecisionTreeRegressor(random_state=4))],
                  final_estimator=MLPRegressor(hidden_layer_sizes=(10,)),
                  passthrough=True).

In [None]:
display(mlp_run, tree_mlp_run)

In [None]:
from sklearn.datasets import fetch_openml

dataset = fetch_openml(data_id=44956)
display(dataset)
[openml.tasks.get_task(task).dataset_id for task in suite.tasks]