Imports

In [1]:
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score
from scipy.stats import loguniform

# Enable successive halving
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV

import os
import warnings
warnings.filterwarnings("ignore")


Config

In [4]:
# OpenML dataset names to benchmark, and the fraction of rows held out for testing
DATASETS = ["adult", "credit-g"]
TEST_SIZE = 0.2

# Each HPO method is repeated N_RUNS times, using one seed per repetition
N_RUNS = 20
RUN_SEEDS = [seed for seed in range(N_RUNS)]

# Random Search budget: number of configurations sampled per run
N_ITER_RANDOM = 20

# Results directory (relative to the working directory): ../results/hpo
OUTPUT_DIR = "../results/hpo"
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("Saving HPO results to:", os.path.abspath(OUTPUT_DIR))


Saving HPO results to: c:\BudgetAware_HPO_Dissertation\results\hpo


Dataset Loader

In [5]:
def load_openml_dataset(name):
    """Fetch version 1 of an OpenML dataset and integer-encode its target.

    Parameters
    ----------
    name : str
        OpenML dataset name, passed straight to ``fetch_openml``.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the feature frame returned by OpenML and
        ``y`` is the array of integer codes produced by ``pd.factorize`` on
        the target column.
    """
    bunch = fetch_openml(name=name, version=1, as_frame=True)
    # factorize returns (codes, uniques); only the integer codes are needed
    label_codes, _ = pd.factorize(bunch.target)
    return bunch.data, label_codes


MLP Pipeline Builder (with preprocessing)

In [6]:
def build_mlp_pipeline(X, random_state=42):
    """Build an unfitted preprocessing + MLP pipeline matched to the columns of ``X``.

    ``X`` is only inspected for column dtypes; nothing is fitted here.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame whose dtypes decide which columns are one-hot encoded
        (``object`` / ``category``) and which are standardised (any numeric
        dtype).
    random_state : int, default=42
        Seed forwarded to ``MLPClassifier``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Two steps: ``"preprocessor"`` (a ``ColumnTransformer``) followed by
        ``"clf"`` (an ``MLPClassifier``).
    """
    categorical_cols = X.select_dtypes(include=["object", "category"]).columns
    # "number" matches every numeric dtype (int32, uint8, float32, ...).
    # Listing only int64/float64 would leave other numeric columns unassigned,
    # and ColumnTransformer drops unassigned columns by default
    # (remainder="drop"), silently removing features from the model.
    numeric_cols = X.select_dtypes(include="number").columns

    preprocessor = ColumnTransformer(
        transformers=[
            # handle_unknown="ignore": categories unseen during fit encode as all zeros
            ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
            ("num", StandardScaler(), numeric_cols)
        ]
    )

    clf = MLPClassifier(
        hidden_layer_sizes=(50,),   # will be overridden by HPO
        activation="relu",
        solver="adam",
        learning_rate_init=0.001,   # overridden in HPO
        max_iter=200,               # baseline minimum; halving overrides it during search
        alpha=0.0001,
        batch_size="auto",
        random_state=random_state
    )

    model = Pipeline([
        ("preprocessor", preprocessor),
        ("clf", clf)
    ])

    return model


Hyperparameter Search Spaces

In [7]:
def get_param_distributions():
    """Return the hyperparameter search space shared by the HPO methods.

    Keys use the ``clf__`` prefix so they address the ``"clf"`` step of the
    pipeline. Layer sizes are drawn from a fixed list; ``alpha`` and
    ``learning_rate_init`` are drawn from log-uniform distributions.
    """
    layer_choices = [(50,), (100,), (50, 50), (100, 50), (100, 100)]
    alpha_dist = loguniform(1e-5, 1e-1)
    lr_dist = loguniform(1e-4, 1e-1)

    space = {}
    space["clf__hidden_layer_sizes"] = layer_choices
    space["clf__alpha"] = alpha_dist
    space["clf__learning_rate_init"] = lr_dist
    return space


Random Search Function

In [8]:
def run_random_search(X, y, run_seed, dataset_name):
    """Run one seeded Random Search repetition and score it on a held-out split.

    Parameters
    ----------
    X, y
        Full feature frame and target; split here into train and test parts.
    run_seed : int
        Seed used for the split, the MLP and the search sampler.
    dataset_name : str
        Label written to the log line and to the result row.

    Returns
    -------
    tuple
        ``(result_row, cv_results)``: a dict with the run metadata, best CV
        score, test macro-F1 and best parameters, and a DataFrame built from
        the search's ``cv_results_``.
    """
    print(f"\n[Random Search] Dataset={dataset_name}, seed={run_seed}")

    # Stratified hold-out split, seeded per run
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=run_seed, stratify=y
    )

    searcher = RandomizedSearchCV(
        estimator=build_mlp_pipeline(X, random_state=run_seed),
        param_distributions=get_param_distributions(),
        n_iter=N_ITER_RANDOM,
        scoring="f1_macro",
        cv=3,
        n_jobs=-1,
        random_state=run_seed,
        verbose=1,
        refit=True
    )
    searcher.fit(X_tr, y_tr)

    # Score the refitted best pipeline on the untouched test split
    holdout_f1 = f1_score(
        y_te, searcher.best_estimator_.predict(X_te), average="macro"
    )

    summary = {
        "dataset": dataset_name,
        "method": "random_search",
        "run_seed": run_seed,
        "best_cv_f1": searcher.best_score_,
        "test_f1": holdout_f1,
    }
    # Append the winning hyperparameters after the fixed columns
    summary.update(searcher.best_params_)

    return summary, pd.DataFrame(searcher.cv_results_)


Successive Halving Function

In [9]:
def run_halving_random_search(X, y, run_seed, dataset_name):
    """Run one seeded Successive Halving repetition and score it on a held-out split.

    The halving resource is ``clf__max_iter``, ranging from 100 to at most
    1200 with an elimination factor of 3.

    Parameters
    ----------
    X, y
        Full feature frame and target; split here into train and test parts.
    run_seed : int
        Seed used for the split, the MLP and the search sampler.
    dataset_name : str
        Label written to the log line and to the result row.

    Returns
    -------
    tuple
        ``(result_row, cv_results)``: a dict with the run metadata, best CV
        score, test macro-F1 and best parameters, and a DataFrame built from
        the search's ``cv_results_``.
    """
    print(f"\n[Successive Halving] Dataset={dataset_name}, seed={run_seed}")

    # Stratified hold-out split, seeded per run
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=run_seed, stratify=y
    )

    searcher = HalvingRandomSearchCV(
        estimator=build_mlp_pipeline(X, random_state=run_seed),
        param_distributions=get_param_distributions(),
        factor=3,
        resource="clf__max_iter",
        min_resources=100,
        max_resources=1200,
        scoring="f1_macro",
        cv=3,
        random_state=run_seed,
        n_jobs=-1,
        verbose=1,
        refit=True
    )
    searcher.fit(X_tr, y_tr)

    # Score the refitted best pipeline on the untouched test split
    holdout_f1 = f1_score(
        y_te, searcher.best_estimator_.predict(X_te), average="macro"
    )

    summary = {
        "dataset": dataset_name,
        "method": "halving_random_search",
        "run_seed": run_seed,
        "best_cv_f1": searcher.best_score_,
        "test_f1": holdout_f1,
    }
    # Append the winning hyperparameters after the fixed columns
    summary.update(searcher.best_params_)

    return summary, pd.DataFrame(searcher.cv_results_)


Main Loop over Datasets and Methods

In [11]:
# Each HPO runner is paired with the label stored in the "method" column.
hpo_methods = [
    ("random_search", run_random_search),
    ("halving_random_search", run_halving_random_search),
]

summary_rows = []
cv_rows = []

# The full sweep is long-running. Building the frames in `finally` means an
# interrupt or a failing run still leaves df_summary / df_cv reflecting every
# run completed so far, instead of leaving them undefined or stale from an
# earlier execution. The exception itself still propagates.
try:
    for dataset_name in DATASETS:
        print("\n" + "="*60)
        print("DATASET:", dataset_name)
        print("="*60)

        X, y = load_openml_dataset(dataset_name)

        for run_seed in RUN_SEEDS:
            for method_name, run_method in hpo_methods:
                row, cv_table = run_method(X, y, run_seed, dataset_name)
                summary_rows.append(row)
                # Tag every per-candidate CV record with its run identity
                cv_rows.append(
                    cv_table.assign(
                        dataset=dataset_name,
                        method=method_name,
                        run_seed=run_seed,
                    )
                )
finally:
    df_summary = pd.DataFrame(summary_rows)
    # pd.concat raises ValueError on an empty list, so fall back to an empty frame
    df_cv = pd.concat(cv_rows, ignore_index=True) if cv_rows else pd.DataFrame()

df_summary



DATASET: adult

[Random Search] Dataset=adult, seed=0
Fitting 3 folds for each of 20 candidates, totalling 60 fits

[Successive Halving] Dataset=adult, seed=0
n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 100
max_resources_: 1200
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 12
n_resources: 100
Fitting 3 folds for each of 12 candidates, totalling 36 fits
----------
iter: 1
n_candidates: 4
n_resources: 300
Fitting 3 folds for each of 4 candidates, totalling 12 fits
----------
iter: 2
n_candidates: 2
n_resources: 900
Fitting 3 folds for each of 2 candidates, totalling 6 fits

[Random Search] Dataset=adult, seed=1
Fitting 3 folds for each of 20 candidates, totalling 60 fits

[Successive Halving] Dataset=adult, seed=1
n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 100
max_resources_: 1200
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 12
n_resources: 100
Fitting 3 fo

KeyboardInterrupt: 

Save Results to CSV

In [17]:
# Destination files: one row per run (summary) and one row per evaluated
# candidate (CV results), both inside OUTPUT_DIR.
summary_path, cv_path = (
    os.path.join(OUTPUT_DIR, filename)
    for filename in ("mlp_hpo_summary.csv", "mlp_hpo_cv_results.csv")
)

# index=False keeps the positional index out of the CSV files
df_summary.to_csv(summary_path, index=False)
df_cv.to_csv(cv_path, index=False)

print("Saved summary to:", os.path.abspath(summary_path))
print("Saved CV results to:", os.path.abspath(cv_path))


Saved summary to: c:\BudgetAware_HPO_Dissertation\results\hpo\mlp_hpo_summary.csv
Saved CV results to: c:\BudgetAware_HPO_Dissertation\results\hpo\mlp_hpo_cv_results.csv
