Imports & Setup

In [32]:
import numpy as np
import pandas as pd

from pathlib import Path
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score


Configuration: Datasets + Settings

In [33]:
import sys
sys.executable


'/Users/srinivass/Budgetaware_hpo/.venv/bin/python'

In [34]:
# Project root (macOS)
PROJECT_ROOT = Path("/Users/srinivass/Budgetaware_hpo")

# Output directory
BASELINE_DIR = PROJECT_ROOT / "results" / "baselines"
BASELINE_DIR.mkdir(parents=True, exist_ok=True)

# Experiment config
DATASET_NAME = "covertype"
TEST_SIZE = 0.2
VAL_SIZE = 0.25   # 0.25 of remaining 80% â†’ 20% val
N_REPEATS = 20
RANDOM_SEEDS = list(range(N_REPEATS))

print("Saving baselines to:", BASELINE_DIR)


Saving baselines to: /Users/srinivass/Budgetaware_hpo/results/baselines


Dataset Loader Function

In [35]:
def load_covertype():
    X, y = fetch_openml(
        name="covertype",
        version=2,
        as_frame=False,
        return_X_y=True
    )
    y = y.astype(int)
    return X, y

X, y = load_covertype()
from sklearn.utils import resample

MAX_SAMPLES = 50000  # adjust later if needed

if X.shape[0] > MAX_SAMPLES:
    X, y = resample(
        X, y,
        n_samples=MAX_SAMPLES,
        stratify=y,
        random_state=42
    )

print("Using dataset shape:", X.shape)



Using dataset shape: (50000, 54)


Baseline MLP builder (reasonable, not weak)

In [36]:
def build_mlp_baseline(random_state):
    return Pipeline([
        ("scaler", StandardScaler(with_mean=False)),
        ("mlp", MLPClassifier(
            hidden_layer_sizes=(100,),
            activation="relu",
            solver="adam",
            learning_rate_init=1e-3,
            alpha=1e-4,
            batch_size=256,
            max_iter=300,
            early_stopping=True,
            n_iter_no_change=20,
            random_state=random_state
        ))
    ])


Run Baseline

In [37]:
all_results = []

for seed in RANDOM_SEEDS:
    print(f"\nRun {seed + 1}/{N_REPEATS}")

    # Train / val / test split
    X_temp, X_test, y_temp, y_test = train_test_split(
        X, y,
        test_size=TEST_SIZE,
        stratify=y,
        random_state=seed
    )

    X_train, X_val, y_train, y_val = train_test_split(
        X_temp, y_temp,
        test_size=VAL_SIZE,
        stratify=y_temp,
        random_state=seed
    )

    model = build_mlp_baseline(seed)
    model.fit(X_train, y_train)

    preds = model.predict(X_test)
    f1 = f1_score(y_test, preds, average="macro")

    all_results.append({
        "dataset": DATASET_NAME,
        "seed": seed,
        "f1_macro": f1
    })

    print(f"Macro-F1: {f1:.4f}")



Run 1/20
Macro-F1: 0.8157

Run 2/20
Macro-F1: 0.8188

Run 3/20
Macro-F1: 0.8110

Run 4/20
Macro-F1: 0.8140

Run 5/20
Macro-F1: 0.8106

Run 6/20
Macro-F1: 0.7976

Run 7/20
Macro-F1: 0.8153

Run 8/20
Macro-F1: 0.7945

Run 9/20
Macro-F1: 0.8152

Run 10/20
Macro-F1: 0.8260

Run 11/20
Macro-F1: 0.8110

Run 12/20
Macro-F1: 0.7927

Run 13/20
Macro-F1: 0.8016

Run 14/20
Macro-F1: 0.8084

Run 15/20
Macro-F1: 0.8118

Run 16/20
Macro-F1: 0.8077

Run 17/20
Macro-F1: 0.8042

Run 18/20
Macro-F1: 0.8008

Run 19/20
Macro-F1: 0.8111

Run 20/20
Macro-F1: 0.8177


In [38]:
df_baseline = pd.DataFrame(all_results)

output_path = BASELINE_DIR / "mlp_baseline_covertype.csv"
df_baseline.to_csv(output_path, index=False)

print("\nBaseline summary:")
print(df_baseline["f1_macro"].describe())

print("\nSaved baseline to:")
print(output_path)



Baseline summary:
count    20.000000
mean      0.809283
std       0.008529
min       0.792746
25%       0.803517
50%       0.810983
75%       0.815205
max       0.825980
Name: f1_macro, dtype: float64

Saved baseline to:
/Users/srinivass/Budgetaware_hpo/results/baselines/mlp_baseline_covertype.csv
