Imports

In [2]:
import warnings
warnings.filterwarnings("ignore")

import optuna
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
import os

Load dataset

In [3]:
credit = fetch_openml("credit-g", version=1, as_frame=True)
X = credit.data
y = credit.target

categorical = X.select_dtypes(include=["object", "category"]).columns
numeric = X.select_dtypes(include=["int64", "float64", "int32", "float32"]).columns

preprocess = ColumnTransformer([
    ("num", StandardScaler(), numeric),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical)
])

Train-test split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


HPO objective

In [5]:
def objective(trial):
    C = trial.suggest_float("C", 1e-4, 10.0, log=True)
    class_weight = trial.suggest_categorical("class_weight", [None, "balanced"])

    model = Pipeline([
        ("pre", preprocess),
        ("clf", LogisticRegression(
            max_iter=300,
            C=C,
            penalty="l2",
            class_weight=class_weight
        ))
    ])

    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return f1_score(y_test, preds, average="macro")


Run HPO

In [6]:
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction="maximize", sampler=sampler)

study.optimize(objective, n_trials=30)

best_f1 = study.best_value
best_params = study.best_params

best_f1, best_params


[I 2025-12-02 01:26:20,224] A new study created in memory with name: no-name-a31eefda-3fad-42b6-a77d-29213d93cfd9
[I 2025-12-02 01:26:20,249] Trial 0 finished with value: 0.5069696562233875 and parameters: {'C': 0.0074593432857265485, 'class_weight': None}. Best is trial 0 with value: 0.5069696562233875.
[I 2025-12-02 01:26:20,271] Trial 1 finished with value: 0.6945041927740889 and parameters: {'C': 0.09846738873614563, 'class_weight': None}. Best is trial 1 with value: 0.6945041927740889.
[I 2025-12-02 01:26:20,299] Trial 2 finished with value: 0.4117647058823529 and parameters: {'C': 0.00019517224641449495, 'class_weight': None}. Best is trial 1 with value: 0.6945041927740889.
[I 2025-12-02 01:26:20,328] Trial 3 finished with value: 0.6567996567996568 and parameters: {'C': 0.3470266988650412, 'class_weight': 'balanced'}. Best is trial 1 with value: 0.6945041927740889.
[I 2025-12-02 01:26:20,378] Trial 4 finished with value: 0.6442154336891179 and parameters: {'C': 1.452824663751602,

(0.6945041927740889, {'C': 0.09846738873614563, 'class_weight': None})

Save HPO results

In [7]:
os.makedirs("../results", exist_ok=True)

hpo_df = pd.DataFrame({
    "dataset": ["credit-g"],
    "model": ["logistic_regression"],
    "hpo_f1_macro": [best_f1],
    "best_params": [best_params]
})

hpo_df.to_csv("../results/lr_creditg_hpo.csv", index=False)

hpo_df


Unnamed: 0,dataset,model,hpo_f1_macro,best_params
0,credit-g,logistic_regression,0.694504,"{'C': 0.09846738873614563, 'class_weight': None}"
