In [9]:
import warnings
warnings.filterwarnings("ignore")

import optuna
import pandas as pd
from sklearn.datasets import load_breast_cancer, fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score


In [10]:
optuna.__version__


'4.6.0'

In [11]:
def load_dataset(name: str):
    """
    Returns X, y for a given dataset name.
    Supported:
      - 'breast_cancer'
      - 'adult'
      - 'credit-g'
    """
    if name == "breast_cancer":
        data = load_breast_cancer(as_frame=True)
        X = data.data
        y = data.target
        return X, y
    
    elif name == "adult":
        adult = fetch_openml("adult", version=2, as_frame=True)
        X = adult.data
        y = adult.target
        return X, y
    
    elif name == "credit-g":
        credit = fetch_openml("credit-g", version=1, as_frame=True)
        X = credit.data
        y = credit.target
        return X, y
    
    else:
        raise ValueError(f"Unknown dataset name: {name}")


In [12]:
def build_preprocessor(X):
    categorical = X.select_dtypes(include=["object", "category"]).columns
    numeric = X.select_dtypes(include=["int64", "float64", "int32", "float32"]).columns

    preprocess = ColumnTransformer([
        ("num", StandardScaler(), numeric),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical)
    ])

    return preprocess


In [13]:
def run_lr_baseline(X, y, random_state=42):
    """
    Train/test split + baseline Logistic Regression.
    Returns baseline_f1 (macro).
    """
    preprocess = build_preprocessor(X)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,
        random_state=random_state,
        stratify=y
    )

    model = Pipeline([
        ("pre", preprocess),
        ("clf", LogisticRegression(max_iter=200))
    ])

    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    f1 = f1_score(y_test, preds, average="macro")
    return f1, preprocess, X_train, X_test, y_train, y_test


def run_lr_hpo(preprocess, X_train, X_test, y_train, y_test, n_trials=30, random_state=42):
    """
    HPO on Logistic Regression using Optuna.
    Uses same train/test split and same preprocessor.
    Returns best_f1, best_params, study.
    """
    def objective(trial):
        C = trial.suggest_float("C", 1e-4, 10.0, log=True)
        class_weight = trial.suggest_categorical("class_weight", [None, "balanced"])

        model = Pipeline([
            ("pre", preprocess),
            ("clf", LogisticRegression(
                max_iter=300,
                C=C,
                penalty="l2",
                class_weight=class_weight
            ))
        ])

        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        return f1_score(y_test, preds, average="macro")

    sampler = optuna.samplers.TPESampler(seed=random_state)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(objective, n_trials=n_trials, show_progress_bar=False)

    best_f1 = study.best_value
    best_params = study.best_params

    return best_f1, best_params, study


In [14]:
dataset_names = ["breast_cancer", "adult", "credit-g"]

all_results = []

for name in dataset_names:
    print(f"\n=== DATASET: {name} ===")
    
    # 1. Load data
    X, y = load_dataset(name)
    
    # 2. Baseline LR
    baseline_f1, preprocess, X_train, X_test, y_train, y_test = run_lr_baseline(X, y)
    print(f"Baseline LR F1_macro: {baseline_f1:.4f}")
    
    # 3. HPO LR
    hpo_f1, best_params, study = run_lr_hpo(
        preprocess, X_train, X_test, y_train, y_test,
        n_trials=30
    )
    print(f"HPO LR F1_macro: {hpo_f1:.4f}")
    print(f"Improvement: {hpo_f1 - baseline_f1:.4f}")
    print(f"Best params: {best_params}")
    
    # 4. Store
    all_results.append({
        "dataset": name,
        "model": "logistic_regression",
        "baseline_f1_macro": baseline_f1,
        "hpo_f1_macro": hpo_f1,
        "improvement": hpo_f1 - baseline_f1,
        "best_params": best_params
    })

results_df = pd.DataFrame(all_results)
results_df



=== DATASET: breast_cancer ===


[I 2025-12-02 01:06:22,075] A new study created in memory with name: no-name-d23a1f30-382a-41c4-bc50-ca2be9900db6
[I 2025-12-02 01:06:22,099] Trial 0 finished with value: 0.9422297297297297 and parameters: {'C': 0.0074593432857265485, 'class_weight': None}. Best is trial 0 with value: 0.9422297297297297.
[I 2025-12-02 01:06:22,109] Trial 1 finished with value: 0.9715828832571666 and parameters: {'C': 0.09846738873614563, 'class_weight': None}. Best is trial 1 with value: 0.9715828832571666.
[I 2025-12-02 01:06:22,118] Trial 2 finished with value: 0.5832502492522433 and parameters: {'C': 0.00019517224641449495, 'class_weight': None}. Best is trial 1 with value: 0.9715828832571666.
[I 2025-12-02 01:06:22,129] Trial 3 finished with value: 0.9535338713621913 and parameters: {'C': 0.3470266988650412, 'class_weight': 'balanced'}. Best is trial 1 with value: 0.9715828832571666.
[I 2025-12-02 01:06:22,143] Trial 4 finished with value: 0.9811507936507937 and parameters: {'C': 1.452824663751602,

Baseline LR F1_macro: 0.9812


[I 2025-12-02 01:06:22,270] Trial 14 finished with value: 0.9811507936507937 and parameters: {'C': 1.0162435907226077, 'class_weight': None}. Best is trial 4 with value: 0.9811507936507937.
[I 2025-12-02 01:06:22,284] Trial 15 finished with value: 0.9811507936507937 and parameters: {'C': 1.970755824338177, 'class_weight': None}. Best is trial 4 with value: 0.9811507936507937.
[I 2025-12-02 01:06:22,296] Trial 16 finished with value: 0.9811507936507937 and parameters: {'C': 0.22894626561527406, 'class_weight': None}. Best is trial 4 with value: 0.9811507936507937.
[I 2025-12-02 01:06:22,308] Trial 17 finished with value: 0.9811507936507937 and parameters: {'C': 2.8672039737492496, 'class_weight': None}. Best is trial 4 with value: 0.9811507936507937.
[I 2025-12-02 01:06:22,318] Trial 18 finished with value: 0.9535338713621913 and parameters: {'C': 0.28148296482457225, 'class_weight': 'balanced'}. Best is trial 4 with value: 0.9811507936507937.
[I 2025-12-02 01:06:22,332] Trial 19 finish

HPO LR F1_macro: 0.9812
Improvement: 0.0000
Best params: {'C': 1.452824663751602, 'class_weight': None}

=== DATASET: adult ===


[I 2025-12-02 01:06:22,965] A new study created in memory with name: no-name-cb051469-69ca-4c7e-8925-fc6a3a63e3c3
[I 2025-12-02 01:06:23,160] Trial 0 finished with value: 0.7763003658151166 and parameters: {'C': 0.0074593432857265485, 'class_weight': None}. Best is trial 0 with value: 0.7763003658151166.


Baseline LR F1_macro: 0.7851


[I 2025-12-02 01:06:23,435] Trial 1 finished with value: 0.7838620206108089 and parameters: {'C': 0.09846738873614563, 'class_weight': None}. Best is trial 1 with value: 0.7838620206108089.
[I 2025-12-02 01:06:23,600] Trial 2 finished with value: 0.640235246472991 and parameters: {'C': 0.00019517224641449495, 'class_weight': None}. Best is trial 1 with value: 0.7838620206108089.
[I 2025-12-02 01:06:23,856] Trial 3 finished with value: 0.7712696774574349 and parameters: {'C': 0.3470266988650412, 'class_weight': 'balanced'}. Best is trial 1 with value: 0.7838620206108089.
[I 2025-12-02 01:06:24,153] Trial 4 finished with value: 0.7849459892116938 and parameters: {'C': 1.452824663751602, 'class_weight': None}. Best is trial 4 with value: 0.7849459892116938.
[I 2025-12-02 01:06:24,345] Trial 5 finished with value: 0.7621317143932771 and parameters: {'C': 0.0008260808399079611, 'class_weight': 'balanced'}. Best is trial 4 with value: 0.7849459892116938.
[I 2025-12-02 01:06:24,570] Trial 6 f

HPO LR F1_macro: 0.7854
Improvement: 0.0003
Best params: {'C': 3.036799422352348, 'class_weight': None}

=== DATASET: credit-g ===


[I 2025-12-02 01:06:32,640] A new study created in memory with name: no-name-00f8b6d7-069f-4cd0-bf41-5363a212db32
[I 2025-12-02 01:06:32,660] Trial 0 finished with value: 0.5069696562233875 and parameters: {'C': 0.0074593432857265485, 'class_weight': None}. Best is trial 0 with value: 0.5069696562233875.
[I 2025-12-02 01:06:32,683] Trial 1 finished with value: 0.6945041927740889 and parameters: {'C': 0.09846738873614563, 'class_weight': None}. Best is trial 1 with value: 0.6945041927740889.
[I 2025-12-02 01:06:32,701] Trial 2 finished with value: 0.4117647058823529 and parameters: {'C': 0.00019517224641449495, 'class_weight': None}. Best is trial 1 with value: 0.6945041927740889.
[I 2025-12-02 01:06:32,723] Trial 3 finished with value: 0.6567996567996568 and parameters: {'C': 0.3470266988650412, 'class_weight': 'balanced'}. Best is trial 1 with value: 0.6945041927740889.
[I 2025-12-02 01:06:32,751] Trial 4 finished with value: 0.6442154336891179 and parameters: {'C': 1.452824663751602,

Baseline LR F1_macro: 0.6362


[I 2025-12-02 01:06:32,828] Trial 8 finished with value: 0.5667211432480712 and parameters: {'C': 0.019069966103000432, 'class_weight': None}. Best is trial 1 with value: 0.6945041927740889.
[I 2025-12-02 01:06:32,848] Trial 9 finished with value: 0.6836158192090396 and parameters: {'C': 0.03725393839578886, 'class_weight': None}. Best is trial 1 with value: 0.6945041927740889.
[I 2025-12-02 01:06:32,870] Trial 10 finished with value: 0.6442154336891179 and parameters: {'C': 5.86072399830607, 'class_weight': None}. Best is trial 1 with value: 0.6945041927740889.
[I 2025-12-02 01:06:32,895] Trial 11 finished with value: 0.6484844747309673 and parameters: {'C': 0.30240503427727145, 'class_weight': None}. Best is trial 1 with value: 0.6945041927740889.
[I 2025-12-02 01:06:32,919] Trial 12 finished with value: 0.6945041927740889 and parameters: {'C': 0.10428462672011099, 'class_weight': None}. Best is trial 1 with value: 0.6945041927740889.
[I 2025-12-02 01:06:32,940] Trial 13 finished wit

HPO LR F1_macro: 0.6945
Improvement: 0.0584
Best params: {'C': 0.09846738873614563, 'class_weight': None}


Unnamed: 0,dataset,model,baseline_f1_macro,hpo_f1_macro,improvement,best_params
0,breast_cancer,logistic_regression,0.981151,0.981151,0.0,"{'C': 1.452824663751602, 'class_weight': None}"
1,adult,logistic_regression,0.78506,0.785395,0.000335,"{'C': 3.036799422352348, 'class_weight': None}"
2,credit-g,logistic_regression,0.636151,0.694504,0.058354,"{'C': 0.09846738873614563, 'class_weight': None}"


In [15]:
import os

os.makedirs("../results", exist_ok=True)

results_path = "../results/lr_optionB_multi_datasets.csv"
results_df.to_csv(results_path, index=False)
results_path

'../results/lr_optionB_multi_datasets.csv'