In [None]:
# ============================================================
# Polynomial Logistic Regression with Optuna tuning
# Optimizes validation accuracy (60/20/20 split)
# ============================================================

import numpy as np
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import optuna

# ------------------------------------------------------------
# 1. Create or load your dataset
# ------------------------------------------------------------
# `n` selects which kryptonite-<n> pair of .npy files is read
# (presumably the feature count of that dataset variant -- TODO confirm).
n = 10

# Paths are relative to the current working directory.
X = np.load(f'Datasets/kryptonite-{n}-X.npy')
y = np.load(f'Datasets/kryptonite-{n}-y.npy')

# 60/20/20 split, stratified on the labels at both stages:
#   1) hold out 40% of the data as a temporary pool (60% stays for training);
#   2) cut that pool in half -> 20% validation, 20% test.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, stratify=y, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

print(f"Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")

# ------------------------------------------------------------
# 2. Define the Optuna objective
# ------------------------------------------------------------
def objective(trial):
    """Evaluate one Optuna trial and return its validation accuracy.

    The trial supplies three hyperparameters: the polynomial ``degree``
    (integer in [1, 4]), the inverse regularisation strength ``C``
    (log-uniform in [1e-4, 1e3]) and the ``penalty`` type ("l2" or "l1").
    A Polynomial -> Standardize -> LogisticRegression pipeline is fitted on
    the module-level ``X_train``/``y_train`` and scored on ``X_val``/``y_val``.

    Args:
        trial: the Optuna trial object used to sample hyperparameters.

    Returns:
        Accuracy of the fitted pipeline on the validation split.
    """
    # Search space. The sampling order (degree, C, penalty) is kept fixed.
    poly_degree = trial.suggest_int("degree", 1, 4)
    inv_reg_strength = trial.suggest_float("C", 1e-4, 1e3, log=True)
    reg_kind = trial.suggest_categorical("penalty", ["l2", "l1"])

    # The solver follows the penalty: saga for "l1", lbfgs otherwise.
    if reg_kind == "l1":
        chosen_solver = "saga"
    else:
        chosen_solver = "lbfgs"

    classifier = LogisticRegression(
        penalty=reg_kind,
        C=inv_reg_strength,
        solver=chosen_solver,
        max_iter=5000,
        random_state=42,
    )
    steps = [
        ("poly", PolynomialFeatures(degree=poly_degree, include_bias=False)),
        ("scaler", StandardScaler()),
        ("logreg", classifier),
    ]

    # Fit on the training split only; the validation split is used for scoring.
    fitted = Pipeline(steps).fit(X_train, y_train)
    return accuracy_score(y_val, fitted.predict(X_val))

# ------------------------------------------------------------
# 3. Run Optuna optimization
# ------------------------------------------------------------
# Seed the sampler so repeated runs explore the same sequence of trials;
# without it the search (and therefore the reported best params) changes on
# every execution, even though the splits and models are pinned to seed 42.
# TPESampler is Optuna's default sampler, so only the seeding is new.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=40)

# Report the trial with the highest validation accuracy and its parameters.
best_trial = study.best_trial
print("\nBest Validation Results")
print("------------------------")
print(f"Validation Accuracy: {best_trial.value:.4f}")
for k, v in best_trial.params.items():
    print(f"  {k}: {v}")

# ------------------------------------------------------------
# 4. Retrain with best params on (Train + Val), test on Test Set
# ------------------------------------------------------------
def train_full_and_test(params):
    """Refit the pipeline on train + validation data and score it on the test set.

    Args:
        params: mapping with the keys ``"degree"``, ``"C"`` and ``"penalty"``
            (e.g. the ``params`` of the best Optuna trial).

    Returns:
        Accuracy on the module-level ``X_test``/``y_test`` of a pipeline
        fitted on ``X_train`` + ``X_val`` (80% of the data).
    """
    reg_kind = params["penalty"]

    # The solver follows the penalty: saga for "l1", lbfgs otherwise.
    if reg_kind == "l1":
        chosen_solver = "saga"
    else:
        chosen_solver = "lbfgs"

    classifier = LogisticRegression(
        penalty=reg_kind,
        C=params["C"],
        solver=chosen_solver,
        max_iter=5000,
        random_state=42,
    )
    final_model = Pipeline([
        ("poly", PolynomialFeatures(degree=params["degree"], include_bias=False)),
        ("scaler", StandardScaler()),
        ("logreg", classifier),
    ])

    # Stack the training and validation splits (features row-wise, labels
    # end-to-end) so the final fit sees 80% of the data.
    final_model.fit(
        np.vstack([X_train, X_val]),
        np.concatenate([y_train, y_val]),
    )

    return accuracy_score(y_test, final_model.predict(X_test))

# Single evaluation on the held-out test split, using the best trial's params.
test_acc = train_full_and_test(best_trial.params)
print(f"\nFinal Test Accuracy (using best params): {test_acc:.4f}")
