In [6]:
# Core utilities
import numpy as np
from dataclasses import dataclass
from typing import Dict, Any, List, Tuple

# Dataset
from sklearn.datasets import load_iris

# Model selection & evaluation
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score

# Preprocessing & DR
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Classifiers
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier


In [7]:
@dataclass
class ModelResult:
    """Aggregated nested cross-validation scores for one model in one phase."""

    # Display name of the model (the key used in the models dict, e.g. "SVM").
    name: str
    # Human-readable experiment phase label (baseline vs. LDA run).
    phase: str
    # Mean accuracy over the outer CV folds.
    acc_mean: float
    # Standard deviation of the per-fold accuracy over the outer CV folds.
    acc_std: float
    # Mean macro-averaged F1 over the outer CV folds.
    f1_mean: float
    # Standard deviation of the per-fold macro-F1 over the outer CV folds.
    f1_std: float
    # One best-parameter dict per outer fold, in fold order.
    best_params_each_fold: List[Dict[str, Any]]


In [8]:
def make_models_and_grids(random_state: int = 42):
    """Assemble the candidate classifiers and their hyperparameter grids.

    Returns a dict mapping a display name to a 3-tuple
    ``(estimator, param_grid, needs_scaling)``:

    * ``estimator``     -- an unfitted scikit-learn classifier,
    * ``param_grid``    -- grid whose keys carry the ``clf__`` prefix, i.e.
                           they address a pipeline step named ``"clf"``,
    * ``needs_scaling`` -- ``True`` for the SVM, k-NN and MLP entries,
                           ``False`` for the two tree-based entries.

    ``random_state`` is forwarded to the estimators constructed with one here
    (decision tree, random forest, MLP).
    """
    svm_grid = {
        "clf__C": [0.1, 1, 10, 100],
        "clf__gamma": ["scale", "auto"],
        "clf__kernel": ["linear", "rbf"],
    }
    knn_grid = {
        "clf__n_neighbors": [3, 5, 7, 9, 11],
        "clf__weights": ["uniform", "distance"],
        "clf__p": [1, 2],
    }
    tree_grid = {
        "clf__max_depth": [None, 2, 3, 4, 5, 8],
        "clf__min_samples_split": [2, 5, 10],
        "clf__criterion": ["gini", "entropy"],
    }
    forest_grid = {
        "clf__n_estimators": [100, 300],
        "clf__max_depth": [None, 3, 5, 8],
        "clf__min_samples_split": [2, 5],
    }
    mlp_grid = {
        "clf__hidden_layer_sizes": [(50,), (100,), (50, 50)],
        "clf__alpha": [1e-4, 1e-3, 1e-2],
    }

    # Insertion order is preserved and determines the order of the results.
    return {
        "SVM": (SVC(), svm_grid, True),
        "kNN": (KNeighborsClassifier(), knn_grid, True),
        "DecisionTree": (
            DecisionTreeClassifier(random_state=random_state),
            tree_grid,
            False,
        ),
        "RandomForest": (
            RandomForestClassifier(random_state=random_state),
            forest_grid,
            False,
        ),
        "MLP": (
            MLPClassifier(random_state=random_state, max_iter=400),
            mlp_grid,
            True,
        ),
    }


In [9]:
def build_pipeline(classifier, needs_scaling, use_lda, n_components):
    """Wrap ``classifier`` in a Pipeline with optional preprocessing steps.

    Step order is: ``"scaler"`` (``StandardScaler``, only if ``needs_scaling``
    is truthy), then ``"lda"`` (``LinearDiscriminantAnalysis`` with
    ``n_components``, only if ``use_lda`` is truthy), then ``"clf"`` (the
    classifier itself, always present and always last).
    """
    scaling_steps = [("scaler", StandardScaler())] if needs_scaling else []
    lda_steps = (
        [("lda", LinearDiscriminantAnalysis(n_components=n_components))]
        if use_lda
        else []
    )
    return Pipeline([*scaling_steps, *lda_steps, ("clf", classifier)])


In [10]:
def evaluate_nested_cv(
    X,
    y,
    use_lda,
    random_state=42,
    n_outer_splits=10,
    n_inner_splits=5,
    n_jobs=-1,
):
    """Run nested, stratified cross-validation for every candidate model.

    For each entry returned by ``make_models_and_grids`` an outer
    ``StratifiedKFold`` supplies train/test splits. On every outer training
    split a ``GridSearchCV`` (inner ``StratifiedKFold``, ``f1_macro`` scoring)
    tunes the pipeline built by ``build_pipeline``; the tuned search object
    then predicts the held-out outer test split, which is scored with
    accuracy and macro-F1.

    Parameters
    ----------
    X : array-like, 2-D (samples x features)
        Feature matrix. Converted with ``np.asarray`` so positional fold
        indices work for lists and pandas objects as well as ndarrays.
    y : array-like, 1-D
        Class labels, converted with ``np.asarray`` for the same reason.
    use_lda : bool
        Whether to insert an LDA projection step before the classifier.
    random_state : int, default 42
        Seed for both CV splitters and for the seeded estimators.
    n_outer_splits : int, default 10
        Number of outer (evaluation) folds.
    n_inner_splits : int, default 5
        Number of inner (hyperparameter search) folds.
    n_jobs : int or None, default -1
        Forwarded to ``GridSearchCV``.

    Returns
    -------
    list of ModelResult
        One entry per model, in the order of the models dict, holding the
        mean and sample standard deviation (``ddof=1``) of the outer-fold
        scores plus the best parameters found in each outer fold.
    """
    # X[train_idx] / y[train_idx] below rely on ndarray positional indexing;
    # a DataFrame would treat the index array as column labels instead.
    X = np.asarray(X)
    y = np.asarray(y)

    # Cap the LDA projection at min(n_classes - 1, n_features) components.
    n_classes = len(np.unique(y))
    n_features = X.shape[1]
    lda_components = min(n_classes - 1, n_features)

    models = make_models_and_grids(random_state)
    outer_cv = StratifiedKFold(
        n_splits=n_outer_splits, shuffle=True, random_state=random_state
    )
    inner_cv = StratifiedKFold(
        n_splits=n_inner_splits, shuffle=True, random_state=random_state
    )

    phase_name = "Phase 2 (LDA)" if use_lda else "Phase 1 (Baseline)"
    results = []

    for model_name, (clf, grid, needs_scaling) in models.items():
        acc_scores, f1_scores, best_params = [], [], []

        for train_idx, test_idx in outer_cv.split(X, y):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            pipe = build_pipeline(
                classifier=clf,
                needs_scaling=needs_scaling,
                use_lda=use_lda,
                n_components=lda_components,
            )

            # Scaler/LDA live inside the pipeline, so they are re-fitted on
            # each inner training split and never see held-out data.
            search = GridSearchCV(
                pipe,
                grid,
                scoring="f1_macro",
                cv=inner_cv,
                n_jobs=n_jobs,
            )

            search.fit(X_train, y_train)
            y_pred = search.predict(X_test)

            acc_scores.append(accuracy_score(y_test, y_pred))
            f1_scores.append(f1_score(y_test, y_pred, average="macro"))
            best_params.append(search.best_params_)

        # Cast to plain floats so the stored values match the ModelResult
        # field annotations instead of being numpy scalars.
        results.append(
            ModelResult(
                model_name,
                phase_name,
                float(np.mean(acc_scores)),
                float(np.std(acc_scores, ddof=1)),
                float(np.mean(f1_scores)),
                float(np.std(f1_scores, ddof=1)),
                best_params,
            )
        )

    return results


In [11]:
def print_results(results):
    """Print a fixed-width table of per-model accuracy and macro-F1.

    ``results`` is an iterable of objects exposing ``name``, ``phase``,
    ``acc_mean``, ``acc_std``, ``f1_mean`` and ``f1_std``. One header line,
    one rule line and one line per result are written to stdout. An empty
    iterable prints only the header and the rule.
    """
    header = (
        f"{'Model':<14} {'Phase':<18} "
        f"{'Accuracy (mean±std)':<22} {'Macro-F1 (mean±std)'}"
    )
    print(header)
    # Size the rule from the header so the two cannot drift apart.
    print("-" * len(header))
    for r in results:
        # Render each "mean ± std" cell first, then pad it to the same width
        # as its header column so values line up under their titles.
        acc_cell = f"{r.acc_mean:.4f} ± {r.acc_std:.4f}"
        f1_cell = f"{r.f1_mean:.4f} ± {r.f1_std:.4f}"
        print(f"{r.name:<14} {r.phase:<18} {acc_cell:<22} {f1_cell}")


In [12]:
# NOTE(review): this cell re-defines print_results, which the previous cell
# already defines; the later definition is the one in effect. Keep only one.
def print_results(results):
    """Print a fixed-width table of per-model accuracy and macro-F1.

    ``results`` is an iterable of objects exposing ``name``, ``phase``,
    ``acc_mean``, ``acc_std``, ``f1_mean`` and ``f1_std``. One header line,
    one rule line and one line per result are written to stdout. An empty
    iterable prints only the header and the rule.
    """
    header = (
        f"{'Model':<14} {'Phase':<18} "
        f"{'Accuracy (mean±std)':<22} {'Macro-F1 (mean±std)'}"
    )
    print(header)
    # Size the rule from the header so the two cannot drift apart.
    print("-" * len(header))
    for r in results:
        # Render each "mean ± std" cell first, then pad it to the same width
        # as its header column so values line up under their titles.
        acc_cell = f"{r.acc_mean:.4f} ± {r.acc_std:.4f}"
        f1_cell = f"{r.f1_mean:.4f} ± {r.f1_std:.4f}"
        print(f"{r.name:<14} {r.phase:<18} {acc_cell:<22} {f1_cell}")


In [13]:
# Load the Iris dataset and split it into features (X) and labels (y).
iris = load_iris()
X, y = iris.data, iris.target

# Quick sanity check of what was loaded.
print("Dataset:", iris.target_names)
print("Samples:", len(X), "| Features:", X.shape[1])


Dataset: ['setosa' 'versicolor' 'virginica']
Samples: 150 | Features: 4


In [14]:
# Phase 1: nested CV on the original features, without an LDA step.
baseline_results = evaluate_nested_cv(X=X, y=y, use_lda=False)

print("=== Phase 1: Baseline (No Dimensionality Reduction) ===")
print_results(results=baseline_results)




=== Phase 1: Baseline (No Dimensionality Reduction) ===
Model          Phase              Accuracy (mean±std)    Macro-F1 (mean±std)
---------------------------------------------------------------------------
SVM            Phase 1 (Baseline) 0.9600 ± 0.0344     0.9596 ± 0.0348
kNN            Phase 1 (Baseline) 0.9600 ± 0.0466     0.9592 ± 0.0480
DecisionTree   Phase 1 (Baseline) 0.9333 ± 0.0629     0.9315 ± 0.0648
RandomForest   Phase 1 (Baseline) 0.9533 ± 0.0549     0.9526 ± 0.0560
MLP            Phase 1 (Baseline) 0.9400 ± 0.0734     0.9393 ± 0.0735




In [15]:
# Phase 2: same nested CV, with an LDA projection inserted in each pipeline.
lda_results = evaluate_nested_cv(X=X, y=y, use_lda=True)

print("=== Phase 2: LDA Dimensionality Reduction ===")
print_results(results=lda_results)




=== Phase 2: LDA Dimensionality Reduction ===
Model          Phase              Accuracy (mean±std)    Macro-F1 (mean±std)
---------------------------------------------------------------------------
SVM            Phase 2 (LDA)      0.9733 ± 0.0466     0.9732 ± 0.0467
kNN            Phase 2 (LDA)      0.9733 ± 0.0466     0.9732 ± 0.0467
DecisionTree   Phase 2 (LDA)      0.9600 ± 0.0562     0.9588 ± 0.0583
RandomForest   Phase 2 (LDA)      0.9733 ± 0.0562     0.9728 ± 0.0574
MLP            Phase 2 (LDA)      0.9600 ± 0.0562     0.9593 ± 0.0573




In [16]:
# Pick the SVM entry out of each phase's result list.
svm_baseline = next(
    result for result in baseline_results if result.name == "SVM"
)
svm_lda = next(
    result for result in lda_results if result.name == "SVM"
)

# Show the hyperparameters selected in the first outer fold of each phase.
print("Example best hyperparameters (first fold):")
print("SVM Baseline:", svm_baseline.best_params_each_fold[0])
print("SVM + LDA   :", svm_lda.best_params_each_fold[0])


Example best hyperparameters (first fold):
SVM Baseline: {'clf__C': 100, 'clf__gamma': 'scale', 'clf__kernel': 'linear'}
SVM + LDA   : {'clf__C': 0.1, 'clf__gamma': 'scale', 'clf__kernel': 'linear'}
