<h1>CELL 1 — Install (run once if needed)</h1>

In [43]:
# If you see "ModuleNotFoundError: No module named 'numpy' or 'sklearn'",
# run this cell, then restart the kernel/runtime.

!pip install -q numpy scikit-learn


zsh:1: /Users/classroomservices/Desktop/Winter/Machine Learning/Project/Code/venv/bin/pip: bad interpreter: /Users/classroomservices/Desktop/Winter/Code/venv/bin/python3.14: no such file or directory


<h1>🧩 CELL 2 — Imports & Global Settings</h1>

In [44]:
# =========================
# Phase 1 (Baseline) Imports
# =========================

import numpy as np
from dataclasses import dataclass
from typing import Dict, Any, List, Tuple

# Dataset (example: Iris). Replace later with your instructor datasets.
from sklearn.datasets import load_iris

# Model selection (CV + tuning)
from sklearn.model_selection import StratifiedKFold, GridSearchCV

# Pipeline prevents leakage: preprocessing is fit only on training folds
from sklearn.pipeline import Pipeline

# Preprocessing
from sklearn.preprocessing import StandardScaler

# Metrics required by the project
from sklearn.metrics import accuracy_score, f1_score

# 5 required classifiers
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# Single seed for the whole notebook: it is passed to the nested-CV runner,
# which forwards it to the fold splitters and the seeded classifiers.
RANDOM_STATE = 42


<h1>🧩 CELL 3 — Result Container (Clean Output)</h1>

In [45]:
# ===================================
# Data structure to store model output
# ===================================

@dataclass
class Phase1Result:
    """Summary of one classifier's nested cross-validation run.

    The mean/std fields aggregate the per-outer-fold scores; the raw
    per-fold scores themselves are not stored.
    """
    classifier_name: str  # key of the classifier in the models dictionary (e.g. "SVM")
    acc_mean: float  # mean accuracy over the outer test folds
    acc_std: float  # sample standard deviation (ddof=1) of the outer-fold accuracies
    f1_mean: float  # mean macro-F1 over the outer test folds
    f1_std: float  # sample standard deviation (ddof=1) of the outer-fold macro-F1 scores
    best_params_each_fold: List[Dict[str, Any]]  # one best-params dict per outer fold, in fold order


<h1>🧩 CELL 4 — Define Classifiers + Hyperparameter Grids (Grid Search)</h1>

In [46]:
# ============================================================
# Phase 1: Classifiers + Hyperparameter search space (Grid)
# ============================================================
# IMPORTANT:
# - We tune ONLY classifier hyperparameters in Phase 1.
# - No DR/FS methods are used here.
# - Grids are small enough to run quickly on Iris, but valid.
# - You can expand the grids later for stronger experiments.

def get_models_and_grids(random_state: int = 42) -> Dict[str, Tuple[Any, Dict[str, List[Any]], bool]]:
    """
    Build the Phase 1 search space for the five required classifiers.

    Args:
        random_state: seed handed to the estimators that accept one here
            (decision tree, random forest, MLP).

    Returns:
        Dictionary mapping classifier name -> (estimator, param_grid, needs_scaling).
        Every grid key is prefixed with "clf__" so it targets the pipeline
        step named "clf". needs_scaling is True for SVM, kNN and MLP and
        False for the two tree-based models.
    """

    # gamma only matters for the rbf kernel; it is harmless when kernel=linear
    svm_grid = {
        "clf__kernel": ["linear", "rbf"],
        "clf__C": [0.1, 1, 10, 100],
        "clf__gamma": ["scale", "auto"],
    }

    # p selects the Minkowski metric: 1 = Manhattan, 2 = Euclidean
    knn_grid = {
        "clf__n_neighbors": [3, 5, 7, 9, 11],
        "clf__weights": ["uniform", "distance"],
        "clf__p": [1, 2],
    }

    tree_grid = {
        "clf__criterion": ["gini", "entropy"],
        "clf__max_depth": [None, 2, 3, 4, 5, 8, 10],
        "clf__min_samples_split": [2, 5, 10],
        "clf__min_samples_leaf": [1, 2, 4],
    }

    forest_grid = {
        "clf__n_estimators": [100, 300],
        "clf__max_depth": [None, 3, 5, 8, 12],
        "clf__min_samples_split": [2, 5],
        "clf__min_samples_leaf": [1, 2],
        "clf__max_features": ["sqrt", "log2", None],
    }

    mlp_grid = {
        "clf__hidden_layer_sizes": [(50,), (100,), (50, 50)],
        "clf__alpha": [1e-4, 1e-3, 1e-2],
        "clf__learning_rate_init": [1e-3, 1e-2],
    }

    # Insertion order fixes the order in which classifiers are evaluated/reported.
    return {
        "SVM": (SVC(), svm_grid, True),
        "kNN": (KNeighborsClassifier(), knn_grid, True),
        "DecisionTree": (
            DecisionTreeClassifier(random_state=random_state),
            tree_grid,
            False,
        ),
        "RandomForest": (
            RandomForestClassifier(random_state=random_state),
            forest_grid,
            False,
        ),
        "MLP": (
            MLPClassifier(random_state=random_state, max_iter=700),
            mlp_grid,
            True,
        ),
    }


<h1>🧩 CELL 5 — Build Leakage-Safe Pipeline (Baseline)</h1>

In [47]:
# ==========================================
# Pipeline builder (Baseline, Phase 1 only)
# ==========================================
# KEY POINT FOR LEAKAGE PREVENTION:
# - StandardScaler is inside the pipeline.
# - During CV, scaler is fit ONLY on training fold, not the test fold.
# - This avoids using any information from test data.

def build_baseline_pipeline(classifier: Any, needs_scaling: bool) -> Pipeline:
    """
    Wrap a classifier in a Pipeline whose final step is named "clf".

    Args:
        classifier: estimator to place in the "clf" step.
        needs_scaling: when True, a StandardScaler step named "scaler"
            is placed in front of the classifier.

    Returns:
        Pipeline of ["scaler", "clf"] or just ["clf"].
    """
    # Keeping the scaler inside the pipeline means it is re-fit on whatever
    # data the pipeline itself is fit on, never on held-out data.
    preprocessing = [("scaler", StandardScaler())] if needs_scaling else []
    return Pipeline(preprocessing + [("clf", classifier)])


<h1>🧩 CELL 6 — Nested CV: 10-Fold Evaluation + Inner GridSearch (No Leakage)</h1>

In [48]:
# ==========================================================
# Phase 1 Core: Nested CV (Outer 10-fold + Inner tuning)
# ==========================================================
# WHY NESTED CV?
# - Project requires hyperparameter tuning without leakage.
# - If you tune using the whole dataset then cross-validate,
#   you leak test-fold information into tuning.
#
# Our approach:
# Outer loop (10-fold): estimates generalization performance
# Inner loop (5-fold): selects best hyperparameters on training fold only
#
# This is the standard leakage-safe experimental design.

def run_phase1_baseline_nested_cv(
    X: np.ndarray,
    y: np.ndarray,
    random_state: int = 42,
    outer_splits: int = 10,
    inner_splits: int = 5,
    tuning_scoring: str = "f1_macro",   # tune using macro-F1 (multi-class friendly)
) -> List[Phase1Result]:
    """
    Run nested cross-validation for every classifier from get_models_and_grids.

    The outer stratified split estimates performance; for each outer
    training fold an inner stratified grid search picks hyperparameters,
    is refit on that training fold, and is then scored once on the outer
    test fold.

    Args:
        X: feature matrix, indexed by row with integer index arrays.
        y: label vector aligned with X.
        random_state: seed for both fold splitters and the seeded classifiers.
        outer_splits: number of outer (evaluation) folds.
        inner_splits: number of inner (tuning) folds.
        tuning_scoring: scorer name used by the inner grid search.

    Returns:
        One Phase1Result per classifier, in the order the models are defined,
        holding mean and sample std (ddof=1) of accuracy and macro-F1 over
        the outer folds plus the best params chosen in each outer fold.
    """

    candidates = get_models_and_grids(random_state=random_state)

    evaluation_folds = StratifiedKFold(n_splits=outer_splits, shuffle=True, random_state=random_state)
    tuning_folds = StratifiedKFold(n_splits=inner_splits, shuffle=True, random_state=random_state)

    def tune_and_score(estimator, param_grid, scale, fit_rows, eval_rows):
        """Tune on one outer training fold; return (accuracy, macro-F1, best params)."""
        X_fit, X_eval = X[fit_rows], X[eval_rows]
        y_fit, y_eval = y[fit_rows], y[eval_rows]

        # Inner loop: the grid search only ever sees the outer training rows.
        tuner = GridSearchCV(
            estimator=build_baseline_pipeline(estimator, scale),
            param_grid=param_grid,
            scoring=tuning_scoring,
            cv=tuning_folds,
            n_jobs=-1,
            refit=True,  # best configuration is retrained on the whole training fold
        )
        tuner.fit(X_fit, y_fit)

        # Outer test rows are touched only here, after tuning has finished.
        predicted = tuner.predict(X_eval)
        return (
            accuracy_score(y_eval, predicted),
            f1_score(y_eval, predicted, average="macro"),
            tuner.best_params_,
        )

    summaries: List[Phase1Result] = []

    for model_name, (estimator, param_grid, scale) in candidates.items():
        fold_outcomes = [
            tune_and_score(estimator, param_grid, scale, fit_rows, eval_rows)
            for fit_rows, eval_rows in evaluation_folds.split(X, y)
        ]
        accuracies, macro_f1s, chosen_params = zip(*fold_outcomes)

        summaries.append(
            Phase1Result(
                classifier_name=model_name,
                acc_mean=float(np.mean(accuracies)),
                acc_std=float(np.std(accuracies, ddof=1)),
                f1_mean=float(np.mean(macro_f1s)),
                f1_std=float(np.std(macro_f1s, ddof=1)),
                best_params_each_fold=list(chosen_params),
            )
        )

    return summaries


<h1>🧩 CELL 7 — Reporting: Results Table (Mean ± Std)</h1>

In [49]:
# ============================
# Print results: mean ± std
# ============================

def print_results_table(results: "List[Phase1Result]") -> None:
    """
    Print a fixed-width table with one row per classifier.

    Columns are the classifier name (14 wide), accuracy and macro-F1
    (24 wide each); both score columns are rendered as "mean ± std"
    with four decimals. A header line and a dashed rule of the same
    length are printed first.

    Args:
        results: objects exposing classifier_name, acc_mean, acc_std,
            f1_mean and f1_std (e.g. Phase1Result instances).
    """
    # The plus-minus sign is written as an escape so it cannot be corrupted by
    # a wrong file encoding (it previously printed as a two-character garble).
    # It lives in a variable because backslashes are not allowed inside
    # f-string replacement fields before Python 3.12.
    plus_minus = "\u00b1"

    acc_title = f"Accuracy (mean{plus_minus}std)"
    f1_title = f"Macro-F1 (mean{plus_minus}std)"
    header = f"{'Classifier':<14} {acc_title:<24} {f1_title:<24}"
    print(header)
    print("-" * len(header))

    for r in results:
        acc_text = f"{r.acc_mean:.4f} {plus_minus} {r.acc_std:.4f}"
        f1_text = f"{r.f1_mean:.4f} {plus_minus} {r.f1_std:.4f}"
        print(f"{r.classifier_name:<14} {acc_text:<24} {f1_text:<24}")


<h1>🧩 CELL 8 — Reporting: Best Hyperparameters (Frequency Summary)</h1>

In [50]:
# ==========================================
# Best hyperparameters: frequency over folds
# ==========================================
# The project asks to report the best hyperparameters.
# Since we do nested CV, each outer fold has its own best params.
# We'll summarize how often each parameter set is selected.

from collections import Counter

def summarize_best_params(results: List[Phase1Result], top_k: int = 3) -> None:
    """
    Print, per classifier, the hyperparameter sets chosen most often.

    Args:
        results: nested-CV results; each carries one best-params dict
            per outer fold.
        top_k: maximum number of distinct parameter sets listed per
            classifier, most frequent first.
    """
    print("\nBest hyperparameters (frequency across outer folds):")

    for result in results:
        # Dicts are not hashable, so each one is frozen into a key-sorted
        # tuple of (name, value) pairs before being counted.
        tally = Counter(
            tuple(sorted(fold_params.items()))
            for fold_params in result.best_params_each_fold
        )

        print(f"\n{result.classifier_name}:")
        for frozen_params, n_folds in tally.most_common(top_k):
            print(f"  {n_folds:>2} folds -> {dict(frozen_params)}")


<h1>🧩 CELL 9 — Load Dataset (Iris Example)</h1>

In [51]:
# ---------------------------------------------------------------
# Example data: the Iris set bundled with scikit-learn.
# Swap this cell out once the instructor datasets are available.
# ---------------------------------------------------------------

iris = load_iris()
X, y = iris.data, iris.target

# Quick sanity report: feature-matrix shape, number of distinct labels, label names.
print("Dataset: Iris")
print("Shape X:", X.shape, "| Classes:", np.unique(y).size)
print("Class names:", iris.target_names)


Dataset: Iris
Shape X: (150, 4) | Classes: 3
Class names: ['setosa' 'versicolor' 'virginica']


<h1>🧩 CELL 10 — Run Phase 1 Baseline + Print Outputs</h1>

In [52]:
# ---------------------------------------------------------
# Execute the Phase 1 baseline on Iris, then report on it:
# first the mean/std score table, then the most frequently
# selected hyperparameter sets per classifier.
# ---------------------------------------------------------

phase1_results = run_phase1_baseline_nested_cv(
    X,
    y,
    random_state=RANDOM_STATE,
    outer_splits=10,
    inner_splits=5,
    tuning_scoring="f1_macro",
)

print("\n=== Phase 1 Results (Baseline: No DR/FS) ===")
print_results_table(phase1_results)

summarize_best_params(phase1_results, top_k=3)



=== Phase 1 Results (Baseline: No DR/FS) ===
Classifier     Accuracy (meanÂ±std)      Macro-F1 (meanÂ±std)     
----------------------------------------------------------------
SVM            0.9600 Â± 0.0344          0.9596 Â± 0.0348         
kNN            0.9600 Â± 0.0466          0.9592 Â± 0.0480         
DecisionTree   0.9400 Â± 0.0584          0.9387 Â± 0.0600         
RandomForest   0.9533 Â± 0.0549          0.9526 Â± 0.0560         
MLP            0.9400 Â± 0.0734          0.9393 Â± 0.0735         

Best hyperparameters (frequency across outer folds):

SVM:
   4 folds -> {'clf__C': 1, 'clf__gamma': 'scale', 'clf__kernel': 'linear'}
   2 folds -> {'clf__C': 100, 'clf__gamma': 'scale', 'clf__kernel': 'linear'}
   2 folds -> {'clf__C': 1, 'clf__gamma': 'scale', 'clf__kernel': 'rbf'}

kNN:
   3 folds -> {'clf__n_neighbors': 9, 'clf__p': 2, 'clf__weights': 'distance'}
   2 folds -> {'clf__n_neighbors': 7, 'clf__p': 2, 'clf__weights': 'uniform'}
   1 folds -> {'clf__n_neighbors': 11