In [None]:
!pip install pandas numpy scikit-learn openpyxl


In [1]:
import os, time
import numpy as np
import pandas as pd
import openpyxl

from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score

# LDA (Phase 2 only)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# 5 classifiers
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier


In [None]:
# -------------------------
# Paths
# -------------------------
# Every path is relative to the notebook's working directory, so the notebook
# does not depend on one user's home folder. The Excel template is expected
# inside DATA_ROOT, next to the per-dataset folders.
DATA_ROOT = "Datasets"
RESULTS_XLSX = os.path.join(DATA_ROOT, "Results.xlsx")   # your template file
LABEL_COL = "Label"      # name of the target column in train.csv / test.csv
RANDOM_STATE = 42        # shared seed for CV splits, searches and seeded models

# -------------------------
# CV (Outer must be 10)
# -------------------------
outer_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=RANDOM_STATE)

# Inner tuning CV: not specified by prof -> 3 folds speeds up a lot
inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)

# -------------------------
# Parallelism (IMPORTANT)
# -------------------------
# If your run gets stuck, set N_JOBS = 1 (most stable).
# On Mac, too much parallelism can stall in nested CV.
N_JOBS = 2   # try 2 first; if stuck -> set to 1

# -------------------------
# Random search iterations
# -------------------------
# More iterations = better tuning but slower.
# Typical balanced: 15–25.
N_ITER = 20


In [3]:
def load_one_dataset(dataset_id: int):
    """
    Loads Datasets/<id>/train.csv and test.csv and combines them into one table.

    Reason for combining: some datasets have too few samples per class in
    train.csv -> 10-fold stratified CV fails. Combining fixes that.

    Rows whose label is missing are dropped before encoding. Otherwise
    pd.factorize would give them the sentinel code -1 and they would be
    treated as an extra class downstream.

    Parameters
    ----------
    dataset_id : int
        Name of the sub-folder of DATA_ROOT that holds the two CSV files.

    Returns
    -------
    (X, y)
        X: DataFrame with every column except LABEL_COL.
        y: integer-coded labels as a Series sharing X's index.

    Raises
    ------
    ValueError
        If LABEL_COL is not a column of the combined table.
    """
    dataset_dir = os.path.join(DATA_ROOT, str(dataset_id))

    train_df = pd.read_csv(os.path.join(dataset_dir, "train.csv"))
    test_df = pd.read_csv(os.path.join(dataset_dir, "test.csv"))

    df = pd.concat([train_df, test_df], ignore_index=True)

    if LABEL_COL not in df.columns:
        raise ValueError(f"'{LABEL_COL}' not found in dataset {dataset_id}")

    # Unlabeled rows cannot be used for supervised CV; remove them explicitly
    # instead of letting them become a bogus "-1" class.
    unlabeled = df[LABEL_COL].isna()
    if unlabeled.any():
        print(f"Dataset {dataset_id}: dropping {int(unlabeled.sum())} row(s) with missing '{LABEL_COL}'")
        df = df.loc[~unlabeled].reset_index(drop=True)

    X = df.drop(columns=[LABEL_COL])
    y = pd.Series(pd.factorize(df[LABEL_COL])[0], index=df.index)  # numeric labels

    return X, y


In [4]:
def get_model_and_paramdist(clf_name: str):
    """
    Look up a classifier by its short name and return (model, param_distributions).

    Supported names: "SVM", "KNN", "DT", "RF", "MLP". Any other name raises
    ValueError.

    The search spaces are plain lists of candidate values under "clf__" keys.
    They are deliberately fairly rich (NOT tiny), because RandomizedSearchCV
    samples from them instead of trying every combination.
    """
    # Constructors are wrapped in lambdas so only the requested estimator is built.
    factories = {
        # cache_size: original author's note says it improves speed for RBF kernels
        "SVM": lambda: SVC(cache_size=2000),
        "KNN": lambda: KNeighborsClassifier(),
        "DT": lambda: DecisionTreeClassifier(random_state=RANDOM_STATE),
        "RF": lambda: RandomForestClassifier(random_state=RANDOM_STATE),
        # early_stopping: original author's note says it speeds training up a lot
        "MLP": lambda: MLPClassifier(
            max_iter=300,
            early_stopping=True,
            n_iter_no_change=10,
            random_state=RANDOM_STATE,
        ),
    }

    search_spaces = {
        "SVM": {
            "clf__C": [0.1, 1, 10, 100],
            "clf__kernel": ["rbf", "linear"],
            "clf__gamma": ["scale", "auto", 0.01, 0.1],
        },
        "KNN": {
            "clf__n_neighbors": list(range(3, 22, 2)),  # 3..21 odd
            "clf__weights": ["uniform", "distance"],
            "clf__metric": ["minkowski", "euclidean", "manhattan"],
        },
        "DT": {
            "clf__max_depth": [None, 5, 10, 15, 20, 30],
            "clf__min_samples_split": [2, 5, 10, 20],
            "clf__min_samples_leaf": [1, 2, 4, 8],
        },
        "RF": {
            "clf__n_estimators": [100, 200, 300],
            "clf__max_depth": [None, 10, 20, 30],
            "clf__min_samples_split": [2, 5, 10],
            "clf__min_samples_leaf": [1, 2, 4],
        },
        "MLP": {
            "clf__hidden_layer_sizes": [(50,), (100,), (150,), (50,50), (100,50)],
            "clf__alpha": [1e-5, 1e-4, 1e-3, 1e-2],
            "clf__learning_rate_init": [1e-4, 1e-3, 1e-2],
        },
    }

    if clf_name not in factories:
        raise ValueError("Unknown classifier: " + clf_name)

    return factories[clf_name](), search_spaces[clf_name]


In [5]:
def format_params(best_params: dict, prefix: str) -> str:
    """
    Extract the params of one pipeline step (e.g., 'clf__', 'lda__') and format them.

    Only keys that start with `prefix` are kept. The prefix is stripped from the
    front of the key only, so a nested name that contains the prefix again
    (e.g. 'clf__base__clf__C') keeps the rest of its name intact.

    Returns "name=value" pairs joined by "; ", in the dict's iteration order,
    or an empty string when no key matches.
    """
    return "; ".join(
        f"{key[len(prefix):]}={value}"
        for key, value in best_params.items()
        if key.startswith(prefix)
    )

def get_lda_n_components(best_params: dict):
    """Return the value stored under 'lda__n_components', or None when the key is absent."""
    key = "lda__n_components"
    if key in best_params:
        return best_params[key]
    return None


In [6]:
def run_one_classifier(dataset_id: int, clf_name: str, use_lda: bool):
    """
    Runs nested CV for ONE dataset + ONE classifier.

    Phase 1 (Baseline): Imputer -> Scaler -> Classifier
    Phase 2 (LDA):      Imputer -> Scaler -> LDA -> Classifier

    Outer CV = `outer_cv` (10 folds, required)
    Inner CV = `inner_cv` (tuning)
    RandomizedSearchCV = faster than GridSearchCV

    Parameters
    ----------
    dataset_id : int
        Dataset to load via load_one_dataset.
    clf_name : str
        Classifier short name understood by get_model_and_paramdist.
    use_lda : bool
        When True, an LDA step is inserted before the classifier and its
        n_components is tuned together with the classifier parameters.

    Returns
    -------
    list of dict
        One entry per outer fold with keys: fold, accuracy, f1_macro,
        best_params, clf_params_str, lda_params_str, lda_n_components.
    """
    X, y = load_one_dataset(dataset_id)
    model, param_dist = get_model_and_paramdist(clf_name)

    # Build pipeline
    steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
    if use_lda:
        # LDA accepts at most min(n_classes - 1, n_features) components, so the
        # searched range is capped by BOTH limits; otherwise candidates with too
        # many components fail to fit on datasets with few features.
        n_classes = len(np.unique(y))
        n_features = X.shape[1]
        max_comp = max(1, min(n_classes - 1, n_features))

        steps.append(("lda", LinearDiscriminantAnalysis(solver="svd")))

        # Copy before extending so the classifier's own search space is not mutated.
        param_dist = dict(param_dist)
        param_dist["lda__n_components"] = list(range(1, max_comp + 1))
    steps.append(("clf", model))

    pipe = Pipeline(steps)

    fold_rows = []
    phase_name = "PHASE 2 (LDA)" if use_lda else "PHASE 1 (Baseline)"
    n_outer = outer_cv.get_n_splits()  # keeps the progress message in sync with outer_cv

    print(f"\n==============================")
    print(f"✅ {phase_name} | Data {dataset_id} | {clf_name}")
    print(f"==============================")

    for fold_idx, (tr_idx, va_idx) in enumerate(outer_cv.split(X, y), start=1):
        print(f"➡️ {clf_name} Fold {fold_idx}/{n_outer} running...")

        X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
        y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

        # Hyper-parameters are tuned on the outer-training part only; the
        # outer-validation part is used solely for the reported scores.
        search = RandomizedSearchCV(
            estimator=pipe,
            param_distributions=param_dist,
            n_iter=N_ITER,
            scoring="f1_macro",
            cv=inner_cv,
            random_state=RANDOM_STATE,
            n_jobs=N_JOBS,
            refit=True
        )

        # perf_counter is monotonic, so the duration is unaffected by clock changes.
        t0 = time.perf_counter()
        search.fit(X_tr, y_tr)
        fit_t = time.perf_counter() - t0

        y_pred = search.predict(X_va)
        acc = accuracy_score(y_va, y_pred)
        f1m = f1_score(y_va, y_pred, average="macro")

        print(f"✅ Done Fold {fold_idx} | Acc={acc:.4f} F1={f1m:.4f} | fit={fit_t:.1f}s")

        best = search.best_params_

        fold_rows.append({
            "fold": fold_idx,
            "accuracy": float(acc),
            "f1_macro": float(f1m),
            "best_params": best,
            "clf_params_str": format_params(best, "clf__"),
            "lda_params_str": format_params(best, "lda__") if use_lda else "",
            "lda_n_components": get_lda_n_components(best) if use_lda else None
        })

    return fold_rows


In [7]:
def dataset_block_start_row(dataset_id: int) -> int:
    """First sheet row of a dataset's block: Data1 -> row 1, Data2 -> row 13, ... (12 rows per block)."""
    rows_per_block = 12
    blocks_before = dataset_id - 1
    return blocks_before * rows_per_block + 1

def fold_row(dataset_id: int, fold_idx: int) -> int:
    """Sheet row of a given fold (1-based): Fold1 is block start + 2, Fold10 is block start + 11."""
    first_fold_row = dataset_block_start_row(dataset_id) + 2
    return first_fold_row + (fold_idx - 1)


In [8]:
def write_phase1_before(ws_before, dataset_id: int, res: dict):
    """
    Fill the 'Before FS-DR' sheet for one dataset (10 fold rows).

    Per fold row:
      B..K  accuracy / macro-F1 pairs, in the order SVM, KNN, DT, RF, MLP
            (both rounded to 4 decimals)
      L..P  classifier params string, one cell per classifier

    `res` maps classifier name -> list of per-fold result dicts (fold 1 first).
    """
    classifier_order = ("SVM", "KNN", "DT", "RF", "MLP")
    first_metric_col = 2    # column B
    first_param_col = 12    # column L

    for fold_no in range(1, 11):
        target_row = fold_row(dataset_id, fold_no)

        # Metrics: two adjacent columns per classifier
        for pos, name in enumerate(classifier_order):
            entry = res[name][fold_no - 1]
            acc_col = first_metric_col + 2 * pos
            ws_before.cell(row=target_row, column=acc_col).value = round(entry["accuracy"], 4)
            ws_before.cell(row=target_row, column=acc_col + 1).value = round(entry["f1_macro"], 4)

        # Params: one column per classifier
        for pos, name in enumerate(classifier_order):
            entry = res[name][fold_no - 1]
            ws_before.cell(row=target_row, column=first_param_col + pos).value = entry["clf_params_str"]


In [9]:
def write_phase2_after(ws_after, dataset_id: int, res: dict):
    """
    Fill the 'After FS-DR' sheet for one dataset (10 fold rows).

    Per fold row:
      B..K  accuracy / macro-F1 pairs, in the order SVM, KNN, DT, RF, MLP
            (both rounded to 4 decimals)
      L     No. Selected Features (LDA components), all classifiers in one
            cell as "SVM=2;KNN=1;..."
      M     Features Name -> the constant "N/A (LDA)"
      N..R  per classifier: "LDA(<lda params>); <clf params>"

    `res` maps classifier name -> list of per-fold result dicts (fold 1 first).
    """
    classifier_order = ("SVM", "KNN", "DT", "RF", "MLP")

    for fold_no in range(1, 11):
        target_row = fold_row(dataset_id, fold_no)
        slot = fold_no - 1  # position of this fold in each classifier's result list

        # B..K: metric pairs
        for pos, name in enumerate(classifier_order):
            entry = res[name][slot]
            acc_col = 2 + 2 * pos
            ws_after.cell(row=target_row, column=acc_col).value = round(entry["accuracy"], 4)
            ws_after.cell(row=target_row, column=acc_col + 1).value = round(entry["f1_macro"], 4)

        # L: component counts of all classifiers share a single cell
        ws_after.cell(row=target_row, column=12).value = ";".join(
            f"{name}={res[name][slot]['lda_n_components']}" for name in classifier_order
        )

        # M: LDA components are not named original features
        ws_after.cell(row=target_row, column=13).value = "N/A (LDA)"

        # N..R: combined LDA + classifier params, one cell per classifier
        for pos, name in enumerate(classifier_order):
            entry = res[name][slot]
            ws_after.cell(row=target_row, column=14 + pos).value = f"LDA({entry['lda_params_str']}); {entry['clf_params_str']}"


In [10]:
# Open the Excel template and pick up the two result sheets
wb = openpyxl.load_workbook(RESULTS_XLSX)
ws_before = wb["Before FS-DR"]
ws_after = wb["After FS-DR"]

clfs = ["SVM", "KNN", "DT", "RF", "MLP"]

for dataset_id in range(1, 17):
    print(f"\n\n==================== DATASET {dataset_id} ====================")

    # Phase 1 (baseline): run every classifier, then fill the "Before FS-DR" rows
    res1 = {clf: run_one_classifier(dataset_id, clf, use_lda=False) for clf in clfs}
    write_phase1_before(ws_before, dataset_id, res1)

    # Checkpoint after each phase so finished work survives a later crash
    wb.save("Results_checkpoint.xlsx")
    print("✅ Saved checkpoint after Phase 1:", "Results_checkpoint.xlsx")

    # Phase 2 (LDA): same classifiers with the LDA step, fills the "After FS-DR" rows
    res2 = {clf: run_one_classifier(dataset_id, clf, use_lda=True) for clf in clfs}
    write_phase2_after(ws_after, dataset_id, res2)

    wb.save("Results_checkpoint.xlsx")
    print("✅ Saved checkpoint after Phase 2:", "Results_checkpoint.xlsx")

# All datasets processed: write the final workbook
final_name = "Results_FINAL.xlsx"
wb.save(final_name)
print("\n✅ DONE. Final file saved as:", final_name)


FileNotFoundError: [Errno 2] No such file or directory: 'Results.xlsx'