In [1]:
import os
# Limit each numerical threading backend to a single thread. This runs ahead of
# the numpy / scikit-learn imports in the following cell, so the limits are
# already in the environment when those libraries are loaded.
os.environ.update({
    "OMP_NUM_THREADS": "1",
    "MKL_NUM_THREADS": "1",
    "OPENBLAS_NUM_THREADS": "1",
    "VECLIB_MAXIMUM_THREADS": "1",
    "NUMEXPR_NUM_THREADS": "1",
})

In [2]:
import time
import numpy as np
import pandas as pd
import openpyxl

from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score

# ✅ Phase 2 DR method
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# ✅ Same SVM choice you used in Phase 1 (LinearSVC)
from sklearn.svm import LinearSVC

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

In [3]:
# Paths
DATA_ROOT = "Datasets"  # <-- if you use absolute path in phase1, keep that here too

# Results template workbook (opened with openpyxl later in the notebook).
# Built from DATA_ROOT instead of a machine-specific absolute path so the
# notebook also runs from other machines / checkouts.
# NOTE(review): assumes Results.xlsx lives inside the DATA_ROOT folder (the
# previous hard-coded location ended in ".../Code/Datasets/Results.xlsx") — confirm.
RESULTS_XLSX = os.path.join(DATA_ROOT, "Results.xlsx")

# ✅ For Phase 2 output file (written relative to the current working directory)
OUT_PHASE2 = "Results_AFTER_filled.xlsx"

# Name of the target column in every train.csv / test.csv
LABEL_COL = "Label"
# Single seed shared by the CV splitters, the randomized search and the models
RANDOM_STATE = 42

# REQUIRED outer folds
OUTER_FOLDS_DEFAULT = 10

# Inner CV folds (same as Phase 1)
INNER_FOLDS = 3
inner_cv = StratifiedKFold(n_splits=INNER_FOLDS, shuffle=True, random_state=RANDOM_STATE)

# n_jobs stability for nested CV on Mac
N_JOBS = 1

# ✅ Email: You can use RandomizedSearchCV for SVM (must mention in report)
N_ITER_SVM = 10   # 8-15 typical

In [4]:
import os

def load_one_dataset(dataset_id: int):
    """
    Read one dataset from disk and return it as (X, y).

    Both DATA_ROOT/<dataset_id>/train.csv and DATA_ROOT/<dataset_id>/test.csv are
    read and stacked (train rows first) under a fresh 0..n-1 index.

    Returns:
        X: DataFrame holding every column except LABEL_COL.
        y: Series of integer codes produced by pd.factorize on LABEL_COL,
           sharing X's index.

    Raises:
        ValueError: if the combined frame has no LABEL_COL column.
    """
    dataset_dir = os.path.join(DATA_ROOT, str(dataset_id))
    frames = [
        pd.read_csv(os.path.join(dataset_dir, file_name))
        for file_name in ("train.csv", "test.csv")
    ]
    combined = pd.concat(frames, ignore_index=True)

    if LABEL_COL not in combined.columns:
        raise ValueError(f"'{LABEL_COL}' not found in dataset {dataset_id}")

    # factorize returns (codes, uniques); only the integer codes are kept.
    codes, _ = pd.factorize(combined[LABEL_COL])
    features = combined.drop(columns=[LABEL_COL])
    labels = pd.Series(codes, index=combined.index)
    return features, labels

In [5]:
def get_model_and_grid(clf_name: str):
    """
    Return an unfitted classifier and its hyper-parameter grid for `clf_name`.

    Recognised names are "SVM", "KNN", "DT", "RF" and "MLP" (per the original
    note, these mirror the Phase 1 baseline models and grids). Every grid key
    uses the "clf__" prefix, i.e. it addresses a pipeline step named "clf".

    Returns:
        (model, grid) tuple.

    Raises:
        ValueError: if `clf_name` is not one of the recognised names.
    """
    # Each entry is a zero-argument builder, so only the requested estimator
    # is ever instantiated.
    builders = {
        "SVM": lambda: (
            LinearSVC(dual="auto", max_iter=100000, tol=2e-3, random_state=RANDOM_STATE),
            {"clf__C": [0.01, 0.1, 1, 10]},
        ),
        "KNN": lambda: (
            KNeighborsClassifier(),
            {
                "clf__n_neighbors": [3, 5, 7, 9, 11],
                "clf__weights": ["uniform", "distance"],
            },
        ),
        "DT": lambda: (
            DecisionTreeClassifier(random_state=RANDOM_STATE),
            {
                "clf__max_depth": [None, 5, 10, 20],
                "clf__min_samples_split": [2, 5, 10],
            },
        ),
        "RF": lambda: (
            RandomForestClassifier(random_state=RANDOM_STATE),
            {
                "clf__n_estimators": [100, 200],
                "clf__max_depth": [None, 10, 20],
            },
        ),
        "MLP": lambda: (
            MLPClassifier(
                max_iter=300,
                early_stopping=True,
                n_iter_no_change=10,
                random_state=RANDOM_STATE,
            ),
            {
                "clf__hidden_layer_sizes": [(50,), (100,), (50, 50)],
                "clf__alpha": [1e-4, 1e-3, 1e-2],
            },
        ),
    }

    build = builders.get(clf_name)
    if build is None:
        raise ValueError("Unknown classifier: " + clf_name)
    return build()

In [6]:
def format_params(best_params: dict, prefix: str):
    """
    Render the entries of `best_params` whose key starts with `prefix` as a
    "name=value; name=value" string, in the dict's iteration order.

    Only the leading `prefix` is stripped from each key; later occurrences of
    the same text inside the key (e.g. a nested "clf__base__clf__C") are kept,
    so the reported parameter name stays unambiguous.

    Returns an empty string when no key matches.
    """
    cut = len(prefix)
    return "; ".join(
        f"{key[cut:]}={value}"
        for key, value in best_params.items()
        if key.startswith(prefix)
    )

def get_n_components(best_params: dict):
    """Return the value stored under "lda__n_components", or None if that key is absent."""
    key = "lda__n_components"
    if key in best_params:
        return best_params[key]
    return None

In [7]:
def run_lda_one_classifier(dataset_id: int, clf_name: str):
    """
    PHASE 2 (After FS/DR): nested cross-validation of LDA + one classifier.

    Pipeline:
      Imputer (median) -> StandardScaler -> LDA (svd) -> Classifier

    Tuning (inner CV = `inner_cv`, scoring = macro F1, refit on the best setting):
      - SVM uses RandomizedSearchCV (email allowed; mention in report)
      - Others use GridSearchCV
      The classifier grid from get_model_and_grid is searched jointly with
      lda__n_components = 1..max_comp.

    Args:
        dataset_id: dataset folder id passed to load_one_dataset.
        clf_name: classifier key passed to get_model_and_grid.

    Returns:
        A list with one dict per outer fold, in fold order, with keys
        "fold", "accuracy", "f1_macro", "lda_params_str", "clf_params_str"
        and "n_components".
    """
    X, y = load_one_dataset(dataset_id)
    model, clf_grid = get_model_and_grid(clf_name)

    # LDA accepts at most min(n_classes - 1, n_features) components. Capping by
    # the feature count as well keeps invalid n_components values out of the
    # grid (they would only produce failed inner fits).
    n_classes = len(np.unique(y))
    n_features = X.shape[1]
    max_comp = max(1, min(n_classes - 1, n_features))

    # Outer CV: OUTER_FOLDS_DEFAULT stratified, shuffled folds
    outer_cv_local = StratifiedKFold(
        n_splits=OUTER_FOLDS_DEFAULT, shuffle=True, random_state=RANDOM_STATE
    )
    n_outer_folds = outer_cv_local.get_n_splits()

    # Combine classifier grid + LDA grid. It does not depend on the fold, so it
    # is built once here rather than inside the loop.
    param_grid = dict(clf_grid)
    param_grid["lda__n_components"] = list(range(1, max_comp + 1))

    # Every grid entry is a list, so the search space is finite. Asking the
    # randomized search for more draws than there are candidates only triggers
    # a warning, so the request is capped at the grid size.
    n_candidates = 1
    for values in param_grid.values():
        n_candidates *= len(values)
    n_iter_svm = min(N_ITER_SVM, n_candidates)

    fold_rows = []
    print(f"\n✅ Phase 2 (LDA) | Data {dataset_id} | {clf_name} | max_comp={max_comp}")

    for fold_idx, (tr_idx, va_idx) in enumerate(outer_cv_local.split(X, y), start=1):
        print(f"➡️ {clf_name} Fold {fold_idx}/{n_outer_folds} ...")

        X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
        y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

        # ✅ Phase 2 pipeline: add LDA
        pipe = Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
            ("lda", LinearDiscriminantAnalysis(solver="svd")),
            ("clf", model)
        ])

        # ✅ Email rule: SVM may use RandomizedSearchCV
        if clf_name == "SVM":
            search = RandomizedSearchCV(
                estimator=pipe,
                param_distributions=param_grid,
                n_iter=n_iter_svm,
                scoring="f1_macro",
                cv=inner_cv,
                n_jobs=N_JOBS,
                random_state=RANDOM_STATE,
                refit=True
            )
        else:
            search = GridSearchCV(
                estimator=pipe,
                param_grid=param_grid,
                scoring="f1_macro",
                cv=inner_cv,
                n_jobs=N_JOBS,
                refit=True
            )

        # Time the whole inner search (including the final refit).
        t0 = time.perf_counter()
        search.fit(X_tr, y_tr)
        fit_t = time.perf_counter() - t0

        # Score the refitted best pipeline on the held-out outer fold.
        y_pred = search.predict(X_va)
        acc = accuracy_score(y_va, y_pred)
        f1m = f1_score(y_va, y_pred, average="macro")

        best = search.best_params_

        print(f"✅ Done | Acc={acc:.4f} F1={f1m:.4f} | fit={fit_t:.1f}s")

        fold_rows.append({
            "fold": fold_idx,
            "accuracy": float(acc),
            "f1_macro": float(f1m),
            "lda_params_str": format_params(best, "lda__"),
            "clf_params_str": format_params(best, "clf__"),
            "n_components": get_n_components(best),
        })

    return fold_rows

In [None]:
def dataset_block_start_row(dataset_id: int) -> int:
    """Return the first worksheet row of the 12-row block for `dataset_id` (dataset 1 starts at row 1)."""
    rows_per_block = 12
    blocks_before = dataset_id - 1
    return blocks_before * rows_per_block + 1

def fold_row(dataset_id: int, fold_idx: int) -> int:
    """
    Return the worksheet row for fold `fold_idx` (1-based) of `dataset_id`.

    Fold 1 sits two rows below the start of the dataset's block (presumably the
    template's header rows — confirm against the workbook); later folds follow
    on consecutive rows.
    """
    first_fold_row = dataset_block_start_row(dataset_id) + 2
    return first_fold_row + fold_idx - 1

def write_after_sheet(ws_after, dataset_id: int, results: dict):
    """
    Fill the ten fold rows of one dataset in the 'After FS-DR' worksheet.

    Args:
        ws_after: worksheet object exposing .cell(row=..., column=...).value.
        dataset_id: selects the row block (via fold_row).
        results: maps each classifier name to its per-fold list of dicts, as
            returned by run_lda_one_classifier (index 0 = fold 1).

    Column layout written per fold row:
      B..K : accuracy / macro-F1 pairs for SVM, KNN, DT, RF, MLP (rounded to 4 dp)
      L    : No. Selected Features - all classifiers joined into one string
      M    : Features Name (fixed "N/A (LDA)")
      N..R : FS/DR parameter string, one column per classifier
    """
    clfs = ["SVM", "KNN", "DT", "RF", "MLP"]

    for fold_no in range(1, 11):
        sheet_row = fold_row(dataset_id, fold_no)
        idx = fold_no - 1

        # B..K: two adjacent metric columns per classifier, starting at column 2.
        for pos, name in enumerate(clfs):
            entry = results[name][idx]
            acc_col = 2 + 2 * pos
            ws_after.cell(row=sheet_row, column=acc_col).value = round(entry["accuracy"], 4)
            ws_after.cell(row=sheet_row, column=acc_col + 1).value = round(entry["f1_macro"], 4)

        # L: chosen number of LDA components for every classifier.
        ws_after.cell(row=sheet_row, column=12).value = ";".join(
            f"{name}={results[name][idx]['n_components']}" for name in clfs
        )

        # M: LDA yields projected components, so there are no feature names to list.
        ws_after.cell(row=sheet_row, column=13).value = "N/A (LDA)"

        # N..R: one parameter-summary column per classifier, starting at column 14.
        for param_col, name in enumerate(clfs, start=14):
            entry = results[name][idx]
            ws_after.cell(row=sheet_row, column=param_col).value = (
                f"LDA({entry['lda_params_str']}); {entry['clf_params_str']}"
            )

In [None]:
# Driver cell: run Phase 2 (LDA + each classifier) for every dataset and write
# the per-fold results into the 'After FS-DR' sheet of the results workbook.

# Open the template workbook and pick the sheet that receives the results.
wb = openpyxl.load_workbook(RESULTS_XLSX)
ws_after = wb["After FS-DR"]

# Classifier keys understood by get_model_and_grid, in the sheet's column order.
clfs = ["SVM", "KNN", "DT", "RF", "MLP"]

# Dataset ids 1..16 (each id is a sub-folder name under DATA_ROOT).
for dataset_id in range(1, 17):
    print(f"\n================ PHASE 2 (LDA) | DATA {dataset_id} ================")

    # classifier name -> list of per-fold result dicts
    results = {}
    for clf in clfs:
        results[clf] = run_lda_one_classifier(dataset_id, clf)

    write_after_sheet(ws_after, dataset_id, results)

    # Save after every dataset so finished datasets are already on disk if a
    # later one fails; the output goes to OUT_PHASE2, not back to the template.
    wb.save(OUT_PHASE2)
    print("✅ Saved checkpoint:", OUT_PHASE2)

print("\n✅ Phase 2 complete:", OUT_PHASE2)



✅ Phase 2 (LDA) | Data 1 | SVM | max_comp=3
➡️ SVM Fold 1/10 ...
✅ Done | Acc=0.9139 F1=0.6367 | fit=0.4s
➡️ SVM Fold 2/10 ...
✅ Done | Acc=0.9085 F1=0.6294 | fit=0.4s
➡️ SVM Fold 3/10 ...
✅ Done | Acc=0.9241 F1=0.6304 | fit=0.4s
➡️ SVM Fold 4/10 ...
✅ Done | Acc=0.9225 F1=0.6387 | fit=0.4s
➡️ SVM Fold 5/10 ...
✅ Done | Acc=0.9218 F1=0.6258 | fit=0.4s
➡️ SVM Fold 6/10 ...
✅ Done | Acc=0.9139 F1=0.6100 | fit=0.4s
➡️ SVM Fold 7/10 ...
✅ Done | Acc=0.9155 F1=0.6376 | fit=0.4s
➡️ SVM Fold 8/10 ...
✅ Done | Acc=0.9218 F1=0.6329 | fit=0.4s
➡️ SVM Fold 9/10 ...
✅ Done | Acc=0.9115 F1=0.6189 | fit=0.4s
➡️ SVM Fold 10/10 ...
✅ Done | Acc=0.9107 F1=0.6271 | fit=0.4s

✅ Phase 2 (LDA) | Data 1 | KNN | max_comp=3
➡️ KNN Fold 1/10 ...
✅ Done | Acc=0.9836 F1=0.7282 | fit=1.3s
➡️ KNN Fold 2/10 ...
✅ Done | Acc=0.9797 F1=0.7147 | fit=1.3s
➡️ KNN Fold 3/10 ...
✅ Done | Acc=0.9781 F1=0.7083 | fit=1.3s
➡️ KNN Fold 4/10 ...
✅ Done | Acc=0.9781 F1=0.7141 | fit=1.3s
➡️ KNN Fold 5/10 ...
✅ Done | Acc=0.9789