In [1]:
import os
# Set each thread-count environment variable to "1". This runs in the first
# cell, ahead of the numeric-library imports in the following cell.
os.environ.update(
    dict.fromkeys(
        (
            "OMP_NUM_THREADS",
            "MKL_NUM_THREADS",
            "OPENBLAS_NUM_THREADS",
            "VECLIB_MAXIMUM_THREADS",
            "NUMEXPR_NUM_THREADS",
        ),
        "1",
    )
)


In [2]:
import time
import numpy as np
import pandas as pd
import openpyxl

from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score

# ✅ Allowed faster SVM
from sklearn.svm import LinearSVC

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier


In [3]:
# Paths
# Root folder holding one sub-folder per dataset; relative to the kernel's
# working directory.
DATA_ROOT = "Datasets"
# Results workbook template. Built from DATA_ROOT rather than a
# machine-specific absolute path so the notebook is not tied to one user's
# home directory.
# NOTE(review): assumes the working directory is the folder that contains
# "Datasets" (load_one_dataset relies on the same assumption) - confirm.
RESULTS_XLSX = os.path.join(DATA_ROOT, "Results.xlsx")
# Filled copy of the workbook written by Phase 1 (relative path).
OUT_PHASE1 = "Results_BEFORE_filled.xlsx"

# Name of the target column in every dataset CSV.
LABEL_COL = "Label"
# Single seed reused by the CV splitters and the seeded estimators.
RANDOM_STATE = 42

# Outer cross-validation: the assignment requires 10 folds.
OUTER_FOLDS_DEFAULT = 10

# Email guidance recorded for this project: stay at 10 folds by default; if
# SVM is still too slow, SVM alone may be set to 5 folds (and the report must
# mention it).
SVM_OUTER_FOLDS = 10

# Worker count handed to the grid search; kept at 1 for stable nested CV on Mac.
N_JOBS = 1

# Inner (tuning) CV: the email did not fix the fold count; 3 is used for speed.
INNER_FOLDS = 3
inner_cv = StratifiedKFold(
    n_splits=INNER_FOLDS,
    shuffle=True,
    random_state=RANDOM_STATE,
)


In [4]:
import os

def load_one_dataset(dataset_id: int):
    """
    Load one dataset as a feature frame plus an integer-encoded label series.

    Reads <DATA_ROOT>/<id>/train.csv and test.csv and stacks them into a
    single frame, so that the caller's stratified 10-fold split runs over all
    available rows.

    Rows whose label is missing are dropped (a notice is printed):
    pd.factorize encodes missing values as -1, and that sentinel would
    otherwise be handed downstream as if it were a genuine class.

    Parameters
    ----------
    dataset_id : int
        Name of the sub-folder of DATA_ROOT that holds the dataset.

    Returns
    -------
    X : pandas.DataFrame
        Every column except LABEL_COL.
    y : pandas.Series
        Integer label codes (numbered in order of first appearance), sharing
        X's index.

    Raises
    ------
    ValueError
        If LABEL_COL is not a column of the combined data.
    """
    base_dir = os.path.join(DATA_ROOT, str(dataset_id))
    frames = [
        pd.read_csv(os.path.join(base_dir, file_name))
        for file_name in ("train.csv", "test.csv")
    ]
    df = pd.concat(frames, ignore_index=True)

    if LABEL_COL not in df.columns:
        raise ValueError(f"'{LABEL_COL}' not found in dataset {dataset_id}")

    # Unlabeled rows cannot be used for supervised evaluation; remove them
    # before encoding so no "-1" pseudo-class is created.
    missing_label = df[LABEL_COL].isna()
    if missing_label.any():
        print(
            f"⚠️ Dataset {dataset_id}: dropping {int(missing_label.sum())} "
            f"row(s) with missing '{LABEL_COL}'"
        )
        df = df.loc[~missing_label].reset_index(drop=True)

    X = df.drop(columns=[LABEL_COL])
    y = pd.Series(pd.factorize(df[LABEL_COL])[0], index=df.index)
    return X, y


In [5]:
def get_model_and_grid(clf_name: str):
    """
    Build the estimator and its GridSearchCV parameter grid for one classifier key.

    Recognised keys are "SVM", "KNN", "DT", "RF" and "MLP". Every grid key
    carries the "clf__" prefix. Following the email guidance noted in this
    notebook (reduce the grid OR use LinearSVC), SVM is served by LinearSVC
    and the other grids are kept reasonably small.

    Returns
    -------
    (estimator, param_grid) : tuple
        An unfitted estimator and a dict mapping parameter names to the
        candidate values.

    Raises
    ------
    ValueError
        If clf_name is not one of the recognised keys.
    """
    if clf_name == "SVM":
        # Fast linear SVM (allowed by the email).
        estimator = LinearSVC(
            dual="auto",
            max_iter=20000,
            tol=1e-3,
            random_state=RANDOM_STATE,
        )
        param_grid = {"clf__C": [0.01, 0.1, 1, 10]}
    elif clf_name == "KNN":
        estimator = KNeighborsClassifier()
        param_grid = {
            "clf__n_neighbors": [3, 5, 7, 9, 11],
            "clf__weights": ["uniform", "distance"],
        }
    elif clf_name == "DT":
        estimator = DecisionTreeClassifier(random_state=RANDOM_STATE)
        param_grid = {
            "clf__max_depth": [None, 5, 10, 20],
            "clf__min_samples_split": [2, 5, 10],
        }
    elif clf_name == "RF":
        estimator = RandomForestClassifier(random_state=RANDOM_STATE)
        param_grid = {
            "clf__n_estimators": [100, 200],
            "clf__max_depth": [None, 10, 20],
        }
    elif clf_name == "MLP":
        estimator = MLPClassifier(
            max_iter=300,
            early_stopping=True,
            n_iter_no_change=10,
            random_state=RANDOM_STATE,
        )
        param_grid = {
            "clf__hidden_layer_sizes": [(50,), (100,), (50, 50)],
            "clf__alpha": [1e-4, 1e-3, 1e-2],
        }
    else:
        raise ValueError("Unknown classifier: " + clf_name)

    return estimator, param_grid


In [6]:
def format_params(best_params: dict, prefix="clf__"):
    """
    Render the tuned parameters of one pipeline step as a compact string.

    Only keys that start with `prefix` are kept. The leading prefix is cut
    off (and only the leading one, so a nested name such as
    "clf__base__clf__C" becomes "base__clf__C" rather than losing its inner
    "clf__" too). Pairs are joined as "name=value; name=value" in the dict's
    iteration order.

    Parameters
    ----------
    best_params : dict
        Mapping of parameter name to chosen value (e.g. GridSearchCV.best_params_).
    prefix : str
        Step prefix to select and strip.

    Returns
    -------
    str
        The formatted pairs, or "" when no key matches.
    """
    return "; ".join(
        f"{key[len(prefix):]}={value}"
        for key, value in best_params.items()
        if key.startswith(prefix)
    )


In [7]:
def run_baseline_one_classifier(dataset_id: int, clf_name: str):
    """
    Run the Phase 1 nested cross-validation for one classifier on one dataset.

    Phase 1 pipeline:
      Imputer (median) -> Scaler -> Classifier

    Outer CV:
      - normally OUTER_FOLDS_DEFAULT (10) folds
      - but if clf_name == 'SVM' and SVM_OUTER_FOLDS == 5, then SVM uses 5 folds only (email allowed)

    Inside every outer fold a GridSearchCV (scored by macro-F1 over `inner_cv`,
    refit on the fold's training part) is fitted, then evaluated on the
    held-out part.

    Parameters
    ----------
    dataset_id : int
        Dataset number passed to load_one_dataset.
    clf_name : str
        Classifier key passed to get_model_and_grid.

    Returns
    -------
    fold_rows : list of dict
        One entry per outer fold with keys "fold", "accuracy", "f1_macro"
        and "params_str".
    outer_folds : int
        Number of outer folds actually used.
    """
    X, y = load_one_dataset(dataset_id)
    model, grid = get_model_and_grid(clf_name)

    # choose outer folds for this classifier
    outer_folds = OUTER_FOLDS_DEFAULT
    if clf_name == "SVM" and SVM_OUTER_FOLDS == 5:
        outer_folds = 5
        print("⚠️ Using 5-fold OUTER CV for SVM ONLY (allowed by email). Must justify in report.")

    outer_cv_local = StratifiedKFold(n_splits=outer_folds, shuffle=True, random_state=RANDOM_STATE)

    pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("clf", model)
    ])

    fold_rows = []
    print(f"\n✅ Phase 1 | Data {dataset_id} | {clf_name} | outer_folds={outer_folds}")

    for fold_idx, (tr_idx, va_idx) in enumerate(outer_cv_local.split(X, y), start=1):
        print(f"➡️ {clf_name} Fold {fold_idx}/{outer_folds} ...")

        X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
        y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

        gs = GridSearchCV(
            estimator=pipe,
            param_grid=grid,
            scoring="f1_macro",
            cv=inner_cv,
            n_jobs=N_JOBS,
            refit=True
        )

        # perf_counter is a monotonic clock meant for measuring durations;
        # unlike time.time() it is unaffected by system clock adjustments.
        t0 = time.perf_counter()
        gs.fit(X_tr, y_tr)
        fit_t = time.perf_counter() - t0

        # Predictions come from the estimator refit with the best parameters.
        y_pred = gs.predict(X_va)
        acc = accuracy_score(y_va, y_pred)
        f1m = f1_score(y_va, y_pred, average="macro")

        print(f"✅ Done | Acc={acc:.4f} F1={f1m:.4f} | fit={fit_t:.1f}s")

        fold_rows.append({
            "fold": fold_idx,
            "accuracy": float(acc),
            "f1_macro": float(f1m),
            "params_str": format_params(gs.best_params_, "clf__")
        })

    return fold_rows, outer_folds


In [8]:
def dataset_block_start_row(dataset_id: int) -> int:
    """Return the sheet row where a dataset's block begins: row 1 for dataset 1, then 12 rows per dataset."""
    rows_per_block = 12
    blocks_before = dataset_id - 1
    return blocks_before * rows_per_block + 1

def fold_row(dataset_id: int, fold_idx: int) -> int:
    """Return the sheet row of a fold: fold 1 sits two rows below the block start, each later fold one row further."""
    first_fold_row = dataset_block_start_row(dataset_id) + 2
    return first_fold_row + (fold_idx - 1)

def write_before_sheet(ws_before, dataset_id: int, results: dict, outer_folds_map: dict):
    """
    Copy one dataset's per-fold results into the Before sheet.

    For each of the 10 fold rows, every classifier gets an (accuracy, macro-F1)
    pair starting at column B and one best-params string starting at column L,
    in the order SVM, KNN, DT, RF, MLP. Metrics are rounded to 4 decimals.

    A classifier that ran fewer folds than 10 (e.g. SVM with 5) only has its
    first `outer_folds_map[clf]` rows written; its remaining cells are not
    touched.

    Parameters
    ----------
    ws_before : worksheet-like object exposing cell(row=..., column=...).value
    dataset_id : int
        Selects the row block via fold_row.
    results : dict
        Classifier key -> list of per-fold dicts ("accuracy", "f1_macro", "params_str").
    outer_folds_map : dict
        Classifier key -> number of outer folds that were run.
    """
    classifier_order = ("SVM", "KNN", "DT", "RF", "MLP")
    first_metric_col = 2   # column B; two columns (accuracy, F1) per classifier
    first_param_col = 12   # column L; one params column per classifier

    for fold_no in range(1, 11):
        sheet_row = fold_row(dataset_id, fold_no)

        # Pass 1: accuracy / macro-F1 pairs.
        for position, name in enumerate(classifier_order):
            if not fold_no <= outer_folds_map[name]:
                continue  # fewer folds were run for this classifier
            entry = results[name][fold_no - 1]
            acc_col = first_metric_col + 2 * position
            ws_before.cell(row=sheet_row, column=acc_col).value = round(entry["accuracy"], 4)
            ws_before.cell(row=sheet_row, column=acc_col + 1).value = round(entry["f1_macro"], 4)

        # Pass 2: best-parameter strings.
        for position, name in enumerate(classifier_order):
            if not fold_no <= outer_folds_map[name]:
                continue
            param_cell = ws_before.cell(row=sheet_row, column=first_param_col + position)
            param_cell.value = results[name][fold_no - 1]["params_str"]


In [None]:
# Open the results workbook and pick the sheet that write_before_sheet fills.
wb = openpyxl.load_workbook(RESULTS_XLSX)
ws_before = wb["Before FS-DR"]

# Classifier keys, in the order they are run for every dataset.
clfs = ["SVM", "KNN", "DT", "RF", "MLP"]

# Dataset ids 1..16 are processed one after another.
for dataset_id in range(1, 17):
    print(f"\n================ PHASE 1 | DATA {dataset_id} ================")

    results = {}          # classifier key -> list of per-fold result dicts
    outer_folds_map = {}  # classifier key -> number of outer folds actually used

    for clf in clfs:
        rows, used_folds = run_baseline_one_classifier(dataset_id, clf)
        results[clf] = rows
        outer_folds_map[clf] = used_folds

    write_before_sheet(ws_before, dataset_id, results, outer_folds_map)

    # Save after every dataset so finished datasets are already on disk
    # if a later one fails or the run is interrupted.
    wb.save(OUT_PHASE1)
    print("✅ Saved checkpoint:", OUT_PHASE1)

print("\n✅ Phase 1 complete:", OUT_PHASE1)




✅ Phase 1 | Data 1 | SVM | outer_folds=10
➡️ SVM Fold 1/10 ...
✅ Done | Acc=0.9789 F1=0.7291 | fit=0.4s
➡️ SVM Fold 2/10 ...
✅ Done | Acc=0.9765 F1=0.9694 | fit=0.3s
➡️ SVM Fold 3/10 ...
✅ Done | Acc=0.9812 F1=0.7285 | fit=0.4s
➡️ SVM Fold 4/10 ...
✅ Done | Acc=0.9789 F1=0.9783 | fit=0.4s
➡️ SVM Fold 5/10 ...
✅ Done | Acc=0.9812 F1=0.9809 | fit=0.3s
➡️ SVM Fold 6/10 ...
✅ Done | Acc=0.9804 F1=0.9769 | fit=0.3s
➡️ SVM Fold 7/10 ...
✅ Done | Acc=0.9851 F1=0.7321 | fit=0.4s
➡️ SVM Fold 8/10 ...
✅ Done | Acc=0.9890 F1=0.9859 | fit=0.4s
➡️ SVM Fold 9/10 ...
✅ Done | Acc=0.9773 F1=0.7242 | fit=0.3s
➡️ SVM Fold 10/10 ...
✅ Done | Acc=0.9804 F1=0.9800 | fit=0.4s

✅ Phase 1 | Data 1 | KNN | outer_folds=10
➡️ KNN Fold 1/10 ...
✅ Done | Acc=0.9937 F1=0.7413 | fit=2.0s
➡️ KNN Fold 2/10 ...
✅ Done | Acc=0.9883 F1=0.9756 | fit=2.0s
➡️ KNN Fold 3/10 ...
✅ Done | Acc=0.9789 F1=0.7201 | fit=2.0s
➡️ KNN Fold 4/10 ...
✅ Done | Acc=0.9875 F1=0.7204 | fit=1.9s
➡️ KNN Fold 5/10 ...
✅ Done | Acc=0.9898 F1=