In [1]:
import os, time
import numpy as np
import pandas as pd
import openpyxl

from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier


In [2]:
# Root folder for the input data: load_one_dataset reads
# <DATA_ROOT>/<dataset_id>/train.csv and test.csv from it.
DATA_ROOT = "Datasets"          # folder that contains 1..16
# Excel workbook opened by the driver cell and filled with the results.
# NOTE(review): absolute, machine-specific path -- it will not resolve on
# another computer; consider making it relative to the project folder.
RESULTS_XLSX = "/Users/classroomservices/Desktop/Winter/Machine Learning/Project/Code/Datasets/Results.xlsx"   # your template
# Name of the target column expected in every dataset's CSV files.
LABEL_COL = "Label"
# Seed shared by both CV splitters and the seeded classifiers (DT, RF, MLP).
RANDOM_STATE = 42

# Outer CV must be 10 folds (course requirement); used for the reported scores.
outer_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=RANDOM_STATE)

# Inner CV for hyper-parameter tuning (not fixed by the course) -> 3 folds for speed
inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)

# Parallel jobs for GridSearchCV (on Mac, -1 is sometimes slow)
N_JOBS = 4


In [3]:
def load_one_dataset(dataset_id: int):
    """
    Load one dataset and return it as a single (X, y) pair.

    Reads ``train.csv`` and ``test.csv`` from ``<DATA_ROOT>/<dataset_id>/``,
    stacks them (train rows first) under a fresh 0..n-1 index, and splits the
    result into features and target.

    Parameters
    ----------
    dataset_id : int
        Name of the dataset sub-folder inside DATA_ROOT.

    Returns
    -------
    X : pandas.DataFrame
        Every column of the stacked frame except LABEL_COL.
    y : pandas.Series
        Integer codes produced by ``pd.factorize`` on the label column,
        sharing the index of ``X``.

    Raises
    ------
    ValueError
        If LABEL_COL is not a column of the stacked frame.
    """
    dataset_dir = os.path.join(DATA_ROOT, str(dataset_id))

    # Train first, then test, so the row order of the stacked frame is fixed.
    parts = []
    for csv_name in ("train.csv", "test.csv"):
        parts.append(pd.read_csv(os.path.join(dataset_dir, csv_name)))

    merged = pd.concat(parts, ignore_index=True)

    if LABEL_COL not in merged.columns:
        raise ValueError(f"'{LABEL_COL}' column not found in dataset {dataset_id}")

    # factorize returns (codes, uniques); only the integer codes are kept.
    label_codes, _ = pd.factorize(merged[LABEL_COL])
    features = merged.drop(columns=[LABEL_COL])
    targets = pd.Series(label_codes, index=merged.index)

    return features, targets


In [4]:
def get_model_and_grid_baseline(clf_name: str):
    """
    Phase 1 (baseline): build the classifier and its tuning grid.

    The estimator is meant to be the last step, named ``clf``, of the
    pipeline Imputer -> Scaler -> Classifier, which is why every grid key
    carries the ``clf__`` prefix.

    Parameters
    ----------
    clf_name : str
        One of "SVM", "KNN", "DT", "RF", "MLP".

    Returns
    -------
    (model, grid)
        An unfitted estimator and a dict mapping ``clf__<param>`` to the
        list of candidate values.

    Raises
    ------
    ValueError
        If ``clf_name`` is not one of the supported names.
    """
    if clf_name == "SVM":
        # Larger kernel cache, for speed.
        estimator = SVC(cache_size=2000)
        search_space = dict(
            clf__C=[0.1, 1, 10],
            clf__kernel=["rbf", "linear"],
            clf__gamma=["scale", "auto"],
        )
    elif clf_name == "KNN":
        estimator = KNeighborsClassifier()
        search_space = dict(
            clf__n_neighbors=[3, 5, 7, 9, 11],
            clf__weights=["uniform", "distance"],
        )
    elif clf_name == "DT":
        estimator = DecisionTreeClassifier(random_state=RANDOM_STATE)
        search_space = dict(
            clf__max_depth=[None, 5, 10, 20],
            clf__min_samples_split=[2, 5, 10],
        )
    elif clf_name == "RF":
        estimator = RandomForestClassifier(random_state=RANDOM_STATE)
        search_space = dict(
            clf__n_estimators=[100, 200],
            clf__max_depth=[None, 10, 20],
            clf__min_samples_split=[2, 5],
        )
    elif clf_name == "MLP":
        # Early stopping (patience of 10 iterations) keeps training short.
        estimator = MLPClassifier(
            max_iter=300,
            early_stopping=True,
            n_iter_no_change=10,
            random_state=RANDOM_STATE,
        )
        search_space = dict(
            clf__hidden_layer_sizes=[(50,), (100,), (50, 50)],
            clf__alpha=[1e-4, 1e-3, 1e-2],
            clf__learning_rate_init=[1e-3, 1e-2],
        )
    else:
        raise ValueError("Unknown classifier: " + clf_name)

    return estimator, search_space


In [5]:
def format_clf_params(best_params: dict) -> str:
    """
    Render the classifier's tuned hyper-parameters as one compact string.

    Only entries whose key starts with ``clf__`` (the classifier step of the
    pipeline) are kept; other pipeline steps are ignored. The leading
    ``clf__`` is removed and the entries are joined as ``name=value`` pairs
    separated by ``"; "``, in the dict's iteration order.

    Only the leading prefix is stripped. Any later ``clf__`` inside the key
    (e.g. a nested estimator parameter such as ``clf__estimator__clf__C``)
    is preserved, so the reported name still identifies the parameter.

    Parameters
    ----------
    best_params : dict
        Mapping of pipeline parameter names to values, e.g.
        ``GridSearchCV.best_params_``.

    Returns
    -------
    str
        For example ``"C=1; kernel=rbf"``; an empty string when no key has
        the ``clf__`` prefix.
    """
    prefix = "clf__"
    return "; ".join(
        # Slice instead of str.replace so only the first occurrence is removed.
        f"{name[len(prefix):]}={value}"
        for name, value in best_params.items()
        if name.startswith(prefix)
    )


In [6]:
def run_baseline_one_classifier(dataset_id: int, clf_name: str):
    """
    Run the Phase 1 nested cross-validation for one dataset / classifier pair.

    For every split produced by ``outer_cv`` a fresh
    Imputer(median) -> StandardScaler -> classifier pipeline is tuned with
    GridSearchCV (macro-F1 scoring, ``inner_cv`` folds, ``N_JOBS`` workers)
    on the training part only, then the refitted best pipeline is scored on
    the held-out part. Progress and per-fold timing are printed.

    Parameters
    ----------
    dataset_id : int
        Dataset to load via ``load_one_dataset``.
    clf_name : str
        Classifier name understood by ``get_model_and_grid_baseline``.

    Returns
    -------
    list of dict
        One entry per outer fold, in fold order, with the keys ``fold``
        (1-based), ``accuracy``, ``f1_macro`` (both plain floats) and
        ``params_str`` (the winning classifier parameters as text).
    """
    features, labels = load_one_dataset(dataset_id)
    estimator, param_grid = get_model_and_grid_baseline(clf_name)

    print(f"\n✅ Phase 1 | Data {dataset_id} | {clf_name}")

    per_fold = []
    fold_no = 0
    for train_idx, valid_idx in outer_cv.split(features, labels):
        fold_no += 1
        print(f"➡️ {clf_name} Fold {fold_no}/10 running...")

        X_fit = features.iloc[train_idx]
        X_eval = features.iloc[valid_idx]
        y_fit = labels.iloc[train_idx]
        y_eval = labels.iloc[valid_idx]

        # Preprocessing lives inside the pipeline so it is re-fitted on each
        # inner training split rather than on data used for validation.
        search = GridSearchCV(
            estimator=Pipeline(steps=[
                ("imputer", SimpleImputer(strategy="median")),
                ("scaler", StandardScaler()),
                ("clf", estimator),
            ]),
            param_grid=param_grid,
            scoring="f1_macro",
            cv=inner_cv,
            n_jobs=N_JOBS,
            refit=True,
        )

        started = time.time()
        search.fit(X_fit, y_fit)
        elapsed = time.time() - started

        # refit=True: predict() uses the best pipeline retrained on X_fit.
        predictions = search.predict(X_eval)
        accuracy = accuracy_score(y_eval, predictions)
        macro_f1 = f1_score(y_eval, predictions, average="macro")

        print(f"✅ Done Fold {fold_no} | Acc={accuracy:.4f} F1={macro_f1:.4f} | fit={elapsed:.1f}s")

        per_fold.append(dict(
            fold=fold_no,
            accuracy=float(accuracy),
            f1_macro=float(macro_f1),
            params_str=format_clf_params(search.best_params_),
        ))

    return per_fold


In [7]:
def dataset_block_start_row(dataset_id: int) -> int:
    """
    Return the first worksheet row of a dataset's block.

    Blocks are 12 rows tall and stacked from row 1, so dataset 1 starts at
    row 1, dataset 2 at row 13, and so on.
    """
    rows_per_block = 12
    first_block_row = 1
    blocks_above = dataset_id - 1
    return first_block_row + blocks_above * rows_per_block

def fold_row(dataset_id: int, fold_idx: int) -> int:
    """
    Return the worksheet row that holds fold ``fold_idx`` of a dataset.

    Fold rows begin two rows below the start of the dataset's block:
    fold 1 sits at start+2 and fold 10 at start+11.
    """
    block_top = dataset_block_start_row(dataset_id)
    rows_before_first_fold = 2
    zero_based_fold = fold_idx - 1
    return block_top + rows_before_first_fold + zero_based_fold


In [8]:
def write_phase1_before_sheet(ws_before, dataset_id: int, results_dict: dict):
    """
    Copy one dataset's Phase 1 fold results into the worksheet.

    Parameters
    ----------
    ws_before : worksheet-like
        Object exposing ``cell(row=..., column=...)`` whose result has a
        writable ``value`` attribute.
    dataset_id : int
        Selects the block of rows (via ``fold_row``) that is filled.
    results_dict : dict
        Maps each of "SVM", "KNN", "DT", "RF", "MLP" to a list of 10 fold
        dicts carrying ``accuracy``, ``f1_macro`` and ``params_str``.

    Layout of every fold row:
      B..K = accuracy / macro-F1 pairs, one pair per classifier (4 decimals)
      L..P = parameter strings, one column per classifier
    """
    classifier_order = ("SVM", "KNN", "DT", "RF", "MLP")
    first_metric_col = 2    # column B
    first_param_col = 12    # column L

    for fold_no in range(1, 11):
        sheet_row = fold_row(dataset_id, fold_no)

        # Pass 1: accuracy and macro-F1 side by side, two columns per classifier.
        for position, name in enumerate(classifier_order):
            entry = results_dict[name][fold_no - 1]
            metric_col = first_metric_col + 2 * position
            ws_before.cell(row=sheet_row, column=metric_col).value = round(entry["accuracy"], 4)
            ws_before.cell(row=sheet_row, column=metric_col + 1).value = round(entry["f1_macro"], 4)

        # Pass 2: the chosen hyper-parameters, one column per classifier.
        for position, name in enumerate(classifier_order):
            entry = results_dict[name][fold_no - 1]
            ws_before.cell(row=sheet_row, column=first_param_col + position).value = entry["params_str"]


In [9]:
# Driver cell: runs Phase 1 for every dataset and fills the Excel template.

# Open the template workbook and select the sheet for the baseline results.
wb = openpyxl.load_workbook(RESULTS_XLSX)
ws_before = wb["Before FS-DR"]

# Classifier names; each must be accepted by get_model_and_grid_baseline and
# expected by write_phase1_before_sheet.
clfs = ["SVM", "KNN", "DT", "RF", "MLP"]

# Dataset ids 1..16 (range's upper bound is exclusive).
for dataset_id in range(1, 17):
    print(f"\n================= PHASE 1 | DATA {dataset_id} =================")
    # Per-dataset results: classifier name -> list of per-fold result dicts.
    results = {}

    for clf in clfs:
        results[clf] = run_baseline_one_classifier(dataset_id, clf)

    # Write this dataset's fold rows into the worksheet
    write_phase1_before_sheet(ws_before, dataset_id, results)

    # Save a checkpoint after each dataset (safety). The same file name is
    # rewritten every time, and it is a relative path, so it is created in the
    # current working directory rather than next to RESULTS_XLSX.
    # NOTE(review): a re-run always restarts at dataset 1 from the template;
    # datasets already saved in the checkpoint are not skipped.
    wb.save("Results_PHASE1_filled.xlsx")
    print("✅ Saved checkpoint: Results_PHASE1_filled.xlsx")

print("\n✅ PHASE 1 completed for all datasets.")




✅ Phase 1 | Data 1 | SVM
➡️ SVM Fold 1/10 running...
✅ Done Fold 1 | Acc=0.9828 F1=0.9825 | fit=7.2s
➡️ SVM Fold 2/10 running...
✅ Done Fold 2 | Acc=0.9836 F1=0.9834 | fit=5.7s
➡️ SVM Fold 3/10 running...
✅ Done Fold 3 | Acc=0.9859 F1=0.9858 | fit=5.4s
➡️ SVM Fold 4/10 running...
✅ Done Fold 4 | Acc=0.9820 F1=0.9817 | fit=4.8s
➡️ SVM Fold 5/10 running...
✅ Done Fold 5 | Acc=0.9844 F1=0.9842 | fit=5.9s
➡️ SVM Fold 6/10 running...
✅ Done Fold 6 | Acc=0.9828 F1=0.9825 | fit=5.1s
➡️ SVM Fold 7/10 running...


KeyboardInterrupt: 