In [13]:
# =========================
# 1) Imports
# =========================

import os                    # work with folders/paths
import numpy as np           # math/statistics
import pandas as pd          # reading CSV files

# Cross-validation + tuning
from sklearn.model_selection import StratifiedKFold, GridSearchCV

# Pipeline = prevents data leakage (everything happens inside CV folds)
from sklearn.pipeline import Pipeline

# Preprocessing inside folds
from sklearn.impute import SimpleImputer     # handle missing values
from sklearn.preprocessing import StandardScaler  # scale features

# Evaluation metrics
from sklearn.metrics import accuracy_score, f1_score

# The 5 classifiers (Phase 1 baseline)
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier


In [14]:
# =========================
# 2) Global settings
# =========================

# Paths and column names used by load_one_dataset, plus the single seed that
# is shared by both CV splitters and by the seeded classifiers.
DATA_ROOT = "Datasets"   # folder that contains 1..16 subfolders
LABEL_COL = "Label"      # label column name in your CSV
RANDOM_STATE = 42        # makes results reproducible

# Outer CV = 10 folds (matches Excel Fold1..Fold10)
# Used to produce the per-fold scores that are reported.
outer_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=RANDOM_STATE)

# Inner CV = used only for hyperparameter tuning (nested CV)
# Passed to GridSearchCV, so it only ever splits an outer *training* fold.
inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)


In [15]:
# =========================
# 3) Function 1: Load ONE dataset
# =========================

def load_one_dataset(dataset_id: int) -> "tuple[pd.DataFrame, pd.Series]":
    """
    Load one dataset and return its features and integer-encoded labels.

    Reads (paths are built from DATA_ROOT):
      Datasets/<dataset_id>/train.csv
      Datasets/<dataset_id>/test.csv

    Then combines them (train+test) and returns:
      X = features (every column except LABEL_COL)
      y = labels, encoded as integer codes 0..k-1 in order of first appearance,
          sharing X's row index

    Why combine train+test?
    Some datasets have too few samples per class in train.csv alone,
    making 10-fold Stratified CV impossible. Combining fixes that.

    Raises:
      ValueError: if LABEL_COL is absent from the combined data, or if any row
                  has a missing label (this also happens when only one of the
                  two CSV files contains the label column, because the
                  concatenation fills the gap with NaN).
    """

    dataset_dir = os.path.join(DATA_ROOT, str(dataset_id))
    train_path = os.path.join(dataset_dir, "train.csv")
    test_path = os.path.join(dataset_dir, "test.csv")

    # Read both csv files and stack them into one full dataset
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    df = pd.concat([train_df, test_df], ignore_index=True)

    if LABEL_COL not in df.columns:
        raise ValueError(f"Label column '{LABEL_COL}' not found in dataset {dataset_id}.")

    labels = df[LABEL_COL]

    # pd.factorize encodes missing values as -1; without this guard those rows
    # would silently be treated as an extra class by the CV splitter and models.
    n_missing = int(labels.isna().sum())
    if n_missing:
        raise ValueError(
            f"Label column '{LABEL_COL}' has {n_missing} missing value(s) "
            f"in dataset {dataset_id}."
        )

    X = df.drop(columns=[LABEL_COL])

    # Ensure labels are numeric for sklearn (safe even if already numeric)
    codes, _ = pd.factorize(labels)
    y = pd.Series(codes, index=df.index)

    return X, y


In [16]:
# =========================
# 4) Function 2: Choose ONE classifier and its hyperparameter grid
# =========================

def get_model_and_grid(clf_name: str):
    """
    Build the estimator and hyperparameter grid for one classifier.

    Supported names: "SVM", "KNN", "DT", "RF", "MLP".

    Returns:
      (estimator, param_grid) where every grid key carries the "clf__" prefix,
      i.e. it addresses the pipeline step named "clf".

    Raises:
      ValueError: if clf_name is not one of the supported names.
    """

    if clf_name == "SVM":
        estimator = SVC()
        param_grid = {
            "clf__C": [0.1, 1, 10],
            "clf__kernel": ["rbf", "linear"],
            "clf__gamma": ["scale", "auto"],
        }
    elif clf_name == "KNN":
        estimator = KNeighborsClassifier()
        param_grid = {
            "clf__n_neighbors": [3, 5, 7, 9, 11],
            "clf__weights": ["uniform", "distance"],
        }
    elif clf_name == "DT":
        estimator = DecisionTreeClassifier(random_state=RANDOM_STATE)
        param_grid = {
            "clf__max_depth": [None, 5, 10, 20],
            "clf__min_samples_split": [2, 5, 10],
        }
    elif clf_name == "RF":
        estimator = RandomForestClassifier(random_state=RANDOM_STATE)
        param_grid = {
            "clf__n_estimators": [100, 200],
            "clf__max_depth": [None, 10, 20],
        }
    elif clf_name == "MLP":
        estimator = MLPClassifier(max_iter=500, random_state=RANDOM_STATE)
        param_grid = {
            "clf__hidden_layer_sizes": [(50,), (100,), (50, 50)],
            "clf__alpha": [1e-4, 1e-3, 1e-2],
            "clf__learning_rate_init": [1e-3, 1e-2],
        }
    else:
        # Nothing matched: fail loudly rather than return a default model
        raise ValueError(f"Unknown classifier name: {clf_name}")

    return estimator, param_grid


In [17]:
# =========================
# 5) Helper: Convert best_params dict to a short string
# =========================

def best_params_to_string(best_params: dict) -> str:
    """
    Render a best-params dict as one compact "name=value; name=value" line.

    The pipeline step marker "clf__" is removed from each key; entries keep
    the dict's iteration order. An empty dict gives an empty string.

    Example:
      {"clf__C":10, "clf__kernel":"rbf"} -> "C=10; kernel=rbf"
    """
    return "; ".join(
        f"{name.replace('clf__', '')}={value}"
        for name, value in best_params.items()
    )


In [18]:
# =========================
# 6) MAIN function: Run ONE classifier (Phase 1 baseline)
# =========================

def run_phase1_one_classifier(dataset_id: int, clf_name: str):
    """
    PHASE 1 BASELINE:
      - No feature selection
      - No dimensionality reduction

    For ONE dataset and ONE classifier:
      Outer loop: Stratified CV using the module-level `outer_cv` splitter
      Inner loop: GridSearchCV (tuning, scored by macro-F1) using `inner_cv`,
                  fitted on the outer training fold only

    Prints progress so you always know what is running, then a mean ± sample
    standard deviation summary of accuracy and macro-F1 across the outer folds.

    Parameters:
      dataset_id : dataset number, forwarded to load_one_dataset
      clf_name   : classifier key, forwarded to get_model_and_grid

    Returns:
      list of dicts, one per outer fold, with keys
      "fold", "accuracy", "f1_macro", "best_params_str".

    Raises:
      Whatever load_one_dataset / get_model_and_grid raise (e.g. ValueError
      for an unknown classifier name).
    """

    # --- Step A: Load data for ONE dataset ---
    X, y = load_one_dataset(dataset_id)

    # --- Step B: Get the chosen model and hyperparameter grid ---
    model, grid = get_model_and_grid(clf_name)

    # Ask the splitter for its fold count so the progress message stays
    # correct if outer_cv is ever reconfigured.
    n_outer_folds = outer_cv.get_n_splits()

    print("\n====================================================")
    print(f"‚úÖ START | DATASET {dataset_id} | CLASSIFIER = {clf_name}")
    print("Phase 1 (Baseline): Imputer -> Scaler -> Classifier")
    print("====================================================")

    fold_rows = []  # store fold results here

    # --- Step C: Outer CV loop ---
    for fold_idx, (train_idx, val_idx) in enumerate(outer_cv.split(X, y), start=1):

        print(f"\n‚û°Ô∏è  Now running: {clf_name} | Outer Fold {fold_idx}/{n_outer_folds}")

        # Split into training fold and validation fold
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # --- Step D: Create pipeline (preprocessing inside folds) ---
        # IMPORTANT: This prevents leakage.
        # imputer/scaler are fitted ONLY on X_train for each fold.
        pipe = Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
            ("clf", model),
        ])

        # --- Step E: Hyperparameter tuning (inner CV only on training fold) ---
        gs = GridSearchCV(
            estimator=pipe,
            param_grid=grid,
            scoring="f1_macro",   # macro-F1 (required for multi-class)
            cv=inner_cv,
            n_jobs=-1,
            refit=True
        )

        # Fit ONLY using training fold
        gs.fit(X_train, y_train)

        # --- Step F: Evaluate on validation fold ---
        # refit=True means predict() uses the best configuration retrained
        # on the whole outer training fold.
        y_pred = gs.predict(X_val)

        acc = accuracy_score(y_val, y_pred)
        f1m = f1_score(y_val, y_pred, average="macro")

        # Format once; the same text is printed and stored.
        best_params_str = best_params_to_string(gs.best_params_)

        print(f"‚úÖ Fold {fold_idx} DONE | Acc={acc:.4f} | Macro-F1={f1m:.4f}")
        print(f"‚≠ê Best params: {best_params_str}")

        fold_rows.append({
            "fold": fold_idx,
            "accuracy": float(acc),
            "f1_macro": float(f1m),
            "best_params_str": best_params_str
        })

    # --- Step G: Summary across all outer folds ---
    accs = [r["accuracy"] for r in fold_rows]
    f1s  = [r["f1_macro"] for r in fold_rows]

    # ddof=1 -> sample standard deviation across folds
    print("\n---------------- SUMMARY ----------------")
    print(f"‚úÖ FINISHED {clf_name} | DATASET {dataset_id}")
    print(f"Accuracy: {np.mean(accs):.4f} ¬± {np.std(accs, ddof=1):.4f}")
    print(f"Macro-F1: {np.mean(f1s):.4f} ¬± {np.std(f1s, ddof=1):.4f}")
    print("----------------------------------------")

    return fold_rows


In [19]:
# =========================
# 7) Run ALL 5 classifiers one-by-one for ONE dataset
# =========================

# Dataset to evaluate in this cell; forwarded to run_phase1_one_classifier.
dataset_id = 1  # change to 2..16 if you want

# Keys understood by get_model_and_grid, run in this order.
classifiers = ["SVM", "KNN", "DT", "RF", "MLP"]

# Maps classifier name -> list of per-fold result dicts returned by
# run_phase1_one_classifier.
# NOTE(review): the variable name says "data1" but the contents follow
# dataset_id above; keep that in mind if dataset_id is changed.
phase1_results_data1 = {}

# Classifiers run sequentially; each call performs the full nested CV.
for clf_name in classifiers:
    print(f"\n\nüîÅ Moving to next classifier: {clf_name}")
    phase1_results_data1[clf_name] = run_phase1_one_classifier(dataset_id, clf_name)

print("\n‚úÖ ALL classifiers completed for this dataset.")




üîÅ Moving to next classifier: SVM

‚úÖ START | DATASET 1 | CLASSIFIER = SVM
Phase 1 (Baseline): Imputer -> Scaler -> Classifier

‚û°Ô∏è  Now running: SVM | Outer Fold 1/10


KeyboardInterrupt: 