# Master Comparison Experiments
This notebook compares a suite of classical models and an MLP across curated training datasets and a shared general test set.

## Workflow
1. Configure paths and helper utilities.
2. Load and align training datasets with the shared general test distribution.
3. Train baseline models and record timing and accuracy metrics.
4. Aggregate and persist comparison results for downstream analysis.

In [None]:
import time
from pathlib import Path

import pandas as pd
from IPython.display import display
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

## Dataset Configuration
Define dataset sources and reproducibility constants. All paths are relative, so they resolve against the notebook's working directory. Update the dataset list as new feature collections become available.

In [None]:
# Working directory at the time this cell runs.
NOTEBOOK_DIR = Path.cwd()

# (name, training CSV) pairs. Every current dataset lives at
# ../data/<name>/<name>_training_data.csv, so the paths are derived from the
# names; a dataset stored elsewhere can be appended as an explicit tuple.
DATASETS = [
    (name, Path(f"../data/{name}/{name}_training_data.csv"))
    for name in (
        "20_bins",
        "20_bins_all",
        "20_bins_distortion",
        "20_bins_energy",
        "20_bins_shape",
        "40_bins",
        "60_bins",
    )
]

# Shared evaluation set used for every dataset/model combination.
GENERAL_TEST_PATH = Path("../data/test_data/test_data.csv")

# Seed and hold-out fraction reused by the experiments.
RANDOM_STATE, TEST_SIZE = 42, 0.2

## Load General Test Distribution
Load the shared evaluation dataset once so each experiment reuses a consistent reference split.

In [None]:
# Read the shared evaluation set once; later cells reuse these objects.
general_test_df = pd.read_csv(GENERAL_TEST_PATH)

# Split the frame into the feature matrix and the "label" target column.
general_feature_df = general_test_df.drop(columns="label")
general_y = general_test_df["label"]

# Feature names are kept as a plain list for column alignment later on.
general_feature_columns = general_feature_df.columns.tolist()

print(
    "[INFO] General test dataset loaded with "
    f"{len(general_test_df)} rows and "
    f"{len(general_feature_columns)} feature columns."
)

## Model Factory
Instantiate fresh scikit-learn models for every dataset to avoid cross-contamination of learned parameters.

In [None]:
def build_models(random_state: int = RANDOM_STATE) -> dict[str, object]:
    """Create a new, unfitted instance of every estimator in the benchmark.

    Args:
        random_state: Seed forwarded to the estimators that are constructed
            with a ``random_state`` argument here (random forest, gradient
            boosting and MLP).

    Returns:
        A dict mapping each model's display name to a fresh estimator. The
        insertion order is the order in which models are benchmarked.
    """
    logistic_regression = LogisticRegression(max_iter=1000)
    support_vector_machine = SVC(kernel="rbf", gamma="scale")
    random_forest = RandomForestClassifier(
        n_estimators=200,
        random_state=random_state,
        n_jobs=-1,
    )
    gradient_boosting = GradientBoostingClassifier(random_state=random_state)
    mlp = MLPClassifier(
        hidden_layer_sizes=(128, 64),
        max_iter=500,
        random_state=random_state,
    )

    named_models = [
        ("Logistic Regression", logistic_regression),
        ("Support Vector Machine", support_vector_machine),
        ("Random Forest", random_forest),
        ("Gradient Boosting", gradient_boosting),
        ("MLP Classifier", mlp),
    ]
    return dict(named_models)

## Train And Evaluate
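Iterate through each dataset, keep only the feature columns it shares with the general test set, hold out a stratified test split, scale features with a scaler fitted on the training split, train every model, and capture accuracy metrics with timing information.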

In [None]:
# One dict of metrics per (dataset, model) pair; turned into a DataFrame by
# the results cell.
results = []

# Build the lookup set once so column alignment is a constant-time membership
# test instead of a list scan for every column of every dataset.
general_feature_set = set(general_feature_columns)

for dataset_name, dataset_path in DATASETS:
    dataset_df = pd.read_csv(dataset_path)
    feature_columns = [col for col in dataset_df.columns if col != "label"]

    # Only features that also exist in the general test set can be used,
    # because the same scaler and models are applied to that set below.
    shared_columns = [col for col in feature_columns if col in general_feature_set]
    if not shared_columns:
        # Fail early with a message that names the dataset instead of letting
        # an empty feature matrix reach the scaler.
        raise ValueError(
            f"Dataset '{dataset_name}' shares no feature columns with the general test set."
        )

    dropped_column_count = len(feature_columns) - len(shared_columns)
    if dropped_column_count:
        # Make the silent feature reduction visible in the notebook output.
        print(
            f"[WARN] {dataset_name}: ignoring {dropped_column_count} feature column(s) "
            "absent from the general test set."
        )

    X = dataset_df[shared_columns]
    y = dataset_df["label"]

    # Stratified hold-out split of this dataset (independent of the general
    # test set).
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=TEST_SIZE,
        random_state=RANDOM_STATE,
        stratify=y,
    )

    # Fit the scaler on the training split only, then reuse it for the
    # hold-out split and for the general test set (same columns, same order).
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    general_X_subset = general_feature_df[shared_columns]
    general_X_scaled = scaler.transform(general_X_subset)

    # Fresh estimators per dataset so nothing learned carries over.
    models = build_models()
    for model_name, model in models.items():
        # perf_counter is a monotonic clock intended for measuring intervals;
        # time.time() can jump if the system clock is adjusted.
        start_time = time.perf_counter()
        model.fit(X_train_scaled, y_train)
        training_time = time.perf_counter() - start_time

        train_accuracy = accuracy_score(y_train, model.predict(X_train_scaled))
        test_accuracy = accuracy_score(y_test, model.predict(X_test_scaled))
        general_accuracy = accuracy_score(general_y, model.predict(general_X_scaled))

        results.append(
            {
                "Dataset": dataset_name,
                "Model": model_name,
                "Train Accuracy": train_accuracy,
                "Test Accuracy": test_accuracy,
                "General Test Accuracy": general_accuracy,
                "Training Time (s)": training_time,
            }
        )

    print(f"[INFO] Completed training for {dataset_name}.")

## Results
Sort by general test accuracy (highest first), display the table with metrics rounded to four decimal places, and persist the summary for downstream analysis.

In [None]:
# Full-precision metrics, one row per (dataset, model) pair.
results_df = pd.DataFrame(results)
numeric_columns = [
    "Train Accuracy",
    "Test Accuracy",
    "General Test Accuracy",
    "Training Time (s)",
]

# Rounded copy used for display and for the persisted summary.
formatted_df = results_df.copy()
formatted_df[numeric_columns] = formatted_df[numeric_columns].round(4)

# Rank on the unrounded accuracy so rows that differ by less than the rounding
# step are still ordered correctly; the stable sort keeps exact ties in their
# original (dataset, model) order. The resulting row order is then applied to
# the rounded table.
ranked_index = results_df.sort_values(
    by="General Test Accuracy",
    ascending=False,
    kind="stable",
).index
sorted_results = formatted_df.loc[ranked_index]

# Rich table for the notebook plus a plain-text copy of the same rows.
display(sorted_results)
print(sorted_results.to_string(index=False))

# Write the summary under the working directory captured in NOTEBOOK_DIR,
# creating the output folder on first use.
output_path = NOTEBOOK_DIR / "model_experiments" / "model_performance_summary.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)
sorted_results.to_csv(output_path, index=False)

print(f"[INFO] Saved results to {output_path.resolve()}")