### Importing the Packages

In [1]:
import os, random, numpy as np
import torch
from torchvision import models
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import pickle

In [2]:
# Prefer the GPU whenever PyTorch reports one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)

Using device: cuda


### Loading the Dataset for Model Training

In [None]:
# Read the train_B arrays from the Kaggle input folder.
data_dir = "/kaggle/input/sml-data"
data = np.load(os.path.join(data_dir, "train_B.npz"))
X = data["X"]
y = data["y"]
print("Loaded dataset:", X.shape, y.shape)

# Turn the class names into consecutive integer ids (the mapping is printed below).
encoder = LabelEncoder()
y = encoder.fit_transform(y)
print("Label mapping:", {name: idx for idx, name in enumerate(encoder.classes_)})

# Divide by 255 as float32 (presumably 8-bit pixel values -- confirm against the
# data source), then move the channel axis from last to second position.
X = (X.astype("float32") / 255.0).transpose(0, 3, 1, 2)
y = y.astype("int64")

num_classes = np.unique(y).size
print(f"Prepared tensors: X={X.shape}, y={y.shape}, classes={num_classes}")

Loaded dataset: (12000, 224, 224, 3) (12000,)
Label mapping: {'Boot': 0, 'Sandal': 1, 'Shoe': 2}
Prepared tensors: X=(12000, 3, 224, 224), y=(12000,), classes=3


### Cross-Validation Split Function
Create stratified k‑fold splits so that every fold keeps roughly the same class proportions as the full dataset, which keeps model evaluation fair.

In [None]:
def make_folds(n, k=10, seed=42, y=None):
    """Split sample indices into ``k`` stratified folds.

    Each class's indices are shuffled and dealt out across the folds with
    ``np.array_split``, so every fold receives a near-equal share of every
    class. Each fold is then shuffled once more.

    Parameters
    ----------
    n : int
        Number of samples. Only consulted when ``y`` is None; otherwise the
        indices are derived from ``y`` itself.
    k : int
        Number of folds.
    seed : int
        Seed for the shuffles. The same seed always produces the same folds.
    y : array-like or None
        Class label of every sample. When None, all ``n`` samples are treated
        as one class, which gives a plain shuffled k-fold split.

    Returns
    -------
    list of np.ndarray
        ``k`` integer index arrays that together cover every sample once.
    """
    # A private RandomState yields the same stream as np.random.seed(seed)
    # followed by np.random.shuffle, but leaves the global RNG untouched.
    rng = np.random.RandomState(seed)

    if y is None:
        # Without labels there is nothing to stratify on: use a single class.
        y = np.zeros(n, dtype=int)
    else:
        y = np.asarray(y)

    folds = [[] for _ in range(k)]

    for cls in np.unique(y):
        cls_indices = np.where(y == cls)[0]
        rng.shuffle(cls_indices)
        for fold_idx, split in enumerate(np.array_split(cls_indices, k)):
            folds[fold_idx].extend(split)

    for i in range(k):
        rng.shuffle(folds[i])
        # Force an integer dtype so that an empty fold is still a valid index
        # array (np.array([]) would default to float64).
        folds[i] = np.array(folds[i], dtype=np.intp)

    return folds

### Computing Metrics

Manually computing the Confusion Matrix and Evaluation Metrics (Precision, Recall, F1 Score, Accuracy)

In [4]:
def confusion_matrix_manual(y_true, y_pred, labels):
    """Count (actual, predicted) label pairs into a square matrix.

    Rows are indexed by the actual label and columns by the predicted label,
    both in the order given by ``labels``. A label that is not in ``labels``
    raises ``KeyError``.
    """
    # Position of every label inside the matrix.
    index_of = dict(zip(labels, range(len(labels))))
    size = len(labels)
    matrix = np.zeros((size, size), dtype=int)

    for truth, guess in zip(y_true, y_pred):
        row = index_of[truth]
        col = index_of[guess]
        matrix[row, col] += 1
    return matrix

In [5]:
def calc_metrics(cm):
    """Derive accuracy and macro-averaged precision/recall/F1 from a confusion matrix.

    ``cm`` has actual classes on the rows and predicted classes on the columns.
    Precision and recall are computed per class and then averaged. F1 is the
    harmonic mean of those two macro averages (it is not the mean of the
    per-class F1 scores). A small epsilon keeps the divisions finite for
    classes with no predictions or no samples.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)``.
    """
    eps = 1e-9
    true_pos = np.diag(cm)
    false_pos = cm.sum(0) - true_pos   # column totals minus the diagonal
    false_neg = cm.sum(1) - true_pos   # row totals minus the diagonal

    per_class_precision = true_pos / (true_pos + false_pos + eps)
    per_class_recall = true_pos / (true_pos + false_neg + eps)
    precision = np.mean(per_class_precision)
    recall = np.mean(per_class_recall)

    f1 = 2 * precision * recall / (precision + recall + eps)
    acc = true_pos.sum() / cm.sum()
    return acc, precision, recall, f1

### Training & Nested Cross-Validation

In [None]:
def train_one_fold(X_train, y_train, X_val, y_val, model_builder,
                   lr=1e-3, epochs=5, batch=64, device="cpu"):
    """Train a fresh model on one split and predict labels for the validation part.

    Parameters
    ----------
    X_train, y_train : array-like or torch.Tensor
        Training inputs and integer class labels.
    X_val, y_val : array-like or torch.Tensor
        Validation inputs and labels. The labels are only used to build the
        dataset; they do not influence the predictions.
    model_builder : callable
        Zero-argument callable that returns a new ``nn.Module``.
    lr : float
        Learning rate for the Adam optimiser.
    epochs : int
        Number of passes over the training data.
    batch : int
        Mini-batch size for both loaders.
    device : str
        Device the model and the batches are moved to.

    Returns
    -------
    (model, preds)
        The trained model (left in eval mode) and a NumPy array with the
        predicted class index of every validation sample, in input order.
    """
    # torch.as_tensor wraps NumPy arrays without copying them and passes
    # tensors through unchanged, whereas torch.tensor always copies (doubling
    # the memory held by a large image array) and warns on tensor input.
    train_ds = TensorDataset(torch.as_tensor(X_train), torch.as_tensor(y_train))
    val_ds = TensorDataset(torch.as_tensor(X_val), torch.as_tensor(y_val))
    train_dl = DataLoader(train_ds, batch_size=batch, shuffle=True)
    val_dl = DataLoader(val_ds, batch_size=batch, shuffle=False)

    model = model_builder().to(device)
    loss_fn = nn.CrossEntropyLoss()
    opt = optim.Adam(model.parameters(), lr=lr)

    for ep in range(epochs):
        model.train()
        total_loss = 0.0
        for xb, yb in train_dl:
            xb, yb = xb.to(device), yb.to(device)
            loss = loss_fn(model(xb), yb)
            opt.zero_grad()
            loss.backward()
            opt.step()
            total_loss += loss.item()
        avg_loss = total_loss / len(train_dl)
        print(f"For Epoch {ep+1}/{epochs}, The Training loss ={avg_loss:.4f}")

    model.eval()
    preds = []
    with torch.no_grad():
        for xb, _ in val_dl:
            xb = xb.to(device)
            # Softmax is monotonic, so the arg-max of the raw logits is
            # already the predicted class.
            preds.append(torch.argmax(model(xb), 1).cpu().numpy())
    # np.concatenate rejects an empty list, so handle an empty validation set.
    preds = np.concatenate(preds) if preds else np.empty(0, dtype=np.int64)
    return model, preds

In [None]:
def evaluate_model_nested_cv(
    X, y, model_builder,
    candidate_lr=(1e-3, 3e-4, 1e-4),
    k_outer=10, k_inner=3, epochs=5,
    device="cpu", inner_epochs=2
):
    """Estimate model performance with nested stratified cross-validation.

    For every outer fold, the remaining data is split into ``k_inner`` folds
    and each candidate learning rate is scored by its mean inner-fold
    accuracy. The best learning rate is then used to train on the whole outer
    training part, and the held-out outer fold is scored.

    Parameters
    ----------
    X, y : np.ndarray
        Inputs and integer class labels. They are indexed with integer index
        arrays, so every outer/inner split creates a copy of the selected rows.
    model_builder : callable
        Zero-argument callable that returns a new, untrained model.
    candidate_lr : sequence of float
        Learning rates to compare in the inner loop. Must not be empty.
    k_outer, k_inner : int
        Number of outer and inner folds.
    epochs : int
        Training epochs for the final model of each outer fold.
    device : str
        Device passed through to ``train_one_fold``.
    inner_epochs : int
        Training epochs for each inner tuning run (previously fixed at 2).

    Returns
    -------
    (mean, std)
        Two arrays holding the mean and standard deviation over the outer
        folds of ``[accuracy, precision, recall, f1]``.

    Raises
    ------
    ValueError
        If ``candidate_lr`` is empty.
    """
    candidate_lr = list(candidate_lr)
    if not candidate_lr:
        raise ValueError("candidate_lr must contain at least one learning rate")

    # Use one label set for every confusion matrix so all folds share a layout.
    labels = np.unique(y)

    folds = make_folds(len(X), k_outer, seed=42, y=y)
    metrics_all = []

    for i in range(k_outer):
        print(f"\n\n\n For Outer Fold {i+1}/{k_outer}:")

        test_idx = folds[i]
        train_idx = np.concatenate([folds[j] for j in range(k_outer) if j != i])
        X_train, y_train = X[train_idx], y[train_idx]
        X_test,  y_test  = X[test_idx],  y[test_idx]

        print("\tFor tuning the Hyperparameter:")
        inner_folds = make_folds(len(X_train), k_inner, seed=42, y=y_train)
        mean_accs = []

        for lr in candidate_lr:
            inner_scores = []
            for j in range(k_inner):
                val_idx = inner_folds[j]
                tr_idx  = np.concatenate([inner_folds[m] for m in range(k_inner) if m != j])

                _, y_pred_val = train_one_fold(
                    X_train[tr_idx], y_train[tr_idx],
                    X_train[val_idx], y_train[val_idx],
                    model_builder=model_builder,
                    lr=lr, epochs=inner_epochs, device=device
                )

                cm = confusion_matrix_manual(y_train[val_idx], y_pred_val, labels=labels)
                # Only accuracy drives the learning-rate selection.
                inner_scores.append(calc_metrics(cm)[0])

            mean_accs.append(np.mean(inner_scores))

        # Ties go to the earliest candidate (np.argmax returns the first maximum).
        best_lr = candidate_lr[int(np.argmax(mean_accs))]
        print(f"Best LR = {best_lr:.0e}")

        _, y_pred = train_one_fold(
            X_train, y_train, X_test, y_test,
            model_builder=model_builder,
            lr=best_lr, epochs=epochs, device=device
        )

        cm = confusion_matrix_manual(y_test, y_pred, labels=labels)
        acc, prec, rec, f1 = calc_metrics(cm)
        metrics_all.append([acc, prec, rec, f1])

        print(f"Fold {i+1}: Accuracy Score = {acc:.3f}, Precision = {prec:.3f}, Recall = {rec:.3f}, F1 Score = {f1:.3f}")

    metrics_all = np.array(metrics_all)
    mean, std = metrics_all.mean(0), metrics_all.std(0)

    print("\n\n Summary for the Nested CV Training:")
    print(f"Accuracy Score: {mean[0]:.4f} +/- {std[0]:.4f}")
    print(f"Precision Score: {mean[1]:.4f} +/- {std[1]:.4f}")
    print(f"Recall Score: {mean[2]:.4f} +/- {std[2]:.4f}")
    print(f"F1-score Score: {mean[3]:.4f} +/- {std[3]:.4f}")

    return mean, std

In [8]:
def build_efficientnet_b0(num_classes):
    """Build an EfficientNet-B0 with a ``num_classes``-way classifier head.

    No pretrained weights are requested (``weights=None``). The stock
    classifier is replaced by Dropout(0.35) -> Linear -> BatchNorm1d.
    """
    net = models.efficientnet_b0(weights=None)
    # Width of the features that feed the original final linear layer.
    feature_dim = net.classifier[1].in_features
    head_layers = [
        nn.Dropout(0.35),
        nn.Linear(feature_dim, num_classes),
        # NOTE(review): batch-normalising the output logits is unusual and
        # means a training batch of size 1 cannot be processed -- confirm intended.
        nn.BatchNorm1d(num_classes),
    ]
    net.classifier = nn.Sequential(*head_layers)
    return net

In [None]:
# Nested CV on the currently loaded tensors: 10 outer folds, 3 inner folds,
# five candidate learning rates, 3 epochs for each outer-fold model.
print("\n### Evaluating EfficientNet‑B0 ###")
eff_mean_B, eff_std_B = evaluate_model_nested_cv(
    X,
    y,
    lambda: build_efficientnet_b0(num_classes),  # model_builder
    device=device,
    epochs=3,
    k_inner=3,
    k_outer=10,
    candidate_lr=[3e-3, 1e-3, 5e-4, 3e-4, 1e-4],
)


### Evaluating EfficientNet‑B0 ###



 For Outer Fold 1/10:
	For tuning the Hyperparameter:
For Epoch 1/2, The Training loss =0.9004
For Epoch 2/2, The Training loss =0.5505
For Epoch 1/2, The Training loss =0.8289
For Epoch 2/2, The Training loss =0.4918
For Epoch 1/2, The Training loss =0.7883
For Epoch 2/2, The Training loss =0.4794
For Epoch 1/2, The Training loss =0.8433
For Epoch 2/2, The Training loss =0.5294
For Epoch 1/2, The Training loss =0.8911
For Epoch 2/2, The Training loss =0.5392
For Epoch 1/2, The Training loss =0.9208
For Epoch 2/2, The Training loss =0.5239
For Epoch 1/2, The Training loss =0.8908
For Epoch 2/2, The Training loss =0.5722
For Epoch 1/2, The Training loss =0.9296
For Epoch 2/2, The Training loss =0.5910
For Epoch 1/2, The Training loss =0.9417
For Epoch 2/2, The Training loss =0.6393
For Epoch 1/2, The Training loss =1.0038
For Epoch 2/2, The Training loss =0.6722
For Epoch 1/2, The Training loss =0.9805
For Epoch 2/2, The Training loss =0.6465
For E

### Saving the Cross Validation Metrics

In [13]:
# Collect the nested-CV summary: the full mean/std arrays plus the four mean
# metrics under their own keys for convenient lookup.
cv_results_effb0_B = dict(
    mean=eff_mean_B,
    std=eff_std_B,
    accuracy=eff_mean_B[0],
    precision=eff_mean_B[1],
    recall=eff_mean_B[2],
    f1=eff_mean_B[3],
)

# Persist the summary so it can be reloaded without re-running the CV.
save_path = "/kaggle/working/cv_results_effb0_trainB.pkl"
with open(save_path, mode="wb") as f:
    pickle.dump(cv_results_effb0_B, f)

print("Cross‑validation results saved to " + save_path)

Cross‑validation results saved to /kaggle/working/cv_results_effb0_trainB.pkl


### Training on the Full Dataset with the Final Model

In [None]:
def train_final_model(X, y, model_builder, lr=1e-3, epochs=5, batch=64, device="cpu"):
    """Train a fresh model on all of ``X``/``y`` and return it.

    Parameters
    ----------
    X, y : array-like or torch.Tensor
        Training inputs and integer class labels.
    model_builder : callable
        Zero-argument callable that returns a new ``nn.Module``.
    lr : float
        Learning rate for the Adam optimiser.
    epochs : int
        Number of passes over the data.
    batch : int
        Mini-batch size.
    device : str
        Device the model and the batches are moved to.

    Returns
    -------
    nn.Module
        The trained model. It is still in training mode, so call ``.eval()``
        before using it for inference.
    """
    # torch.as_tensor shares memory with NumPy input instead of copying it
    # (torch.tensor would duplicate the whole image array) and also accepts
    # tensors without the copy-construct warning.
    train_ds = TensorDataset(torch.as_tensor(X), torch.as_tensor(y))
    train_dl = DataLoader(train_ds, batch_size=batch, shuffle=True)

    model = model_builder().to(device)
    loss_fn = nn.CrossEntropyLoss()
    opt = optim.Adam(model.parameters(), lr=lr)

    # Training loop across epochs
    for ep in range(epochs):
        model.train()
        total_loss = 0.0
        for xb, yb in train_dl:
            xb, yb = xb.to(device), yb.to(device)
            loss = loss_fn(model(xb), yb)
            opt.zero_grad()
            loss.backward()
            opt.step()
            total_loss += loss.item()
        print(f"  Epoch {ep+1}/{epochs} – train loss: {total_loss/len(train_dl):.4f}")
    return model


# Fit one model on every training sample, then store only its weights.
print("\nTraining final EfficientNet‑B0 on entire Training Dataset:\n")
final_model = train_final_model(
    X,
    y,
    lambda: build_efficientnet_b0(num_classes),  # model_builder
    device=device,
    batch=64,
    epochs=5,
    lr=1e-03,
)

model_path = "/kaggle/working/efficientnetb0_final.pth"
torch.save(final_model.state_dict(), model_path)
print("Final EfficientNet‑B0 model saved to " + model_path)


Training final EfficientNet‑B0 on entire Training Dataset:

  Epoch 1/5 – train loss: 0.7406
  Epoch 2/5 – train loss: 0.4215
  Epoch 3/5 – train loss: 0.3030
  Epoch 4/5 – train loss: 0.2363
  Epoch 5/5 – train loss: 0.1820
Final EfficientNet‑B0 model saved to /kaggle/working/efficientnetb0_final.pth


### Testing the Model on the Final Unseen Test Dataset

In [None]:
print("\nLoading and preparing the Test Data: ")
# Load the held-out test split from the same input folder as the training data.
test_data = np.load(os.path.join(data_dir, "test.npz"))
X_test, y_test = test_data["X"], test_data["y"]
print("Test dataset Loaded:", X_test.shape, y_test.shape)

# Reuse the encoder fitted on the training labels so the class ids line up.
y_test = encoder.transform(y_test)

# Same preprocessing as the training tensors: float32 / 255, channel axis second.
X_test = X_test.astype("float32") / 255.0
X_test = np.transpose(X_test, (0, 3, 1, 2))
y_test = y_test.astype("int64")
print("Test tensors Ready:", X_test.shape)

# torch.as_tensor wraps the arrays without making a second copy in memory.
test_ds = TensorDataset(torch.as_tensor(X_test), torch.as_tensor(y_test))
test_dl = DataLoader(test_ds, batch_size=64, shuffle=False)

print("\nEvaluating final EfficientNet‑B0 on test dataset:")
# Rebuild the architecture and load the weights saved after the final training run.
model = build_efficientnet_b0(num_classes)
model.load_state_dict(torch.load(model_path, map_location=device))
model = model.to(device)
model.eval()

y_pred_test = []
with torch.no_grad():
    for xb, _ in test_dl:
        xb = xb.to(device)
        # The arg-max of the logits equals the arg-max of their softmax.
        y_pred_test.append(torch.argmax(model(xb), dim=1).cpu().numpy())
y_pred_test = np.concatenate(y_pred_test)

# calculating the prediction metrics
# Take the label set from the encoder rather than from y_test: the matrix then
# always matches encoder.classes_, and a predicted class that is missing from
# the test labels cannot raise a KeyError.
cm_test = confusion_matrix_manual(
    y_test, y_pred_test, labels=np.arange(len(encoder.classes_))
)
acc_test, prec_test, rec_test, f1_test = calc_metrics(cm_test)

print("\nFinal Results on the Test Dataset (Metrics and the Confusion Matrix): ")
print(f"Accuracy : {acc_test:.4f}")
print(f"Precision: {prec_test:.4f}")
print(f"Recall   : {rec_test:.4f}")
print(f"F1‑score : {f1_test:.4f}")

print("\nConfusion Matrix:")
print(cm_test)
print("Classes:", encoder.classes_)

# NOTE(review): the CV summary is read from a Kaggle *input* dataset, while the
# saving cell writes it to /kaggle/working/cv_results_effb0_trainB.pkl --
# confirm the uploaded copy is the current one.
cv_path = "/kaggle/input/pklfiles/cv_results_effb0_trainB.pkl"

# pickle can execute arbitrary code on load; only open files produced by this notebook.
with open(cv_path, "rb") as f:
    cv_results_effb0 = pickle.load(f)

cv_acc_mean  = cv_results_effb0["accuracy"]
cv_acc_std   = cv_results_effb0["std"][0]

print("\n=== CV vs. Test Comparison ===")
print(f"CV Accuracy : {cv_acc_mean:.4f} ± {cv_acc_std:.4f}")
print(f"Test Accuracy: {acc_test:.4f}")
print(f"Difference   : {acc_test - cv_acc_mean:.4f}")


Loading and preparing the Test Data: 
Test dataset Loaded: (3000, 224, 224, 3) (3000,)
Test tensors Ready: (3000, 3, 224, 224)

Evaluating final EfficientNet‑B0 on test dataset:

Final Results on the Test Dataset (Metrics and the Confusion Matrix): 
Accuracy : 0.9430
Precision: 0.9443
Recall   : 0.9430
F1‑score : 0.9436

Confusion Matrix:
[[967  14  19]
 [ 20 898  82]
 [ 16  20 964]]
Classes: ['Boot' 'Sandal' 'Shoe']

=== CV vs. Test Comparison ===
CV Accuracy : 0.9077 ± 0.0185
Test Accuracy: 0.9430
Difference   : 0.0353


In [31]:
# NOTE(review): bootstrap_metrics is defined in a later cell of this notebook,
# so this cell raises NameError under Restart & Run All unless that definition
# is moved above it.
print("\n=== Bootstrapping All Metrics on Test Dataset (Model B) ===")
boot_stats_B = bootstrap_metrics(y_test, y_pred_test)

# One line per metric: bootstrap mean followed by the [lower, upper] interval.
for metric in boot_stats_B:
    mean, low, high = boot_stats_B[metric]
    print(f"{metric.upper():>5}: {mean:.4f}  [{low:.4f}, {high:.4f}]")


=== Bootstrapping All Metrics on Test Dataset (Model B) ===
  ACC: 0.9432  [0.9343, 0.9510]
 PREC: 0.9445  [0.9359, 0.9522]
  REC: 0.9432  [0.9346, 0.9511]
   F1: 0.9438  [0.9351, 0.9516]


### Repeating the Same Pipeline for the Model Trained on Train A (Clean Data without Augmentation)

In [None]:
# Read the train_A arrays; X, y, encoder and num_classes from the train_B
# section are overwritten from here on.
data_dir = "/kaggle/input/sml-data"
data = np.load(os.path.join(data_dir, "train_A.npz"))
X = data["X"]
y = data["y"]
print("Loaded:", X.shape, y.shape)

# Turn the class names into consecutive integer ids (the mapping is printed below).
encoder = LabelEncoder()
y = encoder.fit_transform(y)
print("Label mapping:", {name: idx for idx, name in enumerate(encoder.classes_)})

# Divide by 255 as float32 (presumably 8-bit pixel values -- confirm against the
# data source), then move the channel axis from last to second position.
X = (X.astype("float32") / 255.0).transpose(0, 3, 1, 2)
y = y.astype("int64")
num_classes = np.unique(y).size
print("Final tensors:", X.shape, "Classes:", num_classes)

Loaded: (12000, 224, 224, 3) (12000,)
Label mapping: {'Boot': 0, 'Sandal': 1, 'Shoe': 2}
Final tensors: (12000, 3, 224, 224) Classes: 3


In [10]:
# Nested CV on the train_A tensors with the same settings as the train_B run.
print("\n### Evaluating EfficientNet‑B0 ###")
eff_mean_A, eff_std_A = evaluate_model_nested_cv(
    X,
    y,
    lambda: build_efficientnet_b0(num_classes),  # model_builder
    device=device,
    epochs=3,
    k_inner=3,
    k_outer=10,
    candidate_lr=[3e-3, 1e-3, 5e-4, 3e-4, 1e-4],
)


### Evaluating EfficientNet‑B0 ###



 For Outer Fold 1/10:
	For tuning the Hyperparameter:
For Epoch 1/2, The Training loss =0.5874
For Epoch 2/2, The Training loss =0.3059
For Epoch 1/2, The Training loss =0.5637
For Epoch 2/2, The Training loss =0.2998
For Epoch 1/2, The Training loss =0.5653
For Epoch 2/2, The Training loss =0.2929
For Epoch 1/2, The Training loss =0.6150
For Epoch 2/2, The Training loss =0.3359
For Epoch 1/2, The Training loss =0.6137
For Epoch 2/2, The Training loss =0.3386
For Epoch 1/2, The Training loss =0.5901
For Epoch 2/2, The Training loss =0.3229
For Epoch 1/2, The Training loss =0.6330
For Epoch 2/2, The Training loss =0.3547
For Epoch 1/2, The Training loss =0.6588
For Epoch 2/2, The Training loss =0.3716
For Epoch 1/2, The Training loss =0.7089
For Epoch 2/2, The Training loss =0.3858
For Epoch 1/2, The Training loss =0.7293
For Epoch 2/2, The Training loss =0.4127
For Epoch 1/2, The Training loss =0.7217
For Epoch 2/2, The Training loss =0.4207
For E

In [None]:
# NOTE(review): train_final_model is already defined earlier in this notebook;
# this cell re-defines it. Keep a single definition to avoid silent shadowing.
def train_final_model(X, y, model_builder, lr=1e-3, epochs=5, batch=64, device="cpu"):
    """Train a fresh model on all of ``X``/``y`` and return it.

    Parameters
    ----------
    X, y : array-like or torch.Tensor
        Training inputs and integer class labels.
    model_builder : callable
        Zero-argument callable that returns a new ``nn.Module``.
    lr : float
        Learning rate for the Adam optimiser.
    epochs : int
        Number of passes over the data.
    batch : int
        Mini-batch size.
    device : str
        Device the model and the batches are moved to.

    Returns
    -------
    nn.Module
        The trained model. It is still in training mode, so call ``.eval()``
        before using it for inference.
    """
    # torch.as_tensor shares memory with NumPy input instead of copying it
    # (torch.tensor would duplicate the whole image array) and also accepts
    # tensors without the copy-construct warning.
    train_ds = TensorDataset(torch.as_tensor(X), torch.as_tensor(y))
    train_dl = DataLoader(train_ds, batch_size=batch, shuffle=True)

    model = model_builder().to(device)
    loss_fn = nn.CrossEntropyLoss()
    opt = optim.Adam(model.parameters(), lr=lr)

    for ep in range(epochs):
        model.train()
        total_loss = 0.0
        for xb, yb in train_dl:
            xb, yb = xb.to(device), yb.to(device)
            loss = loss_fn(model(xb), yb)
            opt.zero_grad()
            loss.backward()
            opt.step()
            total_loss += loss.item()
        print(f"  Epoch {ep+1}/{epochs} – train loss: {total_loss/len(train_dl):.4f}")
    return model


# Fit one model on every train_A sample, then store only its weights.
print("\nTraining final EfficientNet‑B0 on entire Training Dataset:\n")
final_model_A = train_final_model(
    X,
    y,
    lambda: build_efficientnet_b0(num_classes),  # model_builder
    device=device,
    batch=64,
    epochs=5,
    lr=5e-04,        # LR chosen after performing cv
)

model_path = "/kaggle/working/efficientnetb0_final_dataA.pth"
torch.save(final_model_A.state_dict(), model_path)
print("Final EfficientNet‑B0 model saved to " + model_path)


Training final EfficientNet‑B0 on entire Training Dataset:

  Epoch 1/5 – train loss: 0.5682
  Epoch 2/5 – train loss: 0.2932
  Epoch 3/5 – train loss: 0.2320
  Epoch 4/5 – train loss: 0.1955
  Epoch 5/5 – train loss: 0.1623
Final EfficientNet‑B0 model saved to /kaggle/working/efficientnetb0_final_dataA.pth


In [18]:
# pickle is already imported in the first cell; this re-import is redundant.
import pickle

# Mean metrics under their own keys, plus the full std array of the train_A CV run.
cv_results_effb0 = dict(
    accuracy=eff_mean_A[0],
    precision=eff_mean_A[1],
    recall=eff_mean_A[2],
    f1=eff_mean_A[3],
    std=eff_std_A,
)

# Persist the summary so it can be reloaded without re-running the CV.
save_path = "/kaggle/working/cv_results_effb0_trainA.pkl"
with open(save_path, mode="wb") as f:
    pickle.dump(cv_results_effb0, f)
print("Saved CV results to", save_path)

Saved CV results to /kaggle/working/cv_results_effb0_trainA.pkl


In [None]:
print("\nLoading and preparing the Test Data (Data A):")

data_dir = "/kaggle/input/sml-data"
test_data = np.load(os.path.join(data_dir, "test.npz"))
X_test, y_test = test_data["X"], test_data["y"]
print("Test dataset Loaded:", X_test.shape, y_test.shape)

# Re-fit the label encoder on the train_A labels. Only the "y" member of the
# archive is read: npz members are loaded on access, so the image array is
# never pulled into memory, and the context manager closes the file afterwards.
with np.load(os.path.join(data_dir, "train_A.npz")) as train_data:
    y_trainA = train_data["y"]

encoder = LabelEncoder()
encoder.fit(y_trainA)
print("Re‑created encoder. Classes:", encoder.classes_)

# Same preprocessing as the training tensors: float32 / 255, channel axis second.
y_test = encoder.transform(y_test)
X_test = X_test.astype("float32") / 255.0
X_test = np.transpose(X_test, (0, 3, 1, 2))
y_test = y_test.astype("int64")
print("Test tensors Ready:", X_test.shape)

# torch.as_tensor wraps the arrays without making a second copy in memory.
test_ds = TensorDataset(torch.as_tensor(X_test), torch.as_tensor(y_test))
test_dl = DataLoader(test_ds, batch_size=64, shuffle=False)

print("\nEvaluating final EfficientNet‑B0 (Data A) on test dataset:")
# Rebuild the architecture and load the weights saved for the train_A model.
model_path = "/kaggle/working/efficientnetb0_final_dataA.pth"
model = build_efficientnet_b0(num_classes=len(encoder.classes_))
model.load_state_dict(torch.load(model_path, map_location=device))
model = model.to(device)
model.eval()

y_pred_test = []
with torch.no_grad():
    for xb, _ in test_dl:
        xb = xb.to(device)
        y_pred_test.append(torch.argmax(model(xb), dim=1).cpu().numpy())
y_pred_test = np.concatenate(y_pred_test)

# Take the label set from the encoder rather than from y_test: the matrix then
# always matches encoder.classes_, and a predicted class that is missing from
# the test labels cannot raise a KeyError.
cm_test = confusion_matrix_manual(
    y_test, y_pred_test, labels=np.arange(len(encoder.classes_))
)
acc_test, prec_test, rec_test, f1_test = calc_metrics(cm_test)

print("\n=== Final Results on Test Dataset (Model A) ===")
print(f"Accuracy : {acc_test:.4f}")
print(f"Precision: {prec_test:.4f}")
print(f"Recall   : {rec_test:.4f}")
print(f"F1‑score : {f1_test:.4f}")
print("\nConfusion Matrix:\n", cm_test)
print("Classes:", encoder.classes_)

# pickle can execute arbitrary code on load; only open files produced by this notebook.
cv_path = "/kaggle/working/cv_results_effb0_trainA.pkl"
with open(cv_path, "rb") as f:
    cv_results_effb0 = pickle.load(f)

print("\nCV result keys:", cv_results_effb0.keys())

cv_acc_mean = cv_results_effb0["accuracy"]
cv_acc_std  = cv_results_effb0["std"][0]

print("\n=== CV vs. Test Comparison (Data A) ===")
print(f"CV Accuracy : {cv_acc_mean:.4f} ± {cv_acc_std:.4f}")
print(f"Test Accuracy: {acc_test:.4f}")
print(f"Difference   : {acc_test - cv_acc_mean:.4f}")


Loading and preparing the Test Data (Data A):
Test dataset Loaded: (3000, 224, 224, 3) (3000,)
Re‑created encoder. Classes: ['Boot' 'Sandal' 'Shoe']
Test tensors Ready: (3000, 3, 224, 224)

Evaluating final EfficientNet‑B0 (Data A) on test dataset:

=== Final Results on Test Dataset (Model A) ===
Accuracy : 0.7250
Precision: 0.7288
Recall   : 0.7250
F1‑score : 0.7269

Confusion Matrix:
 [[689 174 137]
 [109 684 207]
 [ 74 124 802]]
Classes: ['Boot' 'Sandal' 'Shoe']

CV result keys: dict_keys(['accuracy', 'precision', 'recall', 'f1', 'std'])

=== CV vs. Test Comparison (Data A) ===
CV Accuracy : 0.9359 ± 0.0216
Test Accuracy: 0.7250
Difference   : -0.2109


In [22]:
def bootstrap_metrics(y_true, y_pred, n_bootstrap=1000, confidence=0.95, seed=None):
    """Bootstrap percentile intervals for accuracy, precision, recall and F1.

    The (true, predicted) pairs are resampled with replacement ``n_bootstrap``
    times. Every resample is scored with ``confusion_matrix_manual`` and
    ``calc_metrics``.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels of equal length.
    n_bootstrap : int
        Number of bootstrap resamples.
    confidence : float
        Central coverage of the reported interval (0.95 -> 2.5th and 97.5th
        percentiles).
    seed : int or None
        When given, resampling uses a private ``RandomState(seed)`` so the
        result is reproducible. When None, the global NumPy RNG is used, as
        before.

    Returns
    -------
    dict
        Maps ``"acc"``, ``"prec"``, ``"rec"`` and ``"f1"`` to a tuple
        ``(mean, lower, upper)`` over the bootstrap resamples.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` differ in length.
    """
    # Plain lists do not support indexing with an index array.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if len(y_true) != len(y_pred):
        raise ValueError("y_true and y_pred must have the same length")

    n_samples = len(y_true)
    results = {"acc": [], "prec": [], "rec": [], "f1": []}
    # Include labels that only occur in the predictions, otherwise the
    # confusion-matrix lookup raises KeyError for them.
    labels = np.unique(np.concatenate([y_true, y_pred]))

    rng = np.random if seed is None else np.random.RandomState(seed)

    for _ in range(n_bootstrap):
        idx = rng.choice(n_samples, n_samples, replace=True)
        cm = confusion_matrix_manual(y_true[idx], y_pred[idx], labels=labels)
        acc, prec, rec, f1 = calc_metrics(cm)
        results["acc"].append(acc)
        results["prec"].append(prec)
        results["rec"].append(rec)
        results["f1"].append(f1)

    # Tail mass cut off on each side of the interval.
    alpha = (1 - confidence) / 2
    stats = {}
    for k, v in results.items():
        v = np.array(v)
        stats[k] = (
            np.mean(v),
            np.percentile(v, 100 * alpha),
            np.percentile(v, 100 * (1 - alpha)),
        )
    return stats

In [33]:
# Bootstrap the test-set metrics of the model trained on train_A. y_test and
# y_pred_test are whatever the most recently run evaluation cell produced.
print("\n=== Bootstrapping All Metrics (Model Train_A) ===")
boot_stats_A = bootstrap_metrics(y_test, y_pred_test)

# One line per metric: bootstrap mean followed by the [lower, upper] interval.
for metric in boot_stats_A:
    mean, lower, upper = boot_stats_A[metric]
    print(f"{metric.upper():>5}: {mean:.4f}  [{lower:.4f}, {upper:.4f}]")


=== Bootstrapping All Metrics (Model Train_A) ===
  ACC: 0.7250  [0.7080, 0.7410]
 PREC: 0.7289  [0.7122, 0.7452]
  REC: 0.7250  [0.7084, 0.7410]
   F1: 0.7269  [0.7104, 0.7431]


In [34]:
# Side-by-side bootstrap summary of both models. boot_stats_A and boot_stats_B
# must already exist from the two bootstrapping cells.
print("\n=== Comparison: Model A vs Model B (95 % CI) ===")
for metric, (meanA, lowA, upA) in boot_stats_A.items():
    meanB, lowB, upB = boot_stats_B[metric]
    print(f"{metric.upper():>5}:")
    print(f"  Model A: {meanA:.4f}  [{lowA:.4f}, {upA:.4f}]")
    print(f"  Model B: {meanB:.4f}  [{lowB:.4f}, {upB:.4f}]")
    print()


=== Comparison: Model A vs Model B (95 % CI) ===
  ACC:
  Model A: 0.7250  [0.7080, 0.7410]
  Model B: 0.9432  [0.9343, 0.9510]

 PREC:
  Model A: 0.7289  [0.7122, 0.7452]
  Model B: 0.9445  [0.9359, 0.9522]

  REC:
  Model A: 0.7250  [0.7084, 0.7410]
  Model B: 0.9432  [0.9346, 0.9511]

   F1:
  Model A: 0.7269  [0.7104, 0.7431]
  Model B: 0.9438  [0.9351, 0.9516]

