In [11]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/images-shoes/train_B.npz
/kaggle/input/images-shoes/test.npz
/kaggle/input/images-shoes/train_A.npz


In [2]:
import os, random, numpy as np
import torch
from torchvision import models
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import gc
import pickle
from pathlib import Path

In [22]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

Using device: cuda


In [None]:
#creates stratified k-fold splits, that preserve class distribution
#INPUTS: 
#k - number of folds
#seed - random seed
#y - class labels for stratification
#RETURNS: 
#list of k arrays, containing indices for each fold

def make_folds(k=10, seed=42, y=None):
    np.random.seed(seed)
    
    unique_classes = np.unique(y)
    folds = [[] for _ in range(k)]
    
    #split samples across k folds for each class, stratification
    for cls in unique_classes:
        cls_indices = np.where(y == cls)[0]
        np.random.shuffle(cls_indices)
        cls_splits = np.array_split(cls_indices, k)
        
        #add class samples to each fold
        for fold_idx, split in enumerate(cls_splits):
            folds[fold_idx].extend(split)
    
    #shuffle within each fold and convert to numpy arrays
    for i in range(k):
        np.random.shuffle(folds[i])
        folds[i] = np.array(folds[i])
    
    return folds

In [10]:
def confusion_matrix_manual(y_true, y_pred, labels):
    n = len(labels)
    label_to_idx = {lab: i for i, lab in enumerate(labels)}
    cm = np.zeros((n, n), dtype=int)
    for yt, yp in zip(y_true, y_pred):
        i = label_to_idx[yt]
        j = label_to_idx[yp]
        cm[i, j] += 1
    return cm

In [11]:
def calc_metrics(cm):
    TP = np.diag(cm)
    FP = cm.sum(0) - TP
    FN = cm.sum(1) - TP
    precision = np.mean(TP / (TP + FP + 1e-9))
    recall    = np.mean(TP / (TP + FN + 1e-9))
    f1 = 2 * precision * recall / (precision + recall + 1e-9)
    acc = TP.sum() / cm.sum()
    return acc, precision, recall, f1

In [None]:
#training one fold, and returning model+predictions on validation set
def train_one_fold(X_train, y_train, X_val, y_val, model_builder,
                   lr=1e-3, epochs=5, batch=64, device="cpu"):
    
    train_ds = TensorDataset(torch.tensor(X_train), torch.tensor(y_train))
    val_ds   = TensorDataset(torch.tensor(X_val), torch.tensor(y_val))
    train_dl = DataLoader(train_ds, batch_size=batch, shuffle=True)
    val_dl   = DataLoader(val_ds, batch_size=batch, shuffle=False)

    model = model_builder().to(device)
    loss_fn = nn.CrossEntropyLoss()
    opt = optim.Adam(model.parameters(), lr=lr)

    for ep in range(epochs):
        model.train()
        total_loss = 0.0
        for xb, yb in train_dl:
            xb, yb = xb.to(device), yb.to(device)
            out = model(xb)
            loss = loss_fn(out, yb)
            opt.zero_grad()
            loss.backward()
            opt.step()
            total_loss += loss.item()
        avg_loss = total_loss / len(train_dl)
        print(f"  Epoch {ep+1}/{epochs}, Train loss={avg_loss:.4f}")

    #validation predictions
    model.eval()
    preds = []
    with torch.no_grad():
        for xb, _ in val_dl:
            xb = xb.to(device)
            probs = torch.softmax(model(xb), 1)
            preds.append(torch.argmax(probs, 1).cpu().numpy())
    preds = np.concatenate(preds)
    
    #memory cleanup
    del model, loss_fn, opt, train_dl, val_dl
    if device == "cuda":
        torch.cuda.empty_cache()
    gc.collect()
    return None, preds

In [None]:
def evaluate_model_nested_cv(
    X, y, model_builder,
    candidate_lr=[1e-3, 3e-4, 1e-4],
    k_outer=10, k_inner=3, epochs=5,
    device="cpu"
):
    folds = make_folds(len(X), k_outer, seed=42, y=y)
    metrics_all = []

    for i in range(k_outer):
        print(f"\n=== Outer Fold {i+1}/{k_outer} ===")

        test_idx = folds[i]
        train_idx = np.concatenate([folds[j] for j in range(k_outer) if j != i])
        X_train, y_train = X[train_idx], y[train_idx]
        X_test,  y_test  = X[test_idx],  y[test_idx]

        #inner loop does hyperparameter tuning
        inner_folds = make_folds(len(X_train), k_inner, seed=42, y=y_train)
        mean_accs = []

        for lr in candidate_lr:
            inner_scores = []
            for j in range(k_inner):
                val_idx = inner_folds[j]
                tr_idx  = np.concatenate([inner_folds[m] for m in range(k_inner) if m != j])

                _, y_pred_val = train_one_fold(
                    X_train[tr_idx], y_train[tr_idx],
                    X_train[val_idx], y_train[val_idx],
                    model_builder=model_builder,
                    lr=lr, epochs=2, device=device
                )

                cm = confusion_matrix_manual(y_train[val_idx], y_pred_val, labels=np.unique(y))
                acc, prec, rec, f1 = calc_metrics(cm)
                inner_scores.append(acc)

            mean_accs.append(np.mean(inner_scores))

        best_lr = candidate_lr[int(np.argmax(mean_accs))]
        print(f"Best LR = {best_lr:.0e}")

        #outer test fold, calculates validation performance
        _, y_pred = train_one_fold(
            X_train, y_train, X_test, y_test,
            model_builder=model_builder,
            lr=best_lr, epochs=epochs, device=device
        )

        cm = confusion_matrix_manual(y_test, y_pred, labels=np.unique(y))
        acc, prec, rec, f1 = calc_metrics(cm)
        metrics_all.append([acc, prec, rec, f1])

        print(f"Fold {i+1}: Acc={acc:.3f}, P={prec:.3f}, R={rec:.3f}, F1={f1:.3f}")
        
        #memory cleanup
        if device == "cuda":
            torch.cuda.empty_cache()
        gc.collect()

    #summary statistics
    metrics_all = np.array(metrics_all)
    mean, std = metrics_all.mean(0), metrics_all.std(0)

    print("\n=== Nested CV Results ===")
    print(f"Accuracy : {mean[0]:.3f} ± {std[0]:.3f}")
    print(f"Precision: {mean[1]:.3f} ± {std[1]:.3f}")
    print(f"Recall   : {mean[2]:.3f} ± {std[2]:.3f}")
    print(f"F1-score : {mean[3]:.3f} ± {std[3]:.3f}")

    return mean, std

In [None]:
class DeepCNN(nn.Module):
    def __init__(self, n_classes):
        super().__init__()
        #feature extractor
        self.features = nn.Sequential(
            nn.Conv2d(3, 32, 3, padding=1),  #32 filters to start off with, padding = 1 to maintain filter dimensions coming out of each conv layer
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(2),                 #downsampling

            nn.Conv2d(32, 64, 3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(64, 128, 3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Dropout(0.3),
        )

        self.flatten_dim = None
        self.classifier = None
        self.n_classes = n_classes

    def _get_flatten_dim(self, x):
        with torch.no_grad():
            f = self.features(x)
            return f.view(f.size(0), -1).shape[1]

    def forward(self, x):
        if self.classifier is None:
            flat_dim = self._get_flatten_dim(x)
            self.classifier = nn.Sequential(
                nn.Flatten(),
                nn.Linear(flat_dim, 128), nn.ReLU(), 
                nn.Dropout(0.4),  #dropout between linear layers, for regularisation
                nn.Linear(128, self.n_classes)
            ).to(x.device)
        out = self.features(x)
        out = self.classifier(out)
        return out

Training on train_B.npz (dirty images)

In [None]:
data_dir = "/kaggle/input/images-shoes"
data = np.load(os.path.join(data_dir, "train_B.npz"))  # or train_A.npz
X, y = data["X"], data["y"]
print("Loaded:", X.shape, y.shape)

#encode string labels to int values
encoder = LabelEncoder()
y = encoder.fit_transform(y)            #e.g. Boot=0, Sandal=1, Shoe=2

#save encoder
with open('/kaggle/working/encoder_trainB.pkl', 'wb') as f:
    pickle.dump(encoder, f)

print("Label mapping:", dict(zip(encoder.classes_,
                                 range(len(encoder.classes_)))))

#normalise images
X = X.astype("float32") / 255.0

#reshape for pytorch
X = np.transpose(X, (0,3,1,2))
y = y.astype("int64")
num_classes = len(np.unique(y))
print("Final tensors:", X.shape, "Classes:", num_classes)

Loaded: (12000, 224, 224, 3) (12000,)
Label mapping: {'Boot': 0, 'Sandal': 1, 'Shoe': 2}
Final tensors: (12000, 3, 224, 224) Classes: 3


In [None]:
#cross validation, train B (distorted images)
print("\n### Evaluating DeepCNN ###")
deep_mean_B, deep_std_B = evaluate_model_nested_cv(
    X, y,
    model_builder=lambda: DeepCNN(num_classes),
    candidate_lr=[3e-3, 1e-3, 3e-4],
    k_outer=10,
    k_inner=3,
    epochs=3,
    device=device
)

#saving results
import pickle

cv_results_B = {
    'mean': deep_mean_B,
    'std': deep_std_B,
    'accuracy': deep_mean_B[0],
    'precision': deep_mean_B[1],
    'recall': deep_mean_B[2],
    'f1': deep_mean_B[3]
}

with open('/kaggle/working/cv_results_trainB.pkl', 'wb') as f:
    pickle.dump(cv_results_B, f)

print("CV results saved!")


### Evaluating DeepCNN ###

=== Outer Fold 1/10 ===
  Epoch 1/2, Train loss=0.9186
  Epoch 2/2, Train loss=0.7659
  Epoch 1/2, Train loss=0.9090
  Epoch 2/2, Train loss=0.7549
  Epoch 1/2, Train loss=0.9340
  Epoch 2/2, Train loss=0.7718
  Epoch 1/2, Train loss=0.9266
  Epoch 2/2, Train loss=0.7782
  Epoch 1/2, Train loss=0.9270
  Epoch 2/2, Train loss=0.7841
  Epoch 1/2, Train loss=0.9262
  Epoch 2/2, Train loss=0.7785
  Epoch 1/2, Train loss=0.9811
  Epoch 2/2, Train loss=0.8617
  Epoch 1/2, Train loss=0.9634
  Epoch 2/2, Train loss=0.8492
  Epoch 1/2, Train loss=0.9903
  Epoch 2/2, Train loss=0.8747
Best LR = 1e-03
  Epoch 1/3, Train loss=0.8735
  Epoch 2/3, Train loss=0.7241
  Epoch 3/3, Train loss=0.6613
Fold 1: Acc=0.816, P=0.819, R=0.816, F1=0.817

=== Outer Fold 2/10 ===
  Epoch 1/2, Train loss=0.9078
  Epoch 2/2, Train loss=0.7662
  Epoch 1/2, Train loss=0.9118
  Epoch 2/2, Train loss=0.7604
  Epoch 1/2, Train loss=0.9299
  Epoch 2/2, Train loss=0.7572
  Epoch 1/2, Train loss

Training Final Model

In [None]:
#training final model, using the whole training set
#lr - set to the value determined via inner CV (see output above)
def train_final_model(X, y, model_builder, lr=1e-3, epochs=5, batch=64, device="cpu"):   
    train_ds = TensorDataset(torch.tensor(X), torch.tensor(y))
    train_dl = DataLoader(train_ds, batch_size=batch, shuffle=True)
    
    model = model_builder().to(device)
    loss_fn = nn.CrossEntropyLoss()
    opt = optim.Adam(model.parameters(), lr=lr)
    
    for ep in range(epochs):
        model.train()
        total_loss = 0.0
        for xb, yb in train_dl:
            xb, yb = xb.to(device), yb.to(device)
            out = model(xb)
            loss = loss_fn(out, yb)
            opt.zero_grad()
            loss.backward()
            opt.step()
            total_loss += loss.item()
        avg_loss = total_loss / len(train_dl)
        print(f"  Epoch {ep+1}/{epochs}, Train loss={avg_loss:.4f}")
    
    return model

In [None]:
#training final model, using hyperparameter values chosen via nested CV
print("\n=== Training Final Model on Full Dataset (Dirty Images) ===")

final_model = train_final_model(
    X, y,
    model_builder=lambda: DeepCNN(num_classes),
    lr=1e-3,
    epochs=5,
    batch=64,
    device=device
)

#save final model
torch.save(final_model.state_dict(), '/kaggle/working/deepcnn_final_B.pth')
print("Final model saved!")


=== Training Final Model on Full Dataset (Dirty Images) ===
  Epoch 1/5, Train loss=0.8796
  Epoch 2/5, Train loss=0.7210
  Epoch 3/5, Train loss=0.6518
  Epoch 4/5, Train loss=0.6117
  Epoch 5/5, Train loss=0.5837
Final model saved!


Testing Model

In [None]:
#loading/preparing test set
print("\n=== Loading Test Data ===")
test_data = np.load(os.path.join(data_dir, "test.npz"))
X_test, y_test = test_data["X"], test_data["y"]
print("Loaded test data:", X_test.shape, y_test.shape)

#loading encoder
with open('/kaggle/working/encoder_trainB.pkl', 'rb') as f:
    encoder = pickle.load(f)

#encode labels
y_test = encoder.transform(y_test)

#normalising and reshaping test images
X_test = X_test.astype("float32") / 255.0

X_test = np.transpose(X_test, (0,3,1,2))
y_test = y_test.astype("int64")

print("Test data prepared:", X_test.shape)

#making predictions
print("\n=== Evaluating Final Model on Test Set ===")

test_ds = TensorDataset(torch.tensor(X_test), torch.tensor(y_test))
test_dl = DataLoader(test_ds, batch_size=64, shuffle=False)

final_model.eval()
y_pred_test = []

with torch.no_grad():
    for xb, _ in test_dl:
        xb = xb.to(device)
        probs = torch.softmax(final_model(xb), 1)
        y_pred_test.append(torch.argmax(probs, 1).cpu().numpy())

y_pred_test = np.concatenate(y_pred_test)

#calculating metrics
cm_test = confusion_matrix_manual(y_test, y_pred_test, labels=np.unique(y_test))
acc_test, prec_test, rec_test, f1_test = calc_metrics(cm_test)

print(f"\n=== Test Set Results ===")
print(f"Accuracy : {acc_test:.3f}")
print(f"Precision: {prec_test:.3f}")
print(f"Recall   : {rec_test:.3f}")
print(f"F1-score : {f1_test:.3f}")

print(f"\nConfusion Matrix:")
print(cm_test)
print(f"Classes: {encoder.classes_}")

#comparison with cv results
#opening saved cv results
with open('/kaggle/working/cv_results_trainB.pkl', 'rb') as f:
    cv_results_B = pickle.load(f)

#extracting relevant variables
deep_mean_B = cv_results_B['mean']
deep_std_B = cv_results_B['std']

print(f"\n=== Performance Comparison ===")
print(f"CV Performance:   {deep_mean_B[0]:.3f} ± {deep_std_B[0]:.3f}") 
print(f"Test Performance: {acc_test:.3f}")
print(f"Difference:       {acc_test - deep_mean_B[0]:.3f}") 


=== Loading Test Data ===
Loaded test data: (3000, 224, 224, 3) (3000,)
Test data prepared: (3000, 3, 224, 224)

=== Evaluating Final Model on Test Set ===

=== Test Set Results ===
Accuracy : 0.843
Precision: 0.842
Recall   : 0.843
F1-score : 0.842

Confusion Matrix:
[[946  30  24]
 [108 757 135]
 [ 46 128 826]]
Classes: ['Boot' 'Sandal' 'Shoe']

=== Performance Comparison ===
CV Performance:   0.799 ± 0.020
Test Performance: 0.843
Difference:       0.044


### Repeating the above, but this time using train_A (which is the image set with only clean images)

In [None]:
data_dir = "/kaggle/input/images-shoes"
data = np.load(os.path.join(data_dir, "train_A.npz"))  #changed to train_A.npz
X, y = data["X"], data["y"]
print("Loaded:", X.shape, y.shape)

#encode string labels to int values
encoder = LabelEncoder()
y = encoder.fit_transform(y)            #e.g. Boot=0, Sandal=1, Shoe=2

#save encoder
with open('/kaggle/working/encoder_trainA.pkl', 'wb') as f:
    pickle.dump(encoder, f)
    
print("Label mapping:", dict(zip(encoder.classes_,
                                 range(len(encoder.classes_)))))

#normalise images
X = X.astype("float32") / 255.0

#reshape for pytorch
X = np.transpose(X, (0,3,1,2))
y = y.astype("int64")
num_classes = len(np.unique(y))
print("Final tensors:", X.shape, "Classes:", num_classes)

Loaded: (12000, 224, 224, 3) (12000,)
Label mapping: {'Boot': 0, 'Sandal': 1, 'Shoe': 2}
Final tensors: (12000, 3, 224, 224) Classes: 3


In [None]:
#cross validation, train A (clean images)
print("\n### Evaluating DeepCNN ###")
deep_mean_A, deep_std_A = evaluate_model_nested_cv(
    X, y,
    model_builder=lambda: DeepCNN(num_classes),
    candidate_lr=[3e-3, 1e-3, 3e-4],
    k_outer=10,
    k_inner=3,
    epochs=3,
    device=device
)

#saving results
import pickle

cv_results_A = {
    'mean': deep_mean_A,
    'std': deep_std_A,
    'accuracy': deep_mean_A[0],
    'precision': deep_mean_A[1],
    'recall': deep_mean_A[2],
    'f1': deep_mean_A[3]
}

with open('/kaggle/working/cv_results_trainA.pkl', 'wb') as f:
    pickle.dump(cv_results_A, f)

print("CV results saved!")


### Evaluating DeepCNN ###

=== Outer Fold 1/10 ===
  Epoch 1/2, Train loss=0.7664
  Epoch 2/2, Train loss=0.5608
  Epoch 1/2, Train loss=0.7626
  Epoch 2/2, Train loss=0.5600
  Epoch 1/2, Train loss=0.7324
  Epoch 2/2, Train loss=0.5504
  Epoch 1/2, Train loss=0.7799
  Epoch 2/2, Train loss=0.6029
  Epoch 1/2, Train loss=0.7233
  Epoch 2/2, Train loss=0.5521
  Epoch 1/2, Train loss=0.7769
  Epoch 2/2, Train loss=0.5930
  Epoch 1/2, Train loss=0.8542
  Epoch 2/2, Train loss=0.6773
  Epoch 1/2, Train loss=0.8415
  Epoch 2/2, Train loss=0.6645
  Epoch 1/2, Train loss=0.8234
  Epoch 2/2, Train loss=0.6538
Best LR = 1e-03
  Epoch 1/3, Train loss=0.7051
  Epoch 2/3, Train loss=0.5408
  Epoch 3/3, Train loss=0.4639
Fold 1: Acc=0.900, P=0.900, R=0.900, F1=0.900

=== Outer Fold 2/10 ===
  Epoch 1/2, Train loss=0.7760
  Epoch 2/2, Train loss=0.5782
  Epoch 1/2, Train loss=0.7593
  Epoch 2/2, Train loss=0.5623
  Epoch 1/2, Train loss=0.7432
  Epoch 2/2, Train loss=0.5540
  Epoch 1/2, Train loss

In [None]:
#training final model, using the whole training set
#lr - set to the value determined via inner CV (see output above)
print("\n=== Training Final Model on Full Dataset (Clean Images) ===")

final_model_clean = train_final_model(
    X, y,
    model_builder=lambda: DeepCNN(num_classes),
    lr=1e-3,
    epochs=5,
    batch=64,
    device=device
)

torch.save(final_model_clean.state_dict(), '/kaggle/working/deepcnn_final_clean.pth')
print("Final model saved!")


=== Training Final Model on Full Dataset (Clean Images) ===
  Epoch 1/5, Train loss=0.6687
  Epoch 2/5, Train loss=0.4961
  Epoch 3/5, Train loss=0.4271
  Epoch 4/5, Train loss=0.3816
  Epoch 5/5, Train loss=0.3456
Final model saved!


In [None]:
#loading/preparing test images
print("\n=== Loading Test Data (for Train_A Model) ===")
test_data = np.load(os.path.join(data_dir, "test.npz"))
X_test, y_test = test_data["X"], test_data["y"]
print("Loaded test data:", X_test.shape, y_test.shape)

#loading encoder
with open('/kaggle/working/encoder_trainA.pkl', 'rb') as f:
    encoder = pickle.load(f)
    
#encode labels
y_test = encoder.transform(y_test)  # Make sure this is the encoder fitted on train_A

#normalising and reshaping test set images
X_test = X_test.astype("float32") / 255.0

X_test = np.transpose(X_test, (0,3,1,2))
y_test = y_test.astype("int64")

print("Test data prepared:", X_test.shape)

#making predictions
print("\n=== Evaluating Train_A Model on Test Set ===")

test_ds = TensorDataset(torch.tensor(X_test), torch.tensor(y_test))
test_dl = DataLoader(test_ds, batch_size=64, shuffle=False)

final_model_clean.eval()  
y_pred_test = []

with torch.no_grad():
    for xb, _ in test_dl:
        xb = xb.to(device)
        probs = torch.softmax(final_model_clean(xb), 1)  
        y_pred_test.append(torch.argmax(probs, 1).cpu().numpy())

y_pred_test = np.concatenate(y_pred_test)

#calculating metrics
cm_test = confusion_matrix_manual(y_test, y_pred_test, labels=np.unique(y_test))
acc_test, prec_test, rec_test, f1_test = calc_metrics(cm_test)

print(f"\n=== Test Set Results (Train_A Model - Clean Training) ===")
print(f"Accuracy : {acc_test:.3f}")
print(f"Precision: {prec_test:.3f}")
print(f"Recall   : {rec_test:.3f}")
print(f"F1-score : {f1_test:.3f}")

print(f"\nConfusion Matrix:")
print(cm_test)
print(f"Classes: {encoder.classes_}")

#comparison with cv results
#opening saved cv results
with open('/kaggle/working/cv_results_trainA.pkl', 'rb') as f:
    cv_results_A = pickle.load(f)

#extracting relevant variables
deep_mean_A = cv_results_A['mean']
deep_std_A = cv_results_A['std']

print(f"\n=== Performance Comparison (Train_A) ===")
print(f"CV Performance:   {deep_mean_A[0]:.3f} ± {deep_std_A[0]:.3f}")  # ← Use train_A CV results
print(f"Test Performance: {acc_test:.3f}")
print(f"Difference:       {acc_test - deep_mean_A[0]:.3f}")


=== Loading Test Data (for Train_A Model) ===
Loaded test data: (3000, 224, 224, 3) (3000,)
Test data prepared: (3000, 3, 224, 224)

=== Evaluating Train_A Model on Test Set ===

=== Test Set Results (Train_A Model - Clean Training) ===
Accuracy : 0.812
Precision: 0.816
Recall   : 0.812
F1-score : 0.814

Confusion Matrix:
[[844  92  64]
 [ 70 830 100]
 [ 52 185 763]]
Classes: ['Boot' 'Sandal' 'Shoe']

=== Performance Comparison (Train_A) ===
CV Performance:   0.890 ± 0.016
Test Performance: 0.812
Difference:       -0.078


Bootstrapping, to get error bars for the test accuracy

In [None]:
#bootstrapping to get error bars for test metrics
#INPUTS:
#y_true - true labels of test set
#y_pred - predicted labels of test set
#n_bootstrap - number of bootstrap iterations to perform
#confidence - to calculate confidence interval, to determine statistical significance
def bootstrap_metrics(y_true, y_pred, n_bootstrap=1000, confidence=0.95):
    n_samples = len(y_true)
    results = {"acc": [], "prec": [], "rec": [], "f1": []}
    labels = np.unique(y_true)

    for _ in range(n_bootstrap):
        idx = np.random.choice(n_samples, n_samples, replace=True)
        y_t, y_p = y_true[idx], y_pred[idx]
        cm = confusion_matrix_manual(y_t, y_p, labels=labels)
        acc, prec, rec, f1 = calc_metrics(cm)
        results["acc"].append(acc)
        results["prec"].append(prec)
        results["rec"].append(rec)
        results["f1"].append(f1)

    alpha = (1 - confidence) / 2
    stats = {}
    for k, v in results.items():
        v = np.array(v)
        stats[k] = (
            np.mean(v),
            np.percentile(v, 100 * alpha),
            np.percentile(v, 100 * (1 - alpha)),
        )
    return stats

In [None]:
#loading relevant paths 
results_dir = "/Users/fungs4/Desktop/Skye/University of Melbourne/Year 1 2025/COMP90051 Statistical Machine Learning/COMP90051 Assignments/COMP90051 Assignment 2/COMP90051_A2/A2_DeepCNN_Results"
data_dir = "/Users/fungs4/Desktop/Skye/University of Melbourne/Year 1 2025/COMP90051 Statistical Machine Learning/COMP90051 Assignments/COMP90051 Assignment 2/augmented_data"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#loading test data
test_data = np.load(os.path.join(data_dir, "test.npz"))
X_test_raw, y_test_raw = test_data["X"], test_data["y"]

#loading train A model (clean images), and making predictions
print("\n=== Loading Train_A Model (Clean Images) ===")

#load encoder
with open(os.path.join(results_dir, 'encoder_trainA.pkl'), 'rb') as f:
    encoder_A = pickle.load(f)

#transform labels
y_test = encoder_A.transform(y_test_raw)

#preprocessing images
X_test = X_test_raw.astype("float32") / 255.0
X_test = np.transpose(X_test, (0, 3, 1, 2))
y_test = y_test.astype("int64")

#loading train A
num_classes = len(encoder_A.classes_)
final_model_A = DeepCNN(num_classes).to(device)

#dummy forward pass to initialize
with torch.no_grad():
    dummy_input = torch.randn(1, 3, 224, 224).to(device)
    _ = final_model_A(dummy_input)

#load weights
final_model_A.load_state_dict(torch.load(os.path.join(results_dir, 'deepcnn_final_clean.pth'), map_location=device))

#predictions for Train_A
test_ds_A = TensorDataset(torch.tensor(X_test), torch.tensor(y_test))
test_dl_A = DataLoader(test_ds_A, batch_size=64, shuffle=False)

final_model_A.eval()
y_pred_test = []

with torch.no_grad():
    for xb, _ in test_dl_A:
        xb = xb.to(device)
        probs = torch.softmax(final_model_A(xb), 1)
        y_pred_test.append(torch.argmax(probs, 1).cpu().numpy())

y_pred_test = np.concatenate(y_pred_test)

#bootstrap for train A
print("\n=== Bootstrap Analysis for Train_A Model ===")

stats_A = bootstrap_metrics(y_test, y_pred_test, n_bootstrap=1000)

#extract metrics
mean_acc_A, lower_acc_A, upper_acc_A = stats_A['acc']
mean_prec_A, lower_prec_A, upper_prec_A = stats_A['prec']
mean_rec_A, lower_rec_A, upper_rec_A = stats_A['rec']
mean_f1_A, lower_f1_A, upper_f1_A = stats_A['f1']

print(f"Test Accuracy:  {mean_acc_A:.4f} ({lower_acc_A:.4f}, {upper_acc_A:.4f})")
print(f"Test Precision: {mean_prec_A:.4f} ({lower_prec_A:.4f}, {upper_prec_A:.4f})")
print(f"Test Recall:    {mean_rec_A:.4f} ({lower_rec_A:.4f}, {upper_rec_A:.4f})")
print(f"Test F1:        {mean_f1_A:.4f} ({lower_f1_A:.4f}, {upper_f1_A:.4f})")

#bootstrap for train B
print("\n=== Bootstrap Analysis for Train_B Model ===")

#load encoder 
with open(os.path.join(results_dir, 'encoder_trainB.pkl'), 'rb') as f:
    encoder_B = pickle.load(f)

#transform labels 
y_test_B = encoder_B.transform(y_test_raw)

#preprocess images
X_test_B = X_test_raw.astype("float32") / 255.0
X_test_B = np.transpose(X_test_B, (0, 3, 1, 2))
y_test_B = y_test_B.astype("int64")

#load train B model
num_classes_B = len(encoder_B.classes_)
final_model_B = DeepCNN(num_classes_B).to(device)

#dummy forward pass to initialize
with torch.no_grad():
    dummy_input = torch.randn(1, 3, 224, 224).to(device)
    _ = final_model_B(dummy_input)

#load weights
final_model_B.load_state_dict(torch.load(os.path.join(results_dir, 'deepcnn_final_B.pth'), map_location=device))

#get  predictions
test_ds_B = TensorDataset(torch.tensor(X_test_B), torch.tensor(y_test_B))
test_dl_B = DataLoader(test_ds_B, batch_size=64, shuffle=False)

final_model_B.eval()
y_pred_test_B = []

with torch.no_grad():
    for xb, _ in test_dl_B:
        xb = xb.to(device)
        probs = torch.softmax(final_model_B(xb), 1)
        y_pred_test_B.append(torch.argmax(probs, 1).cpu().numpy())

y_pred_test_B = np.concatenate(y_pred_test_B)

#bootstrap for Train_B
stats_B = bootstrap_metrics(y_test_B, y_pred_test_B, n_bootstrap=1000)

# Extract metrics
mean_acc_B, lower_acc_B, upper_acc_B = stats_B['acc']
mean_prec_B, lower_prec_B, upper_prec_B = stats_B['prec']
mean_rec_B, lower_rec_B, upper_rec_B = stats_B['rec']
mean_f1_B, lower_f1_B, upper_f1_B = stats_B['f1']

print(f"Test Accuracy:  {mean_acc_B:.4f} ({lower_acc_B:.4f}, {upper_acc_B:.4f})")
print(f"Test Precision: {mean_prec_B:.4f} ({lower_prec_B:.4f}, {upper_prec_B:.4f})")
print(f"Test Recall:    {mean_rec_B:.4f} ({lower_rec_B:.4f}, {upper_rec_B:.4f})")
print(f"Test F1:        {mean_f1_B:.4f} ({lower_f1_B:.4f}, {upper_f1_B:.4f})")


#bootstrap test accuracy error bars comparison (is the test accuracy difference significant?)
print("\n=== Final Comparison ===")
print(f"Train on Clean (A): {mean_acc_A:.4f} ({lower_acc_A:.4f}, {upper_acc_A:.4f})")
print(f"Train on Dirty (B): {mean_acc_B:.4f} ({lower_acc_B:.4f}, {upper_acc_B:.4f})")
print(f"\nDifference: {mean_acc_B - mean_acc_A:.4f}")


=== Loading Train_A Model (Clean Images) ===


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
  final_model_A.load_state_dict(torch.load(os.path.join(results_dir, 'deepcnn_final_clean.pth'), map_location=device))



=== Bootstrap Analysis for Train_A Model ===
Test Accuracy:  0.8124 (0.7987, 0.8263)
Test Precision: 0.8157 (0.8019, 0.8291)
Test Recall:    0.8124 (0.7981, 0.8259)
Test F1:        0.8140 (0.8003, 0.8273)

=== Bootstrap Analysis for Train_B Model ===


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
  final_model_B.load_state_dict(torch.load(os.path.join(results_dir, 'deepcnn_final_B.pth'), map_location=device))


Test Accuracy:  0.8469 (0.8333, 0.8597)
Test Precision: 0.8484 (0.8348, 0.8613)
Test Recall:    0.8469 (0.8335, 0.8596)
Test F1:        0.8477 (0.8340, 0.8601)

=== Final Comparison ===
Train on Clean (A): 0.8124 (0.7987, 0.8263)
Train on Dirty (B): 0.8469 (0.8333, 0.8597)

Difference: 0.0344
