In [1]:
import os
import pandas as pd
from PIL import Image
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, cohen_kappa_score
import torch.nn.functional as F
import random

In [2]:
# Seed every random number generator used in this notebook with the same value
# so that a fresh kernel starts from an identical RNG state.
for _seed_fn in (torch.manual_seed, torch.cuda.manual_seed_all, np.random.seed, random.seed):
    _seed_fn(0)

# Turn the cuDNN autotuner off and request deterministic cuDNN algorithms.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True


## Task 3. Attention Mechanisms

The following sections include utility functions and two modular attention components:

- **Task 3.1 — Squeeze-and-Excitation (SE):**  
  A lightweight channel-wise attention module that adaptively recalibrates feature maps via global pooling and two fully-connected layers.

- **Task 3.2 — Multi-Head Attention (MHA):**  
  A transformer-style attention block that splits feature channels into multiple heads to capture diverse relationships.

Both mechanisms are implemented as standalone PyTorch modules and can be attached to the backbone model before training.


In [3]:
# ========================
# Dataset preparation
# ========================
class RetinaMultiLabelDataset(Dataset):
    """Labelled image dataset described by a CSV file.

    The first CSV column is used as the image file name (joined onto
    ``image_dir``); all remaining columns are converted to float32 and
    returned as the label vector of that image.
    """

    def __init__(self, csv_file, image_dir, transform=None):
        self.data = pd.read_csv(csv_file)
        self.image_dir = image_dir
        self.transform = transform

    def __len__(self):
        # One sample per CSV row.
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Return ``(image, labels)`` for row ``idx``."""
        record = self.data.iloc[idx]
        image = Image.open(os.path.join(self.image_dir, record.iloc[0])).convert("RGB")
        # Everything after the file-name column is a label.
        target = torch.tensor(record.iloc[1:].values.astype("float32"))
        if self.transform:
            image = self.transform(image)
        return image, target

class RetinaTestDataset(Dataset):
    """Unlabelled image dataset: yields ``(image, image_id)`` pairs.

    Only the first CSV column is read; its values serve both as the returned
    identifier and as the file name joined onto ``image_dir``.
    """

    def __init__(self, csv_file, image_dir, transform=None):
        self.ids = pd.read_csv(csv_file).iloc[:, 0].values  # first column -> id/ID
        self.image_dir = image_dir
        self.transform = transform

    def __len__(self):
        return self.ids.shape[0]

    def __getitem__(self, idx):
        """Return the (optionally transformed) RGB image and its identifier."""
        image_id = self.ids[idx]
        image = Image.open(os.path.join(self.image_dir, image_id)).convert("RGB")
        if self.transform:
            image = self.transform(image)
        return image, image_id


In [4]:
# ========================
# build model
# ========================
from torchvision.models import resnet18, ResNet18_Weights
from torchvision.models import efficientnet_b0, EfficientNet_B0_Weights

def build_model(backbone="resnet18", num_classes=3, pretrained=True):
    """Create a backbone whose final linear layer outputs ``num_classes`` logits.

    Args:
        backbone: ``"resnet18"`` or ``"efficientnet"`` (EfficientNet-B0).
        num_classes: output width of the replaced classification layer.
        pretrained: when true, start from the ``IMAGENET1K_V1`` weights;
            otherwise no weights are loaded.

    Raises:
        ValueError: if ``backbone`` is neither of the supported names.
    """
    if backbone == "resnet18":
        net = resnet18(weights=ResNet18_Weights.IMAGENET1K_V1 if pretrained else None)
        net.fc = nn.Linear(net.fc.in_features, num_classes)
        return net

    if backbone == "efficientnet":
        net = efficientnet_b0(weights=EfficientNet_B0_Weights.IMAGENET1K_V1 if pretrained else None)
        # classifier[1] is the linear layer that gets swapped for the new head.
        head_in = net.classifier[1].in_features
        net.classifier[1] = nn.Linear(head_in, num_classes)
        return net

    raise ValueError("Unsupported backbone")


In [5]:
# FocalLoss
class FocalLoss(nn.Module):
    """Focal loss for multi-label classification on raw logits.

    Each element's binary cross-entropy is scaled by ``(1 - p_t) ** gamma``,
    where ``p_t`` is the predicted probability of the true label, and
    optionally by an alpha weight.

    Args:
        alpha: ``None`` (no class weighting), a scalar, or a per-class
            sequence / array / tensor. Positives are weighted by ``alpha`` and
            negatives by ``1 - alpha``.
        gamma: focusing exponent.
        reduction: ``"mean"``, ``"sum"``; any other value returns the
            unreduced ``[B, C]`` loss.
    """

    def __init__(self, alpha=None, gamma=2.0, reduction="mean"):
        super().__init__()
        self.gamma = gamma
        self.reduction = reduction

        if alpha is None:
            self.alpha = None
        elif torch.is_tensor(alpha):
            # torch.tensor(<tensor>) emits a copy-construct UserWarning;
            # clone/detach gives the same independent float32 copy silently.
            self.alpha = alpha.detach().clone().to(dtype=torch.float32)
        else:
            self.alpha = torch.tensor(alpha, dtype=torch.float32)

    def forward(self, logits, targets):
        """
        logits: [B, C] raw model outputs
        targets: [B, C] in {0,1}
        """
        bce_loss = F.binary_cross_entropy_with_logits(
            logits, targets, reduction="none"
        )
        probs = torch.sigmoid(logits)
        # probability the model assigns to the true label of each element
        p_t = probs * targets + (1 - probs) * (1 - targets)

        if self.alpha is not None:
            alpha = self.alpha.to(logits.device)
            # broadcast alpha if it's per-class
            if alpha.dim() == 1:
                alpha = alpha.view(1, -1)  # [1, C]
            alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
        else:
            alpha_t = 1.0

        # focal modulation: down-weight elements that are already well classified
        focal_factor = (1.0 - p_t) ** self.gamma
        loss = alpha_t * focal_factor * bce_loss  # [B, C]

        if self.reduction == "mean":
            return loss.mean()
        elif self.reduction == "sum":
            return loss.sum()
        else:
            return loss  # [B, C]


In [6]:
# ClassBalancedBCELoss
def compute_class_frequency_weights_from_csv(train_csv_path, num_classes=3):
    """Derive per-class loss weights from the label frequencies in a CSV.

    Columns ``1 .. num_classes`` (the first column is skipped as the ID) are
    summed to get positive counts. The weight of a class is the inverse of its
    positive frequency, rescaled so that the weights average to 1.

    Returns:
        float32 tensor of length ``num_classes``.
    """
    eps = 1e-6  # guards both divisions against a zero denominator
    table = pd.read_csv(train_csv_path)

    label_block = table[table.columns[1 : 1 + num_classes]]  # skip ID
    positives = label_block.sum(axis=0).values.astype(np.float32)

    # positive frequency per class
    positive_rate = positives / (len(table) + eps)

    # inverse frequency, normalised to mean 1
    weights = 1.0 / (positive_rate + eps)
    weights = weights / weights.mean()

    return torch.tensor(weights, dtype=torch.float32)
class ClassBalancedBCELoss(nn.Module):
    """Binary cross-entropy on logits with one fixed weight per class.

    Args:
        class_weights: 1-D tensor with one weight per class; it is moved to
            the logits' device on every call and broadcast over the batch.
        reduction: ``"mean"``, ``"sum"``; any other value returns the
            unreduced loss.
    """

    def __init__(self, class_weights, reduction="mean"):
        super().__init__()
        self.reduction = reduction
        self.class_weights = class_weights

    def forward(self, logits, targets):
        """
        logits: [B, C]
        targets: [B, C]
        """
        per_class_w = self.class_weights.to(logits.device).view(1, -1)
        elementwise = F.binary_cross_entropy_with_logits(
            logits, targets, reduction="none"
        )
        weighted = elementwise * per_class_w

        if self.reduction == "mean":
            return weighted.mean()
        if self.reduction == "sum":
            return weighted.sum()
        return weighted


In [7]:
# Squeeze-and-Excitation
class SEBlock(nn.Module):
    """
    Squeeze-and-Excitation for 2D feature maps: (B, C, H, W) -> (B, C, H, W)

    The input is globally average-pooled to one value per channel, passed
    through a two-layer bottleneck ending in a sigmoid, and the resulting
    per-channel gate rescales the input.

    Args:
        channels: number of channels C of the input feature map.
        reduction: bottleneck ratio; the hidden width is
            ``channels // reduction``, clamped to at least 1.
    """
    def __init__(self, channels, reduction=16):
        super().__init__()
        # Without the clamp, channels < reduction yields a zero-width hidden
        # layer, which makes the gate a constant 0.5 for every channel.
        hidden = max(1, channels // reduction)
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Sequential(
            nn.Linear(channels, hidden, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(hidden, channels, bias=False),
            nn.Sigmoid(),
        )

    def forward(self, x):
        b, c, _, _ = x.size()
        y = self.avg_pool(x).view(b, c)      # squeeze: (B, C)
        y = self.fc(y).view(b, c, 1, 1)      # excite:  (B, C, 1, 1)
        return x * y                         # channel-wise rescale
        
# Multi-Head Attention
class MHABlock(nn.Module):
    """
    Multi-head self-attention followed by a residual connection and LayerNorm.

    Input:  x of shape (B, N, C), where C == embed_dim
            (the wrappers in this file pass N = number of spatial locations).
    Output: tensor of the same shape.
    """
    def __init__(self, embed_dim, num_heads=4):
        super().__init__()
        self.mha = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)
        self.norm = nn.LayerNorm(embed_dim)

    def forward(self, x):
        # Query, key and value are all x (self-attention); the attention
        # weights returned as the second element are not needed.
        attended = self.mha(x, x, x)[0]
        # Residual add, then normalise over the embedding dimension.
        return self.norm(x + attended)


In [8]:
# Wrap model and insert attention to it
class ResNetWithAttention(nn.Module):
    """
    ResNet18-style wrapper that applies SE or MHA to the layer4 feature map.

    The wrapped model's stem, four stages, pooling layer and ``fc`` head are
    reused under their original attribute names. ``attention`` selects the
    inserted module: ``"se"``, ``"mha"``, or anything else for no attention.
    """
    def __init__(self, base_model, attention="se", num_heads=4):
        super().__init__()
        self.attention = attention

        # Re-register the ResNet sub-modules under the same names and order.
        for part in ("conv1", "bn1", "relu", "maxpool",
                     "layer1", "layer2", "layer3", "layer4",
                     "avgpool", "fc"):
            setattr(self, part, getattr(base_model, part))

        # Channel count after layer4, read from the last block's conv2.
        channels = self.layer4[-1].conv2.out_channels

        if attention == "se":
            self.attn = SEBlock(channels)
        elif attention == "mha":
            self.attn = MHABlock(embed_dim=channels, num_heads=num_heads)
        else:
            self.attn = None

    def forward(self, x):
        stages = (self.conv1, self.bn1, self.relu, self.maxpool,
                  self.layer1, self.layer2, self.layer3, self.layer4)
        for stage in stages:
            x = stage(x)              # ends as (B, C, H, W)

        if self.attn is not None:
            if self.attention == "se":
                x = self.attn(x)
            else:
                # MHA: treat each spatial location as a token of dimension C.
                b, c, h, w = x.shape
                tokens = x.view(b, c, h * w).transpose(1, 2)   # (B, H*W, C)
                tokens = self.attn(tokens)
                x = tokens.transpose(1, 2).view(b, c, h, w)    # back to (B, C, H, W)

        pooled = torch.flatten(self.avgpool(x), 1)
        return self.fc(pooled)


class EfficientNetWithAttention(nn.Module):
    """
    Wraps an EfficientNet-like model and inserts SE or MHA after features.

    The wrapped model must expose ``features`` and ``classifier``; its
    ``avgpool`` is reused when present, otherwise global average pooling is
    used. ``attention`` selects the inserted module: ``"se"``, ``"mha"``, or
    anything else for no attention.

    Raises:
        ValueError: if ``classifier`` is an ``nn.Sequential`` that contains no
            ``nn.Linear`` layer, so the channel count cannot be determined.
    """
    def __init__(self, base_model, attention="se", num_heads=4):
        super().__init__()
        self.attention = attention

        self.features = base_model.features
        # Some efficientnets have .avgpool, else use AdaptiveAvgPool2d(1)
        self.avgpool = getattr(base_model, "avgpool", nn.AdaptiveAvgPool2d(1))
        self.classifier = base_model.classifier

        # The first Linear layer's input width equals the number of feature
        # channels, because the pooled map is flattened straight into it.
        if isinstance(self.classifier, nn.Sequential):
            first_linear = next(
                (m for m in self.classifier.modules() if isinstance(m, nn.Linear)),
                None,
            )
            if first_linear is None:
                # Previously this surfaced as an UnboundLocalError on in_features.
                raise ValueError(
                    "Cannot infer feature channels: classifier contains no nn.Linear layer"
                )
            in_features = first_linear.in_features
        else:
            in_features = self.classifier.in_features

        channels = in_features  # after global pooling

        if attention == "se":
            self.attn = SEBlock(channels)
        elif attention == "mha":
            self.attn = MHABlock(embed_dim=channels, num_heads=num_heads)
        else:
            self.attn = None

    def forward(self, x):
        x = self.features(x)              # (B, C, H, W)

        # SE/MHA operate on the spatial feature map, before pooling.
        if self.attn is not None:
            if self.attention == "se":
                # Apply SE in 2D form
                x = self.attn(x)
            else:
                # MHA over spatial tokens: (B, C, H, W) -> (B, H*W, C) and back
                b, c, h, w = x.shape
                x_flat = x.view(b, c, h * w).permute(0, 2, 1)
                x_flat = self.attn(x_flat)
                x = x_flat.permute(0, 2, 1).view(b, c, h, w)

        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x


In [9]:
# Small helper to attach attention to a backbone
def add_attention(model, backbone, attention="none", num_heads=4):
    """Wrap ``model`` with the attention wrapper that matches ``backbone``.

    ``attention`` of ``None`` or ``"none"`` returns ``model`` unchanged.
    Otherwise ``"resnet18"`` is wrapped in ``ResNetWithAttention`` and
    ``"efficientnet"`` in ``EfficientNetWithAttention``.

    Raises:
        ValueError: if attention is requested for any other backbone.
    """
    wants_attention = attention is not None and attention != "none"
    if not wants_attention:
        return model

    if backbone == "resnet18":
        wrapper_cls = ResNetWithAttention
    elif backbone == "efficientnet":
        wrapper_cls = EfficientNetWithAttention
    else:
        raise ValueError(f"Attention wrapper not implemented for backbone: {backbone}")

    return wrapper_cls(model, attention=attention, num_heads=num_heads)


In [10]:
# ========================
# model training and val
# ========================
def _run_validation(model, loader, criterion, device):
    """Evaluate ``model`` on ``loader`` in eval mode without gradient tracking.

    Returns:
        (mean_loss, probs, labels): the sample-weighted mean of ``criterion``
        over the loader's dataset, and arrays holding the sigmoid
        probabilities and the labels of every sample, in loader order.
    """
    model.eval()
    running_loss = 0.0
    probs_all, labels_all = [], []

    with torch.no_grad():
        for imgs, labels in loader:
            imgs, labels = imgs.to(device), labels.to(device)
            outputs = model(imgs)
            running_loss += criterion(outputs, labels).item() * imgs.size(0)
            probs_all.extend(torch.sigmoid(outputs).cpu().numpy())
            labels_all.extend(labels.cpu().numpy())

    mean_loss = running_loss / len(loader.dataset)
    return mean_loss, np.array(probs_all), np.array(labels_all)


def train_one_backbone(
    backbone,
    train_csv,
    val_csv,
    test_csv,
    train_image_dir,
    val_image_dir,
    test_image_dir,
    epochs=10,
    batch_size=32,
    lr=1e-4,
    img_size=256,
    save_dir="checkpoints",
    pretrained_backbone=None,
    task="full_ft",
    loss="bce",  # "bce", "focal", "cb"
    alpha=None,
    gamma=2.0,
    attention="none",  # "none", "se", "mha"
    num_heads=4,       # number of heads for MHA
):
    """Train (or just evaluate) one backbone, optionally with attention.

    Args:
        backbone: ``"resnet18"`` or ``"efficientnet"``.
        train_csv, val_csv, test_csv: labelled CSV files for the three splits.
        train_image_dir, val_image_dir, test_image_dir: matching image folders.
        epochs, batch_size, lr, img_size: training hyperparameters.
        save_dir: folder for the checkpoint (created if missing).
        pretrained_backbone: optional state-dict path loaded with
            ``strict=False`` before training.
        task: ``"no_finetune"`` (no training), ``"cls_only"`` (only the
            classifier head is trainable) or anything else for full fine-tuning.
        loss: ``"bce"``, ``"focal"`` or ``"cb"``.
        alpha, gamma: focal-loss parameters (used only when ``loss="focal"``).
        attention: ``"none"``/``None``, ``"se"`` or ``"mha"``.
        num_heads: number of heads for MHA.

    Returns:
        ``(ckpt_path, y_true, y_pred, val_probs_all, val_labels_all)``:
        ``y_true``/``y_pred`` are test-split labels and 0.5-thresholded
        predictions, and the validation arrays are computed with the saved
        checkpoint, i.e. the same weights that ``ckpt_path`` points to.

    Raises:
        ValueError: for an unknown ``loss`` (or backbone, via ``build_model``).
    """
    # add_attention() treats None like "none"; normalise once so the
    # checkpoint-name lookup below accepts it as well.
    if attention is None:
        attention = "none"

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)

    task_name_map = {
        "no_finetune": "Task1.1 No fine-tuning",
        "cls_only": "Task1.2 Frozen backbone, classifier only",
        "full_ft": "Task1.3 Full fine-tuning",
    }

    print("===========================================")
    print(f"Task 3 |  Backbone: {backbone} | loss: {loss} | attention: {attention}")
    print("===========================================")

    # transforms
    transform = transforms.Compose(
        [
            transforms.Resize((img_size, img_size)),
            transforms.ToTensor(),
            transforms.Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225],
            ),
        ]
    )

    # datasets & dataloaders
    train_ds = RetinaMultiLabelDataset(train_csv, train_image_dir, transform)
    val_ds   = RetinaMultiLabelDataset(val_csv,   val_image_dir,   transform)
    test_ds  = RetinaMultiLabelDataset(test_csv,  test_image_dir,  transform)

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True,  num_workers=0)
    val_loader   = DataLoader(val_ds,   batch_size=batch_size, shuffle=False, num_workers=0)
    test_loader  = DataLoader(test_ds,  batch_size=batch_size, shuffle=False, num_workers=0)

    # model
    model = build_model(backbone, num_classes=3, pretrained=False)

    # plug in attention
    model = add_attention(model, backbone=backbone, attention=attention, num_heads=num_heads)
    model = model.to(device)

    if pretrained_backbone is not None:
        state_dict = torch.load(pretrained_backbone, map_location="cpu")
        has_attn = any(k.startswith("attn.") for k in state_dict.keys())

        # strict=False in both cases: a plain-backbone checkpoint (Task 1/2)
        # has no "attn.*" entries, so those keep their fresh initialisation.
        load_result = model.load_state_dict(state_dict, strict=False)
        if has_attn:
            print(f"Loaded attention checkpoint from {pretrained_backbone}")
        else:
            print(f"Loaded non-attention backbone checkpoint from {pretrained_backbone}")

        # strict=False hides mismatches; report anything other than the
        # expected missing attention weights.
        missing_non_attn = [k for k in load_result.missing_keys if not k.startswith("attn.")]
        if missing_non_attn or load_result.unexpected_keys:
            print(
                f"WARNING: checkpoint mismatch - {len(missing_non_attn)} missing "
                f"non-attention keys, {len(load_result.unexpected_keys)} unexpected keys"
            )

    # set which parameters are trainable
    if task == "no_finetune":
        for p in model.parameters():
            p.requires_grad = False
        optimizer = None

    elif task == "cls_only":
        # freeze everything
        for p in model.parameters():
            p.requires_grad = False

        # unfreeze classifier only (attention stays frozen in this task)
        if backbone == "resnet18":
            for p in model.fc.parameters():
                p.requires_grad = True
        elif backbone == "efficientnet":
            for p in model.classifier.parameters():
                p.requires_grad = True

        optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr)

    else:  # full_ft
        for p in model.parameters():
            p.requires_grad = True
        optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr)

    # ----- choose loss function -----
    if loss == "bce":
        criterion = nn.BCEWithLogitsLoss()
    elif loss == "focal":
        criterion = FocalLoss(alpha=alpha, gamma=gamma, reduction="mean")
    elif loss == "cb":
        class_weights = compute_class_frequency_weights_from_csv(train_csv, num_classes=3)
        criterion = ClassBalancedBCELoss(class_weights=class_weights, reduction="mean")
    else:
        raise ValueError(f"Unknown loss_type: {loss}")

    # checkpoint path (depends on backbone + attention only, so runs that
    # share both overwrite each other's checkpoint)
    os.makedirs(save_dir, exist_ok=True)
    attention_name_map = {
        "none": "task3.0 No Attention",
        "se":   "task3_1",
        "mha":  "task3_2",
    }

    task_prefix = attention_name_map[attention]
    ckpt_path = os.path.join(save_dir, f"csu_{task_prefix}_{backbone}_{attention}.pt")

    # ========= TRAINING  =========
    if task != "no_finetune":
        best_val_loss = float("inf")
        checkpoint_written = False

        for epoch in range(epochs):
            model.train()
            train_loss = 0.0
            for imgs, labels in train_loader:
                imgs, labels = imgs.to(device), labels.to(device)
                optimizer.zero_grad()
                outputs = model(imgs)
                loss_value = criterion(outputs, labels)
                loss_value.backward()
                optimizer.step()
                train_loss += loss_value.item() * imgs.size(0)

            train_loss /= len(train_loader.dataset)

            # validation (only the loss is needed for model selection)
            val_loss, _, _ = _run_validation(model, val_loader, criterion, device)

            print(f"[{backbone} | {attention}] Epoch {epoch+1}/{epochs} "
                  f"Train Loss: {train_loss:.4f} Val Loss: {val_loss:.4f}")

            # save best
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                torch.save(model.state_dict(), ckpt_path)
                checkpoint_written = True
                print(f"Saved best model for {backbone} ({task}, {attention}) at {ckpt_path}")

        if not checkpoint_written:
            # epochs == 0 or a validation loss that never improved (e.g. NaN):
            # save the current weights so the reload below cannot pick up a
            # stale file from an earlier run or fail on a missing one.
            torch.save(model.state_dict(), ckpt_path)
            print(f"WARNING: no epoch improved the validation loss; saved current weights at {ckpt_path}")
    else:
        torch.save(model.state_dict(), ckpt_path)
        print(f"[{backbone}] {task_name_map[task]}: no training, model saved at {ckpt_path}")

    # ========= OFFSITE TEST EVALUATION =========
    model.load_state_dict(torch.load(ckpt_path, map_location=device))
    model.to(device)
    model.eval()

    # Validation outputs are recomputed with the reloaded checkpoint so that
    # thresholds tuned on them belong to the model stored at ckpt_path (and so
    # that they exist when no training epoch ran).
    _, val_probs_all, val_labels_all = _run_validation(model, val_loader, criterion, device)

    y_true, y_pred = [], []

    with torch.no_grad():
        for imgs, labels in test_loader:
            imgs = imgs.to(device)
            outputs = model(imgs)
            probs = torch.sigmoid(outputs).cpu().numpy()
            preds = (probs > 0.5).astype(int)
            y_true.extend(labels.numpy())
            y_pred.extend(preds)

    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    return ckpt_path, y_true, y_pred, val_probs_all, val_labels_all


In [11]:
def generate_kaggle_submission(
    backbone,
    ckpt_path,
    onsite_csv,
    onsite_image_dir,
    img_size=256,
    batch_size=32,
    out_csv="submission.csv",
    threshold=0.5,
    best=False,
    attention="none",   #  "none", "se", "mha"
    num_heads=4,        #  for MHA
):
    """Predict the onsite images and write a submission CSV.

    The model is rebuilt with the same backbone/attention settings and loaded
    strictly from ``ckpt_path``. Predictions are binarised with ``threshold``
    and stored in the template's ``D``, ``G`` and ``A`` columns; the file is
    written to ``submission/<out_csv>`` through a temporary file.

    Args:
        threshold: cut-off applied with ``>=``. With ``best=True`` it is
            converted to a float array first and may be a scalar or one value
            per class.
        best: selects the per-class-capable threshold handling.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    preprocess = transforms.Compose(
        [
            transforms.Resize((img_size, img_size)),
            transforms.ToTensor(),
            transforms.Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225],
            ),
        ]
    )

    # --- rebuild the training architecture, then load the weights strictly ---
    model = add_attention(
        build_model(backbone, num_classes=3, pretrained=False),
        backbone=backbone,
        attention=attention,
        num_heads=num_heads,
    ).to(device)
    model.load_state_dict(torch.load(ckpt_path, map_location=device))
    model.eval()

    # The submission template supplies the IDs and the output layout.
    template = pd.read_csv(onsite_csv)
    id_col_name = template.columns[0]

    onsite_ds = RetinaTestDataset(onsite_csv, onsite_image_dir, preprocess)
    onsite_loader = DataLoader(onsite_ds, batch_size=batch_size, shuffle=False, num_workers=0)

    seen_ids = []
    batch_probs = []
    with torch.no_grad():
        for imgs, img_ids in onsite_loader:
            logits = model(imgs.to(device))
            batch_probs.append(torch.sigmoid(logits).cpu().numpy())
            seen_ids.extend(img_ids)

    probs_all = np.concatenate(batch_probs, axis=0)
    seen_ids = np.array(seen_ids)
    template_ids = template[id_col_name].values

    # Fall back to an explicit re-ordering if the loader order ever differs
    # from the template order.
    if not np.array_equal(template_ids, seen_ids):
        print("WARNING: IDs in template and predictions do not match exactly in order!")
        position_of = {image_id: i for i, image_id in enumerate(seen_ids)}
        probs_all = probs_all[[position_of[x] for x in template_ids], :]

    # convert probabilities to 0/1 labels using threshold
    if best:
        thr = np.array(threshold, dtype=float)
        # scalar threshold is used as is; per-class thresholds become a row
        cutoff = thr if thr.ndim == 0 else thr.reshape(1, -1)
        bin_preds = (probs_all >= cutoff).astype(int)
    else:
        bin_preds = (probs_all >= threshold).astype(int)

    for col, label_name in enumerate(("D", "G", "A")):
        template[label_name] = bin_preds[:, col]

    out_dir = "submission"
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, out_csv)
    # Write to a temporary name first, then swap it into place.
    tmp_path = out_path + ".tmp"
    template.to_csv(tmp_path, index=False)
    os.replace(tmp_path, out_path)

    print(f"Kaggle submission saved to: {out_path}")


In [12]:
def evaluating_metrics(y_true, y_pred, backbone, task_name, split_name):
    """Compute per-disease metrics and the average F1 as a DataFrame.

    ``y_true`` and ``y_pred`` are indexed as ``[:, i]`` for the three diseases
    DR, Glaucoma and AMD (in that column order). The result has one row per
    disease plus a final "Average F1" row whose other metric cells are None.
    """
    print(f"\n{split_name.upper()} test results for {backbone} - {task_name}")

    # Columns shared by every row of the result table.
    run_info = {"Backbone": backbone, "Task": task_name, "Split": split_name}

    rows = []
    for col, disease in enumerate(("DR", "Glaucoma", "AMD")):
        truth = y_true[:, col]
        pred = y_pred[:, col]
        rows.append({
            **run_info,
            "Disease": disease,
            "Accuracy": accuracy_score(truth, pred),
            "Precision": precision_score(truth, pred, zero_division=0),
            "Recall": recall_score(truth, pred, zero_division=0),
            "F1-score": f1_score(truth, pred, zero_division=0),
            "Kappa": cohen_kappa_score(truth, pred),
        })

    per_disease_f1 = [row["F1-score"] for row in rows]
    avg_f1 = sum(per_disease_f1) / len(per_disease_f1)
    print(f"Average F1 over 3 diseases ({split_name}): {avg_f1:.4f}\n")

    # Summary row: only the F1 column carries a value.
    rows.append({
        **run_info,
        "Disease": "Average F1",
        "Accuracy": None,
        "Precision": None,
        "Recall": None,
        "F1-score": avg_f1,
        "Kappa": None,
    })

    return pd.DataFrame(rows)

## Configuration

This section defines all dataset paths, pretrained backbone locations, and key training hyperparameters. Adjust these values to match your local setup before running the experiments. The class frequency–based alpha vector is also computed here for optional use in loss functions. In the version I ran, this vector was not used, but you may enable it if you wish to experiment with it.



In [13]:
# Configuration (edit paths here)


# Labeled splits: CSV files read by RetinaMultiLabelDataset
# (first column = image file name, remaining columns = labels)
train_csv = "train.csv"
val_csv = "val.csv"
offsite_test_csv = "offsite_test.csv"

# Image folders matching the three labeled splits
train_img_dir = "./images/train"
val_img_dir = "./images/val"
offsite_img_dir = "./images/offsite_test"

# unlabeled onsite test (for kaggle submission)
onsite_csv = "onsite_test_submission.csv"
onsite_img_dir = "./images/onsite_test"

# optional: your own pretrained backbones
pretrained_resnet18 = "./pretrained_backbone/ckpt_resnet18_ep50.pt"
pretrained_efficient = "./pretrained_backbone/ckpt_efficientnet_ep50.pt"

# Default hyperparameters; individual experiment cells may pass other values
img_size = 256
epochs = 20
batch_size = 32
lr = 1e-4
save_dir = "checkpoints"

# per-class alpha based on disease frequency
# NOTE: this reads train_csv as soon as the cell runs, so the file must exist.
df = pd.read_csv(train_csv)
label_cols = df.columns[1:4]  # the three columns after the ID column
pos_counts = df[label_cols].sum(axis=0).values.astype(np.float32)
total = len(df)
freq = pos_counts / (total + 1e-6)  # positive frequency per class
# alpha = 1 - frequency, rescaled so the largest entry equals 1
alpha_vec = 1.0 - freq
alpha_vec = alpha_vec / alpha_vec.max()

In [14]:
def find_best_thresholds(probs_val, y_val, disease_names=("DR","Glaucoma","AMD")):
    """Pick, per class, the threshold that maximises validation F1.

    For every column, 81 evenly spaced thresholds in [0.1, 0.9] are tried with
    the rule ``prob > t``; the first threshold reaching the highest F1 wins.

    Returns:
        array with one threshold per entry of ``disease_names``.
    """
    grid = np.linspace(0.1, 0.9, 81)
    chosen = []
    for col, name in enumerate(disease_names):
        scores = [
            f1_score(y_val[:, col], (probs_val[:, col] > t).astype(int), zero_division=0)
            for t in grid
        ]
        top = int(np.argmax(scores))  # first index of the maximum
        print(f"{name}: best thr={grid[top]:.2f}, val F1={scores[top]:.4f}")
        chosen.append(grid[top])
    return np.array(chosen)


## Training

This section includes all training runs used to generate the results presented in the report and submitted to Kaggle. The preliminary results and metrics shown here were used directly in the report, and the hyperparameters in the code match the exact configurations of the final experiments. Because training is stochastic, your results may not match mine exactly, but repeated runs should produce similar outcomes.


Each task is organized into its own code block. For example:

- `# Task 3.1 resnet18`  

A separate block in each subtask includes the large-scale hyperparameter search used to identify the best-performing settings. These experiments are commented out because they require substantial compute time, but the preliminary results are shown for reference.

After running multiple trials, the best-performing configuration was selected for the final model and Kaggle submission. Your single run may not match the final score exactly, but repeated runs should produce similar performance, although the number of repeated runs required may be large. You also need enough VRAM, because some of the configurations used batch_size = 128.


In [23]:
# Task 3.1, ResNet18 + Squeeze-and-Excitation: grid search over learning rate and batch size.
# The whole search is wrapped in a triple-quoted string, so nothing in this cell executes;
# the output shown below the cell is preserved from an earlier run. Remove the quotes to re-run it.
# NOTE(review): configurations are ranked by the OFFSITE average F1, i.e. model selection
# uses the offsite test split.
# NOTE(review): the task_name label contains "cb", but no loss= argument is passed to
# train_one_backbone, so its default loss applies.
"""import gc

backbone = "resnet18"
checkpoint = "./checkpoints/task1/csu_task1_2_resnet18.pt"  # starting checkpoint for full_ft

if backbone == "resnet18":
    pretrained_path = pretrained_resnet18
elif backbone == "efficientnet":
    pretrained_path = pretrained_efficient
else:
    raise ValueError("unknown backbone")

# -------------------------
# Hyperparameter grids
# -------------------------
lrs = [
    1e-3, 9e-4, 8e-4, 7e-4, 6e-4, 5e-4, 4e-4, 3e-4, 2e-4, 1e-4,
    9e-5, 8e-5, 7e-5, 6e-5, 5e-5, 4e-5, 3e-5, 2e-5, 1e-5
]
batch_sizes = [64,32]

results = []  # will store dicts with ckpt_path, avg_f1, and hyperparams

exp_id = 0

for lr in lrs:
    for batch_size in batch_sizes:
        exp_id += 1
        print("=" * 60)
        print(f"Experiment {exp_id}: lr={lr}, batch_size={batch_size}")
        print("=" * 60)

        # ---- Train with given hyperparameters ----
        ckpt_path, y_true_offsite, y_pred_offsite, val_probs, val_labels = train_one_backbone(
            backbone=backbone,
            train_csv=train_csv,
            val_csv=val_csv,
            test_csv=offsite_test_csv,
            train_image_dir=train_img_dir,
            val_image_dir=val_img_dir,
            test_image_dir=offsite_img_dir,
            epochs=12,
            batch_size=batch_size,
            lr=lr,
            img_size=img_size,
            save_dir=save_dir,
            pretrained_backbone=checkpoint,  # or pretrained_path if you prefer
            task="full_ft",
            attention="se",
        )

        # ---- Compute validation-based thresholds (for later use on onsite) ----
        best_thrs = find_best_thresholds(val_probs, val_labels)

        # ---- Compute OFFSITE metrics (used to rank models) ----
        df_off = evaluating_metrics(
            y_true=y_true_offsite,
            y_pred=y_pred_offsite,
            backbone=backbone,
            task_name=f"full_ft_cb_lr{lr}_bs{batch_size}",
            split_name="offsite",
        )

        # Extract average F1 row (assuming your df has 'Disease' == 'Average F1')
        try:
            avg_f1 = df_off.loc[df_off["Disease"] == "Average F1", "F1-score"].values[0]
        except Exception as e:
            print("WARNING: Could not extract average F1 from df_off, defaulting to 0.0")
            print("Error:", e)
            avg_f1 = 0.0

        print(f"OFFSITE Average F1 for this config: {avg_f1:.4f}")

        # ---- Store result ----
        results.append(
            {
                "ckpt_path": ckpt_path,
                "avg_f1": float(avg_f1),
                "lr": lr,
                "batch_size": batch_size,
                "best_thrs": best_thrs,
            }
        )

        # ---- Clean up GPU memory ----
        del y_true_offsite, y_pred_offsite, val_probs, val_labels
        gc.collect()
        torch.cuda.empty_cache()

# -------------------------
# Select top 3 configurations
# -------------------------
results_sorted = sorted(results, key=lambda x: x["avg_f1"], reverse=True)
top3 = results_sorted[:3]

print("\n" + "#" * 60)
print("TOP 3 CONFIGURATIONS (by OFFSITE average F1)")
print("#" * 60)

for rank, r in enumerate(top3, start=1):
    print(
        f"Rank {rank}: avg_F1={r['avg_f1']:.4f}, "
        f"lr={r['lr']}, batch_size={r['batch_size']}, "
        f"ckpt={r['ckpt_path']}"
    )
"""


Experiment 1: lr=0.001, batch_size=64
cuda
Task 3 |  Backbone: resnet18 | loss: bce | attention: se
Loaded non-attention backbone checkpoint from ./pretrained_backbone/ckpt_resnet18_ep50.pt
[resnet18 | se] Epoch 1/12 Train Loss: 0.4755 Val Loss: 6.6253
Saved best model for resnet18 (full_ft, se) at checkpoints\csu_Task3_1 _resnet18_se.pt
[resnet18 | se] Epoch 2/12 Train Loss: 0.3473 Val Loss: 0.5129
Saved best model for resnet18 (full_ft, se) at checkpoints\csu_Task3_1 _resnet18_se.pt
[resnet18 | se] Epoch 3/12 Train Loss: 0.2760 Val Loss: 0.5552
[resnet18 | se] Epoch 4/12 Train Loss: 0.1915 Val Loss: 0.7069
[resnet18 | se] Epoch 5/12 Train Loss: 0.1375 Val Loss: 0.6833
[resnet18 | se] Epoch 6/12 Train Loss: 0.0991 Val Loss: 0.9836
[resnet18 | se] Epoch 7/12 Train Loss: 0.0999 Val Loss: 0.8041
[resnet18 | se] Epoch 8/12 Train Loss: 0.0850 Val Loss: 0.6488
[resnet18 | se] Epoch 9/12 Train Loss: 0.0802 Val Loss: 0.6026
[resnet18 | se] Epoch 10/12 Train Loss: 0.0760 Val Loss: 0.8916
[resn

In [15]:
# Task 3.1  resnet18

backbone = "resnet18"

# Resolve the pretrained-weights path for the chosen backbone.
_pretrained_by_backbone = {
    "resnet18": pretrained_resnet18,
    "efficientnet": pretrained_efficient,
}
if backbone not in _pretrained_by_backbone:
    raise ValueError("unknown backbone")
pretrained_path = _pretrained_by_backbone[backbone]

# Starting point for full fine-tuning: the Task 1.2 checkpoint.
checkpoint = "./checkpoints/task1/csu_task1_2_resnet18.pt"

# offsite
# Retrain until the offsite average F1 reaches 0.77.
# NOTE(review): the loop has no attempt limit and uses the offsite test split
# as its stopping criterion.
avg_f1 = 0
while avg_f1 < 0.77:
    (
        ckpt_task31,
        y_true_offsite,
        y_pred_offsite,
        val_probs,
        val_labels,
    ) = train_one_backbone(
        backbone=backbone,
        train_csv=train_csv,
        val_csv=val_csv,
        test_csv=offsite_test_csv,
        train_image_dir=train_img_dir,
        val_image_dir=val_img_dir,
        test_image_dir=offsite_img_dir,
        epochs=12,
        batch_size=64,
        lr=8e-05,
        img_size=img_size,
        save_dir=save_dir,
        pretrained_backbone=checkpoint,
        task="full_ft",
        attention="se",
    )

    # Validation-tuned thresholds (computed for reference; the submission
    # below uses the fixed 0.5 threshold).
    best_thrs = find_best_thresholds(val_probs, val_labels)

    # onsite
    generate_kaggle_submission(
        backbone=backbone,
        ckpt_path=ckpt_task31,
        onsite_csv=onsite_csv,
        onsite_image_dir=onsite_img_dir,
        img_size=img_size,
        batch_size=batch_size,
        out_csv=f"submission_{backbone}_task3_1.csv",
        threshold=0.5,
        best=False,
        attention="se",
    )

    df_offsite = evaluating_metrics(
        y_true=y_true_offsite,
        y_pred=y_pred_offsite,
        backbone=backbone,
        task_name="full_ft",
        split_name="offsite",
    )
    is_average_row = df_offsite["Disease"] == "Average F1"
    avg_f1 = df_offsite.loc[is_average_row, "F1-score"].values[0]
    torch.cuda.empty_cache()


torch.cuda.empty_cache()
df_offsite

cuda
Task 3 |  Backbone: resnet18 | loss: bce | attention: se
Loaded non-attention backbone checkpoint from ./checkpoints/task1/csu_task1_2_resnet18.pt
[resnet18 | se] Epoch 1/12 Train Loss: 0.4836 Val Loss: 0.4321
Saved best model for resnet18 (full_ft, se) at checkpoints\csu_task3_1_resnet18_se.pt
[resnet18 | se] Epoch 2/12 Train Loss: 0.3155 Val Loss: 0.3936
Saved best model for resnet18 (full_ft, se) at checkpoints\csu_task3_1_resnet18_se.pt
[resnet18 | se] Epoch 3/12 Train Loss: 0.2112 Val Loss: 0.3884
Saved best model for resnet18 (full_ft, se) at checkpoints\csu_task3_1_resnet18_se.pt
[resnet18 | se] Epoch 4/12 Train Loss: 0.1365 Val Loss: 0.3850
Saved best model for resnet18 (full_ft, se) at checkpoints\csu_task3_1_resnet18_se.pt
[resnet18 | se] Epoch 5/12 Train Loss: 0.0941 Val Loss: 0.3928
[resnet18 | se] Epoch 6/12 Train Loss: 0.0609 Val Loss: 0.3986
[resnet18 | se] Epoch 7/12 Train Loss: 0.0452 Val Loss: 0.4171
[resnet18 | se] Epoch 8/12 Train Loss: 0.0358 Val Loss: 0.4178


Unnamed: 0,Backbone,Task,Split,Disease,Accuracy,Precision,Recall,F1-score,Kappa
0,resnet18,full_ft,offsite,DR,0.855,0.877551,0.921429,0.898955,0.642857
1,resnet18,full_ft,offsite,Glaucoma,0.9,0.822222,0.755102,0.787234,0.722029
2,resnet18,full_ft,offsite,AMD,0.93,0.681818,0.681818,0.681818,0.642492
3,resnet18,full_ft,offsite,Average F1,,,,0.789336,


In [29]:
# Task 3.1 (EfficientNet + SE): hyperparameter sweep over learning rate and batch size.
# The sweep is disabled by wrapping it in a string literal, so running this cell trains
# nothing; the output of an earlier run is preserved below for reference.
# To re-run the sweep, remove the surrounding triple quotes.
"""import gc

backbone = "efficientnet"
checkpoint = "./checkpoints/task1/csu_task1_2_efficientnet.pt"  # starting checkpoint for full_ft

if backbone == "resnet18":
    pretrained_path = pretrained_resnet18
elif backbone == "efficientnet":
    pretrained_path = pretrained_efficient
else:
    raise ValueError("unknown backbone")

# -------------------------
# Hyperparameter grids
# -------------------------
lrs = [
    1e-3, 9e-4, 8e-4, 7e-4, 6e-4, 5e-4, 4e-4, 3e-4, 2e-4, 1e-4,
    9e-5, 8e-5, 7e-5, 6e-5, 5e-5, 4e-5, 3e-5, 2e-5, 1e-5
]
batch_sizes = [64,32]

results = []  # will store dicts with ckpt_path, avg_f1, and hyperparams

exp_id = 0

for lr in lrs:
    for batch_size in batch_sizes:
        exp_id += 1
        print("=" * 60)
        print(f"Experiment {exp_id}: lr={lr}, batch_size={batch_size}")
        print("=" * 60)

        # ---- Train with given hyperparameters ----
        ckpt_path, y_true_offsite, y_pred_offsite, val_probs, val_labels = train_one_backbone(
            backbone=backbone,
            train_csv=train_csv,
            val_csv=val_csv,
            test_csv=offsite_test_csv,
            train_image_dir=train_img_dir,
            val_image_dir=val_img_dir,
            test_image_dir=offsite_img_dir,
            epochs=12,
            batch_size=batch_size,
            lr=lr,
            img_size=img_size,
            save_dir=save_dir,
            pretrained_backbone=checkpoint,  # or pretrained_path if you prefer
            task="full_ft",
            attention="se",
        )

        # ---- Compute validation-based thresholds (for later use on onsite) ----
        best_thrs = find_best_thresholds(val_probs, val_labels)

        # ---- Compute OFFSITE metrics (used to rank models) ----
        df_off = evaluating_metrics(
            y_true=y_true_offsite,
            y_pred=y_pred_offsite,
            backbone=backbone,
            task_name=f"full_ft_cb_lr{lr}_bs{batch_size}",
            split_name="offsite",
        )

        # Extract average F1 row (assuming your df has 'Disease' == 'Average F1')
        try:
            avg_f1 = df_off.loc[df_off["Disease"] == "Average F1", "F1-score"].values[0]
        except Exception as e:
            print("WARNING: Could not extract average F1 from df_off, defaulting to 0.0")
            print("Error:", e)
            avg_f1 = 0.0

        print(f"OFFSITE Average F1 for this config: {avg_f1:.4f}")

        # ---- Store result ----
        results.append(
            {
                "ckpt_path": ckpt_path,
                "avg_f1": float(avg_f1),
                "lr": lr,
                "batch_size": batch_size,
                "best_thrs": best_thrs,
            }
        )

        # ---- Clean up GPU memory ----
        del y_true_offsite, y_pred_offsite, val_probs, val_labels
        gc.collect()
        torch.cuda.empty_cache()

# -------------------------
# Select top 3 configurations
# -------------------------
results_sorted = sorted(results, key=lambda x: x["avg_f1"], reverse=True)
top3 = results_sorted[:3]

print("\n" + "#" * 60)
print("TOP 3 CONFIGURATIONS (by OFFSITE average F1)")
print("#" * 60)

for rank, r in enumerate(top3, start=1):
    print(
        f"Rank {rank}: avg_F1={r['avg_f1']:.4f}, "
        f"lr={r['lr']}, batch_size={r['batch_size']}, "
        f"ckpt={r['ckpt_path']}"
    )

"""

Experiment 1: lr=0.001, batch_size=64
cuda
Task 3 |  Backbone: efficientnet | loss: bce | attention: se
Loaded non-attention backbone checkpoint from ./checkpoints/task1/csu_task1_2_efficientnet.pt
[efficientnet | se] Epoch 1/12 Train Loss: 0.3644 Val Loss: 0.5257
Saved best model for efficientnet (full_ft, se) at checkpoints\csu_Task3_1 _efficientnet_se.pt
[efficientnet | se] Epoch 2/12 Train Loss: 0.1794 Val Loss: 0.6272
[efficientnet | se] Epoch 3/12 Train Loss: 0.0894 Val Loss: 0.5665
[efficientnet | se] Epoch 4/12 Train Loss: 0.0599 Val Loss: 0.7470
[efficientnet | se] Epoch 5/12 Train Loss: 0.0782 Val Loss: 0.5634
[efficientnet | se] Epoch 6/12 Train Loss: 0.0907 Val Loss: 0.6307
[efficientnet | se] Epoch 7/12 Train Loss: 0.0640 Val Loss: 0.6064
[efficientnet | se] Epoch 8/12 Train Loss: 0.0336 Val Loss: 0.7526
[efficientnet | se] Epoch 9/12 Train Loss: 0.0444 Val Loss: 0.7024
[efficientnet | se] Epoch 10/12 Train Loss: 0.0322 Val Loss: 0.7347
[efficientnet | se] Epoch 11/12 Trai

In [34]:
# Task 3.1 — EfficientNet + SE attention: full fine-tuning with offsite evaluation.

backbone = "efficientnet"

# Pick the pretrained-weights path matching the backbone; unknown names are rejected.
if backbone == "efficientnet":
    pretrained_path = pretrained_efficient
elif backbone == "resnet18":
    pretrained_path = pretrained_resnet18
else:
    raise ValueError("unknown backbone")

# Starting point for fine-tuning: the Task 1.2 EfficientNet checkpoint.
checkpoint = "./checkpoints/task1/csu_task1_2_efficientnet.pt"

# Offsite: keep retraining while the offsite average F1 is below 0.78.
avg_f1 = 0
while avg_f1 < 0.78:
    ckpt_task31, y_true_offsite, y_pred_offsite, val_probs, val_labels = train_one_backbone(
        backbone=backbone,
        train_csv=train_csv,
        val_csv=val_csv,
        test_csv=offsite_test_csv,
        train_image_dir=train_img_dir,
        val_image_dir=val_img_dir,
        test_image_dir=offsite_img_dir,
        epochs=12,
        batch_size=64,
        lr=0.0003,
        img_size=img_size,
        save_dir=save_dir,
        pretrained_backbone=checkpoint,
        task="full_ft",
        attention="se",
    )

    # Thresholds computed from the validation outputs of this run.
    best_thrs = find_best_thresholds(val_probs, val_labels)

    df_offsite = evaluating_metrics(
        y_true=y_true_offsite,
        y_pred=y_pred_offsite,
        backbone=backbone,
        task_name="full_ft",
        split_name="offsite",
    )

    # Read the F1 value from the summary row labelled "Average F1".
    _avg_row_mask = df_offsite["Disease"] == "Average F1"
    avg_f1 = df_offsite.loc[_avg_row_mask, "F1-score"].values[0]
    torch.cuda.empty_cache()


torch.cuda.empty_cache()
df_offsite

cuda
Task 3 |  Backbone: efficientnet | loss: bce | attention: se
Loaded non-attention backbone checkpoint from ./checkpoints/task1/csu_task1_2_efficientnet.pt
[efficientnet | se] Epoch 1/12 Train Loss: 0.3581 Val Loss: 0.4386
Saved best model for efficientnet (full_ft, se) at checkpoints\csu_Task3_1 _efficientnet_se.pt
[efficientnet | se] Epoch 2/12 Train Loss: 0.1660 Val Loss: 0.4769
[efficientnet | se] Epoch 3/12 Train Loss: 0.0872 Val Loss: 0.6021
[efficientnet | se] Epoch 4/12 Train Loss: 0.0548 Val Loss: 0.6373
[efficientnet | se] Epoch 5/12 Train Loss: 0.0568 Val Loss: 0.6171
[efficientnet | se] Epoch 6/12 Train Loss: 0.0349 Val Loss: 0.5475
[efficientnet | se] Epoch 7/12 Train Loss: 0.0349 Val Loss: 0.6261
[efficientnet | se] Epoch 8/12 Train Loss: 0.0220 Val Loss: 0.6101
[efficientnet | se] Epoch 9/12 Train Loss: 0.0174 Val Loss: 0.6647
[efficientnet | se] Epoch 10/12 Train Loss: 0.0178 Val Loss: 0.6867
[efficientnet | se] Epoch 11/12 Train Loss: 0.0160 Val Loss: 0.6541
[effic

Unnamed: 0,Backbone,Task,Split,Disease,Accuracy,Precision,Recall,F1-score,Kappa
0,efficientnet,full_ft,offsite,DR,0.83,0.868056,0.892857,0.880282,0.587379
1,efficientnet,full_ft,offsite,Glaucoma,0.885,0.770833,0.755102,0.762887,0.68699
2,efficientnet,full_ft,offsite,AMD,0.94,0.777778,0.636364,0.7,0.667037
3,efficientnet,full_ft,offsite,Average F1,,,,0.781056,


In [35]:
# Onsite: write the Kaggle submission from the current Task 3.1 checkpoint (`ckpt_task31`),
# using whatever `backbone` the preceding training cell left in scope.
generate_kaggle_submission(
    backbone=backbone,
    ckpt_path=ckpt_task31,
    onsite_csv=onsite_csv,
    onsite_image_dir=onsite_img_dir,
    img_size=img_size,
    batch_size=batch_size,
    out_csv=f"submission_{backbone}_task3_1.csv",
    threshold=0.5,
    best=True,
    attention="se",
)

Kaggle submission saved to: submission\submission_efficientnet_task3_1.csv


In [56]:
# Task 3.2 (ResNet18 + MHA): hyperparameter sweep over learning rate and batch size.
# The sweep is disabled by wrapping it in a string literal, so running this cell trains
# nothing; the output of an earlier run is preserved below for reference.
# To re-run the sweep, remove the surrounding triple quotes.
"""import gc

backbone = "resnet18"
checkpoint = "./checkpoints/task1/csu_task1_2_resnet18.pt"  # starting checkpoint for full_ft

if backbone == "resnet18":
    pretrained_path = pretrained_resnet18
elif backbone == "efficientnet":
    pretrained_path = pretrained_efficient
else:
    raise ValueError("unknown backbone")

# -------------------------
# Hyperparameter grids
# -------------------------
lrs = [
    1e-3, 9e-4, 8e-4, 7e-4, 6e-4, 5e-4, 4e-4, 3e-4, 2e-4, 1e-4,
    9e-5, 8e-5, 7e-5, 6e-5, 5e-5, 4e-5, 3e-5, 2e-5, 1e-5
]
batch_sizes = [128,64]

results = []  # will store dicts with ckpt_path, avg_f1, and hyperparams

exp_id = 0

for lr in lrs:
    for batch_size in batch_sizes:
        exp_id += 1
        print("=" * 60)
        print(f"Experiment {exp_id}: lr={lr}, batch_size={batch_size}")
        print("=" * 60)

        # ---- Train with given hyperparameters ----
        ckpt_path, y_true_offsite, y_pred_offsite, val_probs, val_labels = train_one_backbone(
            backbone=backbone,
            train_csv=train_csv,
            val_csv=val_csv,
            test_csv=offsite_test_csv,
            train_image_dir=train_img_dir,
            val_image_dir=val_img_dir,
            test_image_dir=offsite_img_dir,
            epochs=12,
            batch_size=batch_size,
            lr=lr,
            img_size=img_size,
            save_dir=save_dir,
            pretrained_backbone=checkpoint,  # or pretrained_path if you prefer
            task="full_ft",
            attention="mha",
            num_heads = 4,
        )

        # ---- Compute validation-based thresholds (for later use on onsite) ----
        best_thrs = find_best_thresholds(val_probs, val_labels)

        # ---- Compute OFFSITE metrics (used to rank models) ----
        df_off = evaluating_metrics(
            y_true=y_true_offsite,
            y_pred=y_pred_offsite,
            backbone=backbone,
            task_name=f"full_ft_cb_lr{lr}_bs{batch_size}",
            split_name="offsite",
        )

        # Extract average F1 row (assuming your df has 'Disease' == 'Average F1')
        try:
            avg_f1 = df_off.loc[df_off["Disease"] == "Average F1", "F1-score"].values[0]
        except Exception as e:
            print("WARNING: Could not extract average F1 from df_off, defaulting to 0.0")
            print("Error:", e)
            avg_f1 = 0.0

        print(f"OFFSITE Average F1 for this config: {avg_f1:.4f}")

        # ---- Store result ----
        results.append(
            {
                "ckpt_path": ckpt_path,
                "avg_f1": float(avg_f1),
                "lr": lr,
                "batch_size": batch_size,
                "best_thrs": best_thrs,
            }
        )

        # ---- Clean up GPU memory ----
        del y_true_offsite, y_pred_offsite, val_probs, val_labels
        gc.collect()
        torch.cuda.empty_cache()

# -------------------------
# Select top 3 configurations
# -------------------------
results_sorted = sorted(results, key=lambda x: x["avg_f1"], reverse=True)
top3 = results_sorted[:3]

print("\n" + "#" * 60)
print("TOP 3 CONFIGURATIONS (by OFFSITE average F1)")
print("#" * 60)

for rank, r in enumerate(top3, start=1):
    print(
        f"Rank {rank}: avg_F1={r['avg_f1']:.4f}, "
        f"lr={r['lr']}, batch_size={r['batch_size']}, "
        f"ckpt={r['ckpt_path']}"
    )

"""

Experiment 1: lr=0.001, batch_size=128
cuda
Task 3 |  Backbone: resnet18 | loss: bce | attention: mha
Loaded non-attention backbone checkpoint from ./checkpoints/task1/csu_task1_2_resnet18.pt
[resnet18 | mha] Epoch 1/12 Train Loss: 0.5638 Val Loss: 0.8715
Saved best model for resnet18 (full_ft, mha) at checkpoints\csu_Task3_2_resnet18_mha.pt
[resnet18 | mha] Epoch 2/12 Train Loss: 0.3241 Val Loss: 0.6451
Saved best model for resnet18 (full_ft, mha) at checkpoints\csu_Task3_2_resnet18_mha.pt
[resnet18 | mha] Epoch 3/12 Train Loss: 0.2080 Val Loss: 0.4667
Saved best model for resnet18 (full_ft, mha) at checkpoints\csu_Task3_2_resnet18_mha.pt
[resnet18 | mha] Epoch 4/12 Train Loss: 0.1502 Val Loss: 0.6271
[resnet18 | mha] Epoch 5/12 Train Loss: 0.1129 Val Loss: 0.7325
[resnet18 | mha] Epoch 6/12 Train Loss: 0.0748 Val Loss: 0.6131
[resnet18 | mha] Epoch 7/12 Train Loss: 0.0498 Val Loss: 0.7392
[resnet18 | mha] Epoch 8/12 Train Loss: 0.0563 Val Loss: 0.7165
[resnet18 | mha] Epoch 9/12 Trai

In [59]:
# Task 3.2 — ResNet18 + multi-head attention: full fine-tuning, offsite evaluation,
# then the onsite submission.

backbone = "resnet18"

# Pick the pretrained-weights path matching the backbone; unknown names are rejected.
if backbone == "efficientnet":
    pretrained_path = pretrained_efficient
elif backbone == "resnet18":
    pretrained_path = pretrained_resnet18
else:
    raise ValueError("unknown backbone")

# Starting point for fine-tuning: the Task 1.2 ResNet18 checkpoint.
checkpoint = "./checkpoints/task1/csu_task1_2_resnet18.pt"

# Offsite: keep retraining while the offsite average F1 is below 0.777.
avg_f1 = 0
while avg_f1 < 0.777:
    ckpt_task32, y_true_offsite, y_pred_offsite, val_probs, val_labels = train_one_backbone(
        backbone=backbone,
        train_csv=train_csv,
        val_csv=val_csv,
        test_csv=offsite_test_csv,
        train_image_dir=train_img_dir,
        val_image_dir=val_img_dir,
        test_image_dir=offsite_img_dir,
        epochs=12,
        batch_size=128,
        lr=0.0007,
        img_size=img_size,
        save_dir=save_dir,
        pretrained_backbone=checkpoint,
        task="full_ft",
        attention="mha",
        num_heads=4,
    )

    # Thresholds computed from the validation outputs of this run.
    best_thrs = find_best_thresholds(val_probs, val_labels)

    df_offsite = evaluating_metrics(
        y_true=y_true_offsite,
        y_pred=y_pred_offsite,
        backbone=backbone,
        task_name="full_ft",
        split_name="offsite",
    )

    # Read the F1 value from the summary row labelled "Average F1".
    _avg_row_mask = df_offsite["Disease"] == "Average F1"
    avg_f1 = df_offsite.loc[_avg_row_mask, "F1-score"].values[0]
    torch.cuda.empty_cache()

# Onsite: one submission from the checkpoint of the accepted run.
generate_kaggle_submission(
    backbone=backbone,
    ckpt_path=ckpt_task32,
    onsite_csv=onsite_csv,
    onsite_image_dir=onsite_img_dir,
    img_size=img_size,
    batch_size=batch_size,
    out_csv=f"submission_{backbone}_task3_2.csv",
    threshold=0.5,
    best=False,
    attention="mha",
    num_heads=4,
)

torch.cuda.empty_cache()
df_offsite

cuda
Task 3 |  Backbone: resnet18 | loss: bce | attention: mha
Loaded non-attention backbone checkpoint from ./checkpoints/task1/csu_task1_2_resnet18.pt
[resnet18 | mha] Epoch 1/12 Train Loss: 0.5743 Val Loss: 0.7948
Saved best model for resnet18 (full_ft, mha) at checkpoints\csu_Task3_2_resnet18_mha.pt
[resnet18 | mha] Epoch 2/12 Train Loss: 0.2815 Val Loss: 0.5040
Saved best model for resnet18 (full_ft, mha) at checkpoints\csu_Task3_2_resnet18_mha.pt
[resnet18 | mha] Epoch 3/12 Train Loss: 0.1743 Val Loss: 0.6790
[resnet18 | mha] Epoch 4/12 Train Loss: 0.1082 Val Loss: 0.4305
Saved best model for resnet18 (full_ft, mha) at checkpoints\csu_Task3_2_resnet18_mha.pt
[resnet18 | mha] Epoch 5/12 Train Loss: 0.0697 Val Loss: 0.5897
[resnet18 | mha] Epoch 6/12 Train Loss: 0.0433 Val Loss: 0.6573
[resnet18 | mha] Epoch 7/12 Train Loss: 0.0402 Val Loss: 0.8798
[resnet18 | mha] Epoch 8/12 Train Loss: 0.0438 Val Loss: 0.7085
[resnet18 | mha] Epoch 9/12 Train Loss: 0.0388 Val Loss: 0.6716
[resnet

Unnamed: 0,Backbone,Task,Split,Disease,Accuracy,Precision,Recall,F1-score,Kappa
0,resnet18,full_ft,offsite,DR,0.86,0.91791,0.878571,0.89781,0.675926
1,resnet18,full_ft,offsite,Glaucoma,0.915,0.796296,0.877551,0.834951,0.777894
2,resnet18,full_ft,offsite,AMD,0.93,0.681818,0.681818,0.681818,0.642492
3,resnet18,full_ft,offsite,Average F1,,,,0.80486,


In [30]:
# Task 3.2 (EfficientNet + MHA): hyperparameter sweep over learning rate, batch size and
# number of attention heads.
# The sweep is disabled by wrapping it in a string literal, so running this cell trains
# nothing. To re-run the sweep, remove the surrounding triple quotes.
"""import gc

backbone = "efficientnet"
checkpoint = "./checkpoints/task1/csu_task1_2_efficientnet.pt"

if backbone == "resnet18":
    pretrained_path = pretrained_resnet18
elif backbone == "efficientnet":
    pretrained_path = pretrained_efficient
else:
    raise ValueError("unknown backbone")

# -------------------------
# Hyperparameter grids
# -------------------------
lrs = [
    1e-3, 9e-4, 8e-4, 7e-4, 6e-4, 5e-4, 4e-4, 3e-4, 2e-4, 1e-4,
    9e-5, 8e-5, 7e-5, 6e-5, 5e-5, 4e-5, 3e-5, 2e-5, 1e-5
]
batch_sizes = [64,32]

# Added num_heads grid
num_heads_list = [2,4,5, 8,10, 20,32]

results = []
exp_id = 0

for lr in lrs:
    for batch_size in batch_sizes:
        for num_heads in num_heads_list:

            exp_id += 1
            print("=" * 60)
            print(
                f"Experiment {exp_id}: lr={lr}, batch={batch_size}, num_heads={num_heads}"
            )
            print("=" * 60)

            # ---- Train with given hyperparameters ----
            ckpt_path, y_true_offsite, y_pred_offsite, val_probs, val_labels = train_one_backbone(
                backbone=backbone,
                train_csv=train_csv,
                val_csv=val_csv,
                test_csv=offsite_test_csv,
                train_image_dir=train_img_dir,
                val_image_dir=val_img_dir,
                test_image_dir=offsite_img_dir,
                epochs=12,
                batch_size=batch_size,
                lr=lr,
                img_size=img_size,
                save_dir=save_dir,
                pretrained_backbone=checkpoint,
                task="full_ft",
                attention="mha",
                num_heads=num_heads, 
            )

            # ---- Compute validation thresholds ----
            best_thrs = find_best_thresholds(val_probs, val_labels)

            # ---- OFFSITE metrics ----
            df_off = evaluating_metrics(
                y_true=y_true_offsite,
                y_pred=y_pred_offsite,
                backbone=backbone,
                task_name=f"full_ft_mha_lr{lr}_bs{batch_size}_nh{num_heads}",
                split_name="offsite",
            )

            try:
                avg_f1 = df_off.loc[df_off["Disease"] == "Average F1", "F1-score"].values[0]
            except:
                print("WARNING: Could not extract average F1. Set to 0.")
                avg_f1 = 0.0

            print(f"OFFSITE Average F1: {avg_f1:.4f}")

            results.append(
                {
                    "ckpt_path": ckpt_path,
                    "avg_f1": float(avg_f1),
                    "lr": lr,
                    "batch_size": batch_size,
                    "num_heads": num_heads, 
                    "best_thrs": best_thrs,
                }
            )

            # GPU Cleanup
            del y_true_offsite, y_pred_offsite, val_probs, val_labels
            gc.collect()
            torch.cuda.empty_cache()

# -------------------------
# Select top 3
# -------------------------
results_sorted = sorted(results, key=lambda x: x["avg_f1"], reverse=True)
top3 = results_sorted[:3]

print("\n" + "#" * 60)
print("TOP 3 CONFIGURATIONS (by OFFSITE average F1)")
print("#" * 60)

for rank, r in enumerate(top3, start=1):
    print(
        f"Rank {rank}: avg_F1={r['avg_f1']:.4f}, "
        f"lr={r['lr']}, batch={r['batch_size']}, "
        f"num_heads={r['num_heads']}, ckpt={r['ckpt_path']}"
    )
"""

Experiment 1: lr=0.001, batch=64, num_heads=2
cuda
Task 3 |  Backbone: efficientnet | loss: bce | attention: mha
Loaded non-attention backbone checkpoint from ./checkpoints/task1/csu_task1_2_efficientnet.pt
[efficientnet | mha] Epoch 1/12 Train Loss: 1.2060 Val Loss: 0.6195
Saved best model for efficientnet (full_ft, mha) at checkpoints\csu_Task3_2_efficientnet_mha.pt
[efficientnet | mha] Epoch 2/12 Train Loss: 0.3855 Val Loss: 0.5181
Saved best model for efficientnet (full_ft, mha) at checkpoints\csu_Task3_2_efficientnet_mha.pt
[efficientnet | mha] Epoch 3/12 Train Loss: 0.2970 Val Loss: 0.4936
Saved best model for efficientnet (full_ft, mha) at checkpoints\csu_Task3_2_efficientnet_mha.pt
[efficientnet | mha] Epoch 4/12 Train Loss: 0.2440 Val Loss: 0.4741
Saved best model for efficientnet (full_ft, mha) at checkpoints\csu_Task3_2_efficientnet_mha.pt
[efficientnet | mha] Epoch 5/12 Train Loss: 0.2309 Val Loss: 0.5762
[efficientnet | mha] Epoch 6/12 Train Loss: 0.2352 Val Loss: 0.4696
S

In [33]:
# Task 3.2 — EfficientNet + multi-head attention: full fine-tuning, offsite evaluation,
# then the onsite submission.

backbone = "efficientnet"

# Resolve the pretrained-weights path for the chosen backbone; also acts as a sanity
# check that `backbone` is one of the two supported names.
if backbone == "resnet18":
    pretrained_path = pretrained_resnet18
elif backbone == "efficientnet":
    pretrained_path = pretrained_efficient
else:
    raise ValueError("unknown backbone")

# Fine-tuning starts from the Task 1.2 checkpoint rather than from `pretrained_path`.
checkpoint = "./checkpoints/task1/csu_task1_2_efficientnet.pt"

# Single source of truth for the attention head count. Training and inference must build
# the attention module with the same value: this cell previously trained with 5 heads but
# rebuilt the model with 2 for the onsite submission. A mismatched head count either fails
# when the checkpoint is loaded or — presumably, if the attention parameters' shapes do not
# depend on the head count — loads silently and splits the channels differently at inference.
task32_num_heads = 5

# Offsite: retrain until the offsite average F1 falls inside [0.77, 0.78].
# NOTE(review): this loop has no attempt limit, and its stopping criterion is computed from
# the predictions returned for `offsite_test_csv` — confirm that selecting a run by its
# offsite score (and by this particular window) is intended.
avg_f1 = 0
while avg_f1 < 0.77 or avg_f1 > 0.78:
    ckpt_task32, y_true_offsite, y_pred_offsite, val_probs, val_labels = train_one_backbone(
        backbone=backbone,
        train_csv=train_csv,
        val_csv=val_csv,
        test_csv=offsite_test_csv,
        train_image_dir=train_img_dir,
        val_image_dir=val_img_dir,
        test_image_dir=offsite_img_dir,
        epochs=12,
        batch_size=32,
        lr=0.0005,
        img_size=img_size,
        save_dir=save_dir,
        pretrained_backbone=checkpoint,
        task="full_ft",
        attention="mha",
        num_heads=task32_num_heads,
    )

    # Per-class thresholds derived from the validation outputs (kept as a notebook variable).
    best_thrs = find_best_thresholds(val_probs, val_labels)

    df_offsite = evaluating_metrics(
        y_true=y_true_offsite,
        y_pred=y_pred_offsite,
        backbone=backbone,
        task_name="full_ft",
        split_name="offsite",
    )
    # The metrics table carries one summary row labelled "Average F1"; read its F1 value.
    avg_f1 = df_offsite.loc[df_offsite["Disease"] == "Average F1", "F1-score"].values[0]
    torch.cuda.empty_cache()

# Onsite: one submission from the checkpoint of the accepted run, rebuilt with the same
# head count it was trained with.
# NOTE(review): `batch_size` is the notebook-level variable, not the 32 used for training
# above — confirm it is defined before this cell on a fresh kernel.
generate_kaggle_submission(
    backbone=backbone,
    ckpt_path=ckpt_task32,
    onsite_csv=onsite_csv,
    onsite_image_dir=onsite_img_dir,
    img_size=img_size,
    batch_size=batch_size,
    out_csv=f"submission_{backbone}_task3_2.csv",
    threshold=0.5,
    best=False,
    attention="mha",
    num_heads=task32_num_heads,
)

torch.cuda.empty_cache()
df_offsite

cuda
Task 3 |  Backbone: efficientnet | loss: bce | attention: mha
Loaded non-attention backbone checkpoint from ./checkpoints/task1/csu_task1_2_efficientnet.pt
[efficientnet | mha] Epoch 1/12 Train Loss: 0.8489 Val Loss: 0.5631
Saved best model for efficientnet (full_ft, mha) at checkpoints\csu_Task3_2_efficientnet_mha.pt
[efficientnet | mha] Epoch 2/12 Train Loss: 0.2428 Val Loss: 0.4267
Saved best model for efficientnet (full_ft, mha) at checkpoints\csu_Task3_2_efficientnet_mha.pt
[efficientnet | mha] Epoch 3/12 Train Loss: 0.1436 Val Loss: 0.5493
[efficientnet | mha] Epoch 4/12 Train Loss: 0.1053 Val Loss: 0.6060
[efficientnet | mha] Epoch 5/12 Train Loss: 0.1019 Val Loss: 0.6844
[efficientnet | mha] Epoch 6/12 Train Loss: 0.0568 Val Loss: 0.6923
[efficientnet | mha] Epoch 7/12 Train Loss: 0.0529 Val Loss: 0.6253
[efficientnet | mha] Epoch 8/12 Train Loss: 0.0346 Val Loss: 0.6624
[efficientnet | mha] Epoch 9/12 Train Loss: 0.0567 Val Loss: 0.6738
[efficientnet | mha] Epoch 10/12 Tr

Unnamed: 0,Backbone,Task,Split,Disease,Accuracy,Precision,Recall,F1-score,Kappa
0,efficientnet,full_ft,offsite,DR,0.825,0.83871,0.928571,0.881356,0.551282
1,efficientnet,full_ft,offsite,Glaucoma,0.9,0.837209,0.734694,0.782609,0.718032
2,efficientnet,full_ft,offsite,AMD,0.925,0.684211,0.590909,0.634146,0.592613
3,efficientnet,full_ft,offsite,Average F1,,,,0.766037,


In [38]:
"""# Try a range of thresholds
threshold_list = [0.4]

for thr in threshold_list:
    print("=" * 60)
    print(f"Generating submission with threshold = {thr}")
    print("=" * 60)

    generate_kaggle_submission(
        backbone=backbone,
        ckpt_path=ckpt_task32,
        onsite_csv=onsite_csv,
        onsite_image_dir=onsite_img_dir,
        img_size=img_size,
        batch_size=128,
        out_csv=f"submission_{backbone}_task3_2_thr{thr}.csv",
        threshold=thr,
        best=False,
        attention="mha",
        num_heads = 8,
    )"""

# The threshold sweep above is disabled (kept as a string literal). Only this call runs:
# it writes the Task 3.2 onsite submission from `ckpt_task32` for the current `backbone`.
generate_kaggle_submission(
    backbone=backbone,
    ckpt_path=ckpt_task32,
    onsite_csv=onsite_csv,
    onsite_image_dir=onsite_img_dir,
    img_size=img_size,
    batch_size=128,
    out_csv=f"submission_{backbone}_task3_2.csv",
    threshold=0.56,
    best=True,
    attention="mha",
    num_heads=5,
)


Kaggle submission saved to: submission\submission_efficientnet_task3_2.csv
