In [None]:
!pip install lightning-utilities
!pip install torchmetrics --no-deps

Collecting lightning-utilities
  Downloading lightning_utilities-0.15.0-py3-none-any.whl.metadata (5.7 kB)
Downloading lightning_utilities-0.15.0-py3-none-any.whl (29 kB)
Installing collected packages: lightning-utilities
Successfully installed lightning-utilities-0.15.0
Collecting torchmetrics
  Downloading torchmetrics-1.8.0-py3-none-any.whl.metadata (21 kB)
Downloading torchmetrics-1.8.0-py3-none-any.whl (981 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.9/981.9 kB[0m [31m55.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torchmetrics
Successfully installed torchmetrics-1.8.0


In [None]:
pip install pandas pillow matplotlib seaborn imagehash tqdm

Collecting imagehash
  Downloading ImageHash-4.3.2-py2.py3-none-any.whl.metadata (8.4 kB)
Downloading ImageHash-4.3.2-py2.py3-none-any.whl (296 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/296.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.7/296.7 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: imagehash
Successfully installed imagehash-4.3.2


In [None]:
from google.colab import drive

# Mount Google Drive; the dataset archives extracted by the following cells
# are read from below this mount point.
GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Mounted at /content/gdrive


In [None]:
# Extract the DFUC2021 training archive from the mounted Drive into `dfu_train`
# (-q: quiet, -P: archive password, -d: destination directory).
# NOTE(review): the archive password is hard-coded in this cell and will be
# shared with the notebook — consider reading it from an environment variable
# or `getpass` instead.
!unzip -q -P unlocked308BR /content/gdrive/MyDrive/DeepLearning/DFUC2021_trainset_210427.zip -d dfu_train

In [None]:
# Extract the DFUC2021 test archive from the mounted Drive into `dfu_test`
# (-q: quiet, -P: archive password, -d: destination directory).
# NOTE(review): the archive password is hard-coded in this cell and will be
# shared with the notebook — consider reading it from an environment variable
# or `getpass` instead.
!unzip -q -P sigmoid608KL /content/gdrive/MyDrive/DeepLearning/DFUC2021_testing_release.zip -d dfu_test

In [None]:
# --- Full Self-Supervised Pipeline for DFUC2021 ---

# --- SETUP: IMPORTS & CONFIGURATION ---
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

from transformers import ViTModel, ViTImageProcessor
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm
from PIL import Image
import pandas as pd
import numpy as np
import os

# --- Configuration ---
# Everything a reader may want to tune lives in this cell.

# General
SEED = 42
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Paths: extracted training data, extracted test images, and the output files
DATA_DIR = "/content/dfu_train/DFUC2021_train/"
TRAIN_IMG_DIR = os.path.join(DATA_DIR, "images")
CSV_FILE = os.path.join(DATA_DIR, "train.csv")
TEST_IMG_DIR = '/content/dfu_test/DFUC2021_test'
OUTPUT_CSV_PATH = 'submission_vit_mae_ensemble.csv'

# Part A: self-supervised (masked image modeling) pre-training
MODEL_CHECKPOINT = "google/vit-base-patch16-224-in21k"
ADAPTED_BACKBONE_PATH = "vit_mae_adapted_backbone.pth"
MIM_EPOCHS = 25       # pre-training epochs; raise for a longer adaptation run
MIM_BATCH_SIZE = 32

# Parts B & C: supervised linear probing with cross-validation
CLASS_NAMES = ['none', 'infection', 'ischaemia', 'both']
NUM_CLASSES = len(CLASS_NAMES)
N_SPLITS = 4          # number of cross-validation folds
PROBE_EPOCHS = 20     # linear-probing epochs per fold
PROBE_BATCH_SIZE = 32

# Seed NumPy and PyTorch (CPU, plus every CUDA device when one is present)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

print(f"Using device: {DEVICE}")


Using device: cuda


In [None]:
# --- PART A: SELF-SUPERVISED PRE-TRAINING (MAE-STYLE) ---
# This updated version saves the model with the lowest reconstruction loss.

# Section banner: blank line, rule, title, rule.
print("\n" + "="*60, "🚀 PART A: STARTING SELF-SUPERVISED PRE-TRAINING (MAE)", "="*60, sep="\n")

# 1. Define MAE Model, Dataset, and Collator
class ViTForMAE(nn.Module):
    """Masked autoencoder (MAE) built around a ViT backbone.

    The backbone encodes the CLS token together with the *visible* patches
    only. A small transformer decoder then receives the encoded tokens, a
    learned mask token at every masked position and its own positional
    embeddings, and regresses the pixel values of the masked patches.

    Why a decoder with attention and positions: a per-token MLP applied to a
    position-free mask token produces the same output for every masked patch
    of every image, and because the loss only reads masked positions the
    backbone then receives exactly zero gradient (nothing is adapted).

    Args:
        vit_model: Backbone exposing ``config`` (``hidden_size``,
            ``patch_size``, ``image_size``), ``embeddings``, ``encoder`` and
            ``layernorm`` — the attributes used below.
        decoder_depth: Number of transformer blocks in the decoder.
        decoder_num_heads: Attention heads per decoder block; must divide
            ``decoder_hidden_size``.
    """
    def __init__(self, vit_model, decoder_depth=2, decoder_num_heads=8):
        super().__init__()
        self.vit = vit_model
        self.config = vit_model.config

        hidden_size = self.config.hidden_size
        num_channels = getattr(self.config, "num_channels", 3)
        patch_dim = self.config.patch_size**2 * num_channels
        num_patches = (self.config.image_size // self.config.patch_size) ** 2

        # Decoder
        self.decoder_hidden_size = 512 # Can be smaller than encoder
        # Projects encoder tokens into the (narrower) decoder width.
        self.decoder_embed = nn.Linear(hidden_size, self.decoder_hidden_size)
        # Learned placeholder inserted at every masked patch position.
        self.mask_token = nn.Parameter(torch.zeros(1, 1, self.decoder_hidden_size))
        # One positional embedding per decoder token (CLS + every patch), so
        # mask tokens at different locations are distinguishable.
        self.decoder_pos_embed = nn.Parameter(
            torch.zeros(1, num_patches + 1, self.decoder_hidden_size)
        )
        nn.init.normal_(self.mask_token, std=0.02)
        nn.init.normal_(self.decoder_pos_embed, std=0.02)
        # Self-attention lets masked positions read from the visible tokens.
        self.decoder_blocks = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=self.decoder_hidden_size,
                nhead=decoder_num_heads,
                dim_feedforward=4 * self.decoder_hidden_size,
                dropout=0.0,
                activation="gelu",
                batch_first=True,
                norm_first=True,
            )
            for _ in range(decoder_depth)
        ])
        # Final per-token regression head: decoder width -> flattened patch pixels.
        self.decoder = nn.Sequential(
            nn.LayerNorm(self.decoder_hidden_size),
            nn.Linear(self.decoder_hidden_size, patch_dim),
        )

    def forward(self, pixel_values, bool_masked_pos):
        """Reconstruct the masked patches of a batch of images.

        Args:
            pixel_values: Image batch accepted by ``vit.embeddings``.
            bool_masked_pos: Boolean tensor of shape ``(batch, 1 + num_patches)``;
                column 0 belongs to the CLS token and is ignored, ``True``
                marks a masked patch. Every row must mask the same number of
                patches.

        Returns:
            Tensor of shape ``(total_masked_patches, patch_dim)`` holding the
            predictions for the masked patches, ordered image by image and
            by patch position within each image.

        Raises:
            ValueError: If the rows of ``bool_masked_pos`` mask different
                numbers of patches (the visible tokens could not be stacked).
        """
        # Get patch embeddings
        embeddings = self.vit.embeddings(pixel_values)
        cls_embedding = embeddings[:, :1, :]
        patch_embeddings = embeddings[:, 1:, :]

        # Exclude CLS token from masking
        masked = bool_masked_pos[:, 1:].bool()
        visible = ~masked

        batch_size, seq_len, dim = patch_embeddings.shape
        visible_per_image = visible.sum(dim=1)
        if not bool((visible_per_image == visible_per_image[0]).all()):
            raise ValueError(
                "bool_masked_pos must mask the same number of patches in every image"
            )
        visible_embeddings = patch_embeddings[visible].reshape(batch_size, -1, dim)

        # Encode CLS + visible patches. The CLS token and the final layernorm
        # are kept in the loop because downstream probing reads layernorm(CLS).
        encoder_input = torch.cat([cls_embedding, visible_embeddings], dim=1)
        encoder_outputs = self.vit.encoder(encoder_input)
        encoded = self.vit.layernorm(encoder_outputs.last_hidden_state)

        # Rebuild the full-length sequence in decoder width: encoded tokens at
        # visible positions, the learned mask token everywhere else.
        projected = self.decoder_embed(encoded)
        decoder_patches = (
            self.mask_token.to(projected.dtype).expand(batch_size, seq_len, -1).clone()
        )
        decoder_patches[visible] = projected[:, 1:, :].flatten(0, 1)

        # Add CLS token back to the decoder input, then inject positions
        decoder_input = torch.cat([projected[:, :1, :], decoder_patches], dim=1)
        decoder_input = decoder_input + self.decoder_pos_embed.to(decoder_input.dtype)

        # Pass full sequence through decoder
        for block in self.decoder_blocks:
            decoder_input = block(decoder_input)
        decoded_patches = self.decoder(decoder_input)

        # Predict only the masked patches
        predicted_masked_patches = decoded_patches[:, 1:][masked]

        return predicted_masked_patches

class MAEDataCollator:
    """Collate image tensors into an MAE training batch.

    For every image a random subset of patches is marked as masked, and the
    pixel values of those patches are returned as regression targets.

    Args:
        processor: Stored on the instance; not used by ``__call__``.
        vit_config: Object providing ``patch_size`` and ``image_size``.
        mask_patch_rate: Fraction of patches to mask per image.
    """
    def __init__(self, processor, vit_config, mask_patch_rate=0.75):
        self.processor = processor
        self.mask_patch_rate = mask_patch_rate
        self.patch_size = vit_config.patch_size
        patches_per_side = vit_config.image_size // self.patch_size
        self.num_patches = patches_per_side ** 2

    def __call__(self, examples):
        """Stack ``examples`` and build the mask and targets.

        Returns a dict with:
            ``pixel_values``: the stacked images,
            ``bool_masked_pos``: bool tensor ``(batch, num_patches + 1)``;
                column 0 is reserved for the CLS token and is never masked,
            ``labels``: flattened pixels of the masked patches, one row per
                masked patch.
        """
        batch = torch.stack(examples, dim=0)
        n_images = batch.shape[0]
        side = self.patch_size
        num_mask = int(self.num_patches * self.mask_patch_rate)

        # Random permutation of patch indices per image; the first `num_mask`
        # entries are the patches to hide.
        shuffled = torch.rand(n_images, self.num_patches).argsort(dim=-1)
        masked_indices = shuffled[:, :num_mask]

        # Shift by one so that position 0 (CLS) stays unmasked.
        bool_masked_pos = torch.zeros((n_images, self.num_patches + 1), dtype=torch.bool)
        row_ids = torch.arange(n_images).unsqueeze(1)
        bool_masked_pos[row_ids, masked_indices + 1] = True

        # Cut the images into non-overlapping patches, row-major over the
        # patch grid, each flattened as (channel, row, col).
        grid = batch.unfold(2, side, side).unfold(3, side, side)
        flat_patches = grid.permute(0, 2, 3, 1, 4, 5).reshape(n_images, self.num_patches, -1)
        labels = flat_patches[bool_masked_pos[:, 1:]]

        return {"pixel_values": batch, "bool_masked_pos": bool_masked_pos, "labels": labels}

class ImageDataset(Dataset):
    """Unlabeled image dataset used for the self-supervised stage.

    Args:
        df: Table with an ``image`` column of file names.
        img_dir: Directory the file names are relative to.
        processor: Callable invoked as ``processor(image, return_tensors="pt")``
            whose result provides a ``pixel_values`` entry.
    """
    def __init__(self, df, img_dir, processor):
        self.df = df
        self.img_dir = img_dir
        self.processor = processor
        # Cache the file names as an array for positional lookup.
        self.image_files = df['image'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Load image ``idx`` as RGB and return its processed pixel tensor."""
        file_name = self.image_files[idx]
        rgb_image = Image.open(os.path.join(self.img_dir, file_name)).convert("RGB")
        encoded = self.processor(rgb_image, return_tensors="pt")
        # The processor result carries a leading batch axis of one; drop it.
        return encoded['pixel_values'].squeeze(0)

# 2. Run Self-Supervised Training
# Data pipeline: every image listed in the training CSV is used for
# pre-training (the dataset yields images only, no labels).
processor = ViTImageProcessor.from_pretrained(MODEL_CHECKPOINT)
full_df = pd.read_csv(CSV_FILE)
mae_dataset = ImageDataset(full_df, TRAIN_IMG_DIR, processor)

# Model: pretrained ViT wrapped for masked-patch reconstruction.
vit_model = ViTModel.from_pretrained(MODEL_CHECKPOINT)
mae_model = ViTForMAE(vit_model).to(DEVICE)

# The collator draws the random patch masks and builds the pixel targets.
mae_collator = MAEDataCollator(processor, vit_model.config)
mae_loader = DataLoader(
    mae_dataset,
    batch_size=MIM_BATCH_SIZE,
    shuffle=True,
    collate_fn=mae_collator,
)

# Optimisation: AdamW on all MAE parameters, mean-squared error on pixels.
optimizer_mae = optim.AdamW(mae_model.parameters(), lr=1.5e-4, weight_decay=0.05)
loss_fn_mae = nn.MSELoss()

# Lowest epoch-average reconstruction loss seen so far; the backbone is only
# written to disk when an epoch improves on it.
min_loss = float('inf')

for epoch in range(MIM_EPOCHS):
    mae_model.train()
    step_losses = []
    epoch_bar = tqdm(mae_loader, desc=f"MAE Epoch {epoch+1}/{MIM_EPOCHS}")
    for mae_batch in epoch_bar:
        # Move the images, the patch mask and the pixel targets to the device.
        images = mae_batch['pixel_values'].to(DEVICE)
        patch_mask = mae_batch['bool_masked_pos'].to(DEVICE)
        targets = mae_batch['labels'].to(DEVICE)

        optimizer_mae.zero_grad()
        reconstruction = mae_model(images, patch_mask)
        loss = loss_fn_mae(reconstruction, targets)
        loss.backward()
        optimizer_mae.step()

        step_loss = loss.item()
        step_losses.append(step_loss)
        epoch_bar.set_postfix({'Loss': f"{step_loss:.4f}"})

    # Mean of the per-batch losses over the epoch.
    avg_loss = sum(step_losses) / len(mae_loader)
    print(f"MAE Epoch {epoch+1} - Average Reconstruction Loss: {avg_loss:.4f}")

    # Checkpoint only the ViT backbone (not the decoder) on improvement.
    if avg_loss < min_loss:
        min_loss = avg_loss
        torch.save(mae_model.vit.state_dict(), ADAPTED_BACKBONE_PATH)
        print(f"🎉 New best backbone saved to {ADAPTED_BACKBONE_PATH} with loss: {min_loss:.4f}")

print(f"\n✅ Self-supervised pre-training complete.")
print(f"The best adapted backbone is saved at: {ADAPTED_BACKBONE_PATH}")



🚀 PART A: STARTING SELF-SUPERVISED PRE-TRAINING (MAE)


MAE Epoch 1/25: 100%|██████████| 311/311 [01:05<00:00,  4.74it/s, Loss=0.0842]


MAE Epoch 1 - Average Reconstruction Loss: 0.1064
🎉 New best backbone saved to vit_mae_adapted_backbone.pth with loss: 0.1064


MAE Epoch 2/25: 100%|██████████| 311/311 [01:05<00:00,  4.76it/s, Loss=0.1064]


MAE Epoch 2 - Average Reconstruction Loss: 0.0985
🎉 New best backbone saved to vit_mae_adapted_backbone.pth with loss: 0.0985


MAE Epoch 3/25: 100%|██████████| 311/311 [01:05<00:00,  4.76it/s, Loss=0.1016]


MAE Epoch 3 - Average Reconstruction Loss: 0.0985


MAE Epoch 4/25: 100%|██████████| 311/311 [01:05<00:00,  4.76it/s, Loss=0.1314]


MAE Epoch 4 - Average Reconstruction Loss: 0.0987


MAE Epoch 5/25: 100%|██████████| 311/311 [01:05<00:00,  4.76it/s, Loss=0.1044]


MAE Epoch 5 - Average Reconstruction Loss: 0.0986


MAE Epoch 6/25: 100%|██████████| 311/311 [01:05<00:00,  4.71it/s, Loss=0.1068]


MAE Epoch 6 - Average Reconstruction Loss: 0.0986


MAE Epoch 7/25: 100%|██████████| 311/311 [01:06<00:00,  4.68it/s, Loss=0.1083]


MAE Epoch 7 - Average Reconstruction Loss: 0.0984
🎉 New best backbone saved to vit_mae_adapted_backbone.pth with loss: 0.0984


MAE Epoch 8/25: 100%|██████████| 311/311 [01:06<00:00,  4.69it/s, Loss=0.1020]


MAE Epoch 8 - Average Reconstruction Loss: 0.0985


MAE Epoch 9/25: 100%|██████████| 311/311 [01:06<00:00,  4.69it/s, Loss=0.0820]


MAE Epoch 9 - Average Reconstruction Loss: 0.0986


MAE Epoch 10/25: 100%|██████████| 311/311 [01:06<00:00,  4.69it/s, Loss=0.0754]


MAE Epoch 10 - Average Reconstruction Loss: 0.0985


MAE Epoch 11/25: 100%|██████████| 311/311 [01:06<00:00,  4.70it/s, Loss=0.1259]


MAE Epoch 11 - Average Reconstruction Loss: 0.0986


MAE Epoch 12/25: 100%|██████████| 311/311 [01:06<00:00,  4.68it/s, Loss=0.1017]


MAE Epoch 12 - Average Reconstruction Loss: 0.0985


MAE Epoch 13/25: 100%|██████████| 311/311 [01:06<00:00,  4.70it/s, Loss=0.1031]


MAE Epoch 13 - Average Reconstruction Loss: 0.0986


MAE Epoch 14/25:  18%|█▊        | 56/311 [00:12<00:55,  4.62it/s, Loss=0.1137]


KeyboardInterrupt: 

In [None]:
# --- PART B: K-FOLD LINEAR PROBING ---
# Section banner: blank line, rule, title, rule.
print("\n" + "="*60, "🧠 PART B: STARTING K-FOLD SUPERVISED LINEAR PROBING", "="*60, sep="\n")

# 1. Define Classification Model and Dataset for Probing
class ClassificationModel(nn.Module):
    """Linear probe: a frozen ViT backbone followed by one trainable linear layer.

    Args:
        backbone_path: Path to a saved backbone ``state_dict`` (as written by
            the pre-training stage).
        num_classes: Number of output logits.
    """
    def __init__(self, backbone_path, num_classes):
        super().__init__()
        self.backbone = ViTModel.from_pretrained(MODEL_CHECKPOINT, add_pooling_layer=False)
        # Load the checkpoint onto the CPU regardless of the device it was
        # saved from, so this also works on a runtime without CUDA; the caller
        # moves the whole module afterwards with `.to(DEVICE)`.
        state_dict = torch.load(backbone_path, map_location="cpu")
        # The backbone above is built without a pooling layer, so drop the
        # pooler weights if the checkpoint contains them.
        state_dict.pop('pooler.dense.weight', None)
        state_dict.pop('pooler.dense.bias', None)
        self.backbone.load_state_dict(state_dict)

        # Freeze backbone: only the classifier below is trained.
        self.backbone.requires_grad_(False)
        self.classifier = nn.Linear(self.backbone.config.hidden_size, num_classes)

    def forward(self, pixel_values):
        """Return class logits computed from the backbone's CLS token."""
        outputs = self.backbone(pixel_values)
        # Use the CLS token (sequence position 0) for classification
        cls_features = outputs.last_hidden_state[:, 0, :]
        return self.classifier(cls_features)

class LabeledDataset(Dataset):
    """Image/label dataset for supervised probing.

    Args:
        df: Table with an ``image`` column (file names) and a ``label`` column.
        img_dir: Directory the file names are relative to.
        processor: Callable invoked as ``processor(image, return_tensors="pt")``
            whose result provides a ``pixel_values`` entry.
    """
    def __init__(self, df, img_dir, processor):
        self.df = df
        self.img_dir = img_dir
        self.processor = processor

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(pixel_values, label)`` for the row at position ``idx``."""
        record = self.df.iloc[idx]
        rgb_image = Image.open(os.path.join(self.img_dir, record['image'])).convert("RGB")
        encoded = self.processor(rgb_image, return_tensors="pt")
        # Drop the leading batch axis of one added by the processor.
        pixel_values = encoded['pixel_values'].squeeze(0)
        target = torch.tensor(record['label'], dtype=torch.long)
        return pixel_values, target

# 2. Run K-Fold Loop
# Keep the rows that have a value in the 'none' column and turn the per-class
# columns into a single integer class index.
labeled_df = full_df.loc[full_df['none'].notna()].copy()
labeled_df['label'] = labeled_df[CLASS_NAMES].values.argmax(axis=1)
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

for fold, (train_idx, val_idx) in enumerate(skf.split(labeled_df, labeled_df['label'])):
    print(f"\n--- FOLD {fold + 1}/{N_SPLITS} ---")
    train_df, val_df = labeled_df.iloc[train_idx], labeled_df.iloc[val_idx]

    train_dataset = LabeledDataset(train_df, TRAIN_IMG_DIR, processor)
    val_dataset = LabeledDataset(val_df, TRAIN_IMG_DIR, processor)
    train_loader = DataLoader(train_dataset, batch_size=PROBE_BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=PROBE_BATCH_SIZE, shuffle=False)

    # A fresh probe per fold; only the linear classifier is optimised.
    model = ClassificationModel(ADAPTED_BACKBONE_PATH, NUM_CLASSES).to(DEVICE)
    optimizer = optim.AdamW(model.classifier.parameters(), lr=1e-3)
    loss_fn = nn.CrossEntropyLoss()

    best_val_accuracy = 0
    for epoch in range(PROBE_EPOCHS):
        # --- one training pass ---
        model.train()
        for pixel_values, labels in tqdm(train_loader, desc=f"Probing Epoch {epoch+1}"):
            pixel_values = pixel_values.to(DEVICE)
            labels = labels.to(DEVICE)
            optimizer.zero_grad()
            loss = loss_fn(model(pixel_values), labels)
            loss.backward()
            optimizer.step()

        # --- validation pass ---
        model.eval()
        all_preds, all_labels = [], []
        with torch.no_grad():
            for pixel_values, labels in val_loader:
                pixel_values = pixel_values.to(DEVICE)
                labels = labels.to(DEVICE)
                predicted = model(pixel_values).argmax(dim=1)
                all_preds.extend(predicted.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        # Keep the checkpoint of the epoch with the highest validation accuracy.
        val_accuracy = accuracy_score(all_labels, all_preds)
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            torch.save(model.state_dict(), f"vit_classifier_fold_{fold+1}.pth")

    print(f"Fold {fold+1} Best Val Accuracy: {best_val_accuracy:.4f}")


🧠 PART B: STARTING K-FOLD SUPERVISED LINEAR PROBING

--- FOLD 1/4 ---


Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTModel: ['pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing ViTModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Probing Epoch 1: 100%|██████████| 140/140 [00:25<00:00,  5.57it/s]
Probing Epoch 2: 100%|██████████| 140/140 [00:24<00:00,  5.72it/s]
Probing Epoch 3: 100%|██████████| 140/140 [00:24<00:00,  5.78it/s]
Probing Epoch 4: 100%|██████████| 140/140 [00:24<00:00,  5.67it/s]
Probing Epoch 5: 100%|██████████| 140/140 [00:24<00:00,  5.72it/s]
Probing Epoch 6: 100%|██████████| 140/140 [00:24<00:0

Fold 1 Best Val Accuracy: 0.7582

--- FOLD 2/4 ---


Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTModel: ['pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing ViTModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Probing Epoch 1: 100%|██████████| 140/140 [00:24<00:00,  5.78it/s]
Probing Epoch 2: 100%|██████████| 140/140 [00:24<00:00,  5.80it/s]
Probing Epoch 3: 100%|██████████| 140/140 [00:24<00:00,  5.81it/s]
Probing Epoch 4: 100%|██████████| 140/140 [00:24<00:00,  5.80it/s]
Probing Epoch 5: 100%|██████████| 140/140 [00:23<00:00,  5.91it/s]
Probing Epoch 6: 100%|██████████| 140/140 [00:24<00:0

Fold 2 Best Val Accuracy: 0.7408

--- FOLD 3/4 ---


Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTModel: ['pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing ViTModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Probing Epoch 1: 100%|██████████| 140/140 [00:24<00:00,  5.72it/s]
Probing Epoch 2: 100%|██████████| 140/140 [00:24<00:00,  5.70it/s]
Probing Epoch 3: 100%|██████████| 140/140 [00:24<00:00,  5.72it/s]
Probing Epoch 4: 100%|██████████| 140/140 [00:24<00:00,  5.71it/s]
Probing Epoch 5: 100%|██████████| 140/140 [00:24<00:00,  5.71it/s]
Probing Epoch 6: 100%|██████████| 140/140 [00:24<00:0

Fold 3 Best Val Accuracy: 0.7495

--- FOLD 4/4 ---


Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTModel: ['pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing ViTModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Probing Epoch 1: 100%|██████████| 140/140 [00:24<00:00,  5.77it/s]
Probing Epoch 2: 100%|██████████| 140/140 [00:24<00:00,  5.69it/s]
Probing Epoch 3: 100%|██████████| 140/140 [00:24<00:00,  5.70it/s]
Probing Epoch 4: 100%|██████████| 140/140 [00:24<00:00,  5.71it/s]
Probing Epoch 5: 100%|██████████| 140/140 [00:24<00:00,  5.68it/s]
Probing Epoch 6: 100%|██████████| 140/140 [00:24<00:0

Fold 4 Best Val Accuracy: 0.7480


In [None]:


# --- PART C: ENSEMBLE PREDICTION ---
print("\n" + "="*60)
print("🏆 PART C: STARTING ENSEMBLE PREDICTION ON TEST SET")
print("="*60)

# 1. Prepare Test Data
# Collect the image files in sorted order so that prediction rows can be
# matched to file names later. The extension check is case-insensitive so
# that files such as "X.JPG" are not silently left out of the submission.
test_image_files = sorted(
    f for f in os.listdir(TEST_IMG_DIR)
    if f.lower().endswith(('.jpg', '.jpeg', '.png'))
)
# Fail early with a clear message instead of an obscure error during inference.
if not test_image_files:
    raise FileNotFoundError(f"No .jpg/.jpeg/.png images found in {TEST_IMG_DIR}")
class TestDataset(Dataset):
    """Unlabeled test images, returned together with their file names.

    Args:
        fnames: Sequence of image file names.
        img_dir: Directory the file names are relative to.
        processor: Callable invoked as ``processor(image, return_tensors="pt")``
            whose result provides a ``pixel_values`` entry.
    """
    def __init__(self, fnames, img_dir, processor):
        self.fnames = fnames
        self.img_dir = img_dir
        self.processor = processor

    def __len__(self):
        return len(self.fnames)

    def __getitem__(self, idx):
        """Return ``(pixel_values, file_name)`` for the image at ``idx``."""
        file_name = self.fnames[idx]
        rgb_image = Image.open(os.path.join(self.img_dir, file_name)).convert("RGB")
        encoded = self.processor(rgb_image, return_tensors="pt")
        # Drop the leading batch axis of one added by the processor.
        return encoded['pixel_values'].squeeze(0), file_name

test_dataset = TestDataset(test_image_files, TEST_IMG_DIR, processor)
# shuffle=False keeps the prediction rows in the order of `test_image_files`.
test_loader = DataLoader(test_dataset, batch_size=PROBE_BATCH_SIZE, shuffle=False)

# 2. Generate and Average Predictions
all_fold_probs = []
for fold in range(N_SPLITS):
    model = ClassificationModel(ADAPTED_BACKBONE_PATH, NUM_CLASSES).to(DEVICE)
    # map_location makes the checkpoint loadable on a runtime whose devices
    # differ from the one it was saved on (e.g. CPU-only inference).
    fold_state = torch.load(f"vit_classifier_fold_{fold+1}.pth", map_location=DEVICE)
    model.load_state_dict(fold_state)
    model.eval()

    fold_probs = []
    with torch.no_grad():
        for pixel_values, _ in tqdm(test_loader, desc=f"Predicting with Fold {fold+1}"):
            pixel_values = pixel_values.to(DEVICE)
            outputs = model(pixel_values)
            # Per-image class probabilities for this fold's model.
            probs = torch.softmax(outputs, dim=1)
            fold_probs.append(probs.cpu().numpy())
    all_fold_probs.append(np.concatenate(fold_probs, axis=0))

# Ensemble: average the class probabilities over the fold models.
avg_probs = np.mean(all_fold_probs, axis=0)

# Guard against writing a submission whose rows do not line up with the files.
if avg_probs.shape != (len(test_image_files), NUM_CLASSES):
    raise RuntimeError(
        f"Prediction shape {avg_probs.shape} does not match "
        f"{len(test_image_files)} test images x {NUM_CLASSES} classes"
    )

# 3. Save Submission File
submission_df = pd.DataFrame()
submission_df['image'] = test_image_files
for i, class_name in enumerate(CLASS_NAMES):
    submission_df[class_name] = avg_probs[:, i]

submission_df.to_csv(OUTPUT_CSV_PATH, index=False)
print(f"\n✅ Submission file saved to {OUTPUT_CSV_PATH}")
print("\nTop 5 rows of submission file:")
print(submission_df.head())


🏆 PART C: STARTING ENSEMBLE PREDICTION ON TEST SET


Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTModel: ['pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing ViTModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Predicting with Fold 1: 100%|██████████| 180/180 [00:39<00:00,  4.58it/s]
Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTModel: ['pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing ViTModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializin


✅ Submission file saved to submission_vit_mae_ensemble.csv

Top 5 rows of submission file:
        image      none  infection  ischaemia      both
0  501000.jpg  0.117010   0.745947   0.008810  0.128234
1  501001.jpg  0.007359   0.020343   0.638700  0.333597
2  501002.jpg  0.335097   0.658787   0.002314  0.003802
3  501003.jpg  0.383388   0.609733   0.002775  0.004104
4  501004.jpg  0.058074   0.190404   0.597071  0.154451





In [None]:
# --- PART C: ENSEMBLE PREDICTION ---
# NOTE(review): this cell contains the same code as the preceding Part C cell;
# running both regenerates the same submission file — consider keeping one.
print("\n" + "="*60)
print("🏆 PART C: STARTING ENSEMBLE PREDICTION ON TEST SET")
print("="*60)

# 1. Prepare Test Data
# Collect the image files in sorted order so that prediction rows can be
# matched to file names later. The extension check is case-insensitive so
# that files such as "X.JPG" are not silently left out of the submission.
test_image_files = sorted(
    f for f in os.listdir(TEST_IMG_DIR)
    if f.lower().endswith(('.jpg', '.jpeg', '.png'))
)
# Fail early with a clear message instead of an obscure error during inference.
if not test_image_files:
    raise FileNotFoundError(f"No .jpg/.jpeg/.png images found in {TEST_IMG_DIR}")
class TestDataset(Dataset):
    """Unlabeled test images, returned together with their file names.

    Args:
        fnames: Sequence of image file names.
        img_dir: Directory the file names are relative to.
        processor: Callable invoked as ``processor(image, return_tensors="pt")``
            whose result provides a ``pixel_values`` entry.
    """
    def __init__(self, fnames, img_dir, processor):
        self.fnames = fnames
        self.img_dir = img_dir
        self.processor = processor

    def __len__(self):
        return len(self.fnames)

    def __getitem__(self, idx):
        """Return ``(pixel_values, file_name)`` for the image at ``idx``."""
        file_name = self.fnames[idx]
        rgb_image = Image.open(os.path.join(self.img_dir, file_name)).convert("RGB")
        encoded = self.processor(rgb_image, return_tensors="pt")
        # Drop the leading batch axis of one added by the processor.
        return encoded['pixel_values'].squeeze(0), file_name

test_dataset = TestDataset(test_image_files, TEST_IMG_DIR, processor)
# shuffle=False keeps the prediction rows in the order of `test_image_files`.
test_loader = DataLoader(test_dataset, batch_size=PROBE_BATCH_SIZE, shuffle=False)

# 2. Generate and Average Predictions
all_fold_probs = []
for fold in range(N_SPLITS):
    model = ClassificationModel(ADAPTED_BACKBONE_PATH, NUM_CLASSES).to(DEVICE)
    # map_location makes the checkpoint loadable on a runtime whose devices
    # differ from the one it was saved on (e.g. CPU-only inference).
    fold_state = torch.load(f"vit_classifier_fold_{fold+1}.pth", map_location=DEVICE)
    model.load_state_dict(fold_state)
    model.eval()

    fold_probs = []
    with torch.no_grad():
        for pixel_values, _ in tqdm(test_loader, desc=f"Predicting with Fold {fold+1}"):
            pixel_values = pixel_values.to(DEVICE)
            outputs = model(pixel_values)
            # Per-image class probabilities for this fold's model.
            probs = torch.softmax(outputs, dim=1)
            fold_probs.append(probs.cpu().numpy())
    all_fold_probs.append(np.concatenate(fold_probs, axis=0))

# Ensemble: average the class probabilities over the fold models.
avg_probs = np.mean(all_fold_probs, axis=0)

# Guard against writing a submission whose rows do not line up with the files.
if avg_probs.shape != (len(test_image_files), NUM_CLASSES):
    raise RuntimeError(
        f"Prediction shape {avg_probs.shape} does not match "
        f"{len(test_image_files)} test images x {NUM_CLASSES} classes"
    )

# 3. Save Submission File
submission_df = pd.DataFrame()
submission_df['image'] = test_image_files
for i, class_name in enumerate(CLASS_NAMES):
    submission_df[class_name] = avg_probs[:, i]

submission_df.to_csv(OUTPUT_CSV_PATH, index=False)
print(f"\n✅ Submission file saved to {OUTPUT_CSV_PATH}")
print("\nTop 5 rows of submission file:")
print(submission_df.head())


🏆 PART C: STARTING ENSEMBLE PREDICTION ON TEST SET


Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTModel: ['pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing ViTModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Predicting with Fold 1: 100%|██████████| 180/180 [00:40<00:00,  4.48it/s]
Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTModel: ['pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing ViTModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializin


✅ Submission file saved to submission_vit_mae_ensemble.csv

Top 5 rows of submission file:
        image      none  infection  ischaemia      both
0  501000.jpg  0.117010   0.745947   0.008810  0.128234
1  501001.jpg  0.007359   0.020343   0.638700  0.333597
2  501002.jpg  0.335097   0.658787   0.002314  0.003802
3  501003.jpg  0.383388   0.609733   0.002775  0.004104
4  501004.jpg  0.058074   0.190404   0.597071  0.154451



