In [1]:
import os
import os.path as osp
import zipfile
import csv
from PIL import Image
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import copy
import time
import random
from collections import Counter
import warnings

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split, WeightedRandomSampler, Sampler 
from torchvision import transforms as T 
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

from torch.optim.lr_scheduler import OneCycleLR, CosineAnnealingWarmRestarts
from torch.amp import autocast, GradScaler


In [2]:
train_dir = "/kaggle/input/unipd-deep-learning-2025-challenge-1/train_dataset"
test_dir  = "/kaggle/input/unipd-deep-learning-2025-challenge-1/test_dataset"


mean = [0.5131654143333435, 0.46465885639190674, 0.4044109880924225]
std = [0.2625938653945923, 0.25588247179985046, 0.27475273609161377]

In [3]:
class ImageDataset(Dataset):
    def __init__(self, root: str, test: bool=False, transform=None):
        super().__init__()
        self.root = root
        self.test = test
        self.transform = transform
        if osp.exists(osp.join(root,'images')):
             self.img_path = osp.join(root,'images')
        elif osp.exists(osp.join(root, osp.basename(root))): 
             self.img_path = osp.join(root, osp.basename(root))
        else:
             self.img_path = root

        self.ids, self.targets = [], []

        if not test:
            label_file = osp.join(root,'labels.csv')
            if not osp.exists(label_file) and osp.exists(osp.join(osp.dirname(root), 'labels.csv')):
                 label_file = osp.join(osp.dirname(root), 'labels.csv')

            with open(label_file) as f:
                reader = csv.DictReader(f)
                for r in reader:
                    self.ids.append(r['id'].zfill(5))
                    self.targets.append(int(r['label']))
        else:
            image_files = [fn for fn in sorted(os.listdir(self.img_path)) if fn.lower().endswith('.jpeg')]
            print(f"Found {len(image_files)} test images in {self.img_path}")
            for fn in image_files:
                 self.ids.append(osp.splitext(fn)[0])


    def __len__(self):
        return len(self.ids)

    def __getitem__(self, idx):
        img_id = self.ids[idx]
        img = Image.open(osp.join(self.img_path,f"{img_id}.jpeg")).convert('RGB')

        if self.transform:
            img = self.transform(img)

        return (img, img_id) if self.test else (img, self.targets[idx])

class BalancedSampler(Sampler):
    def __init__(self, dataset, target_count):
        self.dataset = dataset
        self.target_count = target_count

        if hasattr(dataset, 'indices') and hasattr(dataset.dataset, 'targets'):
            original_targets = np.array(dataset.dataset.targets)
            self.labels = original_targets[dataset.indices].tolist()
        else: 
            self.labels = [dataset[i][1] for i in range(len(dataset))]


        self.class_indices = {}
        for subset_idx, label in enumerate(self.labels):
            self.class_indices.setdefault(label, []).append(subset_idx) 

        self.balanced_indices = []
        for label, indices in self.class_indices.items():
            current_count = len(indices)

            if current_count == 0: continue 

            if current_count < target_count:
                oversample_factor = target_count // current_count
                remainder = target_count % current_count
                self.balanced_indices.extend(indices * oversample_factor)
                self.balanced_indices.extend(random.sample(indices, k=remainder))
            else:
                self.balanced_indices.extend(random.sample(indices, k=target_count))

        random.shuffle(self.balanced_indices)
        print(f"BalancedSampler: Created {len(self.balanced_indices)} indices targeting {target_count} per class.")


    def __iter__(self):
        return iter(self.balanced_indices)

    def __len__(self):
        return len(self.balanced_indices)

class TransformedSubset(Dataset):
    def __init__(self, subset, transform=None):
        self.subset = subset
        self.transform = transform

    def __getitem__(self, index):
        x, y = self.subset[index] 
        if self.transform:
            x = self.transform(x)
        return x, y

    def __len__(self):
        return len(self.subset)


In [4]:
INPUT_SIZE = 96

train_transform = T.Compose([
    T.Resize((INPUT_SIZE + 30, INPUT_SIZE + 30)),
    T.RandomResizedCrop(INPUT_SIZE, scale=(0.5, 1.0)),  
    T.RandomHorizontalFlip(p=0.6),
    T.RandomVerticalFlip(p=0.2),      
    T.RandomRotation(30),
    T.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1),
    T.ToTensor(),
    T.Normalize(mean=mean, std=std)
])


val_transform = T.Compose([
    T.Resize((INPUT_SIZE, INPUT_SIZE)), 
    T.ToTensor(),
    T.Normalize(mean=mean, std=std)])

base_dataset = ImageDataset(root=train_dir, transform=None)

NUM_CLASSES = len(set(base_dataset.targets))
print(f"Number of classes detected: {NUM_CLASSES}")


Number of classes detected: 20


In [5]:
train_size = int(0.80 * len(base_dataset)) 
val_size = len(base_dataset) - train_size
train_subset, val_subset = random_split(base_dataset, [train_size, val_size])

train_dataset_transformed = TransformedSubset(train_subset, transform=train_transform)
val_dataset_transformed = TransformedSubset(val_subset, transform=val_transform)
label_counts = Counter(base_dataset.targets)
print("Class distribution in full dataset:", label_counts)

target_count_sampler = 1800 
print(f"Using target_count={target_count_sampler} for BalancedSampler.")
train_sampler = BalancedSampler(train_subset, target_count=target_count_sampler)

BATCH_SIZE = 128

train_loader = DataLoader(
    train_dataset_transformed,
    batch_size=BATCH_SIZE,
    sampler=train_sampler, 
    num_workers=2,
    pin_memory=True, 
    drop_last=True 
)

val_loader = DataLoader(
    val_dataset_transformed,
    batch_size=BATCH_SIZE * 2, 
    shuffle=False, 
    num_workers=2,
    pin_memory=True
)

test_ds = ImageDataset(test_dir, test=True, transform=val_transform) 
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE * 2, shuffle=False, num_workers=2)

Class distribution in full dataset: Counter({11: 1300, 10: 1300, 9: 1300, 3: 1300, 8: 1300, 1: 1300, 0: 1300, 4: 1300, 17: 1300, 14: 1300, 12: 1300, 6: 1300, 2: 1300, 18: 1300, 19: 760, 13: 756, 5: 755, 16: 751, 7: 658, 15: 550})
Using target_count=1800 for BalancedSampler.
BalancedSampler: Created 36000 indices targeting 1800 per class.
Found 4000 test images in /kaggle/input/unipd-deep-learning-2025-challenge-1/test_dataset/images


In [6]:
class ImagerClassifier(nn.Module):
    def __init__(self, num_classes, dropout_rate=0.2):  
        super().__init__()
        self.features = nn.Sequential(

            nn.Conv2d(3, 64, 3, padding=1),
            nn.BatchNorm2d(64),
            nn.GELU(),
            nn.Conv2d(64, 64, 3, padding=1),
            nn.BatchNorm2d(64),
            nn.GELU(),
            nn.MaxPool2d(2),
            

            nn.Conv2d(64, 128, 3, padding=1),
            nn.BatchNorm2d(128),
            nn.GELU(),
            nn.Conv2d(128, 128, 3, padding=1),
            nn.BatchNorm2d(128),
            nn.GELU(),
            nn.MaxPool2d(2),

            nn.Conv2d(128, 256, 3, padding=1),
            nn.BatchNorm2d(256),
            nn.GELU(),
            nn.Conv2d(256, 256, 3, padding=1),
            nn.BatchNorm2d(256),
            nn.GELU(),
            nn.MaxPool2d(2),
            

            nn.AdaptiveAvgPool2d(1)
        )
        
        self.classifier = nn.Sequential(
            nn.Linear(256, 512),
            nn.BatchNorm1d(512),
            nn.GELU(),
            nn.Dropout(dropout_rate),
            nn.Linear(512, num_classes)
        )

    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x


In [7]:
def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs, device,
                patience=10, best_model_path='best_model.pth', use_amp=True):

    model.to(device)
    scaler = GradScaler(enabled=use_amp) 

    best_val_accuracy = 0.0
    epochs_no_improve = 0
    best_model_weights = copy.deepcopy(model.state_dict()) 
    history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}

    total_steps = len(train_loader) * num_epochs 

    start_time = time.time()

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        correct_predictions = 0
        total_samples = 0

        for i, (inputs, labels) in enumerate(train_loader):
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()

            with autocast(device_type=device.type, enabled=use_amp):
                outputs = model(inputs)
                loss = criterion(outputs, labels)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            if isinstance(scheduler, OneCycleLR):
                 scheduler.step() 

            running_loss += loss.item() * inputs.size(0)
            _, predicted = torch.max(outputs.data, 1)
            total_samples += labels.size(0)
            correct_predictions += (predicted == labels).sum().item()

        epoch_train_loss = running_loss / total_samples
        epoch_train_acc = correct_predictions / total_samples
        history['train_loss'].append(epoch_train_loss)
        history['train_acc'].append(epoch_train_acc)

        #Validation
        model.eval()
        running_val_loss = 0.0
        correct_val_predictions = 0
        total_val_samples = 0

        with torch.no_grad(): 
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)

                with autocast(device_type=device.type, enabled=use_amp):
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)

                running_val_loss += loss.item() * inputs.size(0)
                _, predicted = torch.max(outputs.data, 1)
                total_val_samples += labels.size(0)
                correct_val_predictions += (predicted == labels).sum().item()

        epoch_val_loss = running_val_loss / total_val_samples
        epoch_val_acc = correct_val_predictions / total_val_samples
        history['val_loss'].append(epoch_val_loss)
        history['val_acc'].append(epoch_val_acc)

        print(f"Epoch [{epoch+1}/{num_epochs}] - "
              f"Train Loss: {epoch_train_loss:.4f}, Train Acc: {epoch_train_acc:.4f} - "
              f"Val Loss: {epoch_val_loss:.4f}, Val Acc: {epoch_val_acc:.4f} - "
              f"LR: {optimizer.param_groups[0]['lr']:.6f}") 

        #Early Stopping
        if epoch_val_acc > best_val_accuracy:
            print(f"Validation accuracy improved ({best_val_accuracy:.4f} --> {epoch_val_acc:.4f}). Saving model...")
            best_val_accuracy = epoch_val_acc
            best_model_weights = copy.deepcopy(model.state_dict())
            torch.save(best_model_weights, best_model_path)
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1
            print(f"Validation accuracy did not improve for {epochs_no_improve} epoch(s).")
            if epochs_no_improve >= patience:
                print(f"Early stopping triggered after {epoch + 1} epochs.")
                break

    end_time = time.time()
    print(f"Training finished in {(end_time - start_time)/60:.2f} minutes.")
    print(f"Best validation accuracy: {best_val_accuracy:.4f}")

    model.load_state_dict(torch.load(best_model_path)) 
    return model, history 


In [8]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")
labels = base_dataset.targets
class_weights = compute_class_weight('balanced', classes=np.unique(labels), y=labels)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(DEVICE)

Using device: cuda


In [9]:
LEARNING_RATE = 2e-4  
WEIGHT_DECAY = 1e-4    
EPOCHS = 100   
PATIENCE = 20   
BEST_MODEL_PATH = 'large_cnn_best_model.pth'
USE_AMP = torch.cuda.is_available() 

model = ImagerClassifier(num_classes=NUM_CLASSES, dropout_rate=0.3).to(DEVICE)
criterion = nn.CrossEntropyLoss(weight=class_weights,label_smoothing=0.1)
optimizer = torch.optim.AdamW(model.parameters(),lr=LEARNING_RATE,weight_decay=WEIGHT_DECAY,betas=(0.9, 0.999))
scheduler = OneCycleLR(optimizer,max_lr=LEARNING_RATE*4,epochs=EPOCHS,steps_per_epoch=len(train_loader),pct_start=0.4,anneal_strategy='linear')
best_model, history = train_model(
    model,
    train_loader,
    val_loader,
    criterion,
    optimizer,
    scheduler, 
    num_epochs=EPOCHS,
    device=DEVICE,
    patience=PATIENCE,
    best_model_path=BEST_MODEL_PATH,
    use_amp=USE_AMP)


Epoch [1/100] - Train Loss: 2.5826, Train Acc: 0.1906 - Val Loss: 2.4650, Val Acc: 0.2218 - LR: 0.000051
Validation accuracy improved (0.0000 --> 0.2218). Saving model...
Epoch [2/100] - Train Loss: 2.2868, Train Acc: 0.2859 - Val Loss: 2.2815, Val Acc: 0.2922 - LR: 0.000070
Validation accuracy improved (0.2218 --> 0.2922). Saving model...
Epoch [3/100] - Train Loss: 2.1231, Train Acc: 0.3390 - Val Loss: 2.2892, Val Acc: 0.3304 - LR: 0.000090
Validation accuracy improved (0.2922 --> 0.3304). Saving model...
Epoch [4/100] - Train Loss: 2.0308, Train Acc: 0.3728 - Val Loss: 2.1176, Val Acc: 0.3819 - LR: 0.000109
Validation accuracy improved (0.3304 --> 0.3819). Saving model...
Epoch [5/100] - Train Loss: 1.9672, Train Acc: 0.3976 - Val Loss: 2.1080, Val Acc: 0.3816 - LR: 0.000128
Validation accuracy did not improve for 1 epoch(s).
Epoch [6/100] - Train Loss: 1.8980, Train Acc: 0.4317 - Val Loss: 2.2780, Val Acc: 0.3437 - LR: 0.000147
Validation accuracy did not improve for 2 epoch(s).
Ep

  model.load_state_dict(torch.load(best_model_path))


In [10]:
def evaluate_model(model, dataloader, device):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    print("\nValidation Set Evaluation (Best Model):")
    print(classification_report(all_labels, all_preds, digits=4))
    print("Confusion Matrix:")
    print(confusion_matrix(all_labels, all_preds))
    print(f"Overall Accuracy: {accuracy_score(all_labels, all_preds):.4f}")

evaluate_model(best_model, val_loader, DEVICE)


def predict_test(model, test_loader, device):
    model.eval()
    predictions = []
    image_ids = []
    with torch.no_grad():
        for inputs, ids in test_loader:
            inputs = inputs.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            predictions.extend(predicted.cpu().numpy())
            image_ids.extend(ids) 

    return image_ids, predictions



Validation Set Evaluation (Best Model):
              precision    recall  f1-score   support

           0     0.5042    0.4651    0.4839       258
           1     0.8233    0.7100    0.7625       269
           2     0.8756    0.8678    0.8717       227
           3     0.5639    0.5155    0.5386       291
           4     0.8584    0.7968    0.8264       251
           5     0.8276    0.8108    0.8191       148
           6     0.8760    0.8309    0.8528       272
           7     0.6294    0.6667    0.6475       135
           8     0.7199    0.7808    0.7491       260
           9     0.8268    0.8268    0.8268       254
          10     0.7449    0.7572    0.7510       243
          11     0.7022    0.6529    0.6767       242
          12     0.5447    0.5426    0.5437       258
          13     0.6716    0.8405    0.7466       163
          14     0.7992    0.7962    0.7977       265
          15     0.5282    0.7143    0.6073       105
          16     0.8571    0.8790    0.8

In [11]:
test_ids, test_preds = predict_test(best_model, test_loader, DEVICE)

submission_df = pd.DataFrame({'id': test_ids, 'label': test_preds})
submission_df.to_csv('submission.csv', index=False)
print(f"Number of predictions: {len(submission_df)}")
print(submission_df.head())

Number of predictions: 4000
      id  label
0  22430     16
1  22431      8
2  22432     10
3  22433     18
4  22434      8
