## CONFIG & HELPERS


In [None]:
# Install dependencies
%pip install --quiet torch torchvision torchaudio tensorboard scikit-learn matplotlib pandas

import os, math, random, numpy as np, pandas as pd
import torch, torchvision, torch.nn as nn, torch.nn.functional as F
from pathlib import Path
from typing import Dict, Tuple

def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Applies the same ``seed`` to Python's ``random`` module, NumPy's global
    generator, PyTorch's default generator and all CUDA generators, in that
    order.

    Args:
        seed: Integer seed handed to each generator (default 42).
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)


# Seed once at start-up so the cells that follow are repeatable.
set_seed(42)

# Pick the compute device: the GPU when PyTorch can see one, the CPU otherwise.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('Device:', device)

def get_output_shape(model: nn.Module, image_dim: Tuple[int, ...]) -> Tuple[int, ...]:
    """Return the shape of ``model``'s output for a random input of shape ``image_dim``.

    The model is switched to eval mode for the probe forward pass and gradients
    are disabled. Afterwards every sub-module gets back the train/eval flag it
    had before the call, so probing a model in the middle of training does not
    silently leave it in eval mode.

    Args:
        model: Module to probe. Its forward pass must return a tensor.
        image_dim: Full input shape, including the batch dimension.

    Returns:
        The output tensor's shape as a plain tuple of ints.
    """
    # Create the probe on the device the model actually lives on; the notebook-level
    # `device` is only a fallback for modules that own no parameters.
    first_param = next(model.parameters(), None)
    probe_device = first_param.device if first_param is not None else device

    # Remember each sub-module's flag individually: a blanket model.train(True)
    # afterwards would un-freeze layers that were deliberately kept in eval mode.
    previous_modes = [(module, module.training) for module in model.modules()]
    model.eval()
    try:
        with torch.no_grad():
            x = torch.randn(*image_dim, device=probe_device)
            out = model(x)
            return tuple(out.shape)
    finally:
        for module, was_training in previous_modes:
            module.training = was_training

def xyxy_to_valid(b: torch.Tensor) -> torch.Tensor:
    """Reorder box corners so that ``x1 <= x2`` and ``y1 <= y2``.

    Args:
        b: Tensor whose elements can be grouped into rows of four values
            ``[xa, ya, xb, yb]``; any shape whose element count is a multiple
            of 4 is accepted and flattened to ``(-1, 4)``.

    Returns:
        A float32 tensor of shape ``(N, 4)`` laid out as
        ``[min(xa, xb), min(ya, yb), max(xa, xb), max(ya, yb)]``.
    """
    # reshape instead of view: view raises on non-contiguous inputs (e.g. permuted
    # tensors), while reshape copies only when it has to.
    boxes = b.float().reshape(-1, 4)
    first_corner, second_corner = boxes[:, :2], boxes[:, 2:]
    top_left = torch.minimum(first_corner, second_corner)
    bottom_right = torch.maximum(first_corner, second_corner)
    return torch.cat([top_left, bottom_right], dim=1)

def clamp_01(b: torch.Tensor) -> torch.Tensor:
    """Return a copy of ``b`` with every element limited to the range [0, 1]."""
    return torch.clamp(b, min=0.0, max=1.0)


## DATA PREPARATION & EXPLORATION


In [None]:
# Load dataset
# Read the training annotations CSV, then show its size, first rows and the
# number of samples per value of the 'class' column.
data_path = Path('fa-ii-2025-ii-object-localization') / 'train.csv'
df = pd.read_csv(data_path)
print('Samples:', df.shape[0])
display(df.head())
print(df['class'].value_counts())


## DATA NORMALIZATION & TRANSFORMATIONS


In [None]:
from sklearn.model_selection import train_test_split
from PIL import Image
import numpy as np
from tqdm.auto import tqdm
import albumentations as A
from albumentations.pytorch import ToTensorV2
from torch.utils.data import Dataset, DataLoader

# train/validation split
# 80/20 split stratified on the 'class' column so both subsets keep the same
# class proportions; the fixed random_state makes the split repeatable.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['class'],
    random_state=42,
)
print(f'Train: {len(train_df)} | Val: {len(val_df)}')

# compute normalization stats on training subset only
img_dir = Path('fa-ii-2025-ii-object-localization/images')

def compute_stats(sub_df, root_dir=None):
    """Compute per-channel mean and standard deviation of a set of RGB images.

    Each file named in ``sub_df['filename']`` is opened, converted to RGB and
    scaled to [0, 1] by dividing by 255. Sums and squared sums are accumulated
    over all pixels of all images, so larger images weigh proportionally more.

    Args:
        sub_df: DataFrame with a ``'filename'`` column of paths relative to the
            image folder.
        root_dir: Folder that contains the images. Defaults to the
            notebook-level ``img_dir``.

    Returns:
        ``(mean, std)``: two float64 arrays of length 3, one value per RGB channel.

    Raises:
        ValueError: If ``sub_df`` lists no images (or only empty ones), because
            the statistics would otherwise silently come out as NaN.
    """
    root = img_dir if root_dir is None else Path(root_dir)
    # Accumulate in float64: float32 sums over hundreds of thousands of pixels
    # lose enough precision to distort the variance.
    sums = np.zeros(3, dtype=np.float64)
    sqs = np.zeros(3, dtype=np.float64)
    n_pix = 0
    for fname in tqdm(sub_df['filename'], desc='compute stats'):
        # The context manager closes the file even if decoding fails.
        with Image.open(root / fname) as im:
            pixels = np.asarray(im.convert('RGB'), dtype=np.float64).reshape(-1, 3) / 255.0
        sums += pixels.sum(axis=0)
        sqs += np.square(pixels).sum(axis=0)
        n_pix += pixels.shape[0]
    if n_pix == 0:
        raise ValueError('compute_stats needs at least one non-empty image')
    mean = sums / n_pix
    # E[x^2] - E[x]^2 can dip just below zero through rounding; clamp before the sqrt
    # so a (near-)constant channel yields 0 instead of NaN.
    variance = np.maximum(sqs / n_pix - mean**2, 0.0)
    std = np.sqrt(variance)
    return mean, std

mean, std = compute_stats(train_df)
print('mean:', mean, 'std:', std)

# augmentations
# Both pipelines resize to a fixed h x w, normalize with the statistics computed
# above and convert to a tensor. Only the training pipeline adds random
# augmentation (horizontal flip, colour jitter). Boxes are passed in
# 'pascal_voc' format together with a 'class_id' label field.
h, w = 256, 256
train_transforms = A.Compose(
    [
        A.Resize(height=h, width=w),
        A.HorizontalFlip(p=0.5),
        A.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1, p=0.5),
        A.Normalize(mean=mean, std=std),
        ToTensorV2(),
    ],
    bbox_params=A.BboxParams(format='pascal_voc', label_fields=['class_id']),
)

eval_transforms = A.Compose(
    [
        A.Resize(height=h, width=w),
        A.Normalize(mean=mean, std=std),
        ToTensorV2(),
    ],
    bbox_params=A.BboxParams(format='pascal_voc', label_fields=['class_id']),
)

class MaskDataset(Dataset):
    """Single-box localization dataset backed by an annotation DataFrame.

    Each row supplies the columns ``filename``, ``xmin``, ``ymin``, ``xmax``,
    ``ymax`` and ``class``. The class label is encoded as 1 when ``class`` equals
    ``'mask'`` and 0 for any other value.
    """

    def __init__(self, df, root_dir, transform=None):
        """Store the annotations (re-indexed from 0), image folder and transform."""
        self.transform = transform
        self.root_dir = Path(root_dir)
        self.df = df.reset_index(drop=True)

    def __len__(self):
        """Return the number of annotated samples."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Load sample ``idx`` and return a dict with 'image', 'bbox' and 'class_id'.

        'image' is the RGB image as a NumPy array when no transform is set,
        otherwise whatever the transform returns under 'image'. 'bbox' is a
        float32 tensor ``[xmin, ymin, xmax, ymax]`` and 'class_id' a long tensor.
        """
        record = self.df.iloc[idx]
        image_path = self.root_dir / record['filename']
        image = np.array(Image.open(image_path).convert('RGB'))
        bbox = [record[column] for column in ('xmin', 'ymin', 'xmax', 'ymax')]
        class_id = int(record['class'] == 'mask')

        if self.transform:
            # The transform takes lists of boxes/labels; this dataset has exactly one of each.
            augmented = self.transform(image=image, bboxes=[bbox], class_id=[class_id])
            image = augmented['image']
            bbox = augmented['bboxes'][0]
            class_id = augmented['class_id'][0]

        bbox_tensor = torch.tensor(bbox, dtype=torch.float32)
        label_tensor = torch.tensor(class_id, dtype=torch.long)
        return {'image': image, 'bbox': bbox_tensor, 'class_id': label_tensor}

# Data loaders: both read images from the same folder. Only the training loader
# shuffles; pinned memory is requested only when running on the GPU.
batch_size = 16
train_root_dir = img_dir
train_data = DataLoader(
    dataset=MaskDataset(df=train_df, root_dir=train_root_dir, transform=train_transforms),
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
    pin_memory=device == 'cuda',
)
val_data = DataLoader(
    dataset=MaskDataset(df=val_df, root_dir=train_root_dir, transform=eval_transforms),
    batch_size=batch_size,
    shuffle=False,
    num_workers=0,
    pin_memory=device == 'cuda',
)
