In [1]:
# Install required packages and verify GPU
import sys, subprocess, importlib, os, glob, pandas as pd

def ensure(pkg, import_name=None, extra=None):
    imp = import_name or pkg
    try:
        importlib.import_module(imp)
        print(f"OK: {pkg}")
    except Exception:
        cmd = [sys.executable, '-m', 'pip', 'install', pkg] + (extra or [])
        print('Installing', pkg, '...')
        subprocess.check_call(cmd)
        importlib.import_module(imp)
        print(f"Installed: {pkg}")

ensure('torch')
ensure('torchvision')
ensure('timm')
ensure('albumentations')
ensure('opencv-python', import_name='cv2')
ensure('scikit-learn', import_name='sklearn')

import torch
print('GPU Available:', torch.cuda.is_available())
print('GPU Count:', torch.cuda.device_count())
if torch.cuda.is_available():
    print('GPU Name:', torch.cuda.get_device_name(0))
    props = torch.cuda.get_device_properties(0)
    print('GPU Memory (GB):', round(props.total_memory/1024**3, 2))

# Quick dataset sanity checks
train_imgs = glob.glob('train/*.jpg')
test_imgs = glob.glob('test/*.jpg')
print('Train images:', len(train_imgs))
print('Test images:', len(test_imgs))

labels = pd.read_csv('labels.csv')
print('Labels shape:', labels.shape)
print('Unique breeds:', labels['breed'].nunique())
print(labels.head())

ss = pd.read_csv('sample_submission.csv')
print('Sample submission shape:', ss.shape)
print(ss.head())

OK: torch


OK: torchvision


  from .autonotebook import tqdm as notebook_tqdm


OK: timm


OK: albumentations
OK: opencv-python


OK: scikit-learn
GPU Available: True
GPU Count: 1
GPU Name: Tesla V100-SXM2-16GB
GPU Memory (GB): 15.77
Train images: 9199
Test images: 1023
Labels shape: (9199, 2)
Unique breeds: 120
                                 id                        breed
0  8406d837b2d7fac1c3cd621abb4c4f9e  west_highland_white_terrier
1  e270622b5ffec8294d7e7628c4ff6c1e             brittany_spaniel
2  41295c36303043fc587e791b14ef2272                       basset
3  b63b0200ddbb97df81972b26574959ab                        boxer
4  2c64e362c9aa29450082291264dcba29        flat-coated_retriever
Sample submission shape: (1023, 121)
                                 id  affenpinscher  afghan_hound  \
0  9f68d045a396679a778eb54c5ed29038       0.008333      0.008333   
1  f375e6363bc21dcd3cb65637c7855e9c       0.008333      0.008333   
2  010e87fdf252645a827e37470e65e842       0.008333      0.008333   
3  ad2dfa0202d8ea3766fea1e743cd5166       0.008333      0.008333   
4  a579a1802c57cfbc31b79781f6f37a39       0.00833

In [9]:
# Dog Breed Identification - Improved training with ConvNeXt + Mixup + AMP + StratifiedKFold
import os, gc, random, time, math, json, shutil
from pathlib import Path
import numpy as np
import pandas as pd
import cv2
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import StratifiedKFold
import timm
import albumentations as A
from albumentations.pytorch import ToTensorV2
from timm.data import Mixup
from timm.loss import SoftTargetCrossEntropy
from timm.utils import ModelEmaV2

torch.backends.cudnn.benchmark = True

# ========== Config ========== 
class CFG:
    seed = 42
    img_size = 448
    n_folds = 5
    epochs = 3  # smoke test here; full 5-fold cell will set higher
    train_bs = 8
    valid_bs = 16
    num_workers = 8
    model_name = 'convnext_base.fb_in22k_ft_in1k'
    lr = 1e-4
    weight_decay = 1e-4
    label_smoothing = 0.0  # disable when using mixup
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tta = True
    tta_hflip = True
    out_dir = 'outputs'
    mixup_alpha = 0.2
    cutmix_alpha = 0.0
    mixup_prob = 0.3
    mixup_switch_prob = 0.0
    warmup_epochs = 3
    ema_decay = 0.9998
    grad_clip = 1.0
    early_stop_patience = 6

os.makedirs(CFG.out_dir, exist_ok=True)

# ========== Utils ========== 
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
set_seed(CFG.seed)

def read_data():
    labels = pd.read_csv('labels.csv')
    ss = pd.read_csv('sample_submission.csv')
    # label order must match sample_submission columns (excluding id)
    class_names = [c for c in ss.columns if c != 'id']
    class2idx = {c:i for i,c in enumerate(class_names)}
    idx2class = {i:c for c,i in class2idx.items()}
    labels['filepath'] = labels['id'].apply(lambda x: f'train/{x}.jpg')
    labels['target'] = labels['breed'].map(class2idx)
    assert labels['target'].notnull().all(), 'Found breed not in sample_submission columns order'
    # Folds
    skf = StratifiedKFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.seed)
    labels['fold'] = -1
    for f, (_, val_idx) in enumerate(skf.split(labels, labels['breed'])):
        labels.loc[val_idx, 'fold'] = f
    return labels, ss, class_names, class2idx, idx2class

train_tfms = A.Compose([
    A.RandomResizedCrop(size=(CFG.img_size, CFG.img_size), scale=(0.85, 1.0), ratio=(0.75, 1.333), p=1.0),
    A.HorizontalFlip(p=0.5),
    A.Affine(scale=(0.95, 1.05), translate_percent=(0.0, 0.02), rotate=(-10, 10), shear=(-2, 2), fit_output=False, border_mode=cv2.BORDER_REFLECT_101, p=0.5),
    A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.5),
    A.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.05, p=0.5),
    A.Normalize(mean=(0.485,0.456,0.406), std=(0.229,0.224,0.225)),
    ToTensorV2(),
])

valid_tfms = A.Compose([
    A.Resize(height=CFG.img_size, width=CFG.img_size),
    A.Normalize(mean=(0.485,0.456,0.406), std=(0.229,0.224,0.225)),
    ToTensorV2(),
])

class DogDataset(Dataset):
    def __init__(self, df, transforms=None):
        self.df = df.reset_index(drop=True)
        self.transforms = transforms
        self.is_test = 'target' not in self.df.columns
    def __len__(self):
        return len(self.df)
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img = cv2.imread(row['filepath'])
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        if self.transforms:
            img = self.transforms(image=img)['image']
        if self.is_test:
            return img, row['id']
        else:
            return img, int(row['target'])

def build_model(num_classes):
    model = timm.create_model(CFG.model_name, pretrained=True, num_classes=num_classes)
    return model

def _maybe_channels_last(x):
    return x.to(memory_format=torch.channels_last)

def train_one_fold(fold, labels, num_classes):
    print(f'\n===== Fold {fold} =====')
    trn_df = labels[labels.fold != fold][['filepath','target']].copy()
    val_df = labels[labels.fold == fold][['filepath','target']].copy()

    train_ds = DogDataset(trn_df, transforms=train_tfms)
    valid_ds = DogDataset(val_df, transforms=valid_tfms)

    train_loader = DataLoader(train_ds, batch_size=CFG.train_bs, shuffle=True, num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_ds, batch_size=CFG.valid_bs, shuffle=False, num_workers=CFG.num_workers, pin_memory=True)

    model = build_model(num_classes).to(CFG.device)
    ema = ModelEmaV2(model, decay=CFG.ema_decay) if CFG.device=='cuda' else None

    # Mixup config + losses
    mixup_base = CFG.mixup_prob if (CFG.mixup_alpha > 0 or CFG.cutmix_alpha > 0) else 0.0
    mixup_fn = Mixup(
        mixup_alpha=CFG.mixup_alpha, cutmix_alpha=CFG.cutmix_alpha, prob=mixup_base,
        switch_prob=CFG.mixup_switch_prob, mode='batch', num_classes=num_classes
    ) if mixup_base > 0 else None

    criterion_hard = nn.CrossEntropyLoss(label_smoothing=CFG.label_smoothing).to(CFG.device)
    criterion_soft = SoftTargetCrossEntropy().to(CFG.device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=CFG.lr, weight_decay=CFG.weight_decay)
    # Warmup + Cosine
    warmup_epochs = min(CFG.warmup_epochs, max(1, CFG.epochs//5))
    cosine_epochs = max(1, CFG.epochs - warmup_epochs)
    cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=cosine_epochs)
    warmup = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=0.1, total_iters=warmup_epochs)
    scheduler = torch.optim.lr_scheduler.SequentialLR(optimizer, schedulers=[warmup, cosine], milestones=[warmup_epochs])
    scaler = torch.amp.GradScaler('cuda', enabled=(CFG.device=='cuda'))

    best_val = 1e9
    best_path = os.path.join(CFG.out_dir, f'fold{fold}_best.pth')
    no_improve = 0

    for epoch in range(1, CFG.epochs+1):
        model.train()
        running_loss = 0.0
        n = 0
        t0 = time.time()

        # Mixup prob decays linearly to 0 by 90% of total epochs
        if mixup_fn is not None:
            frac = (epoch - 1) / max(1, CFG.epochs * 0.9)
            decay = max(0.0, 1.0 - min(1.0, frac))
            mixup_fn.mixup_prob = mixup_base * decay

        for imgs, targets in train_loader:
            imgs = imgs.to(CFG.device, non_blocking=True)
            imgs = _maybe_channels_last(imgs)
            targets = targets.to(CFG.device, non_blocking=True)
            if mixup_fn is not None and mixup_fn.mixup_prob > 0:
                imgs, targets = mixup_fn(imgs, targets)
            optimizer.zero_grad(set_to_none=True)
            with torch.amp.autocast('cuda', enabled=(CFG.device=='cuda')):
                logits = model(imgs)
                if targets.dtype.is_floating_point:
                    loss = criterion_soft(logits, targets)
                else:
                    loss = criterion_hard(logits, targets)
            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            if CFG.grad_clip is not None and CFG.grad_clip > 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.grad_clip)
            scaler.step(optimizer)
            scaler.update()
            if ema is not None:
                ema.update(model)
            running_loss += loss.item() * imgs.size(0)
            n += imgs.size(0)
        train_loss = running_loss / max(1,n)

        # validation (strict CE, no smoothing)
        model.eval()
        val_loss = 0.0
        m = 0
        val_criterion = nn.CrossEntropyLoss().to(CFG.device)
        with torch.no_grad(), torch.amp.autocast('cuda', enabled=(CFG.device=='cuda')):
            for imgs, targets in valid_loader:
                imgs = imgs.to(CFG.device, non_blocking=True)
                imgs = _maybe_channels_last(imgs)
                targets = targets.to(CFG.device, non_blocking=True)
                logits = ema.module(imgs) if ema is not None else model(imgs)
                loss = val_criterion(logits, targets)
                val_loss += loss.item() * imgs.size(0)
                m += imgs.size(0)
        val_loss /= max(1,m)
        scheduler.step()
        dt = time.time()-t0
        print(f'Epoch {epoch}/{CFG.epochs} - train {train_loss:.4f} - val {val_loss:.4f} - time {dt:.1f}s - mixup_p {mixup_fn.mixup_prob if mixup_fn else 0:.2f}')
        if val_loss < best_val:
            best_val = val_loss
            to_save = ema.module.state_dict() if ema is not None else model.state_dict()
            torch.save({'model': to_save}, best_path)
            no_improve = 0
        else:
            no_improve += 1
            if no_improve >= CFG.early_stop_patience:
                print('Early stopping triggered')
                break

    # Load best and produce OOF preds
    ckpt = torch.load(best_path, map_location=CFG.device)
    model.load_state_dict(ckpt['model'], strict=False)
    model.eval()
    oof_logits = []
    with torch.no_grad(), torch.amp.autocast('cuda', enabled=(CFG.device=='cuda')):
        for imgs, targets in valid_loader:
            imgs = imgs.to(CFG.device, non_blocking=True)
            imgs = _maybe_channels_last(imgs)
            logits = model(imgs)
            oof_logits.append(logits.detach().cpu().float())
    oof_logits = torch.cat(oof_logits, dim=0).numpy()

    # Cleanup
    del model, optimizer, scaler
    if 'ema' in locals() and ema is not None:
        del ema
    torch.cuda.empty_cache()
    gc.collect()

    return best_path, oof_logits, val_df.index.values

def predict_test_single_model(ckpt_path, num_classes):
    ss = pd.read_csv('sample_submission.csv')
    test_df = ss[['id']].copy()
    test_df['filepath'] = test_df['id'].apply(lambda x: f'test/{x}.jpg')
    test_ds = DogDataset(test_df[['id','filepath']], transforms=valid_tfms)
    test_loader = DataLoader(test_ds, batch_size=CFG.valid_bs, shuffle=False, num_workers=CFG.num_workers, pin_memory=True)
    model = build_model(num_classes).to(CFG.device)
    ckpt = torch.load(ckpt_path, map_location=CFG.device)
    model.load_state_dict(ckpt['model'], strict=False)
    model.eval()
    all_logits = []
    with torch.no_grad(), torch.amp.autocast('cuda', enabled=(CFG.device=='cuda')):
        for imgs, _ids in test_loader:
            imgs = imgs.to(CFG.device, non_blocking=True)
            imgs = _maybe_channels_last(imgs)
            logits = model(imgs)
            if CFG.tta and CFG.tta_hflip:
                imgs_flipped = torch.flip(imgs, dims=[3])
                logits_f = model(imgs_flipped)
                logits = (logits + logits_f) / 2.0
            all_logits.append(logits.detach().cpu().float())
    all_logits = torch.cat(all_logits, dim=0)
    probs = torch.softmax(all_logits, dim=1).numpy()
    # Cleanup
    del model
    torch.cuda.empty_cache()
    gc.collect()
    return probs

def save_submission(test_probs, class_names):
    ss = pd.read_csv('sample_submission.csv')
    probs = np.clip(test_probs, 1e-7, 1-1e-7)
    probs = probs / probs.sum(axis=1, keepdims=True)
    sub = pd.DataFrame(probs, columns=class_names)
    sub.insert(0, 'id', ss['id'])
    sub.to_csv('submission.csv', index=False)
    print('Saved submission.csv with shape', sub.shape)

# ========== Run quick sanity: Train fold 0 only (can be skipped for full CV cell) ==========
labels, ss, class_names, class2idx, idx2class = read_data()
num_classes = len(class_names)

best_path, oof_logits, oof_index = train_one_fold(0, labels, num_classes)
test_probs = predict_test_single_model(best_path, num_classes)
save_submission(test_probs, class_names)

exp_log = {
    'model': CFG.model_name,
    'img_size': CFG.img_size,
    'epochs': CFG.epochs,
    'folds_trained': [0],
    'tta_hflip': CFG.tta_hflip,
    'mixup': {'mixup_alpha': CFG.mixup_alpha, 'cutmix_alpha': CFG.cutmix_alpha, 'prob': CFG.mixup_prob},
}
with open(os.path.join(CFG.out_dir, 'exp_log.json'), 'w') as f:
    json.dump(exp_log, f, indent=2)
print('Sanity fold complete. Ready to run full CV in next cell.')


===== Fold 0 =====




Epoch 1/3 - train 3.4707 - val 4.7265 - time 135.2s - mixup_p 0.30


Epoch 2/3 - train 1.7037 - val 3.8991 - time 135.4s - mixup_p 0.10


Epoch 3/3 - train 0.7016 - val 2.8320 - time 135.2s - mixup_p 0.00


Saved submission.csv with shape (1023, 121)
Sanity fold complete. Ready to run full CV in next cell.


In [10]:
# 5-Fold Training + Ensembling Submission (updated recipe)
from sklearn.metrics import log_loss
import numpy as np, json, os, torch

# Set full-run epochs per expert recipe
CFG.epochs = 30

# Ensure fresh data and correct functions from cell 1 are used
labels, ss, class_names, class2idx, idx2class = read_data()
num_classes = len(class_names)

all_oof_logits = np.zeros((len(labels), num_classes), dtype=np.float32)
all_val_indices = []
ckpts = []

for fold in range(CFG.n_folds):
    best_path, oof_logits, val_idx = train_one_fold(fold, labels, num_classes)
    all_oof_logits[val_idx] = oof_logits
    all_val_indices.extend(val_idx.tolist())
    ckpts.append(best_path)

# OOF logloss
oof_probs = torch.softmax(torch.tensor(all_oof_logits), dim=1).numpy()
oof_loss = log_loss(labels.loc[all_val_indices, 'target'].values, oof_probs[all_val_indices])
print('OOF logloss:', oof_loss)

# Test-time ensemble (average probs over folds)
test_probs_sum = None
for p in ckpts:
    probs = predict_test_single_model(p, num_classes)
    if test_probs_sum is None:
        test_probs_sum = probs
    else:
        test_probs_sum += probs
test_probs_avg = test_probs_sum / len(ckpts)
save_submission(test_probs_avg, class_names)

# Save metrics/log
metrics = {
    'oof_logloss': float(oof_loss),
    'folds': CFG.n_folds,
    'epochs': CFG.epochs,
    'model': CFG.model_name,
    'img_size': CFG.img_size,
    'mixup': {'alpha': CFG.mixup_alpha, 'prob': CFG.mixup_prob},
    'weight_decay': CFG.weight_decay
}
with open(os.path.join(CFG.out_dir, 'metrics.json'), 'w') as f:
    json.dump(metrics, f, indent=2)
print('5-fold training complete. Submission saved.')


===== Fold 0 =====


Epoch 1/30 - train 3.4651 - val 4.7522 - time 135.8s - mixup_p 0.30


Epoch 2/30 - train 1.5517 - val 4.1412 - time 135.3s - mixup_p 0.28




Epoch 3/30 - train 1.3573 - val 3.2413 - time 135.5s - mixup_p 0.26


Epoch 4/30 - train 1.3011 - val 2.2952 - time 135.0s - mixup_p 0.24


Epoch 5/30 - train 1.1946 - val 1.5658 - time 135.8s - mixup_p 0.22


Epoch 6/30 - train 1.0963 - val 1.0945 - time 135.8s - mixup_p 0.20


Epoch 7/30 - train 1.0408 - val 0.8195 - time 136.2s - mixup_p 0.18


Epoch 8/30 - train 0.9937 - val 0.6620 - time 135.5s - mixup_p 0.16


Epoch 9/30 - train 1.0021 - val 0.5714 - time 135.2s - mixup_p 0.14


Epoch 10/30 - train 0.9916 - val 0.5234 - time 135.4s - mixup_p 0.12


Epoch 11/30 - train 0.9778 - val 0.4975 - time 135.9s - mixup_p 0.10


Epoch 12/30 - train 0.9799 - val 0.4884 - time 135.7s - mixup_p 0.08


Epoch 13/30 - train 0.9744 - val 0.4883 - time 136.1s - mixup_p 0.06


Epoch 14/30 - train 0.9779 - val 0.4960 - time 135.6s - mixup_p 0.04


Epoch 15/30 - train 0.9481 - val 0.5058 - time 136.3s - mixup_p 0.02


Epoch 16/30 - train 0.4462 - val 0.5017 - time 135.4s - mixup_p 0.00


Epoch 17/30 - train 0.4435 - val 0.5061 - time 136.3s - mixup_p 0.00


Epoch 18/30 - train 0.4443 - val 0.5155 - time 135.2s - mixup_p 0.00


Epoch 19/30 - train 0.4435 - val 0.5262 - time 134.9s - mixup_p 0.00
Early stopping triggered



===== Fold 1 =====


Epoch 1/30 - train 3.3460 - val 4.7194 - time 135.4s - mixup_p 0.30


Epoch 2/30 - train 1.5600 - val 4.1189 - time 135.5s - mixup_p 0.28




Epoch 3/30 - train 1.3502 - val 3.2270 - time 135.9s - mixup_p 0.26


Epoch 4/30 - train 1.3047 - val 2.2879 - time 135.8s - mixup_p 0.24


Epoch 5/30 - train 1.1751 - val 1.5592 - time 134.9s - mixup_p 0.22


Epoch 6/30 - train 1.1182 - val 1.1021 - time 135.7s - mixup_p 0.20


Epoch 7/30 - train 1.0862 - val 0.8378 - time 136.3s - mixup_p 0.18


Epoch 8/30 - train 1.0301 - val 0.6946 - time 136.4s - mixup_p 0.16


Epoch 9/30 - train 1.0077 - val 0.6162 - time 135.7s - mixup_p 0.14


Epoch 10/30 - train 0.9660 - val 0.5777 - time 135.6s - mixup_p 0.12


Epoch 11/30 - train 0.9705 - val 0.5627 - time 136.2s - mixup_p 0.10


Epoch 12/30 - train 0.9875 - val 0.5590 - time 135.3s - mixup_p 0.08


Epoch 13/30 - train 0.9892 - val 0.5654 - time 135.3s - mixup_p 0.06


Epoch 14/30 - train 0.9565 - val 0.5750 - time 135.5s - mixup_p 0.04


Epoch 15/30 - train 0.9755 - val 0.5875 - time 135.8s - mixup_p 0.02


Epoch 16/30 - train 0.4442 - val 0.5899 - time 136.1s - mixup_p 0.00


Epoch 17/30 - train 0.4396 - val 0.5965 - time 135.7s - mixup_p 0.00


Epoch 18/30 - train 0.4457 - val 0.6084 - time 136.2s - mixup_p 0.00
Early stopping triggered



===== Fold 2 =====


Epoch 1/30 - train 3.3875 - val 4.7171 - time 135.5s - mixup_p 0.30


Epoch 2/30 - train 1.5122 - val 4.1185 - time 134.9s - mixup_p 0.28




Epoch 3/30 - train 1.3649 - val 3.2430 - time 135.6s - mixup_p 0.26


Epoch 4/30 - train 1.2991 - val 2.3142 - time 135.7s - mixup_p 0.24


Epoch 5/30 - train 1.1799 - val 1.5884 - time 135.3s - mixup_p 0.22


Epoch 6/30 - train 1.1107 - val 1.1288 - time 135.5s - mixup_p 0.20


Epoch 7/30 - train 1.0600 - val 0.8669 - time 135.8s - mixup_p 0.18


Epoch 8/30 - train 1.0048 - val 0.7204 - time 136.0s - mixup_p 0.16


Epoch 9/30 - train 0.9927 - val 0.6395 - time 135.4s - mixup_p 0.14


Epoch 10/30 - train 0.9811 - val 0.5906 - time 135.1s - mixup_p 0.12


Epoch 11/30 - train 0.9779 - val 0.5678 - time 135.5s - mixup_p 0.10


Epoch 12/30 - train 0.9827 - val 0.5582 - time 135.5s - mixup_p 0.08


Epoch 13/30 - train 0.9793 - val 0.5617 - time 136.2s - mixup_p 0.06


Epoch 14/30 - train 0.9415 - val 0.5695 - time 135.1s - mixup_p 0.04


Epoch 15/30 - train 0.9766 - val 0.5791 - time 135.6s - mixup_p 0.02


Epoch 16/30 - train 0.4435 - val 0.5796 - time 135.9s - mixup_p 0.00


Epoch 17/30 - train 0.4443 - val 0.5879 - time 134.8s - mixup_p 0.00


Epoch 18/30 - train 0.4421 - val 0.5966 - time 135.1s - mixup_p 0.00
Early stopping triggered



===== Fold 3 =====


Epoch 1/30 - train 3.3530 - val 4.6588 - time 135.5s - mixup_p 0.30


Epoch 2/30 - train 1.5114 - val 4.0433 - time 135.4s - mixup_p 0.28




Epoch 3/30 - train 1.3247 - val 3.1624 - time 135.5s - mixup_p 0.26


Epoch 4/30 - train 1.2659 - val 2.2496 - time 135.6s - mixup_p 0.24


Epoch 5/30 - train 1.1844 - val 1.5613 - time 135.4s - mixup_p 0.22


Epoch 6/30 - train 1.1230 - val 1.1349 - time 135.2s - mixup_p 0.20


Epoch 7/30 - train 1.0547 - val 0.8870 - time 136.3s - mixup_p 0.18


Epoch 8/30 - train 1.0157 - val 0.7458 - time 135.5s - mixup_p 0.16


Epoch 9/30 - train 0.9723 - val 0.6675 - time 135.3s - mixup_p 0.14


Epoch 10/30 - train 0.9732 - val 0.6296 - time 135.3s - mixup_p 0.12


Epoch 11/30 - train 0.9814 - val 0.6166 - time 135.9s - mixup_p 0.10


Epoch 12/30 - train 0.9722 - val 0.6154 - time 135.5s - mixup_p 0.08


Epoch 13/30 - train 0.9701 - val 0.6189 - time 136.1s - mixup_p 0.06


Epoch 14/30 - train 0.9755 - val 0.6298 - time 136.7s - mixup_p 0.04


Epoch 15/30 - train 0.9708 - val 0.6407 - time 135.9s - mixup_p 0.02


Epoch 16/30 - train 0.4454 - val 0.6438 - time 135.4s - mixup_p 0.00


Epoch 17/30 - train 0.4396 - val 0.6538 - time 135.5s - mixup_p 0.00


Epoch 18/30 - train 0.4404 - val 0.6657 - time 135.8s - mixup_p 0.00
Early stopping triggered



===== Fold 4 =====


Epoch 1/30 - train 3.3765 - val 4.7112 - time 135.3s - mixup_p 0.30


Epoch 2/30 - train 1.5081 - val 4.1131 - time 135.4s - mixup_p 0.28




Epoch 3/30 - train 1.3240 - val 3.2337 - time 135.0s - mixup_p 0.26


Epoch 4/30 - train 1.2874 - val 2.3085 - time 135.4s - mixup_p 0.24


Epoch 5/30 - train 1.1555 - val 1.5929 - time 135.5s - mixup_p 0.22


Epoch 6/30 - train 1.1034 - val 1.1419 - time 135.7s - mixup_p 0.20


Epoch 7/30 - train 1.0957 - val 0.8742 - time 135.7s - mixup_p 0.18


Epoch 8/30 - train 1.0369 - val 0.7225 - time 135.4s - mixup_p 0.16


Epoch 9/30 - train 0.9811 - val 0.6386 - time 135.2s - mixup_p 0.14


Epoch 10/30 - train 0.9817 - val 0.5940 - time 135.5s - mixup_p 0.12


Epoch 11/30 - train 1.0115 - val 0.5757 - time 135.2s - mixup_p 0.10


Epoch 12/30 - train 0.9999 - val 0.5714 - time 135.4s - mixup_p 0.08


Epoch 13/30 - train 0.9733 - val 0.5721 - time 135.8s - mixup_p 0.06


Epoch 14/30 - train 0.9838 - val 0.5797 - time 135.4s - mixup_p 0.04


Epoch 15/30 - train 0.9347 - val 0.5895 - time 135.4s - mixup_p 0.02


Epoch 16/30 - train 0.4395 - val 0.5912 - time 135.0s - mixup_p 0.00


Epoch 17/30 - train 0.4420 - val 0.6019 - time 134.8s - mixup_p 0.00


Epoch 18/30 - train 0.4407 - val 0.6141 - time 134.5s - mixup_p 0.00
Early stopping triggered


OOF logloss: 0.5584536029048021


Saved submission.csv with shape (1023, 121)
5-fold training complete. Submission saved.


In [29]:
# 5-Fold Training with tf_efficientnet_b4_ns @512 (alternative strong baseline)
from sklearn.metrics import log_loss
import numpy as np, json, os, torch

# Reconfigure CFG for EfficientNet-B4 strong baseline (fixed hyperparams per action plan)
CFG.model_name = 'tf_efficientnet_b4_ns'
CFG.img_size = 512
CFG.train_bs = 8
CFG.valid_bs = 16
CFG.epochs = 35
CFG.lr = 1e-4
CFG.weight_decay = 1e-4
CFG.mixup_alpha = 0.2
CFG.cutmix_alpha = 0.0
CFG.mixup_prob = 0.3
CFG.mixup_switch_prob = 0.0
CFG.ema_decay = 0.9998
# Per strategy A: reduce early stopping patience to avoid overfitting
CFG.early_stop_patience = 3

# Update transforms to new resolution (add border_mode to avoid black borders)
train_tfms = A.Compose([
    A.RandomResizedCrop(size=(CFG.img_size, CFG.img_size), scale=(0.85, 1.0), ratio=(0.75, 1.333), p=1.0),
    A.HorizontalFlip(p=0.5),
    A.Affine(scale=(0.95, 1.05), translate_percent=(0.0, 0.02), rotate=(-10, 10), shear=(-2, 2), fit_output=False, border_mode=cv2.BORDER_REFLECT_101, p=0.5),
    A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.5),
    A.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.05, p=0.5),
    A.Normalize(mean=(0.485,0.456,0.406), std=(0.229,0.224,0.225)),
    ToTensorV2(),
])
valid_tfms = A.Compose([
    A.Resize(height=CFG.img_size, width=CFG.img_size),
    A.Normalize(mean=(0.485,0.456,0.406), std=(0.229,0.224,0.225)),
    ToTensorV2(),
])

# Fresh data
labels, ss, class_names, class2idx, idx2class = read_data()
num_classes = len(class_names)

all_oof_logits = np.zeros((len(labels), num_classes), dtype=np.float32)
all_val_indices = []
ckpts = []

for fold in range(CFG.n_folds):
    best_path, oof_logits, val_idx = train_one_fold(fold, labels, num_classes)
    all_oof_logits[val_idx] = oof_logits
    all_val_indices.extend(val_idx.tolist())
    ckpts.append(best_path)

# OOF logloss
oof_probs = torch.softmax(torch.tensor(all_oof_logits), dim=1).numpy()
oof_loss = log_loss(labels.loc[all_val_indices, 'target'].values, oof_probs[all_val_indices])
print('OOF logloss (EffNet-B4 512):', oof_loss)

# Ensemble test predictions
test_probs_sum = None
for p in ckpts:
    probs = predict_test_single_model(p, num_classes)
    test_probs_sum = probs if test_probs_sum is None else (test_probs_sum + probs)
test_probs_avg = test_probs_sum / len(ckpts)
save_submission(test_probs_avg, class_names)

metrics = {
    'oof_logloss': float(oof_loss),
    'folds': CFG.n_folds,
    'epochs': CFG.epochs,
    'model': CFG.model_name,
    'img_size': CFG.img_size,
    'mixup': {'alpha': CFG.mixup_alpha, 'prob': CFG.mixup_prob},
    'weight_decay': CFG.weight_decay,
    'early_stop_patience': CFG.early_stop_patience
}
with open(os.path.join(CFG.out_dir, 'metrics.json'), 'w') as f:
    json.dump(metrics, f, indent=2)
print('5-fold EfficientNet-B4 run complete. Submission saved.')


===== Fold 0 =====


  model = create_fn(


Epoch 1/35 - train 4.5266 - val 4.8111 - time 137.0s - mixup_p 0.30


Epoch 2/35 - train 2.5996 - val 4.5192 - time 136.6s - mixup_p 0.28




Epoch 3/35 - train 1.7003 - val 3.9949 - time 137.1s - mixup_p 0.27


Epoch 4/35 - train 1.4892 - val 3.3604 - time 139.3s - mixup_p 0.25


Epoch 5/35 - train 1.3685 - val 2.6937 - time 137.6s - mixup_p 0.23


Epoch 6/35 - train 1.2731 - val 2.0587 - time 137.6s - mixup_p 0.21


Epoch 7/35 - train 1.2448 - val 1.5281 - time 137.7s - mixup_p 0.20


Epoch 8/35 - train 1.1635 - val 1.1313 - time 138.4s - mixup_p 0.18


Epoch 9/35 - train 1.1404 - val 0.8606 - time 139.0s - mixup_p 0.16


Epoch 10/35 - train 1.1307 - val 0.6925 - time 136.0s - mixup_p 0.15


Epoch 11/35 - train 1.0912 - val 0.5899 - time 137.7s - mixup_p 0.13


Epoch 12/35 - train 1.0806 - val 0.5323 - time 137.5s - mixup_p 0.11


Epoch 13/35 - train 1.0772 - val 0.4983 - time 141.2s - mixup_p 0.09


Epoch 14/35 - train 1.0504 - val 0.4819 - time 139.7s - mixup_p 0.08


Epoch 15/35 - train 1.0282 - val 0.4773 - time 139.5s - mixup_p 0.06


Epoch 16/35 - train 1.0328 - val 0.4809 - time 138.6s - mixup_p 0.04


Epoch 17/35 - train 1.0521 - val 0.4879 - time 138.5s - mixup_p 0.03


Epoch 18/35 - train 1.0120 - val 0.4978 - time 138.9s - mixup_p 0.01
Early stopping triggered



===== Fold 1 =====


  model = create_fn(


Epoch 1/35 - train 4.4862 - val 4.7684 - time 137.2s - mixup_p 0.30


Epoch 2/35 - train 2.5350 - val 4.4629 - time 138.3s - mixup_p 0.28




Epoch 3/35 - train 1.6514 - val 3.9290 - time 138.9s - mixup_p 0.27


Epoch 4/35 - train 1.4598 - val 3.2928 - time 137.8s - mixup_p 0.25


Epoch 5/35 - train 1.3812 - val 2.6233 - time 139.8s - mixup_p 0.23


Epoch 6/35 - train 1.2890 - val 2.0059 - time 141.1s - mixup_p 0.21


Epoch 7/35 - train 1.2158 - val 1.4946 - time 138.4s - mixup_p 0.20


Epoch 8/35 - train 1.1584 - val 1.1075 - time 137.8s - mixup_p 0.18


Epoch 9/35 - train 1.1411 - val 0.8508 - time 139.9s - mixup_p 0.16


Epoch 10/35 - train 1.0954 - val 0.6934 - time 139.0s - mixup_p 0.15


Epoch 11/35 - train 1.0873 - val 0.6047 - time 136.8s - mixup_p 0.13


Epoch 12/35 - train 1.0806 - val 0.5577 - time 138.1s - mixup_p 0.11


Epoch 13/35 - train 1.0907 - val 0.5345 - time 138.4s - mixup_p 0.09


Epoch 14/35 - train 1.0691 - val 0.5232 - time 138.8s - mixup_p 0.08


Epoch 15/35 - train 1.0415 - val 0.5236 - time 139.4s - mixup_p 0.06


Epoch 16/35 - train 1.0610 - val 0.5315 - time 142.2s - mixup_p 0.04


Epoch 17/35 - train 1.0268 - val 0.5391 - time 143.5s - mixup_p 0.03
Early stopping triggered



===== Fold 2 =====


  model = create_fn(


Epoch 1/35 - train 4.5121 - val 4.7809 - time 139.6s - mixup_p 0.30


Epoch 2/35 - train 2.6038 - val 4.4883 - time 139.9s - mixup_p 0.28




Epoch 3/35 - train 1.7103 - val 3.9646 - time 139.4s - mixup_p 0.27


Epoch 4/35 - train 1.5400 - val 3.3294 - time 141.5s - mixup_p 0.25


Epoch 5/35 - train 1.3626 - val 2.6650 - time 143.0s - mixup_p 0.23


Epoch 6/35 - train 1.2481 - val 2.0436 - time 139.7s - mixup_p 0.21


Epoch 7/35 - train 1.1837 - val 1.5348 - time 143.7s - mixup_p 0.20


Epoch 8/35 - train 1.1321 - val 1.1519 - time 141.4s - mixup_p 0.18


Epoch 9/35 - train 1.1656 - val 0.8897 - time 142.2s - mixup_p 0.16


Epoch 10/35 - train 1.1253 - val 0.7246 - time 141.2s - mixup_p 0.15


Epoch 11/35 - train 1.1465 - val 0.6306 - time 140.6s - mixup_p 0.13


Epoch 12/35 - train 1.1224 - val 0.5767 - time 140.7s - mixup_p 0.11


Epoch 13/35 - train 1.0559 - val 0.5480 - time 140.6s - mixup_p 0.09


Epoch 14/35 - train 1.0734 - val 0.5401 - time 141.0s - mixup_p 0.08


Epoch 15/35 - train 1.0638 - val 0.5408 - time 139.5s - mixup_p 0.06


Epoch 16/35 - train 1.0508 - val 0.5457 - time 141.9s - mixup_p 0.04


Epoch 17/35 - train 1.0185 - val 0.5555 - time 141.9s - mixup_p 0.03
Early stopping triggered



===== Fold 3 =====


  model = create_fn(


Epoch 1/35 - train 4.5048 - val 4.7946 - time 142.0s - mixup_p 0.30


Epoch 2/35 - train 2.5736 - val 4.4963 - time 142.1s - mixup_p 0.28




Epoch 3/35 - train 1.6541 - val 3.9674 - time 140.3s - mixup_p 0.27


Epoch 4/35 - train 1.4610 - val 3.3373 - time 141.7s - mixup_p 0.25


Epoch 5/35 - train 1.3735 - val 2.6852 - time 139.9s - mixup_p 0.23


Epoch 6/35 - train 1.2770 - val 2.0762 - time 144.0s - mixup_p 0.21


Epoch 7/35 - train 1.2078 - val 1.5851 - time 140.8s - mixup_p 0.20


Epoch 8/35 - train 1.1376 - val 1.2155 - time 141.8s - mixup_p 0.18


Epoch 9/35 - train 1.1418 - val 0.9556 - time 140.4s - mixup_p 0.16


Epoch 10/35 - train 1.1163 - val 0.7844 - time 143.2s - mixup_p 0.15


Epoch 11/35 - train 1.0841 - val 0.6804 - time 142.6s - mixup_p 0.13


Epoch 12/35 - train 1.0314 - val 0.6228 - time 142.7s - mixup_p 0.11


Epoch 13/35 - train 1.0411 - val 0.5934 - time 140.0s - mixup_p 0.09


Epoch 14/35 - train 1.0297 - val 0.5803 - time 140.7s - mixup_p 0.08


Epoch 15/35 - train 1.0459 - val 0.5793 - time 139.3s - mixup_p 0.06


Epoch 16/35 - train 1.0354 - val 0.5845 - time 141.4s - mixup_p 0.04


Epoch 17/35 - train 1.0694 - val 0.5933 - time 140.9s - mixup_p 0.03


Epoch 18/35 - train 1.0343 - val 0.6009 - time 143.0s - mixup_p 0.01
Early stopping triggered



===== Fold 4 =====


  model = create_fn(


Epoch 1/35 - train 4.4672 - val 4.7599 - time 140.7s - mixup_p 0.30


Epoch 2/35 - train 2.5122 - val 4.4578 - time 142.3s - mixup_p 0.28




Epoch 3/35 - train 1.6578 - val 3.9375 - time 142.1s - mixup_p 0.27


Epoch 4/35 - train 1.4761 - val 3.3194 - time 141.4s - mixup_p 0.25


Epoch 5/35 - train 1.3312 - val 2.6777 - time 142.5s - mixup_p 0.23


Epoch 6/35 - train 1.2337 - val 2.0812 - time 140.9s - mixup_p 0.21


Epoch 7/35 - train 1.1938 - val 1.5871 - time 143.5s - mixup_p 0.20


Epoch 8/35 - train 1.1364 - val 1.2084 - time 141.4s - mixup_p 0.18


Epoch 9/35 - train 1.1374 - val 0.9421 - time 143.3s - mixup_p 0.16


Epoch 10/35 - train 1.0867 - val 0.7720 - time 141.9s - mixup_p 0.15


Epoch 11/35 - train 1.0973 - val 0.6692 - time 143.0s - mixup_p 0.13


Epoch 12/35 - train 1.0557 - val 0.6085 - time 143.1s - mixup_p 0.11


Epoch 13/35 - train 1.0471 - val 0.5800 - time 145.3s - mixup_p 0.09


Epoch 14/35 - train 1.0423 - val 0.5708 - time 142.9s - mixup_p 0.08


Epoch 15/35 - train 1.0259 - val 0.5741 - time 143.4s - mixup_p 0.06


Epoch 16/35 - train 1.0196 - val 0.5814 - time 141.0s - mixup_p 0.04


Epoch 17/35 - train 1.0320 - val 0.5961 - time 141.8s - mixup_p 0.03
Early stopping triggered


OOF logloss (EffNet-B4 512): 0.5381454578467637


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


Saved submission.csv with shape (1023, 121)
5-fold EfficientNet-B4 run complete. Submission saved.


In [13]:
# Temperature scaling using OOF logits, then re-infer test with scaled logits
import torch, numpy as np, pandas as pd, os, gc
from sklearn.metrics import log_loss

assert 'all_oof_logits' in globals() and 'all_val_indices' in globals(), 'Run 5-fold cell to populate OOF logits first.'

# Fit temperature T to minimize OOF logloss
y_true = labels.loc[all_val_indices, 'target'].values
oof_logits_tensor = torch.tensor(all_oof_logits[all_val_indices], dtype=torch.float32, device=CFG.device)
T = torch.tensor([1.0], dtype=torch.float32, device=CFG.device, requires_grad=True)
optimizer = torch.optim.LBFGS([T], lr=0.1, max_iter=100, line_search_fn='strong_wolfe')

def _nll():
    optimizer.zero_grad()
    scaled = oof_logits_tensor / torch.clamp(T, min=1e-3)
    log_probs = torch.log_softmax(scaled, dim=1)
    nll = -log_probs[torch.arange(len(y_true), device=CFG.device), torch.tensor(y_true, device=CFG.device)].mean()
    nll.backward()
    return nll

optimizer.step(_nll)
T_opt = float(T.detach().clamp(min=1e-3).cpu().item())
oof_probs_scaled = torch.softmax(oof_logits_tensor / T_opt, dim=1).cpu().numpy()
oof_loss_scaled = log_loss(y_true, oof_probs_scaled)
print(f'Fitted temperature: {T_opt:.4f} | OOF logloss (scaled): {oof_loss_scaled:.6f}')

# Re-run test inference to collect logits per fold and apply temperature scaling
ss = pd.read_csv('sample_submission.csv')
test_df = ss[['id']].copy()
test_df['filepath'] = test_df['id'].apply(lambda x: f'test/{x}.jpg')
test_ds = DogDataset(test_df[['id','filepath']], transforms=valid_tfms)
test_loader = DataLoader(test_ds, batch_size=CFG.valid_bs, shuffle=False, num_workers=CFG.num_workers, pin_memory=True)

def predict_test_logits(ckpt_path, num_classes):
    model = build_model(num_classes).to(CFG.device)
    ckpt = torch.load(ckpt_path, map_location=CFG.device)
    model.load_state_dict(ckpt['model'], strict=False)
    model.eval()
    all_logits = []
    with torch.no_grad(), torch.amp.autocast('cuda', enabled=(CFG.device=='cuda')):
        for imgs, _ids in test_loader:
            imgs = imgs.to(CFG.device, non_blocking=True)
            imgs = imgs.to(memory_format=torch.channels_last)
            logits = model(imgs)
            if CFG.tta and CFG.tta_hflip:
                imgs_flipped = torch.flip(imgs, dims=[3])
                logits_f = model(imgs_flipped)
                logits = (logits + logits_f) / 2.0
            all_logits.append(logits.detach().cpu().float())
    logits_cat = torch.cat(all_logits, dim=0)
    del model
    torch.cuda.empty_cache(); gc.collect()
    return logits_cat

num_classes = len(class_names)
test_logits_sum = None
for p in ckpts:
    logits = predict_test_logits(p, num_classes)
    test_logits_sum = logits if test_logits_sum is None else (test_logits_sum + logits)
test_logits_avg = test_logits_sum / len(ckpts)
test_probs_scaled = torch.softmax(test_logits_avg / T_opt, dim=1).numpy()

# Save scaled submission
probs = np.clip(test_probs_scaled, 1e-9, 1-1e-9)
probs = probs / probs.sum(axis=1, keepdims=True)
sub = pd.DataFrame(probs, columns=class_names)
sub.insert(0, 'id', ss['id'])
sub.to_csv('submission.csv', index=False)
print('Saved temperature-scaled submission.csv with shape', sub.shape)

Fitted temperature: 0.7941 | OOF logloss (scaled): 0.472534


  model = create_fn(


Saved temperature-scaled submission.csv with shape (1023, 121)


In [14]:
# Multi-scale TTA with temperature scaling (no retraining)
import torch, numpy as np, pandas as pd, gc
from torch.utils.data import DataLoader
import albumentations as A
from albumentations.pytorch import ToTensorV2

assert 'ckpts' in globals() and len(ckpts) == CFG.n_folds, 'Run 5-fold training to populate ckpts'
assert 'T_opt' in globals(), 'Run temperature scaling cell to compute T_opt'

ss = pd.read_csv('sample_submission.csv')
test_df = ss[['id']].copy()
test_df['filepath'] = test_df['id'].apply(lambda x: f'test/{x}.jpg')

def make_tta_tfms(size):
    return A.Compose([
        A.Resize(height=size, width=size),
        A.CenterCrop(height=CFG.img_size, width=CFG.img_size, p=1.0) if size > CFG.img_size else A.NoOp(),
        A.Normalize(mean=(0.485,0.456,0.406), std=(0.229,0.224,0.225)),
        ToTensorV2(),
    ])

def predict_test_logits_with_tfms(ckpt_path, tfms):
    ds = DogDataset(test_df[['id','filepath']], transforms=tfms)
    dl = DataLoader(ds, batch_size=CFG.valid_bs, shuffle=False, num_workers=CFG.num_workers, pin_memory=True)
    model = build_model(len(class_names)).to(CFG.device)
    ckpt = torch.load(ckpt_path, map_location=CFG.device)
    model.load_state_dict(ckpt['model'], strict=False)
    model.eval()
    outs = []
    with torch.no_grad(), torch.amp.autocast('cuda', enabled=(CFG.device=='cuda')):
        for imgs, _ids in dl:
            imgs = imgs.to(CFG.device, non_blocking=True).to(memory_format=torch.channels_last)
            logits = model(imgs)
            if CFG.tta and CFG.tta_hflip:
                logits = (logits + model(torch.flip(imgs, dims=[3]))) / 2.0
            outs.append(logits.detach().cpu().float())
    logits_cat = torch.cat(outs, dim=0)
    del model; torch.cuda.empty_cache(); gc.collect()
    return logits_cat

# Define multi-scale sizes around training size
scales = [int(CFG.img_size*0.9), CFG.img_size, int(CFG.img_size*1.15)]
scales = sorted(list(set([max(224, s) for s in scales])))
print('TTA scales:', scales, '| base size:', CFG.img_size)

fold_logits_sum = None
for p in ckpts:
    scale_logits_sum = None
    for s in scales:
        tfms = make_tta_tfms(s)
        logits = predict_test_logits_with_tfms(p, tfms)
        scale_logits_sum = logits if scale_logits_sum is None else (scale_logits_sum + logits)
    logits_avg_scales = scale_logits_sum / len(scales)
    fold_logits_sum = logits_avg_scales if fold_logits_sum is None else (fold_logits_sum + logits_avg_scales)

test_logits_ens = fold_logits_sum / len(ckpts)
test_probs_scaled = torch.softmax(test_logits_ens / T_opt, dim=1).numpy()

# Save submission
probs = np.clip(test_probs_scaled, 1e-9, 1-1e-9)
probs = probs / probs.sum(axis=1, keepdims=True)
sub = pd.DataFrame(probs, columns=class_names)
sub.insert(0, 'id', ss['id'])
sub.to_csv('submission.csv', index=False)
print('Saved multi-scale TTA + temp-scaled submission.csv with shape', sub.shape)

TTA scales: [460, 512, 588] | base size: 512


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


Saved multi-scale TTA + temp-scaled submission.csv with shape (1023, 121)


In [15]:
# Per-fold temperature scaling and ensemble (no retraining)
import torch, numpy as np, pandas as pd, gc
from sklearn.metrics import log_loss

assert 'labels' in globals() and 'ckpts' in globals() and 'all_oof_logits' in globals(), 'Run 5-fold training cell first.'

per_fold_T = []
for f in range(CFG.n_folds):
    val_idx_f = labels.index[labels.fold == f].values
    y_true_f = labels.loc[val_idx_f, 'target'].values
    oof_logits_f = torch.tensor(all_oof_logits[val_idx_f], dtype=torch.float32, device=CFG.device)
    T = torch.tensor([1.0], dtype=torch.float32, device=CFG.device, requires_grad=True)
    opt = torch.optim.LBFGS([T], lr=0.1, max_iter=100, line_search_fn='strong_wolfe')
    def _closure():
        opt.zero_grad()
        scaled = oof_logits_f / torch.clamp(T, min=1e-3)
        log_probs = torch.log_softmax(scaled, dim=1)
        idx = torch.arange(len(y_true_f), device=CFG.device)
        y_t = torch.tensor(y_true_f, device=CFG.device)
        nll = -log_probs[idx, y_t].mean()
        nll.backward()
        return nll
    opt.step(_closure)
    T_val = float(T.detach().clamp(min=1e-3).cpu().item())
    per_fold_T.append(T_val)
    with torch.no_grad():
        probs_f = torch.softmax(oof_logits_f / T_val, dim=1).cpu().numpy()
    print(f'Fold {f}: T={T_val:.4f}, OOF logloss (scaled)={log_loss(y_true_f, probs_f):.6f}')

# Build test logits per fold, apply per-fold temperature, then average probabilities
def predict_test_logits_once(ckpt_path, num_classes):
    ss = pd.read_csv('sample_submission.csv')
    test_df = ss[['id']].copy()
    test_df['filepath'] = test_df['id'].apply(lambda x: f'test/{x}.jpg')
    test_ds = DogDataset(test_df[['id','filepath']], transforms=valid_tfms)
    test_loader = DataLoader(test_ds, batch_size=CFG.valid_bs, shuffle=False, num_workers=CFG.num_workers, pin_memory=True)
    model = build_model(num_classes).to(CFG.device)
    ckpt = torch.load(ckpt_path, map_location=CFG.device)
    model.load_state_dict(ckpt['model'], strict=False)
    model.eval()
    outs = []
    with torch.no_grad(), torch.amp.autocast('cuda', enabled=(CFG.device=='cuda')):
        for imgs, _ids in test_loader:
            imgs = imgs.to(CFG.device, non_blocking=True).to(memory_format=torch.channels_last)
            logits = model(imgs)
            if CFG.tta and CFG.tta_hflip:
                logits = (logits + model(torch.flip(imgs, dims=[3]))) / 2.0
            outs.append(logits.detach().cpu().float())
    logits_cat = torch.cat(outs, dim=0)
    del model; torch.cuda.empty_cache(); gc.collect()
    return logits_cat

num_classes = len(class_names)
probs_sum = None
for f, (p, Tf) in enumerate(zip(ckpts, per_fold_T)):
    logits = predict_test_logits_once(p, num_classes)
    probs = torch.softmax(logits / Tf, dim=1).numpy()
    probs_sum = probs if probs_sum is None else (probs_sum + probs)
probs_avg = probs_sum / len(ckpts)

# Save submission
ss = pd.read_csv('sample_submission.csv')
probs = np.clip(probs_avg, 1e-9, 1-1e-9)
probs = probs / probs.sum(axis=1, keepdims=True)
sub = pd.DataFrame(probs, columns=class_names)
sub.insert(0, 'id', ss['id'])
sub.to_csv('submission.csv', index=False)
print('Saved per-fold temp-scaled submission.csv with shape', sub.shape)

Fold 0: T=0.7861, OOF logloss (scaled)=0.414481
Fold 1: T=0.8036, OOF logloss (scaled)=0.455294
Fold 2: T=0.7986, OOF logloss (scaled)=0.475274


Fold 3: T=0.7979, OOF logloss (scaled)=0.517592
Fold 4: T=0.7845, OOF logloss (scaled)=0.499581


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


Saved per-fold temp-scaled submission.csv with shape (1023, 121)


In [30]:
# Multi-scale TTA + per-fold temperature scaling (average logits per scale, then temp & softmax)
import torch, numpy as np, pandas as pd, gc
import albumentations as A
from albumentations.pytorch import ToTensorV2
from torch.utils.data import DataLoader
from sklearn.metrics import log_loss

assert 'ckpts' in globals() and len(ckpts) == CFG.n_folds, 'Run 5-fold training to populate ckpts'
assert 'labels' in globals() and 'all_oof_logits' in globals(), 'Need OOF logits and labels for per-fold T'

# Ensure per_fold_T is available; if not, fit it
if 'per_fold_T' not in globals():
    per_fold_T = []
    for f in range(CFG.n_folds):
        val_idx_f = labels.index[labels.fold == f].values
        y_true_f = labels.loc[val_idx_f, 'target'].values
        oof_logits_f = torch.tensor(all_oof_logits[val_idx_f], dtype=torch.float32, device=CFG.device)
        T = torch.tensor([1.0], dtype=torch.float32, device=CFG.device, requires_grad=True)
        opt = torch.optim.LBFGS([T], lr=0.1, max_iter=100, line_search_fn='strong_wolfe')
        def _closure():
            opt.zero_grad()
            scaled = oof_logits_f / torch.clamp(T, min=1e-3)
            log_probs = torch.log_softmax(scaled, dim=1)
            idx = torch.arange(len(y_true_f), device=CFG.device)
            y_t = torch.tensor(y_true_f, device=CFG.device)
            nll = -log_probs[idx, y_t].mean()
            nll.backward()
            return nll
        opt.step(_closure)
        T_val = float(T.detach().clamp(min=1e-3).cpu().item())
        per_fold_T.append(T_val)

ss = pd.read_csv('sample_submission.csv')
test_df = ss[['id']].copy()
test_df['filepath'] = test_df['id'].apply(lambda x: f'test/{x}.jpg')

def make_tta_tfms(size):
    return A.Compose([
        A.Resize(height=size, width=size),
        A.CenterCrop(height=CFG.img_size, width=CFG.img_size, p=1.0) if size > CFG.img_size else A.NoOp(),
        A.Normalize(mean=(0.485,0.456,0.406), std=(0.229,0.224,0.225)),
        ToTensorV2(),
    ])

def predict_test_logits_with_tfms(ckpt_path, tfms):
    ds = DogDataset(test_df[['id','filepath']], transforms=tfms)
    dl = DataLoader(ds, batch_size=CFG.valid_bs, shuffle=False, num_workers=CFG.num_workers, pin_memory=True)
    model = build_model(len(class_names)).to(CFG.device)
    ckpt = torch.load(ckpt_path, map_location=CFG.device)
    model.load_state_dict(ckpt['model'], strict=False)
    model.eval()
    outs = []
    with torch.no_grad(), torch.amp.autocast('cuda', enabled=(CFG.device=='cuda')):
        for imgs, _ids in dl:
            imgs = imgs.to(CFG.device, non_blocking=True).to(memory_format=torch.channels_last)
            logits = model(imgs)
            if CFG.tta and CFG.tta_hflip:
                logits = (logits + model(torch.flip(imgs, dims=[3]))) / 2.0
            outs.append(logits.detach().cpu().float())
    logits_cat = torch.cat(outs, dim=0)
    del model; torch.cuda.empty_cache(); gc.collect()
    return logits_cat

# Define scales and run per-fold multi-scale logits averaging, then temp scale and softmax
scales = [int(CFG.img_size*0.9), CFG.img_size, int(CFG.img_size*1.15)]
scales = sorted(list(set([max(224, s) for s in scales])))
print('Per-fold TTA scales:', scales, '| base size:', CFG.img_size)

probs_sum = None
for f, (p, Tf) in enumerate(zip(ckpts, per_fold_T)):
    scale_logits_sum = None
    for s in scales:
        tfms = make_tta_tfms(s)
        logits = predict_test_logits_with_tfms(p, tfms)
        scale_logits_sum = logits if scale_logits_sum is None else (scale_logits_sum + logits)
    logits_avg_scales = scale_logits_sum / len(scales)
    probs_f = torch.softmax(logits_avg_scales / Tf, dim=1).numpy()
    probs_sum = probs_f if probs_sum is None else (probs_sum + probs_f)

probs_avg = probs_sum / len(ckpts)
probs = np.clip(probs_avg, 1e-9, 1-1e-9)
probs = probs / probs.sum(axis=1, keepdims=True)
sub = pd.DataFrame(probs, columns=class_names)
sub.insert(0, 'id', ss['id'])
sub.to_csv('submission.csv', index=False)
print('Saved per-fold TTA+temp-scaled submission.csv with shape', sub.shape)

Per-fold TTA scales: [460, 512, 588] | base size: 512


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


Saved per-fold TTA+temp-scaled submission.csv with shape (1023, 121)


In [32]:
# Strategy 3: Feature Extraction + Fast Classifier (XGBoost GPU) using xgb.train
import os, gc, numpy as np, pandas as pd, torch, timm, cv2, sys, importlib, subprocess
from torch.utils.data import Dataset, DataLoader
import albumentations as A
from albumentations.pytorch import ToTensorV2
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss

def ensure(pkg, import_name=None):
    imp = import_name or pkg
    try:
        importlib.import_module(imp)
    except Exception:
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', pkg])
        importlib.import_module(imp)

ensure('xgboost')
import xgboost as xgb

class SimpleImageDataset(Dataset):
    def __init__(self, df, size):
        self.df = df.reset_index(drop=True)
        self.transforms = A.Compose([
            A.Resize(height=size, width=size),
            A.Normalize(mean=(0.485,0.456,0.406), std=(0.229,0.224,0.225)),
            ToTensorV2(),
        ])
    def __len__(self):
        return len(self.df)
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img = cv2.imread(row['filepath'])
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = self.transforms(image=img)['image']
        return img, row['id'] if 'id' in row else idx

def build_backbone_feat_extractor(model_name, size):
    model = timm.create_model(model_name, pretrained=True, num_classes=0, global_pool='avg')
    model.eval().to(CFG.device)
    return model

@torch.no_grad()
def extract_features(df, size, model_name, batch_size=32):
    ds = SimpleImageDataset(df[['id','filepath']], size=size)
    dl = DataLoader(ds, batch_size=batch_size, shuffle=False, num_workers=CFG.num_workers, pin_memory=True)
    model = build_backbone_feat_extractor(model_name, size)
    feats = []
    with torch.amp.autocast('cuda', enabled=(CFG.device=='cuda')):
        for imgs, _ in dl:
            imgs = imgs.to(CFG.device, non_blocking=True).to(memory_format=torch.channels_last)
            f = model(imgs)
            feats.append(f.detach().cpu().float())
    feats = torch.cat(feats, dim=0).numpy()
    del model; torch.cuda.empty_cache(); gc.collect()
    return feats

# Config for feature pipeline
FE = {
    'model_name': 'convnext_base.fb_in22k_ft_in1k',
    'img_size': 448,
    'batch_size': 32
}

# Prepare dataframes
labels_df = labels.copy() if 'labels' in globals() else pd.read_csv('labels.csv')
if 'filepath' not in labels_df.columns:
    labels_df['filepath'] = labels_df['id'].apply(lambda x: f'train/{x}.jpg')
if 'fold' not in labels_df.columns:
    skf_tmp = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    labels_df['fold'] = -1
    for f, (_, vi) in enumerate(skf_tmp.split(labels_df, labels_df['breed'])):
        labels_df.loc[vi, 'fold'] = f

ss = pd.read_csv('sample_submission.csv')
test_df = ss[['id']].copy()
test_df['filepath'] = test_df['id'].apply(lambda x: f'test/{x}.jpg')
class_names_fe = [c for c in ss.columns if c != 'id']
class2idx_fe = {c:i for i,c in enumerate(class_names_fe)}
labels_df['target'] = labels_df['breed'].map(class2idx_fe)

# Extract or load cached features
train_feats_path = os.path.join(CFG.out_dir, f'train_feats_{FE["model_name"]}_{FE["img_size"]}.npy')
test_feats_path  = os.path.join(CFG.out_dir, f'test_feats_{FE["model_name"]}_{FE["img_size"]}.npy')
os.makedirs(CFG.out_dir, exist_ok=True)

if os.path.exists(train_feats_path) and os.path.exists(test_feats_path):
    X_train = np.load(train_feats_path)
    X_test = np.load(test_feats_path)
else:
    X_train = extract_features(labels_df[['id','filepath']], FE['img_size'], FE['model_name'], batch_size=FE['batch_size'])
    X_test = extract_features(test_df[['id','filepath']], FE['img_size'], FE['model_name'], batch_size=FE['batch_size'])
    np.save(train_feats_path, X_train)
    np.save(test_feats_path, X_test)
y_train = labels_df['target'].values.astype(int)

# Ensure correct dtypes
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)

# CV with xgboost.train (GPU) on features
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof = np.zeros((len(labels_df), len(class_names_fe)), dtype=np.float32)
test_pred_sum = np.zeros((len(test_df), len(class_names_fe)), dtype=np.float32)
oof_idx_all = []

num_classes = len(class_names_fe)
xgb_params = {
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
    'objective': 'multi:softprob',
    'num_class': num_classes,
    'max_depth': 6,
    'eta': 0.03,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'lambda': 8.0,
    'alpha': 0.0,
    'min_child_weight': 1.0,
    'eval_metric': 'mlogloss',
    'verbosity': 0
}

num_boost_round = 4000
early_stopping_rounds = 200

for f, (trn_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
    X_tr, X_va = X_train[trn_idx], X_train[val_idx]
    y_tr, y_va = y_train[trn_idx], y_train[val_idx]
    dtrain = xgb.DMatrix(X_tr, label=y_tr)
    dvalid = xgb.DMatrix(X_va, label=y_va)
    dtest = xgb.DMatrix(X_test)
    bst = xgb.train(
        params=xgb_params,
        dtrain=dtrain,
        num_boost_round=num_boost_round,
        evals=[(dvalid, 'valid')],
        early_stopping_rounds=early_stopping_rounds,
        verbose_eval=False
    )
    # Predictions (use best_iteration if available)
    try:
        pred_val = bst.predict(dvalid, iteration_range=(0, bst.best_iteration + 1))
        pred_test = bst.predict(dtest, iteration_range=(0, bst.best_iteration + 1))
    except Exception:
        ntree = getattr(bst, 'best_ntree_limit', 0) or bst.num_boosted_rounds()
        pred_val = bst.predict(dvalid, ntree_limit=ntree)
        pred_test = bst.predict(dtest, ntree_limit=ntree)
    oof[val_idx] = pred_val.astype(np.float32)
    test_pred_sum += pred_test.astype(np.float32)
    oof_idx_all.extend(val_idx.tolist())
    fold_loss = log_loss(y_va, oof[val_idx])
    print(f'FE-XGB(train) Fold {f} logloss: {fold_loss:.6f}')

oof_loss = log_loss(y_train[oof_idx_all], oof[oof_idx_all])
print('FE-XGB(train) OOF logloss:', oof_loss)

# Build submission
test_probs = test_pred_sum / skf.n_splits
probs = np.clip(test_probs, 1e-9, 1-1e-9)
probs = probs / probs.sum(axis=1, keepdims=True)
sub = pd.DataFrame(probs, columns=class_names_fe)
sub.insert(0, 'id', test_df['id'])
sub.to_csv('submission.csv', index=False)
print('Saved FE-XGB(train) submission.csv with shape', sub.shape)

In [23]:
# Strategy 3b: Feature Extraction already cached -> Fast SGDClassifier (multinomial logistic) on ConvNeXt features
import os, numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import log_loss

# Paths
out_dir = 'outputs'
train_feats_path = os.path.join(out_dir, 'train_feats_convnext_base.fb_in22k_ft_in1k_448.npy')
test_feats_path = os.path.join(out_dir, 'test_feats_convnext_base.fb_in22k_ft_in1k_448.npy')

# Load cached features
X_train = np.load(train_feats_path).astype(np.float32)
X_test = np.load(test_feats_path).astype(np.float32)

# Prepare labels
labels_df = labels.copy() if 'labels' in globals() else pd.read_csv('labels.csv')
ss = pd.read_csv('sample_submission.csv')
class_names_fe = [c for c in ss.columns if c != 'id']
class2idx_fe = {c:i for i,c in enumerate(class_names_fe)}
labels_df['target'] = labels_df['breed'].map(class2idx_fe).astype(int)
y_train = labels_df['target'].values

# 5-fold CV with fast SGDClassifier (multinomial logistic regression approximation)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof = np.zeros((len(labels_df), len(class_names_fe)), dtype=np.float32)
test_pred_sum = np.zeros((len(ss), len(class_names_fe)), dtype=np.float32)
oof_idx_all = []

for f, (trn_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
    X_tr, X_va = X_train[trn_idx], X_train[val_idx]
    y_tr, y_va = y_train[trn_idx], y_train[val_idx]
    clf = Pipeline([
        ('scaler', StandardScaler(with_mean=True, with_std=True)),
        ('sgd', SGDClassifier(loss='log_loss', penalty='l2', alpha=1e-4,
                              learning_rate='optimal', max_iter=2000, tol=1e-4,
                              early_stopping=True, n_iter_no_change=5,
                              random_state=42, n_jobs=-1))
    ])
    clf.fit(X_tr, y_tr)
    oof_fold = clf.predict_proba(X_va).astype(np.float32)
    test_fold = clf.predict_proba(X_test).astype(np.float32)
    # Stabilize probabilities: replace NaNs/Infs and renormalize
    oof_fold = np.nan_to_num(oof_fold, nan=1e-9, posinf=1e9, neginf=0.0)
    oof_fold = oof_fold / np.clip(oof_fold.sum(axis=1, keepdims=True), 1e-12, None)
    test_fold = np.nan_to_num(test_fold, nan=1e-9, posinf=1e9, neginf=0.0)
    test_fold = test_fold / np.clip(test_fold.sum(axis=1, keepdims=True), 1e-12, None)
    oof[val_idx] = oof_fold
    test_pred_sum += test_fold
    oof_idx_all.extend(val_idx.tolist())
    fold_loss = log_loss(y_va, oof[val_idx])
    print(f'SGD-Logit Fold {f} logloss: {fold_loss:.6f}')

oof_loss = log_loss(y_train[oof_idx_all], oof[oof_idx_all])
print('SGD-Logit OOF logloss:', oof_loss)

# Submission
test_probs = test_pred_sum / skf.n_splits
probs = np.clip(test_probs, 1e-9, 1-1e-9)
probs = probs / probs.sum(axis=1, keepdims=True)
sub = pd.DataFrame(probs, columns=class_names_fe)
sub.insert(0, 'id', ss['id'])
sub.to_csv('submission.csv', index=False)
print('Saved SGD-Logit FE submission.csv with shape', sub.shape)

  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))
  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))


SGD-Logit Fold 0 logloss: 1.341019


  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))
  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))


SGD-Logit Fold 1 logloss: 1.471068


  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))
  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))


SGD-Logit Fold 2 logloss: 1.432442


  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))
  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))


SGD-Logit Fold 3 logloss: 1.610691


  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))
  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))


SGD-Logit Fold 4 logloss: 1.347397
SGD-Logit OOF logloss: 1.4405337187877858


Saved SGD-Logit FE submission.csv with shape (1023, 121)


In [26]:
# Strategy 1: Downsize + Two-Stage Training with aggressive ES (EffNet-B2 @384)
import os, gc, time, numpy as np, pandas as pd, torch, torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.metrics import log_loss

# Reconfigure CFG for smaller model to reduce overfitting and train faster
CFG.model_name = 'tf_efficientnet_b2_ns'
CFG.img_size = 384
CFG.train_bs = 16
CFG.valid_bs = 32
CFG.lr = 1e-4
CFG.weight_decay = 1e-4
CFG.mixup_alpha = 0.2
CFG.cutmix_alpha = 0.0
CFG.mixup_prob = 0.3
CFG.mixup_switch_prob = 0.0
CFG.ema_decay = 0.9998
CFG.early_stop_patience = 3  # aggressive ES as advised

# Update transforms for new resolution
train_tfms = A.Compose([
    A.RandomResizedCrop(size=(CFG.img_size, CFG.img_size), scale=(0.85, 1.0), ratio=(0.75, 1.333), p=1.0),
    A.HorizontalFlip(p=0.5),
    A.Affine(scale=(0.95, 1.05), translate_percent=(0.0, 0.02), rotate=(-10, 10), shear=(-2, 2), fit_output=False, border_mode=cv2.BORDER_REFLECT_101, p=0.5),
    A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.5),
    A.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.05, p=0.5),
    A.Normalize(mean=(0.485,0.456,0.406), std=(0.229,0.224,0.225)),
    ToTensorV2(),
])
valid_tfms = A.Compose([
    A.Resize(height=CFG.img_size, width=CFG.img_size),
    A.Normalize(mean=(0.485,0.456,0.406), std=(0.229,0.224,0.225)),
    ToTensorV2(),
])

def train_two_stage_fold(fold, labels, num_classes, stage1_epochs=6, stage2_epochs=6):
    # Data
    trn_df = labels[labels.fold != fold][['filepath','target']].copy()
    val_df = labels[labels.fold == fold][['filepath','target']].copy()
    train_ds = DogDataset(trn_df, transforms=train_tfms)
    valid_ds = DogDataset(val_df, transforms=valid_tfms)
    train_loader = DataLoader(train_ds, batch_size=CFG.train_bs, shuffle=True, num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_ds, batch_size=CFG.valid_bs, shuffle=False, num_workers=CFG.num_workers, pin_memory=True)

    # Model
    model = build_model(num_classes).to(CFG.device)
    ema = ModelEmaV2(model, decay=CFG.ema_decay) if CFG.device=='cuda' else None

    # Mixup + losses
    mixup_base = CFG.mixup_prob if (CFG.mixup_alpha > 0 or CFG.cutmix_alpha > 0) else 0.0
    mixup_fn = Mixup(
        mixup_alpha=CFG.mixup_alpha, cutmix_alpha=CFG.cutmix_alpha, prob=mixup_base,
        switch_prob=CFG.mixup_switch_prob, mode='batch', num_classes=num_classes
    ) if mixup_base > 0 else None
    criterion_hard = nn.CrossEntropyLoss(label_smoothing=0.0).to(CFG.device)  # no LS
    criterion_soft = SoftTargetCrossEntropy().to(CFG.device)

    def run_training(epochs, lr, use_mixup):
        optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=CFG.weight_decay)
        warmup_epochs = max(1, min(3, epochs//5))
        cosine_epochs = max(1, epochs - warmup_epochs)
        cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=cosine_epochs)
        warmup = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=0.1, total_iters=warmup_epochs)
        scheduler = torch.optim.lr_scheduler.SequentialLR(optimizer, schedulers=[warmup, cosine], milestones=[warmup_epochs])
        scaler = torch.amp.GradScaler('cuda', enabled=(CFG.device=='cuda'))

        best_val = 1e9
        no_improve = 0
        for epoch in range(1, epochs+1):
            model.train()
            # decay mixup with floor if enabled
            if use_mixup and mixup_fn is not None:
                frac = (epoch - 1) / max(1, epochs * 0.9)
                decay = max(0.05, 1.0 - min(1.0, frac))  # keep small floor
                mixup_fn.mixup_prob = mixup_base * decay
            else:
                if mixup_fn is not None:
                    mixup_fn.mixup_prob = 0.0

            train_loss, n = 0.0, 0
            for imgs, targets in train_loader:
                imgs = imgs.to(CFG.device, non_blocking=True).to(memory_format=torch.channels_last)
                targets = targets.to(CFG.device, non_blocking=True)
                if use_mixup and mixup_fn is not None and mixup_fn.mixup_prob > 0:
                    imgs, targets = mixup_fn(imgs, targets)
                optimizer.zero_grad(set_to_none=True)
                with torch.amp.autocast('cuda', enabled=(CFG.device=='cuda')):
                    logits = model(imgs)
                    loss = criterion_soft(logits, targets) if targets.dtype.is_floating_point else criterion_hard(logits, targets)
                scaler.scale(loss).backward()
                scaler.unscale_(optimizer)
                if CFG.grad_clip:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.grad_clip)
                scaler.step(optimizer); scaler.update()
                if ema is not None: ema.update(model)
                train_loss += loss.item() * imgs.size(0); n += imgs.size(0)
            train_loss /= max(1,n)

            # validation
            model.eval()
            val_loss, m = 0.0, 0
            with torch.no_grad(), torch.amp.autocast('cuda', enabled=(CFG.device=='cuda')):
                for imgs, targets in valid_loader:
                    imgs = imgs.to(CFG.device, non_blocking=True).to(memory_format=torch.channels_last)
                    targets = targets.to(CFG.device, non_blocking=True)
                    logits = ema.module(imgs) if ema is not None else model(imgs)
                    loss = criterion_hard(logits, targets)
                    val_loss += loss.item() * imgs.size(0); m += imgs.size(0)
            val_loss /= max(1,m)
            scheduler.step()
            print(f'Epoch {epoch}/{epochs} - train {train_loss:.4f} - val {val_loss:.4f} - mixup_p {mixup_fn.mixup_prob if (use_mixup and mixup_fn) else 0:.2f}')
            # ES
            improved = val_loss < getattr(run_training, 'best_val', 1e9) - 1e-5
            if improved:
                run_training.best_val = val_loss
                run_training.best_state = (ema.module.state_dict() if ema is not None else model.state_dict())
                no_improve = 0
            else:
                no_improve += 1
                if no_improve >= CFG.early_stop_patience:
                    print('Early stopping')
                    break

        # load best
        if hasattr(run_training, 'best_state'):
            model.load_state_dict(run_training.best_state, strict=False)

    # Stage 1: train with mixup
    print(f'Fold {fold} - Stage 1 training with mixup, epochs={stage1_epochs}')
    run_training(stage1_epochs, lr=CFG.lr, use_mixup=True)

    # Stage 2: fine-tune without mixup at lower LR
    print(f'Fold {fold} - Stage 2 fine-tune no-mixup, epochs={stage2_epochs}')
    run_training(stage2_epochs, lr=1e-5, use_mixup=False)

    # Save best checkpoint
    best_path = os.path.join(CFG.out_dir, f'fold{fold}_b2_two_stage_best.pth')
    torch.save({'model': (ema.module.state_dict() if ema is not None else model.state_dict())}, best_path)

    # OOF logits
    model.eval()
    valid_loader = DataLoader(valid_ds, batch_size=CFG.valid_bs, shuffle=False, num_workers=CFG.num_workers, pin_memory=True)
    oof_logits = []
    with torch.no_grad(), torch.amp.autocast('cuda', enabled=(CFG.device=='cuda')):
        for imgs, targets in valid_loader:
            imgs = imgs.to(CFG.device, non_blocking=True).to(memory_format=torch.channels_last)
            logits = model(imgs)
            oof_logits.append(logits.detach().cpu().float())
    oof_logits = torch.cat(oof_logits, dim=0).numpy()

    # cleanup
    del model
    if ema is not None: del ema
    torch.cuda.empty_cache(); gc.collect()
    return best_path, oof_logits, val_df.index.values

# Fresh data and run a short CV (first fold only to gauge quickly)
labels, ss, class_names, class2idx, idx2class = read_data()
num_classes = len(class_names)
folds_to_run = [0]  # run one fold quickly
all_oof_logits_b2 = np.zeros((len(labels), num_classes), dtype=np.float32)
all_val_idx_b2 = []
ckpts_b2 = []

for f in folds_to_run:
    p, oof_l, val_idx = train_two_stage_fold(f, labels, num_classes, stage1_epochs=6, stage2_epochs=6)
    all_oof_logits_b2[val_idx] = oof_l
    all_val_idx_b2.extend(val_idx.tolist())
    ckpts_b2.append(p)

if len(all_val_idx_b2) > 0:
    oof_probs_b2 = torch.softmax(torch.tensor(all_oof_logits_b2[all_val_idx_b2]), dim=1).numpy()
    oof_loss_b2 = log_loss(labels.loc[all_val_idx_b2, 'target'].values, oof_probs_b2)
    print('EffNet-B2 two-stage partial OOF logloss:', oof_loss_b2)

# Quick test-time average over available folds
if len(ckpts_b2) > 0:
    test_df = ss[['id']].copy()
    test_df['filepath'] = test_df['id'].apply(lambda x: f'test/{x}.jpg')
    test_ds = DogDataset(test_df[['id','filepath']], transforms=valid_tfms)
    test_loader = DataLoader(test_ds, batch_size=CFG.valid_bs, shuffle=False, num_workers=CFG.num_workers, pin_memory=True)
    def infer_ckpt(pth):
        m = build_model(num_classes).to(CFG.device)
        ck = torch.load(pth, map_location=CFG.device)
        m.load_state_dict(ck['model'], strict=False); m.eval()
        outs = []
        with torch.no_grad(), torch.amp.autocast('cuda', enabled=(CFG.device=='cuda')):
            for imgs, _ids in test_loader:
                imgs = imgs.to(CFG.device, non_blocking=True).to(memory_format=torch.channels_last)
                logits = m(imgs)
                if CFG.tta and CFG.tta_hflip:
                    logits = (logits + m(torch.flip(imgs, dims=[3]))) / 2.0
                outs.append(logits.detach().cpu().float())
        logits = torch.cat(outs, dim=0); del m; torch.cuda.empty_cache(); gc.collect(); return logits
    sum_probs = None
    for p in ckpts_b2:
        lg = infer_ckpt(p)
        pr = torch.softmax(lg, dim=1).numpy()
        sum_probs = pr if sum_probs is None else (sum_probs + pr)
    avg_probs = sum_probs / len(ckpts_b2)
    probs = np.clip(avg_probs, 1e-9, 1-1e-9); probs = probs / probs.sum(axis=1, keepdims=True)
    sub = pd.DataFrame(probs, columns=class_names)
    sub.insert(0, 'id', ss['id'])
    sub.to_csv('submission.csv', index=False)
    print('Saved B2 two-stage partial-fold submission.csv with shape', sub.shape)

  model = create_fn(


Fold 0 - Stage 1 training with mixup, epochs=6


Epoch 1/6 - train 4.7333 - val 4.8298 - mixup_p 0.30




Epoch 2/6 - train 2.8073 - val 4.6771 - mixup_p 0.24


Epoch 3/6 - train 1.6856 - val 4.4105 - mixup_p 0.19


Epoch 4/6 - train 1.4607 - val 4.1067 - mixup_p 0.13


Epoch 5/6 - train 1.3399 - val 3.7889 - mixup_p 0.08


Epoch 6/6 - train 1.2845 - val 3.4694 - mixup_p 0.02
Fold 0 - Stage 2 fine-tune no-mixup, epochs=6


Epoch 1/6 - train 3.4556 - val 3.4570 - mixup_p 0.00


Epoch 2/6 - train 2.7004 - val 3.3874 - mixup_p 0.00


Epoch 3/6 - train 1.8098 - val 3.2533 - mixup_p 0.00


Epoch 4/6 - train 1.3923 - val 3.0861 - mixup_p 0.00


Epoch 5/6 - train 1.1919 - val 2.9039 - mixup_p 0.00


Epoch 6/6 - train 1.1190 - val 2.7196 - mixup_p 0.00


EffNet-B2 two-stage partial OOF logloss: 2.7196442919473403


  model = create_fn(


Saved B2 two-stage partial-fold submission.csv with shape (1023, 121)


In [31]:
# Per-fold weighted ensemble with multi-scale TTA and per-fold temperature (force B4 checkpoints/model)
import torch, numpy as np, pandas as pd, gc, timm
import albumentations as A
from albumentations.pytorch import ToTensorV2
from torch.utils.data import DataLoader
from sklearn.metrics import log_loss

assert 'ckpts' in globals() and len(ckpts) == 5, 'Need ckpts from 5-fold training (B4)'
assert 'all_oof_logits' in globals() and 'labels' in globals(), 'Need OOF logits & labels'

# Use the architecture and size that produced ckpts (EffNet-B4 @512)
BASE_MODEL_NAME = 'tf_efficientnet_b4_ns'
BASE_SIZE = 512

# Ensure per-fold temperatures exist; if not, fit them
if 'per_fold_T' not in globals():
    per_fold_T = []
    for f in range(5):
        val_idx_f = labels.index[labels.fold == f].values
        y_true_f = labels.loc[val_idx_f, 'target'].values
        oof_logits_f = torch.tensor(all_oof_logits[val_idx_f], dtype=torch.float32, device=CFG.device)
        T = torch.tensor([1.0], dtype=torch.float32, device=CFG.device, requires_grad=True)
        opt = torch.optim.LBFGS([T], lr=0.1, max_iter=100, line_search_fn='strong_wolfe')
        def _closure():
            opt.zero_grad()
            scaled = oof_logits_f / torch.clamp(T, min=1e-3)
            log_probs = torch.log_softmax(scaled, dim=1)
            idx = torch.arange(len(y_true_f), device=CFG.device)
            y_t = torch.tensor(y_true_f, device=CFG.device)
            nll = -log_probs[idx, y_t].mean()
            nll.backward()
            return nll
        opt.step(_closure)
        per_fold_T.append(float(T.detach().clamp(min=1e-3).cpu().item()))

# Compute per-fold OOF logloss after temperature to derive weights
per_fold_loss = []
for f, Tf in enumerate(per_fold_T):
    val_idx_f = labels.index[labels.fold == f].values
    y_true_f = labels.loc[val_idx_f, 'target'].values
    oof_logits_f = torch.tensor(all_oof_logits[val_idx_f], dtype=torch.float32, device='cpu')
    probs_f = torch.softmax(oof_logits_f / Tf, dim=1).numpy()
    loss_f = log_loss(y_true_f, probs_f)
    per_fold_loss.append(loss_f)
per_fold_loss = np.array(per_fold_loss, dtype=np.float64)
w = np.exp(-per_fold_loss); w = w / w.sum()
print('Per-fold losses:', per_fold_loss, 'weights:', w)

# Multi-scale TTA helpers
ss = pd.read_csv('sample_submission.csv')
test_df = ss[['id']].copy()
test_df['filepath'] = test_df['id'].apply(lambda x: f'test/{x}.jpg')
num_classes = len([c for c in ss.columns if c != 'id'])

def make_tta_tfms(size):
    return A.Compose([
        A.Resize(height=size, width=size),
        A.CenterCrop(height=BASE_SIZE, width=BASE_SIZE, p=1.0) if size > BASE_SIZE else A.NoOp(),
        A.Normalize(mean=(0.485,0.456,0.406), std=(0.229,0.224,0.225)),
        ToTensorV2(),
    ])

def predict_test_logits_with_tfms(ckpt_path, tfms):
    ds = DogDataset(test_df[['id','filepath']], transforms=tfms)
    dl = DataLoader(ds, batch_size=CFG.valid_bs, shuffle=False, num_workers=CFG.num_workers, pin_memory=True)
    model = timm.create_model(BASE_MODEL_NAME, pretrained=False, num_classes=num_classes).to(CFG.device)
    ckpt = torch.load(ckpt_path, map_location=CFG.device)
    model.load_state_dict(ckpt['model'], strict=False)
    model.eval()
    outs = []
    with torch.no_grad(), torch.amp.autocast('cuda', enabled=(CFG.device=='cuda')):
        for imgs, _ids in dl:
            imgs = imgs.to(CFG.device, non_blocking=True).to(memory_format=torch.channels_last)
            logits = model(imgs)
            if CFG.tta and CFG.tta_hflip:
                logits = (logits + model(torch.flip(imgs, dims=[3]))) / 2.0
            outs.append(logits.detach().cpu().float())
    logits_cat = torch.cat(outs, dim=0)
    del model; torch.cuda.empty_cache(); gc.collect()
    return logits_cat

scales = [int(BASE_SIZE*0.9), BASE_SIZE, int(BASE_SIZE*1.15)]
scales = sorted(list(set([max(224, s) for s in scales])))
print('Weighted TTA scales:', scales, '| base size:', BASE_SIZE)

# Weighted average over folds: avg logits across scales per fold -> temp -> softmax -> weighted sum
probs_weighted = None
for f, (p, Tf) in enumerate(zip(ckpts, per_fold_T)):
    scale_logits_sum = None
    for s in scales:
        tfms = make_tta_tfms(s)
        logits = predict_test_logits_with_tfms(p, tfms)
        scale_logits_sum = logits if scale_logits_sum is None else (scale_logits_sum + logits)
    logits_avg_scales = scale_logits_sum / len(scales)
    probs_f = torch.softmax(logits_avg_scales / Tf, dim=1).numpy()
    probs_weighted = probs_f * w[f] if probs_weighted is None else (probs_weighted + probs_f * w[f])

probs = np.clip(probs_weighted, 1e-9, 1-1e-9)
probs = probs / probs.sum(axis=1, keepdims=True)
sub = pd.DataFrame(probs, columns=[c for c in ss.columns if c != 'id'])
sub.insert(0, 'id', ss['id'])
sub.to_csv('submission.csv', index=False)
print('Saved weighted per-fold TTA+temp submission.csv with shape', sub.shape)

Per-fold losses: [0.39534907 0.44786684 0.4686756  0.523403   0.49069849] weights: [0.21427059 0.20330797 0.19912109 0.18851654 0.19478381]
Weighted TTA scales: [460, 512, 588] | base size: 512


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


Saved weighted per-fold TTA+temp submission.csv with shape (1023, 121)


In [33]:
# Extended multi-scale TTA + per-fold temperature + per-fold weighting (denser scales)
import torch, numpy as np, pandas as pd, gc, timm, albumentations as A
from albumentations.pytorch import ToTensorV2
from torch.utils.data import DataLoader
from sklearn.metrics import log_loss

assert 'ckpts' in globals() and len(ckpts) == 5, 'Need ckpts from 5-fold training (B4)'
assert 'all_oof_logits' in globals() and 'labels' in globals(), 'Need OOF logits & labels'

BASE_MODEL_NAME = 'tf_efficientnet_b4_ns'
BASE_SIZE = 512

# Ensure per_fold_T exists; compute if missing
if 'per_fold_T' not in globals():
    per_fold_T = []
    for f in range(5):
        val_idx_f = labels.index[labels.fold == f].values
        y_true_f = labels.loc[val_idx_f, 'target'].values
        oof_logits_f = torch.tensor(all_oof_logits[val_idx_f], dtype=torch.float32, device=CFG.device)
        T = torch.tensor([1.0], dtype=torch.float32, device=CFG.device, requires_grad=True)
        opt = torch.optim.LBFGS([T], lr=0.1, max_iter=100, line_search_fn='strong_wolfe')
        def _closure():
            opt.zero_grad()
            scaled = oof_logits_f / torch.clamp(T, min=1e-3)
            log_probs = torch.log_softmax(scaled, dim=1)
            idx = torch.arange(len(y_true_f), device=CFG.device)
            y_t = torch.tensor(y_true_f, device=CFG.device)
            nll = -log_probs[idx, y_t].mean()
            nll.backward()
            return nll
        opt.step(_closure)
        per_fold_T.append(float(T.detach().clamp(min=1e-3).cpu().item()))

# Per-fold weights from temp-scaled OOF
per_fold_loss = []
for f, Tf in enumerate(per_fold_T):
    val_idx_f = labels.index[labels.fold == f].values
    y_true_f = labels.loc[val_idx_f, 'target'].values
    oof_logits_f = torch.tensor(all_oof_logits[val_idx_f], dtype=torch.float32, device='cpu')
    probs_f = torch.softmax(oof_logits_f / Tf, dim=1).numpy()
    per_fold_loss.append(log_loss(y_true_f, probs_f))
per_fold_loss = np.array(per_fold_loss, dtype=np.float64)
w = np.exp(-per_fold_loss); w = w / w.sum()
print('Per-fold losses:', per_fold_loss, 'weights:', w)

ss = pd.read_csv('sample_submission.csv')
test_df = ss[['id']].copy()
test_df['filepath'] = test_df['id'].apply(lambda x: f'test/{x}.jpg')
num_classes = len([c for c in ss.columns if c != 'id'])

def make_tta_tfms(size):
    return A.Compose([
        A.Resize(height=size, width=size),
        A.CenterCrop(height=BASE_SIZE, width=BASE_SIZE, p=1.0) if size > BASE_SIZE else A.NoOp(),
        A.Normalize(mean=(0.485,0.456,0.406), std=(0.229,0.224,0.225)),
        ToTensorV2(),
    ])

def predict_test_logits_with_tfms(ckpt_path, tfms, batch_size):
    ds = DogDataset(test_df[['id','filepath']], transforms=tfms)
    dl = DataLoader(ds, batch_size=batch_size, shuffle=False, num_workers=CFG.num_workers, pin_memory=True)
    model = timm.create_model(BASE_MODEL_NAME, pretrained=False, num_classes=num_classes).to(CFG.device)
    ckpt = torch.load(ckpt_path, map_location=CFG.device)
    model.load_state_dict(ckpt['model'], strict=False)
    model.eval()
    outs = []
    with torch.no_grad(), torch.amp.autocast('cuda', enabled=(CFG.device=='cuda')):
        for imgs, _ids in dl:
            imgs = imgs.to(CFG.device, non_blocking=True).to(memory_format=torch.channels_last)
            logits = model(imgs)
            if CFG.tta and CFG.tta_hflip:
                logits = (logits + model(torch.flip(imgs, dims=[3]))) / 2.0
            outs.append(logits.detach().cpu().float())
    logits_cat = torch.cat(outs, dim=0)
    del model; torch.cuda.empty_cache(); gc.collect()
    return logits_cat

# Denser scale set; adjust batch for memory at larger scales
scales = sorted(list(set([int(BASE_SIZE*x) for x in [0.85, 0.9, 0.95, 1.0, 1.08, 1.15]])))
print('Extended TTA scales:', scales, '| base size:', BASE_SIZE)

def batch_for_size(size):
    # heuristic: reduce batch when > BASE_SIZE
    if size <= BASE_SIZE: return CFG.valid_bs
    if size <= int(BASE_SIZE*1.08): return max(8, CFG.valid_bs//2)
    return max(4, CFG.valid_bs//4)

probs_weighted = None
for f, (p, Tf) in enumerate(zip(ckpts, per_fold_T)):
    scale_logits_sum = None
    for s in scales:
        tfms = make_tta_tfms(s)
        bs = batch_for_size(s)
        logits = predict_test_logits_with_tfms(p, tfms, bs)
        scale_logits_sum = logits if scale_logits_sum is None else (scale_logits_sum + logits)
    logits_avg_scales = scale_logits_sum / len(scales)
    probs_f = torch.softmax(logits_avg_scales / Tf, dim=1).numpy()
    probs_weighted = probs_f * w[f] if probs_weighted is None else (probs_weighted + probs_f * w[f])

probs = np.clip(probs_weighted, 1e-9, 1-1e-9)
probs = probs / probs.sum(axis=1, keepdims=True)
sub = pd.DataFrame(probs, columns=[c for c in ss.columns if c != 'id'])
sub.insert(0, 'id', ss['id'])
sub.to_csv('submission.csv', index=False)
print('Saved extended-TTA weighted per-fold temp submission.csv with shape', sub.shape)

Per-fold losses: [0.39534907 0.44786684 0.4686756  0.523403   0.49069849] weights: [0.21427059 0.20330797 0.19912109 0.18851654 0.19478381]
Extended TTA scales: [435, 460, 486, 512, 552, 588] | base size: 512


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


Saved extended-TTA weighted per-fold temp submission.csv with shape (1023, 121)


In [34]:
# Strategy A (fast pivot): Corrected retraining with stronger regularization (folds 0-1) and fresh submission
import os, gc, time, math, json
import numpy as np, pandas as pd, torch, torch.nn as nn, albumentations as A, timm, cv2
from albumentations.pytorch import ToTensorV2
from torch.utils.data import DataLoader
from sklearn.metrics import log_loss

# Configure for B4 @512 with stronger regularization
CFG.model_name = 'tf_efficientnet_b4_ns'
CFG.img_size = 512
CFG.train_bs = 8
CFG.valid_bs = 16
CFG.epochs = 25
CFG.lr = 1e-4
CFG.weight_decay = 1e-4
CFG.early_stop_patience = 2  # tighter ES to catch peak

# Augmentations: stronger + CoarseDropout; keep normalization
train_tfms = A.Compose([
    A.RandomResizedCrop(size=(CFG.img_size, CFG.img_size), scale=(0.7, 1.0), ratio=(0.75, 1.333), p=1.0),
    A.HorizontalFlip(p=0.5),
    A.Affine(scale=(0.95, 1.05), translate_percent=(0.0, 0.03), rotate=(-12, 12), shear=(-3, 3), fit_output=False, border_mode=cv2.BORDER_REFLECT_101, p=0.6),
    A.ColorJitter(brightness=0.25, contrast=0.25, saturation=0.25, hue=0.05, p=0.5),
    A.CoarseDropout(max_holes=1, max_height=int(0.25*CFG.img_size), max_width=int(0.25*CFG.img_size), min_holes=1, min_height=int(0.1*CFG.img_size), min_width=int(0.1*CFG.img_size), fill_value=0, p=0.5),
    A.Normalize(mean=(0.485,0.456,0.406), std=(0.229,0.224,0.225)),
    ToTensorV2(),
])
valid_tfms = A.Compose([
    A.Resize(height=CFG.img_size, width=CFG.img_size),
    A.Normalize(mean=(0.485,0.456,0.406), std=(0.229,0.224,0.225)),
    ToTensorV2(),
])

def build_model_b4(num_classes):
    m = timm.create_model(CFG.model_name, pretrained=True, num_classes=num_classes)
    return m

def train_one_fold_v2(fold, labels, num_classes):
    print(f'\n=== Fast Retrain Fold {fold} ===')
    trn_df = labels[labels.fold != fold][['filepath','target']].copy()
    val_df = labels[labels.fold == fold][['filepath','target']].copy()
    train_ds = DogDataset(trn_df, transforms=train_tfms)
    valid_ds = DogDataset(val_df, transforms=valid_tfms)
    train_loader = DataLoader(train_ds, batch_size=CFG.train_bs, shuffle=True, num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_ds, batch_size=CFG.valid_bs, shuffle=False, num_workers=CFG.num_workers, pin_memory=True)

    model = build_model_b4(num_classes).to(CFG.device)
    ema = ModelEmaV2(model, decay=CFG.ema_decay) if CFG.device=='cuda' else None

    # Mixup with floor (do not decay to 0)
    mixup_base = CFG.mixup_prob if (CFG.mixup_alpha > 0 or CFG.cutmix_alpha > 0) else 0.0
    mixup_fn = Mixup(
        mixup_alpha=CFG.mixup_alpha, cutmix_alpha=CFG.cutmix_alpha, prob=mixup_base,
        switch_prob=CFG.mixup_switch_prob, mode='batch', num_classes=num_classes
    ) if mixup_base > 0 else None

    criterion_hard = nn.CrossEntropyLoss().to(CFG.device)
    criterion_soft = SoftTargetCrossEntropy().to(CFG.device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=CFG.lr, weight_decay=CFG.weight_decay)
    # 5-epoch warmup, cosine after; manual LR drop at epoch 8
    warmup_epochs = min(5, max(1, CFG.epochs//6))
    cosine_epochs = max(1, CFG.epochs - warmup_epochs)
    cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=cosine_epochs)
    warmup = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=0.2, total_iters=warmup_epochs)
    scheduler = torch.optim.lr_scheduler.SequentialLR(optimizer, schedulers=[warmup, cosine], milestones=[warmup_epochs])
    scaler = torch.amp.GradScaler('cuda', enabled=(CFG.device=='cuda'))

    best_val = 1e9; best_state = None; no_improve = 0
    for epoch in range(1, CFG.epochs+1):
        model.train(); t0=time.time()
        if mixup_fn is not None:
            frac = (epoch - 1) / max(1, CFG.epochs * 0.9)
            decay = max(0.0, 1.0 - min(1.0, frac))
            mixup_fn.mixup_prob = max(0.05, mixup_base * decay)  # floor at 0.05
        if epoch == 8:
            for g in optimizer.param_groups:
                g['lr'] = 5e-5

        run_loss = 0.0; n=0
        for imgs, targets in train_loader:
            imgs = imgs.to(CFG.device, non_blocking=True).to(memory_format=torch.channels_last)
            targets = targets.to(CFG.device, non_blocking=True)
            if mixup_fn is not None and mixup_fn.mixup_prob > 0:
                imgs, targets = mixup_fn(imgs, targets)
            optimizer.zero_grad(set_to_none=True)
            with torch.amp.autocast('cuda', enabled=(CFG.device=='cuda')):
                logits = model(imgs)
                loss = criterion_soft(logits, targets) if targets.dtype.is_floating_point else criterion_hard(logits, targets)
            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            if CFG.grad_clip and CFG.grad_clip>0:
                nn.utils.clip_grad_norm_(model.parameters(), CFG.grad_clip)
            scaler.step(optimizer); scaler.update()
            if ema is not None: ema.update(model)
            run_loss += loss.item()*imgs.size(0); n+=imgs.size(0)
        train_loss = run_loss/max(1,n)

        # validate
        model.eval(); val_loss=0.0; m=0; val_crit = nn.CrossEntropyLoss().to(CFG.device)
        with torch.no_grad(), torch.amp.autocast('cuda', enabled=(CFG.device=='cuda')):
            for imgs, targets in valid_loader:
                imgs = imgs.to(CFG.device, non_blocking=True).to(memory_format=torch.channels_last)
                targets = targets.to(CFG.device, non_blocking=True)
                logits = (ema.module(imgs) if ema is not None else model(imgs))
                l = val_crit(logits, targets)
                val_loss += l.item()*imgs.size(0); m+=imgs.size(0)
        val_loss/=max(1,m); scheduler.step()
        print(f'Epoch {epoch}/{CFG.epochs} - train {train_loss:.4f} - val {val_loss:.4f} - mixup_p {mixup_fn.mixup_prob if mixup_fn else 0:.2f}')
        if val_loss < best_val - 1e-6:
            best_val = val_loss; best_state = (ema.module.state_dict() if ema is not None else model.state_dict()); no_improve=0
        else:
            no_improve += 1
            if no_improve >= CFG.early_stop_patience:
                print('Early stopping'); break

    if best_state is not None: model.load_state_dict(best_state, strict=False)
    best_path = os.path.join(CFG.out_dir, f'fold{fold}_fastfix_best.pth')
    torch.save({'model': model.state_dict()}, best_path)

    # OOF logits
    model.eval(); oof_logits=[]
    with torch.no_grad(), torch.amp.autocast('cuda', enabled=(CFG.device=='cuda')):
        for imgs, targets in valid_loader:
            imgs = imgs.to(CFG.device, non_blocking=True).to(memory_format=torch.channels_last)
            logits = model(imgs)
            oof_logits.append(logits.detach().cpu().float())
    oof_logits = torch.cat(oof_logits, dim=0).numpy()

    del model, optimizer, scaler
    if ema is not None: del ema
    torch.cuda.empty_cache(); gc.collect()
    return best_path, oof_logits, val_df.index.values

# Fresh data
labels, ss, class_names, class2idx, idx2class = read_data()
num_classes = len(class_names)

# Train 1-2 folds (time-bounded)
folds_to_run = [0,1]
ckpts_fast = []; all_oof_logits_fast = np.zeros((len(labels), num_classes), dtype=np.float32); all_val_idx_fast=[]
for f in folds_to_run:
    p, oof_l, val_idx = train_one_fold_v2(f, labels, num_classes)
    ckpts_fast.append(p); all_oof_logits_fast[val_idx] = oof_l; all_val_idx_fast.extend(val_idx.tolist())

if len(all_val_idx_fast)>0:
    oof_probs_fast = torch.softmax(torch.tensor(all_oof_logits_fast[all_val_idx_fast]), dim=1).numpy()
    print('Fast OOF logloss (partial folds):', log_loss(labels.loc[all_val_idx_fast, 'target'].values, oof_probs_fast))

# Per-fold temperature for new folds
per_fold_T_fast = {}
for f in folds_to_run:
    val_idx_f = labels.index[labels.fold == f].values
    y_true_f = labels.loc[val_idx_f, 'target'].values
    logits_f = torch.tensor(all_oof_logits_fast[val_idx_f], dtype=torch.float32, device=CFG.device)
    T = torch.tensor([1.0], dtype=torch.float32, device=CFG.device, requires_grad=True)
    opt = torch.optim.LBFGS([T], lr=0.1, max_iter=100, line_search_fn='strong_wolfe')
    def _closure():
        opt.zero_grad()
        scaled = logits_f / torch.clamp(T, min=1e-3)
        idx = torch.arange(len(y_true_f), device=CFG.device)
        nll = -torch.log_softmax(scaled, dim=1)[idx, torch.tensor(y_true_f, device=CFG.device)].mean()
        nll.backward(); return nll
    opt.step(_closure)
    per_fold_T_fast[f] = float(T.detach().clamp(min=1e-3).cpu().item())

# Inference with 3-scale TTA and per-fold temp for the new folds only
def make_tta_tfms(size, base=CFG.img_size):
    return A.Compose([
        A.Resize(height=size, width=size),
        A.CenterCrop(height=base, width=base, p=1.0) if size > base else A.NoOp(),
        A.Normalize(mean=(0.485,0.456,0.406), std=(0.229,0.224,0.225)),
        ToTensorV2(),
    ])

def predict_test_logits_ckpt(ckpt_path, size_list):
    ss_local = pd.read_csv('sample_submission.csv')
    test_df = ss_local[['id']].copy(); test_df['filepath'] = test_df['id'].apply(lambda x: f'test/{x}.jpg')
    probs_sum = None
    for s in size_list:
        ds = DogDataset(test_df[['id','filepath']], transforms=make_tta_tfms(s))
        dl = DataLoader(ds, batch_size=CFG.valid_bs, shuffle=False, num_workers=CFG.num_workers, pin_memory=True)
        model = timm.create_model(CFG.model_name, pretrained=False, num_classes=num_classes).to(CFG.device)
        ckpt = torch.load(ckpt_path, map_location=CFG.device)
        model.load_state_dict(ckpt['model'], strict=False); model.eval()
        outs = []
        with torch.no_grad(), torch.amp.autocast('cuda', enabled=(CFG.device=='cuda')):
            for imgs, _ids in dl:
                imgs = imgs.to(CFG.device, non_blocking=True).to(memory_format=torch.channels_last)
                logits = model(imgs)
                if CFG.tta and CFG.tta_hflip:
                    logits = (logits + model(torch.flip(imgs, dims=[3]))) / 2.0
                outs.append(logits.detach().cpu().float())
        logits_cat = torch.cat(outs, dim=0); del model; torch.cuda.empty_cache(); gc.collect()
        probs = torch.softmax(logits_cat, dim=1).numpy()
        probs_sum = probs if probs_sum is None else (probs_sum + probs)
    return probs_sum / len(size_list)

scales = sorted(list(set([int(CFG.img_size*0.9), CFG.img_size, int(CFG.img_size*1.15)])))
print('Fast-TTA scales:', scales)

ss_local = pd.read_csv('sample_submission.csv')
blend = None
for f, p in zip(folds_to_run, ckpts_fast):
    probs = predict_test_logits_ckpt(p, scales)
    T = per_fold_T_fast[f]
    logits = torch.log(torch.tensor(probs)).float()  # invert-softmax approx to scale by T
    probs_scaled = torch.softmax(logits / T, dim=1).numpy()
    blend = probs_scaled if blend is None else (blend + probs_scaled)
blend = blend / len(folds_to_run)
blend = np.clip(blend, 1e-9, 1-1e-9); blend = blend / blend.sum(axis=1, keepdims=True)
sub = pd.DataFrame(blend, columns=[c for c in ss_local.columns if c != 'id'])
sub.insert(0, 'id', ss_local['id']); sub.to_csv('submission.csv', index=False)
print('Saved fast-pivot submission.csv', sub.shape)

  A.CoarseDropout(max_holes=1, max_height=int(0.25*CFG.img_size), max_width=int(0.25*CFG.img_size), min_holes=1, min_height=int(0.1*CFG.img_size), min_width=int(0.1*CFG.img_size), fill_value=0, p=0.5),
  model = create_fn(



=== Fast Retrain Fold 0 ===


Epoch 1/25 - train 3.9793 - val 4.7247 - mixup_p 0.30


Epoch 2/25 - train 2.1537 - val 4.3571 - mixup_p 0.29


Epoch 3/25 - train 1.6104 - val 3.8160 - mixup_p 0.27


Epoch 4/25 - train 1.4752 - val 3.1868 - mixup_p 0.26




Epoch 5/25 - train 1.3752 - val 2.5444 - mixup_p 0.25


Epoch 6/25 - train 1.3186 - val 1.9430 - mixup_p 0.23


Epoch 7/25 - train 1.2353 - val 1.4477 - mixup_p 0.22


Epoch 8/25 - train 1.1467 - val 1.0826 - mixup_p 0.21


Epoch 9/25 - train 1.1324 - val 0.8422 - mixup_p 0.19


Epoch 10/25 - train 1.0603 - val 0.6936 - mixup_p 0.18


Epoch 11/25 - train 1.0784 - val 0.6022 - mixup_p 0.17


Epoch 12/25 - train 1.0544 - val 0.5469 - mixup_p 0.15


Epoch 13/25 - train 1.0605 - val 0.5157 - mixup_p 0.14


Epoch 14/25 - train 1.0386 - val 0.4984 - mixup_p 0.13


Epoch 15/25 - train 1.0219 - val 0.4924 - mixup_p 0.11


Epoch 16/25 - train 1.0521 - val 0.4926 - mixup_p 0.10


Epoch 17/25 - train 1.0539 - val 0.4951 - mixup_p 0.09
Early stopping



=== Fast Retrain Fold 1 ===


  model = create_fn(


Epoch 1/25 - train 4.0248 - val 4.7278 - mixup_p 0.30


Epoch 2/25 - train 2.1440 - val 4.3622 - mixup_p 0.29


Epoch 3/25 - train 1.6035 - val 3.8230 - mixup_p 0.27


Epoch 4/25 - train 1.4672 - val 3.1976 - mixup_p 0.26




Epoch 5/25 - train 1.4119 - val 2.5500 - mixup_p 0.25


Epoch 6/25 - train 1.3231 - val 1.9571 - mixup_p 0.23


Epoch 7/25 - train 1.1995 - val 1.4556 - mixup_p 0.22


Epoch 8/25 - train 1.1406 - val 1.0924 - mixup_p 0.21


Epoch 9/25 - train 1.1131 - val 0.8595 - mixup_p 0.19


Epoch 10/25 - train 1.0782 - val 0.7190 - mixup_p 0.18


Epoch 11/25 - train 1.0570 - val 0.6380 - mixup_p 0.17


Epoch 12/25 - train 1.0871 - val 0.5924 - mixup_p 0.15


Epoch 13/25 - train 1.0740 - val 0.5683 - mixup_p 0.14


Epoch 14/25 - train 1.0447 - val 0.5568 - mixup_p 0.13


Epoch 15/25 - train 1.0822 - val 0.5532 - mixup_p 0.11


Epoch 16/25 - train 1.0629 - val 0.5546 - mixup_p 0.10


Epoch 17/25 - train 1.0545 - val 0.5596 - mixup_p 0.09
Early stopping


Fast OOF logloss (partial folds): 0.5273450143491253


Fast-TTA scales: [460, 512, 588]


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


Saved fast-pivot submission.csv (1023, 121)


In [37]:
# Final push: Fix regularization bug, retrain a single fold with custom Cutout, generate standalone and 50/50 blend submissions
import os, gc, time, math, json
import numpy as np, pandas as pd, torch, torch.nn as nn, albumentations as A, timm, cv2
from albumentations.pytorch import ToTensorV2
from torch.utils.data import DataLoader
from sklearn.metrics import log_loss
from timm.data import Mixup
from timm.loss import SoftTargetCrossEntropy
from timm.utils import ModelEmaV2

# Config for single-fold retrain using B4 @512 with aggressive regularization
CFG.model_name = 'tf_efficientnet_b4_ns'
CFG.img_size = 512
CFG.train_bs = 8
CFG.valid_bs = 16
CFG.epochs = 25
CFG.lr = 1e-4
CFG.weight_decay = 1e-4
CFG.early_stop_patience = 3
CFG.mixup_alpha = 0.2
CFG.cutmix_alpha = 0.0
CFG.mixup_prob = 0.6  # stronger mixup prob
CFG.mixup_switch_prob = 0.0
CFG.ema_decay = 0.9998
CFG.num_workers = 0  # avoid multiprocessing issues with custom transforms

# Custom numpy Cutout to avoid albumentations API differences
def cutout_np(image, num_holes=8, max_h=64, max_w=64):
    h, w = image.shape[:2]
    img = image.copy()
    for _ in range(num_holes):
        ch = np.random.randint(max_h//2, max_h+1)
        cw = np.random.randint(max_w//2, max_w+1)
        cy = np.random.randint(0, h)
        cx = np.random.randint(0, w)
        y1 = max(0, cy - ch//2); y2 = min(h, cy + ch//2)
        x1 = max(0, cx - cw//2); x2 = min(w, cx + cw//2)
        img[y1:y2, x1:x2, :] = 0
    return img

class CutoutNP(A.ImageOnlyTransform):
    def __init__(self, num_holes=8, max_h=64, max_w=64, always_apply=False, p=0.5):
        super().__init__(always_apply=always_apply, p=p)
        self.num_holes = num_holes
        self.max_h = max_h
        self.max_w = max_w
    def apply(self, image, **params):
        return cutout_np(image, self.num_holes, self.max_h, self.max_w)

# Albumentations: widened RRC scale + custom Cutout
train_tfms = A.Compose([
    A.RandomResizedCrop(size=(CFG.img_size, CFG.img_size), scale=(0.6, 1.0), ratio=(0.75, 1.333), p=1.0),
    A.HorizontalFlip(p=0.5),
    A.Affine(scale=(0.95, 1.05), translate_percent=(0.0, 0.03), rotate=(-12, 12), shear=(-3, 3), fit_output=False, border_mode=cv2.BORDER_REFLECT_101, p=0.6),
    A.ColorJitter(brightness=0.25, contrast=0.25, saturation=0.25, hue=0.05, p=0.5),
    CutoutNP(num_holes=8, max_h=64, max_w=64, p=0.5),
    A.Normalize(mean=(0.485,0.456,0.406), std=(0.229,0.224,0.225)),
    ToTensorV2(),
])
valid_tfms = A.Compose([
    A.Resize(height=CFG.img_size, width=CFG.img_size),
    A.Normalize(mean=(0.485,0.456,0.406), std=(0.229,0.224,0.225)),
    ToTensorV2(),
])

def build_model_final(num_classes):
    return timm.create_model(CFG.model_name, pretrained=True, num_classes=num_classes)

def train_one_fold_final(fold, labels, num_classes):
    print(f'\n=== Final Fix Retrain Fold {fold} ===')
    trn_df = labels[labels.fold != fold][['filepath','target']].copy()
    val_df = labels[labels.fold == fold][['filepath','target']].copy()
    train_ds = DogDataset(trn_df, transforms=train_tfms)
    valid_ds = DogDataset(val_df, transforms=valid_tfms)
    train_loader = DataLoader(train_ds, batch_size=CFG.train_bs, shuffle=True, num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_ds, batch_size=CFG.valid_bs, shuffle=False, num_workers=CFG.num_workers, pin_memory=True)

    model = build_model_final(num_classes).to(CFG.device)
    ema = ModelEmaV2(model, decay=CFG.ema_decay) if CFG.device=='cuda' else None

    mixup_base = CFG.mixup_prob if (CFG.mixup_alpha > 0 or CFG.cutmix_alpha > 0) else 0.0
    mixup_fn = Mixup(
        mixup_alpha=CFG.mixup_alpha, cutmix_alpha=CFG.cutmix_alpha, prob=mixup_base,
        switch_prob=CFG.mixup_switch_prob, mode='batch', num_classes=num_classes
    ) if mixup_base > 0 else None

    criterion_hard = nn.CrossEntropyLoss(label_smoothing=0.1).to(CFG.device)  # add LS as regularizer
    criterion_soft = SoftTargetCrossEntropy().to(CFG.device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=CFG.lr, weight_decay=CFG.weight_decay)
    warmup_epochs = min(5, max(1, CFG.epochs//6))
    cosine_epochs = max(1, CFG.epochs - warmup_epochs)
    cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=cosine_epochs)
    warmup = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=0.2, total_iters=warmup_epochs)
    scheduler = torch.optim.lr_scheduler.SequentialLR(optimizer, schedulers=[warmup, cosine], milestones=[warmup_epochs])
    scaler = torch.amp.GradScaler('cuda', enabled=(CFG.device=='cuda'))

    best_val = 1e9; best_state = None; no_improve = 0
    for epoch in range(1, CFG.epochs+1):
        model.train()
        # decay mixup but keep a floor of 0.05
        if mixup_fn is not None:
            frac = (epoch - 1) / max(1, CFG.epochs * 0.9)
            decay = max(0.0, 1.0 - min(1.0, frac))
            mixup_fn.mixup_prob = max(0.05, mixup_base * decay)

        run_loss = 0.0; n = 0
        for imgs, targets in train_loader:
            imgs = imgs.to(CFG.device, non_blocking=True).to(memory_format=torch.channels_last)
            targets = targets.to(CFG.device, non_blocking=True)
            if mixup_fn is not None and mixup_fn.mixup_prob > 0:
                imgs, targets = mixup_fn(imgs, targets)
            optimizer.zero_grad(set_to_none=True)
            with torch.amp.autocast('cuda', enabled=(CFG.device=='cuda')):
                logits = model(imgs)
                loss = criterion_soft(logits, targets) if targets.dtype.is_floating_point else criterion_hard(logits, targets)
            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            if CFG.grad_clip and CFG.grad_clip>0:
                nn.utils.clip_grad_norm_(model.parameters(), CFG.grad_clip)
            scaler.step(optimizer); scaler.update()
            if ema is not None: ema.update(model)
            run_loss += loss.item() * imgs.size(0); n += imgs.size(0)
        train_loss = run_loss / max(1,n)

        # validation with strict CE (no mixup, use EMA)
        model.eval(); val_loss=0.0; m=0; val_crit = nn.CrossEntropyLoss().to(CFG.device)
        with torch.no_grad(), torch.amp.autocast('cuda', enabled=(CFG.device=='cuda')):
            for imgs, targets in valid_loader:
                imgs = imgs.to(CFG.device, non_blocking=True).to(memory_format=torch.channels_last)
                targets = targets.to(CFG.device, non_blocking=True)
                logits = (ema.module(imgs) if ema is not None else model(imgs))
                l = val_crit(logits, targets)
                val_loss += l.item() * imgs.size(0); m += imgs.size(0)
        val_loss /= max(1,m); scheduler.step()
        print(f'Epoch {epoch}/{CFG.epochs} - train {train_loss:.4f} - val {val_loss:.4f} - mixup_p {mixup_fn.mixup_prob if mixup_fn else 0:.2f}')
        if val_loss < best_val - 1e-6:
            best_val = val_loss; best_state = (ema.module.state_dict() if ema is not None else model.state_dict()); no_improve = 0
        else:
            no_improve += 1
            if no_improve >= CFG.early_stop_patience:
                print('Early stopping'); break

    if best_state is not None:
        model.load_state_dict(best_state, strict=False)
    best_path = os.path.join(CFG.out_dir, f'fold{fold}_finalfix_best.pth')
    torch.save({'model': model.state_dict()}, best_path)

    # OOF logits for this fold
    model.eval(); oof_logits = []
    with torch.no_grad(), torch.amp.autocast('cuda', enabled=(CFG.device=='cuda')):
        for imgs, targets in valid_loader:
            imgs = imgs.to(CFG.device, non_blocking=True).to(memory_format=torch.channels_last)
            logits = model(imgs)
            oof_logits.append(logits.detach().cpu().float())
    oof_logits = torch.cat(oof_logits, dim=0).numpy()

    del model, optimizer, scaler
    if ema is not None: del ema
    torch.cuda.empty_cache(); gc.collect()
    return best_path, oof_logits, val_df.index.values, float(best_val)

# Prepare data
labels_final, ss_final, class_names_final, class2idx_final, idx2class_final = read_data()
num_classes_final = len(class_names_final)
folds_to_run = [0]

# Train a single fold
ckpts_final = []; all_oof_logits_final = np.zeros((len(labels_final), num_classes_final), dtype=np.float32); all_val_idx_final = [];
best_paths = []; best_vals = []
for f in folds_to_run:
    p, oof_l, val_idx, best_v = train_one_fold_final(f, labels_final, num_classes_final)
    ckpts_final.append(p); best_paths.append(p); best_vals.append(best_v)
    all_oof_logits_final[val_idx] = oof_l; all_val_idx_final.extend(val_idx.tolist())

if len(all_val_idx_final) > 0:
    oof_probs_final = torch.softmax(torch.tensor(all_oof_logits_final[all_val_idx_final]), dim=1).numpy()
    print('Final single-fold OOF logloss:', log_loss(labels_final.loc[all_val_idx_final, 'target'].values, oof_probs_final))

# Per-fold temperature for the new fold
val_idx_f0 = labels_final.index[labels_final.fold == folds_to_run[0]].values
y_true_f0 = labels_final.loc[val_idx_f0, 'target'].values
logits_f0 = torch.tensor(all_oof_logits_final[val_idx_f0], dtype=torch.float32, device=CFG.device)
T = torch.tensor([1.0], dtype=torch.float32, device=CFG.device, requires_grad=True)
opt = torch.optim.LBFGS([T], lr=0.1, max_iter=100, line_search_fn='strong_wolfe')
def _closure_T():
    opt.zero_grad()
    scaled = logits_f0 / torch.clamp(T, min=1e-3)
    idx = torch.arange(len(y_true_f0), device=CFG.device)
    nll = -torch.log_softmax(scaled, dim=1)[idx, torch.tensor(y_true_f0, device=CFG.device)].mean()
    nll.backward(); return nll
opt.step(_closure_T)
T_newfold = float(T.detach().clamp(min=1e-3).cpu().item())
print('New fold temperature:', T_newfold)

# Inference helpers
def make_tta_tfms_final(size, base=CFG.img_size):
    return A.Compose([
        A.Resize(height=size, width=size),
        A.CenterCrop(height=base, width=base, p=1.0) if size > base else A.NoOp(),
        A.Normalize(mean=(0.485,0.456,0.406), std=(0.229,0.224,0.225)),
        ToTensorV2(),
    ])

def predict_test_probs_ckpt(ckpt_path, size_list, hflip=True):
    ss_loc = pd.read_csv('sample_submission.csv')
    test_df = ss_loc[['id']].copy(); test_df['filepath'] = test_df['id'].apply(lambda x: f'test/{x}.jpg')
    probs_sum = None
    for s in size_list:
        ds = DogDataset(test_df[['id','filepath']], transforms=make_tta_tfms_final(s))
        dl = DataLoader(ds, batch_size=CFG.valid_bs, shuffle=False, num_workers=CFG.num_workers, pin_memory=True)
        model = timm.create_model(CFG.model_name, pretrained=False, num_classes=num_classes_final).to(CFG.device)
        ckpt = torch.load(ckpt_path, map_location=CFG.device)
        model.load_state_dict(ckpt['model'], strict=False); model.eval()
        outs = []
        with torch.no_grad(), torch.amp.autocast('cuda', enabled=(CFG.device=='cuda')):
            for imgs, _ids in dl:
                imgs = imgs.to(CFG.device, non_blocking=True).to(memory_format=torch.channels_last)
                logits = model(imgs)
                if hflip:
                    logits = (logits + model(torch.flip(imgs, dims=[3]))) / 2.0
                outs.append(logits.detach().cpu().float())
        logits_cat = torch.cat(outs, dim=0)
        probs = torch.softmax(logits_cat, dim=1).numpy()
        probs_sum = probs if probs_sum is None else (probs_sum + probs)
        del model; torch.cuda.empty_cache(); gc.collect()
    return probs_sum / len(size_list)

# Generate Submission A: standalone new fold with temp scaling
scales_final = sorted(list(set([int(CFG.img_size*0.9), CFG.img_size, int(CFG.img_size*1.15)])))
print('TTA scales for new fold:', scales_final)
new_probs = predict_test_probs_ckpt(ckpts_final[0], scales_final, hflip=CFG.tta_hflip)
# temperature-scale the new probs: approximate by scaling logits
new_logits_approx = torch.log(torch.tensor(new_probs)).float()
new_probs_scaled = torch.softmax(new_logits_approx / T_newfold, dim=1).numpy()
subA = pd.DataFrame(np.clip(new_probs_scaled / new_probs_scaled.sum(axis=1, keepdims=True), 1e-9, 1-1e-9), columns=class_names_final)
subA.insert(0, 'id', pd.read_csv('sample_submission.csv')['id'])
subA.to_csv('submission_new_singlefold.csv', index=False)
print('Saved Submission A: submission_new_singlefold.csv', subA.shape)

# Generate Submission B: 50/50 blend with best existing 5-fold ensemble (recompute old best using ckpts & per-fold T/weights if available)
def compute_old_best_probs():
    assert 'ckpts' in globals() and len(ckpts) == CFG.n_folds, 'Need existing 5-fold ckpts from B4 run'
    # per-fold temperature
    if 'per_fold_T' not in globals():
        perT = []
        for f in range(CFG.n_folds):
            val_idx_f = labels.index[labels.fold == f].values
            y_true_f = labels.loc[val_idx_f, 'target'].values
            oof_logits_f = torch.tensor(all_oof_logits[val_idx_f], dtype=torch.float32, device=CFG.device)
            Tt = torch.tensor([1.0], dtype=torch.float32, device=CFG.device, requires_grad=True)
            optT = torch.optim.LBFGS([Tt], lr=0.1, max_iter=100, line_search_fn='strong_wolfe')
            def _cl():
                optT.zero_grad()
                sc = oof_logits_f / torch.clamp(Tt, min=1e-3)
                idx = torch.arange(len(y_true_f), device=CFG.device)
                nll = -torch.log_softmax(sc, dim=1)[idx, torch.tensor(y_true_f, device=CFG.device)].mean()
                nll.backward(); return nll
            optT.step(_cl)
            perT.append(float(Tt.detach().clamp(min=1e-3).cpu().item()))
    else:
        perT = per_fold_T
    # per-fold weights from temp-scaled OOF
    per_loss = []
    for f, Tf in enumerate(perT):
        val_idx_f = labels.index[labels.fold == f].values
        y_true_f = labels.loc[val_idx_f, 'target'].values
        oof_logits_f = torch.tensor(all_oof_logits[val_idx_f], dtype=torch.float32, device='cpu')
        probs_f = torch.softmax(oof_logits_f / Tf, dim=1).numpy()
        per_loss.append(log_loss(y_true_f, probs_f))
    per_loss = np.array(per_loss, dtype=np.float64)
    weights = np.exp(-per_loss); weights = weights / weights.sum()
    # inference per fold with 3-scale TTA
    ss_loc = pd.read_csv('sample_submission.csv')
    test_df = ss_loc[['id']].copy(); test_df['filepath'] = test_df['id'].apply(lambda x: f'test/{x}.jpg')
    def make_tta_tfms_old(size):
        return A.Compose([
            A.Resize(height=size, width=size),
            A.CenterCrop(height=CFG.img_size, width=CFG.img_size, p=1.0) if size > CFG.img_size else A.NoOp(),
            A.Normalize(mean=(0.485,0.456,0.406), std=(0.229,0.224,0.225)),
            ToTensorV2(),
        ])
    def predict_logits_once(ckpt_path, tfms):
        ds = DogDataset(test_df[['id','filepath']], transforms=tfms)
        dl = DataLoader(ds, batch_size=CFG.valid_bs, shuffle=False, num_workers=CFG.num_workers, pin_memory=True)
        model = timm.create_model(CFG.model_name, pretrained=False, num_classes=num_classes_final).to(CFG.device)
        ckpt = torch.load(ckpt_path, map_location=CFG.device)
        model.load_state_dict(ckpt['model'], strict=False); model.eval()
        outs = []
        with torch.no_grad(), torch.amp.autocast('cuda', enabled=(CFG.device=='cuda')):
            for imgs, _ids in dl:
                imgs = imgs.to(CFG.device, non_blocking=True).to(memory_format=torch.channels_last)
                logits = model(imgs)
                if CFG.tta and CFG.tta_hflip:
                    logits = (logits + model(torch.flip(imgs, dims=[3]))) / 2.0
                outs.append(logits.detach().cpu().float())
        lg = torch.cat(outs, dim=0); del model; torch.cuda.empty_cache(); gc.collect(); return lg
    scales_old = sorted(list(set([int(CFG.img_size*0.9), CFG.img_size, int(CFG.img_size*1.15)])))
    probs_weighted = None
    for f, (p, Tf) in enumerate(zip(ckpts, perT)):
        scale_logits_sum = None
        for s in scales_old:
            tfms = make_tta_tfms_old(s)
            logits = predict_logits_once(p, tfms)
            scale_logits_sum = logits if scale_logits_sum is None else (scale_logits_sum + logits)
        logits_avg = scale_logits_sum / len(scales_old)
        probs_f = torch.softmax(logits_avg / Tf, dim=1).numpy()
        probs_weighted = probs_f * weights[f] if probs_weighted is None else (probs_weighted + probs_f * weights[f])
    return probs_weighted

old_probs = compute_old_best_probs()
blend_probs = 0.5 * new_probs_scaled + 0.5 * old_probs
blend_probs = np.clip(blend_probs, 1e-9, 1-1e-9)
blend_probs = blend_probs / blend_probs.sum(axis=1, keepdims=True)
subB = pd.DataFrame(blend_probs, columns=class_names_final)
subB.insert(0, 'id', pd.read_csv('sample_submission.csv')['id'])
subB.to_csv('submission_blend_new_old.csv', index=False)
print('Saved Submission B (blend): submission_blend_new_old.csv', subB.shape)

# Default to safer blend for final submission.csv
subB.to_csv('submission.csv', index=False)
print('submission.csv overwritten with blended submission.')

  super().__init__(always_apply=always_apply, p=p)
  model = create_fn(



=== Final Fix Retrain Fold 0 ===


Epoch 1/25 - train 4.1237 - val 4.7378 - mixup_p 0.60


Epoch 2/25 - train 2.3990 - val 4.3758 - mixup_p 0.57


Epoch 3/25 - train 1.8717 - val 3.8403 - mixup_p 0.55




Epoch 4/25 - train 1.7236 - val 3.2203 - mixup_p 0.52


Epoch 5/25 - train 1.6488 - val 2.5760 - mixup_p 0.49


Epoch 6/25 - train 1.5168 - val 1.9766 - mixup_p 0.47


Epoch 7/25 - train 1.4405 - val 1.4769 - mixup_p 0.44


Epoch 8/25 - train 1.4099 - val 1.1050 - mixup_p 0.41


Epoch 9/25 - train 1.3858 - val 0.8495 - mixup_p 0.39


Epoch 10/25 - train 1.3155 - val 0.6868 - mixup_p 0.36


Epoch 11/25 - train 1.3219 - val 0.5894 - mixup_p 0.33


Epoch 12/25 - train 1.2431 - val 0.5297 - mixup_p 0.31


Epoch 13/25 - train 1.2642 - val 0.5013 - mixup_p 0.28


Epoch 14/25 - train 1.2660 - val 0.4880 - mixup_p 0.25


Epoch 15/25 - train 1.2391 - val 0.4835 - mixup_p 0.23


Epoch 16/25 - train 1.2261 - val 0.4845 - mixup_p 0.20


Epoch 17/25 - train 1.1990 - val 0.4882 - mixup_p 0.17


Epoch 18/25 - train 1.1953 - val 0.4958 - mixup_p 0.15
Early stopping


Final single-fold OOF logloss: 0.49576659368712045
New fold temperature: 0.7695773243904114
TTA scales for new fold: [460, 512, 588]


  model = create_fn(


  model = create_fn(


  model = create_fn(


Saved Submission A: submission_new_singlefold.csv (1023, 121)


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


  model = create_fn(


Saved Submission B (blend): submission_blend_new_old.csv (1023, 121)
submission.csv overwritten with blended submission.


In [38]:
# Overwrite submission.csv with standalone new single-fold submission
import shutil, os
src = 'submission_new_singlefold.csv'
dst = 'submission.csv'
assert os.path.exists(src), f"Source not found: {src}"
shutil.copyfile(src, dst)
print('submission.csv replaced with', src)

submission.csv replaced with submission_new_singlefold.csv
