# Plan to Medal: Dogs vs. Cats Redux (log-loss)

Objectives:
- Achieve ≤ 0.061 log-loss (bronze) quickly; iterate to ≤ 0.050 (silver) with enhancements.

Approach:
1) Setup & Data
- Verify folders, counts, and sample_submission format.
- Parse labels from train filenames (cat./dog.).
- Stratified train/val split; prefer 5-fold CV for robust out-of-fold (OOF) predictions and model selection.

2) Modeling (fast baseline → strong model)
- Baseline: Pretrained ResNet50/EfficientNet-B0 with transfer learning.
- Loss: BCEWithLogitsLoss with label smoothing.
- Optimizer: AdamW; Scheduler: cosine with warmup.
- Augmentations: flips, random resize/crop, color jitter; normalize inputs with the ImageNet mean/std.
- Hyperparams (baseline): img_size 224, bs ~64 (fit to GPU), epochs 5-8 for sanity.

3) Improve
- Progressive resizing (224→299/320), TTA (horizontal flip, multi-crops), EMA.
- Mixup/CutMix (light), stronger augmentations.
- 5-fold CV ensembling; average logits across folds.

4) Inference
- TTA on test; sigmoid to probabilities; ensure correct id sorting.
- Save submission.csv with columns: id,label.

5) Efficiency & Logging
- Print fold indices, epoch timings, and metrics.
- Early stop if overfitting or plateau; keep runs short and iterative.

6) Stretch
- Pseudo-labeling if time permits.

Next:
- Inspect data layout, counts, and GPU; then implement baseline dataloaders + quick single-fold baseline to validate pipeline before expanding to CV.

In [None]:
# Install required packages (CUDA 12.1 builds for PyTorch)
import sys, subprocess, pkgutil

def pip_install(pkgs, index_url=None):
    """Install ``pkgs`` quietly using the running interpreter's pip.

    Args:
        pkgs: List of requirement strings handed to ``pip install``.
        index_url: Optional package index; appended as ``--index-url`` when
            truthy.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    index_args = ['--index-url', index_url] if index_url else []
    argv = [sys.executable, '-m', 'pip', 'install', '-q'] + pkgs + index_args
    # Echo the exact command so the notebook log shows what was installed.
    print('Running:', ' '.join(argv))
    subprocess.check_call(argv)

# Probe for importable modules with importlib.util.find_spec:
# pkgutil.find_loader is deprecated (and removed in newer Python versions).
import importlib.util

# Install PyTorch with CUDA 12.1 if not present
if importlib.util.find_spec('torch') is None:
    pip_install(['torch', 'torchvision', 'torchaudio'], index_url='https://download.pytorch.org/whl/cu121')
else:
    import torch
    print('Torch already installed:', torch.__version__)

# timm and albumentations for models/augs.
# Each entry is (pip distribution name, importable module name). The two can
# differ: 'opencv-python-headless' is imported as 'cv2', so probing the pip
# name would never find it and would trigger a reinstall on every run.
for pkg, module_name in [
    ('timm', 'timm'),
    ('albumentations', 'albumentations'),
    ('opencv-python-headless', 'cv2'),
]:
    if importlib.util.find_spec(module_name) is None:
        pip_install([pkg])
    else:
        print(pkg, 'already installed')

print('Package installation complete.')

In [None]:
# Setup & quick data audit
import os, sys, random, re, time, math, json, gc
from pathlib import Path
import pandas as pd
from PIL import Image

# Ensure pip-target site-packages are importable (pip installed to /app/.pip-target)
PIP_TARGET = os.environ.get('PIP_TARGET', '/app/.pip-target')
if PIP_TARGET and PIP_TARGET not in sys.path:
    sys.path.insert(0, PIP_TARGET)

try:
    import torch
except Exception as e:
    torch = None
    print('Torch import failed:', e)

random.seed(42)
os.environ['PYTHONHASHSEED'] = '42'

DATA_DIR = Path('.')
TRAIN_DIR = DATA_DIR / 'train'
TEST_DIR = DATA_DIR / 'test'
SAMPLE_SUB = DATA_DIR / 'sample_submission.csv'

print('Paths exist:', TRAIN_DIR.exists(), TEST_DIR.exists(), SAMPLE_SUB.exists()); sys.stdout.flush()

# List files
train_files = sorted([p for p in TRAIN_DIR.glob('*.jpg')])
test_files = sorted([p for p in TEST_DIR.glob('*.jpg')], key=lambda p: int(p.stem))
print(f'Train files: {len(train_files):,} | Test files: {len(test_files):,}')

# Parse labels from filenames: cat.* -> 0, dog.* -> 1
def parse_label(p: Path):
    """Derive the class id of a training image from its filename prefix.

    Returns:
        0 when the filename starts with ``cat.``, 1 when it starts with
        ``dog.``.

    Raises:
        ValueError: If the filename starts with neither prefix.
    """
    name = p.name
    for prefix, class_id in (('cat.', 0), ('dog.', 1)):
        if name.startswith(prefix):
            return class_id
    raise ValueError(f'Unexpected train filename: {name}')

labels = [parse_label(p) for p in train_files]
num_cats = sum(1 for v in labels if v==0)
num_dogs = sum(1 for v in labels if v==1)
print(f'Class balance -> cats: {num_cats:,}, dogs: {num_dogs:,}')

# Inspect sample_submission
ss = pd.read_csv(SAMPLE_SUB)
print('Sample submission head:')
print(ss.head())
print('Sample columns:', list(ss.columns))
print('Sample len:', len(ss))

# GPU / Torch info
if torch is not None:
    print('Torch version:', torch.__version__)
    cuda_ok = torch.cuda.is_available()
    print('CUDA available:', cuda_ok)
    if cuda_ok:
        print('Device count:', torch.cuda.device_count())
        print('Device 0:', torch.cuda.get_device_name(0))
        torch.backends.cudnn.benchmark = True
else:
    print('Torch not available yet. Will install or fix path and retry later.')

# Quick corruption check on a small sample
def safe_open(img_path: Path):
    """Try to decode an image and report whether it is readable.

    Returns:
        ``(True, size)`` with the size of the RGB-converted image on success,
        or ``(False, message)`` carrying the exception text on any failure.
    """
    try:
        with Image.open(img_path) as handle:
            # convert() forces a full decode, which is what surfaces corruption.
            rgb = handle.convert('RGB')
            outcome = (True, rgb.size)
    except Exception as err:
        outcome = (False, str(err))
    return outcome

sample_check = random.sample(train_files, k=min(20, len(train_files))) + random.sample(test_files, k=min(20, len(test_files)))
bad = []
sizes = []
for p in sample_check:
    ok, info = safe_open(p)
    if not ok:
        bad.append((p.name, info))
    else:
        sizes.append(info)
print(f'Checked {len(sample_check)} images. Bad: {len(bad)}')
if bad:
    print('Examples of bad files:', bad[:5])
if sizes:
    # show a few size samples
    print('Example image sizes (first 5):', sizes[:5])

# Verify test id parsing and sort numerically
test_ids = [int(p.stem) for p in test_files]
assert test_ids == sorted(test_ids), 'Test files are not sorted numerically as expected'
print('Test IDs numeric sort verified. First/last IDs:', test_ids[0], test_ids[-1])

print('Audit complete.')

In [4]:
# Single-fold baseline: EfficientNet-B0 @256 with hflip TTA (EMA is tracked but not used for eval/inference; LS and Mixup are disabled for this sanity run)
import os, math, time, random, warnings
from dataclasses import dataclass
from pathlib import Path
import numpy as np
import pandas as pd
from PIL import Image
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T

# Ensure model weight caches are writable (avoid /app/.cache read-only)
CACHE_ROOT = Path('./.cache')
os.environ.setdefault('XDG_CACHE_HOME', str(CACHE_ROOT))
os.environ.setdefault('HF_HOME', str(CACHE_ROOT / 'huggingface'))
os.environ.setdefault('HF_HUB_CACHE', str(CACHE_ROOT / 'huggingface' / 'hub'))
os.environ.setdefault('TORCH_HOME', str(CACHE_ROOT / 'torch'))
os.environ.setdefault('TIMM_CACHE_DIR', str(CACHE_ROOT / 'timm'))
for p in [CACHE_ROOT, Path(os.environ['HF_HUB_CACHE']), Path(os.environ['TORCH_HOME']), Path(os.environ['TIMM_CACHE_DIR'])]:
    Path(p).mkdir(parents=True, exist_ok=True)

import timm
from timm.utils import ModelEmaV2

warnings.filterwarnings('ignore')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Reproducibility
random.seed(42); np.random.seed(42); torch.manual_seed(42)
if torch.cuda.is_available(): torch.cuda.manual_seed_all(42)

IMG_SIZE = 256
BATCH_SIZE = 96
EPOCHS = 3  # sanity run to validate pipeline
LR_HEAD = 5e-4
LR_BACKBONE = 1e-4
WD = 0.03
LS_EPS = 0.0  # turn off for sanity
MIXUP_ALPHA = 0.2
MIXUP_P = 0.0  # off for sanity
EMA_DECAY = 0.99  # faster tracking
NUM_WORKERS = min(8, os.cpu_count() or 2)

rng = np.random.RandomState(42)

# Deterministic val/test transforms; train with RRC + HFlip
train_tfms = T.Compose([
    T.RandomResizedCrop(IMG_SIZE, scale=(0.8, 1.0)),
    T.RandomHorizontalFlip(p=0.5),
    T.ToTensor(),
    T.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225]),
])
eval_tfms = T.Compose([
    T.Resize(IMG_SIZE, interpolation=T.InterpolationMode.BILINEAR),
    T.CenterCrop(IMG_SIZE),
    T.ToTensor(),
    T.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225]),
])

class CatDogDataset(Dataset):
    """Dataset yielding ``(image, target)`` pairs from a list of image paths.

    Args:
        files: Sequence of image paths.
        labels: Optional sequence of labels aligned with ``files``; when it is
            ``None`` every item gets the placeholder target ``-1.0``.
        transform: Optional callable applied to the RGB PIL image; when falsy,
            ``T.ToTensor()`` is applied instead.
    """

    def __init__(self, files, labels=None, transform=None):
        self.files = files
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.files)

    def _load_rgb(self, path):
        """Open ``path`` as RGB, falling back to a black IMG_SIZE square on any error."""
        try:
            with Image.open(path) as handle:
                return handle.convert('RGB')
        except Exception:
            return Image.new('RGB', (IMG_SIZE, IMG_SIZE), (0, 0, 0))

    def __getitem__(self, idx):
        picture = self._load_rgb(self.files[idx])
        apply_tfm = self.transform if self.transform else T.ToTensor()
        tensor = apply_tfm(picture)
        target = -1.0 if self.labels is None else float(self.labels[idx])
        return tensor, target

def make_stratified_split(files, labels, val_frac=0.2, seed=42):
    """Split sample indices into train/val while keeping the class ratio.

    Args:
        files: Sequence of samples; only its length is used.
        labels: Per-sample class ids; samples labelled 0 and 1 are split,
            each class separately.
        val_frac: Fraction of each class sent to validation (floored).
        seed: Seed for the private ``RandomState`` driving every shuffle.

    Returns:
        ``(trn_idx, val_idx)`` as shuffled NumPy index arrays.
    """
    targets = np.array(labels)
    positions = np.arange(len(files))
    shuffler = np.random.RandomState(seed)

    held_out, kept = [], []
    for cls in (0, 1):
        members = positions[targets == cls]
        shuffler.shuffle(members)
        n_val = int(len(members) * val_frac)
        held_out.append(members[:n_val])
        kept.append(members[n_val:])

    val_idx = np.concatenate(held_out)
    trn_idx = np.concatenate(kept)
    # Mix the classes inside each split; train is shuffled before val.
    shuffler.shuffle(trn_idx)
    shuffler.shuffle(val_idx)
    return trn_idx, val_idx

def smooth_targets(y, eps=0.1):
    """Apply binary label smoothing: scale ``y`` by ``1 - eps`` and add ``eps / 2``.

    A target of 1 becomes ``1 - eps/2`` and a target of 0 becomes ``eps/2``.
    """
    keep = 1.0 - eps
    offset = 0.5 * eps
    return y * keep + offset

def get_model():
    """Create the pretrained EfficientNet-B0 (num_classes=1, drop_rate=0.2), caching weights under CACHE_ROOT."""
    return timm.create_model(
        'efficientnet_b0',
        pretrained=True,
        num_classes=1,
        drop_rate=0.2,
        cache_dir=str(CACHE_ROOT),
    )

def mixup_batch(x, y, alpha):
    """Blend each sample (and its target) with a randomly chosen partner.

    Args:
        x: Batch tensor; mixing partners are drawn along dimension 0.
        y: Target tensor indexed the same way as ``x``.
        alpha: Beta-distribution parameter for the mixing weight; a value
            ``<= 0`` fixes the weight at 1.0, leaving the batch unchanged.

    Returns:
        ``(x_mixed, y_mixed)``.
    """
    if alpha > 0:
        lam = np.random.beta(alpha, alpha)
    else:
        lam = 1.0
    perm = torch.randperm(x.size(0), device=x.device)
    partner_x, partner_y = x[perm], y[perm]
    mixed_x = lam * x + (1 - lam) * partner_x
    mixed_y = lam * y + (1 - lam) * partner_y
    return mixed_x, mixed_y

# Prepare data
train_dir = Path('train')
test_dir = Path('test')
train_files = sorted([p for p in train_dir.glob('*.jpg')])
labels = [0 if p.name.startswith('cat.') else 1 for p in train_files]
trn_idx, val_idx = make_stratified_split(train_files, labels, val_frac=0.2, seed=42)
trn_files = [train_files[i] for i in trn_idx]
val_files = [train_files[i] for i in val_idx]
y_trn = [labels[i] for i in trn_idx]
y_val = [labels[i] for i in val_idx]
print(f'Train/Val sizes: {len(trn_files)} / {len(val_files)}');

ds_trn = CatDogDataset(trn_files, y_trn, transform=train_tfms)
ds_val = CatDogDataset(val_files, y_val, transform=eval_tfms)
dl_trn = DataLoader(ds_trn, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, pin_memory=True, drop_last=True, persistent_workers=(NUM_WORKERS>0))
dl_val = DataLoader(ds_val, batch_size=BATCH_SIZE*2, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True, persistent_workers=(NUM_WORKERS>0))

# Model, optimizer, EMA, OneCycleLR
model = get_model().to(device)

# Build param groups with wd/no-wd split using names + ids
named_params = list(model.named_parameters())
head_ids = {id(p) for p in model.get_classifier().parameters()}
backbone = [(n,p) for n,p in named_params if p.requires_grad and id(p) not in head_ids]
head = [(n,p) for n,p in named_params if p.requires_grad and id(p) in head_ids]
def no_wd(n, p):
    """Return True when parameter ``p`` named ``n`` should be exempt from weight decay.

    That is the case for 1-D parameters and for any name ending in ``.bias``.
    """
    if p.ndim == 1:
        return True
    return n.endswith('.bias')
pg = [
    {'params': [p for n,p in backbone if not no_wd(n,p)], 'lr': LR_BACKBONE, 'weight_decay': WD},
    {'params': [p for n,p in backbone if     no_wd(n,p)], 'lr': LR_BACKBONE, 'weight_decay': 0.0},
    {'params': [p for n,p in head     if not no_wd(n,p)], 'lr': LR_HEAD,     'weight_decay': WD},
    {'params': [p for n,p in head     if     no_wd(n,p)], 'lr': LR_HEAD,     'weight_decay': 0.0},
]
optimizer = torch.optim.AdamW(pg)
scaler = torch.cuda.amp.GradScaler(enabled=device.type=='cuda')
ema = ModelEmaV2(model, decay=EMA_DECAY)

# OneCycleLR per-batch
total_steps = EPOCHS * len(dl_trn)
max_lrs = [group['lr'] for group in optimizer.param_groups]
sched = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=max_lrs, total_steps=total_steps, pct_start=1.0/max(1,EPOCHS), anneal_strategy='cos')

def bce_logits_loss(logits, targets, eps=0.0):
    """Mean binary cross-entropy on raw logits with optional label smoothing.

    Args:
        logits: Model outputs; flattened to 1-D before the loss is taken.
        targets: Binary targets with as many elements as ``logits``. Any
            shape (e.g. ``(N,)`` or ``(N, 1)``) and any numeric dtype is
            accepted; they are flattened and cast to the logits dtype.
        eps: Label-smoothing strength; ``smooth_targets`` is applied when > 0.

    Returns:
        Scalar loss tensor.
    """
    flat_logits = logits.reshape(-1)
    # Half-precision logits (autocast) are upcast so smoothed targets such as
    # 0.05 are not rounded to fp16 before the loss is computed.
    if flat_logits.dtype in (torch.float16, torch.bfloat16):
        flat_logits = flat_logits.float()
    # Collated Python floats arrive as float64 and class ids may be integers;
    # align shape and dtype with the logits instead of relying on promotion.
    flat_targets = targets.reshape(-1).to(dtype=flat_logits.dtype)
    if eps > 0:
        flat_targets = smooth_targets(flat_targets, eps)
    return nn.functional.binary_cross_entropy_with_logits(flat_logits, flat_targets)

def sigmoid_numpy(x):
    """Numerically stable logistic sigmoid for NumPy arrays or scalars.

    Evaluates ``exp(-log(1 + exp(-x)))`` through ``np.logaddexp``, so very
    negative inputs underflow cleanly to 0 instead of overflowing ``exp`` as
    the naive ``1 / (1 + exp(-x))`` form does.

    Args:
        x: Array or scalar of logits.

    Returns:
        Sigmoid of ``x``, element-wise.
    """
    return np.exp(-np.logaddexp(0.0, -x))

def evaluate(model_eval, loader):
    """Return the sample-weighted mean BCE-with-logits loss of ``model_eval`` over ``loader``.

    The model is switched to eval mode (and left there) and batches are run
    without gradient tracking and without label smoothing. An empty loader
    yields 0.0 because the denominator is clamped to 1.
    """
    model_eval.eval()
    loss_sum, n_samples = 0.0, 0
    with torch.no_grad():
        for images, targets in loader:
            images = images.to(device, non_blocking=True)
            targets = targets.to(device, non_blocking=True)
            batch_n = images.size(0)
            batch_loss = bce_logits_loss(model_eval(images), targets, eps=0.0)
            # Weight by batch size so a smaller final batch is not over-counted.
            loss_sum += batch_loss.item() * batch_n
            n_samples += batch_n
    return loss_sum / max(1, n_samples)

# Training loop: per-batch OneCycleLR, AMP with gradient clipping, EMA tracking,
# and a snapshot of the best-validating weights for later inference.
best_loss = float('inf')
best_state = None  # stays None if no epoch ever improves on +inf (e.g. NaN loss)
start_time = time.time()

for epoch in range(EPOCHS):
    t0 = time.time()
    model.train()
    running = 0.0; seen = 0
    for i, (xb, yb) in enumerate(dl_trn):
        xb = xb.to(device, non_blocking=True)
        yb = yb.to(device, non_blocking=True)
        # Mixup is never applied during the first epoch.
        use_mix = (epoch > 0) and (random.random() < MIXUP_P)
        if use_mix:
            xb, yb = mixup_batch(xb, yb.unsqueeze(1), MIXUP_ALPHA)
            yb = yb.view(-1)
        optimizer.zero_grad(set_to_none=True)
        with torch.cuda.amp.autocast(enabled=device.type=='cuda'):
            logits = model(xb)
            loss = bce_logits_loss(logits, yb, eps=LS_EPS)
        if epoch == 0 and i == 0:
            # One-off look at the initial logit distribution as a pipeline sanity check.
            l = logits.detach()
            print('logits stats e1b1 -> mean/std/min/max:', float(l.mean()), float(l.std()), float(l.min()), float(l.max()))
        scaler.scale(loss).backward()
        # Unscale before clipping so max_norm applies to the true gradient magnitudes.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()
        sched.step()
        ema.update(model)
        running += loss.item()*xb.size(0)
        seen += xb.size(0)
        if (i+1)%50==0:
            print(f'Epoch {epoch+1}/{EPOCHS} | Step {i+1}/{len(dl_trn)} | Loss {(running/seen):.4f} | Elapsed {time.time()-t0:.1f}s', flush=True)
    # evaluate using the current model (not EMA) for sanity
    val_loss = evaluate(model, dl_val)
    print(f'Epoch {epoch+1} done in {time.time()-t0:.1f}s | Val log-loss: {val_loss:.5f}')
    if val_loss < best_loss:
        best_loss = val_loss
        # state_dict() returns references to the live tensors, so storing it
        # directly would let later optimizer steps overwrite the "best"
        # checkpoint. Clone every tensor (onto CPU) for a real snapshot.
        best_state = {
            'model': {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
        }
print(f'Training finished in {(time.time()-start_time)/60:.1f} min. Best val log-loss: {best_loss:.5f}')

# Inference on test with hflip TTA; average logits then sigmoid.
# Requires best_state to have been set by the training loop above.
# Test files are ordered numerically by filename stem so ids line up with predictions.
test_files = sorted([p for p in test_dir.glob('*.jpg')], key=lambda p: int(p.stem))
ds_test = CatDogDataset(test_files, labels=None, transform=eval_tfms)
dl_test = DataLoader(ds_test, batch_size=BATCH_SIZE*2, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True, persistent_workers=(NUM_WORKERS>0))
# Build a fresh model and overwrite its weights with the saved best checkpoint.
model = get_model().to(device)
model.load_state_dict(best_state['model'], strict=True)
model.eval()
probs = []
with torch.no_grad():
    for xb, _ in dl_test:
        xb = xb.to(device, non_blocking=True)
        logits1 = model(xb)
        # Flip along dim 3 — assumes batches are (N, C, H, W), i.e. a horizontal flip; TODO confirm
        xb_flip = torch.flip(xb, dims=[3])
        logits2 = model(xb_flip)
        # Average the two views in logit space, then convert to probabilities.
        logits = 0.5*(logits1.view(-1) + logits2.view(-1))
        probs.append(sigmoid_numpy(logits.detach().cpu().numpy()))
probs = np.concatenate(probs)
# Keep probabilities strictly inside (0, 1).
probs = np.clip(probs, 1e-5, 1-1e-5)
sub = pd.DataFrame({'id': [int(p.stem) for p in test_files], 'label': probs})
sub = sub.sort_values('id')
sub.to_csv('submission.csv', index=False)
print('Saved submission.csv. Head:')
print(sub.head())

Train/Val sizes: 18001 / 4499


logits stats e1b1 -> mean/std/min/max: 3.96484375 6.484375 -10.75 19.40625


Epoch 1/3 | Step 50/187 | Loss 2.4171 | Elapsed 15.4s


Epoch 1/3 | Step 100/187 | Loss 1.4484 | Elapsed 29.9s


Epoch 1/3 | Step 150/187 | Loss 1.0406 | Elapsed 44.5s


Epoch 1 done in 63.2s | Val log-loss: 0.10856


Epoch 2/3 | Step 50/187 | Loss 0.0745 | Elapsed 15.0s


Epoch 2/3 | Step 100/187 | Loss 0.0709 | Elapsed 29.4s


Epoch 2/3 | Step 150/187 | Loss 0.0705 | Elapsed 43.8s


Epoch 2 done in 61.9s | Val log-loss: 0.08079


Epoch 3/3 | Step 50/187 | Loss 0.0237 | Elapsed 15.4s


Epoch 3/3 | Step 100/187 | Loss 0.0219 | Elapsed 29.8s


Epoch 3/3 | Step 150/187 | Loss 0.0239 | Elapsed 44.3s


Epoch 3 done in 62.4s | Val log-loss: 0.07866
Training finished in 3.1 min. Best val log-loss: 0.07866


Saved submission.csv. Head:
   id     label
0   1  0.999990
1   2  0.000010
2   3  0.999990
3   4  0.003428
4   5  0.000013


In [3]:
# Debug: small overfit test on 200 images (no aug, no LS, no Mixup, no EMA)
import random, gc, torch, numpy as np
from pathlib import Path
import timm
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T
from PIL import Image

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
random.seed(123); np.random.seed(123); torch.manual_seed(123);
if torch.cuda.is_available(): torch.cuda.manual_seed_all(123)

IMG_SIZE = 256
BATCH_SIZE = 64
EPOCHS = 5
LR_HEAD = 1e-3
LR_BACKBONE = 1e-4
WD = 0.0

simple_tfms = T.Compose([
    T.Resize(IMG_SIZE, interpolation=T.InterpolationMode.BILINEAR),
    T.CenterCrop(IMG_SIZE),
    T.ToTensor(),
    T.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225]),
])

class SimpleDS(Dataset):
    """Minimal labelled image dataset used for the small overfit sanity check.

    Args:
        files: Sequence of image paths.
        labels: Sequence of labels aligned with ``files``.
        tfm: Callable applied to the RGB PIL image.
    """

    def __init__(self, files, labels, tfm):
        self.files, self.labels, self.tfm = files, labels, tfm

    def __len__(self):
        return len(self.files)

    def __getitem__(self, i):
        p = self.files[i]
        try:
            with Image.open(p) as im:
                im = im.convert('RGB')
        except Exception:
            # Deliberately not a bare ``except``: KeyboardInterrupt/SystemExit
            # must still stop the run. Unreadable images fall back to a black
            # IMG_SIZE square so one bad file does not abort the loop.
            im = Image.new('RGB', (IMG_SIZE, IMG_SIZE), (0, 0, 0))
        x = self.tfm(im)
        y = float(self.labels[i])
        return x, y

# Build small balanced subset of 200 (100 cats, 100 dogs)
all_files = sorted(list(Path('train').glob('*.jpg')))
labels_all = [0 if p.name.startswith('cat.') else 1 for p in all_files]
idx_c = [i for i,l in enumerate(labels_all) if l==0][:100]
idx_d = [i for i,l in enumerate(labels_all) if l==1][:100]
idx_small = idx_c + idx_d
random.shuffle(idx_small)
files_small = [all_files[i] for i in idx_small]
labels_small = [labels_all[i] for i in idx_small]

# Split 160 train / 40 val
trn_idx = list(range(160)); val_idx = list(range(160,200))
trn_files = [files_small[i] for i in trn_idx]
val_files = [files_small[i] for i in val_idx]
y_trn = [labels_small[i] for i in trn_idx]
y_val = [labels_small[i] for i in val_idx]
print('Debug subset sizes:', len(trn_files), len(val_files))

ds_trn = SimpleDS(trn_files, y_trn, simple_tfms)
ds_val = SimpleDS(val_files, y_val, simple_tfms)
dl_trn = DataLoader(ds_trn, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True, drop_last=False)
dl_val = DataLoader(ds_val, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)

def get_model():
    """Create the pretrained EfficientNet-B0 with num_classes=1 and drop_rate=0.0."""
    arch = 'efficientnet_b0'
    return timm.create_model(arch, pretrained=True, num_classes=1, drop_rate=0.0)

model = get_model().to(device)
# Param groups: head vs backbone (no wd split to keep simple here)
head_ids = {id(p) for p in model.get_classifier().parameters()}
backbone_params = [p for p in model.parameters() if p.requires_grad and id(p) not in head_ids]
head_params = [p for p in model.parameters() if p.requires_grad and id(p) in head_ids]
optimizer = torch.optim.AdamW([
    {'params': backbone_params, 'lr': LR_BACKBONE, 'weight_decay': WD},
    {'params': head_params,     'lr': LR_HEAD,     'weight_decay': WD},
])
scaler = torch.cuda.amp.GradScaler(enabled=device.type=='cuda')
loss_fn = nn.BCEWithLogitsLoss()

def eval_loss(m, dl):
    """Return the sample-weighted mean of ``loss_fn`` for model ``m`` over loader ``dl``.

    Switches ``m`` to eval mode and runs without gradient tracking; an empty
    loader yields 0.0 because the denominator is clamped to 1.
    """
    m.eval()
    weighted_sum = 0
    count = 0
    with torch.no_grad():
        for inputs, targets in dl:
            inputs = inputs.to(device)
            targets = targets.to(device)
            batch_n = inputs.size(0)
            batch_loss = loss_fn(m(inputs).view(-1), targets)
            weighted_sum += batch_loss.item() * batch_n
            count += batch_n
    return weighted_sum / max(1, count)

# Overfit sanity loop: train on the small subset with AMP and report
# per-epoch train/val loss, tracking the lowest validation loss seen.
best_val=float('inf')
for ep in range(EPOCHS):
    # run/seen accumulate the sample-weighted training loss for this epoch.
    model.train(); run=0; seen=0
    for xb,yb in dl_trn:
        xb=xb.to(device); yb=yb.to(device)
        optimizer.zero_grad(set_to_none=True)
        with torch.cuda.amp.autocast(enabled=device.type=='cuda'):
            logits = model(xb).view(-1)
            loss = loss_fn(logits, yb)
        scaler.scale(loss).backward()
        # No gradient clipping or LR schedule here; step straight through the scaler.
        scaler.step(optimizer); scaler.update()
        run += loss.item()*xb.size(0); seen += xb.size(0)
    trn_loss = run/seen
    val_loss = eval_loss(model, dl_val)
    print(f'[Overfit] Epoch {ep+1}/{EPOCHS} | train {trn_loss:.4f} | val {val_loss:.4f}')
    best_val=min(best_val,val_loss)
print('Overfit debug best val:', best_val)
# Collect garbage and release cached CUDA memory once the debug run is done.
gc.collect();
if torch.cuda.is_available():
    torch.cuda.empty_cache()

Debug subset sizes: 160 40


[Overfit] Epoch 1/5 | train 1.3989 | val 0.9735


[Overfit] Epoch 2/5 | train 0.3547 | val 0.3880


[Overfit] Epoch 3/5 | train 0.0396 | val 0.2940


[Overfit] Epoch 4/5 | train 0.0056 | val 0.3071


[Overfit] Epoch 5/5 | train 0.0070 | val 0.3122
Overfit debug best val: 0.29400813585774976
