# Dogs vs. Cats Redux: Plan & Experiment Log

Objective: Achieve a medal (log-loss ≤ 0.061) using strong transfer learning with efficient training and robust validation.

Performance targets:
- Gold ≤ 0.0388
- Silver ≤ 0.0504
- Bronze ≤ 0.0613

High-level plan:
1. Sanity-check data and environment; create training dataframe from filenames.
2. Baseline model: pretrained ImageNet CNN (timm EfficientNet-B0/ResNet50d), 5-fold Stratified KFold, 224–320px, BCEWithLogitsLoss, label smoothing, MixUp/CutMix off initially, simple aug.
3. Optimize: image size 320, strong augs (HorizontalFlip, RandomResizedCrop), cosine schedule, EMA, AMP.
4. Ensembling: optionally 2 backbones (EffNet + ResNet) or TTA at inference.
5. Generate submission.csv; iterate to reduce log-loss.

Experiment Log:
- [T0] Setup, data inspection.
- [T1] Create dataset + 5-fold CV split.
- [T2] Train baseline model (EffNet-B0, 224, 3–5 epochs).
- [T3] Evaluate CV log-loss; submit if ≤ 0.06. If not, increase size/epochs/backbone.
- [T4] TTA / second model blend.

We will request expert reviews at major milestones (plan, after data prep/EDA, after baseline training, and if score underperforms).

In [None]:
# Setup: installs, environment check, data discovery, and CV split
import os, sys, subprocess, json, math, random, time, gc, re, glob, shutil
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold

def pip_install(pkgs):
    print("Installing:", pkgs, flush=True)
    cmd = [sys.executable, '-m', 'pip', 'install', '--quiet'] + pkgs
    res = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
    if res.returncode != 0:
        print(res.stdout)
        raise RuntimeError("pip install failed")
    else:
        print(res.stdout[-1000:])

# Install required packages if missing
required = ['torch', 'torchvision', 'timm', 'albumentations', 'opencv-python']
to_install = []
import importlib
for pkg in required:
    try:
        importlib.import_module(pkg if pkg != 'opencv-python' else 'cv2')
    except Exception:
        to_install.append(pkg)
if to_install:
    pip_install(to_install)

import torch
import torchvision
print(f"Torch: {torch.__version__}, Torchvision: {torchvision.__version__}")
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU name:", torch.cuda.get_device_name(0))

# Paths
CWD = Path.cwd()
TRAIN_DIR = CWD / 'train'
TEST_DIR = CWD / 'test'
assert TRAIN_DIR.exists() and TEST_DIR.exists(), "Train/Test directories not found"

# List few files
train_files = sorted(glob.glob(str(TRAIN_DIR / '*.jpg')))[:5]
test_files = sorted(glob.glob(str(TEST_DIR / '*.jpg')))[:5]
print("Sample train files:", train_files)
print("Sample test files:", test_files)

# Build training dataframe
def parse_label_from_filename(fp):
    name = Path(fp).name
    # filenames like 'cat.123.jpg' or 'dog.456.jpg'
    if name.startswith('cat.'):
        return 0
    elif name.startswith('dog.'):
        return 1
    else:
        raise ValueError(f"Unknown label in filename: {name}")

train_paths = sorted(glob.glob(str(TRAIN_DIR / '*.jpg')))
test_paths = sorted(glob.glob(str(TEST_DIR / '*.jpg')), key=lambda p: int(Path(p).stem))
train_labels = [parse_label_from_filename(p) for p in train_paths]
df = pd.DataFrame({
    'filepath': train_paths,
    'label': train_labels
})
print("Train samples:", len(df), "Pos (dog):", df['label'].sum(), "Neg (cat):", (1-df['label']).sum())

# Create stratified KFold splits
N_FOLDS = 5
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)
df['fold'] = -1
for fold, (_, val_idx) in enumerate(skf.split(df['filepath'], df['label'])):
    df.loc[val_idx, 'fold'] = fold

assert (df['fold'] >= 0).all()
df.to_csv('train_folds.csv', index=False)
pd.DataFrame({'filepath': test_paths}).to_csv('test_files.csv', index=False)
print(df['fold'].value_counts().sort_index())
print("Saved train_folds.csv and test_files.csv")

# Log
print("[T0] Data prepared and CV split created.", flush=True)

In [None]:
# Training pipeline: EfficientNet-B0 baseline, 3-fold, AMP, TTA(hflip), submission.csv (cv2-free, local cache)
import os, time, math, random, gc
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import timm
from sklearn.metrics import log_loss
from PIL import Image
import torchvision.transforms as T

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED); torch.cuda.manual_seed_all(SEED)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Ensure writable cache directory for pretrained weights (avoid read-only /app/.cache)
LOCAL_CACHE = Path.cwd() / 'model_cache'
LOCAL_CACHE.mkdir(parents=True, exist_ok=True)
os.environ['HF_HOME'] = str(LOCAL_CACHE)
os.environ['HUGGINGFACE_HUB_CACHE'] = str(LOCAL_CACHE)
os.environ['XDG_CACHE_HOME'] = str(LOCAL_CACHE)
os.environ['TORCH_HOME'] = str(LOCAL_CACHE)
os.environ['TIMM_HOME'] = str(LOCAL_CACHE)

IMG_SIZE = 224
BATCH_SIZE = 64
EPOCHS = 5
FOLDS_TO_RUN = [0,1,2]  # fast baseline
LABEL_SMOOTH = 0.05
LR = 3e-4
WEIGHT_DECAY = 1e-2

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD  = (0.229, 0.224, 0.225)

df = pd.read_csv('train_folds.csv')
test_df = pd.read_csv('test_files.csv')

class DogCatDataset(Dataset):
    def __init__(self, df, aug):
        self.filepaths = df['filepath'].values
        self.labels = df['label'].values if 'label' in df.columns else None
        self.aug = aug
    def __len__(self):
        return len(self.filepaths)
    def __getitem__(self, idx):
        fp = self.filepaths[idx]
        img = Image.open(fp).convert('RGB')
        if self.aug is not None:
            img = self.aug(img)
        if self.labels is None:
            return img, Path(fp).stem  # id for test
        label = np.float32(self.labels[idx])
        return img, label

train_aug = T.Compose([
    T.RandomResizedCrop(IMG_SIZE, scale=(0.7, 1.0), ratio=(0.75, 1.3333)),
    T.RandomHorizontalFlip(p=0.5),
    T.ToTensor(),
    T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])
valid_aug = T.Compose([
    T.Resize(IMG_SIZE),
    T.CenterCrop(IMG_SIZE),
    T.ToTensor(),
    T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

def build_model():
    model = timm.create_model('efficientnet_b0', pretrained=True, num_classes=1, cache_dir=str(LOCAL_CACHE))
    return model

@torch.no_grad()
def predict_loader(model, loader):
    model.eval()
    preds = []
    for imgs, _ in loader:
        imgs = imgs.to(device)
        with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):
            logits = model(imgs).squeeze(1)
            probs = torch.sigmoid(logits).float().cpu().numpy()
        preds.append(probs)
    return np.concatenate(preds)

def train_fold(fold):
    print(f"\n===== Fold {fold} =====", flush=True)
    trn_df = df[df.fold != fold].reset_index(drop=True)
    val_df = df[df.fold == fold].reset_index(drop=True)
    trn_ds = DogCatDataset(trn_df, train_aug)
    val_ds = DogCatDataset(val_df, valid_aug)
    trn_loader = DataLoader(trn_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True, drop_last=True)
    val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)

    model = build_model().to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
    scaler = torch.cuda.amp.GradScaler(enabled=(device.type=='cuda'))
    criterion = nn.BCEWithLogitsLoss(reduction='none')

    best_ll = 1e9
    best_path = f'model_fold{fold}.pt'
    start = time.time()
    for epoch in range(1, EPOCHS+1):
        model.train()
        running_loss = 0.0
        n_samples = 0
        t0 = time.time()
        for step, (imgs, labels) in enumerate(trn_loader):
            imgs = imgs.to(device)
            labels = labels.to(device)
            targets = labels * (1.0 - LABEL_SMOOTH) + (1.0 - labels) * LABEL_SMOOTH
            optimizer.zero_grad(set_to_none=True)
            with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):
                logits = model(imgs).squeeze(1)
                loss_vec = criterion(logits, targets)
                loss = loss_vec.mean()
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            running_loss += loss.item() * imgs.size(0)
            n_samples += imgs.size(0)
            if (step+1) % 50 == 0:
                elapsed = time.time()-t0
                print(f"Fold {fold} Epoch {epoch} Step {step+1}/{len(trn_loader)} Loss {running_loss/n_samples:.4f} Elapsed {elapsed:.1f}s", flush=True)
        # Validation
        model.eval()
        val_probs = []
        val_targets = []
        with torch.no_grad():
            for imgs, labels in val_loader:
                imgs = imgs.to(device)
                with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):
                    logits = model(imgs).squeeze(1)
                    probs = torch.sigmoid(logits)
                val_probs.append(probs.float().cpu().numpy())
                val_targets.append(labels.numpy())
        val_probs = np.concatenate(val_probs)
        val_targets = np.concatenate(val_targets)
        ll = log_loss(val_targets, np.clip(val_probs, 1e-6, 1-1e-6))
        print(f"Epoch {epoch}: train_loss={running_loss/max(1,n_samples):.4f} val_logloss={ll:.5f} epoch_time={time.time()-t0:.1f}s total_elapsed={time.time()-start:.1f}s", flush=True)
        if ll < best_ll:
            best_ll = ll
            torch.save({'state_dict': model.state_dict(), 'val_logloss': best_ll}, best_path)
            print(f"  Saved best model to {best_path} (val_logloss={best_ll:.5f})", flush=True)
    # Load best
    ckpt = torch.load(best_path, map_location='cpu')
    model.load_state_dict(ckpt['state_dict'])
    model = model.to(device)
    # OOF predictions
    val_probs = predict_loader(model, val_loader)
    return best_ll, val_df.index.values, val_probs, model

# Run folds
oof = np.zeros(len(df), dtype=np.float32)
fold_scores = {}
models = {}
total_start = time.time()
for fold in FOLDS_TO_RUN:
    best_ll, val_idx, val_probs, model = train_fold(fold)
    oof[val_idx] = val_probs
    fold_scores[fold] = best_ll
    models[fold] = model  # keep in memory for fast test pred
    print(f"Fold {fold} best val_logloss: {best_ll:.5f}", flush=True)
print("Fold scores:", fold_scores, flush=True)
oof_ll = log_loss(df['label'].values, np.clip(oof, 1e-6, 1-1e-6))
print(f"OOF log-loss (partial folds): {oof_ll:.5f}")

# Test inference with TTA (original + hflip)
test_ds = DogCatDataset(test_df, valid_aug)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)

@torch.no_grad()
def predict_tta(models_dict, loader):
    # average across folds and TTA
    all_probs = []
    for imgs, ids in loader:
        imgs = imgs.to(device)
        # TTA: original
        probs_accum = None
        with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):
            for m in models_dict.values():
                m.eval()
                logits = m(imgs).squeeze(1)
                p = torch.sigmoid(logits)
                probs_accum = p if probs_accum is None else probs_accum + p
        # TTA: horizontal flip
        imgs_flipped = torch.flip(imgs, dims=[3])
        with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):
            for m in models_dict.values():
                logits = m(imgs_flipped).squeeze(1)
                p = torch.sigmoid(logits)
                probs_accum = probs_accum + p
        probs_avg = (probs_accum / (len(models_dict)*2)).float().cpu().numpy()
        all_probs.append(probs_avg)
    return np.concatenate(all_probs)

print("Predicting test...", flush=True)
test_probs = predict_tta(models, test_loader)
test_ids = test_df['filepath'].apply(lambda p: int(Path(p).stem)).values
sub = pd.DataFrame({
    'id': test_ids,
    'label': np.clip(test_probs, 1e-6, 1-1e-6)
})
sub = sub.sort_values('id').reset_index(drop=True)
sub.to_csv('submission.csv', index=False)
print("Saved submission.csv. Head:\n", sub.head())
print(f"Total elapsed: {(time.time()-total_start)/60:.1f} min", flush=True)

# Save OOF and fold metrics
pd.DataFrame({'filepath': df['filepath'], 'label': df['label'], 'oof': oof}).to_csv('oof_partial.csv', index=False)
pd.Series(fold_scores).to_csv('fold_scores.csv')
print("Artifacts saved: submission.csv, oof_partial.csv, fold_scores.csv", flush=True)

gc.collect(); torch.cuda.empty_cache() if torch.cuda.is_available() else None

In [None]:
# Upgrade: train folds 3 & 4 with larger image size (320) and more epochs; then ensemble 5 folds
import os, time, gc
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import timm
from sklearn.metrics import log_loss
from PIL import Image
import torchvision.transforms as T

SEED = 42
torch.manual_seed(SEED); torch.cuda.manual_seed_all(SEED)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Ensure cache dir
LOCAL_CACHE = Path.cwd() / 'model_cache'
LOCAL_CACHE.mkdir(parents=True, exist_ok=True)
os.environ['HF_HOME'] = str(LOCAL_CACHE)
os.environ['HUGGINGFACE_HUB_CACHE'] = str(LOCAL_CACHE)
os.environ['XDG_CACHE_HOME'] = str(LOCAL_CACHE)
os.environ['TORCH_HOME'] = str(LOCAL_CACHE)
os.environ['TIMM_HOME'] = str(LOCAL_CACHE)

# Settings for improved training
IMG_SIZE = 320
BATCH_SIZE = 32
EPOCHS = 7
FOLDS_TO_RUN = [3,4]
LABEL_SMOOTH = 0.05
LR = 3e-4
WEIGHT_DECAY = 1e-2
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD  = (0.229, 0.224, 0.225)

df = pd.read_csv('train_folds.csv')
test_df = pd.read_csv('test_files.csv')

class DogCatDataset(Dataset):
    def __init__(self, df, aug):
        self.filepaths = df['filepath'].values
        self.labels = df['label'].values if 'label' in df.columns else None
        self.aug = aug
    def __len__(self):
        return len(self.filepaths)
    def __getitem__(self, idx):
        fp = self.filepaths[idx]
        img = Image.open(fp).convert('RGB')
        if self.aug is not None:
            img = self.aug(img)
        if self.labels is None:
            return img, Path(fp).stem
        return img, np.float32(self.labels[idx])

train_aug = T.Compose([
    T.RandomResizedCrop(IMG_SIZE, scale=(0.7, 1.0), ratio=(0.75, 1.3333)),
    T.RandomHorizontalFlip(p=0.5),
    T.ToTensor(),
    T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])
valid_aug = T.Compose([
    T.Resize(IMG_SIZE),
    T.CenterCrop(IMG_SIZE),
    T.ToTensor(),
    T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

def build_model():
    return timm.create_model('efficientnet_b0', pretrained=True, num_classes=1, cache_dir=str(LOCAL_CACHE))

@torch.no_grad()
def predict_loader(model, loader):
    model.eval()
    preds = []
    for imgs, _ in loader:
        imgs = imgs.to(device)
        with torch.amp.autocast('cuda', enabled=(device.type=='cuda')):
            logits = model(imgs).squeeze(1)
            probs = torch.sigmoid(logits).float().cpu().numpy()
        preds.append(probs)
    return np.concatenate(preds)

def train_fold(fold):
    print(f"\n===== (v2) Fold {fold} @ {IMG_SIZE}px =====", flush=True)
    trn_df = df[df.fold != fold].reset_index(drop=True)
    val_df = df[df.fold == fold].reset_index(drop=True)
    trn_loader = DataLoader(DogCatDataset(trn_df, train_aug), batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True, drop_last=True)
    val_loader = DataLoader(DogCatDataset(val_df, valid_aug), batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)
    model = build_model().to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
    scaler = torch.amp.GradScaler('cuda', enabled=(device.type=='cuda'))
    criterion = nn.BCEWithLogitsLoss(reduction='none')
    best_ll, best_path = 1e9, f'model_fold{fold}.pt'
    start = time.time()
    for epoch in range(1, EPOCHS+1):
        model.train()
        run_loss, n = 0.0, 0
        t0 = time.time()
        for step, (imgs, labels) in enumerate(trn_loader):
            imgs = imgs.to(device); labels = labels.to(device)
            targets = labels * (1.0 - LABEL_SMOOTH) + (1.0 - labels) * LABEL_SMOOTH
            optimizer.zero_grad(set_to_none=True)
            with torch.amp.autocast('cuda', enabled=(device.type=='cuda')):
                logits = model(imgs).squeeze(1)
                loss = criterion(logits, targets).mean()
            scaler.scale(loss).backward()
            scaler.step(optimizer); scaler.update()
            run_loss += loss.item() * imgs.size(0); n += imgs.size(0)
            if (step+1) % 50 == 0:
                print(f"Fold {fold} Epoch {epoch} Step {step+1}/{len(trn_loader)} Loss {run_loss/max(1,n):.4f}", flush=True)
        # validate
        model.eval(); val_probs=[]; val_targets=[]
        with torch.no_grad():
            for imgs, labels in val_loader:
                imgs = imgs.to(device)
                with torch.amp.autocast('cuda', enabled=(device.type=='cuda')):
                    logits = model(imgs).squeeze(1)
                    probs = torch.sigmoid(logits)
                val_probs.append(probs.float().cpu().numpy()); val_targets.append(labels.numpy())
        val_probs = np.concatenate(val_probs); val_targets = np.concatenate(val_targets)
        ll = log_loss(val_targets, np.clip(val_probs, 1e-6, 1-1e-6))
        print(f"Epoch {epoch}: train_loss={run_loss/max(1,n):.4f} val_logloss={ll:.5f} epoch_time={time.time()-t0:.1f}s total_elapsed={time.time()-start:.1f}s", flush=True)
        if ll < best_ll:
            best_ll = ll
            torch.save({'state_dict': model.state_dict(), 'val_logloss': best_ll}, best_path)
            print(f"  Saved best model to {best_path} (val_logloss={best_ll:.5f})", flush=True)
    # load best & oof
    ckpt = torch.load(best_path, map_location='cpu'); model.load_state_dict(ckpt['state_dict']); model = model.to(device)
    val_loader = DataLoader(DogCatDataset(df[df.fold == fold].reset_index(drop=True), valid_aug), batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)
    oof_probs = predict_loader(model, val_loader)
    return best_ll, (df[df.fold == fold].index.values), oof_probs

# Train remaining folds
oof_prev = pd.read_csv('oof_partial.csv') if Path('oof_partial.csv').exists() else None
oof = np.zeros(len(df), dtype=np.float32)
if oof_prev is not None:
    # load existing oof for folds 0-2
    oof = oof_prev['oof'].values.astype(np.float32)

fold_scores = {}
for fold in FOLDS_TO_RUN:
    best_ll, val_idx, val_probs = train_fold(fold)
    oof[val_idx] = val_probs
    fold_scores[fold] = best_ll
    print(f"(v2) Fold {fold} best val_logloss: {best_ll:.5f}", flush=True)

pd.Series(fold_scores).to_csv('fold_scores_v2_additional.csv')
pd.DataFrame({'filepath': df['filepath'], 'label': df['label'], 'oof': oof}).to_csv('oof_merged.csv', index=False)
oof_ll = log_loss(df['label'].values, np.clip(oof, 1e-6, 1-1e-6))
print(f"Merged OOF log-loss (5 folds, mixed settings): {oof_ll:.5f}")

# Build models dict for inference from saved checkpoints for all 5 folds
@torch.no_grad()
def load_model_for_fold(fold):
    m = build_model().to(device)
    ckpt = torch.load(f'model_fold{fold}.pt', map_location='cpu')
    m.load_state_dict(ckpt['state_dict']); m.eval()
    return m

models = {f: load_model_for_fold(f) for f in [0,1,2,3,4] if Path(f'model_fold{f}.pt').exists()}

# Test inference with TTA (original + hflip) averaging across all available folds
test_ds = DogCatDataset(test_df, valid_aug)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)

@torch.no_grad()
def predict_tta(models_dict, loader):
    all_probs = []
    for imgs, ids in loader:
        imgs = imgs.to(device)
        with torch.amp.autocast('cuda', enabled=(device.type=='cuda')):
            probs_sum = None
            for m in models_dict.values():
                p = torch.sigmoid(m(imgs).squeeze(1))
                probs_sum = p if probs_sum is None else probs_sum + p
        imgs_f = torch.flip(imgs, dims=[3])
        with torch.amp.autocast('cuda', enabled=(device.type=='cuda')):
            for m in models_dict.values():
                p = torch.sigmoid(m(imgs_f).squeeze(1))
                probs_sum = probs_sum + p
        probs_avg = (probs_sum / (len(models_dict)*2)).float().cpu().numpy()
        all_probs.append(probs_avg)
    return np.concatenate(all_probs)

print("Predicting test with 5-fold ensemble...", flush=True)
test_probs = predict_tta(models, test_loader)
test_ids = test_df['filepath'].apply(lambda p: int(Path(p).stem)).values
sub = pd.DataFrame({'id': test_ids, 'label': np.clip(test_probs, 1e-6, 1-1e-6)})
sub = sub.sort_values('id').reset_index(drop=True)
sub.to_csv('submission.csv', index=False)
print("Saved updated submission.csv. Head:\n", sub.head())
gc.collect(); torch.cuda.empty_cache() if torch.cuda.is_available() else None

In [None]:
# Second backbone: ResNet50d 5-fold @224 for fast boost; blend with EfficientNet-B0
import os, time, gc
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import timm
from PIL import Image
import torchvision.transforms as T
from sklearn.metrics import log_loss

SEED = 42
torch.manual_seed(SEED); torch.cuda.manual_seed_all(SEED)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# cache dir
LOCAL_CACHE = Path.cwd() / 'model_cache'; LOCAL_CACHE.mkdir(exist_ok=True, parents=True)
os.environ['HF_HOME'] = str(LOCAL_CACHE)
os.environ['HUGGINGFACE_HUB_CACHE'] = str(LOCAL_CACHE)
os.environ['XDG_CACHE_HOME'] = str(LOCAL_CACHE)
os.environ['TORCH_HOME'] = str(LOCAL_CACHE)
os.environ['TIMM_HOME'] = str(LOCAL_CACHE)

# settings
IMG_SIZE = 224
BATCH_SIZE = 64
EPOCHS = 3
LR = 3e-4
WEIGHT_DECAY = 1e-2
LABEL_SMOOTH = 0.05
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD  = (0.229, 0.224, 0.225)

df = pd.read_csv('train_folds.csv')
test_df = pd.read_csv('test_files.csv')

class DogCatDataset(Dataset):
    def __init__(self, df, aug):
        self.filepaths = df['filepath'].values
        self.labels = df['label'].values if 'label' in df.columns else None
        self.aug = aug
    def __len__(self): return len(self.filepaths)
    def __getitem__(self, idx):
        fp = self.filepaths[idx]
        img = Image.open(fp).convert('RGB')
        if self.aug is not None: img = self.aug(img)
        if self.labels is None: return img, Path(fp).stem
        return img, np.float32(self.labels[idx])

train_aug = T.Compose([
    T.RandomResizedCrop(IMG_SIZE, scale=(0.7, 1.0), ratio=(0.75, 1.3333)),
    T.RandomHorizontalFlip(0.5),
    T.ToTensor(),
    T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])
valid_aug = T.Compose([
    T.Resize(IMG_SIZE),
    T.CenterCrop(IMG_SIZE),
    T.ToTensor(),
    T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

def build_resnet():
    return timm.create_model('resnet50d', pretrained=True, num_classes=1, cache_dir=str(LOCAL_CACHE))

def train_fold_resnet(fold):
    print(f"\n[ResNet50d] Fold {fold}", flush=True)
    trn_df = df[df.fold != fold].reset_index(drop=True)
    val_df = df[df.fold == fold].reset_index(drop=True)
    trn_loader = DataLoader(DogCatDataset(trn_df, train_aug), batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True, drop_last=True)
    val_loader = DataLoader(DogCatDataset(val_df, valid_aug), batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)
    model = build_resnet().to(device)
    opt = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
    scaler = torch.amp.GradScaler('cuda', enabled=(device.type=='cuda'))
    crit = nn.BCEWithLogitsLoss(reduction='none')
    best_ll = 1e9; best_path = f'model_resnet_fold{fold}.pt'
    for epoch in range(1, EPOCHS+1):
        model.train(); run_loss=0.0; n=0; t0=time.time()
        for i,(imgs,labels) in enumerate(trn_loader):
            imgs=imgs.to(device); labels=labels.to(device)
            targets = labels*(1-LABEL_SMOOTH)+(1-labels)*LABEL_SMOOTH
            opt.zero_grad(set_to_none=True)
            with torch.amp.autocast('cuda', enabled=(device.type=='cuda')):
                logits = model(imgs).squeeze(1)
                loss = crit(logits, targets).mean()
            scaler.scale(loss).backward(); scaler.step(opt); scaler.update()
            run_loss += loss.item()*imgs.size(0); n += imgs.size(0)
            if (i+1)%50==0:
                print(f"Fold {fold} Ep{epoch} {i+1}/{len(trn_loader)} loss {run_loss/max(1,n):.4f}", flush=True)
        # val
        model.eval(); probs_all=[]; targs_all=[]
        with torch.no_grad():
            for imgs,labels in val_loader:
                imgs=imgs.to(device)
                with torch.amp.autocast('cuda', enabled=(device.type=='cuda')):
                    p = torch.sigmoid(model(imgs).squeeze(1))
                probs_all.append(p.float().cpu().numpy()); targs_all.append(labels.numpy())
        probs_all = np.concatenate(probs_all); targs_all = np.concatenate(targs_all)
        ll = log_loss(targs_all, np.clip(probs_all, 1e-6, 1-1e-6))
        print(f"Epoch {epoch}: train_loss={run_loss/max(1,n):.4f} val_logloss={ll:.5f} time={time.time()-t0:.1f}s", flush=True)
        if ll < best_ll:
            best_ll = ll; torch.save({'state_dict': model.state_dict(), 'val_logloss': best_ll}, best_path)
            print(f"  Saved {best_path} ({best_ll:.5f})", flush=True)
    return best_ll

# Train 5 folds
fold_scores = {}
for f in range(5):
    fold_scores[f] = train_fold_resnet(f)
print('ResNet fold scores:', fold_scores)

# Blend EfficientNet-B0 (existing 5 folds) + ResNet50d (new 5 folds) with HFlip TTA
@torch.no_grad()
def load_eff_fold(fold):
    import timm
    m = timm.create_model('efficientnet_b0', pretrained=False, num_classes=1, cache_dir=str(LOCAL_CACHE)).to(device)
    ckpt = torch.load(f'model_fold{fold}.pt', map_location='cpu'); m.load_state_dict(ckpt['state_dict']); m.eval(); return m

@torch.no_grad()
def load_resnet_fold(fold):
    m = build_resnet().to(device)
    ckpt = torch.load(f'model_resnet_fold{fold}.pt', map_location='cpu'); m.load_state_dict(ckpt['state_dict']); m.eval(); return m

eff_models = {f: load_eff_fold(f) for f in range(5) if Path(f'model_fold{f}.pt').exists()}
res_models = {f: load_resnet_fold(f) for f in range(5)}

test_ds = DogCatDataset(test_df, valid_aug)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)

@torch.no_grad()
def predict_blend(eff, res, loader):
    all_probs = []
    for imgs, ids in loader:
        imgs = imgs.to(device)
        with torch.amp.autocast('cuda', enabled=(device.type=='cuda')):
            pe = None
            for m in eff.values():
                p = torch.sigmoid(m(imgs).squeeze(1))
                pe = p if pe is None else pe + p
            pr = None
            for m in res.values():
                p = torch.sigmoid(m(imgs).squeeze(1))
                pr = p if pr is None else pr + p
            # HFlip
            imgs_f = torch.flip(imgs, dims=[3])
            for m in eff.values():
                pe += torch.sigmoid(m(imgs_f).squeeze(1))
            for m in res.values():
                pr += torch.sigmoid(m(imgs_f).squeeze(1))
            pe = pe / (len(eff)*2); pr = pr / (len(res)*2)
            probs = (pe + pr) / 2.0
        all_probs.append(probs.float().cpu().numpy())
    return np.concatenate(all_probs)

print('Predicting test with blended ensemble...', flush=True)
test_probs = predict_blend(eff_models, res_models, test_loader)
test_ids = test_df['filepath'].apply(lambda p: int(Path(p).stem)).values
sub = pd.DataFrame({'id': test_ids, 'label': np.clip(test_probs, 1e-6, 1-1e-6)})
sub = sub.sort_values('id').reset_index(drop=True)
sub.to_csv('submission.csv', index=False)
print('Saved blended submission.csv. Head:\n', sub.head(), flush=True)
gc.collect(); torch.cuda.empty_cache() if torch.cuda.is_available() else None

In [None]:
# Strong model: ConvNeXt-Tiny 5-fold @320 with EMA, Cosine LR; disable MixUp (previous run stuck at ~0.69)
import os, time, math, gc, random
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import timm
from timm.utils import ModelEmaV2
from sklearn.metrics import log_loss
from PIL import Image
import torchvision.transforms as T

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED); torch.cuda.manual_seed_all(SEED)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Cache dir
LOCAL_CACHE = Path.cwd() / 'model_cache'
LOCAL_CACHE.mkdir(parents=True, exist_ok=True)
os.environ['HF_HOME'] = str(LOCAL_CACHE)
os.environ['HUGGINGFACE_HUB_CACHE'] = str(LOCAL_CACHE)
os.environ['XDG_CACHE_HOME'] = str(LOCAL_CACHE)
os.environ['TORCH_HOME'] = str(LOCAL_CACHE)
os.environ['TIMM_HOME'] = str(LOCAL_CACHE)

# Config (stabilized):
IMG_SIZE = 320
BATCH_SIZE = 32
EPOCHS = 6
LR = 3e-4
WEIGHT_DECAY = 0.01
LABEL_SMOOTH = 0.05
MIXUP_ALPHA = 0.0  # disabled to recover learning
EMA_DECAY = 0.9997
WARMUP_EPOCHS = 1

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD  = (0.229, 0.224, 0.225)

df = pd.read_csv('train_folds.csv')
test_df = pd.read_csv('test_files.csv')

class DogCatDataset(Dataset):
    def __init__(self, df, aug):
        self.filepaths = df['filepath'].values
        self.labels = df['label'].values if 'label' in df.columns else None
        self.aug = aug
    def __len__(self): return len(self.filepaths)
    def __getitem__(self, idx):
        fp = self.filepaths[idx]
        img = Image.open(fp).convert('RGB')
        if self.aug is not None:
            img = self.aug(img)
        if self.labels is None:
            return img, Path(fp).stem
        return img, np.float32(self.labels[idx])

train_aug = T.Compose([
    T.RandomResizedCrop(IMG_SIZE, scale=(0.7, 1.0), ratio=(0.75, 1.3333)),
    T.RandomHorizontalFlip(0.5),
    T.ToTensor(),
    T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])
valid_aug = T.Compose([
    T.Resize(IMG_SIZE),
    T.CenterCrop(IMG_SIZE),
    T.ToTensor(),
    T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

def build_model():
    return timm.create_model('convnext_tiny', pretrained=True, num_classes=1, cache_dir=str(LOCAL_CACHE))

def mixup_batch(x, y, alpha=MIXUP_ALPHA):
    if alpha is None or alpha <= 0:
        return x, y
    lam = np.random.beta(alpha, alpha)
    idx = torch.randperm(x.size(0), device=x.device)
    x_m = lam * x + (1 - lam) * x[idx]
    y = y.view(-1, 1)
    y_m = lam * y + (1 - lam) * y[idx]
    return x_m, y_m.squeeze(1)

def train_one_fold(fold):
    print(f"\n[ConvNeXt-Tiny] Fold {fold} @ {IMG_SIZE}px", flush=True)
    trn_idx = df.index[df.fold != fold].values
    val_idx = df.index[df.fold == fold].values
    trn_df = df.loc[trn_idx]
    val_df = df.loc[val_idx]
    trn_loader = DataLoader(DogCatDataset(trn_df, train_aug), batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True, drop_last=True)
    val_loader = DataLoader(DogCatDataset(val_df, valid_aug), batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)

    model = build_model().to(device)
    ema = ModelEmaV2(model, decay=EMA_DECAY, device=device if device.type=='cuda' else None)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS - WARMUP_EPOCHS)
    scaler = torch.amp.GradScaler('cuda', enabled=(device.type=='cuda'))
    criterion = nn.BCEWithLogitsLoss(reduction='none')

    best_ll = 1e9
    best_path = f'model_convnext_fold{fold}_ema.pt'

    for epoch in range(1, EPOCHS+1):
        model.train()
        t0 = time.time(); run_loss = 0.0; n = 0
        # Warmup
        if epoch <= WARMUP_EPOCHS:
            for pg in optimizer.param_groups:
                pg['lr'] = LR * epoch / max(1, WARMUP_EPOCHS)
        else:
            scheduler.step()

        for step, (imgs, labels) in enumerate(trn_loader):
            imgs = imgs.to(device)
            labels = labels.to(device)
            labels_sm = labels * (1 - LABEL_SMOOTH) + (1 - labels) * LABEL_SMOOTH
            imgs_mu, labels_mu = mixup_batch(imgs, labels_sm, alpha=MIXUP_ALPHA)

            optimizer.zero_grad(set_to_none=True)
            with torch.amp.autocast('cuda', enabled=(device.type=='cuda')):
                logits = model(imgs_mu).squeeze(1)
                loss = criterion(logits, labels_mu).mean()
            scaler.scale(loss).backward()
            scaler.step(optimizer); scaler.update()
            ema.update(model)
            run_loss += loss.item() * imgs.size(0); n += imgs.size(0)
            if (step+1) % 100 == 0:
                cur_lr = optimizer.param_groups[0]['lr']
                print(f"Fold {fold} Ep{epoch} {step+1}/{len(trn_loader)} loss {run_loss/max(1,n):.4f} lr {cur_lr:.2e}", flush=True)

        # Validation with EMA weights
        model.eval()
        val_probs = []; val_targets = []
        with torch.no_grad():
            for imgs, labels in val_loader:
                imgs = imgs.to(device)
                with torch.amp.autocast('cuda', enabled=(device.type=='cuda')):
                    logits = ema.module(imgs).squeeze(1)
                    probs = torch.sigmoid(logits)
                val_probs.append(probs.float().cpu().numpy())
                val_targets.append(labels.numpy())
        val_probs = np.concatenate(val_probs); val_targets = np.concatenate(val_targets)
        ll = log_loss(val_targets, np.clip(val_probs, 1e-6, 1-1e-6))
        print(f"Epoch {epoch}: tr_loss={run_loss/max(1,n):.4f} val_logloss={ll:.5f} time={time.time()-t0:.1f}s", flush=True)
        if ll < best_ll:
            best_ll = ll
            torch.save({'state_dict': ema.module.state_dict(), 'val_logloss': best_ll}, best_path)
            print(f"  Saved EMA model -> {best_path} ({best_ll:.5f})", flush=True)

    # Load best EMA for OOF
    ckpt = torch.load(best_path, map_location='cpu')
    ema_model = build_model().to(device)
    ema_model.load_state_dict(ckpt['state_dict'])
    ema_model.eval()
    # OOF preds
    val_loader = DataLoader(DogCatDataset(val_df, valid_aug), batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)
    preds = []
    with torch.no_grad():
        for imgs, _ in val_loader:
            imgs = imgs.to(device)
            with torch.amp.autocast('cuda', enabled=(device.type=='cuda')):
                p = torch.sigmoid(ema_model(imgs).squeeze(1))
            preds.append(p.float().cpu().numpy())
    preds = np.concatenate(preds)
    return val_idx, preds, best_ll

# Train all folds with consistent settings
oof = np.zeros(len(df), dtype=np.float32)
fold_scores = {}
for f in range(5):
    val_idx, preds, best_ll = train_one_fold(f)
    oof[val_idx] = preds
    fold_scores[f] = best_ll
    print(f"[ConvNeXt-Tiny] Fold {f} best val_logloss: {best_ll:.5f}", flush=True)

pd.Series(fold_scores).to_csv('fold_scores_convnext.csv')
pd.DataFrame({'filepath': df['filepath'], 'label': df['label'], 'oof': oof}).to_csv('oof_convnext.csv', index=False)
oof_ll = log_loss(df['label'].values, np.clip(oof, 1e-6, 1-1e-6))
print(f"ConvNeXt-Tiny OOF log-loss (5 folds): {oof_ll:.5f}")

# Inference with EMA checkpoints + HFlip TTA
@torch.no_grad()
def load_convnext_fold(fold):
    m = build_model().to(device)
    ckpt = torch.load(f'model_convnext_fold{fold}_ema.pt', map_location='cpu')
    m.load_state_dict(ckpt['state_dict']); m.eval()
    return m
models = {f: load_convnext_fold(f) for f in range(5)}

test_ds = DogCatDataset(test_df, valid_aug)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)

@torch.no_grad()
def predict_tta(models_dict, loader):
    out = []
    for imgs, ids in loader:
        imgs = imgs.to(device)
        with torch.amp.autocast('cuda', enabled=(device.type=='cuda')):
            p_sum = None
            for m in models_dict.values():
                p = torch.sigmoid(m(imgs).squeeze(1))
                p_sum = p if p_sum is None else p_sum + p
            imgs_f = torch.flip(imgs, dims=[3])
            for m in models_dict.values():
                p_sum += torch.sigmoid(m(imgs_f).squeeze(1))
        p_avg = (p_sum / (len(models_dict)*2)).float().cpu().numpy()
        out.append(p_avg)
    return np.concatenate(out)

print("Predicting test with ConvNeXt-Tiny 5-fold EMA ensemble...", flush=True)
test_probs = predict_tta(models, test_loader)
test_ids = test_df['filepath'].apply(lambda p: int(Path(p).stem)).values
sub = pd.DataFrame({'id': test_ids, 'label': np.clip(test_probs, 1e-6, 1-1e-6)})
sub = sub.sort_values('id').reset_index(drop=True)
sub.to_csv('submission.csv', index=False)
print('Saved submission.csv (ConvNeXt). Head:\n', sub.head(), flush=True)
gc.collect(); torch.cuda.empty_cache() if torch.cuda.is_available() else None

In [None]:
# Clean, consistent 5-fold pipeline: EfficientNet-B3 (ra_in1k) @320 with EMA, cosine LR, LS=0.05, MixUp(0.1), AMP, fixed OOF
import os, time, math, gc, random
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import timm
from timm.utils import ModelEmaV2
from sklearn.metrics import log_loss
from PIL import Image
import torchvision.transforms as T

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED); torch.cuda.manual_seed_all(SEED)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Cache for pretrained weights
LOCAL_CACHE = Path.cwd() / 'model_cache'
LOCAL_CACHE.mkdir(parents=True, exist_ok=True)
os.environ['HF_HOME'] = str(LOCAL_CACHE)
os.environ['HUGGINGFACE_HUB_CACHE'] = str(LOCAL_CACHE)
os.environ['XDG_CACHE_HOME'] = str(LOCAL_CACHE)
os.environ['TORCH_HOME'] = str(LOCAL_CACHE)
os.environ['TIMM_HOME'] = str(LOCAL_CACHE)
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '1'

# Config per expert recipe (Plan A: reliable RA weights)
MODEL_NAME = 'efficientnet_b3.ra_in1k'
IMG_SIZE = 320
BATCH_SIZE = 32
EPOCHS = 10  # per directive
LR = 3e-4
WEIGHT_DECAY = 1e-2
LABEL_SMOOTH = 0.05
MIXUP_ALPHA = 0.1  # will auto-disable if unstable
EMA_DECAY = 0.9997
WARMUP_EPOCHS = 1
NUM_WORKERS = 0  # stability safeguard

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD  = (0.229, 0.224, 0.225)

df = pd.read_csv('train_folds.csv')
test_df = pd.read_csv('test_files.csv')

class DogCatDataset(Dataset):
    def __init__(self, df, aug):
        self.filepaths = df['filepath'].values
        self.labels = df['label'].values if 'label' in df.columns else None
        self.aug = aug
    def __len__(self): return len(self.filepaths)
    def __getitem__(self, idx):
        fp = self.filepaths[idx]
        img = Image.open(fp).convert('RGB')
        if self.aug is not None:
            img = self.aug(img)
        if self.labels is None:
            return img, Path(fp).stem
        return img, np.float32(self.labels[idx])

train_aug = T.Compose([
    T.RandomResizedCrop(IMG_SIZE, scale=(0.7, 1.0), ratio=(0.75, 1.3333)),
    T.RandomHorizontalFlip(p=0.5),
    T.ToTensor(),
    T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])
valid_aug = T.Compose([
    T.Resize(IMG_SIZE),
    T.CenterCrop(IMG_SIZE),
    T.ToTensor(),
    T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

def build_model():
    return timm.create_model(MODEL_NAME, pretrained=True, num_classes=1, cache_dir=str(LOCAL_CACHE))

# Pre-download/initialize once to avoid hang inside training loop
print(f"Pre-initializing model '{MODEL_NAME}' (pretrained=True) to populate cache...", flush=True)
try:
    _tmp_m = build_model()
    del _tmp_m
    print("Model init OK.", flush=True)
    # lock to offline after successful fetch to prevent further network calls
    os.environ['HF_HUB_OFFLINE'] = '1'
except Exception as e:
    print(f"Model init failed: {e}", flush=True)

def mixup_batch(x, y, alpha):
    if alpha is None or alpha <= 0:
        return x, y
    lam = np.random.beta(alpha, alpha)
    idx = torch.randperm(x.size(0), device=x.device)
    x_m = lam * x + (1 - lam) * x[idx]
    y = y.view(-1, 1)
    y_m = lam * y + (1 - lam) * y[idx]
    return x_m, y_m.squeeze(1)

def train_one_fold(fold):
    print(f"\n[EffB3-RA] Fold {fold} @ {IMG_SIZE}px", flush=True)
    trn_idx = df.index[df.fold != fold].values
    val_idx = df.index[df.fold == fold].values
    trn_df = df.loc[trn_idx]
    val_df = df.loc[val_idx]
    trn_loader = DataLoader(DogCatDataset(trn_df, train_aug), batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, pin_memory=True, drop_last=True)
    val_loader = DataLoader(DogCatDataset(val_df, valid_aug), batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)

    model = build_model().to(device)
    ema = ModelEmaV2(model, decay=EMA_DECAY, device=device if device.type=='cuda' else None)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS - WARMUP_EPOCHS)
    scaler = torch.amp.GradScaler('cuda', enabled=(device.type=='cuda'))
    criterion = nn.BCEWithLogitsLoss(reduction='none')

    best_ll = 1e9
    best_path = f'model_b3_fold{fold}_ema.pt'
    mixup_enabled = True if (MIXUP_ALPHA is not None and MIXUP_ALPHA > 0) else False

    for epoch in range(1, EPOCHS+1):
        model.train()
        t0 = time.time(); run_loss = 0.0; n = 0
        # Warmup or cosine
        if epoch <= WARMUP_EPOCHS:
            for pg in optimizer.param_groups:
                pg['lr'] = LR * epoch / max(1, WARMUP_EPOCHS)
        else:
            scheduler.step()

        for step, (imgs, labels) in enumerate(trn_loader):
            imgs = imgs.to(device)
            labels = labels.to(device)
            targets = labels * (1 - LABEL_SMOOTH) + (1 - labels) * LABEL_SMOOTH
            if mixup_enabled:
                imgs, targets = mixup_batch(imgs, targets, alpha=MIXUP_ALPHA)

            optimizer.zero_grad(set_to_none=True)
            with torch.amp.autocast('cuda', enabled=(device.type=='cuda')):
                logits = model(imgs).squeeze(1)
                loss = criterion(logits, targets).mean()
            scaler.scale(loss).backward()
            scaler.step(optimizer); scaler.update()
            ema.update(model)
            run_loss += loss.item() * imgs.size(0); n += imgs.size(0)
            if (step+1) % 100 == 0:
                cur_lr = optimizer.param_groups[0]['lr']
                print(f"Fold {fold} Ep{epoch} {step+1}/{len(trn_loader)} loss {run_loss/max(1,n):.4f} lr {cur_lr:.2e}", flush=True)

        # Simple instability safeguard: if first-epoch loss > 0.69, disable MixUp from next epoch
        if epoch == 1 and mixup_enabled and (run_loss/max(1,n) > 0.69):
            mixup_enabled = False
            print("  Disabling MixUp due to unstable first-epoch loss.", flush=True)

        # Validation with EMA weights
        model.eval()
        val_probs = []; val_targets = []
        with torch.no_grad():
            for imgs, labels in val_loader:
                imgs = imgs.to(device)
                with torch.amp.autocast('cuda', enabled=(device.type=='cuda')):
                    logits = ema.module(imgs).squeeze(1)
                    probs = torch.sigmoid(logits)
                val_probs.append(probs.float().cpu().numpy())
                val_targets.append(labels.numpy())
        val_probs = np.concatenate(val_probs); val_targets = np.concatenate(val_targets)
        ll = log_loss(val_targets, np.clip(val_probs, 1e-6, 1-1e-6))
        print(f"Epoch {epoch}: tr_loss={run_loss/max(1,n):.4f} val_logloss={ll:.5f} time={time.time()-t0:.1f}s", flush=True)
        if ll < best_ll:
            best_ll = ll
            torch.save({'state_dict': ema.module.state_dict(), 'val_logloss': best_ll}, best_path)
            print(f"  Saved EMA -> {best_path} ({best_ll:.5f})", flush=True)

    # Load best EMA and produce fold OOF
    ckpt = torch.load(best_path, map_location='cpu')
    ema_model = build_model().to(device)
    ema_model.load_state_dict(ckpt['state_dict'])
    ema_model.eval()
    val_loader = DataLoader(DogCatDataset(val_df, valid_aug), batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)
    preds = []
    with torch.no_grad():
        for imgs, _ in val_loader:
            imgs = imgs.to(device)
            with torch.amp.autocast('cuda', enabled=(device.type=='cuda')):
                p = torch.sigmoid(ema_model(imgs).squeeze(1))
            preds.append(p.float().cpu().numpy())
    preds = np.concatenate(preds)
    return val_idx, preds, best_ll

# Train all 5 folds consistently
oof = np.zeros(len(df), dtype=np.float32)
fold_scores = {}
total_start = time.time()
for f in range(5):
    print(f"==== Start Fold {f} ====", flush=True)
    val_idx, preds, best_ll = train_one_fold(f)
    oof[val_idx] = preds  # fixed OOF indexing by original df index
    fold_scores[f] = best_ll
    print(f"[EffB3-RA] Fold {f} best val_logloss: {best_ll:.5f}", flush=True)

oof_ll = log_loss(df['label'].values, np.clip(oof, 1e-6, 1-1e-6))
print(f"EffB3-RA OOF log-loss (5 folds): {oof_ll:.5f}", flush=True)
pd.Series(fold_scores).to_csv('fold_scores_b3.csv')
pd.DataFrame({'filepath': df['filepath'], 'label': df['label'], 'oof': oof}).to_csv('oof_b3.csv', index=False)

# Inference with 5 EMA checkpoints + HFlip TTA
@torch.no_grad()
def load_b3_fold(fold):
    m = build_model().to(device)
    ckpt = torch.load(f'model_b3_fold{fold}_ema.pt', map_location='cpu')
    m.load_state_dict(ckpt['state_dict']); m.eval()
    return m

models = {f: load_b3_fold(f) for f in range(5)}
test_ds = DogCatDataset(test_df, valid_aug)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)

@torch.no_grad()
def predict_tta(models_dict, loader):
    out = []
    for imgs, ids in loader:
        imgs = imgs.to(device)
        with torch.amp.autocast('cuda', enabled=(device.type=='cuda')):
            p_sum = None
            for m in models_dict.values():
                p = torch.sigmoid(m(imgs).squeeze(1))
                p_sum = p if p_sum is None else p_sum + p
            imgs_f = torch.flip(imgs, dims=[3])
            for m in models_dict.values():
                p_sum += torch.sigmoid(m(imgs_f).squeeze(1))
        p_avg = (p_sum / (len(models_dict)*2)).float().cpu().numpy()
        out.append(p_avg)
    return np.concatenate(out)

print("Predicting test with 5-fold EMA ensemble (EffB3-RA)...", flush=True)
test_probs = predict_tta(models, test_loader)
test_ids = test_df['filepath'].apply(lambda p: int(Path(p).stem)).values
sub = pd.DataFrame({'id': test_ids, 'label': np.clip(test_probs, 1e-6, 1-1e-6)})
sub = sub.sort_values('id').reset_index(drop=True)
sub.to_csv('submission.csv', index=False)
print('Saved submission.csv (EffB3-RA). Head:\n', sub.head(), flush=True)
print(f"Total elapsed: {(time.time()-total_start)/60:.1f} min", flush=True)

gc.collect(); torch.cuda.empty_cache() if torch.cuda.is_available() else None

In [None]:
# Plan C: ResNet50d.ra2_in1k @384, 5-fold, EMA, Cosine LR, LS=0.05, MixUp=0.1, AMP, fixed OOF, HFlip TTA
import os, time, gc, random
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import timm
from timm.utils import ModelEmaV2
from sklearn.metrics import log_loss
from PIL import Image
import torchvision.transforms as T

print("[Plan C] Starting cell: ResNet50d-RA2 @384", flush=True)

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED); torch.cuda.manual_seed_all(SEED)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.backends.cudnn.benchmark = True
os.environ['PYTHONUNBUFFERED'] = '1'

# Cache
LOCAL_CACHE = Path.cwd() / 'model_cache'
LOCAL_CACHE.mkdir(parents=True, exist_ok=True)
os.environ['HF_HOME'] = str(LOCAL_CACHE)
os.environ['HUGGINGFACE_HUB_CACHE'] = str(LOCAL_CACHE)
os.environ['XDG_CACHE_HOME'] = str(LOCAL_CACHE)
os.environ['TORCH_HOME'] = str(LOCAL_CACHE)
os.environ['TIMM_HOME'] = str(LOCAL_CACHE)

# Config
MODEL_NAME = 'resnet50d.ra2_in1k'
IMG_SIZE = 384
BATCH_SIZE = 24
EPOCHS = 10
LR = 3e-4
WEIGHT_DECAY = 1e-2
LABEL_SMOOTH = 0.05
MIXUP_ALPHA = 0.1
EMA_DECAY = 0.9997
WARMUP_EPOCHS = 1
NUM_WORKERS = 0  # set 0 to avoid potential multiprocessing hang

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD  = (0.229, 0.224, 0.225)

df = pd.read_csv('train_folds.csv')
test_df = pd.read_csv('test_files.csv')

class DogCatDataset(Dataset):
    def __init__(self, df, aug):
        self.filepaths = df['filepath'].values
        self.labels = df['label'].values if 'label' in df.columns else None
        self.aug = aug
    def __len__(self): return len(self.filepaths)
    def __getitem__(self, idx):
        fp = self.filepaths[idx]
        img = Image.open(fp).convert('RGB')
        if self.aug is not None:
            img = self.aug(img)
        if self.labels is None:
            return img, Path(fp).stem
        return img, np.float32(self.labels[idx])

train_aug = T.Compose([
    T.RandomResizedCrop(IMG_SIZE, scale=(0.7, 1.0), ratio=(0.75, 1.3333)),
    T.RandomHorizontalFlip(p=0.5),
    T.ToTensor(),
    T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])
valid_aug = T.Compose([
    T.Resize(IMG_SIZE),
    T.CenterCrop(IMG_SIZE),
    T.ToTensor(),
    T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

def build_model():
    return timm.create_model(MODEL_NAME, pretrained=True, num_classes=1, cache_dir=str(LOCAL_CACHE))

# Pre-init to ensure weights are cached and avoid silent hangs
print(f"Pre-initializing '{MODEL_NAME}' ...", flush=True)
try:
    _m = build_model()
    del _m
    print("Model pre-init OK.", flush=True)
except Exception as e:
    print(f"Model pre-init failed: {e}", flush=True)

def mixup_batch(x, y, alpha):
    if alpha is None or alpha <= 0:
        return x, y
    lam = np.random.beta(alpha, alpha)
    idx = torch.randperm(x.size(0), device=x.device)
    x_m = lam * x + (1 - lam) * x[idx]
    y = y.view(-1, 1)
    y_m = lam * y + (1 - lam) * y[idx]
    return x_m, y_m.squeeze(1)

def train_one_fold(fold):
    print(f"\n[ResNet50d-RA2] Fold {fold} @ {IMG_SIZE}px", flush=True)
    trn_idx = df.index[df.fold != fold].values
    val_idx = df.index[df.fold == fold].values
    trn_df = df.loc[trn_idx]
    val_df = df.loc[val_idx]
    trn_loader = DataLoader(DogCatDataset(trn_df, train_aug), batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, pin_memory=True, drop_last=True)
    val_loader = DataLoader(DogCatDataset(val_df, valid_aug), batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)

    model = build_model().to(device)
    ema = ModelEmaV2(model, decay=EMA_DECAY, device=device if device.type=='cuda' else None)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS - WARMUP_EPOCHS)
    scaler = torch.amp.GradScaler('cuda', enabled=(device.type=='cuda'))
    criterion = nn.BCEWithLogitsLoss(reduction='none')

    best_ll = 1e9
    best_path = f'model_resnet384_fold{fold}_ema.pt'
    mixup_enabled = True if (MIXUP_ALPHA is not None and MIXUP_ALPHA > 0) else False

    for epoch in range(1, EPOCHS+1):
        model.train()
        t0 = time.time(); run_loss = 0.0; n = 0
        # Warmup or cosine
        if epoch <= WARMUP_EPOCHS:
            for pg in optimizer.param_groups:
                pg['lr'] = LR * epoch / max(1, WARMUP_EPOCHS)
        else:
            scheduler.step()

        for step, (imgs, labels) in enumerate(trn_loader):
            imgs = imgs.to(device)
            labels = labels.to(device)
            targets = labels * (1 - LABEL_SMOOTH) + (1 - labels) * LABEL_SMOOTH
            if mixup_enabled:
                imgs, targets = mixup_batch(imgs, targets, alpha=MIXUP_ALPHA)

            optimizer.zero_grad(set_to_none=True)
            with torch.amp.autocast('cuda', enabled=(device.type=='cuda')):
                logits = model(imgs).squeeze(1)
                loss = criterion(logits, targets).mean()
            scaler.scale(loss).backward()
            scaler.step(optimizer); scaler.update()
            ema.update(model)
            run_loss += loss.item() * imgs.size(0); n += imgs.size(0)
            if (step+1) % 80 == 0:
                cur_lr = optimizer.param_groups[0]['lr']
                print(f"Fold {fold} Ep{epoch} {step+1}/{len(trn_loader)} loss {run_loss/max(1,n):.4f} lr {cur_lr:.2e}", flush=True)

        if epoch == 1 and mixup_enabled and (run_loss/max(1,n) > 0.69):
            mixup_enabled = False
            print("  Disabling MixUp due to unstable first-epoch loss.", flush=True)

        # Validation (EMA)
        model.eval()
        val_probs = []; val_targets = []
        with torch.no_grad():
            for imgs, labels in val_loader:
                imgs = imgs.to(device)
                with torch.amp.autocast('cuda', enabled=(device.type=='cuda')):
                    logits = ema.module(imgs).squeeze(1)
                    probs = torch.sigmoid(logits)
                val_probs.append(probs.float().cpu().numpy())
                val_targets.append(labels.numpy())
        val_probs = np.concatenate(val_probs); val_targets = np.concatenate(val_targets)
        ll = log_loss(val_targets, np.clip(val_probs, 1e-6, 1-1e-6))
        print(f"Epoch {epoch}: tr_loss={run_loss/max(1,n):.4f} val_logloss={ll:.5f} time={time.time()-t0:.1f}s", flush=True)
        if ll < best_ll:
            best_ll = ll
            torch.save({'state_dict': ema.module.state_dict(), 'val_logloss': best_ll}, best_path)
            print(f"  Saved EMA -> {best_path} ({best_ll:.5f})", flush=True)

    # Best EMA OOF
    ckpt = torch.load(best_path, map_location='cpu')
    ema_model = build_model().to(device)
    ema_model.load_state_dict(ckpt['state_dict'])
    ema_model.eval()
    val_loader = DataLoader(DogCatDataset(val_df, valid_aug), batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)
    preds = []
    with torch.no_grad():
        for imgs, _ in val_loader:
            imgs = imgs.to(device)
            with torch.amp.autocast('cuda', enabled=(device.type=='cuda')):
                p = torch.sigmoid(ema_model(imgs).squeeze(1))
            preds.append(p.float().cpu().numpy())
    preds = np.concatenate(preds)
    return val_idx, preds, best_ll

# Train 5 folds
print("==== Begin 5-fold training (ResNet50d-RA2 384px) ====", flush=True)
oof = np.zeros(len(df), dtype=np.float32)
fold_scores = {}
total_start = time.time()
for f in range(5):
    print(f"==== Start Fold {f} ====", flush=True)
    val_idx, preds, best_ll = train_one_fold(f)
    oof[val_idx] = preds
    fold_scores[f] = best_ll
    print(f"[ResNet50d-RA2] Fold {f} best val_logloss: {best_ll:.5f}", flush=True)

oof_ll = log_loss(df['label'].values, np.clip(oof, 1e-6, 1-1e-6))
print(f"ResNet50d-RA2 OOF log-loss (5 folds): {oof_ll:.5f}", flush=True)
pd.Series(fold_scores).to_csv('fold_scores_resnet384.csv')
pd.DataFrame({'filepath': df['filepath'], 'label': df['label'], 'oof': oof}).to_csv('oof_resnet384.csv', index=False)

# Inference: 5 EMA checkpoints + HFlip TTA
@torch.no_grad()
def load_fold(fold):
    m = build_model().to(device)
    ckpt = torch.load(f'model_resnet384_fold{fold}_ema.pt', map_location='cpu')
    m.load_state_dict(ckpt['state_dict']); m.eval()
    return m

models = {f: load_fold(f) for f in range(5)}
test_ds = DogCatDataset(test_df, valid_aug)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)

@torch.no_grad()
def predict_tta(models_dict, loader):
    out = []
    for imgs, ids in loader:
        imgs = imgs.to(device)
        with torch.amp.autocast('cuda', enabled=(device.type=='cuda')):
            p_sum = None
            for m in models_dict.values():
                p = torch.sigmoid(m(imgs).squeeze(1))
                p_sum = p if p_sum is None else p_sum + p
            imgs_f = torch.flip(imgs, dims=[3])
            for m in models_dict.values():
                p_sum += torch.sigmoid(m(imgs_f).squeeze(1))
        p_avg = (p_sum / (len(models_dict)*2)).float().cpu().numpy()
        out.append(p_avg)
    return np.concatenate(out)

print("Predicting test with 5-fold EMA ensemble (ResNet50d-RA2 384px)...", flush=True)
test_probs = predict_tta(models, test_loader)
test_ids = test_df['filepath'].apply(lambda p: int(Path(p).stem)).values
sub = pd.DataFrame({'id': test_ids, 'label': np.clip(test_probs, 1e-6, 1-1e-6)})
sub = sub.sort_values('id').reset_index(drop=True)
sub.to_csv('submission.csv', index=False)
print('Saved submission.csv (ResNet50d-RA2 384). Head:\n', sub.head(), flush=True)
print(f"Total elapsed: {(time.time()-total_start)/60:.1f} min", flush=True)

gc.collect(); torch.cuda.empty_cache() if torch.cuda.is_available() else None

In [None]:
# Debug cell: verify kernel stdout, filesystem, timm import, and quick model instantiation (no download)
import os, glob
from pathlib import Path
import torch, timm
from PIL import Image
import pandas as pd

print("[DEBUG] Cell start", flush=True)
print("Torch:", torch.__version__, "CUDA:", torch.cuda.is_available(), flush=True)
print("timm:", timm.__version__, flush=True)
print("CWD:", Path.cwd(), flush=True)
print("Cache dirs:", [p.name for p in Path('model_cache').glob('*')], flush=True)
print("ResNet50d-RA2 cache exists:", Path('model_cache/models--timm--resnet50d.ra2_in1k').exists(), flush=True)
df_dbg = pd.read_csv('train_folds.csv')
print("train_folds rows:", len(df_dbg), "fold counts:", df_dbg['fold'].value_counts().to_dict(), flush=True)
sample_train = sorted(glob.glob('train/*.jpg'))[:1]
print("Sample train file:", sample_train, flush=True)
if sample_train:
    try:
        img = Image.open(sample_train[0]).convert('RGB')
        print("PIL open OK, size:", img.size, flush=True)
    except Exception as e:
        print("PIL open failed:", e, flush=True)
try:
    m = timm.create_model('resnet50d.ra2_in1k', pretrained=False, num_classes=1)
    _ = sum(p.numel() for p in m.parameters())
    print("timm model instantiate OK (pretrained=False). Params:", _, flush=True)
    del m
except Exception as e:
    print("timm model instantiate failed:", e, flush=True)
print("[DEBUG] Cell end", flush=True)

In [None]:
# Plan B: tf_efficientnet_b2_ns @320, 5-fold, EMA, Cosine LR, LS=0.05, MixUp=0.1, AMP, fixed OOF, HFlip TTA
import os, time, gc, random
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import timm
from timm.utils import ModelEmaV2
from sklearn.metrics import log_loss
from PIL import Image
import torchvision.transforms as T

print("[Plan B] Starting cell: tf_efficientnet_b2_ns @320", flush=True)

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED); torch.cuda.manual_seed_all(SEED)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.backends.cudnn.benchmark = True
os.environ['PYTHONUNBUFFERED'] = '1'

# Cache
LOCAL_CACHE = Path.cwd() / 'model_cache'
LOCAL_CACHE.mkdir(parents=True, exist_ok=True)
os.environ['HF_HOME'] = str(LOCAL_CACHE)
os.environ['HUGGINGFACE_HUB_CACHE'] = str(LOCAL_CACHE)
os.environ['XDG_CACHE_HOME'] = str(LOCAL_CACHE)
os.environ['TORCH_HOME'] = str(LOCAL_CACHE)
os.environ['TIMM_HOME'] = str(LOCAL_CACHE)
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '1'

# Config
MODEL_NAME = 'tf_efficientnet_b2_ns'
IMG_SIZE = 320
BATCH_SIZE = 32
EPOCHS = 10
LR = 3e-4
WEIGHT_DECAY = 1e-2
LABEL_SMOOTH = 0.05
MIXUP_ALPHA = 0.1
EMA_DECAY = 0.9997
WARMUP_EPOCHS = 1
NUM_WORKERS = 0

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD  = (0.229, 0.224, 0.225)

df = pd.read_csv('train_folds.csv')
test_df = pd.read_csv('test_files.csv')

class DogCatDataset(Dataset):
    def __init__(self, df, aug):
        self.filepaths = df['filepath'].values
        self.labels = df['label'].values if 'label' in df.columns else None
        self.aug = aug
    def __len__(self): return len(self.filepaths)
    def __getitem__(self, idx):
        fp = self.filepaths[idx]
        img = Image.open(fp).convert('RGB')
        if self.aug is not None:
            img = self.aug(img)
        if self.labels is None:
            return img, Path(fp).stem
        return img, np.float32(self.labels[idx])

train_aug = T.Compose([
    T.RandomResizedCrop(IMG_SIZE, scale=(0.7, 1.0), ratio=(0.75, 1.3333)),
    T.RandomHorizontalFlip(p=0.5),
    T.ToTensor(),
    T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])
valid_aug = T.Compose([
    T.Resize(IMG_SIZE),
    T.CenterCrop(IMG_SIZE),
    T.ToTensor(),
    T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

def build_model():
    return timm.create_model(MODEL_NAME, pretrained=True, num_classes=1, cache_dir=str(LOCAL_CACHE))

# Pre-init to ensure weights are cached and avoid silent hangs
print(f"Pre-initializing '{MODEL_NAME}' ...", flush=True)
try:
    _m = build_model()
    del _m
    print("Model pre-init OK.", flush=True)
    os.environ['HF_HUB_OFFLINE'] = '1'
except Exception as e:
    print(f"Model pre-init failed: {e}", flush=True)

def mixup_batch(x, y, alpha):
    if alpha is None or alpha <= 0:
        return x, y
    lam = np.random.beta(alpha, alpha)
    idx = torch.randperm(x.size(0), device=x.device)
    x_m = lam * x + (1 - lam) * x[idx]
    y = y.view(-1, 1)
    y_m = lam * y + (1 - lam) * y[idx]
    return x_m, y_m.squeeze(1)

def train_one_fold(fold):
    print(f"\n[EffB2-NS] Fold {fold} @ {IMG_SIZE}px", flush=True)
    trn_idx = df.index[df.fold != fold].values
    val_idx = df.index[df.fold == fold].values
    trn_df = df.loc[trn_idx]
    val_df = df.loc[val_idx]
    trn_loader = DataLoader(DogCatDataset(trn_df, train_aug), batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, pin_memory=True, drop_last=True)
    val_loader = DataLoader(DogCatDataset(val_df, valid_aug), batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)

    model = build_model().to(device)
    ema = ModelEmaV2(model, decay=EMA_DECAY, device=device if device.type=='cuda' else None)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS - WARMUP_EPOCHS)
    scaler = torch.amp.GradScaler('cuda', enabled=(device.type=='cuda'))
    criterion = nn.BCEWithLogitsLoss(reduction='none')

    best_ll = 1e9
    best_path = f'model_b2_fold{fold}_ema.pt'
    mixup_enabled = True if (MIXUP_ALPHA is not None and MIXUP_ALPHA > 0) else False

    for epoch in range(1, EPOCHS+1):
        model.train()
        t0 = time.time(); run_loss = 0.0; n = 0
        if epoch <= WARMUP_EPOCHS:
            for pg in optimizer.param_groups:
                pg['lr'] = LR * epoch / max(1, WARMUP_EPOCHS)
        else:
            scheduler.step()

        for step, (imgs, labels) in enumerate(trn_loader):
            imgs = imgs.to(device)
            labels = labels.to(device)
            targets = labels * (1 - LABEL_SMOOTH) + (1 - labels) * LABEL_SMOOTH
            if mixup_enabled:
                imgs, targets = mixup_batch(imgs, targets, alpha=MIXUP_ALPHA)
            optimizer.zero_grad(set_to_none=True)
            with torch.amp.autocast('cuda', enabled=(device.type=='cuda')):
                logits = model(imgs).squeeze(1)
                loss = criterion(logits, targets).mean()
            scaler.scale(loss).backward()
            scaler.step(optimizer); scaler.update()
            ema.update(model)
            run_loss += loss.item() * imgs.size(0); n += imgs.size(0)
            if (step+1) % 100 == 0:
                cur_lr = optimizer.param_groups[0]['lr']
                print(f"Fold {fold} Ep{epoch} {step+1}/{len(trn_loader)} loss {run_loss/max(1,n):.4f} lr {cur_lr:.2e}", flush=True)

        if epoch == 1 and mixup_enabled and (run_loss/max(1,n) > 0.69):
            mixup_enabled = False
            print("  Disabling MixUp due to unstable first-epoch loss.", flush=True)

        # Validation with EMA weights
        model.eval()
        val_probs = []; val_targets = []
        with torch.no_grad():
            for imgs, labels in val_loader:
                imgs = imgs.to(device)
                with torch.amp.autocast('cuda', enabled=(device.type=='cuda')):
                    logits = ema.module(imgs).squeeze(1)
                    probs = torch.sigmoid(logits)
                val_probs.append(probs.float().cpu().numpy())
                val_targets.append(labels.numpy())
        val_probs = np.concatenate(val_probs); val_targets = np.concatenate(val_targets)
        ll = log_loss(val_targets, np.clip(val_probs, 1e-6, 1-1e-6))
        print(f"Epoch {epoch}: tr_loss={run_loss/max(1,n):.4f} val_logloss={ll:.5f} time={time.time()-t0:.1f}s", flush=True)
        if ll < best_ll:
            best_ll = ll
            torch.save({'state_dict': ema.module.state_dict(), 'val_logloss': best_ll}, best_path)
            print(f"  Saved EMA -> {best_path} ({best_ll:.5f})", flush=True)

    # Best EMA OOF
    ckpt = torch.load(best_path, map_location='cpu')
    ema_model = build_model().to(device)
    ema_model.load_state_dict(ckpt['state_dict'])
    ema_model.eval()
    val_loader = DataLoader(DogCatDataset(val_df, valid_aug), batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)
    preds = []
    with torch.no_grad():
        for imgs, _ in val_loader:
            imgs = imgs.to(device)
            with torch.amp.autocast('cuda', enabled=(device.type=='cuda')):
                p = torch.sigmoid(ema_model(imgs).squeeze(1))
            preds.append(p.float().cpu().numpy())
    preds = np.concatenate(preds)
    return val_idx, preds, best_ll

# Train 5 folds
print("==== Begin 5-fold training (EffB2-NS 320px) ====", flush=True)
oof = np.zeros(len(df), dtype=np.float32)
fold_scores = {}
total_start = time.time()
for f in range(5):
    print(f"==== Start Fold {f} ====", flush=True)
    val_idx, preds, best_ll = train_one_fold(f)
    oof[val_idx] = preds
    fold_scores[f] = best_ll
    print(f"[EffB2-NS] Fold {f} best val_logloss: {best_ll:.5f}", flush=True)

oof_ll = log_loss(df['label'].values, np.clip(oof, 1e-6, 1-1e-6))
print(f"EffB2-NS OOF log-loss (5 folds): {oof_ll:.5f}", flush=True)
pd.Series(fold_scores).to_csv('fold_scores_b2.csv')
pd.DataFrame({'filepath': df['filepath'], 'label': df['label'], 'oof': oof}).to_csv('oof_b2.csv', index=False)

# Inference: 5 EMA checkpoints + HFlip TTA
@torch.no_grad()
def load_fold(fold):
    m = build_model().to(device)
    ckpt = torch.load(f'model_b2_fold{fold}_ema.pt', map_location='cpu')
    m.load_state_dict(ckpt['state_dict']); m.eval()
    return m

models = {f: load_fold(f) for f in range(5)}
test_ds = DogCatDataset(test_df, valid_aug)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)

@torch.no_grad()
def predict_tta(models_dict, loader):
    out = []
    for imgs, ids in loader:
        imgs = imgs.to(device)
        with torch.amp.autocast('cuda', enabled=(device.type=='cuda')):
            p_sum = None
            for m in models_dict.values():
                p = torch.sigmoid(m(imgs).squeeze(1))
                p_sum = p if p_sum is None else p_sum + p
            imgs_f = torch.flip(imgs, dims=[3])
            for m in models_dict.values():
                p_sum += torch.sigmoid(m(imgs_f).squeeze(1))
        p_avg = (p_sum / (len(models_dict)*2)).float().cpu().numpy()
        out.append(p_avg)
    return np.concatenate(out)

print("Predicting test with 5-fold EMA ensemble (EffB2-NS 320px)...", flush=True)
test_probs = predict_tta(models, test_loader)
test_ids = test_df['filepath'].apply(lambda p: int(Path(p).stem)).values
sub = pd.DataFrame({'id': test_ids, 'label': np.clip(test_probs, 1e-6, 1-1e-6)})
sub = sub.sort_values('id').reset_index(drop=True)
sub.to_csv('submission.csv', index=False)
print('Saved submission.csv (EffB2-NS 320). Head:\n', sub.head(), flush=True)
print(f"Total elapsed: {(time.time()-total_start)/60:.1f} min", flush=True)

gc.collect(); torch.cuda.empty_cache() if torch.cuda.is_available() else None

In [None]:
# Plan D: EfficientNet-B0.ra_in1k @320, 5-fold, EMA, Cosine LR, LS=0.05, MixUp=0.1, AMP, fixed OOF, HFlip TTA (cached weights)
import os, time, gc, random
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import timm
from timm.utils import ModelEmaV2
from sklearn.metrics import log_loss
from PIL import Image
import torchvision.transforms as T

print("[Plan D] Starting cell: efficientnet_b0.ra_in1k @320 (cached)", flush=True)

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED); torch.cuda.manual_seed_all(SEED)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.backends.cudnn.benchmark = True
os.environ['PYTHONUNBUFFERED'] = '1'

# Cache
LOCAL_CACHE = Path.cwd() / 'model_cache'
LOCAL_CACHE.mkdir(parents=True, exist_ok=True)
os.environ['HF_HOME'] = str(LOCAL_CACHE)
os.environ['HUGGINGFACE_HUB_CACHE'] = str(LOCAL_CACHE)
os.environ['XDG_CACHE_HOME'] = str(LOCAL_CACHE)
os.environ['TORCH_HOME'] = str(LOCAL_CACHE)
os.environ['TIMM_HOME'] = str(LOCAL_CACHE)
os.environ['HF_HUB_OFFLINE'] = '1'

# Config
MODEL_NAME = 'efficientnet_b0.ra_in1k'
IMG_SIZE = 320
BATCH_SIZE = 64
EPOCHS = 10
LR = 3e-4
WEIGHT_DECAY = 1e-2
LABEL_SMOOTH = 0.05
MIXUP_ALPHA = 0.1
EMA_DECAY = 0.9997
WARMUP_EPOCHS = 1
NUM_WORKERS = 0

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD  = (0.229, 0.224, 0.225)

df = pd.read_csv('train_folds.csv')
test_df = pd.read_csv('test_files.csv')

class DogCatDataset(Dataset):
    def __init__(self, df, aug):
        self.filepaths = df['filepath'].values
        self.labels = df['label'].values if 'label' in df.columns else None
        self.aug = aug
    def __len__(self): return len(self.filepaths)
    def __getitem__(self, idx):
        fp = self.filepaths[idx]
        img = Image.open(fp).convert('RGB')
        if self.aug is not None:
            img = self.aug(img)
        if self.labels is None:
            return img, Path(fp).stem
        return img, np.float32(self.labels[idx])

train_aug = T.Compose([
    T.RandomResizedCrop(IMG_SIZE, scale=(0.7, 1.0), ratio=(0.75, 1.3333)),
    T.RandomHorizontalFlip(p=0.5),
    T.ToTensor(),
    T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])
valid_aug = T.Compose([
    T.Resize(IMG_SIZE),
    T.CenterCrop(IMG_SIZE),
    T.ToTensor(),
    T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

def build_model():
    return timm.create_model(MODEL_NAME, pretrained=True, num_classes=1, cache_dir=str(LOCAL_CACHE))

# Pre-init (cached)
print(f"Pre-initializing '{MODEL_NAME}' (cached)...", flush=True)
try:
    _m = build_model()
    del _m
    print("Model pre-init OK.", flush=True)
except Exception as e:
    print(f"Model pre-init failed: {e}", flush=True)

def mixup_batch(x, y, alpha):
    if alpha is None or alpha <= 0:
        return x, y
    lam = np.random.beta(alpha, alpha)
    idx = torch.randperm(x.size(0), device=x.device)
    x_m = lam * x + (1 - lam) * x[idx]
    y = y.view(-1, 1)
    y_m = lam * y + (1 - lam) * y[idx]
    return x_m, y_m.squeeze(1)

def train_one_fold(fold):
    print(f"\n[EffB0-RA] Fold {fold} @ {IMG_SIZE}px", flush=True)
    trn_idx = df.index[df.fold != fold].values
    val_idx = df.index[df.fold == fold].values
    trn_df = df.loc[trn_idx]
    val_df = df.loc[val_idx]
    trn_loader = DataLoader(DogCatDataset(trn_df, train_aug), batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, pin_memory=True, drop_last=True)
    val_loader = DataLoader(DogCatDataset(val_df, valid_aug), batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)

    model = build_model().to(device)
    ema = ModelEmaV2(model, decay=EMA_DECAY, device=device if device.type=='cuda' else None)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS - WARMUP_EPOCHS)
    scaler = torch.amp.GradScaler('cuda', enabled=(device.type=='cuda'))
    criterion = nn.BCEWithLogitsLoss(reduction='none')

    best_ll = 1e9
    best_path = f'model_b0_fold{fold}_ema.pt'
    mixup_enabled = True if (MIXUP_ALPHA is not None and MIXUP_ALPHA > 0) else False

    for epoch in range(1, EPOCHS+1):
        model.train()
        t0 = time.time(); run_loss = 0.0; n = 0
        if epoch <= WARMUP_EPOCHS:
            for pg in optimizer.param_groups:
                pg['lr'] = LR * epoch / max(1, WARMUP_EPOCHS)
        else:
            scheduler.step()

        for step, (imgs, labels) in enumerate(trn_loader):
            imgs = imgs.to(device)
            labels = labels.to(device)
            targets = labels * (1 - LABEL_SMOOTH) + (1 - labels) * LABEL_SMOOTH
            if mixup_enabled:
                imgs, targets = mixup_batch(imgs, targets, alpha=MIXUP_ALPHA)
            optimizer.zero_grad(set_to_none=True)
            with torch.amp.autocast('cuda', enabled=(device.type=='cuda')):
                logits = model(imgs).squeeze(1)
                loss = criterion(logits, targets).mean()
            scaler.scale(loss).backward()
            scaler.step(optimizer); scaler.update()
            ema.update(model)
            run_loss += loss.item() * imgs.size(0); n += imgs.size(0)
            if (step+1) % 100 == 0:
                cur_lr = optimizer.param_groups[0]['lr']
                print(f"Fold {fold} Ep{epoch} {step+1}/{len(trn_loader)} loss {run_loss/max(1,n):.4f} lr {cur_lr:.2e}", flush=True)

        if epoch == 1 and mixup_enabled and (run_loss/max(1,n) > 0.69):
            mixup_enabled = False
            print("  Disabling MixUp due to unstable first-epoch loss.", flush=True)

        model.eval()
        val_probs = []; val_targets = []
        with torch.no_grad():
            for imgs, labels in val_loader:
                imgs = imgs.to(device)
                with torch.amp.autocast('cuda', enabled=(device.type=='cuda')):
                    logits = ema.module(imgs).squeeze(1)
                    probs = torch.sigmoid(logits)
                val_probs.append(probs.float().cpu().numpy())
                val_targets.append(labels.numpy())
        val_probs = np.concatenate(val_probs); val_targets = np.concatenate(val_targets)
        ll = log_loss(val_targets, np.clip(val_probs, 1e-6, 1-1e-6))
        print(f"Epoch {epoch}: tr_loss={run_loss/max(1,n):.4f} val_logloss={ll:.5f} time={time.time()-t0:.1f}s", flush=True)
        if ll < best_ll:
            best_ll = ll
            torch.save({'state_dict': ema.module.state_dict(), 'val_logloss': best_ll}, best_path)
            print(f"  Saved EMA -> {best_path} ({best_ll:.5f})", flush=True)

    ckpt = torch.load(best_path, map_location='cpu')
    ema_model = build_model().to(device)
    ema_model.load_state_dict(ckpt['state_dict'])
    ema_model.eval()
    val_loader = DataLoader(DogCatDataset(val_df, valid_aug), batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)
    preds = []
    with torch.no_grad():
        for imgs, _ in val_loader:
            imgs = imgs.to(device)
            with torch.amp.autocast('cuda', enabled=(device.type=='cuda')):
                p = torch.sigmoid(ema_model(imgs).squeeze(1))
            preds.append(p.float().cpu().numpy())
    preds = np.concatenate(preds)
    return val_idx, preds, best_ll

print("==== Begin 5-fold training (EffB0-RA 320px) ====", flush=True)
oof = np.zeros(len(df), dtype=np.float32)
fold_scores = {}
total_start = time.time()
for f in range(5):
    print(f"==== Start Fold {f} ====", flush=True)
    val_idx, preds, best_ll = train_one_fold(f)
    oof[val_idx] = preds
    fold_scores[f] = best_ll
    print(f"[EffB0-RA] Fold {f} best val_logloss: {best_ll:.5f}", flush=True)

oof_ll = log_loss(df['label'].values, np.clip(oof, 1e-6, 1-1e-6))
print(f"EffB0-RA OOF log-loss (5 folds): {oof_ll:.5f}", flush=True)
pd.Series(fold_scores).to_csv('fold_scores_b0_ra.csv')
pd.DataFrame({'filepath': df['filepath'], 'label': df['label'], 'oof': oof}).to_csv('oof_b0_ra.csv', index=False)

@torch.no_grad()
def load_fold(fold):
    m = build_model().to(device)
    ckpt = torch.load(f'model_b0_fold{fold}_ema.pt', map_location='cpu')
    m.load_state_dict(ckpt['state_dict']); m.eval()
    return m

models = {f: load_fold(f) for f in range(5)}
test_ds = DogCatDataset(test_df, valid_aug)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)

@torch.no_grad()
def predict_tta(models_dict, loader):
    out = []
    for imgs, ids in loader:
        imgs = imgs.to(device)
        with torch.amp.autocast('cuda', enabled=(device.type=='cuda')):
            p_sum = None
            for m in models_dict.values():
                p = torch.sigmoid(m(imgs).squeeze(1))
                p_sum = p if p_sum is None else p_sum + p
            imgs_f = torch.flip(imgs, dims=[3])
            for m in models_dict.values():
                p_sum += torch.sigmoid(m(imgs_f).squeeze(1))
        p_avg = (p_sum / (len(models_dict)*2)).float().cpu().numpy()
        out.append(p_avg)
    return np.concatenate(out)

print("Predicting test with 5-fold EMA ensemble (EffB0-RA 320px)...", flush=True)
test_probs = predict_tta(models, test_loader)
test_ids = test_df['filepath'].apply(lambda p: int(Path(p).stem)).values
sub = pd.DataFrame({'id': test_ids, 'label': np.clip(test_probs, 1e-6, 1-1e-6)})
sub = sub.sort_values('id').reset_index(drop=True)
sub.to_csv('submission.csv', index=False)
print('Saved submission.csv (EffB0-RA 320). Head:\n', sub.head(), flush=True)
print(f"Total elapsed: {(time.time()-total_start)/60:.1f} min", flush=True)

gc.collect(); torch.cuda.empty_cache() if torch.cuda.is_available() else None

In [None]:
# Debug Plan D imports/model init (cached, offline) to pinpoint hang
import os, time
from pathlib import Path
print("[DEBUG D] start", flush=True)
LOCAL_CACHE = Path.cwd() / 'model_cache'
os.environ['HF_HOME'] = str(LOCAL_CACHE)
os.environ['HUGGINGFACE_HUB_CACHE'] = str(LOCAL_CACHE)
os.environ['XDG_CACHE_HOME'] = str(LOCAL_CACHE)
os.environ['TORCH_HOME'] = str(LOCAL_CACHE)
os.environ['TIMM_HOME'] = str(LOCAL_CACHE)
os.environ['HF_HUB_OFFLINE'] = '1'
t0 = time.time()
print("[DEBUG D] importing torch,timm...", flush=True)
import torch, timm
print("[DEBUG D] torch", torch.__version__, "cuda", torch.cuda.is_available(), flush=True)
print("[DEBUG D] timm", timm.__version__, flush=True)
print("[DEBUG D] building efficientnet_b0.ra_in1k pretrained=True from cache...", flush=True)
m = timm.create_model('efficientnet_b0.ra_in1k', pretrained=True, num_classes=1, cache_dir=str(LOCAL_CACHE))
n_params = sum(p.numel() for p in m.parameters())
print(f"[DEBUG D] model ok, params={n_params}, elapsed={time.time()-t0:.2f}s", flush=True)
del m
print("[DEBUG D] end", flush=True)

In [None]:
# Debug 2: minimal timm usage without accessing __version__; test pretrained=False then True (cached)
import os, time
from pathlib import Path
print('[DEBUG D2] start', flush=True)
LOCAL_CACHE = Path.cwd() / 'model_cache'
os.environ['HF_HOME'] = str(LOCAL_CACHE)
os.environ['HUGGINGFACE_HUB_CACHE'] = str(LOCAL_CACHE)
os.environ['XDG_CACHE_HOME'] = str(LOCAL_CACHE)
os.environ['TORCH_HOME'] = str(LOCAL_CACHE)
os.environ['TIMM_HOME'] = str(LOCAL_CACHE)
os.environ['HF_HUB_OFFLINE'] = '1'
t0 = time.time()
import torch, timm
print('[DEBUG D2] imports ok; cuda:', torch.cuda.is_available(), flush=True)
print('[DEBUG D2] creating model pretrained=False...', flush=True)
m = timm.create_model('efficientnet_b0.ra_in1k', pretrained=False, num_classes=1, scriptable=False)
n_params = sum(p.numel() for p in m.parameters())
print(f'[DEBUG D2] model (pretrained=False) ok, params={n_params}', flush=True)
del m
print('[DEBUG D2] creating model pretrained=True (from cache)...', flush=True)
t1 = time.time()
m2 = timm.create_model('efficientnet_b0.ra_in1k', pretrained=True, num_classes=1, cache_dir=str(LOCAL_CACHE), scriptable=False)
print(f'[DEBUG D2] pretrained=True model ok, elapsed={time.time()-t1:.2f}s total={time.time()-t0:.2f}s', flush=True)
del m2
print('[DEBUG D2] end', flush=True)

In [8]:
# Plan E (Fixes): EffNet-B0.ra_in1k with timm default transforms (bicubic), no MixUp, low LS, optional EMA vs raw eval, drop_rate=0.2
import os, time, gc, random, math
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import log_loss
from PIL import Image

print('[Plan E] Starting: EffB0-RA fixed preprocessing @384, MixUp OFF, LS low, EMA vs RAW', flush=True)

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED); torch.cuda.manual_seed_all(SEED)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.backends.cudnn.benchmark = True
os.environ['PYTHONUNBUFFERED'] = '1'

# Cache / Offline
LOCAL_CACHE = Path.cwd() / 'model_cache'; LOCAL_CACHE.mkdir(parents=True, exist_ok=True)
os.environ['HF_HOME'] = str(LOCAL_CACHE)
os.environ['HUGGINGFACE_HUB_CACHE'] = str(LOCAL_CACHE)
os.environ['XDG_CACHE_HOME'] = str(LOCAL_CACHE)
os.environ['TORCH_HOME'] = str(LOCAL_CACHE)
os.environ['TIMM_HOME'] = str(LOCAL_CACHE)
os.environ['HF_HUB_OFFLINE'] = '1'

# Config per expert guidance
MODEL_NAME = 'efficientnet_b0.ra_in1k'
IMG_SIZE = 384
BATCH_SIZE = 32
EPOCHS = 18
LR = 2e-4
WEIGHT_DECAY = 5e-3
LABEL_SMOOTH = 0.02  # 0.0..0.05 recommended
MIXUP_ALPHA = 0.0     # OFF
EMA_DECAY = 0.9997
WARMUP_EPOCHS = 1
NUM_WORKERS = 0
DROP_RATE = 0.2

df = pd.read_csv('train_folds.csv')
test_df = pd.read_csv('test_files.csv')

class DogCatDataset(Dataset):
    def __init__(self, df, transform):
        self.filepaths = df['filepath'].values
        self.labels = df['label'].values if 'label' in df.columns else None
        self.transform = transform
    def __len__(self):
        return len(self.filepaths)
    def __getitem__(self, idx):
        fp = self.filepaths[idx]
        img = Image.open(fp).convert('RGB')
        img = self.transform(img) if self.transform is not None else img
        if self.labels is None:
            return img, Path(fp).stem
        return img, np.float32(self.labels[idx])

def build_model():
    import timm
    # drop_rate for head regularization; num_classes=1 for BCE
    m = timm.create_model(MODEL_NAME, pretrained=True, num_classes=1, cache_dir=str(LOCAL_CACHE), drop_rate=DROP_RATE)
    return m

def build_transforms():
    # Use timm default cfg (ensures bicubic, mean/std, crop_pct, etc.)
    import timm
    from timm.data import resolve_data_config, create_transform
    tmp = timm.create_model(MODEL_NAME, pretrained=False, num_classes=1)
    cfg = resolve_data_config({}, model=tmp)  # start from model defaults
    # Override input size to requested IMG_SIZE while keeping other defaults
    cfg['input_size'] = (3, IMG_SIZE, IMG_SIZE)
    tfm_train = create_transform(is_training=True, **cfg)
    tfm_valid = create_transform(is_training=False, **cfg)
    del tmp
    return tfm_train, tfm_valid

@torch.no_grad()
def evaluate_model(model, loader):
    model.eval()
    probs_all, targs_all = [], []
    for imgs, labels in loader:
        imgs = imgs.to(device)
        with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):
            logits = model(imgs).squeeze(1)
            probs = torch.sigmoid(logits)
        probs_all.append(probs.float().cpu().numpy())
        targs_all.append(labels.numpy())
    probs_all = np.concatenate(probs_all)
    targs_all = np.concatenate(targs_all)
    ll = log_loss(targs_all, np.clip(probs_all, 1e-6, 1-1e-6))
    return ll, probs_all

def train_one_fold(fold, tfm_train, tfm_valid):
    print(f"\n[Plan E] Fold {fold} @ {IMG_SIZE}px", flush=True)
    from timm.utils import ModelEmaV2  # lazy import to avoid early timm import
    trn_df = df[df.fold != fold].reset_index(drop=True)
    val_df = df[df.fold == fold].reset_index(drop=True)
    trn_loader = DataLoader(DogCatDataset(trn_df, tfm_train), batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, pin_memory=True, drop_last=True)
    val_loader = DataLoader(DogCatDataset(val_df, tfm_valid), batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)

    model = build_model().to(device)
    ema = ModelEmaV2(model, decay=EMA_DECAY, device=device if device.type=='cuda' else None)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
    scaler = torch.cuda.amp.GradScaler(enabled=(device.type=='cuda'))
    criterion = nn.BCEWithLogitsLoss(reduction='none')

    # Cosine with warmup
    def lr_lambda(epoch):
        if epoch < WARMUP_EPOCHS:
            return float(epoch + 1) / float(max(1, WARMUP_EPOCHS))
        progress = (epoch - WARMUP_EPOCHS) / float(max(1, EPOCHS - WARMUP_EPOCHS))
        return 0.5 * (1.0 + math.cos(math.pi * progress))
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda)

    best_ll = 1e9
    best_variant = 'raw'
    best_path = f'model_b0e_fold{fold}.pt'

    for epoch in range(EPOCHS):
        model.train()
        run_loss, n = 0.0, 0
        t0 = time.time()
        for step, (imgs, labels) in enumerate(trn_loader):
            imgs = imgs.to(device)
            labels = labels.to(device)
            # label smoothing
            targets = labels * (1.0 - LABEL_SMOOTH) + (1.0 - labels) * LABEL_SMOOTH
            optimizer.zero_grad(set_to_none=True)
            with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):
                logits = model(imgs).squeeze(1)
                loss = criterion(logits, targets).mean()
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            ema.update(model)
            run_loss += loss.item() * imgs.size(0)
            n += imgs.size(0)
            if (step + 1) % 100 == 0:
                cur_lr = optimizer.param_groups[0]['lr']
                print(f"Fold {fold} Ep{epoch+1} {step+1}/{len(trn_loader)} loss {run_loss/max(1,n):.4f} lr {cur_lr:.2e}", flush=True)
        scheduler.step()

        # Validate RAW
        ll_raw, _ = evaluate_model(model, val_loader)
        # Validate EMA
        ll_ema, _ = evaluate_model(ema.module, val_loader)
        cur_ll = min(ll_raw, ll_ema)
        variant = 'raw' if ll_raw <= ll_ema else 'ema'
        print(f"Epoch {epoch+1}: tr_loss={run_loss/max(1,n):.4f} val_raw={ll_raw:.5f} val_ema={ll_ema:.5f} -> best_this_epoch={cur_ll:.5f} ({variant}) time={time.time()-t0:.1f}s", flush=True)
        if cur_ll < best_ll:
            best_ll = cur_ll
            best_variant = variant
            if variant == 'raw':
                torch.save({'state_dict': model.state_dict(), 'val_logloss': best_ll, 'variant': 'raw'}, best_path)
            else:
                torch.save({'state_dict': ema.module.state_dict(), 'val_logloss': best_ll, 'variant': 'ema'}, best_path)
            print(f"  Saved best ({best_variant}) -> {best_path} ({best_ll:.5f})", flush=True)

    # Load best and compute OOF
    ckpt = torch.load(best_path, map_location='cpu')
    best_model = build_model().to(device)
    best_model.load_state_dict(ckpt['state_dict'])
    best_model.eval()
    val_loader = DataLoader(DogCatDataset(val_df, tfm_valid), batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)
    _, oof_probs = evaluate_model(best_model, val_loader)
    return (df[df.fold == fold].index.values), oof_probs, best_ll

# Build transforms once (fixed to model defaults with IMG_SIZE override)
print('[Plan E] Building transforms with timm default_cfg (bicubic, mean/std, crop_pct)...', flush=True)
tfm_train, tfm_valid = build_transforms()
print('[Plan E] Transforms built. Starting training...', flush=True)

# Train 5 folds
print('==== Begin 5-fold training (EffB0-RA 384px, fixed) ====', flush=True)
oof = np.zeros(len(df), dtype=np.float32)
fold_scores = {}
total_start = time.time()
for f in range(5):
    print(f'==== Start Fold {f} ====')
    val_idx, preds, best_ll = train_one_fold(f, tfm_train, tfm_valid)
    oof[val_idx] = preds  # correct OOF indexing
    fold_scores[f] = best_ll
    print(f"[Plan E] Fold {f} best val_logloss: {best_ll:.5f}", flush=True)

oof_ll = log_loss(df['label'].values, np.clip(oof, 1e-6, 1-1e-6))
print(f"Plan E OOF log-loss (5 folds): {oof_ll:.5f}", flush=True)
pd.Series(fold_scores).to_csv('fold_scores_b0_e.csv')
pd.DataFrame({'filepath': df['filepath'], 'label': df['label'], 'oof': oof}).to_csv('oof_b0_e.csv', index=False)

# Inference with best checkpoints + simple HFlip TTA
@torch.no_grad()
def load_best_fold(fold):
    m = build_model().to(device)
    ckpt = torch.load(f'model_b0e_fold{fold}.pt', map_location='cpu')
    m.load_state_dict(ckpt['state_dict']); m.eval()
    return m

models = {f: load_best_fold(f) for f in range(5)}
test_tfm = tfm_valid  # use validation (center-crop) transform
test_ds = DogCatDataset(test_df, test_tfm)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)

@torch.no_grad()
def predict_tta(models_dict, loader):
    out = []
    for imgs, ids in loader:
        imgs = imgs.to(device)
        with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):
            p_sum = None
            for m in models_dict.values():
                p = torch.sigmoid(m(imgs).squeeze(1))
                p_sum = p if p_sum is None else p_sum + p
            imgs_f = torch.flip(imgs, dims=[3])
            for m in models_dict.values():
                p_sum += torch.sigmoid(m(imgs_f).squeeze(1))
        p_avg = (p_sum / (len(models_dict)*2)).float().cpu().numpy()
        out.append(p_avg)
    return np.concatenate(out)

print('Predicting test with 5-fold best (raw/ema) + HFlip TTA...', flush=True)
test_probs = predict_tta(models, test_loader)
test_ids = test_df['filepath'].apply(lambda p: int(Path(p).stem)).values
sub = pd.DataFrame({'id': test_ids, 'label': np.clip(test_probs, 1e-6, 1-1e-6)})
sub = sub.sort_values('id').reset_index(drop=True)
sub.to_csv('submission.csv', index=False)
print('Saved submission.csv (Plan E). Head:\n', sub.head(), flush=True)
print(f'Total elapsed: {(time.time()-total_start)/60:.1f} min', flush=True)

gc.collect(); torch.cuda.empty_cache() if torch.cuda.is_available() else None

In [1]:
# Sanity check: verify kernel stdout responsiveness (no heavy imports)
import time
print('[SANITY] Kernel alive and printing. Timestamp:', time.time(), flush=True)

[SANITY] Kernel alive and printing. Timestamp: 1757331351.3687549


In [4]:
# Debug 3: verify timm.data transforms creation (bicubic cfg) works offline
import os, time
from pathlib import Path
print('[DEBUG TFM] start', flush=True)
LOCAL_CACHE = Path.cwd() / 'model_cache'
os.environ['HF_HOME'] = str(LOCAL_CACHE)
os.environ['HUGGINGFACE_HUB_CACHE'] = str(LOCAL_CACHE)
os.environ['XDG_CACHE_HOME'] = str(LOCAL_CACHE)
os.environ['TORCH_HOME'] = str(LOCAL_CACHE)
os.environ['TIMM_HOME'] = str(LOCAL_CACHE)
os.environ['HF_HUB_OFFLINE'] = '1'
import torch, timm
from timm.data import resolve_data_config, create_transform
MODEL_NAME = 'efficientnet_b0.ra_in1k'; IMG_SIZE = 384
print('[DEBUG TFM] imports ok; cuda:', torch.cuda.is_available(), flush=True)
print('[DEBUG TFM] creating tmp model (pretrained=False)...', flush=True)
tmp = timm.create_model(MODEL_NAME, pretrained=False, num_classes=1)
cfg = resolve_data_config({}, model=tmp)
print('[DEBUG TFM] cfg interp before:', cfg.get('interpolation'), 'input_size before:', cfg.get('input_size'), flush=True)
cfg['input_size'] = (3, IMG_SIZE, IMG_SIZE)
tfm_train = create_transform(is_training=True, **cfg)
tfm_valid = create_transform(is_training=False, **cfg)
del tmp
print('[DEBUG TFM] transforms created OK; interp:', cfg.get('interpolation'), 'input_size:', cfg.get('input_size'), flush=True)

[DEBUG TFM] start


[DEBUG TFM] imports ok; cuda: True


[DEBUG TFM] creating tmp model (pretrained=False)...


[DEBUG TFM] cfg interp before: bicubic input_size before: (3, 224, 224)


[DEBUG TFM] transforms created OK; interp: bicubic input_size: (3, 384, 384)


In [6]:
# Plan E-mini: run a quick Fold 0 sanity (5 epochs) with fixed preprocessing to verify val_logloss trajectory
import os, time, math, random, gc
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import log_loss
from PIL import Image

print('[Plan E-mini] Start Fold 0 sanity run', flush=True)
SEED=42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED); torch.cuda.manual_seed_all(SEED)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.backends.cudnn.benchmark = True

LOCAL_CACHE = Path.cwd() / 'model_cache'; LOCAL_CACHE.mkdir(parents=True, exist_ok=True)
os.environ['HF_HOME'] = str(LOCAL_CACHE)
os.environ['HUGGINGFACE_HUB_CACHE'] = str(LOCAL_CACHE)
os.environ['XDG_CACHE_HOME'] = str(LOCAL_CACHE)
os.environ['TORCH_HOME'] = str(LOCAL_CACHE)
os.environ['TIMM_HOME'] = str(LOCAL_CACHE)
os.environ['HF_HUB_OFFLINE'] = '1'

MODEL_NAME = 'efficientnet_b0.ra_in1k'
IMG_SIZE = 384
BATCH_SIZE = 32
EPOCHS = 5
LR = 2e-4
WEIGHT_DECAY = 5e-3
LABEL_SMOOTH = 0.02
EMA_DECAY = 0.9997
NUM_WORKERS = 0
DROP_RATE = 0.2

df = pd.read_csv('train_folds.csv')

class DogCatDataset(Dataset):
    def __init__(self, df, transform):
        self.filepaths = df['filepath'].values
        self.labels = df['label'].values if 'label' in df.columns else None
        self.transform = transform
    def __len__(self): return len(self.filepaths)
    def __getitem__(self, idx):
        fp = self.filepaths[idx]
        img = Image.open(fp).convert('RGB')
        img = self.transform(img) if self.transform is not None else img
        if self.labels is None: return img, Path(fp).stem
        return img, np.float32(self.labels[idx])

print('[Plan E-mini] Building transforms (timm default cfg, bicubic)...', flush=True)
import timm
from timm.data import resolve_data_config, create_transform
tmp = timm.create_model(MODEL_NAME, pretrained=False, num_classes=1)
cfg = resolve_data_config({}, model=tmp)
cfg['input_size'] = (3, IMG_SIZE, IMG_SIZE)
tfm_train = create_transform(is_training=True, **cfg)
tfm_valid = create_transform(is_training=False, **cfg)
del tmp
print('[Plan E-mini] Transforms ready', flush=True)

def build_model():
    import timm
    return timm.create_model(MODEL_NAME, pretrained=True, num_classes=1, cache_dir=str(LOCAL_CACHE), drop_rate=DROP_RATE)

@torch.no_grad()
def evaluate_model(model, loader):
    model.eval()
    probs_all, targs_all = [], []
    for imgs, labels in loader:
        imgs = imgs.to(device)
        with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):
            logits = model(imgs).squeeze(1)
            probs = torch.sigmoid(logits)
        probs_all.append(probs.float().cpu().numpy())
        targs_all.append(labels.numpy())
    probs_all = np.concatenate(probs_all)
    targs_all = np.concatenate(targs_all)
    ll = log_loss(targs_all, np.clip(probs_all, 1e-6, 1-1e-6))
    return ll

from timm.utils import ModelEmaV2
fold = 0
trn_df = df[df.fold != fold].reset_index(drop=True)
val_df = df[df.fold == fold].reset_index(drop=True)
trn_loader = DataLoader(DogCatDataset(trn_df, tfm_train), batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, pin_memory=True, drop_last=True)
val_loader = DataLoader(DogCatDataset(val_df, tfm_valid), batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)

model = build_model().to(device)
ema = ModelEmaV2(model, decay=EMA_DECAY, device=device if device.type=='cuda' else None)
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
scaler = torch.cuda.amp.GradScaler(enabled=(device.type=='cuda'))
criterion = nn.BCEWithLogitsLoss(reduction='none')

def lr_lambda(epoch):
    if epoch < 1: return float(epoch + 1)
    progress = (epoch - 1) / float(max(1, EPOCHS - 1))
    return 0.5 * (1.0 + math.cos(math.pi * progress))
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda)

best_ll = 1e9; best_variant='raw'
print(f"[Plan E-mini] Training Fold {fold} for {EPOCHS} epochs...", flush=True)
for epoch in range(EPOCHS):
    model.train()
    run_loss, n = 0.0, 0
    t0 = time.time()
    for step, (imgs, labels) in enumerate(trn_loader):
        imgs = imgs.to(device); labels = labels.to(device)
        targets = labels * (1.0 - LABEL_SMOOTH) + (1.0 - labels) * LABEL_SMOOTH
        optimizer.zero_grad(set_to_none=True)
        with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):
            logits = model(imgs).squeeze(1)
            loss = criterion(logits, targets).mean()
        scaler.scale(loss).backward()
        scaler.step(optimizer); scaler.update()
        ema.update(model)
        run_loss += loss.item() * imgs.size(0); n += imgs.size(0)
        if (step+1) % 100 == 0:
            print(f"Ep{epoch+1} {step+1}/{len(trn_loader)} loss {run_loss/max(1,n):.4f}", flush=True)
    scheduler.step()
    # Validate RAW vs EMA
    ll_raw = evaluate_model(model, val_loader)
    ll_ema = evaluate_model(ema.module, val_loader)
    cur_ll = min(ll_raw, ll_ema)
    variant = 'raw' if ll_raw <= ll_ema else 'ema'
    print(f"Epoch {epoch+1}: tr_loss={run_loss/max(1,n):.4f} val_raw={ll_raw:.5f} val_ema={ll_ema:.5f} -> {cur_ll:.5f} ({variant}) time={time.time()-t0:.1f}s", flush=True)
    best_ll = min(best_ll, cur_ll)

print(f"[Plan E-mini] Fold {fold} best val_logloss (5 epochs): {best_ll:.5f}", flush=True)
gc.collect(); torch.cuda.empty_cache() if torch.cuda.is_available() else None

[Plan E-mini] Start Fold 0 sanity run


[Plan E-mini] Building transforms (timm default cfg, bicubic)...


[Plan E-mini] Transforms ready


[Plan E-mini] Training Fold 0 for 5 epochs...


  scaler = torch.cuda.amp.GradScaler(enabled=(device.type=='cuda'))


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


Ep1 100/562 loss 0.8230


Ep1 200/562 loss 0.6540


Ep1 300/562 loss 0.5712


Ep1 400/562 loss 0.5216


Ep1 500/562 loss 0.4874


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


Epoch 1: tr_loss=0.4723 val_raw=0.05012 val_ema=1.58407 -> 0.05012 (raw) time=458.8s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


Ep2 100/562 loss 0.3138


Ep2 200/562 loss 0.3059


Ep2 300/562 loss 0.3011


Ep2 400/562 loss 0.2894


Ep2 500/562 loss 0.2858


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


Epoch 2: tr_loss=0.2828 val_raw=0.03360 val_ema=0.54930 -> 0.03360 (raw) time=397.7s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


Ep3 100/562 loss 0.2528


Ep3 200/562 loss 0.2500


Ep3 300/562 loss 0.2400


Ep3 400/562 loss 0.2373


Ep3 500/562 loss 0.2365


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


Epoch 3: tr_loss=0.2339 val_raw=0.05277 val_ema=0.17513 -> 0.05277 (raw) time=397.5s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


Ep4 100/562 loss 0.2044


Ep4 200/562 loss 0.2064


Ep4 300/562 loss 0.2050


Ep4 400/562 loss 0.2002


Ep4 500/562 loss 0.2009


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


Epoch 4: tr_loss=0.2002 val_raw=0.03691 val_ema=0.07385 -> 0.03691 (raw) time=398.0s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


Ep5 100/562 loss 0.1857


Ep5 200/562 loss 0.1819


Ep5 300/562 loss 0.1828


Ep5 400/562 loss 0.1836


Ep5 500/562 loss 0.1826


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


Epoch 5: tr_loss=0.1823 val_raw=0.03505 val_ema=0.04446 -> 0.03505 (raw) time=397.7s


[Plan E-mini] Fold 0 best val_logloss (5 epochs): 0.03360


In [10]:
# Temperature scaling for Plan E (EffB0-RA @384); optimize T on OOF and apply to test
import os, math, time, gc
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import log_loss
from PIL import Image

print('[TempScale] Starting temperature scaling for Plan E...', flush=True)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.backends.cudnn.benchmark = True

# Constants (must match Plan E)
MODEL_NAME = 'efficientnet_b0.ra_in1k'
IMG_SIZE = 384
BATCH_SIZE = 32
NUM_WORKERS = 0
DROP_RATE = 0.2
LOCAL_CACHE = Path.cwd() / 'model_cache'; LOCAL_CACHE.mkdir(parents=True, exist_ok=True)
os.environ['HF_HOME'] = str(LOCAL_CACHE)
os.environ['HUGGINGFACE_HUB_CACHE'] = str(LOCAL_CACHE)
os.environ['XDG_CACHE_HOME'] = str(LOCAL_CACHE)
os.environ['TORCH_HOME'] = str(LOCAL_CACHE)
os.environ['TIMM_HOME'] = str(LOCAL_CACHE)
os.environ['HF_HUB_OFFLINE'] = '1'

def build_model():
    import timm
    m = timm.create_model(MODEL_NAME, pretrained=True, num_classes=1, cache_dir=str(LOCAL_CACHE), drop_rate=DROP_RATE)
    return m

def build_transforms():
    import timm
    from timm.data import resolve_data_config, create_transform
    tmp = timm.create_model(MODEL_NAME, pretrained=False, num_classes=1)
    cfg = resolve_data_config({}, model=tmp)
    cfg['input_size'] = (3, IMG_SIZE, IMG_SIZE)
    tfm_valid = create_transform(is_training=False, **cfg)
    del tmp
    return tfm_valid

class ImageDataset(Dataset):
    def __init__(self, df, transform):
        self.filepaths = df['filepath'].values
        self.transform = transform
    def __len__(self): return len(self.filepaths)
    def __getitem__(self, idx):
        fp = self.filepaths[idx]
        img = Image.open(fp).convert('RGB')
        img = self.transform(img) if self.transform is not None else img
        return img, Path(fp).stem

# 1) Load OOF and optimize T
oof_path = Path('oof_b0_e.csv')
assert oof_path.exists(), 'oof_b0_e.csv not found; run Plan E training first.'
oof_df = pd.read_csv(oof_path)
# Safety checks
assert len(oof_df) > 0, 'Empty OOF file'
assert oof_df[['label','oof']].notna().all().all(), 'NaNs found in OOF file'
y_true = oof_df['label'].values.astype(np.float64)
p = np.clip(oof_df['oof'].values.astype(np.float64), 1e-7, 1-1e-7)
logits = np.log(p/(1.0-p))  # float64 by construction
base_ll = log_loss(y_true, p)
print(f"[TempScale] Base OOF logloss={base_ll:.6f} (rows={len(oof_df)})", flush=True)

def loss_for_T(T):
    if T <= 1e-4 or T > 100.0:
        return np.inf
    ps = 1.0 / (1.0 + np.exp(-logits / T))
    ps = np.clip(ps, 1e-7, 1-1e-7)
    return log_loss(y_true, ps)

# Coarse-to-fine search on T (expanded range 0.3..5.0)
grid1 = np.exp(np.linspace(np.log(0.3), np.log(5.0), 401))
vals1 = [loss_for_T(t) for t in grid1]
best_idx = int(np.argmin(vals1)); T_best = float(grid1[best_idx]); ll_best = float(vals1[best_idx])
lo, hi = T_best*0.8, T_best*1.2
grid2 = np.exp(np.linspace(np.log(max(0.1, lo)), np.log(min(10.0, hi)), 201))
vals2 = [loss_for_T(t) for t in grid2]
best2 = int(np.argmin(vals2)); T_best = float(grid2[best2]); ll_best = float(vals2[best2])

print(f"[TempScale] Calibrated (T={T_best:.6f}) OOF logloss={ll_best:.6f}", flush=True)
with open('temperature.txt', 'w') as f: f.write(f"{T_best}\n")

# 2) Re-run test inference with temperature scaling applied to logits before sigmoid
test_df = pd.read_csv('test_files.csv')
tfm_valid = build_transforms()
test_ds = ImageDataset(test_df, tfm_valid)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)

@torch.no_grad()
def load_best_fold(fold):
    m = build_model().to(device)
    ckpt = torch.load(f'model_b0e_fold{fold}.pt', map_location='cpu')
    m.load_state_dict(ckpt['state_dict']); m.eval()
    return m

# Ensure all checkpoints exist before loading
for f in range(5):
    ck = Path(f'model_b0e_fold{f}.pt')
    assert ck.exists(), f'Missing checkpoint: {ck}'

models = {f: load_best_fold(f) for f in range(5)}

@torch.no_grad()
def predict_tta_temp(models_dict, loader, T_temp):
    all_probs = []
    for imgs, ids in loader:
        imgs = imgs.to(device)
        with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):
            probs_sum = None
            for m in models_dict.values():
                logits_b = m(imgs).squeeze(1)
                p_b = torch.sigmoid(logits_b / T_temp)
                probs_sum = p_b if probs_sum is None else probs_sum + p_b
            imgs_f = torch.flip(imgs, dims=[3])
            for m in models_dict.values():
                logits_f = m(imgs_f).squeeze(1)
                probs_sum = probs_sum + torch.sigmoid(logits_f / T_temp)
        probs_avg = (probs_sum / (len(models_dict)*2)).float().cpu().numpy()
        all_probs.append(probs_avg)
    return np.concatenate(all_probs)

print('[TempScale] Predicting test with temperature scaling...', flush=True)
t0 = time.time()
test_probs = predict_tta_temp(models, test_loader, T_best)
ids = test_df['filepath'].apply(lambda p: int(Path(p).stem)).values
sub = pd.DataFrame({'id': ids, 'label': np.clip(test_probs, 1e-7, 1-1e-7)})
sub = sub.sort_values('id').reset_index(drop=True)
sub.to_csv('submission.csv', index=False)
print(f"[TempScale] Saved submission.csv with T={T_best:.6f}. Inference time: {time.time()-t0:.1f}s", flush=True)
gc.collect(); torch.cuda.empty_cache() if torch.cuda.is_available() else None

In [12]:
# Plan E Finalization: Build OOF from saved checkpoints, temperature scale, and generate submission
import os, time, gc, math
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import log_loss
from PIL import Image

print('[Finalize] Starting OOF rebuild + TempScale + Submission for Plan E...', flush=True)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.backends.cudnn.benchmark = True

# Constants (match Plan E)
MODEL_NAME = 'efficientnet_b0.ra_in1k'
IMG_SIZE = 384
BATCH_SIZE = 32
NUM_WORKERS = 0
DROP_RATE = 0.2
LOCAL_CACHE = Path.cwd() / 'model_cache'; LOCAL_CACHE.mkdir(parents=True, exist_ok=True)
os.environ['HF_HOME'] = str(LOCAL_CACHE)
os.environ['HUGGINGFACE_HUB_CACHE'] = str(LOCAL_CACHE)
os.environ['XDG_CACHE_HOME'] = str(LOCAL_CACHE)
os.environ['TORCH_HOME'] = str(LOCAL_CACHE)
os.environ['TIMM_HOME'] = str(LOCAL_CACHE)
os.environ['HF_HUB_OFFLINE'] = '1'

df = pd.read_csv('train_folds.csv')
test_df = pd.read_csv('test_files.csv')

class DogCatDataset(Dataset):
    def __init__(self, df, transform):
        self.filepaths = df['filepath'].values
        self.labels = df['label'].values if 'label' in df.columns else None
        self.transform = transform
    def __len__(self): return len(self.filepaths)
    def __getitem__(self, idx):
        fp = self.filepaths[idx]
        img = Image.open(fp).convert('RGB')
        img = self.transform(img) if self.transform is not None else img
        if self.labels is None:
            return img, Path(fp).stem
        return img, np.float32(self.labels[idx])

def build_model():
    import timm
    return timm.create_model(MODEL_NAME, pretrained=True, num_classes=1, cache_dir=str(LOCAL_CACHE), drop_rate=DROP_RATE)

def build_tfm_valid():
    import timm
    from timm.data import resolve_data_config, create_transform
    tmp = timm.create_model(MODEL_NAME, pretrained=False, num_classes=1)
    cfg = resolve_data_config({}, model=tmp)
    cfg['input_size'] = (3, IMG_SIZE, IMG_SIZE)
    tfm_valid = create_transform(is_training=False, **cfg)
    del tmp
    return tfm_valid

@torch.no_grad()
def evaluate_model(model, loader):
    model.eval()
    probs_all, targs_all = [], []
    for i,(imgs, labels) in enumerate(loader):
        imgs = imgs.to(device)
        with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):
            logits = model(imgs).squeeze(1)
            probs = torch.sigmoid(logits)
        probs_all.append(probs.float().cpu().numpy())
        targs_all.append(labels.numpy())
        if (i+1) % 50 == 0:
            print(f'[Finalize] OOF eval batch {i+1}/{len(loader)}', flush=True)
    probs_all = np.concatenate(probs_all)
    targs_all = np.concatenate(targs_all)
    ll = log_loss(targs_all, np.clip(probs_all, 1e-7, 1-1e-7))
    return ll, probs_all

@torch.no_grad()
def load_best_fold(fold):
    m = build_model().to(device)
    ckpt = torch.load(f'model_b0e_fold{fold}.pt', map_location='cpu')
    m.load_state_dict(ckpt['state_dict']); m.eval()
    return m

# Ensure checkpoints exist
for f in range(5):
    assert Path(f'model_b0e_fold{f}.pt').exists(), f'Missing model_b0e_fold{f}.pt'
print('[Finalize] All 5 checkpoints present.', flush=True)

# 1) Rebuild OOF if missing
oof_path = Path('oof_b0_e.csv')
tfm_valid = build_tfm_valid()
if not oof_path.exists():
    print('[Finalize] oof_b0_e.csv not found. Rebuilding OOF from checkpoints...', flush=True)
    oof = np.zeros(len(df), dtype=np.float32)
    fold_scores = {}
    for fold in range(5):
        val_df = df[df.fold == fold].reset_index(drop=True)
        val_loader = DataLoader(DogCatDataset(val_df, tfm_valid), batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)
        model = load_best_fold(fold)
        ll, preds = evaluate_model(model, val_loader)
        oof[df.index[df.fold == fold].values] = preds
        fold_scores[fold] = ll
        print(f'[Finalize] Fold {fold} OOF val_logloss={ll:.6f}', flush=True)
        del model; gc.collect(); torch.cuda.empty_cache()
    oof_ll = log_loss(df['label'].values, np.clip(oof, 1e-7, 1-1e-7))
    pd.Series(fold_scores).to_csv('fold_scores_b0_e.csv')
    pd.DataFrame({'filepath': df['filepath'], 'label': df['label'], 'oof': oof}).to_csv('oof_b0_e.csv', index=False)
    print(f'[Finalize] Saved OOF (oof_b0_e.csv). OOF logloss={oof_ll:.6f}', flush=True)
else:
    print('[Finalize] Found existing oof_b0_e.csv; skipping OOF rebuild.', flush=True)

# 2) Temperature scaling on OOF
oof_df = pd.read_csv('oof_b0_e.csv')
assert len(oof_df) == len(df) and oof_df[['label','oof']].notna().all().all(), 'OOF integrity failed'
y_true = oof_df['label'].values.astype(np.float64)
p = np.clip(oof_df['oof'].values.astype(np.float64), 1e-7, 1-1e-7)
logits = np.log(p/(1.0-p))
base_ll = log_loss(y_true, p)
print(f'[Finalize] Base OOF logloss={base_ll:.6f}', flush=True)

def loss_for_T(T):
    if T <= 1e-4 or T > 100.0: return np.inf
    ps = 1.0 / (1.0 + np.exp(-logits / T))
    ps = np.clip(ps, 1e-7, 1-1e-7)
    return log_loss(y_true, ps)

grid1 = np.exp(np.linspace(np.log(0.3), np.log(5.0), 401))
vals1 = [loss_for_T(t) for t in grid1]
T_best = float(grid1[int(np.argmin(vals1))])
lo, hi = T_best*0.8, T_best*1.2
grid2 = np.exp(np.linspace(np.log(max(0.1, lo)), np.log(min(10.0, hi)), 201))
vals2 = [loss_for_T(t) for t in grid2]
T_best = float(grid2[int(np.argmin(vals2))])
ll_best = float(min(vals2))
print(f'[Finalize] Calibrated T={T_best:.6f} | OOF logloss={ll_best:.6f}', flush=True)
with open('temperature.txt', 'w') as f: f.write(f"{T_best}\n")

# 3) Test inference with TTA + temperature scaling
@torch.no_grad()
def predict_tta_temp(models_dict, loader, T_temp):
    all_probs = []
    for i,(imgs, ids) in enumerate(loader):
        imgs = imgs.to(device)
        with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):
            probs_sum = None
            for m in models_dict.values():
                logits_b = m(imgs).squeeze(1)
                p_b = torch.sigmoid(logits_b / T_temp)
                probs_sum = p_b if probs_sum is None else probs_sum + p_b
            imgs_f = torch.flip(imgs, dims=[3])
            for m in models_dict.values():
                logits_f = m(imgs_f).squeeze(1)
                probs_sum = probs_sum + torch.sigmoid(logits_f / T_temp)
        probs_avg = (probs_sum / (len(models_dict)*2)).float().cpu().numpy()
        all_probs.append(probs_avg)
        if (i+1) % 50 == 0:
            print(f'[Finalize] Test infer batch {i+1}/{len(loader)}', flush=True)
    return np.concatenate(all_probs)

print('[Finalize] Loading models for inference...', flush=True)
models = {f: load_best_fold(f) for f in range(5)}
test_tfm = build_tfm_valid()
test_loader = DataLoader(DogCatDataset(test_df, test_tfm), batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)
print('[Finalize] Predicting test with TTA + temperature scaling...', flush=True)
t0 = time.time()
test_probs = predict_tta_temp(models, test_loader, T_best)
ids = test_df['filepath'].apply(lambda p: int(Path(p).stem)).values
sub = pd.DataFrame({'id': ids, 'label': np.clip(test_probs, 1e-7, 1-1e-7)})
sub = sub.sort_values('id').reset_index(drop=True)
sub.to_csv('submission.csv', index=False)
print(f'[Finalize] Saved submission.csv (T={T_best:.6f}). Inference time: {time.time()-t0:.1f}s', flush=True)
gc.collect(); torch.cuda.empty_cache() if torch.cuda.is_available() else None
print('[Finalize] Done.', flush=True)

In [14]:
# Minimal final inference (Plan E checkpoints, unscaled) with HFlip TTA to produce submission.csv
import os, time, gc
from pathlib import Path
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image

print('[QuickInfer] Starting minimal inference...', flush=True)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.backends.cudnn.benchmark = True

MODEL_NAME = 'efficientnet_b0.ra_in1k'
IMG_SIZE = 384
BATCH_SIZE = 32
NUM_WORKERS = 0
DROP_RATE = 0.2
LOCAL_CACHE = Path.cwd() / 'model_cache'; LOCAL_CACHE.mkdir(parents=True, exist_ok=True)
os.environ['HF_HOME'] = str(LOCAL_CACHE)
os.environ['HUGGINGFACE_HUB_CACHE'] = str(LOCAL_CACHE)
os.environ['XDG_CACHE_HOME'] = str(LOCAL_CACHE)
os.environ['TORCH_HOME'] = str(LOCAL_CACHE)
os.environ['TIMM_HOME'] = str(LOCAL_CACHE)
os.environ['HF_HUB_OFFLINE'] = '1'

test_df = pd.read_csv('test_files.csv')

class TestDataset(Dataset):
    def __init__(self, df, transform):
        self.filepaths = df['filepath'].values
        self.transform = transform
    def __len__(self): return len(self.filepaths)
    def __getitem__(self, idx):
        fp = self.filepaths[idx]
        img = Image.open(fp).convert('RGB')
        img = self.transform(img) if self.transform is not None else img
        return img, Path(fp).stem

def build_model():
    import timm
    return timm.create_model(MODEL_NAME, pretrained=True, num_classes=1, cache_dir=str(LOCAL_CACHE), drop_rate=DROP_RATE)

def build_valid_tfm():
    import timm
    from timm.data import resolve_data_config, create_transform
    tmp = timm.create_model(MODEL_NAME, pretrained=False, num_classes=1)
    cfg = resolve_data_config({}, model=tmp)
    cfg['input_size'] = (3, IMG_SIZE, IMG_SIZE)
    tfm = create_transform(is_training=False, **cfg)
    del tmp
    return tfm

@torch.no_grad()
def load_best_fold(fold):
    m = build_model().to(device)
    ckpt = torch.load(f'model_b0e_fold{fold}.pt', map_location='cpu')
    m.load_state_dict(ckpt['state_dict']); m.eval()
    return m

# Checkpoints presence
for f in range(5):
    assert Path(f'model_b0e_fold{f}.pt').exists(), f'Missing model_b0e_fold{f}.pt'
print('[QuickInfer] All checkpoints present.', flush=True)

print('[QuickInfer] Loading models...', flush=True)
models = {f: load_best_fold(f) for f in range(5)}
tfm = build_valid_tfm()
loader = DataLoader(TestDataset(test_df, tfm), batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)

@torch.no_grad()
def predict(models_dict, loader):
    out = []
    for i, (imgs, ids) in enumerate(loader):
        imgs = imgs.to(device)
        with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):
            p_sum = None
            for m in models_dict.values():
                p = torch.sigmoid(m(imgs).squeeze(1))
                p_sum = p if p_sum is None else p_sum + p
            imgs_f = torch.flip(imgs, dims=[3])
            for m in models_dict.values():
                p_sum = p_sum + torch.sigmoid(m(imgs_f).squeeze(1))
        p_avg = (p_sum / (len(models_dict)*2)).float().cpu().numpy()
        out.append(p_avg)
        if (i+1) % 50 == 0:
            print(f'[QuickInfer] Batch {i+1}/{len(loader)}', flush=True)
    return np.concatenate(out)

print('[QuickInfer] Predicting test...', flush=True)
t0 = time.time()
probs = predict(models, loader)
ids = test_df['filepath'].apply(lambda p: int(Path(p).stem)).values
sub = pd.DataFrame({'id': ids, 'label': np.clip(probs, 1e-7, 1-1e-7)})
sub = sub.sort_values('id').reset_index(drop=True)
sub.to_csv('submission.csv', index=False)
print(f'[QuickInfer] Saved submission.csv. Inference time: {time.time()-t0:.1f}s', flush=True)
gc.collect(); torch.cuda.empty_cache() if torch.cuda.is_available() else None

[QuickInfer] Starting minimal inference...


[QuickInfer] All checkpoints present.


[QuickInfer] Loading models...


[QuickInfer] Predicting test...


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[QuickInfer] Batch 50/79


[QuickInfer] Saved submission.csv. Inference time: 63.1s
