# Humpback Whale Identification - Plan

Objectives:
- Build a strong baseline fast and iterate to medal territory (MAP@5).

High-level plan:
1) Environment & sanity checks
   - Verify GPU availability (nvidia-smi).
   - Only install CUDA 12.1 torch stack when we are ready to train.

2) Data understanding
   - Inspect train.csv, sample_submission.csv, image folders (/train, /test).
   - Confirm label distribution and class cardinality.
   - Determine if there are any duplicates or near-duplicates.

3) Validation
   - Stratified KFold on target label to mirror test distribution.
   - Fix a single deterministic CV split and reuse.

4) Baseline modeling (fast)
   - Start with timm backbone (e.g., efficientnet_b0 or resnet18/34) at 224-256 px.
   - Loss: CE with label smoothing; metric: MAP@5.
   - Augmentations: light (RandomResizedCrop/CenterCrop, flips, brightness/contrast).
   - Mixed precision + cosine LR + warmup; early stopping by CV.

5) Improve
   - Scale up backbone (efficientnet_b3/b4, convnext_tiny), resolution 384+.
   - Use ArcFace/CosFace head (metric learning) or logits ensembling across seeds.
   - Hard mining / class-balanced sampling if long-tail severe.

6) Ensembling
   - Blend diverse models/seeds/resolutions using OOF-driven weights.

7) Submission
   - Generate top-5 per image; verify format against sample_submission.

Checkpoints for expert reviews:
- Request an expert review at four points: after this plan, after EDA, after the first baseline CV, and after tuning/ensembling.

Time discipline:
- Always print progress and elapsed time.
- Run a subsampled smoke test first (e.g., 2 folds, a few epochs) before launching full training.

In [None]:
import os, sys, time, subprocess, json, textwrap
import pandas as pd
from pathlib import Path

def run(cmd):
    """Run *cmd* (an argv list), echo its output and return it as text.

    This is a best-effort diagnostic helper: stdout and stderr are merged,
    a non-zero exit status is not an error (the captured output is still
    returned), and a command that cannot be launched at all is reported as
    text instead of raising.

    Args:
        cmd: Sequence of strings forming the command line.

    Returns:
        The decoded combined output, or a short failure message when the
        executable could not be started.
    """
    print(">>>", " ".join(cmd), flush=True)
    try:
        raw = subprocess.check_output(cmd, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        # Non-zero exit: keep whatever the command managed to print.
        raw = e.output or b""
    except OSError as e:
        # Missing / non-executable binary: report instead of aborting the cell.
        out = f"[run] could not launch {cmd[:1]}: {e}"
        print(out, flush=True)
        return out
    out = raw.decode("utf-8", errors="ignore")
    print(out, flush=True)
    return out

# --- Environment / filesystem / CSV sanity checks -------------------------
t0 = time.time()
print("[Env] Checking GPU (nvidia-smi)...", flush=True)
# `|| true` keeps the shell exit status at 0 even when nvidia-smi is absent.
run(["bash","-lc","nvidia-smi || true"])

print("[Env] Python:", sys.version)
print("[Env] CWD:", os.getcwd())

print("[FS] Listing key files:")
for p in ["train.csv", "sample_submission.csv", "submission.csv", "train", "test"]:
    pp = Path(p)
    if pp.is_file():
        print(f" - {p} file size={pp.stat().st_size:,}")
    elif pp.is_dir():
        cnt = sum(1 for _ in pp.iterdir())
        print(f" - {p} dir entries={cnt}")

print("[Data] Loading CSVs...")
train = pd.read_csv("train.csv")
sub = pd.read_csv("sample_submission.csv")
print("train.shape:", train.shape)
print("train.head():\n", train.head())
print("sample_submission.shape:", sub.shape)
print("sample_submission.head():\n", sub.head())

# Basic schema expectations: guess the image and label columns by name,
# falling back to column position when no known name matches.
print("[Data] Columns:", train.columns.tolist())
img_col = None
label_col = None
for c in train.columns:
    if c.lower() in ("image","img","filename","file","file_name"):
        img_col = c
    if c.lower() in ("id","label","target","species"):
        label_col = c
if img_col is None:
    # Heuristic: first column is image
    img_col = train.columns[0]
if label_col is None and len(train.columns) > 1:
    label_col = train.columns[1]
print(f"[Data] Using columns -> image: {img_col}, label: {label_col}")

# label_col stays None for a single-column CSV; every label statistic must
# be guarded, not just n_classes.
has_label = label_col is not None and label_col in train.columns
n_classes = train[label_col].nunique() if has_label else None
print("[Data] n_images:", len(train), "n_classes:", n_classes)
if has_label:
    print("[Data] Label distribution (top 10):\n", train[label_col].value_counts().head(10))
else:
    print("[Data] No label column detected; skipping label distribution.")

# Quick file existence check for a few samples
train_dir = Path("train")
missing = 0
for fn in train[img_col].head(5):
    if not (train_dir / fn).exists():
        missing += 1
print(f"[FS] Missing first-5 images present? missing={missing}")

print(f"[Done] Prep EDA in {time.time()-t0:.2f}s", flush=True)

In [None]:
# Setup: Install CUDA 12.1 PyTorch stack and deps
import os, sys, subprocess, shutil, time
from pathlib import Path

def pip(*args):
    """Echo and run ``<current python> -m pip <args...>``.

    Raises ``subprocess.CalledProcessError`` when pip exits non-zero
    (``check=True``).
    """
    print(">", *args, flush=True)
    command = [sys.executable, "-m", "pip"]
    command.extend(args)
    subprocess.run(command, check=True)

# Step 1: remove whatever torch stack is already installed; failures are
# ignored (check=False) because a package may simply not be present.
for pkg in ("torch","torchvision","torchaudio"):
    uninstall_cmd = [sys.executable, "-m", "pip", "uninstall", "-y", pkg]
    subprocess.run(uninstall_cmd, check=False)

# Step 2: delete leftover package / dist-info directories under the pip
# target dir that could shadow the freshly installed wheels (idempotent).
pip_target = "/app/.pip-target"
stale_dirs = []
for name, versions in (
    ("torch", ("2.8.0", "2.4.1")),
    ("torchvision", ("0.23.0", "0.19.1")),
    ("torchaudio", ("2.8.0", "2.4.1")),
):
    stale_dirs.append(f"{pip_target}/{name}")
    stale_dirs.extend(f"{pip_target}/{name}-{ver}.dist-info" for ver in versions)
stale_dirs.append(f"{pip_target}/torchgen")
stale_dirs.append(f"{pip_target}/functorch")
for d in stale_dirs:
    if os.path.exists(d):
        print("Removing", d)
        shutil.rmtree(d, ignore_errors=True)

# Step 3: install the exact cu121 torch stack.
torch_pins = ["torch==2.4.1", "torchvision==0.19.1", "torchaudio==2.4.1"]
pip(
    "install",
    "--index-url", "https://download.pytorch.org/whl/cu121",
    "--extra-index-url", "https://pypi.org/simple",
    *torch_pins,
)

# Step 4: write the same pins to a constraints file so later installs
# cannot move the torch versions.
Path("constraints.txt").write_text("\n".join(torch_pins))

# Step 5: install the remaining dependencies under those constraints.
other_deps = [
    "timm==1.0.9",
    "albumentations==1.4.14",
    "opencv-python-headless==4.10.0.84",
    "scikit-learn==1.5.1",
    "pandas==2.2.2",
    "numpy==1.26.4",
    "faiss-cpu==1.8.0.post1",
    "torchmetrics==1.4.2",
    "accelerate==0.34.2",
    "rich==13.8.1",
    "matplotlib==3.9.2",
    "seaborn==0.13.2",
]
pip("install", "-c", "constraints.txt", *other_deps, "--upgrade-strategy", "only-if-needed")

# Step 6: verify that the installed build targets CUDA 12.1 and sees a GPU.
import torch
built_cuda = getattr(torch.version, "cuda", None)
print("torch:", torch.__version__, "built CUDA:", built_cuda)
print("CUDA available:", torch.cuda.is_available())
assert str(getattr(torch.version, "cuda", "")).startswith("12.1"), f"Wrong CUDA build: {torch.version.cuda}"
assert torch.cuda.is_available(), "CUDA not available"
print("GPU:", torch.cuda.get_device_name(0))
print("[Setup] Done.")

In [None]:
# Build 5-fold stratified splits with singleton handling
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from pathlib import Path
import time

t0 = time.time()
df = pd.read_csv('train.csv')
img_col, label_col = 'Image', 'Id'

# Rows whose identity occurs exactly once cannot be validated against a
# gallery, so they are kept train-only. 'new_whale' is never treated as one.
vc = df[label_col].value_counts()
is_singleton = df[label_col].isin(vc.index[vc == 1])
singleton_mask = is_singleton & df[label_col].ne('new_whale')
n_singleton = int(singleton_mask.sum())
print(f"Singleton (train-only) classes (excl. new_whale): {n_singleton}")

# Everything else takes part in the stratified split.
eligible_mask = ~singleton_mask
df_elig = df[eligible_mask].copy()

# Stratify on the label (new_whale included). Rows never assigned to a
# validation fold keep the sentinel -1.
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
folds = np.full(len(df), -1, dtype=int)
elig_positions = df_elig.index.to_numpy()
for fold, (_, va_idx) in enumerate(skf.split(df_elig[img_col], df_elig[label_col])):
    # va_idx indexes df_elig; translate back to positions in the full frame.
    folds[elig_positions[va_idx]] = fold

df_folds = df.assign(fold=folds)  # fold == -1 -> never used as validation
df_folds['use_in_val'] = df_folds['fold'].ge(0)

# Sanity logs
print(df_folds['fold'].value_counts(dropna=False).sort_index())
for f in range(n_splits):
    val_mask = df_folds['fold'].eq(f)
    n_val = val_mask.sum()
    n_new_whale = (df_folds.loc[val_mask, label_col] == 'new_whale').sum()
    print(f"Fold {f}: val n={n_val} (incl. new_whale {n_new_whale})")
print(f"Train-only rows (fold=-1): {(df_folds['fold'] == -1).sum()}")

# Persist the assignment so every later run reuses the same split.
out_path = Path('folds.csv')
df_folds.to_csv(out_path, index=False)
print(f"Saved folds to {out_path.resolve()} in {time.time()-t0:.2f}s")

In [None]:
# Utils: dataset, transforms, ArcFace head, model wrapper, sampler
import math, random, time, os
from pathlib import Path
import numpy as np
import pandas as pd
import cv2
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
import albumentations as A
from albumentations.pytorch import ToTensorV2
import timm

# Set OpenCV's internal thread count to 0 and switch OpenCL off.
# NOTE(review): presumably to avoid thread oversubscription when images are
# decoded inside DataLoader worker processes — confirm.
cv2.setNumThreads(0)
cv2.ocl.setUseOpenCL(False)

# Image folders, relative to the current working directory.
IM_DIR_TRAIN = Path('train')
IM_DIR_TEST = Path('test')

def build_transforms(size=384, train=True):
    """Return the albumentations pipeline for a square ``size`` x ``size`` input.

    train=True  -> random-resized crop, horizontal flip, colour jitter and a
                   small rotation.
    train=False -> resize the longest side to ``size``, zero-pad to a square
                   and centre-crop.
    Both variants finish with ImageNet mean/std normalisation and conversion
    to a tensor.
    """
    # Tail shared by both pipelines.
    finalize = [
        A.Normalize(mean=(0.485,0.456,0.406), std=(0.229,0.224,0.225)),
        ToTensorV2(),
    ]
    if not train:
        geometry = [
            A.LongestMaxSize(max_size=size),
            A.PadIfNeeded(min_height=size, min_width=size, border_mode=cv2.BORDER_CONSTANT, value=0),
            A.CenterCrop(size, size),
        ]
        return A.Compose(geometry + finalize)
    augment = [
        A.RandomResizedCrop(size, size, scale=(0.8, 1.0), ratio=(0.75, 1.33), p=1.0),
        A.HorizontalFlip(p=0.5),
        A.ColorJitter(0.2,0.2,0.2,0.1,p=0.3),
        A.Rotate(limit=15, border_mode=cv2.BORDER_REFLECT_101, p=0.2),
    ]
    return A.Compose(augment + finalize)

class ImageDS(Dataset):
    """Dataset that reads images with OpenCV from ``img_dir``.

    With ``label2idx`` given, items are ``(image, class_index)`` and labels
    missing from the mapping become ``-1``. Without it, items are
    ``(image, file_name)``.
    """

    def __init__(self, df, img_col='Image', label_col='Id', img_dir=IM_DIR_TRAIN, tfm=None, label2idx=None):
        # Positional row access below relies on a clean 0..n-1 index.
        self.df = df.reset_index(drop=True)
        self.img_col, self.label_col = img_col, label_col
        self.img_dir = Path(img_dir)
        self.tfm = tfm
        self.label2idx = label2idx

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, i):
        record = self.df.iloc[i]
        file_name = record[self.img_col]
        path = self.img_dir / file_name
        bgr = cv2.imread(str(path))
        if bgr is None:
            # cv2.imread signals failure by returning None.
            raise FileNotFoundError(str(path))
        img = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        if self.tfm:
            img = self.tfm(image=img)['image']
        if self.label2idx is None:
            return img, file_name
        return img, self.label2idx.get(record[self.label_col], -1)

class ArcMarginProduct(nn.Module):
    """ArcFace-style additive angular margin head.

    Produces ``s``-scaled logits in which the target-class cosine
    cos(theta) is replaced by cos(theta + m).

    Args:
        in_features: Embedding dimension.
        out_features: Number of classes.
        s: Scale applied to the logits.
        m: Angular margin in radians.
        easy_margin: If True, the margin is applied only where cos(theta) > 0
            and the plain cosine is kept elsewhere. If False, the fallback
            ``cosine - sin(pi - m) * m`` is used where cos(theta) <= cos(pi - m).
    """

    def __init__(self, in_features, out_features, s=30.0, m=0.5, easy_margin=False):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.s = s
        self.m = m
        self.weight = nn.Parameter(torch.randn(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)
        self.easy_margin = easy_margin
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        # Threshold and penalty for the hard-margin fallback.
        self.th = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m

    def forward(self, embeddings, labels):
        """Return margin-adjusted logits of shape (B, out_features).

        embeddings: (B, in_features), expected to be L2-normalised.
        labels: (B,) integer class indices in [0, out_features).
        """
        # The class weights must be unit-norm too; otherwise the product
        # below is not a cosine and the margin arithmetic is meaningless.
        W = nn.functional.normalize(self.weight, p=2, dim=1)
        cosine = nn.functional.linear(embeddings, W)
        sine = torch.sqrt(torch.clamp(1.0 - cosine**2, min=1e-9))
        # cos(theta + m) = cos(theta)cos(m) - sin(theta)sin(m)
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        one_hot = torch.zeros_like(cosine)
        one_hot.scatter_(1, labels.view(-1,1), 1.0)
        # Margin on the target column only; plain cosine everywhere else.
        logits = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        logits = logits * self.s
        return logits

class EmbeddingModel(nn.Module):
    """timm backbone + bias-free linear projection + BatchNorm1d, L2-normalised.

    ``forward`` maps an image batch to unit-length embeddings of size
    ``embed_dim``.
    """

    def __init__(self, backbone_name='convnext_tiny', embed_dim=512, pretrained=True):
        super().__init__()
        # num_classes=0 drops the classifier; features are average-pooled.
        self.backbone = timm.create_model(backbone_name, pretrained=pretrained, num_classes=0, global_pool='avg')
        self.head = nn.Linear(self.backbone.num_features, embed_dim, bias=False)
        self.bn = nn.BatchNorm1d(embed_dim)

    def forward(self, x):
        projected = self.bn(self.head(self.backbone(x)))
        return nn.functional.normalize(projected, p=2, dim=1)

def make_balanced_sampler(labels, pow_k=0.5):
    """Build a with-replacement sampler that down-weights frequent classes.

    Every sample gets weight ``1 / count(class) ** pow_k``; the sampler draws
    ``len(labels)`` indices per epoch.

    labels: numpy array of class indices (>=0) for the rows used in training.
    """
    class_counts = pd.Series(labels).value_counts().to_dict()
    # One weight per class, then broadcast back to the samples.
    class_weight = {cls: 1.0 / (cnt ** pow_k) for cls, cnt in class_counts.items()}
    weights = np.array([class_weight[int(y)] for y in labels], dtype=np.float32)
    return WeightedRandomSampler(weights, num_samples=len(weights), replacement=True)

def build_label_mapping(train_df, label_col='Id'):
    """Map each known identity to a contiguous class index and back.

    'new_whale' is left out of the mapping. Indices follow the sorted order
    of the remaining labels.

    Returns:
        (label2idx, idx2label) dictionaries.
    """
    ids = [x for x in train_df[label_col].unique().tolist() if x != 'new_whale']
    ids.sort()
    idx2label = dict(enumerate(ids))
    label2idx = {lbl: i for i, lbl in idx2label.items()}
    return label2idx, idx2label

print('[Utils] Loaded utilities: transforms, dataset, ArcFace head, model wrapper, sampler, label mapping.')

In [None]:
# Fix albumentations/albucore compatibility
import sys, subprocess
def pip(*args):
    """Echo and run ``python -m pip <args>`` with the current interpreter; raise on failure."""
    print(">", *args, flush=True)
    subprocess.run([sys.executable, "-m", "pip", *args], check=True)

# Pin albucore to 0.0.12, then check whether that version exposes
# preserve_channel_dim.
pip("install", "-c", "constraints.txt", "albucore==0.0.12", "--upgrade-strategy", "only-if-needed")

import importlib, inspect
# A kernel that already imported albucore/albumentations keeps the old
# modules cached in sys.modules, so a plain `import` would report the
# previous version. Drop the cached entries so the check reflects what pip
# just installed.
for _stale in [m for m in sys.modules if m.split(".")[0] in ("albucore", "albumentations")]:
    del sys.modules[_stale]
importlib.invalidate_caches()

import albucore, albumentations
import albucore.utils as acu
print("albumentations:", albumentations.__version__, "albucore:", albucore.__version__)
print("has preserve_channel_dim:", hasattr(acu, "preserve_channel_dim"))

In [None]:
# Bump albucore/albumentations to compatible versions
import sys, subprocess, importlib
def pip(*args):
    """Echo and run ``python -m pip <args>`` with the current interpreter; raise on failure."""
    print(">", *args, flush=True)
    subprocess.run([sys.executable, "-m", "pip", *args], check=True)

# Upgrade to versions that include preserve_channel_dim
pip("install", "-c", "constraints.txt", "albucore>=0.0.20", "albumentations>=1.4.20", "--upgrade")

# Modules imported earlier in this kernel stay cached in sys.modules; purge
# them so the imports below load the freshly installed packages rather than
# reporting the old versions.
for _stale in [m for m in sys.modules if m.split(".")[0] in ("albucore", "albumentations")]:
    del sys.modules[_stale]
importlib.invalidate_caches()

import albucore, albumentations
from importlib import reload
import albucore.utils as acu
print("albumentations:", albumentations.__version__, "albucore:", albucore.__version__)
print("has preserve_channel_dim:", hasattr(acu, "preserve_channel_dim"))

In [3]:
# Override utils to avoid albumentations: use torchvision transforms + PIL
import math, random, time, os
from pathlib import Path
import numpy as np
import pandas as pd
from PIL import Image
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
import torchvision.transforms as T
import timm

# Image folders, relative to the current working directory.
# IM_DIR_TRAIN is the default `img_dir` of ImageDS below.
IM_DIR_TRAIN = Path('train')
IM_DIR_TEST = Path('test')

def build_transforms(size=384, train=True):
    """Return the torchvision pipeline producing a normalised ``size`` x ``size`` tensor.

    train=True  -> random-resized crop, horizontal flip, colour jitter,
                   rotation of up to 15 degrees, then (after normalisation)
                   random erasing.
    train=False -> resize to ``int(size * 1.15)`` followed by a centre crop.
    Both variants normalise with the ImageNet mean/std.
    """
    bilinear = T.InterpolationMode.BILINEAR
    # Tensor conversion + normalisation, shared by both pipelines.
    tensorize = [
        T.ToTensor(),
        T.Normalize(mean=(0.485,0.456,0.406), std=(0.229,0.224,0.225)),
    ]
    if train:
        steps = [
            T.RandomResizedCrop(size, scale=(0.8, 1.0), ratio=(0.75, 1.33)),
            T.RandomHorizontalFlip(p=0.5),
            T.ColorJitter(0.2,0.2,0.2,0.1),
            T.RandomRotation(degrees=15, interpolation=bilinear),
            *tensorize,
            # Applied after ToTensor/Normalize, i.e. on the tensor.
            T.RandomErasing(p=0.15, scale=(0.02, 0.15), ratio=(0.3, 3.3), value='random'),
        ]
    else:
        steps = [
            T.Resize(int(size*1.15), interpolation=bilinear),
            T.CenterCrop(size),
            *tensorize,
        ]
    return T.Compose(steps)

class ImageDS(Dataset):
    """Dataset that loads RGB images with PIL from ``img_dir``.

    With ``label2idx`` given, items are ``(image, class_index)`` and labels
    missing from the mapping become ``-1``. Without it, items are
    ``(image, file_name)``.
    """

    def __init__(self, df, img_col='Image', label_col='Id', img_dir=IM_DIR_TRAIN, tfm=None, label2idx=None):
        # Positional row access below relies on a clean 0..n-1 index.
        self.df = df.reset_index(drop=True)
        self.img_col, self.label_col = img_col, label_col
        self.img_dir = Path(img_dir)
        self.tfm, self.label2idx = tfm, label2idx

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, i):
        record = self.df.iloc[i]
        file_name = record[self.img_col]
        # Fall back to a plain tensor conversion when no transform was given.
        convert = self.tfm if self.tfm else T.ToTensor()
        with Image.open(self.img_dir / file_name) as im:
            img = convert(im.convert('RGB'))
        if self.label2idx is None:
            return img, file_name
        return img, self.label2idx.get(record[self.label_col], -1)

class ArcMarginProduct(nn.Module):
    """ArcFace-style additive angular margin head with L2-normalised class weights.

    Produces ``s``-scaled logits in which the target-class cosine
    cos(theta) is replaced by cos(theta + m).

    Args:
        in_features: Embedding dimension.
        out_features: Number of classes.
        s: Scale applied to the logits.
        m: Angular margin in radians.
        easy_margin: If True, the margin is applied only where cos(theta) > 0
            and the plain cosine is kept elsewhere. If False (default), the
            fallback ``cosine - sin(pi - m) * m`` is used where
            cos(theta) <= cos(pi - m).
    """

    def __init__(self, in_features, out_features, s=30.0, m=0.5, easy_margin=False):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.s = s
        self.m = m
        self.weight = nn.Parameter(torch.randn(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)
        self.easy_margin = easy_margin
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        # Threshold and penalty for the hard-margin fallback.
        self.th = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m

    def forward(self, embeddings, labels):
        """Return margin-adjusted logits of shape (B, out_features).

        embeddings: (B, in_features), expected to be L2-normalised.
        labels: (B,) integer class indices in [0, out_features).
        """
        # normalize class weights (critical for ArcFace stability)
        W = F.normalize(self.weight, p=2, dim=1)
        cosine = F.linear(embeddings, W)
        sine = torch.sqrt(torch.clamp(1.0 - cosine**2, min=1e-9))
        # cos(theta + m) = cos(theta)cos(m) - sin(theta)sin(m)
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            # Previously this flag skipped the guard entirely, leaving the raw
            # margin applied even for cos(theta) <= 0.
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        one_hot = torch.zeros_like(cosine)
        one_hot.scatter_(1, labels.view(-1,1), 1.0)
        # Margin on the target column only; plain cosine everywhere else.
        logits = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        logits = logits * self.s
        return logits

class EmbeddingModel(nn.Module):
    """timm backbone followed by a linear projection, BatchNorm1d and L2 normalisation.

    ``drop_path_rate`` is passed straight to ``timm.create_model``.
    ``forward`` returns unit-length embeddings of size ``embed_dim``.
    """

    def __init__(self, backbone_name='convnext_tiny', embed_dim=512, pretrained=True, drop_path_rate=0.0):
        super().__init__()
        backbone_kwargs = dict(
            pretrained=pretrained,
            num_classes=0,        # no classifier head
            global_pool='avg',
            drop_path_rate=drop_path_rate,
        )
        self.backbone = timm.create_model(backbone_name, **backbone_kwargs)
        self.head = nn.Linear(self.backbone.num_features, embed_dim, bias=False)
        self.bn = nn.BatchNorm1d(embed_dim)

    def forward(self, x):
        features = self.backbone(x)
        embedding = self.bn(self.head(features))
        return F.normalize(embedding, p=2, dim=1)

def make_balanced_sampler(labels, pow_k=0.5):
    """Return a with-replacement ``WeightedRandomSampler`` over ``labels``.

    Sample weight is ``1 / count(class) ** pow_k``, so frequent classes are
    drawn less often; ``len(labels)`` indices are drawn per epoch.
    """
    counts = pd.Series(labels).value_counts().to_dict()
    weights = np.empty(len(labels), dtype=np.float32)
    for pos, y in enumerate(labels):
        weights[pos] = 1.0 / (counts[int(y)] ** pow_k)
    return WeightedRandomSampler(weights, num_samples=len(weights), replacement=True)

def build_label_mapping(train_df, label_col='Id'):
    """Return ``(label2idx, idx2label)`` for every label except 'new_whale'.

    Class indices are assigned in sorted label order, starting at 0.
    """
    ids = sorted(filter(lambda name: name != 'new_whale', train_df[label_col].unique().tolist()))
    positions = range(len(ids))
    label2idx = dict(zip(ids, positions))
    idx2label = dict(zip(positions, ids))
    return label2idx, idx2label

print('[Utils-TorchVision] Utilities ready: transforms (torchvision), dataset (PIL), ArcFace head (W-normalized), model, sampler, label mapping.')

[Utils-TorchVision] Utilities ready: transforms (torchvision), dataset (PIL), ArcFace head (W-normalized), model, sampler, label mapping.


In [None]:
# Training + Embedding + Retrieval pipeline (ArcFace, convnext_tiny, torchvision transforms) - 5-fold OOF, tau on OOF, fold-ensemble, hflip TTA
import os, time, math, json, gc
import numpy as np
import pandas as pd
from pathlib import Path
from collections import defaultdict
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Subset
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR, SequentialLR, LinearLR
import faiss

def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch (CPU and all CUDA devices) RNGs.

    cuDNN is left in benchmark mode with determinism off, so GPU runs are
    seeded but not guaranteed bit-for-bit reproducible.
    """
    import random
    import numpy as np
    import torch

    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)
    torch.backends.cudnn.deterministic = False
    torch.backends.cudnn.benchmark = True

def map5_score(y_true_ids, y_pred_ranked_ids):
    """Mean Average Precision @ 5 for single-label ground truth.

    Each sample scores ``1 / rank`` of the first correct prediction within
    its top five, or 0 when the truth is not among them.

    Args:
        y_true_ids: Sequence of true labels, one per sample.
        y_pred_ranked_ids: Sequence of ranked prediction lists, aligned with
            ``y_true_ids``; only the first five entries of each are used.

    Returns:
        The mean per-sample score as a float. An empty input returns 0.0
        rather than NaN.
    """
    assert len(y_true_ids) == len(y_pred_ranked_ids)
    if len(y_true_ids) == 0:
        # np.mean([]) would emit a RuntimeWarning and return NaN.
        return 0.0
    scores = []
    for truth, preds in zip(y_true_ids, y_pred_ranked_ids):
        rank = next((i for i, p in enumerate(preds[:5], start=1) if p == truth), None)
        scores.append(0.0 if rank is None else 1.0 / rank)
    return float(np.mean(scores))

def get_device():
    """Return the CUDA device when one is available, otherwise the CPU device."""
    if torch.cuda.is_available():
        return torch.device('cuda')
    return torch.device('cpu')

def train_one_fold(fold, df_folds, img_size=384, epochs=12, batch_size=48, lr=3e-4, weight_decay=0.05, arc_s=30.0, arc_m=0.5, embed_dim=512):
    """Train the ArcFace embedding model for one CV fold and checkpoint it.

    Rows with ``df_folds['fold'] == fold`` form the validation part; all
    other rows (including fold ``-1``) form the training part. Training uses
    only rows whose label is in the label mapping, i.e. 'new_whale' is
    excluded.

    Args:
        fold: Fold id to hold out.
        df_folds: DataFrame with at least 'Image', 'Id' and 'fold' columns.
        img_size, epochs, batch_size, lr, weight_decay: Training settings.
        arc_s, arc_m: Scale and margin of the ArcFace head.
        embed_dim: Embedding size.

    Returns:
        ``(model, label2idx, idx2label, df_tr, df_va)``.

    Side effects:
        Writes ``checkpoints/fold{fold}.pt`` holding the model state, the
        ArcFace head state and ``label2idx``.
    """
    t0 = time.time()
    device = get_device()
    val_mask = df_folds['fold'] == fold
    train_mask = (df_folds['fold'] != fold)
    df_tr = df_folds.loc[train_mask].copy()
    df_va = df_folds.loc[val_mask].copy()
    label2idx, idx2label = build_label_mapping(df_tr, label_col='Id')
    n_classes = len(label2idx)
    tfm_tr = build_transforms(size=img_size, train=True)
    ds_tr = ImageDS(df_tr, img_col='Image', label_col='Id', img_dir=IM_DIR_TRAIN, tfm=tfm_tr, label2idx=label2idx)
    # Build sampler labels without loading images
    y_tr = ds_tr.df['Id'].map(label2idx).fillna(-1).to_numpy()
    # -1 marks labels outside the mapping (new_whale); those rows are not trained on.
    train_indices = np.where(y_tr != -1)[0]
    ds_tr_sub = Subset(ds_tr, train_indices)
    y_sub = y_tr[train_indices]
    sampler = make_balanced_sampler(y_sub, pow_k=0.5)
    # BatchNorm1d raises in training mode on a batch of one sample, which
    # happens whenever len(dataset) % batch_size == 1. Drop the ragged last
    # batch unless the dataset is too small to fill a single batch.
    dl_tr = DataLoader(
        ds_tr_sub,
        batch_size=batch_size,
        sampler=sampler,
        num_workers=8,
        pin_memory=True,
        persistent_workers=True,
        drop_last=len(ds_tr_sub) > batch_size,
    )
    model = EmbeddingModel(backbone_name='convnext_tiny', embed_dim=embed_dim, pretrained=True).to(device)
    arc = ArcMarginProduct(embed_dim, n_classes, s=arc_s, m=arc_m).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = AdamW(list(model.parameters()) + list(arc.parameters()), lr=lr, weight_decay=weight_decay)
    # 1-epoch linear warmup then cosine (the scheduler is stepped once per epoch)
    main = CosineAnnealingLR(optimizer, T_max=max(1, epochs-1))
    warm = LinearLR(optimizer, start_factor=0.1, end_factor=1.0, total_iters=1)
    scheduler = SequentialLR(optimizer, [warm, main], milestones=[1])
    # Mixed precision is enabled only when CUDA is present.
    scaler = torch.amp.GradScaler('cuda', enabled=torch.cuda.is_available())
    for ep in range(1, epochs+1):
        model.train(); arc.train()
        running = 0.0; n = 0; t_ep = time.time()
        for it, (imgs, targets) in enumerate(dl_tr):
            imgs = imgs.to(device, non_blocking=True); targets = targets.to(device)
            optimizer.zero_grad(set_to_none=True)
            with torch.amp.autocast('cuda', enabled=torch.cuda.is_available()):
                emb = model(imgs)
                logits = arc(emb, targets)
                loss = criterion(logits, targets)
            scaler.scale(loss).backward()
            scaler.step(optimizer); scaler.update()
            # Sample-weighted running mean of the loss.
            running += loss.item() * targets.size(0); n += targets.size(0)
            if (it+1) % 50 == 0:
                print(f"[Fold {fold}] Epoch {ep} Iter {it+1} loss={running/max(n,1):.4f} elapsed={time.time()-t_ep:.1f}s", flush=True)
        scheduler.step()
        print(f"[Fold {fold}] Epoch {ep}/{epochs} tr_loss={running/max(n,1):.4f} lr={scheduler.get_last_lr()[0]:.6f}")
    os.makedirs('checkpoints', exist_ok=True)
    torch.save({'model': model.state_dict(), 'arc': arc.state_dict(), 'label2idx': label2idx}, f'checkpoints/fold{fold}.pt')
    print(f"[Fold {fold}] Saved checkpoint. Total fold time {time.time()-t0:.1f}s")
    return model, label2idx, idx2label, df_tr, df_va

def extract_embeddings(model, df, img_size=384, batch_size=64, img_dir=IM_DIR_TRAIN, tta_hflip=True):
    """Embed every image listed in ``df`` and return L2-normalised float32 vectors.

    Args:
        model: Embedding model; it is switched to eval mode.
        df: DataFrame with an 'Image' column of file names under ``img_dir``.
        img_size: Side length used by the eval transform.
        batch_size: Inference batch size.
        img_dir: Folder containing the images.
        tta_hflip: If True, average each embedding with the embedding of the
            horizontally flipped image.

    Returns:
        A C-contiguous float32 array with one row per row of ``df``, in the
        same order, each row re-normalised to unit length.
    """
    device = get_device()
    model.eval()
    tfm = build_transforms(size=img_size, train=False)
    ds = ImageDS(df, img_col='Image', label_col='Id', img_dir=img_dir, tfm=tfm, label2idx=None)
    dl = DataLoader(ds, batch_size=batch_size, shuffle=False, num_workers=8, pin_memory=True, persistent_workers=True)
    embs = []
    with torch.no_grad():
        for imgs, _ in dl:
            imgs = imgs.to(device, non_blocking=True)
            with torch.amp.autocast('cuda', enabled=torch.cuda.is_available()):
                e = model(imgs)
                e_flip = model(torch.flip(imgs, dims=[3])) if tta_hflip else None
            # Leave autocast before averaging and force float32 so the output
            # dtype does not depend on which ops autocast downcasts.
            e = e.float()
            if e_flip is not None:
                e = (e + e_flip.float()) / 2.0
            embs.append(e.detach().cpu().numpy())
    # faiss.normalize_L2 works in place and needs a float32, C-contiguous array.
    embs = np.ascontiguousarray(np.concatenate(embs, axis=0), dtype=np.float32)
    faiss.normalize_L2(embs)
    return embs

def per_class_max_similarity(query_embs, gallery_embs, gallery_labels, topK=300):
    """Rank gallery classes for each query by their best inner-product match.

    Args:
        query_embs: (Q, d) array of query embeddings.
        gallery_embs: (G, d) array of gallery embeddings.
        gallery_labels: Sequence of G labels aligned with ``gallery_embs``.
        topK: Number of nearest gallery items inspected per query (capped
            at G).

    Returns:
        A list of length Q. Element q is a list of ``(label, similarity)``
        pairs sorted by descending similarity, where the similarity is the
        maximum over that label's gallery items found in the top-K
        neighbours. An empty gallery (or ``topK <= 0``) gives an empty
        ranking for every query instead of an error.
    """
    n_queries = query_embs.shape[0]
    K = min(topK, gallery_embs.shape[0])
    if n_queries == 0 or K <= 0:
        # Nothing to search; avoid calling faiss with k == 0.
        return [[] for _ in range(n_queries)]
    index = faiss.IndexFlatIP(gallery_embs.shape[1])
    index.add(np.ascontiguousarray(gallery_embs, dtype='float32'))
    sims, idxs = index.search(np.ascontiguousarray(query_embs, dtype='float32'), K)
    preds = []
    for row_sims, row_idxs in zip(sims.tolist(), idxs.tolist()):
        best = {}
        for s, gi in zip(row_sims, row_idxs):
            if gi < 0:
                # faiss pads missing neighbours with -1.
                continue
            cls = gallery_labels[gi]
            if cls not in best or s > best[cls]:
                best[cls] = s
        preds.append(sorted(best.items(), key=lambda x: x[1], reverse=True))
    return preds

def tune_new_whale_threshold(val_ranked_lists, val_true_ids, grid=None):
    """Grid-search the similarity threshold below which 'new_whale' is predicted first.

    For every candidate ``tau``: if a sample has no candidates or its best
    similarity is below ``tau``, 'new_whale' is placed first followed by the
    top four classes; otherwise the top five classes are used. Lists are
    de-duplicated and padded with 'new_whale' to five entries, then scored
    with MAP@5.

    Args:
        val_ranked_lists: Per-sample lists of ``(label, similarity)`` sorted
            by descending similarity.
        val_true_ids: True labels aligned with ``val_ranked_lists``.
        grid: Iterable of thresholds; defaults to 61 values in [0.2, 0.8].

    Returns:
        ``(best_tau, best_map5)``; the first threshold reaching the best
        score wins.
    """
    def _top5(ranked, tau):
        names = [c for c, _ in ranked]
        if not ranked or ranked[0][1] < tau:
            cand = ['new_whale'] + names[:4]
        else:
            cand = names[:5]
        picked = []
        for c in cand:
            if c not in picked:
                picked.append(c)
            if len(picked) == 5:
                break
        picked.extend(['new_whale'] * (5 - len(picked)))
        return picked

    taus = np.linspace(0.2, 0.8, 61) if grid is None else grid
    best_tau, best_map5 = 0.5, -1.0
    for tau in taus:
        score = map5_score(val_true_ids, [_top5(ranked, tau) for ranked in val_ranked_lists])
        if score > best_map5:
            best_tau, best_map5 = tau, score
    return best_tau, best_map5

def combine_fold_scores(rank_lists_per_fold):
    """Average per-class similarities across folds for every sample.

    Args:
        rank_lists_per_fold: List of length n_folds. Each element is a list
            of length N whose items are lists of ``(cls, score)`` pairs.

    Returns:
        A list of length N. Item i holds ``(cls, mean_score)`` pairs sorted
        by descending mean score. The mean is taken only over the folds in
        which the class appears for that sample. An empty fold list returns
        an empty list.
    """
    if not rank_lists_per_fold:
        return []
    n_items = len(rank_lists_per_fold[0])
    combined = []
    for i in range(n_items):
        per_class = defaultdict(list)
        for fold_lists in rank_lists_per_fold:
            for cls, s in fold_lists[i]:
                per_class[cls].append(s)
        # average per-class similarities across folds
        scored = [(cls, float(np.mean(vals))) for cls, vals in per_class.items()]
        scored.sort(key=lambda x: x[1], reverse=True)
        combined.append(scored)
    return combined

def run_full_5fold_pipeline(epochs=12, img_size=384, batch_size=48):
    """Train all five folds, tune the new_whale threshold on OOF and write submission.csv.

    For each fold 0..4 this trains a model via ``train_one_fold``, embeds the
    fold's training gallery (without 'new_whale'), its validation rows and
    the test images, ranks classes by per-class max similarity and caches
    embeddings and frames under ``embeddings/``. The threshold is then tuned
    on the pooled out-of-fold rankings, the per-fold test rankings are
    averaged per class, and the final top-5 strings are written to
    ``submission.csv``.

    Reads: train.csv, folds.csv, sample_submission.csv.
    Writes: embeddings/f{f}_*.npy, embeddings/f{f}_*.csv, submission.csv
    (checkpoints are written by ``train_one_fold``).
    """
    set_seed(42)
    df = pd.read_csv('train.csv')
    folds = pd.read_csv('folds.csv')
    # Attach the precomputed fold id to each training image.
    df_folds = df.merge(folds[['Image','fold']], on='Image', how='left')
    device = get_device(); print('Device:', device)
    oof_ranked = []; oof_true = []
    # Test frame: the 'Id' column is only a placeholder.
    test_df = pd.read_csv('sample_submission.csv')[['Image']].copy(); test_df['Id'] = 'new_whale'
    te_ranked_folds = []
    os.makedirs('embeddings', exist_ok=True)
    for f in range(5):
        print(f"=== Training fold {f} ===", flush=True)
        model, l2i, i2l, df_tr, df_va = train_one_fold(f, df_folds, img_size=img_size, epochs=epochs, batch_size=batch_size)
        # Build gallery (train part only), exclude new_whale
        gal_df = df_tr[df_tr.Id != 'new_whale'].copy()
        assert set(df_va['Image']).isdisjoint(set(gal_df['Image'])), 'Leakage: val images present in gallery!'
        tr_embs_gal = extract_embeddings(model, gal_df, img_size=img_size, batch_size=max(32, batch_size), img_dir=IM_DIR_TRAIN, tta_hflip=True)
        tr_labels_gal = gal_df['Id'].tolist()
        # Val embeddings and ranking
        val_embs = extract_embeddings(model, df_va, img_size=img_size, batch_size=max(32, batch_size), img_dir=IM_DIR_TRAIN, tta_hflip=True)
        val_ranked = per_class_max_similarity(val_embs, tr_embs_gal, tr_labels_gal, topK=300)
        oof_ranked.extend(val_ranked)
        oof_true.extend(df_va['Id'].tolist())
        # Test rankings for this fold
        te_embs = extract_embeddings(model, test_df, img_size=img_size, batch_size=max(32, batch_size), img_dir=IM_DIR_TEST, tta_hflip=True)
        te_ranked = per_class_max_similarity(te_embs, tr_embs_gal, tr_labels_gal, topK=300)
        te_ranked_folds.append(te_ranked)
        # Cache embeddings/dfs
        np.save(f'embeddings/f{f}_gal_embs.npy', tr_embs_gal)
        np.save(f'embeddings/f{f}_val_embs.npy', val_embs)
        np.save(f'embeddings/f{f}_te_embs.npy', te_embs)
        gal_df.to_csv(f'embeddings/f{f}_gal_df.csv', index=False)
        df_va.to_csv(f'embeddings/f{f}_val_df.csv', index=False)
        # free GPU
        del model; torch.cuda.empty_cache(); gc.collect()
    # Tune tau on OOF
    # NOTE(review): tau is tuned on single-model OOF similarities but applied
    # below to fold-averaged test similarities — confirm the two are comparable.
    best_tau, best_map5 = tune_new_whale_threshold(oof_ranked, oof_true)
    print(f"OOF tuned new_whale tau={best_tau:.3f}, OOF MAP@5={best_map5:.4f}")
    # Combine fold test scores
    te_combined = combine_fold_scores(te_ranked_folds)
    # Build final predictions (same rule as in tune_new_whale_threshold)
    preds5 = []
    for ranked in te_combined:
        # No candidates or best similarity below tau: put new_whale first.
        if len(ranked) == 0 or ranked[0][1] < best_tau:
            cand = ['new_whale'] + [c for c,_ in ranked][:4]
        else:
            cand = [c for c,_ in ranked][:5]
        # De-duplicate while keeping order, cap at five entries.
        uniq = []
        for c in cand:
            if c not in uniq:
                uniq.append(c)
            if len(uniq) == 5:
                break
        # Pad short lists with new_whale so each row has five labels.
        while len(uniq) < 5:
            uniq.append('new_whale')
        preds5.append(' '.join(uniq))
    # Relies on sample_submission.csv having the same row order as test_df above.
    sub = pd.read_csv('sample_submission.csv')
    sub['Id'] = preds5
    sub.to_csv('submission.csv', index=False)
    print('Saved submission.csv')

# Launch full 5-fold training + OOF tuning + fold-ensemble inference
run_full_5fold_pipeline(epochs=12, img_size=384, batch_size=48)

In [None]:
# Post-hoc: Recompute TEST rankings using FULL train gallery per fold; tune tau from cached OOF; write improved submission.csv
import os, time, gc
import numpy as np
import pandas as pd
import torch

def load_model_from_ckpt(ckpt_path, backbone_name='convnext_tiny', embed_dim=512):
    """Rebuild an ``EmbeddingModel`` and load its weights from a fold checkpoint.

    Args:
        ckpt_path: Path to a checkpoint dict whose 'model' entry holds the
            model state dict.
        backbone_name: timm backbone name used when the checkpoint was made.
        embed_dim: Embedding size used when the checkpoint was made.

    Returns:
        The model on the available device (CUDA if present, else CPU), in
        eval mode.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # pretrained=False: the weights come from the checkpoint.
    model = EmbeddingModel(backbone_name=backbone_name, embed_dim=embed_dim, pretrained=False).to(device)
    # weights_only=True restricts unpickling to tensors and plain containers,
    # so a tampered checkpoint cannot run arbitrary code on load.
    state = torch.load(ckpt_path, map_location=device, weights_only=True)
    model.load_state_dict(state['model'], strict=True)
    model.eval()
    return model

def extract_embeddings_df(model, df, img_size=384, batch_size=64, img_dir=IM_DIR_TRAIN, tta_hflip=True):
    """Thin alias for ``extract_embeddings`` that forwards every argument unchanged."""
    options = dict(img_size=img_size, batch_size=batch_size, img_dir=img_dir, tta_hflip=tta_hflip)
    return extract_embeddings(model, df, **options)

def recompute_oof_and_tune_tau_from_cache():
    """Rebuild the OOF rankings from cached embeddings and re-tune tau.

    For folds 0..4, loads the cached gallery/validation embeddings and
    frames from ``embeddings/``, ranks each validation row against its
    fold's gallery, then tunes the new_whale threshold on the pooled result.

    Returns:
        The tuned threshold (the MAP@5 is only printed).
    """
    pooled_ranked, pooled_true = [], []
    for f in range(5):
        gallery = np.load(f'embeddings/f{f}_gal_embs.npy')
        queries = np.load(f'embeddings/f{f}_val_embs.npy')
        gallery_ids = pd.read_csv(f'embeddings/f{f}_gal_df.csv')['Id'].tolist()
        true_ids = pd.read_csv(f'embeddings/f{f}_val_df.csv')['Id'].tolist()
        pooled_ranked += per_class_max_similarity(queries, gallery, gallery_ids, topK=300)
        pooled_true += true_ids
    tau, map5 = tune_new_whale_threshold(pooled_ranked, pooled_true, grid=None)
    print(f"[Posthoc] OOF re-tuned tau={tau:.3f}, OOF MAP@5={map5:.4f}")
    return tau

def test_full_gallery_inference(epochs_img_size=384, batch_size=64):
    """Rebuild ``submission.csv`` using the full labelled train set as gallery.

    For each of the 5 folds the fold checkpoint is loaded, gallery embeddings
    are extracted for every train image whose Id is not ``new_whale``, and the
    cached test embeddings (``embeddings/f{f}_te_embs.npy``) are ranked against
    that gallery with ``per_class_max_similarity`` (topK=300). Per-fold scores
    are merged with ``combine_fold_scores``; the new_whale threshold is
    re-tuned from the cached OOF embeddings and applied to the merged ranking.

    Args:
        epochs_img_size: Image size forwarded to the embedding extractor
            (the parameter name is kept for backward compatibility).
        batch_size: Extraction batch size; values below 32 are raised to 32.

    Side effects:
        Overwrites ``submission.csv`` and prints progress.
    """
    t0 = time.time()
    train_df = pd.read_csv('train.csv')
    full_gal_df = train_df[train_df.Id != 'new_whale'].copy()
    # The gallery rows (and hence their labels) are identical for every fold,
    # so the label list is built once instead of once per fold.
    gal_labels_full = full_gal_df['Id'].tolist()
    te_ranked_folds = []
    for f in range(5):
        print(f"[Posthoc] Fold {f}: loading checkpoint and extracting FULL train gallery embs...", flush=True)
        model = load_model_from_ckpt(f'checkpoints/fold{f}.pt', backbone_name='convnext_tiny', embed_dim=512)
        gal_embs_full = extract_embeddings_df(model, full_gal_df, img_size=epochs_img_size, batch_size=max(32, batch_size), img_dir=IM_DIR_TRAIN, tta_hflip=True)
        print(f"[Posthoc] Fold {f}: gallery embs shape {gal_embs_full.shape}")
        te_embs = np.load(f'embeddings/f{f}_te_embs.npy')
        te_ranked = per_class_max_similarity(te_embs, gal_embs_full, gal_labels_full, topK=300)
        te_ranked_folds.append(te_ranked)
        # Free the network before the next fold's model is created.
        del model; torch.cuda.empty_cache(); gc.collect()
    print(f"[Posthoc] Combining per-fold per-class scores...", flush=True)
    te_combined = combine_fold_scores(te_ranked_folds)
    print(f"[Posthoc] Recomputing OOF tau from cache...", flush=True)
    tau = recompute_oof_and_tune_tau_from_cache()
    preds5 = []
    for ranked in te_combined:
        ranked_ids = [c for c, _ in ranked]
        # Lead with new_whale when the best known-class score is below tau.
        if len(ranked) == 0 or ranked[0][1] < tau:
            cand = ['new_whale'] + ranked_ids[:4]
        else:
            cand = ranked_ids[:5]
        uniq = []
        for c in cand:
            if c not in uniq:
                uniq.append(c)
            if len(uniq) == 5:
                break
        # Pad short lists so every row carries exactly five labels.
        while len(uniq) < 5:
            uniq.append('new_whale')
        preds5.append(' '.join(uniq))
    sub = pd.read_csv('sample_submission.csv')
    sub['Id'] = preds5
    sub.to_csv('submission.csv', index=False)
    print(f"[Posthoc] Saved submission.csv using FULL-train gallery blend. Elapsed {time.time()-t0:.1f}s")

# Run post-hoc full-train gallery inference now (image size 384, batch size 64).
# The call overwrites submission.csv.
test_full_gallery_inference(epochs_img_size=384, batch_size=64)

In [None]:
# Post-hoc v2: Try per-class top-k mean similarity (k in {2,3}) using cached OOF to pick best, then recompute TEST with FULL-train gallery
import os, time, gc
import numpy as np
import pandas as pd
import torch
import faiss

def per_class_topk_mean_similarity(query_embs, gallery_embs, gallery_labels, topK=300, topn=2):
    """Rank gallery classes per query by the mean of their ``topn`` best similarities.

    An exact inner-product faiss index is built over ``gallery_embs`` and the
    ``min(topK, n_gallery)`` nearest gallery rows are retrieved for each query.
    Retrieved similarities are grouped by ``gallery_labels``; a class is scored
    by the mean of its ``topn`` largest similarities among the retrieved rows.

    Returns:
        One list per query of ``(class, score)`` tuples sorted by descending score.
    """
    dim = gallery_embs.shape[1]
    ip_index = faiss.IndexFlatIP(dim)
    ip_index.add(gallery_embs.astype('float32'))
    n_neighbors = min(topK, gallery_embs.shape[0])
    sims, idxs = ip_index.search(query_embs.astype('float32'), n_neighbors)
    results = []
    for qi in range(query_embs.shape[0]):
        per_class = {}
        for gal_pos, sim in zip(idxs[qi, :n_neighbors], sims[qi, :n_neighbors]):
            gal_pos = int(gal_pos)
            sim = float(sim)
            per_class.setdefault(gallery_labels[gal_pos], []).append(sim)
        scored = []
        for label, values in per_class.items():
            values.sort(reverse=True)
            scored.append((label, float(np.mean(values[:topn]))))
        results.append(sorted(scored, key=lambda pair: pair[1], reverse=True))
    return results

def eval_oof_with_agg(topn=2, topK=300):
    """Score the top-``topn``-mean aggregator on the cached OOF embeddings of folds 0-4.

    Each fold's cached validation embeddings are ranked against that fold's
    cached gallery with ``per_class_topk_mean_similarity``; the pooled rankings
    go through ``tune_new_whale_threshold`` with its default grid.

    Returns:
        ``(tau, map5)`` as returned by ``tune_new_whale_threshold``.
    """
    pooled_rankings, pooled_truth = [], []
    for f in range(5):
        gallery_vecs = np.load(f'embeddings/f{f}_gal_embs.npy')
        query_vecs = np.load(f'embeddings/f{f}_val_embs.npy')
        gallery_meta = pd.read_csv(f'embeddings/f{f}_gal_df.csv')
        query_meta = pd.read_csv(f'embeddings/f{f}_val_df.csv')
        pooled_rankings += per_class_topk_mean_similarity(
            query_vecs, gallery_vecs, gallery_meta['Id'].tolist(), topK=topK, topn=topn
        )
        pooled_truth += query_meta['Id'].tolist()
    tau, map5 = tune_new_whale_threshold(pooled_rankings, pooled_truth, grid=None)
    print(f"[Top{topn}-mean] OOF tau={tau:.3f}, MAP@5={map5:.4f}")
    return tau, map5

def run_topkmean_full_gallery_submission(img_size=384, batch_size=64, topn=2, topK=300):
    """Build ``submission.csv`` with a per-class top-k-mean score over the full train gallery.

    The aggregator (top-2 mean vs top-3 mean) is chosen on cached OOF data:
    top-3 is used only when its MAP@5 beats top-2 by more than 0.005. For each
    of the 5 folds the full-train gallery embeddings are read from
    ``embeddings/f{f}_gal_full_embs.npy`` when present, otherwise extracted
    with the fold checkpoint and cached there. Cached test embeddings are then
    ranked, fold scores are merged with ``combine_fold_scores`` and the tuned
    threshold decides whether ``new_whale`` leads each prediction.

    Args:
        img_size: Image size used when gallery embeddings must be extracted.
        batch_size: Extraction batch size; values below 32 are raised to 32.
        topn: Not read by this function (the aggregator is selected from OOF
            results); kept so existing callers keep working.
        topK: Number of nearest gallery rows retrieved per query.

    Side effects:
        May write gallery embedding caches; overwrites ``submission.csv``.
    """
    t0 = time.time()
    train_df = pd.read_csv('train.csv')
    full_gal_df = train_df[train_df.Id != 'new_whale'].copy()
    # Gallery rows are the same for every fold, so build the labels once.
    gal_labels_full = full_gal_df['Id'].tolist()
    # Select best aggregator on OOF
    tau2, map2 = eval_oof_with_agg(topn=2, topK=topK)
    tau3, map3 = eval_oof_with_agg(topn=3, topK=topK)
    if map3 > map2 + 0.005:
        use_topn, best_tau = 3, tau3
    else:
        use_topn, best_tau = 2, tau2
    print(f"[Select] Using top-{use_topn} mean; tau={best_tau:.3f}")
    te_ranked_folds = []
    for f in range(5):
        full_emb_path = f'embeddings/f{f}_gal_full_embs.npy'
        gal_embs_full = np.load(full_emb_path) if os.path.exists(full_emb_path) else None
        if gal_embs_full is not None and gal_embs_full.shape[0] != len(gal_labels_full):
            # A cache built from a different train.csv would pair embeddings
            # with the wrong labels; rebuild instead of trusting it.
            print(f"[Top{use_topn}-mean] Fold {f}: cached gallery has {gal_embs_full.shape[0]} rows, expected {len(gal_labels_full)}; recomputing", flush=True)
            gal_embs_full = None
        if gal_embs_full is None:
            # Only build the network and read the checkpoint on a cache miss;
            # with a warm cache nothing below needs the model.
            ckpt = f'checkpoints/fold{f}.pt'
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            model = EmbeddingModel(backbone_name='convnext_tiny', embed_dim=512, pretrained=False).to(device)
            state = torch.load(ckpt, map_location=device)
            model.load_state_dict(state['model'], strict=True)
            model.eval()
            gal_embs_full = extract_embeddings(model, full_gal_df, img_size=img_size, batch_size=max(32, batch_size), img_dir=IM_DIR_TRAIN, tta_hflip=True)
            np.save(full_emb_path, gal_embs_full)
            del model, state; torch.cuda.empty_cache(); gc.collect()
        te_embs = np.load(f'embeddings/f{f}_te_embs.npy')
        te_ranked = per_class_topk_mean_similarity(te_embs, gal_embs_full, gal_labels_full, topK=topK, topn=use_topn)
        te_ranked_folds.append(te_ranked)
    te_combined = combine_fold_scores(te_ranked_folds)
    preds5 = []
    for ranked in te_combined:
        ranked_ids = [c for c, _ in ranked]
        # Lead with new_whale when the best known-class score is below the threshold.
        if len(ranked) == 0 or ranked[0][1] < best_tau:
            cand = ['new_whale'] + ranked_ids[:4]
        else:
            cand = ranked_ids[:5]
        uniq = []
        for c in cand:
            if c not in uniq:
                uniq.append(c)
            if len(uniq) == 5:
                break
        # Pad short lists so every row carries exactly five labels.
        while len(uniq) < 5:
            uniq.append('new_whale')
        preds5.append(' '.join(uniq))
    sub = pd.read_csv('sample_submission.csv')
    sub['Id'] = preds5
    sub.to_csv('submission.csv', index=False)
    print(f"[Top{use_topn}-mean] Saved submission.csv (topK={topK}) in {time.time()-t0:.1f}s")

# Execute top-k mean submission build.
# NOTE: the function selects top-2 vs top-3 mean from OOF scores itself; the
# topn argument passed here is not read inside it.
run_topkmean_full_gallery_submission(img_size=384, batch_size=64, topn=2, topK=300)

In [6]:
# ConvNeXt-Small @512 with PK sampler + EMA; train 5 folds; ensemble with Tiny post-hoc using full-train gallery; re-tune tau on combined OOF
import os, time, gc, math
import numpy as np
import pandas as pd
from collections import defaultdict
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Subset, Sampler
from torch.optim import AdamW
from torch.optim.lr_scheduler import SequentialLR, LinearLR, CosineAnnealingLR
import faiss

# Request expandable segments from the CUDA caching allocator unless the
# variable is already set in the environment (setdefault never overrides).
# NOTE(review): presumably only effective if set before the first CUDA allocation — confirm.
os.environ.setdefault('PYTORCH_CUDA_ALLOC_CONF','expandable_segments:True')
# Allow TF32 for CUDA matmuls and for cuDNN operations.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

class EMA:
    """Exponential moving average of a model's trainable parameters.

    Shadow (averaged) weights are kept on CPU to save VRAM. Typical use:
    call ``update`` after every optimizer step, then wrap evaluation or
    checkpointing in ``apply_shadow`` / ``restore``.

    Only parameters with ``requires_grad=True`` are tracked; buffers are not.
    """

    def __init__(self, model, decay=0.9998):
        self.decay = decay
        self.shadow = {}  # store on CPU to save VRAM
        self.backup = {}  # live weights saved while the shadow is applied
        self.register(model)

    def register(self, model):
        """Snapshot the current trainable parameters as the initial average."""
        for name, p in model.named_parameters():
            if p.requires_grad:
                # clone(): .cpu() is a no-op view for tensors already on CPU.
                self.shadow[name] = p.detach().cpu().clone()

    def update(self, model):
        """Blend the current weights into the average: ``decay*avg + (1-decay)*w``."""
        for name, p in model.named_parameters():
            if not p.requires_grad:
                continue
            w_cpu = p.detach().cpu()
            previous = self.shadow.get(name)
            if previous is None:
                # Parameter became trainable after register(): start tracking
                # it instead of failing with a KeyError.
                self.shadow[name] = w_cpu.clone()
            else:
                # The arithmetic already allocates a fresh tensor.
                self.shadow[name] = (1.0 - self.decay) * w_cpu + self.decay * previous

    def apply_shadow(self, model):
        """Copy the averaged weights into ``model``, remembering the live weights.

        Calling this again before ``restore`` keeps the first backup, so the
        live training weights are never overwritten by averaged ones.
        """
        already_applied = bool(self.backup)
        for name, p in model.named_parameters():
            if p.requires_grad and name in self.shadow:
                if not already_applied:
                    self.backup[name] = p.detach().clone()
                p.data.copy_(self.shadow[name].to(p.device, dtype=p.dtype))

    def restore(self, model):
        """Put back the weights saved by ``apply_shadow`` and drop the backup."""
        for name, p in model.named_parameters():
            if p.requires_grad and name in self.backup:
                p.data.copy_(self.backup[name].data)
        self.backup = {}

def build_pk_batches(labels, P=16, K=4, drop_last=True, rng=None):
    """Build index batches holding ``P`` classes with ``K`` samples each.

    Args:
        labels: Sequence of class indices, one per dataset position
            (values are converted with ``int``).
        P: Number of distinct classes per batch.
        K: Number of samples drawn per chosen class; classes with fewer than
            ``K`` samples are sampled with replacement.
        drop_last: When True (default) only batches of exactly ``P*K`` indices
            are returned. When False, shorter batches are kept too; these
            occur when fewer than ``P`` classes exist.
        rng: ``numpy.random.Generator``; defaults to ``default_rng(42)``.

    Returns:
        A list of batches, each a list of Python ``int`` positions into
        ``labels``. ``max(1, len(labels) // (P*K))`` batches are attempted.

    Raises:
        RuntimeError: If ``labels`` is empty.
    """
    if rng is None:
        rng = np.random.default_rng(42)
    idx_by_cls = defaultdict(list)
    for i, y in enumerate(labels):
        idx_by_cls[int(y)].append(i)
    # include all classes; sample with replacement if class has <K samples
    classes = list(idx_by_cls)
    if not classes:
        raise RuntimeError('No classes available for PK batching')
    full_size = P * K
    total = sum(len(members) for members in idx_by_cls.values())
    n_batches = max(1, total // full_size)
    n_classes_per_batch = min(P, len(classes))
    batches = []
    for _ in range(n_batches):
        chosen = rng.choice(classes, size=n_classes_per_batch, replace=False)
        batch = []
        for c in chosen:
            idxs = idx_by_cls[int(c)]
            pick = rng.choice(idxs, size=K, replace=(len(idxs) < K))
            batch.extend(int(x) for x in pick)
        # A batch is short only when fewer than P classes exist; honour
        # drop_last instead of always discarding it.
        if len(batch) == full_size or not drop_last:
            batches.append(batch)
    return batches

class PKBatchSampler(Sampler):
    """Batch sampler that yields the index batches produced by ``build_pk_batches``.

    A fresh set of batches is drawn on every pass, seeded with
    ``seed + epoch`` where ``epoch`` counts the passes started so far.
    ``len()`` reports the size of the most recently built batch list.
    """

    def __init__(self, labels, P=16, K=4, drop_last=True, seed=42):
        self.labels = np.asarray(labels)
        self.P, self.K = P, K
        self.drop_last = drop_last
        self.seed = seed
        self.epoch = 0
        # Build once up front so __len__ is usable before the first pass.
        self.batches = self._draw(np.random.default_rng(self.seed))

    def _draw(self, rng):
        # Single place that forwards the sampler settings to the batch builder.
        return build_pk_batches(self.labels, P=self.P, K=self.K, drop_last=self.drop_last, rng=rng)

    def __iter__(self):
        self.batches = self._draw(np.random.default_rng(self.seed + self.epoch))
        self.epoch += 1
        yield from self.batches

    def __len__(self):
        return len(self.batches)

def per_class_max_similarity(query_embs, gallery_embs, gallery_labels, topK=500):
    """Rank gallery classes per query by their single best inner-product match.

    An exact inner-product faiss index is built over ``gallery_embs``; for
    each query the ``min(topK, n_gallery)`` nearest gallery rows are retrieved
    and every class keeps the highest similarity among its retrieved rows.

    Returns:
        One list per query of ``(class, score)`` tuples sorted by descending score.
    """
    dim = gallery_embs.shape[1]
    flat_ip = faiss.IndexFlatIP(dim)
    flat_ip.add(gallery_embs.astype('float32'))
    n_hits = min(topK, gallery_embs.shape[0])
    sims, idxs = flat_ip.search(query_embs.astype('float32'), n_hits)
    all_rankings = []
    for q in range(query_embs.shape[0]):
        top_score = {}
        for gal_pos, raw_score in zip(idxs[q, :n_hits], sims[q, :n_hits]):
            gal_pos = int(gal_pos)
            score = float(raw_score)
            label = gallery_labels[gal_pos]
            if label not in top_score or score > top_score[label]:
                top_score[label] = score
        ordered = list(top_score.items())
        ordered.sort(key=lambda pair: pair[1], reverse=True)
        all_rankings.append(ordered)
    return all_rankings

def merge_rank_lists_weighted(r_tiny, r_small, w=0.5):
    """Blend two ``(class, score)`` lists as ``(1-w)*tiny + w*small``.

    A class missing from one list contributes nothing from that list. The
    result is a list of ``(class, blended_score)`` sorted by descending score.
    """
    blended = {}
    for ranking, weight in ((r_tiny, 1.0 - w), (r_small, w)):
        for label, score in ranking:
            blended[label] = blended.get(label, 0.0) + weight * score
    return sorted(blended.items(), key=lambda pair: pair[1], reverse=True)

def merge_rank_lists_equal(r1, r2):
    """Average the scores each class received across two ``(class, score)`` lists.

    A class present in only one list keeps that single score. The result is a
    list of ``(class, mean_score)`` sorted by descending score.
    """
    collected = {}
    for ranking in (r1, r2):
        for label, score in ranking:
            collected.setdefault(label, []).append(score)
    averaged = [(label, float(np.mean(scores))) for label, scores in collected.items()]
    return sorted(averaged, key=lambda pair: pair[1], reverse=True)

def tune_new_whale_threshold(val_ranked_lists, val_true_ids, grid=None):
    """Grid-search the similarity threshold below which ``new_whale`` is predicted first.

    For every ``tau`` in ``grid`` (default: 61 evenly spaced values in
    [0.2, 0.8]) a top-5 list is built per query: ``new_whale`` leads when the
    ranking is empty or its best score is below ``tau``, otherwise the five
    best classes are used; lists are de-duplicated and padded with
    ``new_whale``. The threshold with the highest MAP@5 wins (first one on ties).

    Returns:
        ``(best_tau, best_map5)``; ``(0.5, -1.0)`` if no threshold improves on -1.
    """
    thresholds = np.linspace(0.2, 0.8, 61) if grid is None else grid

    def top5_labels(ranked, tau):
        labels = [label for label, _ in ranked]
        if len(ranked) == 0 or ranked[0][1] < tau:
            candidates = ['new_whale'] + labels[:4]
        else:
            candidates = labels[:5]
        picked = []
        for label in candidates:
            if label not in picked:
                picked.append(label)
            if len(picked) == 5:
                break
        picked.extend(['new_whale'] * (5 - len(picked)))
        return picked

    def reciprocal_rank(truth, predicted):
        for position, label in enumerate(predicted[:5], start=1):
            if label == truth:
                return 1.0 / position
        return 0.0

    best_tau, best_map5 = 0.5, -1.0
    for tau in thresholds:
        predictions = [top5_labels(ranked, tau) for ranked in val_ranked_lists]
        per_query = [reciprocal_rank(t, p) for t, p in zip(val_true_ids, predictions)]
        map5 = float(np.mean(per_query))
        if map5 > best_map5:
            best_tau, best_map5 = tau, map5
    return best_tau, best_map5

def extract_embeddings(model, df, img_size=512, batch_size=64, img_dir=IM_DIR_TRAIN, tta_hflip=True):
    """Compute L2-normalised float32 embeddings for every row of ``df``.

    Images are loaded through ``ImageDS`` with the evaluation transforms from
    ``build_transforms(size=img_size, train=False)`` and passed through
    ``model`` in eval mode under ``torch.no_grad`` with autocast enabled when
    CUDA is available. With ``tta_hflip`` the embedding of the horizontally
    flipped image is averaged in before normalisation.

    Args:
        model: Embedding network. NOTE(review): assumed to already live on the
            device chosen here (CUDA if available) — confirm at call sites.
        df: Frame with ``Image`` and ``Id`` columns; row order is preserved.
        img_size: Size forwarded to ``build_transforms``.
        batch_size: DataLoader batch size.
        img_dir: Directory the images are read from.
        tta_hflip: Average original and flipped embeddings when True.

    Returns:
        A contiguous ``float32`` array with one unit-norm row per ``df`` row.

    Raises:
        ValueError: If ``df`` yields no batches.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.eval()
    tfm = build_transforms(size=img_size, train=False)
    ds = ImageDS(df, img_col='Image', label_col='Id', img_dir=img_dir, tfm=tfm, label2idx=None)
    dl = DataLoader(ds, batch_size=batch_size, shuffle=False, num_workers=8, pin_memory=True, persistent_workers=True)
    embs = []
    with torch.no_grad():
        for imgs, _ in dl:
            imgs = imgs.to(device, non_blocking=True).to(memory_format=torch.channels_last)
            with torch.amp.autocast('cuda', enabled=torch.cuda.is_available()):
                e = model(imgs)
                if tta_hflip:
                    e = (e + model(torch.flip(imgs, dims=[3]))) / 2.0
            # Autocast may hand back half-precision tensors; promote before
            # leaving torch so the NumPy side is always float32.
            embs.append(e.detach().float().cpu().numpy())
    if not embs:
        raise ValueError('extract_embeddings received no images to embed')
    # faiss.normalize_L2 works in place and needs a C-contiguous float32 matrix.
    embs = np.ascontiguousarray(np.concatenate(embs, axis=0), dtype=np.float32)
    faiss.normalize_L2(embs)
    return embs

def build_transforms_512(train=True):
    """Return the torchvision pipeline used for 512-pixel inputs.

    Training: random resized crop, horizontal flip, colour jitter and rare
    grayscale conversion. Evaluation: resize then centre crop. Both end with
    tensor conversion and normalisation using the same mean/std constants.
    """
    import torchvision.transforms as T
    size = 512
    if train:
        spatial_and_colour = [
            T.RandomResizedCrop(size, scale=(0.88, 1.0), ratio=(0.75, 1.33)),
            T.RandomHorizontalFlip(0.5),
            T.ColorJitter(0.15,0.15,0.15,0.05),
            T.RandomGrayscale(p=0.05),
        ]
    else:
        spatial_and_colour = [
            T.Resize(int(size*1.15)),
            T.CenterCrop(size),
        ]
    # Shared tail: identical for the training and evaluation pipelines.
    to_normalised_tensor = [
        T.ToTensor(),
        T.Normalize(mean=(0.485,0.456,0.406), std=(0.229,0.224,0.225)),
    ]
    return T.Compose(spatial_and_colour + to_normalised_tensor)

def train_convnext_small_fold(fold, df_folds, epochs=25, P=16, K=4, lr=3e-4, wd=0.05, arc_s=45.0, arc_m=0.35, embed_dim=512):
    """Train a ConvNeXt-Small embedding model on one CV fold and build its OOF ranking.

    Rows of ``df_folds`` whose ``fold`` column equals ``fold`` form the
    validation set; all other rows are used for training. Training uses PK
    batches (``P`` classes x ``K`` samples), an ``ArcMarginProduct`` head with
    cross-entropy, AdamW, a one-epoch linear warm-up followed by cosine
    annealing, AMP, gradient clipping at norm 1.0 and an EMA of the weights
    updated after every iteration.

    Side effects:
        * Saves the EMA weights and ``label2idx`` to ``checkpoints_small/fold{fold}.pt``.
        * Caches gallery/validation embeddings and their frames under ``embeddings_small/``.

    Returns:
        ``(model, val_ranked, val_frame)`` where ``model`` carries the restored
        (non-EMA) training weights, ``val_ranked`` is the per-query ranking
        produced with the EMA weights against the training gallery without
        ``new_whale`` rows, and ``val_frame`` holds the validation ``Image``/``Id`` columns.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    val_mask = df_folds['fold'] == fold
    train_mask = (df_folds['fold'] != fold)
    df_tr = df_folds.loc[train_mask].copy()
    df_va = df_folds.loc[val_mask].copy()
    label2idx, idx2label = build_label_mapping(df_tr, label_col='Id')
    n_classes = len(label2idx)
    tfm_tr = build_transforms_512(train=True)
    tfm_va = build_transforms_512(train=False)
    # dataset
    ds_tr = ImageDS(df_tr, img_col='Image', label_col='Id', img_dir=IM_DIR_TRAIN, tfm=tfm_tr, label2idx=label2idx)
    # Ids missing from label2idx map to -1 and are left out of training.
    # NOTE(review): presumably this is how new_whale rows are excluded — confirm in build_label_mapping.
    y_tr_full = ds_tr.df['Id'].map(label2idx).fillna(-1).to_numpy()
    train_indices = np.where(y_tr_full != -1)[0]
    ds_tr_sub = Subset(ds_tr, train_indices)
    y_sub = y_tr_full[train_indices]
    # PK batch sampler on subset
    pk_sampler = PKBatchSampler(y_sub, P=P, K=K, drop_last=True, seed=42+fold)
    dl_tr = DataLoader(ds_tr_sub, batch_sampler=pk_sampler, num_workers=8, pin_memory=True, persistent_workers=True)
    # model
    model = EmbeddingModel(backbone_name='convnext_small', embed_dim=embed_dim, pretrained=True, drop_path_rate=0.1).to(device)
    model.backbone = model.backbone.to(memory_format=torch.channels_last)
    # enable gradient checkpointing on backbone (fallback if not available)
    try:
        if hasattr(model.backbone, 'set_grad_checkpointing'):
            model.backbone.set_grad_checkpointing(True)
            print('[Small] Enabled gradient checkpointing on backbone')
        else:
            print('[Small] Grad checkpointing method not found on backbone; continuing without it')
    except Exception as e:
        print('[Small] Grad checkpointing not enabled:', e)
    arc = ArcMarginProduct(embed_dim, n_classes, s=arc_s, m=arc_m).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = AdamW(list(model.parameters()) + list(arc.parameters()), lr=lr, weight_decay=wd)
    # Schedule is stepped once per epoch: 1 warm-up epoch, then cosine decay.
    main = CosineAnnealingLR(optimizer, T_max=max(1, epochs-1), eta_min=1e-6)
    warm = LinearLR(optimizer, start_factor=0.1, end_factor=1.0, total_iters=1)
    scheduler = SequentialLR(optimizer, [warm, main], milestones=[1])
    scaler = torch.amp.GradScaler('cuda', enabled=torch.cuda.is_available())
    # EMA tracks the embedding model only; the ArcFace head is not averaged.
    ema = EMA(model, decay=0.9998)
    t0 = time.time()
    for ep in range(1, epochs+1):
        model.train(); arc.train()
        running = 0.0; n = 0; t_ep = time.time()
        for it, batch in enumerate(dl_tr):
            imgs, targets = batch
            imgs = imgs.to(device, non_blocking=True).to(memory_format=torch.channels_last)
            targets = targets.to(device)
            optimizer.zero_grad(set_to_none=True)
            with torch.amp.autocast('cuda', enabled=torch.cuda.is_available()):
                emb = model(imgs)
                logits = arc(emb, targets)
                loss = criterion(logits, targets)
            scaler.scale(loss).backward()
            # gradient clipping for stability
            # (gradients are unscaled first so the norm limit applies to true gradients)
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(list(model.parameters()) + list(arc.parameters()), max_norm=1.0)
            scaler.step(optimizer); scaler.update()
            ema.update(model)
            # Sample-weighted running loss for progress reporting.
            running += loss.item() * targets.size(0); n += targets.size(0)
            if (it+1) % 50 == 0:
                print(f"[Small Fold {fold}] Ep {ep} It {it+1} loss={running/max(n,1):.4f} elaps={time.time()-t_ep:.1f}s", flush=True)
        scheduler.step()
        print(f"[Small Fold {fold}] Epoch {ep}/{epochs} tr_loss={running/max(n,1):.4f} lr={scheduler.get_last_lr()[0]:.6f}")
    # save ema weights
    ema.apply_shadow(model)
    os.makedirs('checkpoints_small', exist_ok=True)
    torch.save({'model': model.state_dict(), 'label2idx': label2idx}, f'checkpoints_small/fold{fold}.pt')
    ema.restore(model)  # keep training model state clean
    print(f"[Small Fold {fold}] Saved EMA checkpoint. Time {time.time()-t0:.1f}s")
    # Build OOF ranked using train-excl-val gallery (no leakage), exclude new_whale
    # The EMA weights are swapped in again for embedding extraction.
    ema.apply_shadow(model)
    gal_df = df_tr[df_tr.Id != 'new_whale'].copy()
    tr_embs_gal = extract_embeddings(model, gal_df, img_size=512, batch_size=48, img_dir=IM_DIR_TRAIN, tta_hflip=True)
    tr_labels_gal = gal_df['Id'].tolist()
    val_embs = extract_embeddings(model, df_va, img_size=512, batch_size=48, img_dir=IM_DIR_TRAIN, tta_hflip=True)
    val_ranked = per_class_max_similarity(val_embs, tr_embs_gal, tr_labels_gal, topK=500)
    # cache
    os.makedirs('embeddings_small', exist_ok=True)
    np.save(f'embeddings_small/f{fold}_gal_embs.npy', tr_embs_gal)
    np.save(f'embeddings_small/f{fold}_val_embs.npy', val_embs)
    gal_df.to_csv(f'embeddings_small/f{fold}_gal_df.csv', index=False)
    df_va.to_csv(f'embeddings_small/f{fold}_val_df.csv', index=False)
    # Hand back the model with its raw training weights, not the EMA ones.
    ema.restore(model)
    return model, val_ranked, df_va[['Image','Id']].copy()

def run_convnext_small_pipeline(epochs=25, P_first=8, P_others=10, K=4):
    """Train/resume ConvNeXt-Small on 5 folds, then write a Tiny+Small ensemble submission.

    Stages:
        1. Per fold, reuse cached OOF embeddings if checkpoint and caches exist,
           re-extract embeddings if only the checkpoint exists, otherwise train
           the fold with ``train_convnext_small_fold``.
        2. Tune the new_whale threshold on the Small OOF rankings (printed only).
        3. For every fold, rank test images against the full-train gallery
           (``new_whale`` rows removed) for both Small and Tiny models.
        4. Grid-search the blend weight ``w`` and threshold ``tau`` on OOF rankings
           of images present in both models' OOF sets.
        5. Blend the fold-averaged test scores with the best ``w``, apply the best
           ``tau`` and overwrite ``submission.csv``.

    Args:
        epochs: Training epochs per fold.
        P_first: Classes per PK batch for fold 0.
        P_others: Classes per PK batch for folds 1-4.
        K: Samples per class in each PK batch.
    """
    # Seeds NumPy and torch (CPU + all CUDA devices).
    set_seed = lambda s=42: (np.random.seed(s), torch.manual_seed(s), torch.cuda.manual_seed_all(s))
    set_seed(42)
    df = pd.read_csv('train.csv')
    folds = pd.read_csv('folds.csv')
    df_folds = df.merge(folds[['Image','fold']], on='Image', how='left')
    all_oof_ranked_small = []; all_oof_true_small = []; all_oof_img_small = []
    for f in range(5):
        P_this = P_first if f == 0 else P_others
        ckpt_path = f'checkpoints_small/fold{f}.pt'
        gal_emb_path = f'embeddings_small/f{f}_gal_embs.npy'
        val_emb_path = f'embeddings_small/f{f}_val_embs.npy'
        gal_df_path = f'embeddings_small/f{f}_gal_df.csv'
        val_df_path = f'embeddings_small/f{f}_val_df.csv'
        # Case 1: everything cached -> only re-rank the stored embeddings.
        if os.path.exists(ckpt_path) and os.path.exists(gal_emb_path) and os.path.exists(val_emb_path) and os.path.exists(gal_df_path) and os.path.exists(val_df_path):
            print(f"=== Small: Skipping training fold {f} (cache found) ===", flush=True)
            gal_embs = np.load(gal_emb_path)
            val_embs = np.load(val_emb_path)
            gal_df = pd.read_csv(gal_df_path)
            val_df = pd.read_csv(val_df_path)
            gal_labels = gal_df['Id'].tolist()
            val_ranked = per_class_max_similarity(val_embs, gal_embs, gal_labels, topK=500)
            all_oof_ranked_small.extend(val_ranked); all_oof_true_small.extend(val_df['Id'].tolist()); all_oof_img_small.extend(val_df['Image'].tolist())
            continue
        # Case 2: checkpoint only -> rebuild and cache the OOF embeddings.
        elif os.path.exists(ckpt_path):
            print(f"=== Small: Fold {f} checkpoint found; running embedding extraction only ===", flush=True)
            val_mask = df_folds['fold'] == f
            train_mask = (df_folds['fold'] != f)
            df_tr = df_folds.loc[train_mask].copy()
            df_va = df_folds.loc[val_mask].copy()
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            m = EmbeddingModel(backbone_name='convnext_small', embed_dim=512, pretrained=False, drop_path_rate=0.1).to(device)
            state = torch.load(ckpt_path, map_location=device)
            m.load_state_dict(state['model'], strict=True); m.eval()
            gal_df = df_tr[df_tr.Id != 'new_whale'].copy()
            tr_embs_gal = extract_embeddings(m, gal_df, img_size=512, batch_size=48, img_dir=IM_DIR_TRAIN, tta_hflip=True)
            val_embs = extract_embeddings(m, df_va, img_size=512, batch_size=48, img_dir=IM_DIR_TRAIN, tta_hflip=True)
            val_ranked = per_class_max_similarity(val_embs, tr_embs_gal, gal_df['Id'].tolist(), topK=500)
            os.makedirs('embeddings_small', exist_ok=True)
            np.save(gal_emb_path, tr_embs_gal); np.save(val_emb_path, val_embs)
            gal_df.to_csv(gal_df_path, index=False); df_va.to_csv(val_df_path, index=False)
            all_oof_ranked_small.extend(val_ranked); all_oof_true_small.extend(df_va['Id'].tolist()); all_oof_img_small.extend(df_va['Image'].tolist())
            del m; torch.cuda.empty_cache(); gc.collect()
            continue
        # Case 3: nothing on disk -> train the fold from scratch.
        else:
            print(f"=== Small: Training fold {f} (P={P_this},K={K}) ===", flush=True)
            model, va_ranked, va_df = train_convnext_small_fold(f, df_folds, epochs=epochs, P=P_this, K=K)
            all_oof_ranked_small.extend(va_ranked); all_oof_true_small.extend(va_df['Id'].tolist()); all_oof_img_small.extend(va_df['Image'].tolist())
            del model; torch.cuda.empty_cache(); gc.collect()
    tau_small, map_small = tune_new_whale_threshold(all_oof_ranked_small, all_oof_true_small, grid=None)
    print(f"[Small] OOF tau={tau_small:.3f}, MAP@5={map_small:.4f}")
    # Build ensemble with Tiny using full-train gallery per fold
    print('[Ensemble] Computing TEST rankings for Small (full-train gallery) and loading Tiny cached full gallery...')
    train_df = pd.read_csv('train.csv')
    full_gal_df = train_df[train_df.Id != 'new_whale'].copy()
    te_df = pd.read_csv('sample_submission.csv')[['Image']].copy(); te_df['Id'] = 'new_whale'
    te_ranked_small_folds = []; te_ranked_tiny_folds = []
    for f in range(5):
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # Small model
        m_small = EmbeddingModel(backbone_name='convnext_small', embed_dim=512, pretrained=False, drop_path_rate=0.1).to(device)
        state = torch.load(f'checkpoints_small/fold{f}.pt', map_location=device)
        m_small.load_state_dict(state['model'], strict=True); m_small.eval()
        # The Small full-train gallery is re-extracted on every run (no cache); test embeddings are cached.
        gal_small = extract_embeddings(m_small, full_gal_df, img_size=512, batch_size=48, img_dir=IM_DIR_TRAIN, tta_hflip=True)
        te_small_emb_path = f'embeddings_small/te_embs_small_f{f}.npy'
        if os.path.exists(te_small_emb_path):
            te_embs = np.load(te_small_emb_path)
        else:
            te_embs = extract_embeddings(m_small, te_df, img_size=512, batch_size=48, img_dir=IM_DIR_TEST, tta_hflip=True)
            os.makedirs('embeddings_small', exist_ok=True)
            np.save(te_small_emb_path, te_embs)
        labs_full = full_gal_df['Id'].tolist()
        te_ranked_small = per_class_max_similarity(te_embs, gal_small, labs_full, topK=500)
        te_ranked_small_folds.append(te_ranked_small)
        del m_small; torch.cuda.empty_cache(); gc.collect()
        # Tiny model
        if os.path.exists(f'embeddings/f{f}_gal_full_embs.npy'):
            gal_tiny = np.load(f'embeddings/f{f}_gal_full_embs.npy')
        else:
            m_tiny = EmbeddingModel(backbone_name='convnext_tiny', embed_dim=512, pretrained=False).to(device)
            st = torch.load(f'checkpoints/fold{f}.pt', map_location=device)
            m_tiny.load_state_dict(st['model'], strict=True); m_tiny.eval()
            gal_tiny = extract_embeddings(m_tiny, full_gal_df, img_size=384, batch_size=64, img_dir=IM_DIR_TRAIN, tta_hflip=True)
            np.save(f'embeddings/f{f}_gal_full_embs.npy', gal_tiny)
            del m_tiny; torch.cuda.empty_cache(); gc.collect()
        te_embs_tiny = np.load(f'embeddings/f{f}_te_embs.npy')
        te_ranked_tiny = per_class_max_similarity(te_embs_tiny, gal_tiny, labs_full, topK=500)
        te_ranked_tiny_folds.append(te_ranked_tiny)
    # Average across folds within each model
    # (a class is averaged only over the folds in which it was retrieved)
    def combine_fold_scores(rank_lists_per_fold):
        n_folds = len(rank_lists_per_fold); N = len(rank_lists_per_fold[0]); out = []
        for i in range(N):
            d = defaultdict(list)
            for f in range(n_folds):
                for c, s in rank_lists_per_fold[f][i]: d[c].append(s)
            arr = [(c, float(np.mean(v))) for c, v in d.items()]
            arr.sort(key=lambda x: x[1], reverse=True); out.append(arr)
        return out
    te_small_comb = combine_fold_scores(te_ranked_small_folds)
    te_tiny_comb = combine_fold_scores(te_ranked_tiny_folds)
    # Recompute combined OOF (tiny+small) for tau/weight tuning using cached per-model OOFs, aligned by Image
    oof_ranked_tiny = {}; oof_true = {}; order_imgs = []
    for f in range(5):
        gal_embs = np.load(f'embeddings/f{f}_gal_embs.npy')
        val_embs = np.load(f'embeddings/f{f}_val_embs.npy')
        gal_df = pd.read_csv(f'embeddings/f{f}_gal_df.csv')
        val_df = pd.read_csv(f'embeddings/f{f}_val_df.csv')
        gal_labels = gal_df['Id'].tolist()
        ranked = per_class_max_similarity(val_embs, gal_embs, gal_labels, topK=500)
        for img, rnk, true_id in zip(val_df['Image'].tolist(), ranked, val_df['Id'].tolist()):
            oof_ranked_tiny[img] = rnk
            if img not in oof_true:
                oof_true[img] = true_id
                order_imgs.append(img)
    oof_ranked_small = {}
    for img, rnk in zip(all_oof_img_small, all_oof_ranked_small):
        oof_ranked_small[img] = rnk
    # Grid-search weight w and tau
    # w is the Small model's share: merged = (1-w)*tiny + w*small.
    best_w, best_tau, best_map = 0.5, 0.5, -1.0
    w_grid = np.linspace(0.0, 1.0, 21)
    for w in w_grid:
        merged_ranked = []
        true_list = []
        for img in order_imgs:
            # Only images ranked by both models take part in the tuning.
            if (img not in oof_ranked_tiny) or (img not in oof_ranked_small):
                continue
            merged_ranked.append(merge_rank_lists_weighted(oof_ranked_tiny[img], oof_ranked_small[img], w=w))
            true_list.append(oof_true[img])
        tau, m = tune_new_whale_threshold(merged_ranked, true_list, grid=None)
        if m > best_map:
            best_map, best_tau, best_w = m, tau, float(w)
    print(f"[Ensemble] OOF weight w={best_w:.2f}, tau={best_tau:.3f}, MAP@5={best_map:.4f}")
    # Ensemble test per-class scores across models with best_w and apply best_tau
    preds5 = []
    for i in range(len(te_tiny_comb)):
        merged = merge_rank_lists_weighted(te_tiny_comb[i], te_small_comb[i], w=best_w)
        if len(merged) == 0 or merged[0][1] < best_tau:
            cand = ['new_whale'] + [c for c,_ in merged][:4]
        else:
            cand = [c for c,_ in merged][:5]
        uniq = []
        for c in cand:
            if c not in uniq: uniq.append(c)
            if len(uniq) == 5: break
        while len(uniq) < 5: uniq.append('new_whale')
        preds5.append(' '.join(uniq))
    sub = pd.read_csv('sample_submission.csv')
    sub['Id'] = preds5
    sub.to_csv('submission.csv', index=False)
    print('[Ensemble] Saved submission.csv (Tiny@384 + Small@512, weighted per-class max, full-train gallery, tau tuned on merged OOF)')

# Kick off ConvNeXt-Small training + ensemble build
# (25 epochs; P=8 classes per batch for fold 0, P=10 for the other folds, K=4).
run_convnext_small_pipeline(epochs=25, P_first=8, P_others=10, K=4)

=== Small: Skipping training fold 0 (cache found) ===


=== Small: Fold 1 checkpoint found; running embedding extraction only ===


  state = torch.load(ckpt_path, map_location=device)


=== Small: Training fold 2 (P=10,K=4) ===


[Small] Enabled gradient checkpointing on backbone


  return fn(*args, **kwargs)


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[Small Fold 2] Ep 1 It 50 loss=25.4914 elaps=1099.3s


In [13]:
# Partial-fold ensemble builder: Tiny (5 folds) + Small (available folds only) with OOF-aligned weighting and tau+margin; caching gal_full; topK=300
import os, gc, time
import numpy as np
import pandas as pd
import torch
from collections import defaultdict

def per_class_max_similarity(query_embs, gallery_embs, gallery_labels, topK=300):
    """Rank gallery classes for each query by their best inner-product match.

    Retrieves the ``min(topK, n_gallery)`` nearest gallery rows per query from
    an exact inner-product faiss index and keeps, per class, the highest
    similarity among the retrieved rows.

    Returns:
        One list per query of ``(class, score)`` tuples, best score first.
    """
    import faiss
    searcher = faiss.IndexFlatIP(gallery_embs.shape[1])
    searcher.add(gallery_embs.astype('float32'))
    depth = min(topK, gallery_embs.shape[0])
    sims, idxs = searcher.search(query_embs.astype('float32'), depth)

    def rank_query(row):
        # Best similarity seen so far for each class of this query.
        strongest = {}
        for col in range(depth):
            gal_pos = int(idxs[row, col])
            score = float(sims[row, col])
            label = gallery_labels[gal_pos]
            if label not in strongest or score > strongest[label]:
                strongest[label] = score
        return sorted(strongest.items(), key=lambda pair: pair[1], reverse=True)

    return [rank_query(row) for row in range(query_embs.shape[0])]

def tune_tau_delta(val_ranked_lists, val_true_ids, tau_grid_coarse=None, tau_window=0.05, tau_step_fine=0.005, delta_grid=None):
    """Tune the new_whale threshold ``tau`` and top1-top2 margin ``delta`` for MAP@5.

    ``new_whale`` is placed first when the best score is below ``tau`` or when
    the gap between the two best scores is below ``delta`` (a lone candidate
    is treated as having a runner-up score of -1.0). A coarse search covers
    every ``(delta, tau)`` pair; a fine search then refines ``tau`` within
    ``tau_window`` of the coarse optimum while keeping the coarse ``delta``.

    Returns:
        ``(tau, map5, delta)`` of the best setting found (first one on ties).
    """
    import numpy as np
    coarse_taus = np.arange(0.2, 0.801, 0.02) if tau_grid_coarse is None else tau_grid_coarse
    margins = [0.0, 0.03, 0.05, 0.08, 0.10, 0.12, 0.15] if delta_grid is None else delta_grid

    def top5_labels(ranked, tau, delta):
        if len(ranked) == 0:
            return ['new_whale'] * 5
        first_score = ranked[0][1]
        second_score = ranked[1][1] if len(ranked) > 1 else -1.0
        labels = [label for label, _ in ranked]
        if (first_score < tau) or ((first_score - second_score) < delta):
            candidates = ['new_whale'] + labels[:4]
        else:
            candidates = labels[:5]
        picked = []
        for label in candidates:
            if label not in picked:
                picked.append(label)
            if len(picked) == 5:
                break
        picked.extend(['new_whale'] * (5 - len(picked)))
        return picked

    def reciprocal_rank(truth, predicted):
        for position, label in enumerate(predicted[:5], start=1):
            if label == truth:
                return 1.0 / position
        return 0.0

    def evaluate(tau, delta):
        predictions = [top5_labels(ranked, tau, delta) for ranked in val_ranked_lists]
        return float(np.mean([reciprocal_rank(t, p) for t, p in zip(val_true_ids, predictions)]))

    best_map, best_tau, best_delta = -1.0, 0.5, 0.0
    # Coarse pass over the full (delta, tau) grid.
    for delta in margins:
        for tau in coarse_taus:
            score = evaluate(tau, delta)
            if score > best_map:
                best_map, best_tau, best_delta = score, float(tau), float(delta)
    # Fine pass: only tau moves, centred on the coarse winner.
    centre_tau, fixed_delta = best_tau, best_delta
    fine_taus = np.arange(max(0.0, centre_tau - tau_window), min(1.0, centre_tau + tau_window) + 1e-6, tau_step_fine)
    for tau in fine_taus:
        score = evaluate(tau, fixed_delta)
        if score > best_map:
            best_map, best_tau, best_delta = score, float(tau), float(fixed_delta)
    return best_tau, best_map, best_delta

def merge_rank_lists_weighted(r_tiny, r_small, w=0.5):
    """Blend two per-class score lists into a single ranking.

    Each class gets ``(1 - w) * tiny_score + w * small_score``; a class that
    is missing from one list simply receives no contribution from it.
    Returns ``(class, blended_score)`` pairs sorted by descending score.
    """
    blended = defaultdict(float)
    for weight, pairs in ((1.0 - w, r_tiny), (w, r_small)):
        for cls, score in pairs:
            blended[cls] += weight * score
    merged = list(blended.items())
    merged.sort(key=lambda item: item[1], reverse=True)
    return merged

def combine_fold_scores(rank_lists_per_fold):
    """Average per-class scores across folds, query by query.

    Args:
        rank_lists_per_fold: one entry per fold; each entry is a list (one
            element per query) of ``(class_id, score)`` pairs. The number of
            queries is taken from the first fold.

    Returns:
        One list per query of ``(class_id, mean_score)`` sorted by descending
        score. The mean is taken over the folds in which the class appeared,
        not over all folds. An empty fold list yields ``[]``.
    """
    if len(rank_lists_per_fold) == 0:
        return []
    n_queries = len(rank_lists_per_fold[0])
    out = []
    for i in range(n_queries):
        per_class = defaultdict(list)
        for fold_lists in rank_lists_per_fold:
            for c, s in fold_lists[i]:
                per_class[c].append(s)
        arr = [(c, float(np.mean(v))) for c, v in per_class.items()]
        arr.sort(key=lambda x: x[1], reverse=True)
        out.append(arr)
    return out

def build_partial_ensemble_and_submit():
    """Blend ConvNeXt-Tiny (5 folds) with the available ConvNeXt-Small folds and write submission.csv.

    Steps:
      1. Detect Small folds that have a checkpoint plus cached gallery/val
         embeddings and their metadata CSVs.
      2. Rebuild per-image OOF rankings for Tiny (all folds) and Small
         (available folds) and keep the images present in both.
      3. Grid-search the blend weight ``w`` jointly with tau/delta on OOF.
      4. Rank TEST images against the full labelled train gallery per fold,
         average folds per model, blend with ``w`` and apply the tau/delta
         new_whale rule.

    Uses names defined elsewhere in the notebook (``per_class_max_similarity``,
    ``EmbeddingModel``, ``extract_embeddings``, ``IM_DIR_TRAIN``,
    ``IM_DIR_TEST``). A Small checkpoint is only built and loaded when one of
    that fold's embedding caches is missing, mirroring the Tiny branch.

    Raises:
        AssertionError: if no Small fold is available, or if the Tiny and
            Small OOF sets share no image.
    """
    t0 = time.time()
    small_folds = []
    for f in range(5):
        required = (
            f'checkpoints_small/fold{f}.pt',
            f'embeddings_small/f{f}_gal_embs.npy',
            f'embeddings_small/f{f}_val_embs.npy',
            f'embeddings_small/f{f}_gal_df.csv',
            f'embeddings_small/f{f}_val_df.csv',
        )
        if all(os.path.exists(p) for p in required):
            small_folds.append(f)
    print(f"[Partial] Small folds available: {small_folds}")
    assert len(small_folds) > 0, 'No small folds available; run training or switch to Tiny-only submission.'

    # Tiny OOF dicts (image -> ranked list / true id); order_imgs keeps first-seen order
    oof_ranked_tiny = {}
    oof_true = {}
    order_imgs = []
    for f in range(5):
        gal_embs = np.load(f'embeddings/f{f}_gal_embs.npy')
        val_embs = np.load(f'embeddings/f{f}_val_embs.npy')
        gal_df = pd.read_csv(f'embeddings/f{f}_gal_df.csv')
        val_df = pd.read_csv(f'embeddings/f{f}_val_df.csv')
        gal_labels = gal_df['Id'].tolist()
        ranked = per_class_max_similarity(val_embs, gal_embs, gal_labels, topK=300)
        for img, rnk, true_id in zip(val_df['Image'].tolist(), ranked, val_df['Id'].tolist()):
            oof_ranked_tiny[img] = rnk
            if img not in oof_true:
                oof_true[img] = true_id
                order_imgs.append(img)

    # Small OOF dicts (only for the folds that are ready)
    oof_ranked_small = {}
    for f in small_folds:
        gal_embs = np.load(f'embeddings_small/f{f}_gal_embs.npy')
        val_embs = np.load(f'embeddings_small/f{f}_val_embs.npy')
        gal_df = pd.read_csv(f'embeddings_small/f{f}_gal_df.csv')
        val_df = pd.read_csv(f'embeddings_small/f{f}_val_df.csv')
        gal_labels = gal_df['Id'].tolist()
        ranked = per_class_max_similarity(val_embs, gal_embs, gal_labels, topK=300)
        for img, rnk in zip(val_df['Image'].tolist(), ranked):
            oof_ranked_small[img] = rnk

    inter_imgs = [img for img in order_imgs if img in oof_ranked_small]
    print(f"[Partial] OOF alignment images: {len(inter_imgs)}")
    assert len(inter_imgs) > 0, 'No overlapping OOF images between tiny and small folds'

    # Grid-search weight w and tau+delta on the images both models cover
    true_list = [oof_true[img] for img in inter_imgs]
    best_w, best_tau, best_delta, best_map = 0.5, 0.5, 0.0, -1.0
    for w in np.linspace(0.0, 1.0, 21):
        merged_ranked = [
            merge_rank_lists_weighted(oof_ranked_tiny[img], oof_ranked_small[img], w=float(w))
            for img in inter_imgs
        ]
        tau, m, dlt = tune_tau_delta(merged_ranked, true_list, tau_grid_coarse=None)
        if m > best_map:
            best_map, best_tau, best_w, best_delta = m, tau, float(w), dlt
    print(f"[Partial Ensemble] OOF weight w={best_w:.2f}, tau={best_tau:.3f}, delta={best_delta:.3f}, MAP@5={best_map:.4f}")

    train_df = pd.read_csv('train.csv')
    full_gal_df = train_df[train_df.Id != 'new_whale'].copy()
    labs_full = full_gal_df['Id'].tolist()
    te_df = pd.read_csv('sample_submission.csv')[['Image']].copy()
    te_df['Id'] = 'new_whale'

    # Tiny TEST ranks (use cached full gallery per fold, cached te_embs)
    te_ranked_tiny_folds = []
    for f in range(5):
        full_gal_path = f'embeddings/f{f}_gal_full_embs.npy'
        if os.path.exists(full_gal_path):
            gal_tiny = np.load(full_gal_path)
        else:
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            m_tiny = EmbeddingModel(backbone_name='convnext_tiny', embed_dim=512, pretrained=False).to(device)
            st = torch.load(f'checkpoints/fold{f}.pt', map_location=device)
            m_tiny.load_state_dict(st['model'], strict=True)
            m_tiny.eval()
            gal_tiny = extract_embeddings(m_tiny, full_gal_df, img_size=384, batch_size=64, img_dir=IM_DIR_TRAIN, tta_hflip=True)
            os.makedirs('embeddings', exist_ok=True)
            np.save(full_gal_path, gal_tiny)
            del m_tiny
            torch.cuda.empty_cache()
            gc.collect()
        te_embs_tiny = np.load(f'embeddings/f{f}_te_embs.npy')
        te_ranked_tiny = per_class_max_similarity(te_embs_tiny, gal_tiny, labs_full, topK=300)
        te_ranked_tiny_folds.append(te_ranked_tiny)

    # Small TEST ranks (cache full gallery embs per available fold; accept te_embs from either embeddings_small or train/ fallback)
    os.makedirs('embeddings_small', exist_ok=True)
    te_ranked_small_folds = []
    for f in small_folds:
        gal_small_full_path = f'embeddings_small/f{f}_gal_full_embs.npy'
        te_small_emb_path = f'embeddings_small/te_embs_small_f{f}.npy'
        te_fallback_path = f'train/te_embs_small_f{f}.npy'  # fallback location observed in FS
        need_gal = not os.path.exists(gal_small_full_path)
        need_te = not (os.path.exists(te_small_emb_path) or os.path.exists(te_fallback_path))

        # Only pay for building/loading the model when something must be extracted.
        m_small = None
        if need_gal or need_te:
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            m_small = EmbeddingModel(backbone_name='convnext_small', embed_dim=512, pretrained=False, drop_path_rate=0.1).to(device)
            st = torch.load(f'checkpoints_small/fold{f}.pt', map_location=device)
            m_small.load_state_dict(st['model'], strict=True)
            m_small.eval()

        if need_gal:
            gal_small = extract_embeddings(m_small, full_gal_df, img_size=512, batch_size=48, img_dir=IM_DIR_TRAIN, tta_hflip=True)
            np.save(gal_small_full_path, gal_small)
        else:
            gal_small = np.load(gal_small_full_path)

        if os.path.exists(te_small_emb_path):
            te_embs = np.load(te_small_emb_path)
        elif os.path.exists(te_fallback_path):
            te_embs = np.load(te_fallback_path)
        else:
            te_embs = extract_embeddings(m_small, te_df, img_size=512, batch_size=48, img_dir=IM_DIR_TEST, tta_hflip=True)
            np.save(te_small_emb_path, te_embs)

        te_ranked_small = per_class_max_similarity(te_embs, gal_small, labs_full, topK=300)
        te_ranked_small_folds.append(te_ranked_small)
        if m_small is not None:
            del m_small
            torch.cuda.empty_cache()
            gc.collect()

    te_tiny_comb = combine_fold_scores(te_ranked_tiny_folds)
    if len(te_ranked_small_folds) > 0:
        te_small_comb = combine_fold_scores(te_ranked_small_folds)
    else:
        te_small_comb = [list() for _ in range(len(te_tiny_comb))]

    # Blend per test image and apply the tuned new_whale rule
    preds5 = []
    for i in range(len(te_tiny_comb)):
        merged = merge_rank_lists_weighted(te_tiny_comb[i], te_small_comb[i], w=best_w)
        if len(merged) == 0:
            preds5.append('new_whale new_whale new_whale new_whale new_whale')
            continue
        top1 = merged[0][1]
        top2 = merged[1][1] if len(merged) > 1 else -1.0
        cond_new = (top1 < best_tau) or ((top1 - top2) < best_delta)
        classes = [c for c, _ in merged[:5]]
        cand = (['new_whale'] + classes[:4]) if cond_new else classes
        uniq = []
        for c in cand:
            if c not in uniq:
                uniq.append(c)
            if len(uniq) == 5:
                break
        while len(uniq) < 5:
            uniq.append('new_whale')
        preds5.append(' '.join(uniq))

    sub = pd.read_csv('sample_submission.csv')
    sub['Id'] = preds5
    sub.to_csv('submission.csv', index=False)
    print(f"[Partial] Saved submission.csv using Tiny(5f)+Small({len(small_folds)}f). Elapsed {time.time()-t0:.1f}s")

# Run the Tiny + partial-Small ensemble now: tunes w/tau/delta on OOF, then writes submission.csv
build_partial_ensemble_and_submit()

[Partial] Small folds available: [0, 1]


[Partial] OOF alignment images: 1662


[Partial Ensemble] OOF weight w=0.00, tau=0.440, delta=0.050, MAP@5=0.4441


  st = torch.load(f'checkpoints_small/fold{f}.pt', map_location=device)


[Partial] Saved submission.csv using Tiny(5f)+Small(2f). Elapsed 3386.9s


In [8]:
# Tiny-only improved submission with refined tau and margin rule (no model loading, use cached embeddings)
import os, time, gc
import numpy as np
import pandas as pd
import faiss

def map5_score(y_true_ids, y_pred_ranked_ids):
    """Return MAP@5: the mean over queries of 1/rank of the true id within the first five predictions (0 if absent)."""
    def reciprocal_rank(truth, ranked):
        for pos, pred in enumerate(ranked[:5], start=1):
            if pred == truth:
                return 1.0 / pos
        return 0.0

    per_query = [reciprocal_rank(t, preds) for t, preds in zip(y_true_ids, y_pred_ranked_ids)]
    return float(np.mean(per_query))

def per_class_max_similarity(query_embs, gallery_embs, gallery_labels, topK=300):
    """Rank gallery classes for each query by their best inner-product match.

    Searches a flat inner-product FAISS index for the ``min(topK, n_gallery)``
    nearest gallery vectors of every query, keeps the highest similarity seen
    per class label, and returns one list per query of ``(label, score)``
    pairs sorted by descending score.
    """
    n_gallery, dim = gallery_embs.shape[0], gallery_embs.shape[1]
    ip_index = faiss.IndexFlatIP(dim)
    ip_index.add(gallery_embs.astype('float32'))
    n_neighbors = min(topK, n_gallery)
    scores, neighbors = ip_index.search(query_embs.astype('float32'), n_neighbors)

    ranked_per_query = []
    for q in range(query_embs.shape[0]):
        class_best = {}
        for col in range(n_neighbors):
            label = gallery_labels[int(neighbors[q, col])]
            score = float(scores[q, col])
            previous = class_best.get(label)
            if previous is None or score > previous:
                class_best[label] = score
        ordered = list(class_best.items())
        ordered.sort(key=lambda item: item[1], reverse=True)
        ranked_per_query.append(ordered)
    return ranked_per_query

def tune_tau_delta(rank_lists, true_ids, tau_grid_coarse=None, tau_window=0.05, tau_step_fine=0.005, delta_grid=None):
    """Grid-search the new_whale thresholds (tau, delta) that maximise MAP@5.

    A query is predicted as ``new_whale`` first (then its top-4 classes) when
    its best score is below ``tau`` or its top1-top2 margin is below
    ``delta``; otherwise its top-5 classes are predicted as ranked. The
    coarse grid covers every (delta, tau) pair; tau is then refined around
    the coarse optimum at the winning delta.

    Each query has only two possible outcomes, so both reciprocal ranks (the
    per-query quantity that ``map5_score`` averages) are computed once and
    every grid point becomes a vectorised selection.

    Args:
        rank_lists: per query, ``(class_id, score)`` pairs sorted descending.
        true_ids: true label per query, aligned with ``rank_lists``.
        tau_grid_coarse: coarse tau candidates (default 0.20..0.80, step 0.02).
        tau_window: half-width of the fine tau sweep.
        tau_step_fine: step of the fine tau sweep.
        delta_grid: margin candidates (default: fixed list in [0, 0.15]).

    Returns:
        ``(map5, tau, delta)``; ``(-1.0, 0.5, 0.0)`` when there are no queries.
    """
    if tau_grid_coarse is None:
        tau_grid_coarse = np.arange(0.2, 0.801, 0.02)
    if delta_grid is None:
        delta_grid = [0.0, 0.03, 0.05, 0.08, 0.10, 0.12, 0.15]
    best = (-1.0, 0.5, 0.0)

    def top5(cand):
        # De-duplicate while keeping order, then pad with new_whale to length 5.
        uniq = []
        for c in cand:
            if c not in uniq:
                uniq.append(c)
            if len(uniq) == 5:
                break
        while len(uniq) < 5:
            uniq.append('new_whale')
        return uniq

    def recip_rank(true_id, preds):
        for i, p in enumerate(preds):
            if p == true_id:
                return 1.0 / (i + 1)
        return 0.0

    # Grid-invariant per-query quantities.
    top1, top2, rr_new, rr_known = [], [], [], []
    for ranked, true_id in zip(rank_lists, true_ids):
        if len(ranked) == 0:
            # No candidates: all-new_whale prediction regardless of thresholds.
            rr = recip_rank(true_id, ['new_whale'] * 5)
            top1.append(0.0)
            top2.append(0.0)
            rr_new.append(rr)
            rr_known.append(rr)
            continue
        classes = [c for c, _ in ranked[:5]]
        top1.append(ranked[0][1])
        top2.append(ranked[1][1] if len(ranked) > 1 else -1.0)
        rr_new.append(recip_rank(true_id, top5(['new_whale'] + classes[:4])))
        rr_known.append(recip_rank(true_id, top5(classes)))

    if not top1:
        return best

    top1 = np.asarray(top1, dtype=np.float64)
    margin = top1 - np.asarray(top2, dtype=np.float64)
    rr_new = np.asarray(rr_new, dtype=np.float64)
    rr_known = np.asarray(rr_known, dtype=np.float64)

    def score(tau, delta):
        is_new = (top1 < tau) | (margin < delta)
        return float(np.mean(np.where(is_new, rr_new, rr_known)))

    # coarse
    for dlt in delta_grid:
        for tau in tau_grid_coarse:
            m = score(tau, dlt)
            if m > best[0]:
                best = (m, float(tau), float(dlt))
    # fine around best tau
    _, tau_c, dlt_c = best
    tau_fine = np.arange(max(0.0, tau_c - tau_window), min(1.0, tau_c + tau_window) + 1e-6, tau_step_fine)
    for tau in tau_fine:
        m = score(tau, dlt_c)
        if m > best[0]:
            best = (m, float(tau), float(dlt_c))
    return best  # (map5, tau, delta)

def build_tiny_only_submission(topK=300):
    """Build submission.csv from cached ConvNeXt-Tiny embeddings only.

    Tunes tau/delta on OOF rankings (each fold's validation set against its
    own train-minus-val gallery), then ranks TEST embeddings against the
    cached full-train gallery of every fold, averages per-class scores
    across folds and applies the tuned new_whale rule.

    Args:
        topK: number of nearest gallery vectors retrieved per query.

    Raises:
        AssertionError: if a fold's cached full-gallery embedding file is missing.
    """
    t0 = time.time()

    # OOF rankings from the per-fold caches
    oof_ranked, oof_true = [], []
    for f in range(5):
        gallery = np.load(f'embeddings/f{f}_gal_embs.npy')
        queries = np.load(f'embeddings/f{f}_val_embs.npy')
        gallery_meta = pd.read_csv(f'embeddings/f{f}_gal_df.csv')
        query_meta = pd.read_csv(f'embeddings/f{f}_val_df.csv')
        fold_labels = gallery_meta['Id'].tolist()
        oof_ranked += per_class_max_similarity(queries, gallery, fold_labels, topK=topK)
        oof_true += query_meta['Id'].tolist()
    best_map, best_tau, best_delta = tune_tau_delta(oof_ranked, oof_true)
    print(f"[Tiny-only] OOF tuned: tau={best_tau:.3f}, delta={best_delta:.3f}, MAP@5={best_map:.4f}")

    # TEST rankings against the full labelled train gallery, fold by fold
    train_df = pd.read_csv('train.csv')
    labs_full = train_df[train_df.Id != 'new_whale'].copy()['Id'].tolist()
    per_fold = []
    for f in range(5):
        gal_full_path = f'embeddings/f{f}_gal_full_embs.npy'
        assert os.path.exists(gal_full_path), f"Missing {gal_full_path}; run cell 11/12 first"
        full_gallery = np.load(gal_full_path)
        test_queries = np.load(f'embeddings/f{f}_te_embs.npy')
        per_fold.append(per_class_max_similarity(test_queries, full_gallery, labs_full, topK=topK))

    # Mean score per class over the folds in which the class was retrieved
    te_comb = []
    for i in range(len(per_fold[0])):
        by_class = {}
        for fold_ranked in per_fold:
            for cls, score in fold_ranked[i]:
                if cls in by_class:
                    by_class[cls].append(score)
                else:
                    by_class[cls] = [score]
        averaged = [(cls, float(np.mean(vals))) for cls, vals in by_class.items()]
        te_comb.append(sorted(averaged, key=lambda item: item[1], reverse=True))

    # new_whale goes first when the match is weak (tau) or ambiguous (delta)
    preds5 = []
    for ranked in te_comb:
        if len(ranked) == 0:
            preds5.append('new_whale new_whale new_whale new_whale new_whale')
            continue
        names = [cls for cls, _ in ranked]
        lead = ranked[0][1]
        runner_up = ranked[1][1] if len(ranked) > 1 else -1.0
        is_new = (lead < best_tau) or ((lead - runner_up) < best_delta)
        cand = (['new_whale'] + names[:4]) if is_new else names[:5]
        picked = []
        for cls in cand:
            if cls not in picked:
                picked.append(cls)
            if len(picked) == 5:
                break
        picked += ['new_whale'] * (5 - len(picked))
        preds5.append(' '.join(picked))

    sub = pd.read_csv('sample_submission.csv')
    sub['Id'] = preds5
    sub.to_csv('submission.csv', index=False)
    print(f"[Tiny-only] Saved submission.csv (topK={topK}). Elapsed {time.time()-t0:.1f}s")

# Run the tiny-only submission build (OOF-tuned tau/delta, 300 neighbours per query); overwrites submission.csv
build_tiny_only_submission(topK=300)

[Tiny-only] OOF tuned: tau=0.410, delta=0.080, MAP@5=0.4249


[Tiny-only] Saved submission.csv (topK=300). Elapsed 22.9s


In [9]:
# Tiny-only with Query Expansion (QE) and margin-based new_whale; uses cached tiny embeddings only
import os, time, gc
import numpy as np
import pandas as pd
import faiss

def map5_score(y_true_ids, y_pred_ranked_ids):
    """MAP@5 with one true label per query: average of 1/rank over the top five predictions, 0 on a miss."""
    per_query = []
    for truth, ranked in zip(y_true_ids, y_pred_ranked_ids):
        hit = next((1.0 / (k + 1) for k, pred in enumerate(ranked[:5]) if pred == truth), 0.0)
        per_query.append(hit)
    return float(np.mean(per_query))

def faiss_search(query_embs, gallery_embs, topK):
    """Build a flat inner-product FAISS index over the gallery and query it.

    Returns ``(sims, idxs, index)`` for the ``min(topK, n_gallery)`` nearest
    gallery vectors of each query; the index is returned so callers can run
    further searches without rebuilding it.
    """
    index = faiss.IndexFlatIP(gallery_embs.shape[1])
    index.add(gallery_embs.astype('float32'))
    n_neighbors = min(topK, gallery_embs.shape[0])
    sims, idxs = index.search(query_embs.astype('float32'), n_neighbors)
    return sims, idxs, index

def query_expansion(query_embs, gallery_embs, idxs, m=5, alpha=0.3):
    """Pull each query towards the mean of its top-m retrieved gallery vectors.

    Computes ``normalize(q + alpha * mean(top-m gallery vecs))`` for every
    row of ``query_embs`` in one vectorised pass.

    Args:
        query_embs: 2-D array, one query embedding per row.
        gallery_embs: 2-D array of gallery embeddings.
        idxs: 2-D integer array; row i holds gallery indices of query i's
            neighbours, best first (as returned by a FAISS search).
        m: number of leading neighbours to average (capped at ``idxs.shape[1]``).
        alpha: weight of the neighbour mean added to the query.

    Returns:
        float32 array with the same shape as ``query_embs``, rows
        L2-normalised. Zero queries give an empty ``(0, dim)`` array instead
        of an error.

    Note:
        Allocates a temporary of shape ``(n_queries, m, dim)``.
    """
    n_queries = query_embs.shape[0]
    top_idx = idxs[:n_queries, :min(m, idxs.shape[1])]
    # Fancy indexing gathers (n_queries, m, dim); average over the neighbour axis.
    mean_vecs = gallery_embs[top_idx].mean(axis=1)
    expanded = query_embs + alpha * mean_vecs
    norms = np.linalg.norm(expanded, axis=1, keepdims=True) + 1e-9
    return (expanded / norms).astype('float32')

def per_class_rank_from_search(idxs, sims, gallery_labels):
    """Collapse neighbour-level search results into per-class rankings.

    For every query row, keeps the highest similarity seen for each gallery
    label and returns ``(label, score)`` pairs sorted by descending score.
    """
    n_queries, n_cols = idxs.shape[0], idxs.shape[1]
    ranked_per_query = []
    for q in range(n_queries):
        class_best = {}
        for col in range(n_cols):
            label = gallery_labels[int(idxs[q, col])]
            score = float(sims[q, col])
            previous = class_best.get(label)
            if previous is None or score > previous:
                class_best[label] = score
        ordered = list(class_best.items())
        ordered.sort(key=lambda item: item[1], reverse=True)
        ranked_per_query.append(ordered)
    return ranked_per_query

def tune_tau_delta(rank_lists, true_ids, tau_grid_coarse=None, tau_window=0.05, tau_step_fine=0.005, delta_grid=None):
    """Grid-search the new_whale thresholds (tau, delta) that maximise MAP@5.

    A query is predicted as ``new_whale`` first (then its top-4 classes) when
    its best score is below ``tau`` or its top1-top2 margin is below
    ``delta``; otherwise its top-5 classes are predicted as ranked. The
    coarse grid covers every (delta, tau) pair; tau is then refined around
    the coarse optimum at the winning delta.

    Each query has only two possible outcomes, so both reciprocal ranks (the
    per-query quantity that ``map5_score`` averages) are computed once and
    every grid point becomes a vectorised selection.

    Args:
        rank_lists: per query, ``(class_id, score)`` pairs sorted descending.
        true_ids: true label per query, aligned with ``rank_lists``.
        tau_grid_coarse: coarse tau candidates (default 0.20..0.80, step 0.02).
        tau_window: half-width of the fine tau sweep.
        tau_step_fine: step of the fine tau sweep.
        delta_grid: margin candidates (default: fixed list in [0, 0.15]).

    Returns:
        ``(map5, tau, delta)``; ``(-1.0, 0.5, 0.0)`` when there are no queries.
    """
    if tau_grid_coarse is None:
        tau_grid_coarse = np.arange(0.2, 0.801, 0.02)
    if delta_grid is None:
        delta_grid = [0.0, 0.03, 0.05, 0.08, 0.10, 0.12, 0.15]
    best = (-1.0, 0.5, 0.0)

    def top5(cand):
        # De-duplicate while keeping order, then pad with new_whale to length 5.
        uniq = []
        for c in cand:
            if c not in uniq:
                uniq.append(c)
            if len(uniq) == 5:
                break
        while len(uniq) < 5:
            uniq.append('new_whale')
        return uniq

    def recip_rank(true_id, preds):
        for i, p in enumerate(preds):
            if p == true_id:
                return 1.0 / (i + 1)
        return 0.0

    # Grid-invariant per-query quantities.
    top1, top2, rr_new, rr_known = [], [], [], []
    for ranked, true_id in zip(rank_lists, true_ids):
        if len(ranked) == 0:
            # No candidates: all-new_whale prediction regardless of thresholds.
            rr = recip_rank(true_id, ['new_whale'] * 5)
            top1.append(0.0)
            top2.append(0.0)
            rr_new.append(rr)
            rr_known.append(rr)
            continue
        classes = [c for c, _ in ranked[:5]]
        top1.append(ranked[0][1])
        top2.append(ranked[1][1] if len(ranked) > 1 else -1.0)
        rr_new.append(recip_rank(true_id, top5(['new_whale'] + classes[:4])))
        rr_known.append(recip_rank(true_id, top5(classes)))

    if not top1:
        return best

    top1 = np.asarray(top1, dtype=np.float64)
    margin = top1 - np.asarray(top2, dtype=np.float64)
    rr_new = np.asarray(rr_new, dtype=np.float64)
    rr_known = np.asarray(rr_known, dtype=np.float64)

    def score(tau, delta):
        is_new = (top1 < tau) | (margin < delta)
        return float(np.mean(np.where(is_new, rr_new, rr_known)))

    # Coarse sweep; strict '>' keeps the first optimum encountered.
    for dlt in delta_grid:
        for tau in tau_grid_coarse:
            m = score(tau, dlt)
            if m > best[0]:
                best = (m, float(tau), float(dlt))
    # Fine tau sweep at the winning delta.
    _, tau_c, dlt_c = best
    tau_fine = np.arange(max(0.0, tau_c - tau_window), min(1.0, tau_c + tau_window) + 1e-6, tau_step_fine)
    for tau in tau_fine:
        m = score(tau, dlt_c)
        if m > best[0]:
            best = (m, float(tau), float(dlt_c))
    return best  # (map5, tau, delta)

def tiny_qe_submission(topK_search=300, m_qe=5, alpha=0.3):
    """Build submission.csv from cached Tiny embeddings with query expansion (QE).

    Every query is searched once, expanded with the mean of its top ``m_qe``
    neighbours (weight ``alpha``) and searched again on the same index. The
    second-pass rankings are used to tune tau/delta on OOF and, on TEST
    (full-train gallery per fold, fold scores averaged), to produce the
    final top-5 predictions.

    Raises:
        AssertionError: if a fold's cached full-gallery embedding file is missing.
    """
    t0 = time.time()

    # OOF with QE on tiny folds (train-excl-val gallery)
    oof_ranked, oof_true = [], []
    for f in range(5):
        gal_embs = np.load(f'embeddings/f{f}_gal_embs.npy')
        val_embs = np.load(f'embeddings/f{f}_val_embs.npy')
        gal_meta = pd.read_csv(f'embeddings/f{f}_gal_df.csv')
        val_meta = pd.read_csv(f'embeddings/f{f}_val_df.csv')
        fold_labels = gal_meta['Id'].tolist()
        _, first_pass, index = faiss_search(val_embs, gal_embs, topK_search)
        expanded = query_expansion(val_embs, gal_embs, first_pass, m=m_qe, alpha=alpha)
        n_neighbors = min(topK_search, gal_embs.shape[0])
        sims_qe, idxs_qe = index.search(expanded.astype('float32'), n_neighbors)
        oof_ranked += per_class_rank_from_search(idxs_qe, sims_qe, fold_labels)
        oof_true += val_meta['Id'].tolist()
    best_map, best_tau, best_delta = tune_tau_delta(oof_ranked, oof_true)
    print(f"[Tiny-QE] OOF tuned: tau={best_tau:.3f}, delta={best_delta:.3f}, MAP@5={best_map:.4f}")

    # TEST with QE against FULL-train gallery per fold
    train_df = pd.read_csv('train.csv')
    labs_full = train_df[train_df.Id != 'new_whale'].copy()['Id'].tolist()
    per_fold = []
    for f in range(5):
        gal_full_path = f'embeddings/f{f}_gal_full_embs.npy'
        assert os.path.exists(gal_full_path), f"Missing {gal_full_path}; run earlier cells first"
        gal_full = np.load(gal_full_path)
        te_embs = np.load(f'embeddings/f{f}_te_embs.npy')
        _, first_pass, index = faiss_search(te_embs, gal_full, topK_search)
        expanded = query_expansion(te_embs, gal_full, first_pass, m=m_qe, alpha=alpha)
        n_neighbors = min(topK_search, gal_full.shape[0])
        sims_qe, idxs_qe = index.search(expanded.astype('float32'), n_neighbors)
        per_fold.append(per_class_rank_from_search(idxs_qe, sims_qe, labs_full))

    # Mean score per class over the folds in which the class was retrieved
    te_comb = []
    for i in range(len(per_fold[0])):
        by_class = {}
        for fold_ranked in per_fold:
            for cls, score in fold_ranked[i]:
                if cls in by_class:
                    by_class[cls].append(score)
                else:
                    by_class[cls] = [score]
        averaged = [(cls, float(np.mean(vals))) for cls, vals in by_class.items()]
        te_comb.append(sorted(averaged, key=lambda item: item[1], reverse=True))

    # Apply decision rule with best tau/delta
    preds5 = []
    for ranked in te_comb:
        if len(ranked) == 0:
            preds5.append('new_whale new_whale new_whale new_whale new_whale')
            continue
        names = [cls for cls, _ in ranked]
        lead = ranked[0][1]
        runner_up = ranked[1][1] if len(ranked) > 1 else -1.0
        is_new = (lead < best_tau) or ((lead - runner_up) < best_delta)
        cand = (['new_whale'] + names[:4]) if is_new else names[:5]
        picked = []
        for cls in cand:
            if cls not in picked:
                picked.append(cls)
            if len(picked) == 5:
                break
        picked += ['new_whale'] * (5 - len(picked))
        preds5.append(' '.join(picked))

    sub = pd.read_csv('sample_submission.csv')
    sub['Id'] = preds5
    sub.to_csv('submission.csv', index=False)
    print(f"[Tiny-QE] Saved submission.csv (topK={topK_search}, m={m_qe}, alpha={alpha}). Elapsed {time.time()-t0:.1f}s")

# Run Tiny-only QE with light params (average the top-5 neighbours, weight 0.3); overwrites submission.csv
tiny_qe_submission(topK_search=300, m_qe=5, alpha=0.3)

[Tiny-QE] OOF tuned: tau=0.495, delta=0.080, MAP@5=0.4255


[Tiny-QE] Saved submission.csv (topK=300, m=5, alpha=0.3). Elapsed 25.7s


In [11]:
# Tiny-only ensemble: baseline + QE + class-prior weighting; OOF-tune blend u, prior alpha, tau, delta; cached embeddings only
import os, time, gc
import numpy as np
import pandas as pd
import faiss

def faiss_search(query_embs, gallery_embs, topK):
    """Index the gallery with a flat inner-product FAISS index and search it.

    Returns ``(sims, idxs, index)``: similarities and gallery indices of the
    ``min(topK, n_gallery)`` best matches per query, plus the index itself
    for reuse in later searches.
    """
    dim = gallery_embs.shape[1]
    gallery_f32 = gallery_embs.astype('float32')
    index = faiss.IndexFlatIP(dim)
    index.add(gallery_f32)
    depth = min(topK, gallery_embs.shape[0])
    sims, idxs = index.search(query_embs.astype('float32'), depth)
    return sims, idxs, index

def per_class_rank_from_search(idxs, sims, gallery_labels):
    """Turn neighbour indices/similarities into per-class rankings.

    Each query keeps, for every gallery label among its neighbours, the
    maximum similarity; the result per query is a list of ``(label, score)``
    sorted from best to worst.
    """
    def rank_one(q):
        top_by_class = {}
        for col in range(idxs.shape[1]):
            label = gallery_labels[int(idxs[q, col])]
            sim = float(sims[q, col])
            if label not in top_by_class:
                top_by_class[label] = sim
            elif sim > top_by_class[label]:
                top_by_class[label] = sim
        return sorted(top_by_class.items(), key=lambda pair: pair[1], reverse=True)

    return [rank_one(q) for q in range(idxs.shape[0])]

def query_expansion(query_embs, gallery_embs, idxs, m=5, alpha=0.3):
    """Return ``normalize(q + alpha * mean(top-m gallery vecs))`` for every query.

    Vectorised over all queries instead of looping row by row.

    Args:
        query_embs: 2-D array, one query embedding per row.
        gallery_embs: 2-D array of gallery embeddings.
        idxs: 2-D integer array; row i lists gallery indices of query i's
            neighbours, best first (as returned by a FAISS search).
        m: number of leading neighbours to average (capped at ``idxs.shape[1]``).
        alpha: weight of the neighbour mean added to the query.

    Returns:
        float32 array shaped like ``query_embs`` with L2-normalised rows.
        Zero queries give an empty ``(0, dim)`` array instead of an error.

    Note:
        Allocates a temporary of shape ``(n_queries, m, dim)``.
    """
    n_queries = query_embs.shape[0]
    top_idx = idxs[:n_queries, :min(m, idxs.shape[1])]
    # Gather (n_queries, m, dim) neighbour vectors and average over the neighbour axis.
    mean_vecs = gallery_embs[top_idx].mean(axis=1)
    expanded = query_embs + alpha * mean_vecs
    norms = np.linalg.norm(expanded, axis=1, keepdims=True) + 1e-9
    return (expanded / norms).astype('float32')

def map5_score(y_true_ids, y_pred_ranked_ids):
    """Compute MAP@5 for single-label queries (reciprocal rank of the true id in the top five, averaged)."""
    reciprocal_ranks = []
    for truth, ranked in zip(y_true_ids, y_pred_ranked_ids):
        for pos, pred in enumerate(ranked[:5], start=1):
            if pred == truth:
                reciprocal_ranks.append(1.0 / pos)
                break
        else:
            # True id not among the first five predictions.
            reciprocal_ranks.append(0.0)
    return float(np.mean(reciprocal_ranks))

def tune_tau_delta(rank_lists, true_ids, tau_grid_coarse=None, tau_window=0.05, tau_step_fine=0.005, delta_grid=None):
    """Grid-search the new_whale thresholds (tau, delta) that maximise MAP@5.

    A query is predicted as ``new_whale`` first (then its top-4 classes) when
    its best score is below ``tau`` or its top1-top2 margin is below
    ``delta``; otherwise its top-5 classes are predicted as ranked. The
    coarse grid covers every (delta, tau) pair; tau is then refined around
    the coarse optimum at the winning delta.

    This function is called once per outer grid point by the ensemble tuner,
    so the grid-invariant work is hoisted: each query's two possible
    reciprocal ranks (the per-query quantity that ``map5_score`` averages)
    are computed once and every (tau, delta) is scored with a vectorised
    selection.

    Args:
        rank_lists: per query, ``(class_id, score)`` pairs sorted descending.
        true_ids: true label per query, aligned with ``rank_lists``.
        tau_grid_coarse: coarse tau candidates (default 0.20..0.80, step 0.02).
        tau_window: half-width of the fine tau sweep.
        tau_step_fine: step of the fine tau sweep.
        delta_grid: margin candidates (default: fixed list in [0, 0.15]).

    Returns:
        ``(map5, tau, delta)``; ``(-1.0, 0.5, 0.0)`` when there are no queries.
    """
    if tau_grid_coarse is None:
        tau_grid_coarse = np.arange(0.2, 0.801, 0.02)
    if delta_grid is None:
        delta_grid = [0.0, 0.03, 0.05, 0.08, 0.10, 0.12, 0.15]
    best = (-1.0, 0.5, 0.0)

    def top5(cand):
        # De-duplicate while keeping order, then pad with new_whale to length 5.
        uniq = []
        for c in cand:
            if c not in uniq:
                uniq.append(c)
            if len(uniq) == 5:
                break
        while len(uniq) < 5:
            uniq.append('new_whale')
        return uniq

    def recip_rank(true_id, preds):
        for i, p in enumerate(preds):
            if p == true_id:
                return 1.0 / (i + 1)
        return 0.0

    # Grid-invariant per-query quantities.
    top1, top2, rr_new, rr_known = [], [], [], []
    for ranked, true_id in zip(rank_lists, true_ids):
        if len(ranked) == 0:
            # No candidates: all-new_whale prediction regardless of thresholds.
            rr = recip_rank(true_id, ['new_whale'] * 5)
            top1.append(0.0)
            top2.append(0.0)
            rr_new.append(rr)
            rr_known.append(rr)
            continue
        classes = [c for c, _ in ranked[:5]]
        top1.append(ranked[0][1])
        top2.append(ranked[1][1] if len(ranked) > 1 else -1.0)
        rr_new.append(recip_rank(true_id, top5(['new_whale'] + classes[:4])))
        rr_known.append(recip_rank(true_id, top5(classes)))

    if not top1:
        return best

    top1 = np.asarray(top1, dtype=np.float64)
    margin = top1 - np.asarray(top2, dtype=np.float64)
    rr_new = np.asarray(rr_new, dtype=np.float64)
    rr_known = np.asarray(rr_known, dtype=np.float64)

    def score(tau, delta):
        is_new = (top1 < tau) | (margin < delta)
        return float(np.mean(np.where(is_new, rr_new, rr_known)))

    # Coarse sweep; strict '>' keeps the first optimum encountered.
    for dlt in delta_grid:
        for tau in tau_grid_coarse:
            m = score(tau, dlt)
            if m > best[0]:
                best = (m, float(tau), float(dlt))
    # Fine tau sweep at the winning delta.
    _, tau_c, dlt_c = best
    tau_fine = np.arange(max(0.0, tau_c - tau_window), min(1.0, tau_c + tau_window) + 1e-6, tau_step_fine)
    for tau in tau_fine:
        m = score(tau, dlt_c)
        if m > best[0]:
            best = (m, float(tau), float(dlt_c))
    return best  # (map5, tau, delta)

def apply_class_prior(ranked_list, freq_map, alpha=0.0):
    """Re-weight class scores by a frequency prior and re-sort.

    Each score is multiplied by ``freq ** alpha`` where ``freq`` comes from
    ``freq_map`` (1.0 for unknown classes). With ``alpha <= 0`` the input
    list object is returned untouched.
    """
    if alpha <= 0.0:
        return ranked_list
    reweighted = [
        (cls, float(score * (freq_map.get(cls, 1.0) ** alpha)))
        for cls, score in ranked_list
    ]
    return sorted(reweighted, key=lambda item: item[1], reverse=True)

def merge_two_rank_lists(r1, r2, u=0.5):
    """Blend two per-class score lists: ``(1 - u) * r1_score + u * r2_score``.

    A class present in only one list gets just that list's weighted score.
    Returns ``(class, score)`` pairs sorted by descending blended score.
    """
    contributions = {}
    for weight, ranked in ((1.0 - u, r1), (u, r2)):
        for cls, score in ranked:
            contributions.setdefault(cls, []).append(weight * score)
    blended = [(cls, float(np.sum(parts))) for cls, parts in contributions.items()]
    return sorted(blended, key=lambda item: item[1], reverse=True)

def tiny_ensemble_qe_prior(topK=300, m_qe=5, alpha_qe=0.3, u_grid=(0.0, 0.2, 0.5, 0.8, 1.0), alpha_prior_grid=(0.0, 0.1, 0.2, 0.3)):
    """Tiny-only ensemble of baseline and query-expanded retrieval with a class-frequency prior.

    On OOF (each fold's validation set vs. its train-minus-val gallery) the
    blend weight ``u`` between baseline and QE rankings, the prior exponent
    and tau/delta are tuned jointly. The best setting is then applied to
    TEST rankings computed against the cached full-train gallery of each
    fold, fold scores are averaged, and submission.csv is written.

    Args:
        topK: neighbours retrieved per query.
        m_qe: number of top neighbours averaged for query expansion.
        alpha_qe: weight of the neighbour mean in query expansion.
        u_grid: candidate blend weights (0 = baseline only, 1 = QE only).
        alpha_prior_grid: candidate exponents for the class-frequency prior.
    """
    t0 = time.time()
    # Build class frequency prior from full train (exclude new_whale)
    tr = pd.read_csv('train.csv')
    freq = tr[tr.Id != 'new_whale']['Id'].value_counts().to_dict()

    # OOF: baseline + QE using train-excl-val gallery
    oof_base = []
    oof_qe = []
    oof_true = []
    for f in range(5):
        gal = np.load(f'embeddings/f{f}_gal_embs.npy')
        val = np.load(f'embeddings/f{f}_val_embs.npy')
        gal_df = pd.read_csv(f'embeddings/f{f}_gal_df.csv')
        val_df = pd.read_csv(f'embeddings/f{f}_val_df.csv')
        labs = gal_df['Id'].tolist()
        sims1, idxs1, index = faiss_search(val, gal, topK)
        base_rank = per_class_rank_from_search(idxs1, sims1, labs)
        val_qe = query_expansion(val, gal, idxs1, m=m_qe, alpha=alpha_qe)
        sims2, idxs2 = index.search(val_qe.astype('float32'), min(topK, gal.shape[0]))
        qe_rank = per_class_rank_from_search(idxs2, sims2, labs)
        oof_base.extend(base_rank)
        oof_qe.extend(qe_rank)
        oof_true.extend(val_df['Id'].tolist())

    # Grid over u and alpha_prior.
    # The initial value has all five fields so the later 5-way unpack also works
    # when no grid point beats it (e.g. an empty grid).
    best = (-1.0, 0.5, 0.0, 0.0, 0.0)  # (map, tau, delta, u, alpha_prior)
    for u in u_grid:
        for ap in alpha_prior_grid:
            merged = [
                apply_class_prior(merge_two_rank_lists(b, q, u=u), freq, alpha=ap)
                for b, q in zip(oof_base, oof_qe)
            ]
            # tune_tau_delta returns (map, tau, delta)
            m, tau, dlt = tune_tau_delta(merged, oof_true)
            if m > best[0]:
                best = (m, tau, dlt, u, ap)
    _, tau_star, dlt_star, best_u, best_ap = best
    print(f"[Tiny-Ens] OOF best: u={best_u:.2f}, prior_alpha={best_ap:.2f}, tau={best[1]:.3f}, delta={best[2]:.3f}, MAP@5={best[0]:.4f}")

    # TEST on full-train gallery per fold
    full_gal_df = tr[tr.Id != 'new_whale'].copy()
    labs_full = full_gal_df['Id'].tolist()
    te_ranked_folds = []
    for f in range(5):
        gal_full = np.load(f'embeddings/f{f}_gal_full_embs.npy')
        te = np.load(f'embeddings/f{f}_te_embs.npy')
        sims1, idxs1, index = faiss_search(te, gal_full, topK)
        base_rank = per_class_rank_from_search(idxs1, sims1, labs_full)
        te_qe = query_expansion(te, gal_full, idxs1, m=m_qe, alpha=alpha_qe)
        sims2, idxs2 = index.search(te_qe.astype('float32'), min(topK, gal_full.shape[0]))
        qe_rank = per_class_rank_from_search(idxs2, sims2, labs_full)
        # merge + prior
        merged = [
            apply_class_prior(merge_two_rank_lists(b, q, u=best_u), freq, alpha=best_ap)
            for b, q in zip(base_rank, qe_rank)
        ]
        te_ranked_folds.append(merged)

    # Combine folds by mean (over the folds in which a class was retrieved)
    N = len(te_ranked_folds[0])
    te_comb = []
    for i in range(N):
        d = {}
        for fold_ranked in te_ranked_folds:
            for c, s in fold_ranked[i]:
                d.setdefault(c, []).append(s)
        arr = [(c, float(np.mean(v))) for c, v in d.items()]
        arr.sort(key=lambda x: x[1], reverse=True)
        te_comb.append(arr)

    # Apply decision rule
    preds5 = []
    for ranked in te_comb:
        if len(ranked) == 0:
            preds5.append('new_whale new_whale new_whale new_whale new_whale')
            continue
        top1 = ranked[0][1]
        top2 = ranked[1][1] if len(ranked) > 1 else -1.0
        cond_new = (top1 < tau_star) or ((top1 - top2) < dlt_star)
        classes = [c for c, _ in ranked[:5]]
        cand = (['new_whale'] + classes[:4]) if cond_new else classes
        uniq = []
        for c in cand:
            if c not in uniq:
                uniq.append(c)
            if len(uniq) == 5:
                break
        while len(uniq) < 5:
            uniq.append('new_whale')
        preds5.append(' '.join(uniq))

    sub = pd.read_csv('sample_submission.csv')
    sub['Id'] = preds5
    sub.to_csv('submission.csv', index=False)
    print(f"[Tiny-Ens] Saved submission.csv (topK={topK}). Elapsed {time.time()-t0:.1f}s")

# Run tiny-only ensemble with prior; the prior grid passed here stops at 0.2 (the function default also tries 0.3)
tiny_ensemble_qe_prior(topK=300, m_qe=5, alpha_qe=0.3, u_grid=(0.0,0.2,0.5,0.8,1.0), alpha_prior_grid=(0.0,0.1,0.2))

[Tiny-Ens] OOF best: u=0.20, prior_alpha=0.10, tau=0.515, delta=0.080, MAP@5=0.4357


[Tiny-Ens] Saved submission.csv (topK=300). Elapsed 381.1s


In [12]:
# OpenCLIP ViT-L/14@336 retrieval + ensemble with Tiny; OOF-tune weights and tau/delta; fast frozen model
import os, time, gc, math, sys, subprocess
import numpy as np
import pandas as pd
import torch
import torchvision.transforms as T
from PIL import Image
import faiss

def pip(*args):
    """Echo and run ``python -m pip <args>`` with the current interpreter; raises CalledProcessError on failure."""
    print('>', *args, flush=True)
    command = [sys.executable, '-m', 'pip']
    command.extend(args)
    subprocess.run(command, check=True)

# Ensure open_clip without touching torch stack:
# the install is run under the constraints file (-c constraints.txt) and only
# upgrades dependencies when needed. NOTE(review): constraints.txt is assumed
# to pin the torch packages — confirm it exists and does so.
pip('install', '-c', 'constraints.txt', 'open_clip_torch==2.26.1', '--upgrade-strategy', 'only-if-needed')
# Imported only after the install above so a fresh environment works.
import open_clip

# Module-level device used by the embedding extraction below (GPU if available, else CPU).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('[CLIP] Device:', device)

def l2norm_np(x):
    """L2-normalise each row of a 2-D array and return it as float32 (1e-9 is added to the norm to avoid division by zero)."""
    row_norms = np.linalg.norm(x, axis=1, keepdims=True)
    row_norms = row_norms + 1e-9
    return np.divide(x, row_norms).astype('float32')

def build_img_loader(df, preprocess, img_dir, batch_size=128, num_workers=8):
    """Create a non-shuffling DataLoader that yields ``(preprocessed_image, image_name)`` batches.

    Args:
        df: DataFrame with an ``Image`` column of file names.
        preprocess: callable applied to each RGB PIL image.
        img_dir: directory containing the image files.
        batch_size: images per batch.
        num_workers: loader worker processes; 0 loads in the main process.

    Returns:
        A ``torch.utils.data.DataLoader`` iterating ``df`` in row order.
    """
    class ImgDS(torch.utils.data.Dataset):
        def __init__(self, df, img_dir, preprocess):
            self.df = df.reset_index(drop=True)
            self.img_dir = img_dir
            self.pp = preprocess
            # Plain list of names: avoids two DataFrame row lookups per item.
            self.names = self.df['Image'].tolist()

        def __len__(self):
            return len(self.names)

        def __getitem__(self, i):
            name = self.names[i]
            with Image.open(os.path.join(self.img_dir, name)) as im:
                im = im.convert('RGB')
                return self.pp(im), name

    ds = ImgDS(df, img_dir, preprocess)
    dl = torch.utils.data.DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=True,
        # DataLoader rejects persistent_workers=True when num_workers == 0.
        persistent_workers=num_workers > 0,
    )
    return dl

def extract_openclip_embeddings(model, preprocess, df, img_dir, batch_size=128):
    """Encode every image listed in ``df`` with ``model.encode_image`` and return L2-normalised float32 embeddings.

    Runs in eval mode without gradients, under CUDA autocast when a GPU is
    available, on the module-level ``device``. Progress is printed every 20
    batches. Rows of the result follow the row order of ``df``.
    """
    dl = build_img_loader(df, preprocess, img_dir, batch_size=batch_size)
    chunks = []
    t0 = time.time()
    n = 0
    model.eval()
    amp_ctx = torch.amp.autocast('cuda', enabled=torch.cuda.is_available())
    with torch.no_grad():
        with amp_ctx:
            for step, (imgs, _names) in enumerate(dl, start=1):
                imgs = imgs.to(device, non_blocking=True)
                # Cast back to fp32 before leaving the GPU (autocast may yield half precision).
                feats = model.encode_image(imgs).float()
                chunks.append(feats.detach().cpu().numpy())
                n += imgs.size(0)
                if step % 20 == 0:
                    print(f'[CLIP] {n}/{len(df)} imgs, elapsed {time.time()-t0:.1f}s', flush=True)
    stacked = np.concatenate(chunks, axis=0)
    return l2norm_np(stacked)

def per_class_max_from_search(idxs, sims, labels):
    """Reduce neighbour-level search results to one best score per class.

    Returns, for every query row, ``(label, max_similarity)`` pairs ordered
    from highest to lowest similarity.
    """
    n_queries, n_cols = idxs.shape[0], idxs.shape[1]
    rankings = []
    for q in range(n_queries):
        top_by_class = {}
        for col in range(n_cols):
            label = labels[int(idxs[q, col])]
            sim = float(sims[q, col])
            current = top_by_class.get(label)
            if current is None or sim > current:
                top_by_class[label] = sim
        ordered = list(top_by_class.items())
        ordered.sort(key=lambda pair: pair[1], reverse=True)
        rankings.append(ordered)
    return rankings

def faiss_ip_search(Q, G, topK=300):
    """Exact inner-product search of queries ``Q`` against gallery ``G``.

    Both inputs are cast to float32. The neighbour count is ``topK`` capped at the
    number of gallery rows. Returns ``(sims, idxs)`` as produced by ``index.search``.
    """
    gallery = G.astype('float32')
    index = faiss.IndexFlatIP(G.shape[1])
    index.add(gallery)
    n_neighbours = min(topK, G.shape[0])
    sims, idxs = index.search(Q.astype('float32'), n_neighbours)
    return sims, idxs

def map5_score(y_true_ids, y_pred_ranked_ids):
    """Mean over queries of 1/rank of the true id within the first five predictions.

    A query whose true id is not among its first five predictions scores 0.
    """
    def reciprocal_rank(truth, preds):
        for rank, guess in enumerate(preds[:5], start=1):
            if guess == truth:
                return 1.0 / rank
        return 0.0

    per_query = [reciprocal_rank(t, p) for t, p in zip(y_true_ids, y_pred_ranked_ids)]
    return float(np.mean(per_query))

def tune_tau_delta(rank_lists, true_ids, tau_grid_coarse=None, tau_window=0.05, tau_step_fine=0.005, delta_grid=None):
    """Grid-search the ``new_whale`` decision thresholds that maximise MAP@5.

    A query is flagged as new when its top similarity is below ``tau`` or when the
    gap between the top two class similarities is below ``delta``. Flagged queries
    predict ``new_whale`` followed by the top four classes; others predict the top
    five classes. Lists are de-duplicated and padded with ``new_whale`` to length 5.

    Args:
        rank_lists: Per query, a list of ``(class, similarity)`` sorted descending.
        true_ids: True label per query, aligned with ``rank_lists``.
        tau_grid_coarse: Coarse tau candidates (default 0.2 to 0.8 in steps of 0.02).
        tau_window: Half-width of the fine tau search around the coarse optimum.
        tau_step_fine: Step of the fine tau search.
        delta_grid: Delta candidates.

    Returns:
        ``(best_map5, best_tau, best_delta)``.
    """
    if tau_grid_coarse is None: tau_grid_coarse = np.arange(0.2, 0.801, 0.02)
    if delta_grid is None: delta_grid = [0.0, 0.03, 0.05, 0.08, 0.10, 0.12, 0.15]
    best = (-1.0, 0.5, 0.0)

    def top5(cand):
        uniq = []
        for c in cand:
            if c not in uniq: uniq.append(c)
            if len(uniq) == 5: break
        while len(uniq) < 5: uniq.append('new_whale')
        return uniq

    # Everything that does not depend on (tau, delta) is computed once per query
    # instead of once per grid point: top-1 score, top-1/top-2 gap, and the two
    # possible prediction lists.
    all_new = ['new_whale'] * 5
    prepared = []
    for ranked in rank_lists:
        if len(ranked) == 0:
            prepared.append(None); continue
        top1 = ranked[0][1]; top2 = ranked[1][1] if len(ranked) > 1 else -1.0
        head = [c for c, _ in ranked[:5]]
        prepared.append((top1, top1 - top2, top5(['new_whale'] + head[:4]), top5(head)))

    def preds_for(tau, delta):
        out = []
        for item in prepared:
            if item is None:
                out.append(all_new); continue
            top1, gap, as_new, as_known = item
            out.append(as_new if (top1 < tau) or (gap < delta) else as_known)
        return out

    # Coarse pass over the full (delta, tau) grid.
    for dlt in delta_grid:
        for tau in tau_grid_coarse:
            m = map5_score(true_ids, preds_for(tau, dlt))
            if m > best[0]: best = (m, float(tau), float(dlt))
    # Fine pass over tau only, keeping the best coarse delta fixed.
    _, tau_c, dlt_c = best
    tau_fine = np.arange(max(0.0, tau_c-tau_window), min(1.0, tau_c+tau_window)+1e-6, tau_step_fine)
    for tau in tau_fine:
        m = map5_score(true_ids, preds_for(tau, dlt_c))
        if m > best[0]: best = (m, float(tau), float(dlt_c))
    return best  # (map, tau, delta)

def merge_rank_lists_weighted(r1, r2, w=0.5):
    """Blend two ``(class, score)`` rankings into one.

    Scores from ``r1`` are scaled by ``1 - w`` and scores from ``r2`` by ``w``; the
    scaled scores of each class are summed. A class present in only one ranking
    keeps just that ranking's scaled score. The result is sorted by score, descending.
    """
    contributions = {}
    for ranking, weight in ((r1, 1.0 - w), (r2, w)):
        for cls, score in ranking:
            contributions.setdefault(cls, []).append(weight * score)
    return sorted(
        ((cls, float(np.sum(parts))) for cls, parts in contributions.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )

def combine_folds_mean(rank_lists_per_fold):
    """Average per-class scores across folds, query by query.

    ``rank_lists_per_fold[f][i]`` is the ``(class, score)`` ranking of query ``i`` in
    fold ``f``. The number of queries is taken from the first fold. For every query,
    each class's score is the mean over the folds in which that class appears; the
    combined ranking is sorted by that mean, descending.
    """
    n_queries = len(rank_lists_per_fold[0])
    combined = []
    for qi in range(n_queries):
        scores_by_class = {}
        for fold_ranks in rank_lists_per_fold:
            for cls, score in fold_ranks[qi]:
                scores_by_class.setdefault(cls, []).append(score)
        averaged = sorted(
            ((cls, float(np.mean(vals))) for cls, vals in scores_by_class.items()),
            key=lambda pair: pair[1],
            reverse=True,
        )
        combined.append(averaged)
    return combined

def build_clip_and_ensemble_with_tiny(topK=300, batch_size=128):
    """Ensemble OpenCLIP ViT-L/14 retrieval with the cached "tiny" embeddings and write submission.csv.

    Steps:
      1. Load (or extract and cache under ``embeddings_clip/``) CLIP embeddings for
         the train and test images.
      2. Build out-of-fold CLIP rankings per fold, using the other folds' known
         whales (``Id != 'new_whale'``) as the gallery.
      3. Build out-of-fold tiny rankings from the cached ``embeddings/f{f}_*`` files.
      4. Grid-search the CLIP weight ``w`` and tune tau/delta on the merged OOF rankings.
      5. Merge the test rankings with the best weight, apply the tau/delta rule
         and save ``submission.csv``.

    Args:
        topK: Number of nearest gallery neighbours retrieved per query.
        batch_size: Batch size used when CLIP embeddings have to be extracted.

    Raises:
        ValueError: If a cached CLIP embedding file does not have one row per
            train/test image.
    """
    t_all = time.time()
    # DataFrames
    tr = pd.read_csv('train.csv')
    te = pd.read_csv('sample_submission.csv')[['Image']].copy(); te['Id'] = 'new_whale'
    folds = pd.read_csv('folds.csv')
    tr = tr.merge(folds[['Image','fold']], on='Image', how='left')
    # Extract embeddings (cached to disk to re-use)
    os.makedirs('embeddings_clip', exist_ok=True)
    train_emb_path = 'embeddings_clip/train_clip.npy'
    test_emb_path = 'embeddings_clip/test_clip.npy'
    # Only pay for loading the CLIP model when something actually has to be extracted.
    model = preprocess = None
    if not (os.path.exists(train_emb_path) and os.path.exists(test_emb_path)):
        model, _, preprocess = open_clip.create_model_and_transforms('ViT-L-14', pretrained='laion2b_s32b_b82k')
        model = model.to(device)
        # Ensure output features are float32
        if hasattr(model, 'float'): model = model.float()
    if os.path.exists(train_emb_path):
        E_tr = np.load(train_emb_path)
    else:
        print('[CLIP] Extracting train embeddings...')
        E_tr = extract_openclip_embeddings(model, preprocess, tr, 'train', batch_size=batch_size)
        np.save(train_emb_path, E_tr)
    if os.path.exists(test_emb_path):
        E_te = np.load(test_emb_path)
    else:
        print('[CLIP] Extracting test embeddings...')
        E_te = extract_openclip_embeddings(model, preprocess, te, 'test', batch_size=batch_size)
        np.save(test_emb_path, E_te)
    # Rows of E_tr / E_te are addressed by the row position in tr / te below, so a
    # stale cache of a different size must not be used silently.
    if E_tr.shape[0] != len(tr):
        raise ValueError(f'{train_emb_path} has {E_tr.shape[0]} rows but train has {len(tr)} images')
    if E_te.shape[0] != len(te):
        raise ValueError(f'{test_emb_path} has {E_te.shape[0]} rows but test has {len(te)} images')
    # Build OOF ranks for CLIP using train-excl-val gallery (exclude new_whale)
    is_known = (tr['Id'] != 'new_whale').values
    oof_clip = []; order_imgs = []
    for f in range(5):
        mask_val = (tr['fold'] == f).values
        # Boolean masks over tr keep the embedding rows and labels aligned. Indexing
        # E_tr with positions taken from a re-indexed subset would pick wrong rows,
        # including validation rows.
        mask_gal = ~mask_val & is_known
        if not mask_gal.any() or not mask_val.any(): continue
        G = E_tr[mask_gal]; Q = E_tr[mask_val]
        labs = tr.loc[mask_gal, 'Id'].tolist()
        sims, idxs = faiss_ip_search(Q, G, topK=topK)
        ranked = per_class_max_from_search(idxs, sims, labs)
        oof_clip.extend(ranked); order_imgs.extend(tr.loc[mask_val, 'Image'].tolist())
    # Build Test ranks for CLIP using FULL-train gallery (exclude new_whale)
    G_full = E_tr[is_known]; labs_full = tr.loc[is_known, 'Id'].tolist()
    sims_te, idxs_te = faiss_ip_search(E_te, G_full, topK=topK)
    te_rank_clip = per_class_max_from_search(idxs_te, sims_te, labs_full)
    # Tiny OOF ranks from cached embeddings (produced by prior cells)
    oof_tiny = []; oof_true_tiny = []; order_tiny = []
    for f in range(5):
        gal_embs = np.load(f'embeddings/f{f}_gal_embs.npy')
        val_embs = np.load(f'embeddings/f{f}_val_embs.npy')
        gal_df = pd.read_csv(f'embeddings/f{f}_gal_df.csv')
        val_df = pd.read_csv(f'embeddings/f{f}_val_df.csv')
        labs = gal_df['Id'].tolist()
        # One FAISS search per fold is enough.
        sims, idxs = faiss_ip_search(val_embs, gal_embs, topK=topK)
        ranked = per_class_max_from_search(idxs, sims, labs)
        oof_tiny.extend(ranked); oof_true_tiny.extend(val_df['Id'].tolist()); order_tiny.extend(val_df['Image'].tolist())
    # Align OOF by Image intersection
    oof_tiny_map = dict(zip(order_tiny, oof_tiny))
    oof_true_map = dict(zip(order_tiny, oof_true_tiny))
    # O(1) lookup of the CLIP ranking per image (first occurrence wins, like list.index).
    oof_clip_map = {}
    for img, r in zip(order_imgs, oof_clip):
        oof_clip_map.setdefault(img, r)
    inter = [img for img in order_imgs if img in oof_tiny_map]
    print(f"[Ensemble] OOF alignment (CLIP ∩ Tiny): {len(inter)}")
    # The aligned rankings and truth do not depend on the weight; build them once.
    aligned = [(oof_tiny_map[img], oof_clip_map[img]) for img in inter]
    truth = [oof_true_map[img] for img in inter]
    # Grid-search weight w for CLIP vs Tiny and tune tau/delta on merged ranks
    best = (-1.0, 0.5, 0.0, 0.5)  # (map, tau, delta, w_clip)
    for w in np.linspace(0.0, 1.0, 21):
        merged = [merge_rank_lists_weighted(r_tiny, r_clip, w=w) for r_tiny, r_clip in aligned]  # w applied to CLIP
        m, tau, dlt = tune_tau_delta(merged, truth)
        if m > best[0]: best = (m, tau, dlt, float(w))
    print(f"[Ensemble] OOF best: w_clip={best[3]:.2f}, tau={best[1]:.3f}, delta={best[2]:.3f}, MAP@5={best[0]:.4f}")
    # Build Tiny Test combined (across 5 folds) using cached full gallery
    train_csv = pd.read_csv('train.csv'); full_gal_df2 = train_csv[train_csv.Id!='new_whale'].copy(); labs_full2 = full_gal_df2['Id'].tolist()
    te_rank_tiny_folds = []
    for f in range(5):
        gal_full = np.load(f'embeddings/f{f}_gal_full_embs.npy')
        te_embs = np.load(f'embeddings/f{f}_te_embs.npy')
        sims, idxs = faiss_ip_search(te_embs, gal_full, topK=topK)
        ranked = per_class_max_from_search(idxs, sims, labs_full2)
        te_rank_tiny_folds.append(ranked)
    te_rank_tiny = combine_folds_mean(te_rank_tiny_folds)
    # Merge Tiny and CLIP test ranks with best weight and apply tau/delta
    w_clip = best[3]; tau_star = best[1]; dlt_star = best[2]
    preds5 = []
    for i in range(len(te_rank_clip)):
        merged = merge_rank_lists_weighted(te_rank_tiny[i], te_rank_clip[i], w=w_clip)
        if len(merged)==0:
            preds5.append('new_whale new_whale new_whale new_whale new_whale'); continue
        top1 = merged[0][1]; top2 = merged[1][1] if len(merged)>1 else -1.0
        is_new = (top1 < tau_star) or ((top1-top2) < dlt_star)
        head = [c for c,_ in merged[:5]]
        cand = (['new_whale'] + head[:4]) if is_new else head
        uniq = []
        for c in cand:
            if c not in uniq: uniq.append(c)
            if len(uniq)==5: break
        while len(uniq)<5: uniq.append('new_whale')
        preds5.append(' '.join(uniq))
    sub = pd.read_csv('sample_submission.csv')
    sub['Id'] = preds5
    sub.to_csv('submission.csv', index=False)
    print(f"[Ensemble] Saved submission.csv (Tiny + OpenCLIP ViT-L/14). Total elapsed {time.time()-t_all:.1f}s")

# Run the full pipeline: CLIP embedding extraction (or cache load), OOF tuning of the
# Tiny/CLIP blend weight and tau/delta, then write submission.csv.
build_clip_and_ensemble_with_tiny(topK=300, batch_size=128)

> install -c constraints.txt open_clip_torch==2.26.1 --upgrade-strategy only-if-needed


Collecting open_clip_torch==2.26.1
  Downloading open_clip_torch-2.26.1-py3-none-any.whl (1.5 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.5/1.5 MB 55.9 MB/s eta 0:00:00
Collecting huggingface-hub
  Downloading huggingface_hub-0.35.1-py3-none-any.whl (563 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 563.3/563.3 KB 531.7 MB/s eta 0:00:00


Collecting torch>=1.9.0
  Downloading torch-2.4.1-cp311-cp311-manylinux1_x86_64.whl (797.1 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 797.1/797.1 MB 224.5 MB/s eta 0:00:00


Collecting ftfy
  Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 44.8/44.8 KB 351.9 MB/s eta 0:00:00
Collecting timm
  Downloading timm-1.0.20-py3-none-any.whl (2.5 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.5/2.5 MB 236.1 MB/s eta 0:00:00
Collecting torchvision
  Downloading torchvision-0.19.1-cp311-cp311-manylinux1_x86_64.whl (7.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.0/7.0 MB 224.5 MB/s eta 0:00:00


Collecting regex
  Downloading regex-2025.9.18-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (798 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 799.0/799.0 KB 551.1 MB/s eta 0:00:00
Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 78.5/78.5 KB 446.3 MB/s eta 0:00:00
Collecting nvidia-nvtx-cu12==12.1.105
  Downloading nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (99 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 99.1/99.1 KB 433.3 MB/s eta 0:00:00
Collecting nvidia-cublas-cu12==12.1.3.1
  Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 410.6/410.6 MB 326.3 MB/s eta 0:00:00


Collecting nvidia-nccl-cu12==2.20.5
  Downloading nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_x86_64.whl (176.2 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 176.2/176.2 MB 315.8 MB/s eta 0:00:00
Collecting networkx
  Downloading networkx-3.5-py3-none-any.whl (2.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.0/2.0 MB 547.7 MB/s eta 0:00:00
Collecting nvidia-cudnn-cu12==9.1.0.70
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl (664.8 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 664.8/664.8 MB 277.0 MB/s eta 0:00:00


Collecting nvidia-cuda-runtime-cu12==12.1.105
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 823.6/823.6 KB 521.0 MB/s eta 0:00:00
Collecting filelock
  Downloading filelock-3.19.1-py3-none-any.whl (15 kB)
Collecting fsspec
  Downloading fsspec-2025.9.0-py3-none-any.whl (199 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 199.3/199.3 KB 484.2 MB/s eta 0:00:00
Collecting triton==3.0.0
  Downloading triton-3.0.0-1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (209.4 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 209.4/209.4 MB 275.3 MB/s eta 0:00:00
Collecting typing-extensions>=4.8.0
  Downloading typing_extensions-4.15.0-py3-none-any.whl (44 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 44.6/44.6 KB 395.4 MB/s eta 0:00:00
Collecting sympy
  Downloading sympy-1.14.0-py3-none-any.whl (6.3 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.3/6.3 MB 265.2 MB/s eta 0:00:00
Collecting nvidia-cusparse-cu12==12.1.0.106
  Downloading nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl (196.0 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 196.0/196.0 MB 289.0 MB/s eta 0:00:00
Collecting nvidia-cusolver-cu12==11.4.5.107
  Downloading nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl (124.2 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 124.2/124.2 MB 196.0 MB/s eta 0:00:00
Collecting nvidia-cuda-cupti-cu12==12.1.105
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 14.1/14.1 MB 308.0 MB/s eta 0:00:00
Collecting nvidia-curand-cu12==10.3.2.106
  Downloading nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 56.5/56.5 MB 382.0 MB/s eta 0:00:00
Collecting jinja2
  Downloading jinja2-3.1.6-py3-none-any.whl (134 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 134.9/134.9 KB 475.2 MB/s eta 0:00:00
Collecting nvidia-cufft-cu12==11.0.2.54
  Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 121.6/121.6 MB 150.5 MB/s eta 0:00:00
Collecting nvidia-cuda-nvrtc-cu12==12.1.105
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 23.7/23.7 MB 315.6 MB/s eta 0:00:00


Collecting nvidia-nvjitlink-cu12
  Downloading nvidia_nvjitlink_cu12-12.9.86-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl (39.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 39.7/39.7 MB 229.6 MB/s eta 0:00:00


Collecting wcwidth
  Downloading wcwidth-0.2.14-py2.py3-none-any.whl (37 kB)
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (762 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 763.0/763.0 KB 502.8 MB/s eta 0:00:00


Collecting hf-xet<2.0.0,>=1.1.3
  Downloading hf_xet-1.1.10-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.2/3.2 MB 358.3 MB/s eta 0:00:00
Collecting packaging>=20.9
  Downloading packaging-25.0-py3-none-any.whl (66 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 66.5/66.5 KB 431.3 MB/s eta 0:00:00
Collecting requests
  Downloading requests-2.32.5-py3-none-any.whl (64 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 64.7/64.7 KB 412.6 MB/s eta 0:00:00


Collecting safetensors
  Downloading safetensors-0.6.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (485 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 485.8/485.8 KB 241.5 MB/s eta 0:00:00


Collecting pillow!=8.3.*,>=5.3.0
  Downloading pillow-11.3.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (6.6 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.6/6.6 MB 357.7 MB/s eta 0:00:00


Collecting numpy
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 18.3/18.3 MB 370.7 MB/s eta 0:00:00


Collecting MarkupSafe>=2.0
  Downloading MarkupSafe-3.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (23 kB)
Collecting charset_normalizer<4,>=2
  Downloading charset_normalizer-3.4.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (150 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 150.3/150.3 KB 460.0 MB/s eta 0:00:00
Collecting urllib3<3,>=1.21.1
  Downloading urllib3-2.5.0-py3-none-any.whl (129 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 129.8/129.8 KB 492.0 MB/s eta 0:00:00


Collecting certifi>=2017.4.17
  Downloading certifi-2025.8.3-py3-none-any.whl (161 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 161.2/161.2 KB 509.6 MB/s eta 0:00:00
Collecting idna<4,>=2.5
  Downloading idna-3.10-py3-none-any.whl (70 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 70.4/70.4 KB 426.0 MB/s eta 0:00:00
Collecting mpmath<1.4,>=1.1.0
  Downloading mpmath-1.3.0-py3-none-any.whl (536 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 536.2/536.2 KB 472.2 MB/s eta 0:00:00


Installing collected packages: mpmath, wcwidth, urllib3, typing-extensions, tqdm, sympy, safetensors, regex, pyyaml, pillow, packaging, nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, numpy, networkx, MarkupSafe, idna, hf-xet, fsspec, filelock, charset_normalizer, certifi, triton, requests, nvidia-cusparse-cu12, nvidia-cudnn-cu12, jinja2, ftfy, nvidia-cusolver-cu12, huggingface-hub, torch, torchvision, timm, open_clip_torch


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
albumentations 2.0.8 requires albucore==0.0.24, but you have albucore 0.0.33 which is incompatible.


Successfully installed MarkupSafe-3.0.2 certifi-2025.8.3 charset_normalizer-3.4.3 filelock-3.19.1 fsspec-2025.9.0 ftfy-6.3.1 hf-xet-1.1.10 huggingface-hub-0.35.1 idna-3.10 jinja2-3.1.6 mpmath-1.3.0 networkx-3.5 numpy-1.26.4 nvidia-cublas-cu12-12.1.3.1 nvidia-cuda-cupti-cu12-12.1.105 nvidia-cuda-nvrtc-cu12-12.1.105 nvidia-cuda-runtime-cu12-12.1.105 nvidia-cudnn-cu12-9.1.0.70 nvidia-cufft-cu12-11.0.2.54 nvidia-curand-cu12-10.3.2.106 nvidia-cusolver-cu12-11.4.5.107 nvidia-cusparse-cu12-12.1.0.106 nvidia-nccl-cu12-2.20.5 nvidia-nvjitlink-cu12-12.9.86 nvidia-nvtx-cu12-12.1.105 open_clip_torch-2.26.1 packaging-25.0 pillow-11.3.0 pyyaml-6.0.2 regex-2025.9.18 requests-2.32.5 safetensors-0.6.2 sympy-1.14.0 timm-1.0.20 torch-2.4.1 torchvision-0.19.1 tqdm-4.67.1 triton-3.0.0 typing-extensions-4.15.0 urllib3-2.5.0 wcwidth-0.2.14


[CLIP] Device: cpu


  checkpoint = torch.load(checkpoint_path, map_location=map_location)


[CLIP] Extracting train embeddings...


[CLIP] 2560/7240 imgs, elapsed 367.7s


In [None]:
# Tiny-only with k-reciprocal re-ranking (OOF-tuned tau/delta); uses cached tiny embeddings; single-config (k1=20,k2=6,lambda=0.3) with gal_full fallback
import os, time, gc
import numpy as np
import pandas as pd

def map5_score(y_true_ids, y_pred_ranked_ids):
    """Return MAP@5: the mean of 1/(position+1) of the true id in the first five predictions.

    Queries whose true id is absent from their first five predictions contribute 0.
    """
    scores = [
        next((1.0 / (pos + 1) for pos, pred in enumerate(preds[:5]) if pred == truth), 0.0)
        for truth, preds in zip(y_true_ids, y_pred_ranked_ids)
    ]
    return float(np.mean(scores))

def tune_tau_delta(rank_lists, true_ids, tau_grid_coarse=None, tau_window=0.05, tau_step_fine=0.005, delta_grid=None):
    """Search for the (tau, delta) pair that maximises MAP@5 on the given rankings.

    Decision rule per query: predict ``new_whale`` first (followed by the top four
    classes) when the top score is below ``tau`` or the top-1/top-2 gap is below
    ``delta``; otherwise predict the top five classes. Predictions are de-duplicated
    and padded with ``new_whale`` up to five entries.

    The search is a coarse sweep over every (delta, tau) pair, followed by a fine
    sweep over tau within ``tau_window`` of the coarse optimum at the best delta.

    Returns:
        ``(map5, tau, delta)`` of the best setting found.
    """
    coarse_taus = np.arange(0.2, 0.801, 0.02) if tau_grid_coarse is None else tau_grid_coarse
    deltas = [0.0, 0.03, 0.05, 0.08, 0.10, 0.12, 0.15] if delta_grid is None else delta_grid

    def top5_for(ranked, tau, delta):
        if len(ranked) == 0:
            return ['new_whale'] * 5
        classes = [c for c, _ in ranked]
        best_sim = ranked[0][1]
        runner_up = ranked[1][1] if len(ranked) > 1 else -1.0
        if (best_sim < tau) or ((best_sim - runner_up) < delta):
            cand = ['new_whale'] + classes[:4]
        else:
            cand = classes[:5]
        picks = []
        for c in cand:
            if c in picks:
                continue
            picks.append(c)
            if len(picks) == 5:
                break
        picks.extend(['new_whale'] * (5 - len(picks)))
        return picks

    def score(tau, delta):
        return map5_score(true_ids, [top5_for(r, tau, delta) for r in rank_lists])

    best = (-1.0, 0.5, 0.0)
    # Coarse sweep (delta outer, tau inner; strict '>' keeps the first best on ties).
    for dlt in deltas:
        for tau in coarse_taus:
            m = score(tau, dlt)
            if m > best[0]:
                best = (m, float(tau), float(dlt))
    # Fine sweep over tau only.
    tau_c, dlt_c = best[1], best[2]
    fine_lo = max(0.0, tau_c - tau_window)
    fine_hi = min(1.0, tau_c + tau_window) + 1e-6
    for tau in np.arange(fine_lo, fine_hi, tau_step_fine):
        m = score(tau, dlt_c)
        if m > best[0]:
            best = (m, float(tau), float(dlt_c))
    return best  # (map5, tau, delta)

def _compute_distance_mats(Q, G):
    # Assumes L2-normalized embeddings. Euclidean^2 = 2 - 2*cos
    Q = Q.astype('float32'); G = G.astype('float32')
    qg = 2.0 - 2.0 * (Q @ G.T)
    qq = 2.0 - 2.0 * (Q @ Q.T)
    gg = 2.0 - 2.0 * (G @ G.T)
    # clamp to non-negative for numerical stability
    np.maximum(qg, 0.0, out=qg); np.maximum(qq, 0.0, out=qq); np.maximum(gg, 0.0, out=gg)
    return qg, qq, gg

def re_ranking_kreciprocal(Q, G, k1=20, k2=6, lambda_value=0.3, print_log=False):
    # Adapted from Zhong et al. (CVPR'17) re-ranking; numpy version
    # Returns re-ranked distance matrix of shape (nq, ng)
    q_g_dist, q_q_dist, g_g_dist = _compute_distance_mats(Q, G)
    nq, ng = q_g_dist.shape
    all_num = nq + ng
    # Combine query and gallery for unified k-reciprocal computation
    orig_dist = np.zeros((all_num, all_num), dtype=np.float32)
    orig_dist[:nq, :nq] = q_q_dist
    orig_dist[:nq, nq:] = q_g_dist
    orig_dist[nq:, :nq] = q_g_dist.T
    orig_dist[nq:, nq:] = g_g_dist
    del q_q_dist, g_g_dist
    V = np.zeros_like(orig_dist, dtype=np.float32)
    initial_rank = np.argsort(orig_dist, axis=1).astype(np.int32)
    for i in range(all_num):
        forward_k_neigh_index = initial_rank[i, :min(k1+1, 500)]
        backward_k_neigh_index = initial_rank[forward_k_neigh_index, :min(k1+1, 500)]
        fi = np.where(backward_k_neigh_index == i)[0]
        k_reciprocal_index = forward_k_neigh_index[fi]
        k_reciprocal_expansion_index = k_reciprocal_index
        for candidate in k_reciprocal_index:
            candidate_forward_k = initial_rank[candidate, :int(np.around(k1/2))+1]
            candidate_backward_k = initial_rank[candidate_forward_k, :int(np.around(k1/2))+1]
            fi2 = np.where(candidate_backward_k == candidate)[0]
            if len(np.intersect1d(fi2, np.where(candidate_forward_k==i)[0])) > 2/3*len(fi2):
                k_reciprocal_expansion_index = np.append(k_reciprocal_expansion_index, candidate_forward_k)
        k_reciprocal_expansion_index = np.unique(k_reciprocal_expansion_index)
        weights = np.exp(-orig_dist[i, k_reciprocal_expansion_index])
        V[i, k_reciprocal_expansion_index] = weights / np.sum(weights)
    if print_log:
        print('[ReRank] V computed')
    if k2 > 1:
        V_qe = np.zeros_like(V, dtype=np.float32)
        for i in range(all_num):
            idx = initial_rank[i, :k2]
            V_qe[i] = V[idx].mean(axis=0)
        V = V_qe
        if print_log:
            print('[ReRank] Query expansion applied')
    invIndex = []
    for i in range(all_num):
        invIndex.append(np.where(V[:, i] != 0)[0])
    if print_log:
        print('[ReRank] Inverted index built')
    jaccard_dist = np.zeros((all_num, all_num), dtype=np.float32)
    for i in range(all_num):
        temp_min = np.zeros((1, all_num), dtype=np.float32)
        indNonZero = np.where(V[i, :] != 0)[0]
        indImages = []
        for j in indNonZero:
            indImages += invIndex[j].tolist()
        indImages = np.unique(np.array(indImages))
        temp_min[0, indImages] = np.minimum(V[i, indImages], V[indImages, i]).sum(axis=0)
        jaccard_dist[i] = 1 - temp_min / (2 - temp_min)
    final_dist = jaccard_dist * (1 - lambda_value) + orig_dist * lambda_value
    del jaccard_dist, V, orig_dist, initial_rank
    # Return only query-gallery part
    return final_dist[:nq, nq:]

def per_class_rank_from_sim(sim_mat, gallery_labels):
    """Reduce a dense query-by-gallery similarity matrix to per-class rankings.

    For each query row, every class keeps the highest similarity among its gallery
    items. Returns one list per query of ``(class, similarity)`` tuples sorted by
    similarity, descending.
    """
    rankings = []
    for qi in range(sim_mat.shape[0]):
        top_by_class = {}
        for gi, raw in enumerate(sim_mat[qi]):
            label = gallery_labels[gi]
            score = float(raw)
            previous = top_by_class.get(label)
            if previous is None or score > previous:
                top_by_class[label] = score
        rankings.append(sorted(top_by_class.items(), key=lambda pair: pair[1], reverse=True))
    return rankings

# Minimal fallback: load convnext_tiny checkpoint and compute FULL gallery embeddings if missing
def _fallback_compute_full_gallery_embs(fold, full_gal_df, img_size=384, batch_size=64):
    """Embed the images listed in ``full_gal_df`` with the convnext_tiny checkpoint of ``fold``.

    Loads ``checkpoints/fold{fold}.pt`` (its ``'model'`` entry) into a convnext_tiny
    backbone followed by a bias-free linear layer and BatchNorm1d with 512 outputs.
    Images are read from ``train/``. Each image's embedding is the average of the
    model outputs for the image and its horizontal flip, L2-normalized at the end.

    Returns:
        float32 array with one row per row of ``full_gal_df``.
    """
    import torch
    import torchvision.transforms as T
    from PIL import Image
    import timm

    class _GalleryImages(torch.utils.data.Dataset):
        def __init__(self, frame, root, transform):
            self.frame = frame.reset_index(drop=True)
            self.root = root
            self.transform = transform
        def __len__(self):
            return len(self.frame)
        def __getitem__(self, i):
            path = os.path.join(self.root, self.frame.iloc[i]['Image'])
            with Image.open(path) as im:
                im = im.convert('RGB')
                return self.transform(im), self.frame.iloc[i]['Image']

    class _TinyEmbedder(torch.nn.Module):
        # Attribute names (backbone/head/bn) must match the checkpoint keys.
        def __init__(self, backbone_name='convnext_tiny', embed_dim=512):
            super().__init__()
            self.backbone = timm.create_model(backbone_name, pretrained=False, num_classes=0, global_pool='avg')
            self.head = torch.nn.Linear(self.backbone.num_features, embed_dim, bias=False)
            self.bn = torch.nn.BatchNorm1d(embed_dim)
        def forward(self, x):
            projected = self.bn(self.head(self.backbone(x)))
            return torch.nn.functional.normalize(projected, p=2, dim=1)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Resize to 1.15x the target size, then center-crop; ImageNet mean/std.
    transform = T.Compose([
        T.Resize(int(img_size*1.15), interpolation=T.InterpolationMode.BILINEAR),
        T.CenterCrop(img_size),
        T.ToTensor(),
        T.Normalize(mean=(0.485,0.456,0.406), std=(0.229,0.224,0.225)),
    ])
    loader = torch.utils.data.DataLoader(
        _GalleryImages(full_gal_df, 'train', transform),
        batch_size=batch_size,
        shuffle=False,
        num_workers=8,
        pin_memory=True,
        persistent_workers=True,
    )
    model = _TinyEmbedder('convnext_tiny', 512).to(device)
    checkpoint = torch.load(f'checkpoints/fold{fold}.pt', map_location=device)
    model.load_state_dict(checkpoint['model'], strict=True)
    model.eval()
    chunks = []
    with torch.no_grad(), torch.amp.autocast('cuda', enabled=torch.cuda.is_available()):
        for batch, _names in loader:
            batch = batch.to(device, non_blocking=True)
            plain = model(batch)
            mirrored = model(torch.flip(batch, dims=[3]))  # horizontal flip
            averaged = (plain + mirrored) / 2.0
            chunks.append(averaged.detach().cpu().numpy())
    E = np.concatenate(chunks, axis=0).astype('float32')
    # L2-normalize in place
    norms = np.linalg.norm(E, axis=1, keepdims=True) + 1e-9
    E /= norms
    return E

def tiny_krecip_rerank_and_submit(k1_grid=(20,), k2_grid=(6,), lam_grid=(0.3,), topK=None):
    """Re-rank the cached tiny embeddings with k-reciprocal re-ranking and write submission.csv.

    Every ``(k1, k2, lambda)`` combination from the three grids is evaluated
    out-of-fold: re-ranked distances are turned into similarities ``1 / (1 + dist)``,
    reduced to per-class rankings, and tau/delta are tuned for MAP@5. The
    combination with the best OOF MAP@5 (first one on ties) is then applied to the
    test embeddings of each fold against the full-train gallery. Fold rankings are
    averaged per class and the tuned tau/delta rule produces the final top-5.

    With the default single-element grids this is a single-config run.

    Args:
        k1_grid: Candidate k1 values for ``re_ranking_kreciprocal``.
        k2_grid: Candidate k2 values.
        lam_grid: Candidate lambda values.
        topK: Accepted for signature compatibility; not used.

    Raises:
        ValueError: If any of the grids is empty.
    """
    from itertools import product  # local import: not part of this cell's import block
    t_all = time.time()
    configs = [(int(a), int(b), float(c)) for a, b, c in product(k1_grid, k2_grid, lam_grid)]
    if not configs:
        raise ValueError('k1_grid, k2_grid and lam_grid must each contain at least one value')
    # OOF: evaluate every config across 5 folds and keep the best by MAP@5
    best = None  # (map5, tau, delta, k1, k2, lam)
    for k1, k2, lam in configs:
        oof_true = []; oof_ranked = []
        for f in range(5):
            t_fold = time.time()
            gal_embs = np.load(f'embeddings/f{f}_gal_embs.npy')  # (ng, d), L2-normalized
            val_embs = np.load(f'embeddings/f{f}_val_embs.npy')  # (nq, d), L2-normalized
            gal_df = pd.read_csv(f'embeddings/f{f}_gal_df.csv')
            val_df = pd.read_csv(f'embeddings/f{f}_val_df.csv')
            gal_labels = gal_df['Id'].tolist()
            dist_qg = re_ranking_kreciprocal(val_embs, gal_embs, k1=k1, k2=k2, lambda_value=lam, print_log=False)
            sim_qg = 1.0 / (1.0 + dist_qg)  # convert distance to [0,1] similarity
            ranked = per_class_rank_from_sim(sim_qg, gal_labels)
            oof_ranked.extend(ranked); oof_true.extend(val_df['Id'].tolist())
            print(f'[ReRank OOF] fold {f} processed (val={len(val_df)}, gal={len(gal_df)}) in {time.time()-t_fold:.1f}s', flush=True)
        # Tune tau/delta on OOF
        cfg_map, cfg_tau, cfg_delta = tune_tau_delta(oof_ranked, oof_true)
        print(f"[ReRank OOF] cfg k1={k1}, k2={k2}, lambda={lam} | tau={cfg_tau:.3f}, delta={cfg_delta:.3f}, MAP@5={cfg_map:.4f}")
        if best is None or cfg_map > best[0]:
            best = (cfg_map, cfg_tau, cfg_delta, k1, k2, lam)
    best_map, best_tau, best_delta, k1, k2, lam = best
    if len(configs) > 1:
        print(f"[ReRank OOF] selected k1={k1}, k2={k2}, lambda={lam} | MAP@5={best_map:.4f}")
    # TEST: compute re-ranked sim for each fold using FULL-train gallery embeddings (fallback if missing)
    tr = pd.read_csv('train.csv')
    full_gal_df = tr[tr.Id != 'new_whale'].copy()
    labs_full = full_gal_df['Id'].tolist()
    te_ranked_folds = []
    for f in range(5):
        t_fold = time.time()
        gal_full_path = f'embeddings/f{f}_gal_full_embs.npy'
        if os.path.exists(gal_full_path):
            gal_full = np.load(gal_full_path)
        else:
            print(f'[ReRank TEST] fold {f}: gal_full missing; computing via checkpoint...', flush=True)
            gal_full = _fallback_compute_full_gallery_embs(f, full_gal_df, img_size=384, batch_size=64)
            os.makedirs('embeddings', exist_ok=True)
            np.save(gal_full_path, gal_full)
        te_embs = np.load(f'embeddings/f{f}_te_embs.npy')
        dist_qg = re_ranking_kreciprocal(te_embs, gal_full, k1=k1, k2=k2, lambda_value=lam, print_log=False)
        sim_qg = 1.0 / (1.0 + dist_qg)
        ranked = per_class_rank_from_sim(sim_qg, labs_full)
        te_ranked_folds.append(ranked)
        print(f'[ReRank TEST] fold {f} processed (test={sim_qg.shape[0]}, gal_full={sim_qg.shape[1]}) in {time.time()-t_fold:.1f}s', flush=True)
    # Combine folds by mean score per class
    N = len(te_ranked_folds[0])
    te_comb = []
    for i in range(N):
        d = {}
        for f in range(5):
            for c, s in te_ranked_folds[f][i]:
                d.setdefault(c, []).append(s)
        arr = [(c, float(np.mean(v))) for c, v in d.items()]
        arr.sort(key=lambda x: x[1], reverse=True)
        te_comb.append(arr)
    # Apply decision rule with best tau/delta
    preds5 = []
    for ranked in te_comb:
        if len(ranked) == 0:
            preds5.append('new_whale new_whale new_whale new_whale new_whale'); continue
        top1 = ranked[0][1]
        top2 = ranked[1][1] if len(ranked) > 1 else -1.0
        cond_new = (top1 < best_tau) or ((top1 - top2) < best_delta)
        head = [c for c, _ in ranked[:5]]
        cand = (['new_whale'] + head[:4]) if cond_new else head
        uniq = []
        for c in cand:
            if c not in uniq: uniq.append(c)
            if len(uniq) == 5: break
        while len(uniq) < 5: uniq.append('new_whale')
        preds5.append(' '.join(uniq))
    sub = pd.read_csv('sample_submission.csv')
    sub['Id'] = preds5
    sub.to_csv('submission.csv', index=False)
    print(f"[ReRank] Saved submission.csv | Total elapsed {time.time()-t_all:.1f}s")

# Run k-reciprocal re-ranking with a single configuration (k1=20, k2=6, lambda=0.3);
# this tunes tau/delta out-of-fold and overwrites submission.csv.
tiny_krecip_rerank_and_submit(k1_grid=(20,), k2_grid=(6,), lam_grid=(0.3,))