
# Dog Breed Identification — Final Submission Pipeline
This notebook is a clean, linear, and self-contained pipeline that builds the final submission.csv from validated artifacts:
- Calibrated baseline (embedding linear head) logits and temperature
- Optimized kNN geometric blend hyperparameters (K, tau, lambda)
- OOF-calibrated temperature for the blended model

Inputs expected in CWD:
- labels.csv, sample_submission.csv, train_image_meta.csv, test_image_meta.csv
- oof_probs.npy, test_logits.npy, temperatures.json (baseline calibration; if temperatures.json is missing or unreadable, the baseline temperature defaults to 1.0)
- train_embeds.npy, test_embeds.npy (L2-normalized at use-time)
- fold_assignments.csv (5-fold StratifiedGroupKFold)
- knn_blend_config.json (optional; holds the best K, tau, lambda). If the file or any of its keys is missing, the hard-coded defaults K=300, tau=0.02, lambda=0.59 are used

Output:
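- submission.csv (final calibrated, kNN-blended probabilities)
- submission_baseline.csv (calibrated baseline only) and temperatures_knn_blend.json (fitted T together with K, tau, lambda) are also written along the way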


In [None]:

import numpy as np, pandas as pd, json, shutil
from pathlib import Path
import faiss
import torch

def softmax_np(x):
    """Return the row-wise softmax of a 2-D array of logits.

    The per-row maximum is subtracted before exponentiating so that large
    logits cannot overflow; the shift cancels out in the normalisation.
    """
    shifted = x - x.max(axis=1, keepdims=True)
    weights = np.exp(shifted)
    return weights / weights.sum(axis=1, keepdims=True)

def row_normalize(p, eps=1e-12):
    """Clip every entry of *p* into [eps, 1] and rescale each row to sum to 1.

    The clip produces a fresh array, so the in-place division below never
    modifies the array passed in by the caller.
    """
    clipped = np.clip(p, eps, 1.0)
    row_totals = clipped.sum(axis=1, keepdims=True)
    clipped /= row_totals
    return clipped

def fit_temperature_from_probs(P, y_true, device='cpu'):
    """Fit one global temperature for a matrix of probabilities.

    The log of the (clipped) probabilities is treated as logits, divided by a
    scalar temperature, and the cross-entropy against ``y_true`` is minimised
    with a single L-BFGS step (up to 100 inner iterations).

    Args:
        P: numpy array of probabilities, one row per sample.
        y_true: numpy array of integer class indices, one per row of ``P``.
        device: torch device on which the optimisation runs.

    Returns:
        The fitted temperature as a Python float, never below 1e-3.
    """
    log_probs = torch.from_numpy(np.log(np.clip(P, 1e-12, 1.0))).to(device)
    targets = torch.from_numpy(y_true).long().to(device)
    temperature = torch.tensor(1.0, dtype=torch.float32, requires_grad=True, device=device)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.LBFGS(
        [temperature], lr=0.5, max_iter=100, line_search_fn='strong_wolfe'
    )

    def _evaluate():
        # L-BFGS re-evaluates the objective several times per step.
        optimizer.zero_grad()
        # The clamp keeps the division finite if the search probes T near 0.
        scaled_logits = log_probs / torch.clamp(temperature, min=1e-3)
        loss = criterion(scaled_logits, targets)
        loss.backward()
        return loss

    optimizer.step(_evaluate)
    fitted = temperature.detach().cpu().clamp_min(1e-3)
    return float(fitted.item())

# 1) Rebuild calibrated baseline test probs and promote to submission.csv
sample_df = pd.read_csv('sample_submission.csv')
# Class columns, in submission order, are every column except the id.
classes = [c for c in sample_df.columns if c != 'id']
test_meta = pd.read_csv('test_image_meta.csv')
test_ids = test_meta['id'].tolist()
baseline_logits_path = Path('test_logits.npy')
temps_path = Path('temperatures.json')
# Raise explicitly instead of asserting so the check survives `python -O`.
if not baseline_logits_path.exists():
    raise FileNotFoundError('test_logits.npy missing; regenerate baseline artifacts first.')
# NOTE(review): rows of test_logits.npy are assumed to follow the row order of
# test_image_meta.csv and its columns the order of `classes` -- confirm upstream.
logits = np.load(baseline_logits_path)
# The baseline temperature is best-effort: an absent or malformed file falls
# back to T=1.0 (plain softmax), but the reason is reported instead of hidden.
T_base = 1.0
if temps_path.exists():
    try:
        with open(temps_path) as fh:
            T_base = float(json.load(fh)['global_T'])
    except (OSError, ValueError, KeyError, TypeError) as err:
        print(f'Could not read global_T from {temps_path} ({err!r}); using T_base=1.0')
        T_base = 1.0
# Guard against a zero/negative temperature before dividing the logits.
probs_base_test = row_normalize(softmax_np(logits / max(1e-3, T_base)))
sub_base = pd.DataFrame(probs_base_test, columns=classes)
sub_base.insert(0, 'id', test_ids)
sub_base.to_csv('submission_baseline.csv', index=False)
# Promote the baseline so a valid submission.csv exists even if later steps fail.
shutil.copyfile('submission_baseline.csv', 'submission.csv')
print('Baseline rebuilt and promoted: submission_baseline.csv -> submission.csv')

# 2) Load optimized kNN config (fallback to validated best if missing)
cfg_path = Path('knn_blend_config.json')
if cfg_path.exists():
    # Use a context manager so the file handle is closed deterministically.
    with open(cfg_path) as fh:
        cfg = json.load(fh)
    # Any key absent from the file falls back to the same default as below.
    K = int(cfg.get('K', 300))
    tau = float(cfg.get('tau', 0.02))
    lam = float(cfg.get('lambda', 0.59))
else:
    K, tau, lam = 300, 0.02, 0.59
print(f'Using optimized kNN config: K={K}, tau={tau}, lambda={lam}')

def _knn_class_probs(sims, neighbor_labels, n_classes, tau):
    """Convert neighbour similarities into per-query class probabilities.

    Each query's similarities are turned into softmax weights with temperature
    ``tau``; the weights are summed per neighbour class on top of a 1e-12
    floor, and every row is renormalised to sum to 1.

    Args:
        sims: (n_queries, k) array of similarity scores.
        neighbor_labels: (n_queries, k) integer class index of each neighbour.
        n_classes: number of output columns.
        tau: softmax temperature; values below 1e-8 are treated as 1e-8.

    Returns:
        (n_queries, n_classes) float64 array whose rows sum to 1.
    """
    scaled = np.asarray(sims, dtype=np.float64) / max(1e-8, tau)
    # Subtracting the row max leaves the softmax weights unchanged but keeps
    # exp() from overflowing to inf (and the weights from becoming NaN) when
    # tau is small.
    scaled -= scaled.max(axis=1, keepdims=True)
    weights = np.exp(scaled)
    weights /= weights.sum(axis=1, keepdims=True)
    probs = np.full((weights.shape[0], n_classes), 1e-12, dtype=np.float64)
    row_ids = np.repeat(np.arange(weights.shape[0]), weights.shape[1])
    # Unbuffered scatter-add: repeated (row, class) pairs accumulate correctly.
    np.add.at(probs, (row_ids, np.asarray(neighbor_labels).ravel()), weights.ravel())
    probs /= probs.sum(axis=1, keepdims=True)
    return probs


# 3) Recompute OOF kNN with (K,tau), blend with baseline OOF, and fit global temperature T
labels_df = pd.read_csv('labels.csv')
breed_to_idx = {b: i for i, b in enumerate(classes)}
breed_codes = labels_df['breed'].map(breed_to_idx)
# An unmapped breed would become NaN and then an arbitrary int64 label.
if breed_codes.isna().any():
    unknown = sorted(labels_df.loc[breed_codes.isna(), 'breed'].astype(str).unique())
    raise ValueError(f'labels.csv contains breeds absent from sample_submission.csv: {unknown[:5]}')
y = breed_codes.values.astype(np.int64)
# Align fold ids to the row order of labels.csv.
folds = pd.read_csv('fold_assignments.csv').set_index('id').loc[labels_df['id'], 'fold'].values.astype(int)
n_folds = int(folds.max() + 1)
oof_base = np.load('oof_probs.npy').astype(np.float64)

train_embeds_raw = np.load('train_embeds.npy').astype(np.float32)
# Embedding rows follow train_image_meta.csv; reorder them to labels.csv order.
train_ids_order = pd.read_csv('train_image_meta.csv')['id'].tolist()
id_to_pos = {id_: i for i, id_ in enumerate(train_ids_order)}
reindex = np.array([id_to_pos[id_] for id_ in labels_df['id']], dtype=np.int64)
X = train_embeds_raw[reindex]
# L2-normalise so the inner-product index returns cosine similarities.
X /= (np.linalg.norm(X, axis=1, keepdims=True) + 1e-12)

C = len(classes)
oof_knn = np.zeros_like(oof_base, dtype=np.float64)
for f in range(n_folds):
    tr = np.where(folds != f)[0]
    va = np.where(folds == f)[0]
    if va.size == 0:
        # A fold id with no rows has nothing to predict.
        continue
    # Index only the other folds so a validation row never sees itself.
    index = faiss.IndexFlatIP(X.shape[1])
    index.add(X[tr])
    sims, idxs = index.search(X[va], min(K, len(tr)))
    # idxs are positions within `tr`; map them to labels once for the whole fold.
    oof_knn[va] = _knn_class_probs(sims, y[tr][idxs], C, tau)

# Geometric blend: baseline^(1-lambda) * knn^lambda, renormalised per row.
P_oof = row_normalize((oof_base ** max(0.0, 1.0 - lam)) * (oof_knn ** lam))
T = fit_temperature_from_probs(P_oof, y)
with open('temperatures_knn_blend.json', 'w') as fh:
    json.dump({'global_T': float(T), 'K': int(K), 'tau': float(tau), 'lambda': float(lam)}, fh)
print(f'OOF calibration complete: T={T:.4f} (lambda={lam}, K={K}, tau={tau})')

# 4) Build test kNN with (K,tau), blend with baseline test probs, apply T, and write submission.csv
test_embeds_raw = np.load('test_embeds.npy').astype(np.float32)
X_test = test_embeds_raw
X_test /= (np.linalg.norm(X_test, axis=1, keepdims=True) + 1e-12)
# At test time every training row is a candidate neighbour.
index_full = faiss.IndexFlatIP(X.shape[1])
index_full.add(X)
sims_te, idxs_te = index_full.search(X_test, min(K, X.shape[0]))
test_knn = _knn_class_probs(sims_te, y[idxs_te], C, tau)

# Read the baseline from its own file rather than from submission.csv: this
# step overwrites submission.csv, so re-running it would otherwise blend an
# already-blended submission a second time.
sub_base_aligned = pd.read_csv('submission_baseline.csv')
if list(sub_base_aligned.columns)[1:] != classes:
    raise ValueError('submission_baseline.csv columns do not match sample_submission.csv classes.')
P_base_test = sub_base_aligned[classes].to_numpy(dtype=np.float64)
if P_base_test.shape[0] != test_knn.shape[0]:
    raise ValueError(
        f'Baseline has {P_base_test.shape[0]} rows but test_embeds.npy has {test_knn.shape[0]}.'
    )

P_blend = row_normalize((P_base_test ** max(0.0, 1.0 - lam)) * (test_knn ** lam))
# Temperature scaling in probability space: p^(1/T), renormalised.
P_test_cal = row_normalize(np.power(P_blend, 1.0 / max(1e-3, T)))

sub_out = pd.DataFrame(P_test_cal, columns=classes)
sub_out.insert(0, 'id', sub_base_aligned['id'])
sub_out.to_csv('submission.csv', index=False)
print('Final submission.csv written: baseline-aligned geometric kNN blend + OOF-calibrated temperature applied.')
    