In [None]:
# Build 3-fold Grouped CV by source archive (leave-one-tar-out) and save folds
import os, io, tarfile, zipfile, json, sys, time
from pathlib import Path
import pandas as pd

print('=== Building archive-grouped CV folds (training1/2/3) ===', flush=True)
CWD = Path('.')
TRAIN_TARS = [CWD/'training1.tar.gz', CWD/'training2.tar.gz', CWD/'training3.tar.gz']

train_df = pd.read_csv('training.csv')
train_ids = set(train_df['Id'].astype(int).tolist())

def tar_members_ids(tarpath: Path):
    ids = set()
    with tarfile.open(tarpath, 'r:*') as tf:
        for m in tf:
            if not m.isreg():
                continue
            nm = m.name.lstrip('./')
            if nm.endswith('.zip') and nm.startswith('Sample') and len(nm) >= len('Sample00001.zip'):
                try:
                    sid = int(nm[6:11])
                    if sid in train_ids:
                        ids.add(sid)
                except Exception:
                    pass
    return ids

groups = {}  # id -> group (1,2,3)
tar_id_sets = []
for gi, tp in enumerate(TRAIN_TARS, start=1):
    if not tp.exists():
        print(f'WARNING: missing {tp}', flush=True)
        tar_id_sets.append(set());
        continue
    s = tar_members_ids(tp); tar_id_sets.append(s)
    for sid in s:
        groups[sid] = gi

# Sanity: all training ids should appear in one of the tars
miss = sorted([sid for sid in train_ids if sid not in groups])
if miss:
    print(f'WARNING: {len(miss)} training Ids not found in any training*.tar.gz e.g., {miss[:10]}', flush=True)

# Build 3 folds: each fold validates on one tar, trains on the other two
folds = []
for holdout_idx in range(3):
    val_ids = sorted(tar_id_sets[holdout_idx])
    tr_ids = sorted(set().union(*[tar_id_sets[j] for j in range(3) if j != holdout_idx]))
    folds.append({'fold': holdout_idx, 'train_ids': tr_ids, 'val_ids': val_ids})

# Save id->tar map and folds
pd.DataFrame({'Id': list(groups.keys()), 'archive_group': [groups[i] for i in groups.keys()]}).to_csv('id_to_archive.csv', index=False)
with open('folds_archive_cv.json', 'w') as f:
    json.dump(folds, f)

# Print summary
print('Fold sizes:')
for f in folds:
    print(f"  fold={f['fold']} train={len(f['train_ids'])} val={len(f['val_ids'])}")
cover = set().union(*tar_id_sets) if tar_id_sets else set()
print(f'Total train_ids={len(train_ids)}; covered_by_tars={len(cover)}; unmatched={len(train_ids-cover)}')
print('Saved: id_to_archive.csv, folds_archive_cv.json', flush=True)

In [None]:
# Grouped-CV OOF eval + probs caching + small grid over decoder/blend (per expert plan)
import os, json, time, math
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('CUDA:', torch.cuda.is_available(), flush=True)

feat_tr_dir = Path('features3d_v2')/'train'
lab_tr_dir  = Path('labels3d_v2')/'train'
probs_cache = Path('probs_cache'); probs_cache.mkdir(exist_ok=True)

# Load folds and training sequences
folds = json.load(open('folds_archive_cv.json', 'r'))
train_df = pd.read_csv('training.csv')
id2seq = {int(r.Id): [int(x) for x in str(r.Sequence).strip().split()] for _, r in train_df.iterrows()}

def load_feat(sample_id: int, split='train', max_T=1800):
    p = (feat_tr_dir)/f"{sample_id}.npz"
    d = np.load(p); X = d['X'].astype(np.float32)
    return X[:max_T] if X.shape[0] > max_T else X

def compute_class_median_durations_for_ids(id_list):
    # compute per-class median frame durations using ONLY the provided training ids (no leakage)
    dur_by_c = {c: [] for c in range(1,21)}
    for sid in id_list:
        y = np.load(lab_tr_dir/f"{sid}.npy").astype(np.int16)
        for c in range(1,21):
            cnt = int((y==c).sum())
            if cnt>0: dur_by_c[c].append(cnt)
    med = {}
    for c in range(1,21):
        m = np.median(dur_by_c[c]) if len(dur_by_c[c])>0 else 13
        med[c] = int(np.clip(m, 9, 25))
    return med

# Minimal model defs for loading checkpoints
D_in = np.load(next(iter((feat_tr_dir).glob('*.npz'))))['X'].shape[1]
class DilatedTCN(nn.Module):
    def __init__(self, d_in, channels=96, layers=10, num_classes=21, dropout=0.3):
        super().__init__()
        self.inp = nn.Conv1d(d_in, channels, kernel_size=1)
        blocks = []
        dil = 1
        for _ in range(layers):
            blocks.append(nn.Sequential(
                nn.Conv1d(channels, channels, kernel_size=3, padding=dil, dilation=dil),
                nn.GroupNorm(num_groups=8, num_channels=channels),
                nn.ReLU(inplace=True),
                nn.Dropout(dropout),
                nn.Conv1d(channels, channels, kernel_size=1),
                nn.GroupNorm(num_groups=8, num_channels=channels),
                nn.ReLU(inplace=True)
            ))
            dil = min(dil*2, 512)
        self.blocks = nn.ModuleList(blocks)
        self.head = nn.Conv1d(channels, num_classes, kernel_size=1)
    def forward(self, x_b_t_d):
        x = x_b_t_d.transpose(1,2)
        h = self.inp(x)
        for blk in self.blocks:
            res = h
            h = blk(h)
            h = h + res
        logits = self.head(h)
        return logits.transpose(1,2)

class DilatedResBlock(nn.Module):
    def __init__(self, ch, dilation, drop=0.3, groups=8, k=3):
        super().__init__()
        self.conv1 = nn.Conv1d(ch, ch, k, padding=dilation, dilation=dilation)
        self.gn1 = nn.GroupNorm(groups, ch)
        self.drop = nn.Dropout(drop)
        self.conv2 = nn.Conv1d(ch, ch, 1)
        self.gn2 = nn.GroupNorm(groups, ch)
    def forward(self, x):
        h = self.conv1(x)
        h = self.gn1(h)
        h = F.relu(h, inplace=True)
        h = self.drop(h)
        h = self.conv2(h)
        h = self.gn2(h)
        h = F.relu(h, inplace=True)
        return x + h

class Stage(nn.Module):
    def __init__(self, in_ch, ch=128, layers=10, drop=0.3):
        super().__init__()
        self.inp = nn.Conv1d(in_ch, ch, 1)
        blocks = []
        dil = 1
        for _ in range(layers):
            blocks.append(DilatedResBlock(ch, dil, drop=drop))
            dil = min(dil*2, 512)
        self.blocks = nn.ModuleList(blocks)
        self.head = nn.Conv1d(ch, 21, 1)
    def forward(self, x):
        h = self.inp(x)
        for b in self.blocks:
            h = b(h)
        return self.head(h)

class MSTCNPP(nn.Module):
    def __init__(self, d_in, stages=4, ch=128, layers=10, drop=0.3):
        super().__init__()
        self.input_proj = nn.Conv1d(d_in, d_in, 1)
        self.stages = nn.ModuleList()
        self.stages.append(Stage(d_in, ch=ch, layers=layers, drop=drop))
        for _ in range(stages-1):
            self.stages.append(Stage(21, ch=ch, layers=layers, drop=drop))
    def forward(self, x_b_t_d):
        x = x_b_t_d.transpose(1,2)
        x = self.input_proj(x)
        logits_list = []
        prev = self.stages[0](x)
        logits_list.append(prev.transpose(1,2))
        for s in range(1, len(self.stages)):
            probs = prev.softmax(dim=1)
            prev = self.stages[s](probs)
            logits_list.append(prev.transpose(1,2))
        return logits_list

def load_models():
    ce_paths=["model_ce_tcn_s0.pth","model_ce_tcn_s1.pth","model_ce_tcn_s2.pth"]
    ms_path="model_mstcnpp_s2.pth"
    for p in ce_paths+[ms_path]:
        assert Path(p).exists(), f"Missing {p}"
    ce_models=[]
    for p in ce_paths:
        m = DilatedTCN(d_in=D_in, channels=96, layers=10, num_classes=21, dropout=0.3).to(device)
        m.load_state_dict(torch.load(p, map_location=device))
        m.eval()
        ce_models.append(m)
    ms = MSTCNPP(d_in=D_in, stages=4, ch=128, layers=10, drop=0.3).to(device)
    ms.load_state_dict(torch.load(ms_path, map_location=device))
    ms.eval()
    return ce_models, ms

def time_warp_probs(p_t_c: torch.Tensor, factor: float) -> torch.Tensor:
    T, C = p_t_c.shape
    tgt_len = max(1, int(round(T*factor)))
    x = p_t_c.T.unsqueeze(0)
    y = F.interpolate(x, size=tgt_len, mode='linear', align_corners=False)
    y2 = F.interpolate(y, size=T, mode='linear', align_corners=False)[0].T
    y2 = y2 / (y2.sum(dim=-1, keepdim=True) + 1e-8)
    return y2

def apply_tta_timewarp(p_t_c: torch.Tensor, factors=(0.9,1.0,1.1)) -> torch.Tensor:
    acc=None
    for s in factors:
        ps = time_warp_probs(p_t_c, s)
        acc = ps if acc is None else (acc + ps)
    out = acc / float(len(factors))
    out = out / (out.sum(dim=-1, keepdim=True) + 1e-8)
    return out

def avg_pool_probs(p_t_c: torch.Tensor, k:int) -> torch.Tensor:
    x = p_t_c.unsqueeze(0).transpose(1,2)
    y = F.avg_pool1d(x, kernel_size=k, stride=1, padding=k//2)
    return y.transpose(1,2).squeeze(0)

def duration_integral_single(p_t: torch.Tensor, k:int) -> torch.Tensor:
    # ensure SAME-length output even for even kernels
    k = max(1, int(k))
    x = p_t.view(1,1,-1)
    w = torch.ones(1,1,k, device=p_t.device, dtype=p_t.dtype) / float(k)
    pad = (k-1)//2
    y = F.conv1d(x, w, padding=pad)
    y = y.view(-1)
    T = p_t.shape[0]
    if y.shape[0] < T:
        y = F.pad(y, (0, T - y.shape[0]))
    elif y.shape[0] > T:
        y = y[:T]
    return y

def refine_com(p: torch.Tensor, t_star:int, w:int=5) -> float:
    T = p.shape[0]
    a = max(0, t_star - w)
    b = min(T-1, t_star + w)
    idx = torch.arange(a, b+1, device=p.device, dtype=p.dtype)
    seg = p[a:b+1]
    s = seg.sum() + 1e-8
    return float(((idx * seg).sum() / s).item())

def decode_peaks(p_t_c: torch.Tensor, med_k: dict, gamma: float = 1.0, pool_k=13, temp=0.9):
    # refined peak-time decoder with per-fold duration priors (no leakage) and gamma scaling
    if temp != 1.0:
        p_t_c = (p_t_c ** (1.0/temp))
        p_t_c = p_t_c / (p_t_c.sum(dim=-1, keepdim=True) + 1e-8)
    p_s = avg_pool_probs(p_t_c, k=pool_k)
    T, C = p_s.shape
    scores = torch.empty_like(p_s)
    ks = [13]*C
    for c in range(C):
        if c==0:
            scores[:,c] = p_s[:,c]
            ks[c] = 13
            continue
        base_k = med_k.get(c, 13)
        k = int(np.clip(round(gamma * base_k), 9, 25))
        if k % 2 == 0:
            k = min(25, k + 1)  # force odd to keep same length
        ks[c] = k
        scores[:,c] = duration_integral_single(p_s[:,c], k=k)
    peaks = []
    for c in range(1,21):
        k = ks[c]
        w_com = max(5, k//3)
        radius = max(10, k//2)
        s = scores[:,c]
        t_star = int(torch.argmax(s).item())
        t_ref = refine_com(p_s[:,c], t_star, w=w_com)
        t_idx = int(round(t_ref))
        t_idx = min(max(t_idx,0), T-1)
        local_mean = p_s[max(0,t_idx-radius):min(T,t_idx+radius+1), c].mean().item()
        pooled_at_ref = p_s[t_idx, c].item()
        peaks.append([c, t_ref, float(scores[t_idx,c].item()), float(local_mean), float(pooled_at_ref)])
    # sort by time then by score then local mean then pooled prob
    peaks.sort(key=lambda x: (x[1], -x[2], -x[3], -x[4]))
    # enforce minimum separation >=2 frames and strictly increasing timestamps
    last_t = -1e9
    for i in range(len(peaks)):
        if peaks[i][1] <= last_t:
            peaks[i][1] = last_t + 2.0
        last_t = min(peaks[i][1], float(T-1))
    return [int(c) for c,_,_,_,_ in peaks]

def levenshtein(a,b):
    n,m=len(a),len(b)
    if n==0: return m
    if m==0: return n
    dp=list(range(m+1))
    for i in range(1,n+1):
        prev=dp[0]; dp[0]=i; ai=a[i-1]
        for j in range(1,m+1):
            tmp=dp[j]; dp[j]=min(dp[j]+1, dp[j-1]+1, prev + (0 if ai==b[j-1] else 1)); prev=tmp
    return dp[m]

def cache_probs_for_id(sid:int, ce_models, ms_model):
    # cache CE-avg and MS probs (without TTA), to speed grid sweeps
    ce_out=probs_cache/f"{sid}_ce.npy"; ms_out=probs_cache/f"{sid}_ms.npy"
    if ce_out.exists() and ms_out.exists():
        return
    X=load_feat(sid,'train',1800); xb=torch.from_numpy(X).unsqueeze(0).to(device)
    with torch.no_grad():
        # CE avg
        ce_sum=None
        for m in ce_models:
            p=m(xb)[0].softmax(dim=-1)
            ce_sum = p if ce_sum is None else (ce_sum + p)
        ce = (ce_sum/len(ce_models)).cpu().numpy()
        # MS last stage
        p_ms = ms_model(xb)[-1][0].softmax(dim=-1).cpu().numpy()
    np.save(ce_out, ce); np.save(ms_out, p_ms)

def load_cached_probs(sid:int):
    ce=np.load(probs_cache/f"{sid}_ce.npy"); ms=np.load(probs_cache/f"{sid}_ms.npy")
    return torch.from_numpy(ce).to(device), torch.from_numpy(ms).to(device)

# Global TTA factors (wider) used consistently in CV and test-time
TTA_FACTORS = (0.85, 0.9, 1.0, 1.1, 1.15)

def blend_probs(ce_torch: torch.Tensor, ms_torch: torch.Tensor, w_ce=0.9, temp_ms: float = 1.0):
    # geometric mean in prob-space via log domain, with optional MS temperature
    if temp_ms!=1.0:
        ms_torch = (ms_torch ** (1.0/temp_ms)); ms_torch = ms_torch/(ms_torch.sum(dim=-1,keepdim=True)+1e-8)
    log_ce = torch.log(ce_torch+1e-8); log_ms=torch.log(ms_torch+1e-8)
    comb = torch.exp(w_ce*log_ce + (1.0-w_ce)*log_ms)
    return comb/(comb.sum(dim=-1,keepdim=True)+1e-8)

print('Loading models...', flush=True)
ce_models, ms_model = load_models()

# Pre-cache val probs for all folds
t0=time.time()
for f in folds:
    vids = f['val_ids']
    print(f"[Cache] fold={f['fold']} val_ids={len(vids)}", flush=True)
    for i, sid in enumerate(vids, 1):
        cache_probs_for_id(int(sid), ce_models, ms_model)
        if (i%25)==0 or i==len(vids):
            print(f"  cached {i}/{len(vids)} (elapsed {time.time()-t0:.1f}s)", flush=True)

# Small grid over decoder and blend settings WITH per-fold priors and gamma scale
pool_ks=[11,13,15]; temps=[0.90,0.95,1.00]; w_ces=[0.95]; gammas=[0.90,0.95,0.975,1.00,1.025,1.05,1.10]

# cache med_k per fold to avoid recomputation
med_cache = {}  # fold_idx -> med_k dict

def eval_setting_on_fold(fold, use_ms: bool, pool_k:int, temp:float, gamma: float, w_ce:float=0.95, temp_ms:float=1.0):
    fold_idx = fold['fold']
    if fold_idx not in med_cache:
        med_cache[fold_idx] = compute_class_median_durations_for_ids(fold['train_ids'])
    med_k = med_cache[fold_idx]
    vids = fold['val_ids']; tot=0; cnt=0
    for sid in vids:
        ce, ms = load_cached_probs(int(sid))
        probs = ce if not use_ms else blend_probs(ce, ms, w_ce=w_ce, temp_ms=temp_ms)
        probs = apply_tta_timewarp(probs, factors=TTA_FACTORS)
        seq = decode_peaks(probs, med_k=med_k, gamma=gamma, pool_k=pool_k, temp=temp)
        tot += levenshtein(seq, id2seq[int(sid)]); cnt += 1
    return tot/max(cnt,1)

def sweep(use_ms: bool):
    results=[]
    for pool_k in pool_ks:
        for temp in temps:
            for gamma in gammas:
                if use_ms:
                    for w_ce in w_ces:
                        per_fold=[]
                        for f in folds:
                            lev = eval_setting_on_fold(f, True, pool_k, temp, gamma=gamma, w_ce=w_ce, temp_ms=0.95)
                            per_fold.append(lev)
                        results.append((np.mean(per_fold), np.max(per_fold), {'pool_k':pool_k,'temp':temp,'gamma':gamma,'w_ce':w_ce,'use_ms':True}))
                else:
                    per_fold=[]
                    for f in folds:
                        lev = eval_setting_on_fold(f, False, pool_k, temp, gamma=gamma)
                        per_fold.append(lev)
                    results.append((np.mean(per_fold), np.max(per_fold), {'pool_k':pool_k,'temp':temp,'gamma':gamma,'use_ms':False}))
    results.sort(key=lambda x: (x[1], x[0]))  # prioritize worst-fold, then mean
    return results

print('Sweeping CE-only...', flush=True)
res_ce = sweep(False)
print('Top CE-only (mean, worst, cfg):')
for r in res_ce[:5]:
    print(r)

print('Sweeping CE+MS (geom, CE-heavy)...', flush=True)
res_ms = sweep(True)
print('Top CE+MS (mean, worst, cfg):')
for r in res_ms[:5]:
    print(r)

# Save sweep results
pd.DataFrame([{'mean':m,'worst':w, **cfg} for m,w,cfg in res_ce]).to_csv('cv_sweep_ce.csv', index=False)
pd.DataFrame([{'mean':m,'worst':w, **cfg} for m,w,cfg in res_ms]).to_csv('cv_sweep_ce_ms.csv', index=False)
print('Saved cv_sweep_ce.csv and cv_sweep_ce_ms.csv', flush=True)

In [None]:
# Build submissions: primary (CE+MS geom, CE-heavy) and backup (CE-only) using grouped-CV tuned settings
import pandas as pd, numpy as np, time, torch, torch.nn.functional as F
from pathlib import Path

feat_te_dir = Path('features3d_v2')/'test'
test_ids = pd.read_csv('test.csv')['Id'].astype(int).tolist()

def load_feat_test(sample_id: int, max_T=1800):
    p = (feat_te_dir)/f"{sample_id}.npz"
    d = np.load(p); X = d['X'].astype(np.float32)
    return X[:max_T] if X.shape[0] > max_T else X

# Read best configs (sorted by worst then mean earlier)
cfg_ce = pd.read_csv('cv_sweep_ce.csv').sort_values(['worst','mean']).iloc[0].to_dict()
cfg_ms = pd.read_csv('cv_sweep_ce_ms.csv').sort_values(['worst','mean']).iloc[0].to_dict()
print('Chosen CE-only cfg:', cfg_ce)
print('Chosen CE+MS cfg:', cfg_ms)

# Compute test-time priors from ALL training ids (non-leaky), and extract gammas
train_ids_all = pd.read_csv('training.csv')['Id'].astype(int).tolist()
med_k_test = compute_class_median_durations_for_ids(train_ids_all)
gamma_ce = float(cfg_ce.get('gamma', 1.0))
gamma_ms = float(cfg_ms.get('gamma', 1.0))

# Ensure models available from previous cell; otherwise load
try:
    ce_models, ms_model
except NameError:
    ce_models, ms_model = load_models()

def ensemble_ce_probs_from_models(xb, ce_models):
    with torch.no_grad():
        acc=None
        for m in ce_models:
            p = m(xb)[0].softmax(dim=-1)
            acc = p if acc is None else (acc + p)
        probs = acc/len(ce_models)
        return probs/(probs.sum(dim=-1,keepdim=True)+1e-8)

def blend_probs_geom(ce_prob: torch.Tensor, ms_prob: torch.Tensor, w_ce=0.9, temp_ms=0.95):
    if temp_ms!=1.0:
        ms_prob = (ms_prob ** (1.0/temp_ms)); ms_prob = ms_prob/(ms_prob.sum(dim=-1,keepdim=True)+1e-8)
    log_ce = torch.log(ce_prob+1e-8); log_ms=torch.log(ms_prob+1e-8)
    comb = torch.exp(w_ce*log_ce + (1.0-w_ce)*log_ms)
    return comb/(comb.sum(dim=-1,keepdim=True)+1e-8)

# Consistent wider TTA factors with CV
TTA_FACTORS = (0.85, 0.9, 1.0, 1.1, 1.15)

# Backup submission: CE-only
rows_ce=[]; t0=time.time()
for i, sid in enumerate(test_ids, 1):
    X = load_feat_test(int(sid), 1800)
    xb = torch.from_numpy(X).unsqueeze(0).to(device)
    ce_prob = ensemble_ce_probs_from_models(xb, ce_models)
    probs = apply_tta_timewarp(ce_prob, factors=TTA_FACTORS)
    seq = decode_peaks(probs, med_k=med_k_test, gamma=gamma_ce, pool_k=int(cfg_ce['pool_k']), temp=float(cfg_ce['temp']))
    rows_ce.append({'Id': int(sid), 'Sequence': ' '.join(str(x) for x in seq)})
    if (i%10)==0 or i==len(test_ids):
        print(f"  [infer CE-only] {i}/{len(test_ids)} elapsed={(time.time()-t0)/60:.1f}m", flush=True)
sub_ce = pd.DataFrame(rows_ce, columns=['Id','Sequence'])
assert len(sub_ce)==95
assert all(len(s.split())==20 and len(set(s.split()))==20 and all(1<=int(t)<=20 for t in s.split()) for s in sub_ce.Sequence), 'CE-only submission format invalid'
sub_ce.to_csv('submission_backup_ce_only.csv', index=False)
print('Wrote submission_backup_ce_only.csv; head:\n', sub_ce.head())

# Primary submission: CE+MS geometric mean, CE-heavy
rows_ms=[]; t0=time.time()
for i, sid in enumerate(test_ids, 1):
    X = load_feat_test(int(sid), 1800)
    xb = torch.from_numpy(X).unsqueeze(0).to(device)
    with torch.no_grad():
        ce_prob = ensemble_ce_probs_from_models(xb, ce_models)
        ms_prob = ms_model(xb)[-1][0].softmax(dim=-1)
    probs = blend_probs_geom(ce_prob, ms_prob, w_ce=float(cfg_ms.get('w_ce', 0.95)), temp_ms=0.95)
    probs = apply_tta_timewarp(probs, factors=TTA_FACTORS)
    seq = decode_peaks(probs, med_k=med_k_test, gamma=gamma_ms, pool_k=int(cfg_ms['pool_k']), temp=float(cfg_ms['temp']))
    rows_ms.append({'Id': int(sid), 'Sequence': ' '.join(str(x) for x in seq)})
    if (i%10)==0 or i==len(test_ids):
        print(f"  [infer CE+MS] {i}/{len(test_ids)} elapsed={(time.time()-t0)/60:.1f}m", flush=True)
sub_ms = pd.DataFrame(rows_ms, columns=['Id','Sequence'])
assert len(sub_ms)==95
assert all(len(s.split())==20 and len(set(s.split()))==20 and all(1<=int(t)<=20 for t in s.split()) for s in sub_ms.Sequence), 'CE+MS submission format invalid'
sub_ms.to_csv('submission_primary_ce_ms.csv', index=False)
sub_ms.to_csv('submission.csv', index=False)
print('Wrote submission_primary_ce_ms.csv and submission.csv; head:\n', sub_ms.head())
print('Done building submissions.')

In [None]:
# Fallback: use CE-only backup submission as final submission.csv
import pandas as pd, shutil, os
src = 'submission_backup_ce_only.csv'
dst = 'submission.csv'
assert os.path.exists(src), 'Missing CE-only backup submission file'
shutil.copyfile(src, dst)
df = pd.read_csv(dst).head()
print('submission.csv head (CE-only backup):\n', df)

In [None]:
# Inspect features3d_v2 to decide if we can append cheap scalars (scale_ema diffs, hand curvature)
import numpy as np, json, random
from pathlib import Path

feat_tr_dir = Path('features3d_v2')/'train'
paths = sorted(feat_tr_dir.glob('*.npz'))
assert len(paths)==297, f'Expected 297 train npz, got {len(paths)}'
p = paths[0]
d = np.load(p)
print('Keys:', list(d.keys()))
X = d['X']
print('Shape X:', X.shape, 'dtype:', X.dtype)
if 'meta' in d.files:
    try:
        meta = json.loads(d['meta'].tobytes().decode('utf-8')) if hasattr(d['meta'], 'tobytes') else d['meta'].item()
        print('Meta example:', str(meta)[:200])
    except Exception as e:
        print('Meta parse failed:', e)

# Peek several samples to estimate typical T and D
Ds = []; Ts = []
for q in random.sample(paths, k=min(5, len(paths))):
    Xq = np.load(q)['X']
    Ts.append(Xq.shape[0]); Ds.append(Xq.shape[1])
print('Sampled T range:', (min(Ts), max(Ts)), 'D unique:', sorted(set(Ds)))

In [None]:
# Robust primary submission: CE+MS with per-sample fallback to CE-only to avoid hangs (now with priors + gamma)
import pandas as pd, numpy as np, time, torch, os
from pathlib import Path

feat_te_dir = Path('features3d_v2')/'test'
test_ids = pd.read_csv('test.csv')['Id'].astype(int).tolist()

def load_feat_test(sample_id: int, max_T=1800):
    d = np.load((feat_te_dir/f"{sample_id}.npz")); X = d['X'].astype(np.float32)
    return X[:max_T] if X.shape[0] > max_T else X

# Load best configs
cfg_ce = pd.read_csv('cv_sweep_ce.csv').sort_values(['worst','mean']).iloc[0].to_dict()
cfg_ms = pd.read_csv('cv_sweep_ce_ms.csv').sort_values(['worst','mean']).iloc[0].to_dict()
w_ce = float(cfg_ms.get('w_ce', 0.95)); pool_k_ms = int(cfg_ms['pool_k']); temp_msdec = float(cfg_ms['temp'])
pool_k_ce = int(cfg_ce['pool_k']); temp_cedec = float(cfg_ce['temp'])
gamma_ce = float(cfg_ce.get('gamma', 1.0)); gamma_ms = float(cfg_ms.get('gamma', 1.0))
print('Using cfg CE:', cfg_ce, '\nUsing cfg CE+MS:', cfg_ms, flush=True)

# Compute non-leaky test-time priors from ALL training ids once
train_ids_all = pd.read_csv('training.csv')['Id'].astype(int).tolist()
med_k_test = compute_class_median_durations_for_ids(train_ids_all)

try:
    ce_models, ms_model
except NameError:
    ce_models, ms_model = load_models()

def ensemble_ce_probs_from_models(xb, ce_models):
    with torch.no_grad():
        acc=None
        for m in ce_models:
            p = m(xb)[0].softmax(dim=-1)
            acc = p if acc is None else (acc + p)
        probs = acc/len(ce_models)
        return probs/(probs.sum(dim=-1,keepdim=True)+1e-8)

def blend_probs_geom(ce_prob: torch.Tensor, ms_prob: torch.Tensor, w_ce=0.95, temp_ms=0.95):
    if temp_ms!=1.0:
        ms_prob = (ms_prob ** (1.0/temp_ms)); ms_prob = ms_prob/(ms_prob.sum(dim=-1,keepdim=True)+1e-8)
    log_ce = torch.log(ce_prob+1e-8); log_ms=torch.log(ms_prob+1e-8)
    comb = torch.exp(w_ce*log_ce + (1.0-w_ce)*log_ms)
    return comb/(comb.sum(dim=-1,keepdim=True)+1e-8)

rows=[]; t0=time.time(); failures=0
for i, sid in enumerate(test_ids, 1):
    X = load_feat_test(int(sid), 1800)
    xb = torch.from_numpy(X).unsqueeze(0).to(device)
    with torch.no_grad():
        ce_prob = ensemble_ce_probs_from_models(xb, ce_models)
        use_ms = True
        try:
            ms_prob = ms_model(xb)[-1][0].softmax(dim=-1)
        except Exception as e:
            use_ms = False; failures += 1
    if use_ms:
        probs = blend_probs_geom(ce_prob, ms_prob, w_ce=w_ce, temp_ms=0.95)
        probs = apply_tta_timewarp(probs, factors=(0.9,1.0,1.1))
        seq = decode_peaks(probs, med_k=med_k_test, gamma=gamma_ms, pool_k=pool_k_ms, temp=temp_msdec)
    else:
        probs = apply_tta_timewarp(ce_prob, factors=(0.9,1.0,1.1))
        seq = decode_peaks(probs, med_k=med_k_test, gamma=gamma_ce, pool_k=pool_k_ce, temp=temp_cedec)
    rows.append({'Id': int(sid), 'Sequence': ' '.join(str(x) for x in seq)})
    if (i%10)==0 or i==len(test_ids):
        print(f"  [robust CE+MS] {i}/{len(test_ids)} elapsed={(time.time()-t0)/60:.1f}m fails={failures}", flush=True)

sub = pd.DataFrame(rows, columns=['Id','Sequence'])
assert len(sub)==95
assert all(len(s.split())==20 and len(set(s.split()))==20 and all(1<=int(t)<=20 for t in s.split()) for s in sub.Sequence), 'Submission format invalid'
sub.to_csv('submission_primary_ce_ms_fallback.csv', index=False)
sub.to_csv('submission.csv', index=False)
print('Wrote submission_primary_ce_ms_fallback.csv and submission.csv; head:\n', sub.head(), flush=True)

In [None]:
# Train CE-only DilatedTCN per fold under grouped CV (expert spec)
import os, json, math, time, random, gc
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('CUDA available:', torch.cuda.is_available())
assert torch.cuda.is_available(), 'GPU required for timely training'
torch.backends.cudnn.benchmark = True
try:
    torch.set_float32_matmul_precision('high')
except Exception:
    pass

feat_tr_dir = Path('features3d_v2')/'train'
lab_tr_dir  = Path('labels3d_v2')/'train'
folds = json.load(open('folds_archive_cv.json','r'))

# Model per expert spec
class DilatedResBlock(nn.Module):
    def __init__(self, ch, dilation, drop=0.35, groups=8, k=3):
        super().__init__()
        self.conv1 = nn.Conv1d(ch, ch, k, padding=dilation, dilation=dilation)
        self.gn1 = nn.GroupNorm(groups, ch)
        self.drop = nn.Dropout(drop)
        self.conv2 = nn.Conv1d(ch, ch, 1)
        self.gn2 = nn.GroupNorm(groups, ch)
    def forward(self, x):
        h = self.conv1(x); h = self.gn1(h); h = F.relu(h, inplace=True); h = self.drop(h)
        h = self.conv2(h); h = self.gn2(h); h = F.relu(h, inplace=True)
        return x + h

class DilatedTCN(nn.Module):
    def __init__(self, d_in, channels=128, layers=12, num_classes=21, dropout=0.35):
        super().__init__()
        self.inp = nn.Conv1d(d_in, channels, 1)
        blocks=[]; dil=1
        for _ in range(layers):
            blocks.append(DilatedResBlock(channels, dil, drop=dropout, groups=8, k=3))
            dil = min(dil*2, 512)
        self.blocks = nn.ModuleList(blocks)
        self.head = nn.Conv1d(channels, num_classes, 1)
    def forward(self, x_b_t_d):
        x = x_b_t_d.transpose(1,2)
        h = self.inp(x)
        for b in self.blocks:
            h = b(h)
        out = self.head(h)
        return out.transpose(1,2)  # B,T,C

# EMA helper
class EMA:
    def __init__(self, model, decay=0.999):
        self.decay = decay
        self.shadow = {}
        for n, p in model.named_parameters():
            if p.requires_grad:
                self.shadow[n] = p.detach().clone()
    @torch.no_grad()
    def update(self, model):
        for n, p in model.named_parameters():
            if p.requires_grad:
                self.shadow[n].mul_(self.decay).add_(p.detach(), alpha=1.0 - self.decay)
    def apply_to(self, model):
        self.backup = {}
        for n, p in model.named_parameters():
            if p.requires_grad:
                self.backup[n] = p.detach().clone()
                p.data.copy_(self.shadow[n].data)
    def restore(self, model):
        for n, p in model.named_parameters():
            if p.requires_grad:
                p.data.copy_(self.backup[n].data)

# Data utils
def load_feat_full(sample_id: int):
    d = np.load((feat_tr_dir/f"{sample_id}.npz"))
    X = d['X'].astype(np.float32)  # full length, no truncation
    return X
def load_labels(sample_id: int):
    y = np.load(lab_tr_dir/f"{sample_id}.npy").astype(np.int64)
    return y

def compute_fold_scaler(id_list):
    # Running mean/var across frames for numerical stability
    n = 0
    mean = None
    M2 = None
    for sid in id_list:
        X = load_feat_full(int(sid))
        n_i = X.shape[0]
        if mean is None:
            mean = X.mean(axis=0)
            M2 = ((X - mean)**2).sum(axis=0)
            n = n_i
        else:
            # combine two sets
            mean_i = X.mean(axis=0)
            n_new = n + n_i
            delta = mean_i - mean
            mean = mean + delta * (n_i / max(1, n_new))
            M2 = M2 + ((X - mean_i)**2).sum(axis=0) + (delta**2) * (n * n_i / max(1, n_new))
            n = n_new
    var = M2 / max(1, (n - 1))
    std = np.sqrt(np.clip(var, 1e-8, None))
    return mean.astype(np.float32), std.astype(np.float32)

def compute_class_weights(train_ids):
    # per-frame frequency over 21 classes (0..20); cap class 0 weight
    counts = np.zeros(21, dtype=np.int64)
    for sid in train_ids:
        y = load_labels(int(sid))
        vals, cnts = np.unique(y, return_counts=True)
        for v, c in zip(vals, cnts):
            if 0 <= v <= 20:
                counts[v] += int(c)
    freq = counts / max(1, counts.sum())
    w = 1.0 / np.sqrt(np.clip(freq, 1e-12, None))
    w = w / w.mean()
    w0_cap = 0.7 * w.mean()
    w[0] = min(w[0], w0_cap)
    return torch.tensor(w, dtype=torch.float32, device=device)

class SeqDataset(Dataset):
    def __init__(self, ids, mean, std, train=True, crop_min=1600, crop_max=1800, time_masks=(3,5), mask_len=(5,15), noise_std=0.01, seed=42):
        self.ids = list(ids)
        self.mean = torch.from_numpy(mean).float()
        self.std = torch.from_numpy(std).float()
        self.train = train
        self.crop_min = crop_min
        self.crop_max = crop_max
        self.tmask_lo, self.tmask_hi = time_masks
        self.mlen_lo, self.mlen_hi = mask_len
        self.noise_std = noise_std
        self.rng = random.Random(seed)
    def __len__(self):
        return len(self.ids)
    def _rand_crop(self, X, y):
        T = X.shape[0]
        if not self.train:
            return X, y
        tgt = self.rng.randint(self.crop_min, self.crop_max)
        if T <= tgt:
            return X, y
        start = self.rng.randint(0, T - tgt)
        end = start + tgt
        return X[start:end], y[start:end]
    def _time_mask(self, X):
        if not self.train:
            return X
        T = X.shape[0]
        m = self.rng.randint(self.tmask_lo, self.tmask_hi)
        for _ in range(m):
            L = self.rng.randint(self.mlen_lo, self.mlen_hi)
            if T <= L: continue
            s = self.rng.randint(0, T - L)
            e = s + L
            seg_mean = X[max(0, s-5):min(T, e+5)].mean(axis=0, keepdims=True)
            X[s:e] = seg_mean
        return X
    def __getitem__(self, idx):
        sid = int(self.ids[idx])
        X = load_feat_full(sid)
        y = load_labels(sid)
        X, y = self._rand_crop(X, y)
        # standardize
        X = (torch.from_numpy(X).float() - self.mean) / (self.std + 1e-6)
        if self.train:
            if self.noise_std > 0:
                X = X + torch.randn_like(X) * self.noise_std
            # time mask in numpy for speed then back
            X_np = X.numpy()
            X_np = self._time_mask(X_np)
            X = torch.from_numpy(X_np).float()
        y = torch.from_numpy(y).long()
        return X, y

def collate_pad(batch):
    # pad to max T in batch for efficient training; return CPU tensors
    xs, ys = zip(*batch)
    T_max = max(x.shape[0] for x in xs)
    D = xs[0].shape[1]
    xb = torch.zeros((len(xs), T_max, D), dtype=torch.float32)
    yb = torch.full((len(xs), T_max), -100, dtype=torch.long)
    for i, (x, y) in enumerate(zip(xs, ys)):
        T = x.shape[0]
        xb[i, :T] = x
        yb[i, :T] = y
    return xb, yb  # keep on CPU; DataLoader pin_memory will handle page-locking

def cosine_with_warmup(step, total_steps, warmup_steps, base_lr, min_lr):
    if step < warmup_steps:
        return base_lr * (step / max(1, warmup_steps))
    t = (step - warmup_steps) / max(1, (total_steps - warmup_steps))
    return min_lr + 0.5 * (base_lr - min_lr) * (1 + math.cos(math.pi * t))

def train_fold(fold_idx, train_ids, val_ids, epochs=50, batch_size=8, accum_steps=1, base_lr=3e-3, min_lr=3e-5, wd=0.01, label_smooth=0.05):
    print(f"=== Train fold {fold_idx}: train_n={len(train_ids)} val_n={len(val_ids)} ===", flush=True)
    # compute scaler and class weights on train only
    mean, std = compute_fold_scaler(train_ids)
    class_w = compute_class_weights(train_ids)
    D_in = np.load(next(iter((feat_tr_dir).glob('*.npz'))))['X'].shape[1]
    model = DilatedTCN(d_in=D_in, channels=128, layers=12, num_classes=21, dropout=0.35).to(device)
    ema = EMA(model, decay=0.999)
    scaler = torch.amp.GradScaler('cuda', enabled=True)
    opt = torch.optim.AdamW(model.parameters(), lr=base_lr, weight_decay=wd, betas=(0.9, 0.999))
    # datasets
    tr_ds = SeqDataset(train_ids, mean, std, train=True, crop_min=1600, crop_max=4096, time_masks=(3,5), mask_len=(5,15), noise_std=0.01, seed=fold_idx+123)
    va_ds = SeqDataset(val_ids, mean, std, train=False, seed=fold_idx+777)
    # num_workers=0; keep pin_memory True to speed H2D copies; tensors stay on CPU in collate_pad
    tr_ld = DataLoader(tr_ds, batch_size=batch_size, shuffle=True, drop_last=True, num_workers=0, collate_fn=collate_pad, pin_memory=True)
    va_ld = DataLoader(va_ds, batch_size=1, shuffle=False, drop_last=False, num_workers=0, collate_fn=collate_pad, pin_memory=True)
    # schedule
    steps_per_epoch = max(1, len(tr_ld))
    total_steps = steps_per_epoch * epochs
    warmup_steps = 5 * steps_per_epoch  # 5 epochs warmup
    crit = nn.CrossEntropyLoss(weight=class_w, label_smoothing=label_smooth, ignore_index=-100)
    best_val = float('inf'); best_path = f"model_ce_fold{fold_idx}.pth"; patience=5; bad=0
    t0=time.time()
    for ep in range(1, epochs+1):
        model.train()
        tr_loss = 0.0; seen = 0; t_ep=time.time()
        opt.zero_grad(set_to_none=True)
        for step, (xb, yb) in enumerate(tr_ld):
            # move batch to GPU
            xb = xb.to(device, non_blocking=True)
            yb = yb.to(device, non_blocking=True)
            bs, T, D = xb.shape; C = 21
            lr = cosine_with_warmup((ep-1)*steps_per_epoch + step, total_steps, warmup_steps, base_lr, min_lr)
            for pg in opt.param_groups: pg['lr'] = lr
            with torch.amp.autocast('cuda'):
                logits = model(xb)  # B,T,C
                loss = crit(logits.reshape(-1, C), yb.reshape(-1))
            # backward + step with AMP
            scaler.scale(loss / accum_steps).backward()
            if ((step + 1) % accum_steps) == 0:
                scaler.unscale_(opt)
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                scaler.step(opt)
                scaler.update()
                opt.zero_grad(set_to_none=True)
                ema.update(model)
            tr_loss += loss.item() * bs
            seen += bs
            if (step+1) % 50 == 0 or (step+1)==steps_per_epoch:
                print(f"  ep{ep} step {step+1}/{steps_per_epoch} lr={lr:.2e} loss={tr_loss/max(1,seen):.4f} elapsed={(time.time()-t_ep):.1f}s", flush=True)
        # validate with EMA weights
        model.eval(); ema.apply_to(model)
        val_loss = 0.0; vseen=0
        with torch.no_grad(), torch.amp.autocast('cuda'):
            for xb, yb in va_ld:
                xb = xb.to(device, non_blocking=True)
                yb = yb.to(device, non_blocking=True)
                bs, T, D = xb.shape; C = 21
                logits = model(xb)
                loss = crit(logits.reshape(-1, C), yb.reshape(-1))
                val_loss += loss.item()
                vseen += 1
        ema.restore(model)
        val_loss = val_loss / max(1, vseen)
        print(f"[Fold {fold_idx}] Epoch {ep} train_loss={tr_loss/max(1,seen):.4f} val_loss={val_loss:.4f} epoch_time={(time.time()-t_ep):.1f}s total={(time.time()-t0)/60:.1f}m", flush=True)
        # early stopping on val CE; save EMA weights at best
        if val_loss < best_val - 1e-4:
            best_val = val_loss; bad = 0
            ema.apply_to(model); torch.save(model.state_dict(), best_path); ema.restore(model)
            print(f"  Saved best to {best_path}", flush=True)
        else:
            bad += 1
            if bad >= patience:
                print(f"  Early stop at epoch {ep}", flush=True)
                break
        # free cache
        torch.cuda.empty_cache(); gc.collect()
    print(f"Fold {fold_idx} done. Best val CE={best_val:.4f}. Model -> {best_path}")

# Kick off training sequentially across folds (force retrain, overwrite existing checkpoints)
for f in folds:
    fold_idx = int(f['fold'])
    outp = Path(f"model_ce_fold{fold_idx}.pth")
    if outp.exists():
        print(f"[Overwrite] Removing existing {outp} to retrain with fixed loss/EMA...")
        try:
            outp.unlink()
        except Exception as e:
            print(f"  Warning: could not delete {outp}: {e}")
    train_ids = f['train_ids']
    val_ids = f['val_ids']
    train_fold(fold_idx, train_ids, val_ids, epochs=50, batch_size=8, accum_steps=1, base_lr=3e-3, min_lr=3e-5, wd=0.01, label_smooth=0.05)
    # After each fold, flush CUDA
    torch.cuda.empty_cache(); gc.collect()
print('All folds processed.')

In [None]:
# OOF eval and test inference using newly trained per-fold CE models (DilatedTCN 128x12, EMA checkpoints)
import os, json, time, math
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('CUDA:', torch.cuda.is_available(), flush=True)

feat_tr_dir = Path('features3d_v2')/'train'
feat_te_dir = Path('features3d_v2')/'test'
lab_tr_dir  = Path('labels3d_v2')/'train'
probs_cache = Path('probs_cache'); probs_cache.mkdir(exist_ok=True)

folds = json.load(open('folds_archive_cv.json','r'))
train_df = pd.read_csv('training.csv')
id2seq = {int(r.Id): [int(x) for x in str(r.Sequence).strip().split()] for _, r in train_df.iterrows()}

def load_feat(split, sid:int):
    d = np.load((feat_tr_dir if split=='train' else feat_te_dir)/f"{sid}.npz");
    return d['X'].astype(np.float32)

def compute_fold_scaler(id_list):
    n = 0; mean=None; M2=None
    for sid in id_list:
        X = load_feat('train', int(sid))
        n_i = X.shape[0]
        if mean is None:
            mean = X.mean(axis=0);
            M2 = ((X - mean)**2).sum(axis=0);
            n = n_i
        else:
            mean_i = X.mean(axis=0);
            n_new = n + n_i; delta = mean_i - mean;
            mean = mean + delta * (n_i / max(1, n_new));
            M2 = M2 + ((X - mean_i)**2).sum(axis=0) + (delta**2) * (n * n_i / max(1, n_new));
            n = n_new
    var = M2 / max(1, (n - 1)); std = np.sqrt(np.clip(var, 1e-8, None))
    return mean.astype(np.float32), std.astype(np.float32)

def compute_class_median_durations_for_ids(id_list):
    dur_by_c = {c: [] for c in range(1,21)}
    for sid in id_list:
        y = np.load(lab_tr_dir/f"{sid}.npy").astype(np.int16)
        for c in range(1,21):
            cnt = int((y==c).sum());
            if cnt>0: dur_by_c[c].append(cnt)
    med = {}
    for c in range(1,21):
        m = np.median(dur_by_c[c]) if len(dur_by_c[c])>0 else 13
        med[c] = int(np.clip(m, 9, 25))
    return med

# Model def matching training
class DilatedResBlock(nn.Module):
    def __init__(self, ch, dilation, drop=0.35, groups=8, k=3):
        super().__init__();
        self.conv1 = nn.Conv1d(ch, ch, k, padding=dilation, dilation=dilation)
        self.gn1 = nn.GroupNorm(groups, ch)
        self.drop = nn.Dropout(drop)
        self.conv2 = nn.Conv1d(ch, ch, 1)
        self.gn2 = nn.GroupNorm(groups, ch)
    def forward(self, x):
        h = self.conv1(x); h = self.gn1(h); h = F.relu(h, inplace=True); h = self.drop(h);
        h = self.conv2(h); h = self.gn2(h); h = F.relu(h, inplace=True);
        return x + h

class DilatedTCN(nn.Module):
    def __init__(self, d_in, channels=128, layers=12, num_classes=21, dropout=0.35):
        super().__init__();
        self.inp = nn.Conv1d(d_in, channels, 1)
        blocks=[]; dil=1
        for _ in range(layers):
            blocks.append(DilatedResBlock(channels, dil, drop=dropout, groups=8, k=3));
            dil = min(dil*2, 512)
        self.blocks = nn.ModuleList(blocks)
        self.head = nn.Conv1d(channels, num_classes, 1)
    def forward(self, x_b_t_d):
        x = x_b_t_d.transpose(1,2);
        h = self.inp(x);
        for b in self.blocks: h = b(h);
        out = self.head(h);
        return out.transpose(1,2)

def time_warp_probs(p_t_c: torch.Tensor, factor: float) -> torch.Tensor:
    T, C = p_t_c.shape; tgt_len = max(1, int(round(T*factor)));
    x = p_t_c.T.unsqueeze(0);
    y = F.interpolate(x, size=tgt_len, mode='linear', align_corners=False);
    y2 = F.interpolate(y, size=T, mode='linear', align_corners=False)[0].T;
    return y2 / (y2.sum(dim=-1, keepdim=True) + 1e-8)

def apply_tta_timewarp(p_t_c: torch.Tensor, factors=(0.9,1.0,1.1)) -> torch.Tensor:
    acc=None
    for s in factors:
        ps = time_warp_probs(p_t_c, s);
        acc = ps if acc is None else (acc + ps)
    out = acc / float(len(factors))
    return out / (out.sum(dim=-1, keepdim=True) + 1e-8)

def avg_pool_probs(p_t_c: torch.Tensor, k:int) -> torch.Tensor:
    x = p_t_c.unsqueeze(0).transpose(1,2);
    y = F.avg_pool1d(x, kernel_size=k, stride=1, padding=k//2);
    return y.transpose(1,2).squeeze(0)

def duration_integral_single(p_t: torch.Tensor, k:int) -> torch.Tensor:
    k = max(1, int(k)); x = p_t.view(1,1,-1);
    w = torch.ones(1,1,k, device=p_t.device, dtype=p_t.dtype) / float(k);
    pad = (k-1)//2; y = F.conv1d(x, w, padding=pad).view(-1);
    T = p_t.shape[0];
    if y.shape[0] < T: y = F.pad(y, (0, T - y.shape[0]))
    elif y.shape[0] > T: y = y[:T]
    return y

def refine_com(p: torch.Tensor, t_star:int, w:int=5) -> float:
    T = p.shape[0]; a = max(0, t_star - w); b = min(T-1, t_star + w);
    idx = torch.arange(a, b+1, device=p.device, dtype=p.dtype);
    seg = p[a:b+1]; s = seg.sum() + 1e-8;
    return float(((idx * seg).sum() / s).item())

def decode_peaks(p_t_c: torch.Tensor, med_k: dict, gamma: float = 1.0, pool_k=13, temp=0.95):
    if temp != 1.0:
        p_t_c = (p_t_c ** (1.0/temp)); p_t_c = p_t_c / (p_t_c.sum(dim=-1, keepdim=True) + 1e-8)
    p_s = avg_pool_probs(p_t_c, k=pool_k); T, C = p_s.shape;
    scores = torch.empty_like(p_s); ks=[13]*C
    for c in range(C):
        if c==0: scores[:,c]=p_s[:,c]; ks[c]=13; continue
        base_k = med_k.get(c, 13); k = int(np.clip(round(gamma * base_k), 9, 25));
        if k % 2 == 0: k = min(25, k + 1); ks[c]=k;
        scores[:,c] = duration_integral_single(p_s[:,c], k=k)
    peaks=[]
    for c in range(1,21):
        k=ks[c]; w_com = max(5, k//3); radius = max(10, k//2); s=scores[:,c];
        t_star = int(torch.argmax(s).item()); t_ref = refine_com(p_s[:,c], t_star, w=w_com);
        t_idx = int(round(max(0, min(t_ref, T-1))));
        local_mean = p_s[max(0,t_idx-radius):min(T,t_idx+radius+1), c].mean().item();
        pooled_at_ref = p_s[t_idx, c].item();
        peaks.append([c, t_ref, float(scores[t_idx,c].item()), float(local_mean), float(pooled_at_ref)])
    peaks.sort(key=lambda x: (x[1], -x[2], -x[3], -x[4]));
    last_t = -1e9
    for i in range(len(peaks)):
        if peaks[i][1] <= last_t: peaks[i][1] = last_t + 2.0;
        last_t = min(peaks[i][1], float(T-1))
    return [int(c) for c,_,_,_,_ in peaks]

def levenshtein(a,b):
    n,m=len(a),len(b)
    if n==0: return m
    if m==0: return n
    dp=list(range(m+1))
    for i in range(1,n+1):
        prev=dp[0]; dp[0]=i; ai=a[i-1]
        for j in range(1,m+1):
            tmp=dp[j]; dp[j]=min(dp[j]+1, dp[j-1]+1, prev + (0 if ai==b[j-1] else 1)); prev=tmp
    return dp[m]

# OOF: per-fold model on its own val_ids only; probs cached
def cache_fold_val_probs(fold):
    fold_idx = int(fold['fold'])
    ckpt = Path(f"model_ce_fold{fold_idx}.pth");
    assert ckpt.exists(), f"Missing {ckpt}; train fold models first"
    D_in = np.load(next(iter((feat_tr_dir).glob('*.npz'))))['X'].shape[1]
    model = DilatedTCN(d_in=D_in, channels=128, layers=12, num_classes=21, dropout=0.35).to(device)
    model.load_state_dict(torch.load(ckpt, map_location=device)); model.eval()
    mean,std = compute_fold_scaler(fold['train_ids'])
    mean_t = torch.from_numpy(mean).float().to(device); std_t = torch.from_numpy(std).float().to(device)
    vids = fold['val_ids']
    t0=time.time()
    for i, sid in enumerate(vids, 1):
        sid=int(sid); outp = probs_cache/f"{sid}_ce_new.npy"
        if outp.exists():
            if (i%25)==0 or i==len(vids):
                print(f"  [fold {fold_idx}] cached {i}/{len(vids)} elapsed {time.time()-t0:.1f}s", flush=True)
            continue
        X = load_feat('train', sid); xb = torch.from_numpy(X).float().to(device);
        xb = (xb - mean_t) / (std_t + 1e-6); xb = xb.unsqueeze(0)
        with torch.no_grad(), torch.amp.autocast('cuda' if device.type=='cuda' else 'cpu'):
            probs = model(xb)[0].softmax(dim=-1);
            probs = apply_tta_timewarp(probs, factors=(0.9,1.0,1.1))
        np.save(outp, probs.cpu().numpy())
        if (i%25)==0 or i==len(vids):
            print(f"  [fold {fold_idx}] cached {i}/{len(vids)} elapsed {time.time()-t0:.1f}s", flush=True)

def load_cached_prob_new(sid:int):
    return torch.from_numpy(np.load(probs_cache/f"{sid}_ce_new.npy")).to(device)

print('Caching OOF probs per fold (new CE models)...', flush=True)
for f in folds: cache_fold_val_probs(f)

# Small grid over decoder settings (CE-only); per-fold priors, select by worst-fold then mean
pool_ks=[11,13,15]; temps=[0.90,0.95,1.00]; gammas=[0.90,0.95,0.975,1.00,1.025,1.05]
med_cache={}
def eval_cfg_on_fold(fold, pool_k, temp, gamma):
    fi = int(fold['fold'])
    if fi not in med_cache: med_cache[fi] = compute_class_median_durations_for_ids(fold['train_ids'])
    med_k = med_cache[fi]
    vids = fold['val_ids']; tot=0; cnt=0
    for sid in vids:
        p = load_cached_prob_new(int(sid));
        seq = decode_peaks(p, med_k=med_k, gamma=gamma, pool_k=pool_k, temp=temp);
        tot += levenshtein(seq, id2seq[int(sid)]); cnt += 1
    return tot/max(cnt,1)

res=[]
for pool_k in pool_ks:
    for temp in temps:
        for gamma in gammas:
            per_fold=[]
            for f in folds:
                lev = eval_cfg_on_fold(f, pool_k, temp, gamma); per_fold.append(lev)
            res.append((np.mean(per_fold), np.max(per_fold), {'pool_k':pool_k, 'temp':temp, 'gamma':gamma}))
res.sort(key=lambda x: (x[1], x[0]))
print('Top CE-only (new models):')
for r in res[:5]: print(r)
pd.DataFrame([{'mean':m,'worst':w, **cfg} for m,w,cfg in res]).to_csv('cv_sweep_ce_new.csv', index=False)
print('Saved cv_sweep_ce_new.csv', flush=True)

# Test-time inference: ensemble 3 fold models; standardize per-model with its own scaler, then average probs
print('Building CE-only test submission with new models...', flush=True)
test_ids = pd.read_csv('test.csv')['Id'].astype(int).tolist()
cfg_best = pd.read_csv('cv_sweep_ce_new.csv').sort_values(['worst','mean']).iloc[0].to_dict() if Path('cv_sweep_ce_new.csv').exists() else {'pool_k':13,'temp':0.95,'gamma':1.0}
pool_k=int(cfg_best['pool_k']); temp=float(cfg_best['temp']); gamma=float(cfg_best.get('gamma',1.0))
med_k_test = compute_class_median_durations_for_ids(pd.read_csv('training.csv')['Id'].astype(int).tolist())

# preload models and their scalers
D_in = np.load(next(iter((feat_tr_dir).glob('*.npz'))))['X'].shape[1]
models=[]; scalers=[]
for fi in range(3):
    ckpt = Path(f"model_ce_fold{fi}.pth");
    if not ckpt.exists():
        print(f"WARNING: missing {ckpt}; skipping in ensemble")
        continue
    m = DilatedTCN(d_in=D_in, channels=128, layers=12, num_classes=21, dropout=0.35).to(device)
    m.load_state_dict(torch.load(ckpt, map_location=device)); m.eval(); models.append(m)
    mean,std = compute_fold_scaler(folds[fi]['train_ids']);
    scalers.append((torch.from_numpy(mean).float().to(device), torch.from_numpy(std).float().to(device)))
assert len(models)>0, 'No CE fold models available'

rows=[]; t0=time.time()
for i, sid in enumerate(test_ids, 1):
    X = load_feat('test', int(sid));
    acc=None
    with torch.no_grad(), torch.amp.autocast('cuda' if device.type=='cuda' else 'cpu'):
        for m, (mean_t, std_t) in zip(models, scalers):
            xb = torch.from_numpy(X).float().to(device);
            xb = (xb - mean_t) / (std_t + 1e-6); xb = xb.unsqueeze(0);
            p = m(xb)[0].softmax(dim=-1);
            p = apply_tta_timewarp(p, factors=(0.9,1.0,1.1));
            acc = p if acc is None else (acc + p)
        probs = acc / float(len(models)); probs = probs / (probs.sum(dim=-1, keepdim=True) + 1e-8)
    seq = decode_peaks(probs, med_k=med_k_test, gamma=gamma, pool_k=pool_k, temp=temp)
    rows.append({'Id': int(sid), 'Sequence': ' '.join(str(x) for x in seq)})
    if (i%10)==0 or i==len(test_ids):
        print(f"  [infer CE-new] {i}/{len(test_ids)} elapsed={(time.time()-t0)/60:.1f}m", flush=True)
sub = pd.DataFrame(rows, columns=['Id','Sequence'])
assert len(sub)==95
assert all(len(s.split())==20 and len(set(s.split()))==20 and all(1<=int(t)<=20 for t in s.split()) for s in sub.Sequence), 'Submission format invalid'
sub.to_csv('submission_primary_ce_new.csv', index=False);
sub.to_csv('submission.csv', index=False)
print('Wrote submission_primary_ce_new.csv and submission.csv; head:\n', sub.head(), flush=True)

In [2]:
# Train a second CE seed per fold (to ensemble 6 models total)
import os, json, math, time, random, gc
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('CUDA available:', torch.cuda.is_available(), flush=True)
assert torch.cuda.is_available(), 'GPU required for timely training'
torch.backends.cudnn.benchmark = True
try:
    torch.set_float32_matmul_precision('high')
except Exception:
    pass

feat_tr_dir = Path('features3d_v2')/'train'
lab_tr_dir  = Path('labels3d_v2')/'train'
folds = json.load(open('folds_archive_cv.json','r'))

class DilatedResBlock(nn.Module):
    def __init__(self, ch, dilation, drop=0.35, groups=8, k=3):
        super().__init__()
        self.conv1 = nn.Conv1d(ch, ch, k, padding=dilation, dilation=dilation)
        self.gn1 = nn.GroupNorm(groups, ch)
        self.drop = nn.Dropout(drop)
        self.conv2 = nn.Conv1d(ch, ch, 1)
        self.gn2 = nn.GroupNorm(groups, ch)
    def forward(self, x):
        h = self.conv1(x); h = self.gn1(h); h = F.relu(h, inplace=True); h = self.drop(h)
        h = self.conv2(h); h = self.gn2(h); h = F.relu(h, inplace=True)
        return x + h

class DilatedTCN(nn.Module):
    def __init__(self, d_in, channels=128, layers=12, num_classes=21, dropout=0.35):
        super().__init__()
        self.inp = nn.Conv1d(d_in, channels, 1)
        blocks=[]; dil=1
        for _ in range(layers):
            blocks.append(DilatedResBlock(channels, dil, drop=dropout, groups=8, k=3))
            dil = min(dil*2, 512)
        self.blocks = nn.ModuleList(blocks)
        self.head = nn.Conv1d(channels, num_classes, 1)
    def forward(self, x_b_t_d):
        x = x_b_t_d.transpose(1,2)
        h = self.inp(x)
        for b in self.blocks:
            h = b(h)
        out = self.head(h)
        return out.transpose(1,2)

class EMA:
    def __init__(self, model, decay=0.999):
        self.decay = decay
        self.shadow = {n: p.detach().clone() for n,p in model.named_parameters() if p.requires_grad}
    @torch.no_grad()
    def update(self, model):
        for n, p in model.named_parameters():
            if p.requires_grad:
                self.shadow[n].mul_(self.decay).add_(p.detach(), alpha=1.0 - self.decay)
    def apply_to(self, model):
        self.backup = {}
        for n, p in model.named_parameters():
            if p.requires_grad:
                self.backup[n] = p.detach().clone()
                p.data.copy_(self.shadow[n].data)
    def restore(self, model):
        for n, p in model.named_parameters():
            if p.requires_grad:
                p.data.copy_(self.backup[n].data)

def load_feat_full(sample_id: int):
    d = np.load((feat_tr_dir/f"{sample_id}.npz"))
    return d['X'].astype(np.float32)
def load_labels(sample_id: int):
    return np.load(lab_tr_dir/f"{sample_id}.npy").astype(np.int64)

def compute_fold_scaler(id_list):
    n = 0; mean=None; M2=None
    for sid in id_list:
        X = load_feat_full(int(sid))
        n_i = X.shape[0]
        if mean is None:
            mean = X.mean(axis=0)
            M2 = ((X - mean)**2).sum(axis=0)
            n = n_i
        else:
            mean_i = X.mean(axis=0)
            n_new = n + n_i
            delta = mean_i - mean
            mean = mean + delta * (n_i / max(1, n_new))
            M2 = M2 + ((X - mean_i)**2).sum(axis=0) + (delta**2) * (n * n_i / max(1, n_new))
            n = n_new
    var = M2 / max(1, (n - 1))
    std = np.sqrt(np.clip(var, 1e-8, None))
    return mean.astype(np.float32), std.astype(np.float32)

def compute_class_weights(train_ids):
    counts = np.zeros(21, dtype=np.int64)
    for sid in train_ids:
        y = load_labels(int(sid))
        vals, cnts = np.unique(y, return_counts=True)
        for v, c in zip(vals, cnts):
            if 0 <= v <= 20:
                counts[v] += int(c)
    freq = counts / max(1, counts.sum())
    w = 1.0 / np.sqrt(np.clip(freq, 1e-12, None))
    w = w / w.mean()
    w0_cap = 0.7 * w.mean()
    w[0] = min(w[0], w0_cap)
    return torch.tensor(w, dtype=torch.float32, device=device)

class SeqDataset(Dataset):
    def __init__(self, ids, mean, std, train=True, crop_min=1600, crop_max=4096, time_masks=(3,5), mask_len=(5,15), noise_std=0.01, seed=777):
        self.ids = list(ids)
        self.mean = torch.from_numpy(mean).float()
        self.std = torch.from_numpy(std).float()
        self.train = train
        self.crop_min = crop_min
        self.crop_max = crop_max
        self.tmask_lo, self.tmask_hi = time_masks
        self.mlen_lo, self.mlen_hi = mask_len
        self.noise_std = noise_std
        self.rng = random.Random(seed)
    def __len__(self):
        return len(self.ids)
    def _rand_crop(self, X, y):
        T = X.shape[0]
        if not self.train:
            return X, y
        tgt = self.rng.randint(self.crop_min, self.crop_max)
        if T <= tgt:
            return X, y
        start = self.rng.randint(0, T - tgt)
        end = start + tgt
        return X[start:end], y[start:end]
    def _time_mask(self, X):
        if not self.train:
            return X
        T = X.shape[0]
        m = self.rng.randint(self.tmask_lo, self.tmask_hi)
        for _ in range(m):
            L = self.rng.randint(self.mlen_lo, self.mlen_hi)
            if T <= L: continue
            s = self.rng.randint(0, T - L)
            e = s + L
            seg_mean = X[max(0, s-5):min(T, e+5)].mean(axis=0, keepdims=True)
            X[s:e] = seg_mean
        return X      
    def __getitem__(self, idx):
        sid = int(self.ids[idx])
        X = load_feat_full(sid)
        y = load_labels(sid)
        X, y = self._rand_crop(X, y)
        X = (torch.from_numpy(X).float() - self.mean) / (self.std + 1e-6)
        if self.train:
            if self.noise_std > 0:
                X = X + torch.randn_like(X) * self.noise_std
            X_np = X.numpy(); X_np = self._time_mask(X_np); X = torch.from_numpy(X_np).float()
        y = torch.from_numpy(y).long()
        return X, y

def collate_pad(batch):
    xs, ys = zip(*batch)
    T_max = max(x.shape[0] for x in xs)
    D = xs[0].shape[1]
    xb = torch.zeros((len(xs), T_max, D), dtype=torch.float32)
    yb = torch.full((len(xs), T_max), -100, dtype=torch.long)
    for i, (x, y) in enumerate(zip(xs, ys)):
        T = x.shape[0]
        xb[i, :T] = x
        yb[i, :T] = y
    return xb, yb

def cosine_with_warmup(step, total_steps, warmup_steps, base_lr, min_lr):
    if step < warmup_steps:
        return base_lr * (step / max(1, warmup_steps))
    t = (step - warmup_steps) / max(1, (total_steps - warmup_steps))
    return min_lr + 0.5 * (base_lr - min_lr) * (1 + math.cos(math.pi * t))

def train_fold_seed(fold_idx, train_ids, val_ids, out_name, ds_seed, epochs=50, batch_size=8, accum_steps=1, base_lr=3e-3, min_lr=3e-5, wd=0.01, label_smooth=0.05):
    print(f"=== Train fold {fold_idx} (seed2): train_n={len(train_ids)} val_n={len(val_ids)} ===", flush=True)
    mean, std = compute_fold_scaler(train_ids)
    class_w = compute_class_weights(train_ids)
    D_in = np.load(next(iter((feat_tr_dir).glob('*.npz'))))['X'].shape[1]
    model = DilatedTCN(d_in=D_in, channels=128, layers=12, num_classes=21, dropout=0.35).to(device)
    torch.manual_seed(1337 + fold_idx*11)
    np.random.seed(4242 + fold_idx*17)
    random.seed(9001 + fold_idx*23)
    ema = EMA(model, decay=0.999)
    scaler = torch.amp.GradScaler('cuda', enabled=True)
    opt = torch.optim.AdamW(model.parameters(), lr=base_lr, weight_decay=wd, betas=(0.9, 0.999))
    tr_ds = SeqDataset(train_ids, mean, std, train=True, crop_min=1600, crop_max=4096, time_masks=(3,5), mask_len=(5,15), noise_std=0.01, seed=ds_seed)
    va_ds = SeqDataset(val_ids, mean, std, train=False, seed=ds_seed+777)
    tr_ld = DataLoader(tr_ds, batch_size=batch_size, shuffle=True, drop_last=True, num_workers=0, collate_fn=collate_pad, pin_memory=True)
    va_ld = DataLoader(va_ds, batch_size=1, shuffle=False, drop_last=False, num_workers=0, collate_fn=collate_pad, pin_memory=True)
    steps_per_epoch = max(1, len(tr_ld))
    total_steps = steps_per_epoch * epochs
    warmup_steps = 5 * steps_per_epoch
    crit = nn.CrossEntropyLoss(weight=class_w, label_smoothing=label_smooth, ignore_index=-100)
    best_val = float('inf'); patience=5; bad=0
    t0=time.time()
    for ep in range(1, epochs+1):
        model.train(); tr_loss=0.0; seen=0; t_ep=time.time()
        opt.zero_grad(set_to_none=True)
        for step, (xb, yb) in enumerate(tr_ld):
            xb = xb.to(device, non_blocking=True)
            yb = yb.to(device, non_blocking=True)
            bs, T, D = xb.shape; C = 21
            lr = cosine_with_warmup((ep-1)*steps_per_epoch + step, total_steps, warmup_steps, base_lr, min_lr)
            for pg in opt.param_groups: pg['lr'] = lr
            with torch.amp.autocast('cuda'):
                logits = model(xb)
                loss = crit(logits.reshape(-1, C), yb.reshape(-1))
            scaler.scale(loss / accum_steps).backward()
            if ((step + 1) % accum_steps) == 0:
                scaler.unscale_(opt)
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                scaler.step(opt)
                scaler.update()
                opt.zero_grad(set_to_none=True)
                ema.update(model)
            tr_loss += loss.item() * bs; seen += bs
            if (step+1) % 50 == 0 or (step+1)==steps_per_epoch:
                print(f"  ep{ep} step {step+1}/{steps_per_epoch} lr={lr:.2e} loss={tr_loss/max(1,seen):.4f} elapsed={(time.time()-t_ep):.1f}s", flush=True)
        model.eval(); ema.apply_to(model)
        val_loss = 0.0; vseen=0
        with torch.no_grad(), torch.amp.autocast('cuda'):
            for xb, yb in va_ld:
                xb = xb.to(device, non_blocking=True)
                yb = yb.to(device, non_blocking=True)
                bs, T, D = xb.shape; C = 21
                logits = model(xb)
                loss = crit(logits.reshape(-1, C), yb.reshape(-1))
                val_loss += loss.item(); vseen += 1
        ema.restore(model)
        val_loss = val_loss / max(1, vseen)
        print(f"[Fold {fold_idx} seed2] Epoch {ep} train_loss={tr_loss/max(1,seen):.4f} val_loss={val_loss:.4f} epoch_time={(time.time()-t_ep):.1f}s total={(time.time()-t0)/60:.1f}m", flush=True)
        if val_loss < best_val - 1e-4:
            best_val = val_loss; bad = 0
            ema.apply_to(model); torch.save(model.state_dict(), out_name); ema.restore(model)
            print(f"  Saved best to {out_name}", flush=True)
        else:
            bad += 1
            if bad >= patience:
                print(f"  Early stop at epoch {ep}", flush=True)
                break
        torch.cuda.empty_cache(); gc.collect()
    print(f"Fold {fold_idx} seed2 done. Best val CE={best_val:.4f}. Model -> {out_name}")

# Kick off training for seed2 across folds (separate ckpts)
for f in folds:
    fold_idx = int(f['fold'])
    outp = Path(f"model_ce_fold{fold_idx}_s1.pth")
    if outp.exists():
        print(f"[Overwrite] Removing existing {outp} to retrain seed2...")
        try:
            outp.unlink()
        except Exception as e:
            print(f"  Warning: could not delete {outp}: {e}")
    train_ids = f['train_ids']; val_ids = f['val_ids']
    train_fold_seed(fold_idx, train_ids, val_ids, out_name=str(outp), ds_seed=2025 + fold_idx)
    torch.cuda.empty_cache(); gc.collect()
print('All folds (seed2) processed.')

CUDA available: True


=== Train fold 0 (seed2): train_n=199 val_n=98 ===


  ep1 step 24/24 lr=5.75e-04 loss=4.3221 elapsed=1.6s


[Fold 0 seed2] Epoch 1 train_loss=4.3221 val_loss=6.2157 epoch_time=2.5s total=0.0m


  Saved best to model_ce_fold0_s1.pth


  ep2 step 24/24 lr=1.18e-03 loss=2.8240 elapsed=1.6s


[Fold 0 seed2] Epoch 2 train_loss=2.8240 val_loss=5.7264 epoch_time=2.5s total=0.1m


  Saved best to model_ce_fold0_s1.pth


  ep3 step 24/24 lr=1.78e-03 loss=2.3179 elapsed=1.6s


[Fold 0 seed2] Epoch 3 train_loss=2.3179 val_loss=5.2618 epoch_time=2.5s total=0.1m


  Saved best to model_ce_fold0_s1.pth


  ep4 step 24/24 lr=2.37e-03 loss=1.9468 elapsed=1.6s


[Fold 0 seed2] Epoch 4 train_loss=1.9468 val_loss=4.8661 epoch_time=2.6s total=0.2m


  Saved best to model_ce_fold0_s1.pth


  ep5 step 24/24 lr=2.98e-03 loss=1.7240 elapsed=1.6s


[Fold 0 seed2] Epoch 5 train_loss=1.7240 val_loss=4.5117 epoch_time=2.6s total=0.2m


  Saved best to model_ce_fold0_s1.pth


  ep6 step 24/24 lr=3.00e-03 loss=1.5131 elapsed=1.6s


[Fold 0 seed2] Epoch 6 train_loss=1.5131 val_loss=4.2118 epoch_time=2.6s total=0.3m


  Saved best to model_ce_fold0_s1.pth


  ep7 step 24/24 lr=2.99e-03 loss=1.4153 elapsed=1.6s


[Fold 0 seed2] Epoch 7 train_loss=1.4153 val_loss=3.9526 epoch_time=2.6s total=0.3m


  Saved best to model_ce_fold0_s1.pth


  ep8 step 24/24 lr=2.97e-03 loss=1.3417 elapsed=1.6s


[Fold 0 seed2] Epoch 8 train_loss=1.3417 val_loss=3.7161 epoch_time=2.6s total=0.4m


  Saved best to model_ce_fold0_s1.pth


  ep9 step 24/24 lr=2.94e-03 loss=1.2169 elapsed=1.6s


[Fold 0 seed2] Epoch 9 train_loss=1.2169 val_loss=3.5032 epoch_time=2.6s total=0.4m


  Saved best to model_ce_fold0_s1.pth


  ep10 step 24/24 lr=2.91e-03 loss=1.1507 elapsed=1.6s


[Fold 0 seed2] Epoch 10 train_loss=1.1507 val_loss=3.3038 epoch_time=2.5s total=0.4m


  Saved best to model_ce_fold0_s1.pth


  ep11 step 24/24 lr=2.87e-03 loss=1.1207 elapsed=1.6s


[Fold 0 seed2] Epoch 11 train_loss=1.1207 val_loss=3.1110 epoch_time=2.5s total=0.5m


  Saved best to model_ce_fold0_s1.pth


  ep12 step 24/24 lr=2.83e-03 loss=1.0550 elapsed=1.6s


[Fold 0 seed2] Epoch 12 train_loss=1.0550 val_loss=2.9316 epoch_time=2.6s total=0.5m


  Saved best to model_ce_fold0_s1.pth


  ep13 step 24/24 lr=2.78e-03 loss=1.0209 elapsed=1.6s


[Fold 0 seed2] Epoch 13 train_loss=1.0209 val_loss=2.7620 epoch_time=2.6s total=0.6m


  Saved best to model_ce_fold0_s1.pth


  ep14 step 24/24 lr=2.72e-03 loss=0.9766 elapsed=1.6s


[Fold 0 seed2] Epoch 14 train_loss=0.9766 val_loss=2.6160 epoch_time=2.5s total=0.6m


  Saved best to model_ce_fold0_s1.pth


  ep15 step 24/24 lr=2.66e-03 loss=0.9401 elapsed=1.7s


[Fold 0 seed2] Epoch 15 train_loss=0.9401 val_loss=2.4818 epoch_time=2.6s total=0.7m


  Saved best to model_ce_fold0_s1.pth


  ep16 step 24/24 lr=2.59e-03 loss=0.9050 elapsed=1.6s


[Fold 0 seed2] Epoch 16 train_loss=0.9050 val_loss=2.3671 epoch_time=2.6s total=0.7m


  Saved best to model_ce_fold0_s1.pth


  ep17 step 24/24 lr=2.51e-03 loss=0.8736 elapsed=1.8s


[Fold 0 seed2] Epoch 17 train_loss=0.8736 val_loss=2.2670 epoch_time=2.7s total=0.8m


  Saved best to model_ce_fold0_s1.pth


  ep18 step 24/24 lr=2.43e-03 loss=0.8713 elapsed=1.6s


[Fold 0 seed2] Epoch 18 train_loss=0.8713 val_loss=2.1798 epoch_time=2.5s total=0.8m


  Saved best to model_ce_fold0_s1.pth


  ep19 step 24/24 lr=2.35e-03 loss=0.8294 elapsed=1.6s


[Fold 0 seed2] Epoch 19 train_loss=0.8294 val_loss=2.1096 epoch_time=2.6s total=0.9m


  Saved best to model_ce_fold0_s1.pth


  ep20 step 24/24 lr=2.26e-03 loss=0.7967 elapsed=1.6s


[Fold 0 seed2] Epoch 20 train_loss=0.7967 val_loss=2.0478 epoch_time=2.6s total=0.9m


  Saved best to model_ce_fold0_s1.pth


  ep21 step 24/24 lr=2.17e-03 loss=0.7826 elapsed=1.6s


[Fold 0 seed2] Epoch 21 train_loss=0.7826 val_loss=1.9985 epoch_time=2.6s total=0.9m


  Saved best to model_ce_fold0_s1.pth


  ep22 step 24/24 lr=2.08e-03 loss=0.7625 elapsed=1.6s


[Fold 0 seed2] Epoch 22 train_loss=0.7625 val_loss=1.9569 epoch_time=2.5s total=1.0m


  Saved best to model_ce_fold0_s1.pth


  ep23 step 24/24 lr=1.98e-03 loss=0.7373 elapsed=1.6s


[Fold 0 seed2] Epoch 23 train_loss=0.7373 val_loss=1.9235 epoch_time=2.6s total=1.0m


  Saved best to model_ce_fold0_s1.pth


  ep24 step 24/24 lr=1.88e-03 loss=0.7117 elapsed=1.6s


[Fold 0 seed2] Epoch 24 train_loss=0.7117 val_loss=1.8980 epoch_time=2.6s total=1.1m


  Saved best to model_ce_fold0_s1.pth


  ep25 step 24/24 lr=1.78e-03 loss=0.7009 elapsed=1.6s


[Fold 0 seed2] Epoch 25 train_loss=0.7009 val_loss=1.8763 epoch_time=2.5s total=1.1m


  Saved best to model_ce_fold0_s1.pth


  ep26 step 24/24 lr=1.67e-03 loss=0.6892 elapsed=1.6s


[Fold 0 seed2] Epoch 26 train_loss=0.6892 val_loss=1.8591 epoch_time=2.5s total=1.2m


  Saved best to model_ce_fold0_s1.pth


  ep27 step 24/24 lr=1.57e-03 loss=0.6678 elapsed=1.6s


[Fold 0 seed2] Epoch 27 train_loss=0.6678 val_loss=1.8447 epoch_time=2.6s total=1.2m


  Saved best to model_ce_fold0_s1.pth


  ep28 step 24/24 lr=1.47e-03 loss=0.6478 elapsed=1.6s


[Fold 0 seed2] Epoch 28 train_loss=0.6478 val_loss=1.8344 epoch_time=2.6s total=1.3m


  Saved best to model_ce_fold0_s1.pth


  ep29 step 24/24 lr=1.36e-03 loss=0.6241 elapsed=1.6s


[Fold 0 seed2] Epoch 29 train_loss=0.6241 val_loss=1.8267 epoch_time=2.6s total=1.3m


  Saved best to model_ce_fold0_s1.pth


  ep30 step 24/24 lr=1.26e-03 loss=0.6188 elapsed=1.6s


[Fold 0 seed2] Epoch 30 train_loss=0.6188 val_loss=1.8212 epoch_time=2.5s total=1.3m


  Saved best to model_ce_fold0_s1.pth


  ep31 step 24/24 lr=1.16e-03 loss=0.5915 elapsed=1.6s


[Fold 0 seed2] Epoch 31 train_loss=0.5915 val_loss=1.8176 epoch_time=2.6s total=1.4m


  Saved best to model_ce_fold0_s1.pth


  ep32 step 24/24 lr=1.06e-03 loss=0.5972 elapsed=1.6s


[Fold 0 seed2] Epoch 32 train_loss=0.5972 val_loss=1.8155 epoch_time=2.6s total=1.4m


  Saved best to model_ce_fold0_s1.pth


  ep33 step 24/24 lr=9.63e-04 loss=0.5906 elapsed=1.6s


[Fold 0 seed2] Epoch 33 train_loss=0.5906 val_loss=1.8142 epoch_time=2.6s total=1.5m


  Saved best to model_ce_fold0_s1.pth


  ep34 step 24/24 lr=8.68e-04 loss=0.5678 elapsed=1.6s


[Fold 0 seed2] Epoch 34 train_loss=0.5678 val_loss=1.8143 epoch_time=2.5s total=1.5m


  ep35 step 24/24 lr=7.76e-04 loss=0.5636 elapsed=1.6s


[Fold 0 seed2] Epoch 35 train_loss=0.5636 val_loss=1.8147 epoch_time=2.6s total=1.6m


  ep36 step 24/24 lr=6.88e-04 loss=0.5569 elapsed=1.6s


[Fold 0 seed2] Epoch 36 train_loss=0.5569 val_loss=1.8160 epoch_time=2.6s total=1.6m


  ep37 step 24/24 lr=6.04e-04 loss=0.5511 elapsed=1.6s


[Fold 0 seed2] Epoch 37 train_loss=0.5511 val_loss=1.8176 epoch_time=2.6s total=1.7m


  ep38 step 24/24 lr=5.25e-04 loss=0.5469 elapsed=1.6s


[Fold 0 seed2] Epoch 38 train_loss=0.5469 val_loss=1.8196 epoch_time=2.6s total=1.7m


  Early stop at epoch 38


Fold 0 seed2 done. Best val CE=1.8142. Model -> model_ce_fold0_s1.pth
=== Train fold 1 (seed2): train_n=198 val_n=99 ===


  ep1 step 24/24 lr=5.75e-04 loss=3.9077 elapsed=1.8s


[Fold 1 seed2] Epoch 1 train_loss=3.9077 val_loss=5.2814 epoch_time=2.6s total=0.0m


  Saved best to model_ce_fold1_s1.pth


  ep2 step 24/24 lr=1.18e-03 loss=2.8003 elapsed=1.8s


[Fold 1 seed2] Epoch 2 train_loss=2.8003 val_loss=4.9305 epoch_time=2.6s total=0.1m


  Saved best to model_ce_fold1_s1.pth


  ep3 step 24/24 lr=1.78e-03 loss=2.2498 elapsed=1.7s


[Fold 1 seed2] Epoch 3 train_loss=2.2498 val_loss=4.6204 epoch_time=2.6s total=0.1m


  Saved best to model_ce_fold1_s1.pth


  ep4 step 24/24 lr=2.37e-03 loss=1.9123 elapsed=1.8s


[Fold 1 seed2] Epoch 4 train_loss=1.9123 val_loss=4.3503 epoch_time=2.6s total=0.2m


  Saved best to model_ce_fold1_s1.pth


  ep5 step 24/24 lr=2.98e-03 loss=1.6704 elapsed=1.7s


[Fold 1 seed2] Epoch 5 train_loss=1.6704 val_loss=4.1068 epoch_time=2.6s total=0.2m


  Saved best to model_ce_fold1_s1.pth


  ep6 step 24/24 lr=3.00e-03 loss=1.5400 elapsed=1.8s


[Fold 1 seed2] Epoch 6 train_loss=1.5400 val_loss=3.8875 epoch_time=2.6s total=0.3m


  Saved best to model_ce_fold1_s1.pth


  ep7 step 24/24 lr=2.99e-03 loss=1.4024 elapsed=1.8s


[Fold 1 seed2] Epoch 7 train_loss=1.4024 val_loss=3.6800 epoch_time=2.6s total=0.3m


  Saved best to model_ce_fold1_s1.pth


  ep8 step 24/24 lr=2.97e-03 loss=1.2832 elapsed=1.7s


[Fold 1 seed2] Epoch 8 train_loss=1.2832 val_loss=3.4860 epoch_time=2.6s total=0.4m


  Saved best to model_ce_fold1_s1.pth


  ep9 step 24/24 lr=2.94e-03 loss=1.1876 elapsed=1.7s


[Fold 1 seed2] Epoch 9 train_loss=1.1876 val_loss=3.3066 epoch_time=2.6s total=0.4m


  Saved best to model_ce_fold1_s1.pth


  ep10 step 24/24 lr=2.91e-03 loss=1.1518 elapsed=1.7s


[Fold 1 seed2] Epoch 10 train_loss=1.1518 val_loss=3.1377 epoch_time=2.6s total=0.5m


  Saved best to model_ce_fold1_s1.pth


  ep11 step 24/24 lr=2.87e-03 loss=1.0971 elapsed=1.7s


[Fold 1 seed2] Epoch 11 train_loss=1.0971 val_loss=2.9767 epoch_time=2.6s total=0.5m


  Saved best to model_ce_fold1_s1.pth


  ep12 step 24/24 lr=2.83e-03 loss=1.0612 elapsed=1.7s


[Fold 1 seed2] Epoch 12 train_loss=1.0612 val_loss=2.8215 epoch_time=2.6s total=0.5m


  Saved best to model_ce_fold1_s1.pth


  ep13 step 24/24 lr=2.78e-03 loss=0.9910 elapsed=1.8s


[Fold 1 seed2] Epoch 13 train_loss=0.9910 val_loss=2.6735 epoch_time=2.6s total=0.6m


  Saved best to model_ce_fold1_s1.pth


  ep14 step 24/24 lr=2.72e-03 loss=0.9458 elapsed=1.8s


[Fold 1 seed2] Epoch 14 train_loss=0.9458 val_loss=2.5356 epoch_time=2.6s total=0.6m


  Saved best to model_ce_fold1_s1.pth


  ep15 step 24/24 lr=2.66e-03 loss=0.9101 elapsed=1.8s


[Fold 1 seed2] Epoch 15 train_loss=0.9101 val_loss=2.4055 epoch_time=2.6s total=0.7m


  Saved best to model_ce_fold1_s1.pth


  ep16 step 24/24 lr=2.59e-03 loss=0.9061 elapsed=1.8s


[Fold 1 seed2] Epoch 16 train_loss=0.9061 val_loss=2.2882 epoch_time=2.6s total=0.7m


  Saved best to model_ce_fold1_s1.pth


  ep17 step 24/24 lr=2.51e-03 loss=0.8579 elapsed=1.8s


[Fold 1 seed2] Epoch 17 train_loss=0.8579 val_loss=2.1844 epoch_time=2.6s total=0.8m


  Saved best to model_ce_fold1_s1.pth


  ep18 step 24/24 lr=2.43e-03 loss=0.8282 elapsed=1.7s


[Fold 1 seed2] Epoch 18 train_loss=0.8282 val_loss=2.0938 epoch_time=2.6s total=0.8m


  Saved best to model_ce_fold1_s1.pth


  ep19 step 24/24 lr=2.35e-03 loss=0.7876 elapsed=1.7s


[Fold 1 seed2] Epoch 19 train_loss=0.7876 val_loss=2.0142 epoch_time=2.6s total=0.9m


  Saved best to model_ce_fold1_s1.pth


  ep20 step 24/24 lr=2.26e-03 loss=0.7652 elapsed=1.7s


[Fold 1 seed2] Epoch 20 train_loss=0.7652 val_loss=1.9467 epoch_time=2.6s total=0.9m


  Saved best to model_ce_fold1_s1.pth


  ep21 step 24/24 lr=2.17e-03 loss=0.7394 elapsed=1.7s


[Fold 1 seed2] Epoch 21 train_loss=0.7394 val_loss=1.8890 epoch_time=2.6s total=1.0m


  Saved best to model_ce_fold1_s1.pth


  ep22 step 24/24 lr=2.08e-03 loss=0.7280 elapsed=1.8s


[Fold 1 seed2] Epoch 22 train_loss=0.7280 val_loss=1.8421 epoch_time=2.6s total=1.0m


  Saved best to model_ce_fold1_s1.pth


  ep23 step 24/24 lr=1.98e-03 loss=0.6970 elapsed=1.8s


[Fold 1 seed2] Epoch 23 train_loss=0.6970 val_loss=1.8015 epoch_time=2.6s total=1.1m


  Saved best to model_ce_fold1_s1.pth


  ep24 step 24/24 lr=1.88e-03 loss=0.6924 elapsed=1.7s


[Fold 1 seed2] Epoch 24 train_loss=0.6924 val_loss=1.7707 epoch_time=2.6s total=1.1m


  Saved best to model_ce_fold1_s1.pth


  ep25 step 24/24 lr=1.78e-03 loss=0.6732 elapsed=1.8s


[Fold 1 seed2] Epoch 25 train_loss=0.6732 val_loss=1.7410 epoch_time=2.6s total=1.1m


  Saved best to model_ce_fold1_s1.pth


  ep26 step 24/24 lr=1.67e-03 loss=0.6516 elapsed=1.8s


[Fold 1 seed2] Epoch 26 train_loss=0.6516 val_loss=1.7182 epoch_time=2.6s total=1.2m


  Saved best to model_ce_fold1_s1.pth


  ep27 step 24/24 lr=1.57e-03 loss=0.6381 elapsed=1.8s


[Fold 1 seed2] Epoch 27 train_loss=0.6381 val_loss=1.6990 epoch_time=2.6s total=1.2m


  Saved best to model_ce_fold1_s1.pth


  ep28 step 24/24 lr=1.47e-03 loss=0.6306 elapsed=1.8s


[Fold 1 seed2] Epoch 28 train_loss=0.6306 val_loss=1.6817 epoch_time=2.6s total=1.3m


  Saved best to model_ce_fold1_s1.pth


  ep29 step 24/24 lr=1.36e-03 loss=0.6193 elapsed=1.7s


[Fold 1 seed2] Epoch 29 train_loss=0.6193 val_loss=1.6671 epoch_time=2.6s total=1.3m


  Saved best to model_ce_fold1_s1.pth


  ep30 step 24/24 lr=1.26e-03 loss=0.5898 elapsed=1.8s


[Fold 1 seed2] Epoch 30 train_loss=0.5898 val_loss=1.6563 epoch_time=2.6s total=1.4m


  Saved best to model_ce_fold1_s1.pth


  ep31 step 24/24 lr=1.16e-03 loss=0.5777 elapsed=1.7s


[Fold 1 seed2] Epoch 31 train_loss=0.5777 val_loss=1.6457 epoch_time=2.6s total=1.4m


  Saved best to model_ce_fold1_s1.pth


  ep32 step 24/24 lr=1.06e-03 loss=0.5711 elapsed=1.7s


[Fold 1 seed2] Epoch 32 train_loss=0.5711 val_loss=1.6367 epoch_time=2.6s total=1.5m


  Saved best to model_ce_fold1_s1.pth


  ep33 step 24/24 lr=9.63e-04 loss=0.5587 elapsed=1.8s


[Fold 1 seed2] Epoch 33 train_loss=0.5587 val_loss=1.6292 epoch_time=2.6s total=1.5m


  Saved best to model_ce_fold1_s1.pth


  ep34 step 24/24 lr=8.68e-04 loss=0.5544 elapsed=1.7s


[Fold 1 seed2] Epoch 34 train_loss=0.5544 val_loss=1.6226 epoch_time=2.6s total=1.6m


  Saved best to model_ce_fold1_s1.pth


  ep35 step 24/24 lr=7.76e-04 loss=0.5470 elapsed=1.8s


[Fold 1 seed2] Epoch 35 train_loss=0.5470 val_loss=1.6170 epoch_time=2.6s total=1.6m


  Saved best to model_ce_fold1_s1.pth


  ep36 step 24/24 lr=6.88e-04 loss=0.5391 elapsed=1.8s


[Fold 1 seed2] Epoch 36 train_loss=0.5391 val_loss=1.6129 epoch_time=2.6s total=1.7m


  Saved best to model_ce_fold1_s1.pth


  ep37 step 24/24 lr=6.04e-04 loss=0.5350 elapsed=1.7s


[Fold 1 seed2] Epoch 37 train_loss=0.5350 val_loss=1.6095 epoch_time=2.6s total=1.7m


  Saved best to model_ce_fold1_s1.pth


  ep38 step 24/24 lr=5.25e-04 loss=0.5304 elapsed=1.7s


[Fold 1 seed2] Epoch 38 train_loss=0.5304 val_loss=1.6067 epoch_time=2.6s total=1.7m


  Saved best to model_ce_fold1_s1.pth


  ep39 step 24/24 lr=4.50e-04 loss=0.5255 elapsed=1.8s


[Fold 1 seed2] Epoch 39 train_loss=0.5255 val_loss=1.6041 epoch_time=2.6s total=1.8m


  Saved best to model_ce_fold1_s1.pth


  ep40 step 24/24 lr=3.80e-04 loss=0.5228 elapsed=1.8s


[Fold 1 seed2] Epoch 40 train_loss=0.5228 val_loss=1.6020 epoch_time=2.6s total=1.8m


  Saved best to model_ce_fold1_s1.pth


  ep41 step 24/24 lr=3.16e-04 loss=0.5208 elapsed=1.9s


[Fold 1 seed2] Epoch 41 train_loss=0.5208 val_loss=1.6006 epoch_time=2.8s total=1.9m


  Saved best to model_ce_fold1_s1.pth


  ep42 step 24/24 lr=2.58e-04 loss=0.5162 elapsed=1.8s


[Fold 1 seed2] Epoch 42 train_loss=0.5162 val_loss=1.5994 epoch_time=2.6s total=1.9m


  Saved best to model_ce_fold1_s1.pth


  ep43 step 24/24 lr=2.06e-04 loss=0.5152 elapsed=1.9s


[Fold 1 seed2] Epoch 43 train_loss=0.5152 val_loss=1.5983 epoch_time=2.8s total=2.0m


  Saved best to model_ce_fold1_s1.pth


  ep44 step 24/24 lr=1.60e-04 loss=0.5116 elapsed=1.7s


[Fold 1 seed2] Epoch 44 train_loss=0.5116 val_loss=1.5979 epoch_time=2.6s total=2.0m


  Saved best to model_ce_fold1_s1.pth


  ep45 step 24/24 lr=1.21e-04 loss=0.5112 elapsed=1.8s


[Fold 1 seed2] Epoch 45 train_loss=0.5112 val_loss=1.5975 epoch_time=2.6s total=2.1m


  Saved best to model_ce_fold1_s1.pth


  ep46 step 24/24 lr=8.87e-05 loss=0.5105 elapsed=1.8s


[Fold 1 seed2] Epoch 46 train_loss=0.5105 val_loss=1.5972 epoch_time=2.7s total=2.1m


  Saved best to model_ce_fold1_s1.pth


  ep47 step 24/24 lr=6.34e-05 loss=0.5094 elapsed=1.8s


[Fold 1 seed2] Epoch 47 train_loss=0.5094 val_loss=1.5969 epoch_time=2.6s total=2.2m


  Saved best to model_ce_fold1_s1.pth


  ep48 step 24/24 lr=4.51e-05 loss=0.5087 elapsed=1.8s


[Fold 1 seed2] Epoch 48 train_loss=0.5087 val_loss=1.5971 epoch_time=2.6s total=2.2m


  ep49 step 24/24 lr=3.39e-05 loss=0.5096 elapsed=1.8s


[Fold 1 seed2] Epoch 49 train_loss=0.5096 val_loss=1.5973 epoch_time=2.6s total=2.3m


  ep50 step 24/24 lr=3.00e-05 loss=0.5039 elapsed=1.8s


[Fold 1 seed2] Epoch 50 train_loss=0.5039 val_loss=1.5978 epoch_time=2.6s total=2.3m


Fold 1 seed2 done. Best val CE=1.5969. Model -> model_ce_fold1_s1.pth
=== Train fold 2 (seed2): train_n=197 val_n=100 ===


  ep1 step 24/24 lr=5.75e-04 loss=3.8870 elapsed=1.8s


[Fold 2 seed2] Epoch 1 train_loss=3.8870 val_loss=5.2328 epoch_time=2.6s total=0.0m


  Saved best to model_ce_fold2_s1.pth


  ep2 step 24/24 lr=1.18e-03 loss=2.5170 elapsed=1.7s


[Fold 2 seed2] Epoch 2 train_loss=2.5170 val_loss=4.9687 epoch_time=2.6s total=0.1m


  Saved best to model_ce_fold2_s1.pth


  ep3 step 24/24 lr=1.78e-03 loss=1.8888 elapsed=1.8s


[Fold 2 seed2] Epoch 3 train_loss=1.8888 val_loss=4.7228 epoch_time=2.6s total=0.1m


  Saved best to model_ce_fold2_s1.pth


  ep4 step 24/24 lr=2.37e-03 loss=1.5874 elapsed=1.8s


[Fold 2 seed2] Epoch 4 train_loss=1.5874 val_loss=4.5057 epoch_time=2.7s total=0.2m


  Saved best to model_ce_fold2_s1.pth


  ep5 step 24/24 lr=2.98e-03 loss=1.4474 elapsed=1.8s


[Fold 2 seed2] Epoch 5 train_loss=1.4474 val_loss=4.3084 epoch_time=2.7s total=0.2m


  Saved best to model_ce_fold2_s1.pth


  ep6 step 24/24 lr=3.00e-03 loss=1.3688 elapsed=1.8s


[Fold 2 seed2] Epoch 6 train_loss=1.3688 val_loss=4.1181 epoch_time=2.7s total=0.3m


  Saved best to model_ce_fold2_s1.pth


  ep7 step 24/24 lr=2.99e-03 loss=1.2299 elapsed=1.8s


[Fold 2 seed2] Epoch 7 train_loss=1.2299 val_loss=3.9397 epoch_time=2.6s total=0.3m


  Saved best to model_ce_fold2_s1.pth


  ep8 step 24/24 lr=2.97e-03 loss=1.1676 elapsed=1.7s


[Fold 2 seed2] Epoch 8 train_loss=1.1676 val_loss=3.7722 epoch_time=2.6s total=0.4m


  Saved best to model_ce_fold2_s1.pth


  ep9 step 24/24 lr=2.94e-03 loss=1.0639 elapsed=1.8s


[Fold 2 seed2] Epoch 9 train_loss=1.0639 val_loss=3.6144 epoch_time=2.6s total=0.4m


  Saved best to model_ce_fold2_s1.pth


  ep10 step 24/24 lr=2.91e-03 loss=1.0345 elapsed=1.8s


[Fold 2 seed2] Epoch 10 train_loss=1.0345 val_loss=3.4615 epoch_time=2.6s total=0.5m


  Saved best to model_ce_fold2_s1.pth


  ep11 step 24/24 lr=2.87e-03 loss=1.0037 elapsed=1.8s


[Fold 2 seed2] Epoch 11 train_loss=1.0037 val_loss=3.3071 epoch_time=2.6s total=0.5m


  Saved best to model_ce_fold2_s1.pth


  ep12 step 24/24 lr=2.83e-03 loss=0.9477 elapsed=1.8s


[Fold 2 seed2] Epoch 12 train_loss=0.9477 val_loss=3.1638 epoch_time=2.7s total=0.6m


  Saved best to model_ce_fold2_s1.pth


  ep13 step 24/24 lr=2.78e-03 loss=0.9207 elapsed=1.8s


[Fold 2 seed2] Epoch 13 train_loss=0.9207 val_loss=3.0243 epoch_time=2.6s total=0.6m


  Saved best to model_ce_fold2_s1.pth


  ep14 step 24/24 lr=2.72e-03 loss=0.8855 elapsed=1.8s


[Fold 2 seed2] Epoch 14 train_loss=0.8855 val_loss=2.8873 epoch_time=2.6s total=0.6m


  Saved best to model_ce_fold2_s1.pth


  ep15 step 24/24 lr=2.66e-03 loss=0.8457 elapsed=1.8s


[Fold 2 seed2] Epoch 15 train_loss=0.8457 val_loss=2.7586 epoch_time=2.6s total=0.7m


  Saved best to model_ce_fold2_s1.pth


  ep16 step 24/24 lr=2.59e-03 loss=0.8414 elapsed=1.8s


[Fold 2 seed2] Epoch 16 train_loss=0.8414 val_loss=2.6417 epoch_time=2.6s total=0.7m


  Saved best to model_ce_fold2_s1.pth


  ep17 step 24/24 lr=2.51e-03 loss=0.8094 elapsed=1.7s


[Fold 2 seed2] Epoch 17 train_loss=0.8094 val_loss=2.5342 epoch_time=2.6s total=0.8m


  Saved best to model_ce_fold2_s1.pth


  ep18 step 24/24 lr=2.43e-03 loss=0.7856 elapsed=1.8s


[Fold 2 seed2] Epoch 18 train_loss=0.7856 val_loss=2.4377 epoch_time=2.7s total=0.8m


  Saved best to model_ce_fold2_s1.pth


  ep19 step 24/24 lr=2.35e-03 loss=0.8012 elapsed=1.8s


[Fold 2 seed2] Epoch 19 train_loss=0.8012 val_loss=2.3527 epoch_time=2.6s total=0.9m


  Saved best to model_ce_fold2_s1.pth


  ep20 step 24/24 lr=2.26e-03 loss=0.7446 elapsed=1.8s


[Fold 2 seed2] Epoch 20 train_loss=0.7446 val_loss=2.2797 epoch_time=2.6s total=0.9m


  Saved best to model_ce_fold2_s1.pth


  ep21 step 24/24 lr=2.17e-03 loss=0.7158 elapsed=1.8s


[Fold 2 seed2] Epoch 21 train_loss=0.7158 val_loss=2.2169 epoch_time=2.6s total=1.0m


  Saved best to model_ce_fold2_s1.pth


  ep22 step 24/24 lr=2.08e-03 loss=0.7068 elapsed=1.7s


[Fold 2 seed2] Epoch 22 train_loss=0.7068 val_loss=2.1629 epoch_time=2.6s total=1.0m


  Saved best to model_ce_fold2_s1.pth


  ep23 step 24/24 lr=1.98e-03 loss=0.6889 elapsed=1.8s


[Fold 2 seed2] Epoch 23 train_loss=0.6889 val_loss=2.1184 epoch_time=2.7s total=1.1m


  Saved best to model_ce_fold2_s1.pth


  ep24 step 24/24 lr=1.88e-03 loss=0.6853 elapsed=1.7s


[Fold 2 seed2] Epoch 24 train_loss=0.6853 val_loss=2.0802 epoch_time=2.6s total=1.1m


  Saved best to model_ce_fold2_s1.pth


  ep25 step 24/24 lr=1.78e-03 loss=0.6654 elapsed=1.8s


[Fold 2 seed2] Epoch 25 train_loss=0.6654 val_loss=2.0507 epoch_time=2.6s total=1.2m


  Saved best to model_ce_fold2_s1.pth


  ep26 step 24/24 lr=1.67e-03 loss=0.6536 elapsed=1.8s


[Fold 2 seed2] Epoch 26 train_loss=0.6536 val_loss=2.0244 epoch_time=2.6s total=1.2m


  Saved best to model_ce_fold2_s1.pth


  ep27 step 24/24 lr=1.57e-03 loss=0.6499 elapsed=1.8s


[Fold 2 seed2] Epoch 27 train_loss=0.6499 val_loss=2.0041 epoch_time=2.6s total=1.2m


  Saved best to model_ce_fold2_s1.pth


  ep28 step 24/24 lr=1.47e-03 loss=0.6342 elapsed=1.8s


[Fold 2 seed2] Epoch 28 train_loss=0.6342 val_loss=1.9867 epoch_time=2.6s total=1.3m


  Saved best to model_ce_fold2_s1.pth


  ep29 step 24/24 lr=1.36e-03 loss=0.6140 elapsed=1.8s


[Fold 2 seed2] Epoch 29 train_loss=0.6140 val_loss=1.9741 epoch_time=2.6s total=1.3m


  Saved best to model_ce_fold2_s1.pth


  ep30 step 24/24 lr=1.26e-03 loss=0.6084 elapsed=1.7s


[Fold 2 seed2] Epoch 30 train_loss=0.6084 val_loss=1.9643 epoch_time=2.6s total=1.4m


  Saved best to model_ce_fold2_s1.pth


  ep31 step 24/24 lr=1.16e-03 loss=0.5988 elapsed=1.7s


[Fold 2 seed2] Epoch 31 train_loss=0.5988 val_loss=1.9577 epoch_time=2.6s total=1.4m


  Saved best to model_ce_fold2_s1.pth


  ep32 step 24/24 lr=1.06e-03 loss=0.5895 elapsed=1.7s


[Fold 2 seed2] Epoch 32 train_loss=0.5895 val_loss=1.9521 epoch_time=2.6s total=1.5m


  Saved best to model_ce_fold2_s1.pth


  ep33 step 24/24 lr=9.63e-04 loss=0.5922 elapsed=2.0s


[Fold 2 seed2] Epoch 33 train_loss=0.5922 val_loss=1.9484 epoch_time=2.8s total=1.5m


  Saved best to model_ce_fold2_s1.pth


  ep34 step 24/24 lr=8.68e-04 loss=0.5791 elapsed=1.8s


[Fold 2 seed2] Epoch 34 train_loss=0.5791 val_loss=1.9461 epoch_time=2.6s total=1.6m


  Saved best to model_ce_fold2_s1.pth


  ep35 step 24/24 lr=7.76e-04 loss=0.5776 elapsed=1.8s


[Fold 2 seed2] Epoch 35 train_loss=0.5776 val_loss=1.9459 epoch_time=2.6s total=1.6m


  Saved best to model_ce_fold2_s1.pth


  ep36 step 24/24 lr=6.88e-04 loss=0.5689 elapsed=1.7s


[Fold 2 seed2] Epoch 36 train_loss=0.5689 val_loss=1.9461 epoch_time=2.6s total=1.7m


  ep37 step 24/24 lr=6.04e-04 loss=0.5641 elapsed=1.8s


[Fold 2 seed2] Epoch 37 train_loss=0.5641 val_loss=1.9471 epoch_time=2.6s total=1.7m


  ep38 step 24/24 lr=5.25e-04 loss=0.5647 elapsed=1.8s


[Fold 2 seed2] Epoch 38 train_loss=0.5647 val_loss=1.9496 epoch_time=2.6s total=1.8m


  ep39 step 24/24 lr=4.50e-04 loss=0.5580 elapsed=1.7s


[Fold 2 seed2] Epoch 39 train_loss=0.5580 val_loss=1.9522 epoch_time=2.6s total=1.8m


  ep40 step 24/24 lr=3.80e-04 loss=0.5526 elapsed=1.8s


[Fold 2 seed2] Epoch 40 train_loss=0.5526 val_loss=1.9557 epoch_time=2.6s total=1.8m


  Early stop at epoch 40


Fold 2 seed2 done. Best val CE=1.9459. Model -> model_ce_fold2_s1.pth
All folds (seed2) processed.


In [3]:
# Build 6-model CE ensemble: cache OOF for seed2, average OOF per fold, sweep decoder, infer test, write submission
import os, json, time, math
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('CUDA:', torch.cuda.is_available(), flush=True)

feat_tr_dir = Path('features3d_v2')/'train'
feat_te_dir = Path('features3d_v2')/'test'
lab_tr_dir  = Path('labels3d_v2')/'train'
probs_cache = Path('probs_cache'); probs_cache.mkdir(exist_ok=True)

folds = json.load(open('folds_archive_cv.json','r'))
train_df = pd.read_csv('training.csv')
id2seq = {int(r.Id): [int(x) for x in str(r.Sequence).strip().split()] for _, r in train_df.iterrows()}

def load_feat(split, sid:int):
    d = np.load((feat_tr_dir if split=='train' else feat_te_dir)/f"{sid}.npz");
    return d['X'].astype(np.float32)

def compute_fold_scaler(id_list):
    n = 0; mean=None; M2=None
    for sid in id_list:
        X = load_feat('train', int(sid)); n_i = X.shape[0]
        if mean is None:
            mean = X.mean(axis=0); M2 = ((X - mean)**2).sum(axis=0); n = n_i
        else:
            mean_i = X.mean(axis=0); n_new = n + n_i; delta = mean_i - mean
            mean = mean + delta * (n_i / max(1, n_new))
            M2 = M2 + ((X - mean_i)**2).sum(axis=0) + (delta**2) * (n * n_i / max(1, n_new)); n = n_new
    var = M2 / max(1, (n - 1)); std = np.sqrt(np.clip(var, 1e-8, None))
    return mean.astype(np.float32), std.astype(np.float32)

def compute_class_median_durations_for_ids(id_list):
    dur_by_c = {c: [] for c in range(1,21)}
    for sid in id_list:
        y = np.load(lab_tr_dir/f"{sid}.npy").astype(np.int16)
        for c in range(1,21):
            cnt = int((y==c).sum());
            if cnt>0: dur_by_c[c].append(cnt)
    med = {}
    for c in range(1,21):
        m = np.median(dur_by_c[c]) if len(dur_by_c[c])>0 else 13
        med[c] = int(np.clip(m, 9, 25))
    return med

class DilatedResBlock(nn.Module):
    def __init__(self, ch, dilation, drop=0.35, groups=8, k=3):
        super().__init__();
        self.conv1 = nn.Conv1d(ch, ch, k, padding=dilation, dilation=dilation)
        self.gn1 = nn.GroupNorm(groups, ch); self.drop = nn.Dropout(drop)
        self.conv2 = nn.Conv1d(ch, ch, 1); self.gn2 = nn.GroupNorm(groups, ch)
    def forward(self, x):
        h = self.conv1(x); h = self.gn1(h); h = F.relu(h, inplace=True); h = self.drop(h);
        h = self.conv2(h); h = self.gn2(h); h = F.relu(h, inplace=True);
        return x + h

class DilatedTCN(nn.Module):
    def __init__(self, d_in, channels=128, layers=12, num_classes=21, dropout=0.35):
        super().__init__();
        self.inp = nn.Conv1d(d_in, channels, 1); blocks=[]; dil=1
        for _ in range(layers):
            blocks.append(DilatedResBlock(channels, dil, drop=dropout, groups=8, k=3));
            dil = min(dil*2, 512)
        self.blocks = nn.ModuleList(blocks); self.head = nn.Conv1d(channels, num_classes, 1)
    def forward(self, x_b_t_d):
        x = x_b_t_d.transpose(1,2); h = self.inp(x);
        for b in self.blocks: h = b(h);
        out = self.head(h); return out.transpose(1,2)

def time_warp_probs(p_t_c: torch.Tensor, factor: float) -> torch.Tensor:
    T, C = p_t_c.shape; tgt_len = max(1, int(round(T*factor)));
    x = p_t_c.T.unsqueeze(0);
    y = F.interpolate(x, size=tgt_len, mode='linear', align_corners=False);
    y2 = F.interpolate(y, size=T, mode='linear', align_corners=False)[0].T;
    return y2 / (y2.sum(dim=-1, keepdim=True) + 1e-8)

def apply_tta_timewarp(p_t_c: torch.Tensor, factors=(0.9,1.0,1.1)) -> torch.Tensor:
    acc=None
    for s in factors:
        ps = time_warp_probs(p_t_c, s);
        acc = ps if acc is None else (acc + ps)
    out = acc / float(len(factors))
    return out / (out.sum(dim=-1, keepdim=True) + 1e-8)

def avg_pool_probs(p_t_c: torch.Tensor, k:int) -> torch.Tensor:
    x = p_t_c.unsqueeze(0).transpose(1,2);
    y = F.avg_pool1d(x, kernel_size=k, stride=1, padding=k//2);
    return y.transpose(1,2).squeeze(0)

def duration_integral_single(p_t: torch.Tensor, k:int) -> torch.Tensor:
    k = max(1, int(k)); x = p_t.view(1,1,-1);
    w = torch.ones(1,1,k, device=p_t.device, dtype=p_t.dtype) / float(k);
    pad = (k-1)//2; y = F.conv1d(x, w, padding=pad).view(-1);
    T = p_t.shape[0]
    if y.shape[0] < T: y = F.pad(y, (0, T - y.shape[0]))
    elif y.shape[0] > T: y = y[:T]
    return y

def refine_com(p: torch.Tensor, t_star:int, w:int=5) -> float:
    T = p.shape[0]; a = max(0, t_star - w); b = min(T-1, t_star + w);
    idx = torch.arange(a, b+1, device=p.device, dtype=p.dtype);
    seg = p[a:b+1]; s = seg.sum() + 1e-8;
    return float(((idx * seg).sum() / s).item())

def decode_peaks(p_t_c: torch.Tensor, med_k: dict, gamma: float = 1.0, pool_k=13, temp=0.9):
    if temp != 1.0:
        p_t_c = (p_t_c ** (1.0/temp)); p_t_c = p_t_c / (p_t_c.sum(dim=-1, keepdim=True) + 1e-8)
    p_s = avg_pool_probs(p_t_c, k=pool_k); T, C = p_s.shape;
    scores = torch.empty_like(p_s); ks=[13]*C
    for c in range(C):
        if c==0: scores[:,c]=p_s[:,c]; ks[c]=13; continue
        base_k = med_k.get(c, 13); k = int(np.clip(round(gamma * base_k), 9, 25));
        if k % 2 == 0: k = min(25, k + 1); ks[c]=k;
        scores[:,c] = duration_integral_single(p_s[:,c], k=k)
    peaks=[]
    for c in range(1,21):
        k=ks[c]; w_com = max(5, k//3); radius = max(10, k//2); s=scores[:,c];
        t_star = int(torch.argmax(s).item()); t_ref = refine_com(p_s[:,c], t_star, w=w_com);
        t_idx = int(round(max(0, min(t_ref, T-1))));
        local_mean = p_s[max(0,t_idx-radius):min(T,t_idx+radius+1), c].mean().item();
        pooled_at_ref = p_s[t_idx, c].item();
        peaks.append([c, t_ref, float(scores[t_idx,c].item()), float(local_mean), float(pooled_at_ref)])
    peaks.sort(key=lambda x: (x[1], -x[2], -x[3], -x[4]));
    last_t = -1e9
    for i in range(len(peaks)):
        if peaks[i][1] <= last_t: peaks[i][1] = last_t + 2.0;
        last_t = min(peaks[i][1], float(T-1))
    return [int(c) for c,_,_,_,_ in peaks]

def levenshtein(a,b):
    n,m=len(a),len(b)
    if n==0: return m
    if m==0: return n
    dp=list(range(m+1))
    for i in range(1,n+1):
        prev=dp[0]; dp[0]=i; ai=a[i-1]
        for j in range(1,m+1):
            tmp=dp[j]; dp[j]=min(dp[j]+1, dp[j-1]+1, prev + (0 if ai==b[j-1] else 1)); prev=tmp
    return dp[m]

# 1) Cache OOF for seed2: per-fold model on its own val_ids, TTA=(0.9,1.0,1.1); save as {sid}_ce_new_s1.npy
def cache_fold_val_probs_seed2(fold):
    fold_idx = int(fold['fold'])
    ckpt = Path(f"model_ce_fold{fold_idx}_s1.pth");
    assert ckpt.exists(), f"Missing {ckpt}; train seed2 models first"
    D_in = np.load(next(iter((feat_tr_dir).glob('*.npz'))))['X'].shape[1]
    model = DilatedTCN(d_in=D_in, channels=128, layers=12, num_classes=21, dropout=0.35).to(device)
    model.load_state_dict(torch.load(ckpt, map_location=device)); model.eval()
    mean,std = compute_fold_scaler(fold['train_ids'])
    mean_t = torch.from_numpy(mean).float().to(device); std_t = torch.from_numpy(std).float().to(device)
    vids = fold['val_ids']; t0=time.time()
    for i, sid in enumerate(vids, 1):
        sid=int(sid); outp = probs_cache/f"{sid}_ce_new_s1.npy"
        if outp.exists():
            if (i%25)==0 or i==len(vids):
                print(f"  [fold {fold_idx} s1] cached {i}/{len(vids)} elapsed {time.time()-t0:.1f}s", flush=True)
            continue
        X = load_feat('train', sid); xb = torch.from_numpy(X).float().to(device);
        xb = (xb - mean_t) / (std_t + 1e-6); xb = xb.unsqueeze(0)
        with torch.no_grad(), torch.amp.autocast('cuda' if device.type=='cuda' else 'cpu'):
            probs = model(xb)[0].softmax(dim=-1);
            probs = apply_tta_timewarp(probs, factors=(0.9,1.0,1.1))
        np.save(outp, probs.cpu().numpy())
        if (i%25)==0 or i==len(vids):
            print(f"  [fold {fold_idx} s1] cached {i}/{len(vids)} elapsed {time.time()-t0:.1f}s", flush=True)

def load_cached_prob_seed(sid:int, seed:int):
    if seed==0:
        return torch.from_numpy(np.load(probs_cache/f"{sid}_ce_new.npy")).to(device)
    else:
        return torch.from_numpy(np.load(probs_cache/f"{sid}_ce_new_s1.npy")).to(device)

print('Caching OOF probs for seed2...', flush=True)
for f in folds: cache_fold_val_probs_seed2(f)

# 2) Sweep decoder on averaged OOF (seed0+seed1) with per-fold priors; select by worst-fold then mean
pool_ks=[11,13,15]; temps=[0.90,0.95,1.00]; gammas=[0.90,0.95,0.975,1.00,1.025,1.05]
med_cache={}
def eval_cfg_on_fold_avg(fold, pool_k, temp, gamma):
    fi = int(fold['fold'])
    if fi not in med_cache: med_cache[fi] = compute_class_median_durations_for_ids(fold['train_ids'])
    med_k = med_cache[fi]
    vids = fold['val_ids']; tot=0; cnt=0
    for sid in vids:
        p0 = load_cached_prob_seed(int(sid), 0); p1 = load_cached_prob_seed(int(sid), 1);
        p = (p0 + p1) * 0.5; p = p / (p.sum(dim=-1, keepdim=True) + 1e-8)
        seq = decode_peaks(p, med_k=med_k, gamma=gamma, pool_k=pool_k, temp=temp)
        tot += levenshtein(seq, id2seq[int(sid)]); cnt += 1
    return tot/max(cnt,1)

res=[]
for pool_k in pool_ks:
    for temp in temps:
        for gamma in gammas:
            per_fold=[]
            for f in folds:
                lev = eval_cfg_on_fold_avg(f, pool_k, temp, gamma); per_fold.append(lev)
            res.append((np.mean(per_fold), np.max(per_fold), {'pool_k':pool_k, 'temp':temp, 'gamma':gamma}))
res.sort(key=lambda x: (x[1], x[0]))
print('Top CE-only 6x (avg OOF):')
for r in res[:5]: print(r)
pd.DataFrame([{'mean':m,'worst':w, **cfg} for m,w,cfg in res]).to_csv('cv_sweep_ce_6x.csv', index=False)
print('Saved cv_sweep_ce_6x.csv', flush=True)

# 3) Test inference: load all 6 CE models; per-model standardize with its fold scaler; TTA=(0.9,1.0,1.1); average probs; decode with best cfg
print('Building CE 6-model ensemble test submission...', flush=True)
test_ids = pd.read_csv('test.csv')['Id'].astype(int).tolist()
cfg_best = pd.read_csv('cv_sweep_ce_6x.csv').sort_values(['worst','mean']).iloc[0].to_dict() if Path('cv_sweep_ce_6x.csv').exists() else {'pool_k':13,'temp':0.95,'gamma':1.0}
pool_k=int(cfg_best['pool_k']); temp=float(cfg_best['temp']); gamma=float(cfg_best.get('gamma',1.0))
med_k_test = compute_class_median_durations_for_ids(pd.read_csv('training.csv')['Id'].astype(int).tolist())

D_in = np.load(next(iter((feat_tr_dir).glob('*.npz'))))['X'].shape[1]
models=[]; scalers=[]
for fi in range(3):
    for s in (0,1):
        ckpt = Path(f"model_ce_fold{fi}{'_s1' if s==1 else ''}.pth");
        if not ckpt.exists():
            print(f"WARNING: missing {ckpt}; skipping")
            continue
        m = DilatedTCN(d_in=D_in, channels=128, layers=12, num_classes=21, dropout=0.35).to(device)
        m.load_state_dict(torch.load(ckpt, map_location=device)); m.eval(); models.append(m)
        mean,std = compute_class_median_durations_for_ids([]), None  # placeholder to keep scope
    # per-fold scaler computed once and reused for both seeds
for fi in range(3):
    mean,std = compute_fold_scaler(folds[fi]['train_ids'])
    scalers.append((torch.from_numpy(mean).float().to(device), torch.from_numpy(std).float().to(device)))

rows=[]; t0=time.time()
for i, sid in enumerate(test_ids, 1):
    X = load_feat('test', int(sid));
    acc=None
    with torch.no_grad(), torch.amp.autocast('cuda' if device.type=='cuda' else 'cpu'):
        mi = 0
        for fi in range(3):
            mean_t, std_t = scalers[fi]
            for s in (0,1):
                ckpt = Path(f"model_ce_fold{fi}{'_s1' if s==1 else ''}.pth")
                if not ckpt.exists():
                    continue
                # lazy load per model to keep VRAM low
                m = DilatedTCN(d_in=D_in, channels=128, layers=12, num_classes=21, dropout=0.35).to(device)
                m.load_state_dict(torch.load(ckpt, map_location=device)); m.eval()
                xb = torch.from_numpy(X).float().to(device);
                xb = (xb - mean_t) / (std_t + 1e-6); xb = xb.unsqueeze(0);
                p = m(xb)[0].softmax(dim=-1);
                p = apply_tta_timewarp(p, factors=(0.9,1.0,1.1));
                acc = p if acc is None else (acc + p)
                del m
        probs = acc / float(6); probs = probs / (probs.sum(dim=-1, keepdim=True) + 1e-8)
    seq = decode_peaks(probs, med_k=med_k_test, gamma=gamma, pool_k=pool_k, temp=temp)
    rows.append({'Id': int(sid), 'Sequence': ' '.join(str(x) for x in seq)})
    if (i%10)==0 or i==len(test_ids):
        print(f"  [infer CE-6x] {i}/{len(test_ids)} elapsed={(time.time()-t0)/60:.1f}m", flush=True)

sub = pd.DataFrame(rows, columns=['Id','Sequence'])
assert len(sub)==95
assert all(len(s.split())==20 and len(set(s.split()))==20 and all(1<=int(t)<=20 for t in s.split()) for s in sub.Sequence), 'Submission format invalid'
sub.to_csv('submission_primary_ce_6x.csv', index=False);
sub.to_csv('submission.csv', index=False)
print('Wrote submission_primary_ce_6x.csv and submission.csv; head:\n', sub.head(), flush=True)

CUDA: True


Caching OOF probs for seed2...


  model.load_state_dict(torch.load(ckpt, map_location=device)); model.eval()


  [fold 0 s1] cached 25/98 elapsed 0.2s


  [fold 0 s1] cached 50/98 elapsed 0.5s


  [fold 0 s1] cached 75/98 elapsed 0.7s


  [fold 0 s1] cached 98/98 elapsed 1.0s


  [fold 1 s1] cached 25/99 elapsed 0.2s


  [fold 1 s1] cached 50/99 elapsed 0.5s


  [fold 1 s1] cached 75/99 elapsed 0.7s


  [fold 1 s1] cached 99/99 elapsed 0.9s


  [fold 2 s1] cached 25/100 elapsed 0.2s


  [fold 2 s1] cached 50/100 elapsed 0.5s


  [fold 2 s1] cached 75/100 elapsed 0.7s


  [fold 2 s1] cached 100/100 elapsed 0.9s


Top CE-only 6x (avg OOF):
(4.469703841132413, 5.1, {'pool_k': 11, 'temp': 0.9, 'gamma': 0.95})
(4.469703841132413, 5.1, {'pool_k': 11, 'temp': 0.9, 'gamma': 0.975})
(4.469703841132413, 5.1, {'pool_k': 11, 'temp': 0.9, 'gamma': 1.0})
(4.469703841132413, 5.1, {'pool_k': 11, 'temp': 0.9, 'gamma': 1.025})
(4.469703841132413, 5.1, {'pool_k': 11, 'temp': 0.9, 'gamma': 1.05})
Saved cv_sweep_ce_6x.csv


Building CE 6-model ensemble test submission...


  m.load_state_dict(torch.load(ckpt, map_location=device)); m.eval(); models.append(m)


  m.load_state_dict(torch.load(ckpt, map_location=device)); m.eval()


  [infer CE-6x] 10/95 elapsed=0.0m


  [infer CE-6x] 20/95 elapsed=0.0m


  [infer CE-6x] 30/95 elapsed=0.1m


  [infer CE-6x] 40/95 elapsed=0.1m


  [infer CE-6x] 50/95 elapsed=0.1m


  [infer CE-6x] 60/95 elapsed=0.1m


  [infer CE-6x] 70/95 elapsed=0.2m


  [infer CE-6x] 80/95 elapsed=0.2m


  [infer CE-6x] 90/95 elapsed=0.2m


  [infer CE-6x] 95/95 elapsed=0.2m


Wrote submission_primary_ce_6x.csv and submission.csv; head:
     Id                                           Sequence
0  300  5 9 1 2 18 3 8 4 20 13 12 15 14 11 6 16 19 7 1...
1  301  10 12 1 5 4 20 6 2 11 15 13 19 7 9 8 18 14 3 1...
2  302  1 17 16 12 5 19 7 13 20 18 11 3 4 6 15 8 14 10...
3  303  13 4 12 1 10 14 5 19 15 20 17 11 16 8 18 7 3 6...
4  304  8 1 12 14 18 13 9 7 2 11 3 20 19 5 10 6 15 17 ...


In [4]:
# Improved decoder + re-sweep on averaged OOF (6x CE) + rebuild submission (no retrain)
import os, json, time, math
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('CUDA:', torch.cuda.is_available(), flush=True)

feat_tr_dir = Path('features3d_v2')/'train'
feat_te_dir = Path('features3d_v2')/'test'
lab_tr_dir  = Path('labels3d_v2')/'train'
probs_cache = Path('probs_cache')
folds = json.load(open('folds_archive_cv.json','r'))
train_df = pd.read_csv('training.csv')
id2seq = {int(r.Id): [int(x) for x in str(r.Sequence).strip().split()] for _, r in train_df.iterrows()}

def load_feat(split, sid:int):
    d = np.load((feat_tr_dir if split=='train' else feat_te_dir)/f"{sid}.npz");
    return d['X'].astype(np.float32)

def compute_class_median_durations_for_ids(id_list):
    dur_by_c = {c: [] for c in range(1,21)}
    for sid in id_list:
        y = np.load(lab_tr_dir/f"{sid}.npy").astype(np.int16)
        for c in range(1,21):
            cnt = int((y==c).sum());
            if cnt>0: dur_by_c[c].append(cnt)
    med = {}
    for c in range(1,21):
        m = np.median(dur_by_c[c]) if len(dur_by_c[c])>0 else 13
        med[c] = int(np.clip(m, 9, 25))
    return med

def time_warp_probs(p_t_c: torch.Tensor, factor: float) -> torch.Tensor:
    T, C = p_t_c.shape; tgt_len = max(1, int(round(T*factor)));
    x = p_t_c.T.unsqueeze(0);
    y = F.interpolate(x, size=tgt_len, mode='linear', align_corners=False);
    y2 = F.interpolate(y, size=T, mode='linear', align_corners=False)[0].T;
    return y2 / (y2.sum(dim=-1, keepdim=True) + 1e-8)

def apply_tta_timewarp(p_t_c: torch.Tensor, factors=(0.9,1.0,1.1)) -> torch.Tensor:
    acc=None
    for s in factors:
        ps = time_warp_probs(p_t_c, s);
        acc = ps if acc is None else (acc + ps)
    out = acc / float(len(factors))
    return out / (out.sum(dim=-1, keepdim=True) + 1e-8)

def avg_pool_probs(p_t_c: torch.Tensor, k:int) -> torch.Tensor:
    x = p_t_c.unsqueeze(0).transpose(1,2);
    y = F.avg_pool1d(x, kernel_size=k, stride=1, padding=k//2);
    return y.transpose(1,2).squeeze(0)

def duration_integral_single(p_t: torch.Tensor, k:int) -> torch.Tensor:
    k = max(1, int(k)); x = p_t.view(1,1,-1);
    w = torch.ones(1,1,k, device=p_t.device, dtype=p_t.dtype) / float(k);
    pad = (k-1)//2; y = F.conv1d(x, w, padding=pad).view(-1);
    T = p_t.shape[0]
    if y.shape[0] < T: y = F.pad(y, (0, T - y.shape[0]))
    elif y.shape[0] > T: y = y[:T]
    return y

def levenshtein(a,b):
    n,m=len(a),len(b)
    if n==0: return m
    if m==0: return n
    dp=list(range(m+1))
    for i in range(1,n+1):
        prev=dp[0]; dp[0]=i; ai=a[i-1]
        for j in range(1,m+1):
            tmp=dp[j]; dp[j]=min(dp[j]+1, dp[j-1]+1, prev + (0 if ai==b[j-1] else 1)); prev=tmp
    return dp[m]

# Improved decoder pieces
def topk_candidates_per_class(p_s: torch.Tensor, scores: torch.Tensor, c: int, k_c: int, temp: float, K: int = 3):
    # return up to K candidate (time_refined, score_tuple) for class c
    T = p_s.shape[0]
    s = scores[:, c]
    vals, idxs = torch.topk(s, k=min(K, T))
    cand = []
    w_com = max(5, k_c//3); radius = max(10, k_c//2)
    for v, t_star in zip(vals.tolist(), idxs.tolist()):
        t_ref = refine_com(p_s[:,c], int(t_star), w=w_com)
        t_idx = int(round(max(0, min(t_ref, T-1))))
        local_mean = p_s[max(0,t_idx-radius):min(T,t_idx+radius+1), c].mean().item()
        pooled_at_ref = p_s[t_idx, c].item()
        cand.append((t_ref, float(v), float(local_mean), float(pooled_at_ref)))
    # sort by refined time, then score desc, then local stats desc
    cand.sort(key=lambda x: (x[0], -x[1], -x[2], -x[3]))
    return cand

def decode_peaks_improved(p_t_c: torch.Tensor, med_k: dict, gamma: float = 1.0, pool_k=13, temp=0.9, min_sep=2, K=3, k_delta=4):
    # temperature calibration
    if temp != 1.0:
        p_t_c = (p_t_c ** (1.0/temp)); p_t_c = p_t_c / (p_t_c.sum(dim=-1, keepdim=True) + 1e-8)
    # base smoothing
    p_s = avg_pool_probs(p_t_c, k=pool_k); T, C = p_s.shape
    # compute per-class kernel k_c and multi-scale duration integrals
    scores = torch.zeros_like(p_s)
    ks = [13]*C
    for c in range(C):
        if c == 0:
            scores[:, c] = p_s[:, c]; ks[c]=13; continue
        base_k = med_k.get(c, 13)
        k_c = int(np.clip(round(gamma * base_k), 9, 25))
        if k_c % 2 == 0: k_c = min(25, k_c + 1)
        ks[c] = k_c
        # multi-scale
        ks_multi = sorted(set([int(np.clip(k_c - k_delta, 9, 25)), k_c, int(np.clip(k_c + k_delta, 9, 25))]))
        ks_multi = [k if (k % 2)==1 else min(25, k+1) for k in ks_multi]
        acc=None
        for k in ks_multi:
            di = duration_integral_single(p_s[:, c], k=k).unsqueeze(1)
            acc = di if acc is None else (acc + di)
        scores[:, c] = (acc / float(len(ks_multi))).squeeze(1)
    # build candidate list per class
    all_cand = []
    for c in range(1,21):
        cand = topk_candidates_per_class(p_s, scores, c, ks[c], temp=temp, K=K)
        if len(cand)==0:
            all_cand.append((c, 0.0, -1e9, -1e9, -1e9))
        else:
            # pick best candidate per class after monotonic assignment below; store all
            for (t_ref, v, lm, pr) in cand:
                all_cand.append((c, t_ref, v, lm, pr))
    # monotonic assignment: one slot per class, increasing times with min separation
    # greedy: sort by time, then score desc; enforce separation
    all_cand.sort(key=lambda x: (x[1], -x[2], -x[3], -x[4]))
    chosen = {}  # class -> (t_ref, scores)
    last_t = -1e9
    for c, t_ref, v, lm, pr in all_cand:
        if c in chosen:
            continue
        if t_ref <= last_t + float(min_sep):
            # shift forward minimally
            t_ref = last_t + float(min_sep)
        last_t = min(t_ref, float(T-1))
        chosen[c] = (last_t, v, lm, pr)
        if len(chosen)==20:
            break
    # ensure all classes present (fallback times spaced if missing)
    if len(chosen) < 20:
        missing = [c for c in range(1,21) if c not in chosen]
        t = max(last_t, 0.0)
        for c in missing:
            t = min(t + float(min_sep), float(T-1))
            chosen[c] = (t, -1e9, -1e9, -1e9)
    # sort by time to produce final sequence
    seq = [c for c,_ in sorted(chosen.items(), key=lambda kv: kv[1][0])]
    return seq

# Per-sample gamma scaling based on length ratio
def gamma_with_length(gamma_cv: float, T: int, med_k: dict):
    L_est = float(sum(med_k.get(c,13) for c in range(1,21)))
    if L_est <= 0:
        return gamma_cv
    ratio = float(T) / L_est
    gamma_s = float(np.clip(ratio, 0.85, 1.15))
    return float(gamma_cv * gamma_s)

# Load averaged OOF (seed0+seed1) for each fold's val ids
def load_oof_avg_for_id(sid:int):
    p0 = torch.from_numpy(np.load(probs_cache/f"{sid}_ce_new.npy")).to(device)
    p1 = torch.from_numpy(np.load(probs_cache/f"{sid}_ce_new_s1.npy")).to(device)
    p = (p0 + p1) * 0.5
    return p / (p.sum(dim=-1, keepdim=True) + 1e-8)

# Sweep over improved decoder params on OOF avg
pool_ks=[11,13,15]; temps=[0.90,0.95,1.00]; gammas=[0.90,0.95,0.975,1.00,1.025,1.05]; seps=[2,3,4]
print('Sweeping improved decoder on averaged OOF...', flush=True)
med_cache={}
def eval_cfg_on_fold_improved(fold, pool_k, temp, gamma, sep):
    fi = int(fold['fold'])
    if fi not in med_cache: med_cache[fi] = compute_class_median_durations_for_ids(fold['train_ids'])
    med_k = med_cache[fi]
    vids = fold['val_ids']; tot=0; cnt=0
    for sid in vids:
        sid = int(sid)
        p = load_oof_avg_for_id(sid)
        T = p.shape[0]
        gamma_eff = gamma_with_length(gamma, T, med_k)
        seq = decode_peaks_improved(p, med_k=med_k, gamma=gamma_eff, pool_k=pool_k, temp=temp, min_sep=sep, K=3, k_delta=4)
        tot += levenshtein(seq, id2seq[sid]); cnt += 1
    return tot/max(cnt,1)

res=[]
for pool_k in pool_ks:
    for temp in temps:
        for gamma in gammas:
            for sep in seps:
                per_fold=[]
                for f in folds:
                    lev = eval_cfg_on_fold_improved(f, pool_k, temp, gamma, sep); per_fold.append(lev)
                res.append((np.mean(per_fold), np.max(per_fold), {'pool_k':pool_k, 'temp':temp, 'gamma':gamma, 'sep':sep}))
res.sort(key=lambda x: (x[1], x[0]))
print('Top improved decoder (mean, worst, cfg):')
for r in res[:5]: print(r)
pd.DataFrame([{'mean':m,'worst':w, **cfg} for m,w,cfg in res]).to_csv('cv_sweep_ce_6x_improved.csv', index=False)
print('Saved cv_sweep_ce_6x_improved.csv', flush=True)

# Test-time: load 6 CE models lazily, standardize per-fold, TTA=(0.9,1.0,1.1), improved decoder with gamma length scaling
print('Building CE 6-model ensemble test submission (improved decoder)...', flush=True)
test_ids = pd.read_csv('test.csv')['Id'].astype(int).tolist()
cfg_best = pd.read_csv('cv_sweep_ce_6x_improved.csv').sort_values(['worst','mean']).iloc[0].to_dict() if Path('cv_sweep_ce_6x_improved.csv').exists() else {'pool_k':13,'temp':0.95,'gamma':1.0,'sep':2}
pool_k=int(cfg_best['pool_k']); temp=float(cfg_best['temp']); gamma=float(cfg_best.get('gamma',1.0)); sep=int(cfg_best.get('sep',2))
med_k_train_all = compute_class_median_durations_for_ids(pd.read_csv('training.csv')['Id'].astype(int).tolist())

def compute_fold_scaler(id_list):
    n = 0; mean=None; M2=None
    for sid in id_list:
        X = load_feat('train', int(sid)); n_i = X.shape[0]
        if mean is None:
            mean = X.mean(axis=0); M2 = ((X - mean)**2).sum(axis=0); n = n_i
        else:
            mean_i = X.mean(axis=0); n_new = n + n_i; delta = mean_i - mean
            mean = mean + delta * (n_i / max(1, n_new))
            M2 = M2 + ((X - mean_i)**2).sum(axis=0) + (delta**2) * (n * n_i / max(1, n_new)); n = n_new
    var = M2 / max(1, (n - 1)); std = np.sqrt(np.clip(var, 1e-8, None))
    return mean.astype(np.float32), std.astype(np.float32)

D_in = np.load(next(iter((feat_tr_dir).glob('*.npz'))))['X'].shape[1]
def load_fold_scalers():
    scalers=[]
    for fi in range(3):
        mean,std = compute_fold_scaler(folds[fi]['train_ids'])
        scalers.append((torch.from_numpy(mean).float().to(device), torch.from_numpy(std).float().to(device)))
    return scalers

class DilatedResBlock(nn.Module):
    def __init__(self, ch, dilation, drop=0.35, groups=8, k=3):
        super().__init__();
        self.conv1 = nn.Conv1d(ch, ch, k, padding=dilation, dilation=dilation)
        self.gn1 = nn.GroupNorm(groups, ch); self.drop = nn.Dropout(drop)
        self.conv2 = nn.Conv1d(ch, ch, 1); self.gn2 = nn.GroupNorm(groups, ch)
    def forward(self, x):
        h = self.conv1(x); h = self.gn1(h); h = F.relu(h, inplace=True); h = self.drop(h);
        h = self.conv2(h); h = self.gn2(h); h = F.relu(h, inplace=True);
        return x + h

class DilatedTCN(nn.Module):
    def __init__(self, d_in, channels=128, layers=12, num_classes=21, dropout=0.35):
        super().__init__();
        self.inp = nn.Conv1d(d_in, channels, 1); blocks=[]; dil=1
        for _ in range(layers):
            blocks.append(DilatedResBlock(channels, dil, drop=dropout, groups=8, k=3));
            dil = min(dil*2, 512)
        self.blocks = nn.ModuleList(blocks); self.head = nn.Conv1d(channels, num_classes, 1)
    def forward(self, x_b_t_d):
        x = x_b_t_d.transpose(1,2); h = self.inp(x);
        for b in self.blocks: h = b(h);
        out = self.head(h); return out.transpose(1,2)

scalers = load_fold_scalers()
rows=[]; t0=time.time()
for i, sid in enumerate(test_ids, 1):
    X = load_feat('test', int(sid));
    T = X.shape[0]
    acc=None
    with torch.no_grad(), torch.amp.autocast('cuda' if device.type=='cuda' else 'cpu'):
        for fi in range(3):
            mean_t, std_t = scalers[fi]
            for s in (0,1):
                ckpt = Path(f"model_ce_fold{fi}{'_s1' if s==1 else ''}.pth")
                if not ckpt.exists():
                    continue
                m = DilatedTCN(d_in=D_in, channels=128, layers=12, num_classes=21, dropout=0.35).to(device)
                m.load_state_dict(torch.load(ckpt, map_location=device)); m.eval()
                xb = torch.from_numpy(X).float().to(device);
                xb = (xb - mean_t) / (std_t + 1e-6); xb = xb.unsqueeze(0);
                p = m(xb)[0].softmax(dim=-1);
                p = apply_tta_timewarp(p, factors=(0.9,1.0,1.1));
                acc = p if acc is None else (acc + p)
                del m
        probs = acc / float(6); probs = probs / (probs.sum(dim=-1, keepdim=True) + 1e-8)
    gamma_eff = gamma_with_length(gamma, T, med_k_train_all)
    seq = decode_peaks_improved(probs, med_k=med_k_train_all, gamma=gamma_eff, pool_k=pool_k, temp=temp, min_sep=sep, K=3, k_delta=4)
    rows.append({'Id': int(sid), 'Sequence': ' '.join(str(x) for x in seq)})
    if (i%10)==0 or i==len(test_ids):
        print(f"  [infer CE-6x improved] {i}/{len(test_ids)} elapsed={(time.time()-t0)/60:.1f}m", flush=True)

sub = pd.DataFrame(rows, columns=['Id','Sequence'])
assert len(sub)==95
assert all(len(s.split())==20 and len(set(s.split()))==20 and all(1<=int(t)<=20 for t in s.split()) for s in sub.Sequence), 'Submission format invalid'
sub.to_csv('submission_primary_ce_6x_v2.csv', index=False)
sub.to_csv('submission.csv', index=False)
print('Wrote submission_primary_ce_6x_v2.csv and submission.csv; head:\n', sub.head(), flush=True)

CUDA: True


Sweeping improved decoder on averaged OOF...


Top improved decoder (mean, worst, cfg):
(4.4864048649762935, 5.15, {'pool_k': 15, 'temp': 0.9, 'gamma': 0.9, 'sep': 2})
(4.4864048649762935, 5.15, {'pool_k': 15, 'temp': 0.9, 'gamma': 0.9, 'sep': 3})
(4.4864048649762935, 5.15, {'pool_k': 15, 'temp': 0.9, 'gamma': 0.9, 'sep': 4})
(4.4864048649762935, 5.15, {'pool_k': 15, 'temp': 0.9, 'gamma': 0.95, 'sep': 2})
(4.4864048649762935, 5.15, {'pool_k': 15, 'temp': 0.9, 'gamma': 0.95, 'sep': 3})
Saved cv_sweep_ce_6x_improved.csv


Building CE 6-model ensemble test submission (improved decoder)...


  m.load_state_dict(torch.load(ckpt, map_location=device)); m.eval()


  [infer CE-6x improved] 10/95 elapsed=0.0m


  [infer CE-6x improved] 20/95 elapsed=0.1m


  [infer CE-6x improved] 30/95 elapsed=0.1m


  [infer CE-6x improved] 40/95 elapsed=0.1m


  [infer CE-6x improved] 50/95 elapsed=0.1m


  [infer CE-6x improved] 60/95 elapsed=0.2m


  [infer CE-6x improved] 70/95 elapsed=0.2m


  [infer CE-6x improved] 80/95 elapsed=0.2m


  [infer CE-6x improved] 90/95 elapsed=0.2m


  [infer CE-6x improved] 95/95 elapsed=0.2m


Wrote submission_primary_ce_6x_v2.csv and submission.csv; head:
     Id                                           Sequence
0  300  5 9 1 2 18 3 8 4 20 13 12 15 14 11 6 16 19 7 1...
1  301  10 12 1 5 4 20 6 2 11 15 13 19 7 9 8 18 14 3 1...
2  302  1 17 16 12 5 19 7 13 20 18 11 3 4 6 15 8 14 10...
3  303  13 4 12 1 10 14 5 19 15 20 17 11 16 8 18 7 3 6...
4  304  8 1 12 14 18 13 9 7 2 11 3 20 19 5 10 6 15 17 ...


In [5]:
# Decoder sanity checks: (A) OOF debug on a few samples, (B) Oracle one-hot GT probs test
import json, numpy as np, pandas as pd, torch, torch.nn.functional as F
from pathlib import Path

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
probs_cache = Path('probs_cache')
folds = json.load(open('folds_archive_cv.json','r'))
train_df = pd.read_csv('training.csv')
id2seq = {int(r.Id): [int(x) for x in str(r.Sequence).strip().split()] for _, r in train_df.iterrows()}

# Reuse helpers from earlier cells if present; otherwise minimal fallbacks
def load_oof_avg_for_id(sid:int):
    p0 = torch.from_numpy(np.load(probs_cache/f"{sid}_ce_new.npy")).to(device)
    p1 = torch.from_numpy(np.load(probs_cache/f"{sid}_ce_new_s1.npy")).to(device)
    p = (p0 + p1) * 0.5
    return p / (p.sum(dim=-1, keepdim=True) + 1e-8)

def compute_class_median_durations_for_ids(id_list):
    lab_tr_dir = Path('labels3d_v2')/'train'
    dur_by_c = {c: [] for c in range(1,21)}
    for sid in id_list:
        y = np.load(lab_tr_dir/f"{sid}.npy").astype(np.int16)
        for c in range(1,21):
            cnt = int((y==c).sum());
            if cnt>0: dur_by_c[c].append(cnt)
    med = {}
    for c in range(1,21):
        m = np.median(dur_by_c[c]) if len(dur_by_c[c])>0 else 13
        med[c] = int(np.clip(m, 9, 25))
    return med

# If decode_peaks_improved/gamma_with_length not in globals (e.g., fresh kernel), import from Cell 10 context by redefining minimal versions
if 'decode_peaks_improved' not in globals():
    # Minimal dependencies
    def avg_pool_probs(p_t_c: torch.Tensor, k:int) -> torch.Tensor:
        x = p_t_c.unsqueeze(0).transpose(1,2); y = F.avg_pool1d(x, kernel_size=k, stride=1, padding=k//2);
        return y.transpose(1,2).squeeze(0)
    def duration_integral_single(p_t: torch.Tensor, k:int) -> torch.Tensor:
        k = max(1, int(k)); x = p_t.view(1,1,-1); w = torch.ones(1,1,k, device=p_t.device, dtype=p_t.dtype) / float(k);
        pad = (k-1)//2; y = F.conv1d(x, w, padding=pad).view(-1); T = p_t.shape[0]
        if y.shape[0] < T: y = F.pad(y, (0, T - y.shape[0]));
        elif y.shape[0] > T: y = y[:T];
        return y
    def refine_com(p: torch.Tensor, t_star:int, w:int=5) -> float:
        T = p.shape[0]; a=max(0,t_star-w); b=min(T-1,t_star+w); idx=torch.arange(a,b+1, device=p.device, dtype=p.dtype);
        seg=p[a:b+1]; s=seg.sum()+1e-8; return float(((idx*seg).sum()/s).item())
    def topk_candidates_per_class(p_s: torch.Tensor, scores: torch.Tensor, c: int, k_c: int, temp: float, K: int = 3):
        T = p_s.shape[0]; s = scores[:, c]; vals, idxs = torch.topk(s, k=min(K, T));
        cand=[]; w_com=max(5,k_c//3); radius=max(10,k_c//2)
        for v, t_star in zip(vals.tolist(), idxs.tolist()):
            t_ref = refine_com(p_s[:,c], int(t_star), w=w_com); t_idx=int(round(max(0, min(t_ref, T-1))));
            local_mean = p_s[max(0,t_idx-radius):min(T,t_idx+radius+1), c].mean().item(); pooled_at_ref = p_s[t_idx, c].item();
            cand.append((t_ref, float(v), float(local_mean), float(pooled_at_ref)))
        cand.sort(key=lambda x: (x[0], -x[1], -x[2], -x[3])); return cand
    def decode_peaks_improved(p_t_c: torch.Tensor, med_k: dict, gamma: float = 1.0, pool_k=13, temp=0.9, min_sep=2, K=3, k_delta=4):
        if temp != 1.0:
            p_t_c = (p_t_c ** (1.0/temp)); p_t_c = p_t_c / (p_t_c.sum(dim=-1, keepdim=True) + 1e-8)
        p_s = avg_pool_probs(p_t_c, k=pool_k); T, C = p_s.shape; scores = torch.zeros_like(p_s); ks=[13]*C
        for c in range(C):
            if c==0: scores[:,c]=p_s[:,c]; ks[c]=13; continue
            base_k = med_k.get(c, 13); k_c = int(np.clip(round(gamma*base_k), 9, 25));
            if k_c % 2 == 0: k_c = min(25, k_c + 1); ks[c]=k_c
            ks_multi = sorted(set([int(np.clip(k_c - k_delta, 9, 25)), k_c, int(np.clip(k_c + k_delta, 9, 25))]));
            ks_multi = [k if (k % 2)==1 else min(25, k+1) for k in ks_multi]; acc=None
            for k in ks_multi:
                di = duration_integral_single(p_s[:, c], k=k).unsqueeze(1); acc = di if acc is None else (acc + di)
            scores[:, c] = (acc / float(len(ks_multi))).squeeze(1)
        all_cand=[]
        for c in range(1,21):
            cand = topk_candidates_per_class(p_s, scores, c, ks[c], temp=temp, K=K)
            if len(cand)==0: all_cand.append((c, 0.0, -1e9, -1e9, -1e9))
            else:
                for (t_ref, v, lm, pr) in cand: all_cand.append((c, t_ref, v, lm, pr))
        all_cand.sort(key=lambda x: (x[1], -x[2], -x[3], -x[4])); chosen={}; last_t=-1e9
        for c, t_ref, v, lm, pr in all_cand:
            if c in chosen: continue
            if t_ref <= last_t + 2.0: t_ref = last_t + 2.0
            last_t = min(t_ref, float(T-1)); chosen[c]=(last_t, v, lm, pr)
            if len(chosen)==20: break
        if len(chosen) < 20:
            missing = [c for c in range(1,21) if c not in chosen]; t = max(last_t, 0.0)
            for c in missing: t = min(t + 2.0, float(T-1)); chosen[c]=(t,-1e9,-1e9,-1e9)
        seq = [c for c,_ in sorted(chosen.items(), key=lambda kv: kv[1][0])]; return seq
    def gamma_with_length(gamma_cv: float, T: int, med_k: dict):
        L_est = float(sum(med_k.get(c,13) for c in range(1,21)));
        if L_est <= 0: return gamma_cv
        ratio = float(T) / L_est; gamma_s = float(np.clip(ratio, 0.85, 1.15)); return float(gamma_cv * gamma_s)

def levenshtein(a,b):
    n,m=len(a),len(b)
    if n==0: return m
    if m==0: return n
    dp=list(range(m+1))
    for i in range(1,n+1):
        prev=dp[0]; dp[0]=i; ai=a[i-1]
        for j in range(1,m+1):
            tmp=dp[j]; dp[j]=min(dp[j]+1, dp[j-1]+1, prev + (0 if ai==b[j-1] else 1)); prev=tmp
    return dp[m]

# Load best improved-decoder cfg if available
cfg_path = Path('cv_sweep_ce_6x_improved.csv')
if cfg_path.exists():
    cfg_best = pd.read_csv(cfg_path).sort_values(['worst','mean']).iloc[0].to_dict()
    pool_k=int(cfg_best['pool_k']); temp=float(cfg_best['temp']); gamma=float(cfg_best.get('gamma',1.0)); sep=int(cfg_best.get('sep',2))
else:
    pool_k, temp, gamma, sep = 13, 0.95, 1.0, 2

# (A) OOF debug on 6 samples from fold 0
f0 = folds[0]
med_k_f0 = compute_class_median_durations_for_ids(f0['train_ids'])
print('--- OOF debug (fold 0) ---', flush=True)
for sid in list(f0['val_ids'])[:6]:
    p = load_oof_avg_for_id(int(sid))
    T = p.shape[0]
    gamma_eff = gamma_with_length(gamma, T, med_k_f0)
    seq = decode_peaks_improved(p, med_k=med_k_f0, gamma=gamma_eff, pool_k=pool_k, temp=temp, min_sep=sep, K=3, k_delta=4)
    gt = id2seq[int(sid)]
    ld = levenshtein(seq, gt)
    uniq = len(set(seq))
    print(f"sid={sid} T={T} LD={ld} uniq={uniq} seq[:5]={seq[:5]} ...", flush=True)

# (B) Oracle test: construct near one-hot GT probs and ensure decoder recovers GT (LD≈0)
print('--- Oracle decoder test ---', flush=True)
def build_oracle_probs(T:int, gt_seq, med_k):
    # place Gaussian-like peaks for each class at cumulative centers spaced by med_k
    C=21; p = torch.full((T,C), 1e-8, device=device, dtype=torch.float32);
    centers=[]; t=0
    for c in gt_seq:
        k = int(np.clip(med_k.get(c,13), 9, 25));
        t = min(t + max(3, k//2), T-1); centers.append(t); t = min(t + max(3, k//2), T-1)
    if len(centers)>0 and centers[-1] < T-1:
        # spread remaining
        pass
    for c, t0 in zip(gt_seq, centers):
        width = 3
        for dt in range(-3*width, 3*width+1):
            tt = int(np.clip(t0+dt, 0, T-1))
            p[tt, c] = max(p[tt, c].item(), float(np.exp(-0.5*(dt/width)**2)))
    p = p / (p.sum(dim=-1, keepdim=True) + 1e-8);
    return p

tested = 0
for sid in list(f0['val_ids'])[:6]:
    gt = id2seq[int(sid)]
    # estimate a T similar to OOF probs for this sid
    p_oof = load_oof_avg_for_id(int(sid)); T = p_oof.shape[0]
    p_oracle = build_oracle_probs(T, gt, med_k_f0)
    gamma_eff = gamma_with_length(gamma, T, med_k_f0)
    seq_or = decode_peaks_improved(p_oracle, med_k=med_k_f0, gamma=gamma_eff, pool_k=pool_k, temp=temp, min_sep=sep, K=3, k_delta=4)
    ld_or = levenshtein(seq_or, gt)
    print(f"sid={sid} ORACLE LD={ld_or} uniq={len(set(seq_or))} seq[:5]={seq_or[:5]} ...", flush=True)
    tested += 1
    if tested>=4: break
print('Decoder sanity checks done.', flush=True)

--- OOF debug (fold 0) ---


sid=1 T=1254 LD=0 uniq=20 seq[:5]=[2, 14, 20, 6, 7] ...


sid=3 T=1117 LD=2 uniq=20 seq[:5]=[12, 3, 17, 18, 14] ...


sid=4 T=1336 LD=0 uniq=20 seq[:5]=[13, 1, 8, 18, 7] ...


sid=5 T=1334 LD=0 uniq=20 seq[:5]=[10, 4, 7, 13, 19] ...


sid=6 T=1202 LD=0 uniq=20 seq[:5]=[14, 15, 10, 16, 11] ...


sid=7 T=1124 LD=2 uniq=20 seq[:5]=[19, 10, 11, 12, 9] ...


--- Oracle decoder test ---


sid=1 ORACLE LD=0 uniq=20 seq[:5]=[2, 14, 20, 6, 7] ...


sid=3 ORACLE LD=0 uniq=20 seq[:5]=[12, 3, 18, 14, 16] ...


sid=4 ORACLE LD=0 uniq=20 seq[:5]=[13, 1, 8, 18, 7] ...


sid=5 ORACLE LD=0 uniq=20 seq[:5]=[10, 4, 7, 13, 19] ...


Decoder sanity checks done.


In [8]:
# Train CE+TC DilatedTCN per fold (3 folds) with 2 seeds, per expert recipe
import os, json, math, time, random, gc
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('CUDA available:', torch.cuda.is_available(), flush=True)
assert torch.cuda.is_available(), 'GPU required for timely training'
torch.backends.cudnn.benchmark = True
try:
    torch.set_float32_matmul_precision('high')
except Exception:
    pass

feat_tr_dir = Path('features3d_v2')/'train'
lab_tr_dir  = Path('labels3d_v2')/'train'
folds = json.load(open('folds_archive_cv.json','r'))

class DilatedResBlock(nn.Module):
    def __init__(self, ch, dilation, drop=0.35, groups=8, k=3):
        super().__init__()
        self.conv1 = nn.Conv1d(ch, ch, k, padding=dilation, dilation=dilation)
        self.gn1 = nn.GroupNorm(groups, ch)
        self.drop = nn.Dropout(drop)
        self.conv2 = nn.Conv1d(ch, ch, 1)
        self.gn2 = nn.GroupNorm(groups, ch)
    def forward(self, x):
        h = self.conv1(x); h = self.gn1(h); h = F.relu(h, inplace=True); h = self.drop(h)
        h = self.conv2(h); h = self.gn2(h); h = F.relu(h, inplace=True)  # FIXED: conv2(h) not conv2(x)
        return x + h

class DilatedTCN(nn.Module):
    def __init__(self, d_in, channels=128, layers=12, num_classes=21, dropout=0.35):
        super().__init__()
        self.inp = nn.Conv1d(d_in, channels, 1)
        blocks=[]; dil=1
        for _ in range(layers):
            blocks.append(DilatedResBlock(channels, dil, drop=dropout, groups=8, k=3))
            dil = min(dil*2, 512)
        self.blocks = nn.ModuleList(blocks)
        self.head = nn.Conv1d(channels, num_classes, 1)
    def forward(self, x_b_t_d):
        x = x_b_t_d.transpose(1,2)
        h = self.inp(x)
        for b in self.blocks:
            h = b(h)
        out = self.head(h)
        return out.transpose(1,2)  # B,T,C

class EMA:
    def __init__(self, model, decay=0.999):
        self.decay = decay
        self.shadow = {n: p.detach().clone() for n,p in model.named_parameters() if p.requires_grad}
    @torch.no_grad()
    def update(self, model):
        for n, p in model.named_parameters():
            if p.requires_grad:
                self.shadow[n].mul_(self.decay).add_(p.detach(), alpha=1.0 - self.decay)
    def apply_to(self, model):
        self.backup = {}
        for n, p in model.named_parameters():
            if p.requires_grad:
                self.backup[n] = p.detach().clone()
                p.data.copy_(self.shadow[n].data)
    def restore(self, model):
        for n, p in model.named_parameters():
            if p.requires_grad:
                p.data.copy_(self.backup[n].data)

def load_feat_full(sample_id: int):
    d = np.load((feat_tr_dir/f"{sample_id}.npz"))
    return d['X'].astype(np.float32)
def load_labels(sample_id: int):
    return np.load(lab_tr_dir/f"{sample_id}.npy").astype(np.int64)

def compute_fold_scaler(id_list):
    n = 0; mean=None; M2=None
    for sid in id_list:
        X = load_feat_full(int(sid))
        n_i = X.shape[0]
        if mean is None:
            mean = X.mean(axis=0)
            M2 = ((X - mean)**2).sum(axis=0)
            n = n_i
        else:
            mean_i = X.mean(axis=0)
            n_new = n + n_i
            delta = mean_i - mean
            mean = mean + delta * (n_i / max(1, n_new))
            M2 = M2 + ((X - mean_i)**2).sum(axis=0) + (delta**2) * (n * n_i / max(1, n_new))
            n = n_new
    var = M2 / max(1, (n - 1))
    std = np.sqrt(np.clip(var, 1e-8, None))
    return mean.astype(np.float32), std.astype(np.float32)

def compute_class_weights(train_ids):
    counts = np.zeros(21, dtype=np.int64)
    for sid in train_ids:
        y = load_labels(int(sid))
        vals, cnts = np.unique(y, return_counts=True)
        for v, c in zip(vals, cnts):
            if 0 <= v <= 20:
                counts[v] += int(c)
    freq = counts / max(1, counts.sum())
    w = 1.0 / np.sqrt(np.clip(freq, 1e-12, None))
    w = w / w.mean()
    w0_cap = 0.7 * w.mean()
    w[0] = min(w[0], w0_cap)
    return torch.tensor(w, dtype=torch.float32, device=device)

class SeqDataset(Dataset):
    def __init__(self, ids, mean, std, train=True, crop_min=1600, crop_max=4096, time_masks=(3,5), mask_len=(5,15), noise_std=0.01, seed=42):
        self.ids = list(ids)
        self.mean = torch.from_numpy(mean).float()
        self.std = torch.from_numpy(std).float()
        self.train = train
        self.crop_min = crop_min
        self.crop_max = crop_max
        self.tmask_lo, self.tmask_hi = time_masks
        self.mlen_lo, self.mlen_hi = mask_len
        self.noise_std = noise_std
        self.rng = random.Random(seed)
    def __len__(self):
        return len(self.ids)
    def _rand_crop(self, X, y):
        T = X.shape[0]
        if not self.train:
            return X, y
        tgt = self.rng.randint(self.crop_min, self.crop_max)
        if T <= tgt:
            return X, y
        start = self.rng.randint(0, T - tgt)
        end = start + tgt
        return X[start:end], y[start:end]
    def _time_mask(self, X):
        if not self.train:
            return X
        T = X.shape[0]
        m = self.rng.randint(self.tmask_lo, self.tmask_hi)
        for _ in range(m):
            L = self.rng.randint(self.mlen_lo, self.mlen_hi)
            if T <= L: continue
            s = self.rng.randint(0, T - L)
            e = s + L
            seg_mean = X[max(0, s-5):min(T, e+5)].mean(axis=0, keepdims=True)
            X[s:e] = seg_mean
        return X
    def __getitem__(self, idx):
        sid = int(self.ids[idx])
        X = load_feat_full(sid)
        y = load_labels(sid)
        X, y = self._rand_crop(X, y)
        X = (torch.from_numpy(X).float() - self.mean) / (self.std + 1e-6)
        if self.train:
            if self.noise_std > 0:
                X = X + torch.randn_like(X) * self.noise_std
            X_np = X.numpy(); X_np = self._time_mask(X_np); X = torch.from_numpy(X_np).float()
        y = torch.from_numpy(y).long()
        return X, y

def collate_pad(batch):
    xs, ys = zip(*batch)
    T_max = max(x.shape[0] for x in xs)
    D = xs[0].shape[1]
    xb = torch.zeros((len(xs), T_max, D), dtype=torch.float32)
    yb = torch.full((len(xs), T_max), -100, dtype=torch.long)
    for i, (x, y) in enumerate(zip(xs, ys)):
        T = x.shape[0]
        xb[i, :T] = x
        yb[i, :T] = y
    return xb, yb

def cosine_with_warmup(step, total_steps, warmup_steps, base_lr, min_lr):
    if step < warmup_steps:
        return base_lr * (step / max(1, warmup_steps))
    t = (step - warmup_steps) / max(1, (total_steps - warmup_steps))
    return min_lr + 0.5 * (base_lr - min_lr) * (1 + math.cos(math.pi * t))

# Temporal Consistency (TC) loss per expert recipe
def tc_loss_kld_adjacent(logits: torch.Tensor, y: torch.Tensor, k2: bool = True):
    # logits: [B,T,C], y: [B,T] with -100 pad
    B,T,C = logits.shape
    log_probs = F.log_softmax(logits, dim=-1)
    probs = F.softmax(logits, dim=-1)
    total = 0.0
    denom = 0.0
    for k in (1, 2) if k2 else (1,):
        y_t   = y[:, k:]
        y_tm  = y[:, :-k]
        m = (y_t != -100) & (y_tm != -100) & (y_t == y_tm) & (y_t != 0)
        if not m.any():
            continue
        lp = log_probs[:, k:, :]               # teacher at t-1 (or t-2) as stop-grad on probs
        p_prev = probs[:, :-k, :].detach()
        kl = F.kl_div(lp, p_prev, reduction='none').sum(dim=-1)  # [B,T-k]
        # mask and mean
        kl = torch.where(m, kl, torch.zeros_like(kl))
        total = total + kl.sum() * (1.0 if k==1 else 0.5)
        denom = denom + m.sum() * (1.0 if k==1 else 0.5)
    if denom == 0:
        return logits.new_tensor(0.0)
    return total / denom

def train_fold_tc(fold_idx, train_ids, val_ids, out_name, ds_seed, epochs=40, batch_size=8, accum_steps=1, base_lr=3e-3, min_lr=3e-5, wd=0.01, label_smooth=0.05, lambda_tc=0.20, tc_warmup_epochs=5, k2=True):
    print(f"=== Train CE+TC fold {fold_idx} ({out_name}) : train_n={len(train_ids)} val_n={len(val_ids)} ===", flush=True)
    # scaler and class weights from train only
    mean, std = compute_fold_scaler(train_ids)
    class_w = compute_class_weights(train_ids)
    D_in = np.load(next(iter((feat_tr_dir).glob('*.npz'))))['X'].shape[1]
    model = DilatedTCN(d_in=D_in, channels=128, layers=12, num_classes=21, dropout=0.35).to(device)
    # seeds
    torch.manual_seed(1337 + ds_seed)
    np.random.seed(4242 + ds_seed)
    random.seed(9001 + ds_seed)
    ema = EMA(model, decay=0.999)
    scaler = torch.amp.GradScaler('cuda', enabled=True)
    opt = torch.optim.AdamW(model.parameters(), lr=base_lr, weight_decay=wd, betas=(0.9, 0.999))
    tr_ds = SeqDataset(train_ids, mean, std, train=True, crop_min=1600, crop_max=4096, time_masks=(3,5), mask_len=(5,15), noise_std=0.01, seed=ds_seed)
    va_ds = SeqDataset(val_ids, mean, std, train=False, seed=ds_seed+777)
    tr_ld = DataLoader(tr_ds, batch_size=batch_size, shuffle=True, drop_last=True, num_workers=0, collate_fn=collate_pad, pin_memory=True)
    va_ld = DataLoader(va_ds, batch_size=1, shuffle=False, drop_last=False, num_workers=0, collate_fn=collate_pad, pin_memory=True)
    steps_per_epoch = max(1, len(tr_ld))
    total_steps = steps_per_epoch * epochs
    warmup_steps = 5 * steps_per_epoch
    ce_crit = nn.CrossEntropyLoss(weight=class_w, label_smoothing=label_smooth, ignore_index=-100)
    best_val = float('inf'); patience=6; bad=0
    t0=time.time()
    for ep in range(1, epochs+1):
        model.train(); tr_loss=0.0; tr_ce=0.0; tr_tc=0.0; seen=0; t_ep=time.time()
        opt.zero_grad(set_to_none=True)
        # TC lambda schedule: linear warmup over first tc_warmup_epochs
        lam_tc = float(lambda_tc * min(1.0, ep / max(1, tc_warmup_epochs)))
        for step, (xb, yb) in enumerate(tr_ld):
            xb = xb.to(device, non_blocking=True)
            yb = yb.to(device, non_blocking=True)
            bs, T, D = xb.shape; C = 21
            lr = cosine_with_warmup((ep-1)*steps_per_epoch + step, total_steps, warmup_steps, base_lr, min_lr)
            for pg in opt.param_groups: pg['lr'] = lr
            with torch.amp.autocast('cuda'):
                logits = model(xb)  # B,T,C
                loss_ce = ce_crit(logits.reshape(-1, C), yb.reshape(-1))
                loss_tc = tc_loss_kld_adjacent(logits, yb, k2=k2)
                loss = loss_ce + lam_tc * loss_tc
            scaler.scale(loss / accum_steps).backward()
            if ((step + 1) % accum_steps) == 0:
                scaler.unscale_(opt)
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                scaler.step(opt)
                scaler.update()
                opt.zero_grad(set_to_none=True)
                ema.update(model)
            tr_loss += loss.item() * bs; tr_ce += loss_ce.item() * bs; tr_tc += (loss_tc.item() if torch.is_tensor(loss_tc) else float(loss_tc)) * bs; seen += bs
            if (step+1) % 50 == 0 or (step+1)==steps_per_epoch:
                print(f"  ep{ep} step {step+1}/{steps_per_epoch} lr={lr:.2e} loss={tr_loss/max(1,seen):.4f} ce={tr_ce/max(1,seen):.4f} tc={tr_tc/max(1,seen):.4f} lam_tc={lam_tc:.3f} elapsed={(time.time()-t_ep):.1f}s", flush=True)
        # validate CE only (selection by CE), with EMA weights
        model.eval(); ema.apply_to(model)
        val_loss = 0.0; vseen=0
        with torch.no_grad(), torch.amp.autocast('cuda'):
            for xb, yb in va_ld:
                xb = xb.to(device, non_blocking=True)
                yb = yb.to(device, non_blocking=True)
                bs, T, D = xb.shape; C = 21
                logits = model(xb)
                loss = ce_crit(logits.reshape(-1, C), yb.reshape(-1))
                val_loss += loss.item(); vseen += 1
        ema.restore(model)
        val_loss = val_loss / max(1, vseen)
        print(f"[Fold {fold_idx} CE+TC] Epoch {ep} train_loss={tr_loss/max(1,seen):.4f} train_ce={tr_ce/max(1,seen):.4f} train_tc={tr_tc/max(1,seen):.4f} val_ce={val_loss:.4f} epoch_time={(time.time()-t_ep):.1f}s total={(time.time()-t0)/60:.1f}m", flush=True)
        if val_loss < best_val - 1e-4:
            best_val = val_loss; bad = 0
            ema.apply_to(model); torch.save(model.state_dict(), out_name); ema.restore(model)
            print(f"  Saved best EMA weights to {out_name}", flush=True)
        else:
            bad += 1
            if bad >= patience:
                print(f"  Early stop at epoch {ep}", flush=True)
                break
        torch.cuda.empty_cache(); gc.collect()
    print(f"Fold {fold_idx} CE+TC done. Best val CE={best_val:.4f}. Model -> {out_name}")

# Kick off CE+TC training across folds for two seeds (6 models); overwrite existing
D_in = np.load(next(iter((feat_tr_dir).glob('*.npz'))))['X'].shape[1]
for f in folds:
    fi = int(f['fold'])
    for seed_idx, suf in enumerate(['', '_s1']):
        outp = Path(f"model_tc_fold{fi}{suf}.pth")
        if outp.exists():
            print(f"[Overwrite] Removing existing {outp} to retrain CE+TC...")
            try: outp.unlink()
            except Exception as e: print(f"  Warning: could not delete {outp}: {e}")
        ds_seed = (2026 + fi*17 + (seed_idx*101))
        train_fold_tc(fi, f['train_ids'], f['val_ids'], out_name=str(outp), ds_seed=ds_seed,
                      epochs=40, batch_size=8, accum_steps=1, base_lr=3e-3, min_lr=3e-5, wd=0.01,
                      label_smooth=0.05, lambda_tc=0.20, tc_warmup_epochs=5, k2=True)
        torch.cuda.empty_cache(); gc.collect()
print('All folds CE+TC processed.')

# Note: After training, run a new cell to cache OOF for TC models (e.g., *_tc.npy), sweep decoder per expert grid,
# and evaluate CE6, TC6, and CE6+TC6 blends by worst-fold then mean. Then build test submission with winning blend.

CUDA available: True


[Overwrite] Removing existing model_tc_fold0.pth to retrain CE+TC...
=== Train CE+TC fold 0 (model_tc_fold0.pth) : train_n=199 val_n=98 ===


  ep1 step 24/24 lr=5.75e-04 loss=3.9350 ce=3.9207 tc=0.3569 lam_tc=0.040 elapsed=1.7s


[Fold 0 CE+TC] Epoch 1 train_loss=3.9350 train_ce=3.9207 train_tc=0.3569 val_ce=5.3257 epoch_time=2.6s total=0.0m


  Saved best EMA weights to model_tc_fold0.pth


  ep2 step 24/24 lr=1.18e-03 loss=2.7930 ce=2.7810 tc=0.1497 lam_tc=0.080 elapsed=1.7s


[Fold 0 CE+TC] Epoch 2 train_loss=2.7930 train_ce=2.7810 train_tc=0.1497 val_ce=4.9782 epoch_time=2.6s total=0.1m


  Saved best EMA weights to model_tc_fold0.pth


  ep3 step 24/24 lr=1.78e-03 loss=2.2651 ce=2.2429 tc=0.1858 lam_tc=0.120 elapsed=1.7s


[Fold 0 CE+TC] Epoch 3 train_loss=2.2651 train_ce=2.2429 train_tc=0.1858 val_ce=4.6737 epoch_time=2.6s total=0.1m


  Saved best EMA weights to model_tc_fold0.pth


  ep4 step 24/24 lr=2.37e-03 loss=1.9310 ce=1.9038 tc=0.1695 lam_tc=0.160 elapsed=1.7s


[Fold 0 CE+TC] Epoch 4 train_loss=1.9310 train_ce=1.9038 train_tc=0.1695 val_ce=4.4008 epoch_time=2.6s total=0.2m


  Saved best EMA weights to model_tc_fold0.pth


  ep5 step 24/24 lr=2.98e-03 loss=1.7373 ce=1.7036 tc=0.1685 lam_tc=0.200 elapsed=1.6s


[Fold 0 CE+TC] Epoch 5 train_loss=1.7373 train_ce=1.7036 train_tc=0.1685 val_ce=4.1559 epoch_time=2.6s total=0.2m


  Saved best EMA weights to model_tc_fold0.pth


  ep6 step 24/24 lr=2.99e-03 loss=1.5848 ce=1.5518 tc=0.1649 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 6 train_loss=1.5848 train_ce=1.5518 train_tc=0.1649 val_ce=3.9378 epoch_time=2.6s total=0.3m


  Saved best EMA weights to model_tc_fold0.pth


  ep7 step 24/24 lr=2.98e-03 loss=1.4690 ce=1.4355 tc=0.1676 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 7 train_loss=1.4690 train_ce=1.4355 train_tc=0.1676 val_ce=3.7348 epoch_time=2.6s total=0.3m


  Saved best EMA weights to model_tc_fold0.pth


  ep8 step 24/24 lr=2.95e-03 loss=1.3563 ce=1.3230 tc=0.1664 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 8 train_loss=1.3563 train_ce=1.3230 train_tc=0.1664 val_ce=3.5482 epoch_time=2.6s total=0.4m


  Saved best EMA weights to model_tc_fold0.pth


  ep9 step 24/24 lr=2.91e-03 loss=1.2685 ce=1.2361 tc=0.1623 lam_tc=0.200 elapsed=1.6s


[Fold 0 CE+TC] Epoch 9 train_loss=1.2685 train_ce=1.2361 train_tc=0.1623 val_ce=3.3744 epoch_time=2.6s total=0.4m


  Saved best EMA weights to model_tc_fold0.pth


  ep10 step 24/24 lr=2.86e-03 loss=1.2104 ce=1.1782 tc=0.1613 lam_tc=0.200 elapsed=1.6s


[Fold 0 CE+TC] Epoch 10 train_loss=1.2104 train_ce=1.1782 train_tc=0.1613 val_ce=3.2159 epoch_time=2.6s total=0.4m


  Saved best EMA weights to model_tc_fold0.pth


  ep11 step 24/24 lr=2.79e-03 loss=1.1379 ce=1.1060 tc=0.1592 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 11 train_loss=1.1379 train_ce=1.1060 train_tc=0.1592 val_ce=3.0619 epoch_time=2.6s total=0.5m


  Saved best EMA weights to model_tc_fold0.pth


  ep12 step 24/24 lr=2.72e-03 loss=1.0917 ce=1.0604 tc=0.1569 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 12 train_loss=1.0917 train_ce=1.0604 train_tc=0.1569 val_ce=2.9203 epoch_time=2.6s total=0.5m


  Saved best EMA weights to model_tc_fold0.pth


  ep13 step 24/24 lr=2.64e-03 loss=1.0351 ce=1.0046 tc=0.1524 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 13 train_loss=1.0351 train_ce=1.0046 train_tc=0.1524 val_ce=2.7923 epoch_time=2.6s total=0.6m


  Saved best EMA weights to model_tc_fold0.pth


  ep14 step 24/24 lr=2.55e-03 loss=1.0103 ce=0.9796 tc=0.1532 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 14 train_loss=1.0103 train_ce=0.9796 train_tc=0.1532 val_ce=2.6708 epoch_time=2.6s total=0.6m


  Saved best EMA weights to model_tc_fold0.pth


  ep15 step 24/24 lr=2.45e-03 loss=0.9630 ce=0.9329 tc=0.1508 lam_tc=0.200 elapsed=1.6s


[Fold 0 CE+TC] Epoch 15 train_loss=0.9630 train_ce=0.9329 train_tc=0.1508 val_ce=2.5664 epoch_time=2.6s total=0.7m


  Saved best EMA weights to model_tc_fold0.pth


  ep16 step 24/24 lr=2.34e-03 loss=0.9566 ce=0.9265 tc=0.1504 lam_tc=0.200 elapsed=1.6s


[Fold 0 CE+TC] Epoch 16 train_loss=0.9566 train_ce=0.9265 train_tc=0.1504 val_ce=2.4710 epoch_time=2.6s total=0.7m


  Saved best EMA weights to model_tc_fold0.pth


  ep17 step 24/24 lr=2.22e-03 loss=0.9205 ce=0.8911 tc=0.1472 lam_tc=0.200 elapsed=1.6s


[Fold 0 CE+TC] Epoch 17 train_loss=0.9205 train_ce=0.8911 train_tc=0.1472 val_ce=2.3796 epoch_time=2.6s total=0.8m


  Saved best EMA weights to model_tc_fold0.pth


  ep18 step 24/24 lr=2.10e-03 loss=0.8901 ce=0.8607 tc=0.1469 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 18 train_loss=0.8901 train_ce=0.8607 train_tc=0.1469 val_ce=2.3002 epoch_time=2.6s total=0.8m


  Saved best EMA weights to model_tc_fold0.pth


  ep19 step 24/24 lr=1.98e-03 loss=0.8483 ce=0.8206 tc=0.1385 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 19 train_loss=0.8483 train_ce=0.8206 train_tc=0.1385 val_ce=2.2321 epoch_time=2.7s total=0.9m


  Saved best EMA weights to model_tc_fold0.pth


  ep20 step 24/24 lr=1.85e-03 loss=0.8149 ce=0.7873 tc=0.1379 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 20 train_loss=0.8149 train_ce=0.7873 train_tc=0.1379 val_ce=2.1724 epoch_time=2.6s total=0.9m


  Saved best EMA weights to model_tc_fold0.pth


  ep21 step 24/24 lr=1.72e-03 loss=0.7991 ce=0.7713 tc=0.1389 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 21 train_loss=0.7991 train_ce=0.7713 train_tc=0.1389 val_ce=2.1227 epoch_time=2.6s total=0.9m


  Saved best EMA weights to model_tc_fold0.pth


  ep22 step 24/24 lr=1.59e-03 loss=0.7909 ce=0.7643 tc=0.1328 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 22 train_loss=0.7909 train_ce=0.7643 train_tc=0.1328 val_ce=2.0823 epoch_time=2.6s total=1.0m


  Saved best EMA weights to model_tc_fold0.pth


  ep23 step 24/24 lr=1.45e-03 loss=0.7500 ce=0.7240 tc=0.1302 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 23 train_loss=0.7500 train_ce=0.7240 train_tc=0.1302 val_ce=2.0464 epoch_time=2.6s total=1.0m


  Saved best EMA weights to model_tc_fold0.pth


  ep24 step 24/24 lr=1.32e-03 loss=0.7347 ce=0.7087 tc=0.1296 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 24 train_loss=0.7347 train_ce=0.7087 train_tc=0.1296 val_ce=2.0159 epoch_time=2.6s total=1.1m


  Saved best EMA weights to model_tc_fold0.pth


  ep25 step 24/24 lr=1.19e-03 loss=0.7090 ce=0.6840 tc=0.1249 lam_tc=0.200 elapsed=1.6s


[Fold 0 CE+TC] Epoch 25 train_loss=0.7090 train_ce=0.6840 train_tc=0.1249 val_ce=1.9906 epoch_time=2.6s total=1.1m


  Saved best EMA weights to model_tc_fold0.pth


  ep26 step 24/24 lr=1.06e-03 loss=0.6850 ce=0.6600 tc=0.1250 lam_tc=0.200 elapsed=1.6s


[Fold 0 CE+TC] Epoch 26 train_loss=0.6850 train_ce=0.6600 train_tc=0.1250 val_ce=1.9692 epoch_time=2.6s total=1.2m


  Saved best EMA weights to model_tc_fold0.pth


  ep27 step 24/24 lr=9.36e-04 loss=0.6684 ce=0.6440 tc=0.1221 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 27 train_loss=0.6684 train_ce=0.6440 train_tc=0.1221 val_ce=1.9528 epoch_time=2.6s total=1.2m


  Saved best EMA weights to model_tc_fold0.pth


  ep28 step 24/24 lr=8.16e-04 loss=0.6517 ce=0.6279 tc=0.1192 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 28 train_loss=0.6517 train_ce=0.6279 train_tc=0.1192 val_ce=1.9378 epoch_time=2.6s total=1.3m


  Saved best EMA weights to model_tc_fold0.pth


  ep29 step 24/24 lr=7.02e-04 loss=0.6446 ce=0.6213 tc=0.1162 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 29 train_loss=0.6446 train_ce=0.6213 train_tc=0.1162 val_ce=1.9273 epoch_time=2.6s total=1.3m


  Saved best EMA weights to model_tc_fold0.pth


  ep30 step 24/24 lr=5.93e-04 loss=0.6285 ce=0.6055 tc=0.1148 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 30 train_loss=0.6285 train_ce=0.6055 train_tc=0.1148 val_ce=1.9170 epoch_time=2.6s total=1.4m


  Saved best EMA weights to model_tc_fold0.pth


  ep31 step 24/24 lr=4.93e-04 loss=0.6169 ce=0.5941 tc=0.1142 lam_tc=0.200 elapsed=1.6s


[Fold 0 CE+TC] Epoch 31 train_loss=0.6169 train_ce=0.5941 train_tc=0.1142 val_ce=1.9077 epoch_time=2.6s total=1.4m


  Saved best EMA weights to model_tc_fold0.pth


  ep32 step 24/24 lr=4.00e-04 loss=0.6098 ce=0.5873 tc=0.1124 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 32 train_loss=0.6098 train_ce=0.5873 train_tc=0.1124 val_ce=1.9005 epoch_time=2.6s total=1.4m


  Saved best EMA weights to model_tc_fold0.pth


  ep33 step 24/24 lr=3.17e-04 loss=0.6021 ce=0.5791 tc=0.1151 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 33 train_loss=0.6021 train_ce=0.5791 train_tc=0.1151 val_ce=1.8949 epoch_time=2.6s total=1.5m


  Saved best EMA weights to model_tc_fold0.pth


  ep34 step 24/24 lr=2.43e-04 loss=0.5989 ce=0.5766 tc=0.1114 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 34 train_loss=0.5989 train_ce=0.5766 train_tc=0.1114 val_ce=1.8906 epoch_time=2.6s total=1.5m


  Saved best EMA weights to model_tc_fold0.pth


  ep35 step 24/24 lr=1.79e-04 loss=0.5921 ce=0.5696 tc=0.1124 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 35 train_loss=0.5921 train_ce=0.5696 train_tc=0.1124 val_ce=1.8864 epoch_time=2.6s total=1.6m


  Saved best EMA weights to model_tc_fold0.pth


  ep36 step 24/24 lr=1.27e-04 loss=0.5928 ce=0.5704 tc=0.1120 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 36 train_loss=0.5928 train_ce=0.5704 train_tc=0.1120 val_ce=1.8836 epoch_time=2.6s total=1.6m


  Saved best EMA weights to model_tc_fold0.pth


  ep37 step 24/24 lr=8.50e-05 loss=0.5753 ce=0.5530 tc=0.1112 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 37 train_loss=0.5753 train_ce=0.5530 train_tc=0.1112 val_ce=1.8805 epoch_time=2.6s total=1.7m


  Saved best EMA weights to model_tc_fold0.pth


  ep38 step 24/24 lr=5.49e-05 loss=0.5814 ce=0.5592 tc=0.1109 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 38 train_loss=0.5814 train_ce=0.5592 train_tc=0.1109 val_ce=1.8785 epoch_time=2.6s total=1.7m


  Saved best EMA weights to model_tc_fold0.pth


  ep39 step 24/24 lr=3.65e-05 loss=0.5808 ce=0.5587 tc=0.1107 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 39 train_loss=0.5808 train_ce=0.5587 train_tc=0.1107 val_ce=1.8768 epoch_time=2.6s total=1.8m


  Saved best EMA weights to model_tc_fold0.pth


  ep40 step 24/24 lr=3.00e-05 loss=0.5837 ce=0.5615 tc=0.1110 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 40 train_loss=0.5837 train_ce=0.5615 train_tc=0.1110 val_ce=1.8752 epoch_time=2.6s total=1.8m


  Saved best EMA weights to model_tc_fold0.pth


Fold 0 CE+TC done. Best val CE=1.8752. Model -> model_tc_fold0.pth


[Overwrite] Removing existing model_tc_fold0_s1.pth to retrain CE+TC...
=== Train CE+TC fold 0 (model_tc_fold0_s1.pth) : train_n=199 val_n=98 ===


  ep1 step 24/24 lr=5.75e-04 loss=3.7025 ce=3.6898 tc=0.3187 lam_tc=0.040 elapsed=1.6s


[Fold 0 CE+TC] Epoch 1 train_loss=3.7025 train_ce=3.6898 train_tc=0.3187 val_ce=4.5695 epoch_time=2.6s total=0.0m


  Saved best EMA weights to model_tc_fold0_s1.pth


  ep2 step 24/24 lr=1.18e-03 loss=2.8007 ce=2.7891 tc=0.1454 lam_tc=0.080 elapsed=1.7s


[Fold 0 CE+TC] Epoch 2 train_loss=2.8007 train_ce=2.7891 train_tc=0.1454 val_ce=4.3268 epoch_time=2.6s total=0.1m


  Saved best EMA weights to model_tc_fold0_s1.pth


  ep3 step 24/24 lr=1.78e-03 loss=2.2440 ce=2.2205 tc=0.1964 lam_tc=0.120 elapsed=1.6s


[Fold 0 CE+TC] Epoch 3 train_loss=2.2440 train_ce=2.2205 train_tc=0.1964 val_ce=4.1102 epoch_time=2.6s total=0.1m


  Saved best EMA weights to model_tc_fold0_s1.pth


  ep4 step 24/24 lr=2.37e-03 loss=1.9379 ce=1.9092 tc=0.1791 lam_tc=0.160 elapsed=1.7s


[Fold 0 CE+TC] Epoch 4 train_loss=1.9379 train_ce=1.9092 train_tc=0.1791 val_ce=3.9078 epoch_time=2.6s total=0.2m


  Saved best EMA weights to model_tc_fold0_s1.pth


  ep5 step 24/24 lr=2.98e-03 loss=1.7458 ce=1.7128 tc=0.1652 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 5 train_loss=1.7458 train_ce=1.7128 train_tc=0.1652 val_ce=3.7177 epoch_time=2.6s total=0.2m


  Saved best EMA weights to model_tc_fold0_s1.pth


  ep6 step 24/24 lr=2.99e-03 loss=1.5985 ce=1.5659 tc=0.1634 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 6 train_loss=1.5985 train_ce=1.5659 train_tc=0.1634 val_ce=3.5396 epoch_time=2.6s total=0.3m


  Saved best EMA weights to model_tc_fold0_s1.pth


  ep7 step 24/24 lr=2.98e-03 loss=1.4352 ce=1.4026 tc=0.1630 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 7 train_loss=1.4352 train_ce=1.4026 train_tc=0.1630 val_ce=3.3738 epoch_time=2.6s total=0.3m


  Saved best EMA weights to model_tc_fold0_s1.pth


  ep8 step 24/24 lr=2.95e-03 loss=1.3400 ce=1.3077 tc=0.1612 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 8 train_loss=1.3400 train_ce=1.3077 train_tc=0.1612 val_ce=3.2169 epoch_time=2.6s total=0.4m


  Saved best EMA weights to model_tc_fold0_s1.pth


  ep9 step 24/24 lr=2.91e-03 loss=1.2601 ce=1.2278 tc=0.1612 lam_tc=0.200 elapsed=1.6s


[Fold 0 CE+TC] Epoch 9 train_loss=1.2601 train_ce=1.2278 train_tc=0.1612 val_ce=3.0677 epoch_time=2.6s total=0.4m


  Saved best EMA weights to model_tc_fold0_s1.pth


  ep10 step 24/24 lr=2.86e-03 loss=1.1701 ce=1.1379 tc=0.1610 lam_tc=0.200 elapsed=1.6s


[Fold 0 CE+TC] Epoch 10 train_loss=1.1701 train_ce=1.1379 train_tc=0.1610 val_ce=2.9254 epoch_time=2.6s total=0.4m


  Saved best EMA weights to model_tc_fold0_s1.pth


  ep11 step 24/24 lr=2.79e-03 loss=1.1371 ce=1.1048 tc=0.1616 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 11 train_loss=1.1371 train_ce=1.1048 train_tc=0.1616 val_ce=2.7857 epoch_time=2.6s total=0.5m


  Saved best EMA weights to model_tc_fold0_s1.pth


  ep12 step 24/24 lr=2.72e-03 loss=1.0844 ce=1.0527 tc=0.1582 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 12 train_loss=1.0844 train_ce=1.0527 train_tc=0.1582 val_ce=2.6531 epoch_time=2.6s total=0.5m


  Saved best EMA weights to model_tc_fold0_s1.pth


  ep13 step 24/24 lr=2.64e-03 loss=1.0257 ce=0.9951 tc=0.1531 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 13 train_loss=1.0257 train_ce=0.9951 train_tc=0.1531 val_ce=2.5299 epoch_time=2.6s total=0.6m


  Saved best EMA weights to model_tc_fold0_s1.pth


  ep14 step 24/24 lr=2.55e-03 loss=0.9756 ce=0.9446 tc=0.1546 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 14 train_loss=0.9756 train_ce=0.9446 train_tc=0.1546 val_ce=2.4171 epoch_time=2.6s total=0.6m


  Saved best EMA weights to model_tc_fold0_s1.pth


  ep15 step 24/24 lr=2.45e-03 loss=0.9556 ce=0.9256 tc=0.1502 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 15 train_loss=0.9556 train_ce=0.9256 train_tc=0.1502 val_ce=2.3140 epoch_time=2.6s total=0.7m


  Saved best EMA weights to model_tc_fold0_s1.pth


  ep16 step 24/24 lr=2.34e-03 loss=0.9315 ce=0.9020 tc=0.1477 lam_tc=0.200 elapsed=1.6s


[Fold 0 CE+TC] Epoch 16 train_loss=0.9315 train_ce=0.9020 train_tc=0.1477 val_ce=2.2204 epoch_time=2.6s total=0.7m


  Saved best EMA weights to model_tc_fold0_s1.pth


  ep17 step 24/24 lr=2.22e-03 loss=0.8971 ce=0.8678 tc=0.1462 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 17 train_loss=0.8971 train_ce=0.8678 train_tc=0.1462 val_ce=2.1414 epoch_time=2.6s total=0.8m


  Saved best EMA weights to model_tc_fold0_s1.pth


  ep18 step 24/24 lr=2.10e-03 loss=0.8643 ce=0.8353 tc=0.1447 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 18 train_loss=0.8643 train_ce=0.8353 train_tc=0.1447 val_ce=2.0739 epoch_time=2.6s total=0.8m


  Saved best EMA weights to model_tc_fold0_s1.pth


  ep19 step 24/24 lr=1.98e-03 loss=0.8457 ce=0.8176 tc=0.1404 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 19 train_loss=0.8457 train_ce=0.8176 train_tc=0.1404 val_ce=2.0168 epoch_time=2.6s total=0.9m


  Saved best EMA weights to model_tc_fold0_s1.pth


  ep20 step 24/24 lr=1.85e-03 loss=0.7980 ce=0.7707 tc=0.1364 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 20 train_loss=0.7980 train_ce=0.7707 train_tc=0.1364 val_ce=1.9679 epoch_time=2.6s total=0.9m


  Saved best EMA weights to model_tc_fold0_s1.pth


  ep21 step 24/24 lr=1.72e-03 loss=0.7803 ce=0.7530 tc=0.1367 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 21 train_loss=0.7803 train_ce=0.7530 train_tc=0.1367 val_ce=1.9296 epoch_time=2.6s total=0.9m


  Saved best EMA weights to model_tc_fold0_s1.pth


  ep22 step 24/24 lr=1.59e-03 loss=0.7586 ce=0.7314 tc=0.1358 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 22 train_loss=0.7586 train_ce=0.7314 train_tc=0.1358 val_ce=1.8976 epoch_time=2.6s total=1.0m


  Saved best EMA weights to model_tc_fold0_s1.pth


  ep23 step 24/24 lr=1.45e-03 loss=0.7495 ce=0.7231 tc=0.1323 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 23 train_loss=0.7495 train_ce=0.7231 train_tc=0.1323 val_ce=1.8722 epoch_time=2.6s total=1.0m


  Saved best EMA weights to model_tc_fold0_s1.pth


  ep24 step 24/24 lr=1.32e-03 loss=0.7231 ce=0.6972 tc=0.1295 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 24 train_loss=0.7231 train_ce=0.6972 train_tc=0.1295 val_ce=1.8511 epoch_time=2.6s total=1.1m


  Saved best EMA weights to model_tc_fold0_s1.pth


  ep25 step 24/24 lr=1.19e-03 loss=0.7002 ce=0.6750 tc=0.1262 lam_tc=0.200 elapsed=1.6s


[Fold 0 CE+TC] Epoch 25 train_loss=0.7002 train_ce=0.6750 train_tc=0.1262 val_ce=1.8350 epoch_time=2.6s total=1.1m


  Saved best EMA weights to model_tc_fold0_s1.pth


  ep26 step 24/24 lr=1.06e-03 loss=0.6851 ce=0.6595 tc=0.1280 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 26 train_loss=0.6851 train_ce=0.6595 train_tc=0.1280 val_ce=1.8219 epoch_time=2.6s total=1.2m


  Saved best EMA weights to model_tc_fold0_s1.pth


  ep27 step 24/24 lr=9.36e-04 loss=0.6663 ce=0.6412 tc=0.1253 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 27 train_loss=0.6663 train_ce=0.6412 train_tc=0.1253 val_ce=1.8119 epoch_time=2.6s total=1.2m


  Saved best EMA weights to model_tc_fold0_s1.pth


  ep28 step 24/24 lr=8.16e-04 loss=0.6518 ce=0.6275 tc=0.1216 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 28 train_loss=0.6518 train_ce=0.6275 train_tc=0.1216 val_ce=1.8036 epoch_time=2.6s total=1.3m


  Saved best EMA weights to model_tc_fold0_s1.pth


  ep29 step 24/24 lr=7.02e-04 loss=0.6272 ce=0.6034 tc=0.1188 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 29 train_loss=0.6272 train_ce=0.6034 train_tc=0.1188 val_ce=1.7981 epoch_time=2.6s total=1.3m


  Saved best EMA weights to model_tc_fold0_s1.pth


  ep30 step 24/24 lr=5.93e-04 loss=0.6205 ce=0.5972 tc=0.1163 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 30 train_loss=0.6205 train_ce=0.5972 train_tc=0.1163 val_ce=1.7931 epoch_time=2.6s total=1.3m


  Saved best EMA weights to model_tc_fold0_s1.pth


  ep31 step 24/24 lr=4.93e-04 loss=0.6090 ce=0.5861 tc=0.1143 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 31 train_loss=0.6090 train_ce=0.5861 train_tc=0.1143 val_ce=1.7908 epoch_time=2.6s total=1.4m


  Saved best EMA weights to model_tc_fold0_s1.pth


  ep32 step 24/24 lr=4.00e-04 loss=0.6026 ce=0.5800 tc=0.1131 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 32 train_loss=0.6026 train_ce=0.5800 train_tc=0.1131 val_ce=1.7885 epoch_time=2.6s total=1.4m


  Saved best EMA weights to model_tc_fold0_s1.pth


  ep33 step 24/24 lr=3.17e-04 loss=0.5972 ce=0.5747 tc=0.1129 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 33 train_loss=0.5972 train_ce=0.5747 train_tc=0.1129 val_ce=1.7877 epoch_time=2.6s total=1.5m


  Saved best EMA weights to model_tc_fold0_s1.pth


  ep34 step 24/24 lr=2.43e-04 loss=0.5903 ce=0.5676 tc=0.1132 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 34 train_loss=0.5903 train_ce=0.5676 train_tc=0.1132 val_ce=1.7872 epoch_time=2.6s total=1.5m


  Saved best EMA weights to model_tc_fold0_s1.pth


  ep35 step 24/24 lr=1.79e-04 loss=0.5861 ce=0.5639 tc=0.1110 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 35 train_loss=0.5861 train_ce=0.5639 train_tc=0.1110 val_ce=1.7879 epoch_time=2.6s total=1.6m


  ep36 step 24/24 lr=1.27e-04 loss=0.5849 ce=0.5625 tc=0.1124 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 36 train_loss=0.5849 train_ce=0.5625 train_tc=0.1124 val_ce=1.7888 epoch_time=2.6s total=1.6m


  ep37 step 24/24 lr=8.50e-05 loss=0.5834 ce=0.5610 tc=0.1119 lam_tc=0.200 elapsed=1.6s


[Fold 0 CE+TC] Epoch 37 train_loss=0.5834 train_ce=0.5610 train_tc=0.1119 val_ce=1.7900 epoch_time=2.6s total=1.7m


  ep38 step 24/24 lr=5.49e-05 loss=0.5797 ce=0.5575 tc=0.1112 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 38 train_loss=0.5797 train_ce=0.5575 train_tc=0.1112 val_ce=1.7915 epoch_time=2.6s total=1.7m


  ep39 step 24/24 lr=3.65e-05 loss=0.5774 ce=0.5552 tc=0.1107 lam_tc=0.200 elapsed=1.7s


[Fold 0 CE+TC] Epoch 39 train_loss=0.5774 train_ce=0.5552 train_tc=0.1107 val_ce=1.7931 epoch_time=2.6s total=1.8m


  ep40 step 24/24 lr=3.00e-05 loss=0.5786 ce=0.5565 tc=0.1106 lam_tc=0.200 elapsed=1.6s


[Fold 0 CE+TC] Epoch 40 train_loss=0.5786 train_ce=0.5565 train_tc=0.1106 val_ce=1.7949 epoch_time=2.6s total=1.8m


  Early stop at epoch 40


Fold 0 CE+TC done. Best val CE=1.7872. Model -> model_tc_fold0_s1.pth
[Overwrite] Removing existing model_tc_fold1.pth to retrain CE+TC...
=== Train CE+TC fold 1 (model_tc_fold1.pth) : train_n=198 val_n=99 ===


  ep1 step 24/24 lr=5.75e-04 loss=3.5919 ce=3.5765 tc=0.3857 lam_tc=0.040 elapsed=1.8s


[Fold 1 CE+TC] Epoch 1 train_loss=3.5919 train_ce=3.5765 train_tc=0.3857 val_ce=4.1792 epoch_time=2.7s total=0.0m


  Saved best EMA weights to model_tc_fold1.pth


  ep2 step 24/24 lr=1.18e-03 loss=2.7753 ce=2.7640 tc=0.1407 lam_tc=0.080 elapsed=1.8s


[Fold 1 CE+TC] Epoch 2 train_loss=2.7753 train_ce=2.7640 train_tc=0.1407 val_ce=4.0270 epoch_time=2.6s total=0.1m


  Saved best EMA weights to model_tc_fold1.pth


  ep3 step 24/24 lr=1.78e-03 loss=2.2952 ce=2.2729 tc=0.1860 lam_tc=0.120 elapsed=1.8s


[Fold 1 CE+TC] Epoch 3 train_loss=2.2952 train_ce=2.2729 train_tc=0.1860 val_ce=3.8824 epoch_time=2.6s total=0.1m


  Saved best EMA weights to model_tc_fold1.pth


  ep4 step 24/24 lr=2.37e-03 loss=1.9529 ce=1.9270 tc=0.1618 lam_tc=0.160 elapsed=1.8s


[Fold 1 CE+TC] Epoch 4 train_loss=1.9529 train_ce=1.9270 train_tc=0.1618 val_ce=3.7427 epoch_time=2.6s total=0.2m


  Saved best EMA weights to model_tc_fold1.pth


  ep5 step 24/24 lr=2.98e-03 loss=1.7241 ce=1.6908 tc=0.1664 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 5 train_loss=1.7241 train_ce=1.6908 train_tc=0.1664 val_ce=3.6085 epoch_time=2.6s total=0.2m


  Saved best EMA weights to model_tc_fold1.pth


  ep6 step 24/24 lr=2.99e-03 loss=1.6000 ce=1.5691 tc=0.1546 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 6 train_loss=1.6000 train_ce=1.5691 train_tc=0.1546 val_ce=3.4716 epoch_time=2.6s total=0.3m


  Saved best EMA weights to model_tc_fold1.pth


  ep7 step 24/24 lr=2.98e-03 loss=1.4187 ce=1.3877 tc=0.1550 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 7 train_loss=1.4187 train_ce=1.3877 train_tc=0.1550 val_ce=3.3391 epoch_time=2.6s total=0.3m


  Saved best EMA weights to model_tc_fold1.pth


  ep8 step 24/24 lr=2.95e-03 loss=1.3717 ce=1.3402 tc=0.1574 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 8 train_loss=1.3717 train_ce=1.3402 train_tc=0.1574 val_ce=3.2108 epoch_time=2.6s total=0.4m


  Saved best EMA weights to model_tc_fold1.pth


  ep9 step 24/24 lr=2.91e-03 loss=1.2399 ce=1.2091 tc=0.1539 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 9 train_loss=1.2399 train_ce=1.2091 train_tc=0.1539 val_ce=3.0847 epoch_time=2.6s total=0.4m


  Saved best EMA weights to model_tc_fold1.pth


  ep10 step 24/24 lr=2.86e-03 loss=1.1589 ce=1.1281 tc=0.1539 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 10 train_loss=1.1589 train_ce=1.1281 train_tc=0.1539 val_ce=2.9672 epoch_time=2.6s total=0.5m


  Saved best EMA weights to model_tc_fold1.pth


  ep11 step 24/24 lr=2.79e-03 loss=1.1200 ce=1.0897 tc=0.1517 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 11 train_loss=1.1200 train_ce=1.0897 train_tc=0.1517 val_ce=2.8537 epoch_time=2.6s total=0.5m


  Saved best EMA weights to model_tc_fold1.pth


  ep12 step 24/24 lr=2.72e-03 loss=1.0875 ce=1.0558 tc=0.1582 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 12 train_loss=1.0875 train_ce=1.0558 train_tc=0.1582 val_ce=2.7424 epoch_time=2.6s total=0.6m


  Saved best EMA weights to model_tc_fold1.pth


  ep13 step 24/24 lr=2.64e-03 loss=1.0249 ce=0.9948 tc=0.1506 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 13 train_loss=1.0249 train_ce=0.9948 train_tc=0.1506 val_ce=2.6326 epoch_time=2.6s total=0.6m


  Saved best EMA weights to model_tc_fold1.pth


  ep14 step 24/24 lr=2.55e-03 loss=1.0262 ce=0.9965 tc=0.1483 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 14 train_loss=1.0262 train_ce=0.9965 train_tc=0.1483 val_ce=2.5269 epoch_time=2.6s total=0.6m


  Saved best EMA weights to model_tc_fold1.pth


  ep15 step 24/24 lr=2.45e-03 loss=0.9462 ce=0.9167 tc=0.1477 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 15 train_loss=0.9462 train_ce=0.9167 train_tc=0.1477 val_ce=2.4259 epoch_time=2.6s total=0.7m


  Saved best EMA weights to model_tc_fold1.pth


  ep16 step 24/24 lr=2.34e-03 loss=0.9184 ce=0.8886 tc=0.1492 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 16 train_loss=0.9184 train_ce=0.8886 train_tc=0.1492 val_ce=2.3313 epoch_time=2.6s total=0.7m


  Saved best EMA weights to model_tc_fold1.pth


  ep17 step 24/24 lr=2.22e-03 loss=0.8926 ce=0.8631 tc=0.1471 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 17 train_loss=0.8926 train_ce=0.8631 train_tc=0.1471 val_ce=2.2442 epoch_time=2.6s total=0.8m


  Saved best EMA weights to model_tc_fold1.pth


  ep18 step 24/24 lr=2.10e-03 loss=0.8593 ce=0.8310 tc=0.1414 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 18 train_loss=0.8593 train_ce=0.8310 train_tc=0.1414 val_ce=2.1644 epoch_time=2.6s total=0.8m


  Saved best EMA weights to model_tc_fold1.pth


  ep19 step 24/24 lr=1.98e-03 loss=0.8175 ce=0.7892 tc=0.1416 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 19 train_loss=0.8175 train_ce=0.7892 train_tc=0.1416 val_ce=2.0957 epoch_time=2.6s total=0.9m


  Saved best EMA weights to model_tc_fold1.pth


  ep20 step 24/24 lr=1.85e-03 loss=0.7829 ce=0.7554 tc=0.1375 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 20 train_loss=0.7829 train_ce=0.7554 train_tc=0.1375 val_ce=2.0328 epoch_time=2.6s total=0.9m


  Saved best EMA weights to model_tc_fold1.pth


  ep21 step 24/24 lr=1.72e-03 loss=0.7671 ce=0.7398 tc=0.1366 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 21 train_loss=0.7671 train_ce=0.7398 train_tc=0.1366 val_ce=1.9737 epoch_time=2.6s total=1.0m


  Saved best EMA weights to model_tc_fold1.pth


  ep22 step 24/24 lr=1.59e-03 loss=0.7578 ce=0.7311 tc=0.1337 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 22 train_loss=0.7578 train_ce=0.7311 train_tc=0.1337 val_ce=1.9254 epoch_time=2.6s total=1.0m


  Saved best EMA weights to model_tc_fold1.pth


  ep23 step 24/24 lr=1.45e-03 loss=0.7246 ce=0.6986 tc=0.1298 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 23 train_loss=0.7246 train_ce=0.6986 train_tc=0.1298 val_ce=1.8839 epoch_time=2.7s total=1.1m


  Saved best EMA weights to model_tc_fold1.pth


  ep24 step 24/24 lr=1.32e-03 loss=0.7037 ce=0.6783 tc=0.1271 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 24 train_loss=0.7037 train_ce=0.6783 train_tc=0.1271 val_ce=1.8473 epoch_time=2.6s total=1.1m


  Saved best EMA weights to model_tc_fold1.pth


  ep25 step 24/24 lr=1.19e-03 loss=0.6860 ce=0.6605 tc=0.1277 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 25 train_loss=0.6860 train_ce=0.6605 train_tc=0.1277 val_ce=1.8161 epoch_time=2.6s total=1.2m


  Saved best EMA weights to model_tc_fold1.pth


  ep26 step 24/24 lr=1.06e-03 loss=0.6691 ce=0.6448 tc=0.1216 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 26 train_loss=0.6691 train_ce=0.6448 train_tc=0.1216 val_ce=1.7888 epoch_time=2.6s total=1.2m


  Saved best EMA weights to model_tc_fold1.pth


  ep27 step 24/24 lr=9.36e-04 loss=0.6447 ce=0.6205 tc=0.1211 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 27 train_loss=0.6447 train_ce=0.6205 train_tc=0.1211 val_ce=1.7651 epoch_time=2.6s total=1.2m


  Saved best EMA weights to model_tc_fold1.pth


  ep28 step 24/24 lr=8.16e-04 loss=0.6321 ce=0.6086 tc=0.1176 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 28 train_loss=0.6321 train_ce=0.6086 train_tc=0.1176 val_ce=1.7447 epoch_time=2.7s total=1.3m


  Saved best EMA weights to model_tc_fold1.pth


  ep29 step 24/24 lr=7.02e-04 loss=0.6231 ce=0.5998 tc=0.1168 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 29 train_loss=0.6231 train_ce=0.5998 train_tc=0.1168 val_ce=1.7278 epoch_time=2.6s total=1.3m


  Saved best EMA weights to model_tc_fold1.pth


  ep30 step 24/24 lr=5.93e-04 loss=0.6099 ce=0.5873 tc=0.1133 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 30 train_loss=0.6099 train_ce=0.5873 train_tc=0.1133 val_ce=1.7127 epoch_time=2.6s total=1.4m


  Saved best EMA weights to model_tc_fold1.pth


  ep31 step 24/24 lr=4.93e-04 loss=0.6016 ce=0.5792 tc=0.1117 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 31 train_loss=0.6016 train_ce=0.5792 train_tc=0.1117 val_ce=1.7007 epoch_time=2.7s total=1.4m


  Saved best EMA weights to model_tc_fold1.pth


  ep32 step 24/24 lr=4.00e-04 loss=0.5928 ce=0.5709 tc=0.1097 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 32 train_loss=0.5928 train_ce=0.5709 train_tc=0.1097 val_ce=1.6899 epoch_time=2.6s total=1.5m


  Saved best EMA weights to model_tc_fold1.pth


  ep33 step 24/24 lr=3.17e-04 loss=0.5878 ce=0.5656 tc=0.1110 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 33 train_loss=0.5878 train_ce=0.5656 train_tc=0.1110 val_ce=1.6806 epoch_time=2.6s total=1.5m


  Saved best EMA weights to model_tc_fold1.pth


  ep34 step 24/24 lr=2.43e-04 loss=0.5801 ce=0.5586 tc=0.1076 lam_tc=0.200 elapsed=1.9s


[Fold 1 CE+TC] Epoch 34 train_loss=0.5801 train_ce=0.5586 train_tc=0.1076 val_ce=1.6719 epoch_time=2.8s total=1.6m


  Saved best EMA weights to model_tc_fold1.pth


  ep35 step 24/24 lr=1.79e-04 loss=0.5785 ce=0.5568 tc=0.1086 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 35 train_loss=0.5785 train_ce=0.5568 train_tc=0.1086 val_ce=1.6655 epoch_time=2.6s total=1.6m


  Saved best EMA weights to model_tc_fold1.pth


  ep36 step 24/24 lr=1.27e-04 loss=0.5743 ce=0.5529 tc=0.1066 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 36 train_loss=0.5743 train_ce=0.5529 train_tc=0.1066 val_ce=1.6592 epoch_time=2.7s total=1.7m


  Saved best EMA weights to model_tc_fold1.pth


  ep37 step 24/24 lr=8.50e-05 loss=0.5697 ce=0.5481 tc=0.1080 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 37 train_loss=0.5697 train_ce=0.5481 train_tc=0.1080 val_ce=1.6539 epoch_time=2.6s total=1.7m


  Saved best EMA weights to model_tc_fold1.pth


  ep38 step 24/24 lr=5.49e-05 loss=0.5688 ce=0.5473 tc=0.1078 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 38 train_loss=0.5688 train_ce=0.5473 train_tc=0.1078 val_ce=1.6496 epoch_time=2.6s total=1.8m


  Saved best EMA weights to model_tc_fold1.pth


  ep39 step 24/24 lr=3.65e-05 loss=0.5634 ce=0.5420 tc=0.1066 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 39 train_loss=0.5634 train_ce=0.5420 train_tc=0.1066 val_ce=1.6458 epoch_time=2.6s total=1.8m


  Saved best EMA weights to model_tc_fold1.pth


  ep40 step 24/24 lr=3.00e-05 loss=0.5694 ce=0.5478 tc=0.1078 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 40 train_loss=0.5694 train_ce=0.5478 train_tc=0.1078 val_ce=1.6429 epoch_time=2.7s total=1.8m


  Saved best EMA weights to model_tc_fold1.pth


Fold 1 CE+TC done. Best val CE=1.6429. Model -> model_tc_fold1.pth


[Overwrite] Removing existing model_tc_fold1_s1.pth to retrain CE+TC...
=== Train CE+TC fold 1 (model_tc_fold1_s1.pth) : train_n=198 val_n=99 ===


  ep1 step 24/24 lr=5.75e-04 loss=3.8114 ce=3.7953 tc=0.4031 lam_tc=0.040 elapsed=1.9s


[Fold 1 CE+TC] Epoch 1 train_loss=3.8114 train_ce=3.7953 train_tc=0.4031 val_ce=4.9977 epoch_time=2.8s total=0.0m


  Saved best EMA weights to model_tc_fold1_s1.pth


  ep2 step 24/24 lr=1.18e-03 loss=2.7553 ce=2.7420 tc=0.1670 lam_tc=0.080 elapsed=1.8s


[Fold 1 CE+TC] Epoch 2 train_loss=2.7553 train_ce=2.7420 train_tc=0.1670 val_ce=4.7065 epoch_time=2.7s total=0.1m


  Saved best EMA weights to model_tc_fold1_s1.pth


  ep3 step 24/24 lr=1.78e-03 loss=2.2091 ce=2.1853 tc=0.1977 lam_tc=0.120 elapsed=1.8s


[Fold 1 CE+TC] Epoch 3 train_loss=2.2091 train_ce=2.1853 train_tc=0.1977 val_ce=4.4427 epoch_time=2.6s total=0.1m


  Saved best EMA weights to model_tc_fold1_s1.pth


  ep4 step 24/24 lr=2.37e-03 loss=1.8804 ce=1.8527 tc=0.1729 lam_tc=0.160 elapsed=1.8s


[Fold 1 CE+TC] Epoch 4 train_loss=1.8804 train_ce=1.8527 train_tc=0.1729 val_ce=4.2070 epoch_time=2.6s total=0.2m


  Saved best EMA weights to model_tc_fold1_s1.pth


  ep5 step 24/24 lr=2.98e-03 loss=1.6864 ce=1.6538 tc=0.1626 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 5 train_loss=1.6864 train_ce=1.6538 train_tc=0.1626 val_ce=3.9838 epoch_time=2.7s total=0.2m


  Saved best EMA weights to model_tc_fold1_s1.pth


  ep6 step 24/24 lr=2.99e-03 loss=1.5026 ce=1.4698 tc=0.1641 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 6 train_loss=1.5026 train_ce=1.4698 train_tc=0.1641 val_ce=3.7846 epoch_time=2.7s total=0.3m


  Saved best EMA weights to model_tc_fold1_s1.pth


  ep7 step 24/24 lr=2.98e-03 loss=1.4211 ce=1.3889 tc=0.1610 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 7 train_loss=1.4211 train_ce=1.3889 train_tc=0.1610 val_ce=3.5973 epoch_time=2.6s total=0.3m


  Saved best EMA weights to model_tc_fold1_s1.pth


  ep8 step 24/24 lr=2.95e-03 loss=1.3094 ce=1.2776 tc=0.1590 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 8 train_loss=1.3094 train_ce=1.2776 train_tc=0.1590 val_ce=3.4217 epoch_time=2.6s total=0.4m


  Saved best EMA weights to model_tc_fold1_s1.pth


  ep9 step 24/24 lr=2.91e-03 loss=1.1847 ce=1.1529 tc=0.1592 lam_tc=0.200 elapsed=1.9s


[Fold 1 CE+TC] Epoch 9 train_loss=1.1847 train_ce=1.1529 train_tc=0.1592 val_ce=3.2517 epoch_time=2.8s total=0.4m


  Saved best EMA weights to model_tc_fold1_s1.pth


  ep10 step 24/24 lr=2.86e-03 loss=1.1721 ce=1.1405 tc=0.1585 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 10 train_loss=1.1721 train_ce=1.1405 train_tc=0.1585 val_ce=3.0890 epoch_time=2.6s total=0.5m


  Saved best EMA weights to model_tc_fold1_s1.pth


  ep11 step 24/24 lr=2.79e-03 loss=1.1005 ce=1.0697 tc=0.1540 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 11 train_loss=1.1005 train_ce=1.0697 train_tc=0.1540 val_ce=2.9297 epoch_time=2.6s total=0.5m


  Saved best EMA weights to model_tc_fold1_s1.pth


  ep12 step 24/24 lr=2.72e-03 loss=1.0367 ce=1.0057 tc=0.1553 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 12 train_loss=1.0367 train_ce=1.0057 train_tc=0.1553 val_ce=2.7795 epoch_time=2.6s total=0.6m


  Saved best EMA weights to model_tc_fold1_s1.pth


  ep13 step 24/24 lr=2.64e-03 loss=1.0210 ce=0.9899 tc=0.1555 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 13 train_loss=1.0210 train_ce=0.9899 train_tc=0.1555 val_ce=2.6370 epoch_time=2.6s total=0.6m


  Saved best EMA weights to model_tc_fold1_s1.pth


  ep14 step 24/24 lr=2.55e-03 loss=0.9800 ce=0.9494 tc=0.1534 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 14 train_loss=0.9800 train_ce=0.9494 train_tc=0.1534 val_ce=2.5012 epoch_time=2.6s total=0.6m


  Saved best EMA weights to model_tc_fold1_s1.pth


  ep15 step 24/24 lr=2.45e-03 loss=0.9419 ce=0.9121 tc=0.1490 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 15 train_loss=0.9419 train_ce=0.9121 train_tc=0.1490 val_ce=2.3811 epoch_time=2.6s total=0.7m


  Saved best EMA weights to model_tc_fold1_s1.pth


  ep16 step 24/24 lr=2.34e-03 loss=0.8777 ce=0.8490 tc=0.1434 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 16 train_loss=0.8777 train_ce=0.8490 train_tc=0.1434 val_ce=2.2748 epoch_time=2.6s total=0.7m


  Saved best EMA weights to model_tc_fold1_s1.pth


  ep17 step 24/24 lr=2.22e-03 loss=0.8549 ce=0.8255 tc=0.1472 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 17 train_loss=0.8549 train_ce=0.8255 train_tc=0.1472 val_ce=2.1836 epoch_time=2.6s total=0.8m


  Saved best EMA weights to model_tc_fold1_s1.pth


  ep18 step 24/24 lr=2.10e-03 loss=0.8460 ce=0.8174 tc=0.1428 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 18 train_loss=0.8460 train_ce=0.8174 train_tc=0.1428 val_ce=2.1052 epoch_time=2.7s total=0.8m


  Saved best EMA weights to model_tc_fold1_s1.pth


  ep19 step 24/24 lr=1.98e-03 loss=0.8057 ce=0.7774 tc=0.1417 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 19 train_loss=0.8057 train_ce=0.7774 train_tc=0.1417 val_ce=2.0359 epoch_time=2.7s total=0.9m


  Saved best EMA weights to model_tc_fold1_s1.pth


  ep20 step 24/24 lr=1.85e-03 loss=0.7885 ce=0.7603 tc=0.1412 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 20 train_loss=0.7885 train_ce=0.7603 train_tc=0.1412 val_ce=1.9767 epoch_time=2.6s total=0.9m


  Saved best EMA weights to model_tc_fold1_s1.pth


  ep21 step 24/24 lr=1.72e-03 loss=0.7531 ce=0.7249 tc=0.1409 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 21 train_loss=0.7531 train_ce=0.7249 train_tc=0.1409 val_ce=1.9265 epoch_time=2.7s total=1.0m


  Saved best EMA weights to model_tc_fold1_s1.pth


  ep22 step 24/24 lr=1.59e-03 loss=0.7395 ce=0.7127 tc=0.1338 lam_tc=0.200 elapsed=1.9s


[Fold 1 CE+TC] Epoch 22 train_loss=0.7395 train_ce=0.7127 train_tc=0.1338 val_ce=1.8865 epoch_time=2.7s total=1.0m


  Saved best EMA weights to model_tc_fold1_s1.pth


  ep23 step 24/24 lr=1.45e-03 loss=0.7183 ce=0.6923 tc=0.1299 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 23 train_loss=0.7183 train_ce=0.6923 train_tc=0.1299 val_ce=1.8506 epoch_time=2.6s total=1.1m


  Saved best EMA weights to model_tc_fold1_s1.pth


  ep24 step 24/24 lr=1.32e-03 loss=0.6901 ce=0.6647 tc=0.1274 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 24 train_loss=0.6901 train_ce=0.6647 train_tc=0.1274 val_ce=1.8205 epoch_time=2.6s total=1.1m


  Saved best EMA weights to model_tc_fold1_s1.pth


  ep25 step 24/24 lr=1.19e-03 loss=0.6714 ce=0.6461 tc=0.1262 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 25 train_loss=0.6714 train_ce=0.6461 train_tc=0.1262 val_ce=1.7941 epoch_time=2.7s total=1.2m


  Saved best EMA weights to model_tc_fold1_s1.pth


  ep26 step 24/24 lr=1.06e-03 loss=0.6590 ce=0.6344 tc=0.1227 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 26 train_loss=0.6590 train_ce=0.6344 train_tc=0.1227 val_ce=1.7718 epoch_time=2.6s total=1.2m


  Saved best EMA weights to model_tc_fold1_s1.pth


  ep27 step 24/24 lr=9.36e-04 loss=0.6372 ce=0.6136 tc=0.1178 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 27 train_loss=0.6372 train_ce=0.6136 train_tc=0.1178 val_ce=1.7532 epoch_time=2.7s total=1.3m


  Saved best EMA weights to model_tc_fold1_s1.pth


  ep28 step 24/24 lr=8.16e-04 loss=0.6238 ce=0.6004 tc=0.1169 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 28 train_loss=0.6238 train_ce=0.6004 train_tc=0.1169 val_ce=1.7382 epoch_time=2.6s total=1.3m


  Saved best EMA weights to model_tc_fold1_s1.pth


  ep29 step 24/24 lr=7.02e-04 loss=0.6125 ce=0.5899 tc=0.1128 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 29 train_loss=0.6125 train_ce=0.5899 train_tc=0.1128 val_ce=1.7255 epoch_time=2.6s total=1.3m


  Saved best EMA weights to model_tc_fold1_s1.pth


  ep30 step 24/24 lr=5.93e-04 loss=0.6031 ce=0.5803 tc=0.1141 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 30 train_loss=0.6031 train_ce=0.5803 train_tc=0.1141 val_ce=1.7143 epoch_time=2.6s total=1.4m


  Saved best EMA weights to model_tc_fold1_s1.pth


  ep31 step 24/24 lr=4.93e-04 loss=0.5919 ce=0.5696 tc=0.1120 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 31 train_loss=0.5919 train_ce=0.5696 train_tc=0.1120 val_ce=1.7044 epoch_time=2.6s total=1.4m


  Saved best EMA weights to model_tc_fold1_s1.pth


  ep32 step 24/24 lr=4.00e-04 loss=0.5872 ce=0.5649 tc=0.1111 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 32 train_loss=0.5872 train_ce=0.5649 train_tc=0.1111 val_ce=1.6958 epoch_time=2.6s total=1.5m


  Saved best EMA weights to model_tc_fold1_s1.pth


  ep33 step 24/24 lr=3.17e-04 loss=0.5822 ce=0.5600 tc=0.1108 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 33 train_loss=0.5822 train_ce=0.5600 train_tc=0.1108 val_ce=1.6887 epoch_time=2.6s total=1.5m


  Saved best EMA weights to model_tc_fold1_s1.pth


  ep34 step 24/24 lr=2.43e-04 loss=0.5771 ce=0.5554 tc=0.1082 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 34 train_loss=0.5771 train_ce=0.5554 train_tc=0.1082 val_ce=1.6831 epoch_time=2.6s total=1.6m


  Saved best EMA weights to model_tc_fold1_s1.pth


  ep35 step 24/24 lr=1.79e-04 loss=0.5746 ce=0.5529 tc=0.1083 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 35 train_loss=0.5746 train_ce=0.5529 train_tc=0.1083 val_ce=1.6774 epoch_time=2.6s total=1.6m


  Saved best EMA weights to model_tc_fold1_s1.pth


  ep36 step 24/24 lr=1.27e-04 loss=0.5679 ce=0.5462 tc=0.1083 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 36 train_loss=0.5679 train_ce=0.5462 train_tc=0.1083 val_ce=1.6734 epoch_time=2.6s total=1.7m


  Saved best EMA weights to model_tc_fold1_s1.pth


  ep37 step 24/24 lr=8.50e-05 loss=0.5673 ce=0.5460 tc=0.1066 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 37 train_loss=0.5673 train_ce=0.5460 train_tc=0.1066 val_ce=1.6699 epoch_time=2.6s total=1.7m


  Saved best EMA weights to model_tc_fold1_s1.pth


  ep38 step 24/24 lr=5.49e-05 loss=0.5657 ce=0.5443 tc=0.1071 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 38 train_loss=0.5657 train_ce=0.5443 train_tc=0.1071 val_ce=1.6668 epoch_time=2.6s total=1.8m


  Saved best EMA weights to model_tc_fold1_s1.pth


  ep39 step 24/24 lr=3.65e-05 loss=0.5648 ce=0.5434 tc=0.1069 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 39 train_loss=0.5648 train_ce=0.5434 train_tc=0.1069 val_ce=1.6641 epoch_time=2.6s total=1.8m


  Saved best EMA weights to model_tc_fold1_s1.pth


  ep40 step 24/24 lr=3.00e-05 loss=0.5640 ce=0.5426 tc=0.1069 lam_tc=0.200 elapsed=1.8s


[Fold 1 CE+TC] Epoch 40 train_loss=0.5640 train_ce=0.5426 train_tc=0.1069 val_ce=1.6620 epoch_time=2.7s total=1.9m


  Saved best EMA weights to model_tc_fold1_s1.pth


Fold 1 CE+TC done. Best val CE=1.6620. Model -> model_tc_fold1_s1.pth


[Overwrite] Removing existing model_tc_fold2.pth to retrain CE+TC...
=== Train CE+TC fold 2 (model_tc_fold2.pth) : train_n=197 val_n=100 ===


  ep1 step 24/24 lr=5.75e-04 loss=4.0675 ce=4.0550 tc=0.3125 lam_tc=0.040 elapsed=1.8s


[Fold 2 CE+TC] Epoch 1 train_loss=4.0675 train_ce=4.0550 train_tc=0.3125 val_ce=5.8243 epoch_time=2.6s total=0.0m


  Saved best EMA weights to model_tc_fold2.pth


  ep2 step 24/24 lr=1.18e-03 loss=2.5841 ce=2.5703 tc=0.1729 lam_tc=0.080 elapsed=1.8s


[Fold 2 CE+TC] Epoch 2 train_loss=2.5841 train_ce=2.5703 train_tc=0.1729 val_ce=5.4341 epoch_time=2.7s total=0.1m


  Saved best EMA weights to model_tc_fold2.pth


  ep3 step 24/24 lr=1.78e-03 loss=1.9524 ce=1.9289 tc=0.1962 lam_tc=0.120 elapsed=1.8s


[Fold 2 CE+TC] Epoch 3 train_loss=1.9524 train_ce=1.9289 train_tc=0.1962 val_ce=5.0584 epoch_time=2.7s total=0.1m


  Saved best EMA weights to model_tc_fold2.pth


  ep4 step 24/24 lr=2.37e-03 loss=1.6531 ce=1.6272 tc=0.1621 lam_tc=0.160 elapsed=1.8s


[Fold 2 CE+TC] Epoch 4 train_loss=1.6531 train_ce=1.6272 train_tc=0.1621 val_ce=4.7337 epoch_time=2.7s total=0.2m


  Saved best EMA weights to model_tc_fold2.pth


  ep5 step 24/24 lr=2.98e-03 loss=1.4799 ce=1.4499 tc=0.1501 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 5 train_loss=1.4799 train_ce=1.4499 train_tc=0.1501 val_ce=4.4515 epoch_time=2.6s total=0.2m


  Saved best EMA weights to model_tc_fold2.pth


  ep6 step 24/24 lr=2.99e-03 loss=1.4004 ce=1.3724 tc=0.1402 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 6 train_loss=1.4004 train_ce=1.3724 train_tc=0.1402 val_ce=4.2113 epoch_time=2.6s total=0.3m


  Saved best EMA weights to model_tc_fold2.pth


  ep7 step 24/24 lr=2.98e-03 loss=1.2729 ce=1.2456 tc=0.1362 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 7 train_loss=1.2729 train_ce=1.2456 train_tc=0.1362 val_ce=3.9995 epoch_time=2.6s total=0.3m


  Saved best EMA weights to model_tc_fold2.pth


  ep8 step 24/24 lr=2.95e-03 loss=1.1903 ce=1.1633 tc=0.1346 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 8 train_loss=1.1903 train_ce=1.1633 train_tc=0.1346 val_ce=3.8108 epoch_time=2.7s total=0.4m


  Saved best EMA weights to model_tc_fold2.pth


  ep9 step 24/24 lr=2.91e-03 loss=1.1078 ce=1.0819 tc=0.1299 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 9 train_loss=1.1078 train_ce=1.0819 train_tc=0.1299 val_ce=3.6409 epoch_time=2.6s total=0.4m


  Saved best EMA weights to model_tc_fold2.pth


  ep10 step 24/24 lr=2.86e-03 loss=1.0870 ce=1.0605 tc=0.1323 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 10 train_loss=1.0870 train_ce=1.0605 train_tc=0.1323 val_ce=3.4872 epoch_time=2.6s total=0.5m


  Saved best EMA weights to model_tc_fold2.pth


  ep11 step 24/24 lr=2.79e-03 loss=1.0115 ce=0.9867 tc=0.1240 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 11 train_loss=1.0115 train_ce=0.9867 train_tc=0.1240 val_ce=3.3463 epoch_time=2.6s total=0.5m


  Saved best EMA weights to model_tc_fold2.pth


  ep12 step 24/24 lr=2.72e-03 loss=0.9753 ce=0.9500 tc=0.1263 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 12 train_loss=0.9753 train_ce=0.9500 train_tc=0.1263 val_ce=3.2150 epoch_time=2.7s total=0.6m


  Saved best EMA weights to model_tc_fold2.pth


  ep13 step 24/24 lr=2.64e-03 loss=0.9226 ce=0.8982 tc=0.1218 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 13 train_loss=0.9226 train_ce=0.8982 train_tc=0.1218 val_ce=3.0910 epoch_time=2.7s total=0.6m


  Saved best EMA weights to model_tc_fold2.pth


  ep14 step 24/24 lr=2.55e-03 loss=0.9064 ce=0.8825 tc=0.1196 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 14 train_loss=0.9064 train_ce=0.8825 train_tc=0.1196 val_ce=2.9718 epoch_time=2.6s total=0.6m


  Saved best EMA weights to model_tc_fold2.pth


  ep15 step 24/24 lr=2.45e-03 loss=0.8839 ce=0.8607 tc=0.1162 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 15 train_loss=0.8839 train_ce=0.8607 train_tc=0.1162 val_ce=2.8623 epoch_time=2.6s total=0.7m


  Saved best EMA weights to model_tc_fold2.pth


  ep16 step 24/24 lr=2.34e-03 loss=0.8551 ce=0.8329 tc=0.1108 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 16 train_loss=0.8551 train_ce=0.8329 train_tc=0.1108 val_ce=2.7611 epoch_time=2.6s total=0.7m


  Saved best EMA weights to model_tc_fold2.pth


  ep17 step 24/24 lr=2.22e-03 loss=0.8550 ce=0.8332 tc=0.1090 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 17 train_loss=0.8550 train_ce=0.8332 train_tc=0.1090 val_ce=2.6680 epoch_time=2.6s total=0.8m


  Saved best EMA weights to model_tc_fold2.pth


  ep18 step 24/24 lr=2.10e-03 loss=0.8056 ce=0.7846 tc=0.1051 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 18 train_loss=0.8056 train_ce=0.7846 train_tc=0.1051 val_ce=2.5876 epoch_time=2.6s total=0.8m


  Saved best EMA weights to model_tc_fold2.pth


  ep19 step 24/24 lr=1.98e-03 loss=0.7853 ce=0.7647 tc=0.1032 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 19 train_loss=0.7853 train_ce=0.7647 train_tc=0.1032 val_ce=2.5160 epoch_time=2.6s total=0.9m


  Saved best EMA weights to model_tc_fold2.pth


  ep20 step 24/24 lr=1.85e-03 loss=0.7638 ce=0.7432 tc=0.1034 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 20 train_loss=0.7638 train_ce=0.7432 train_tc=0.1034 val_ce=2.4513 epoch_time=2.6s total=0.9m


  Saved best EMA weights to model_tc_fold2.pth


  ep21 step 24/24 lr=1.72e-03 loss=0.7360 ce=0.7164 tc=0.0983 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 21 train_loss=0.7360 train_ce=0.7164 train_tc=0.0983 val_ce=2.3960 epoch_time=2.6s total=1.0m


  Saved best EMA weights to model_tc_fold2.pth


  ep22 step 24/24 lr=1.59e-03 loss=0.7179 ce=0.6988 tc=0.0956 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 22 train_loss=0.7179 train_ce=0.6988 train_tc=0.0956 val_ce=2.3508 epoch_time=2.6s total=1.0m


  Saved best EMA weights to model_tc_fold2.pth


  ep23 step 24/24 lr=1.45e-03 loss=0.7122 ce=0.6936 tc=0.0930 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 23 train_loss=0.7122 train_ce=0.6936 train_tc=0.0930 val_ce=2.3139 epoch_time=2.7s total=1.1m


  Saved best EMA weights to model_tc_fold2.pth


  ep24 step 24/24 lr=1.32e-03 loss=0.6764 ce=0.6587 tc=0.0885 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 24 train_loss=0.6764 train_ce=0.6587 train_tc=0.0885 val_ce=2.2810 epoch_time=2.6s total=1.1m


  Saved best EMA weights to model_tc_fold2.pth


  ep25 step 24/24 lr=1.19e-03 loss=0.6758 ce=0.6578 tc=0.0901 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 25 train_loss=0.6758 train_ce=0.6578 train_tc=0.0901 val_ce=2.2533 epoch_time=2.6s total=1.2m


  Saved best EMA weights to model_tc_fold2.pth


  ep26 step 24/24 lr=1.06e-03 loss=0.6653 ce=0.6479 tc=0.0868 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 26 train_loss=0.6653 train_ce=0.6479 train_tc=0.0868 val_ce=2.2303 epoch_time=2.6s total=1.2m


  Saved best EMA weights to model_tc_fold2.pth


  ep27 step 24/24 lr=9.36e-04 loss=0.6475 ce=0.6304 tc=0.0856 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 27 train_loss=0.6475 train_ce=0.6304 train_tc=0.0856 val_ce=2.2136 epoch_time=2.6s total=1.2m


  Saved best EMA weights to model_tc_fold2.pth


  ep28 step 24/24 lr=8.16e-04 loss=0.6347 ce=0.6179 tc=0.0839 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 28 train_loss=0.6347 train_ce=0.6179 train_tc=0.0839 val_ce=2.1993 epoch_time=2.6s total=1.3m


  Saved best EMA weights to model_tc_fold2.pth


  ep29 step 24/24 lr=7.02e-04 loss=0.6257 ce=0.6091 tc=0.0831 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 29 train_loss=0.6257 train_ce=0.6091 train_tc=0.0831 val_ce=2.1884 epoch_time=2.6s total=1.3m


  Saved best EMA weights to model_tc_fold2.pth


  ep30 step 24/24 lr=5.93e-04 loss=0.6210 ce=0.6046 tc=0.0819 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 30 train_loss=0.6210 train_ce=0.6046 train_tc=0.0819 val_ce=2.1795 epoch_time=2.7s total=1.4m


  Saved best EMA weights to model_tc_fold2.pth


  ep31 step 24/24 lr=4.93e-04 loss=0.6104 ce=0.5943 tc=0.0806 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 31 train_loss=0.6104 train_ce=0.5943 train_tc=0.0806 val_ce=2.1708 epoch_time=2.6s total=1.4m


  Saved best EMA weights to model_tc_fold2.pth


  ep32 step 24/24 lr=4.00e-04 loss=0.6076 ce=0.5913 tc=0.0814 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 32 train_loss=0.6076 train_ce=0.5913 train_tc=0.0814 val_ce=2.1650 epoch_time=2.6s total=1.5m


  Saved best EMA weights to model_tc_fold2.pth


  ep33 step 24/24 lr=3.17e-04 loss=0.6059 ce=0.5896 tc=0.0811 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 33 train_loss=0.6059 train_ce=0.5896 train_tc=0.0811 val_ce=2.1609 epoch_time=2.6s total=1.5m


  Saved best EMA weights to model_tc_fold2.pth


  ep34 step 24/24 lr=2.43e-04 loss=0.5982 ce=0.5823 tc=0.0795 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 34 train_loss=0.5982 train_ce=0.5823 train_tc=0.0795 val_ce=2.1567 epoch_time=2.6s total=1.6m


  Saved best EMA weights to model_tc_fold2.pth


  ep35 step 24/24 lr=1.79e-04 loss=0.5973 ce=0.5813 tc=0.0805 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 35 train_loss=0.5973 train_ce=0.5813 train_tc=0.0805 val_ce=2.1535 epoch_time=2.6s total=1.6m


  Saved best EMA weights to model_tc_fold2.pth


  ep36 step 24/24 lr=1.27e-04 loss=0.5946 ce=0.5788 tc=0.0791 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 36 train_loss=0.5946 train_ce=0.5788 train_tc=0.0791 val_ce=2.1504 epoch_time=2.6s total=1.7m


  Saved best EMA weights to model_tc_fold2.pth


  ep37 step 24/24 lr=8.50e-05 loss=0.5897 ce=0.5740 tc=0.0786 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 37 train_loss=0.5897 train_ce=0.5740 train_tc=0.0786 val_ce=2.1485 epoch_time=2.6s total=1.7m


  Saved best EMA weights to model_tc_fold2.pth


  ep38 step 24/24 lr=5.49e-05 loss=0.5904 ce=0.5748 tc=0.0781 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 38 train_loss=0.5904 train_ce=0.5748 train_tc=0.0781 val_ce=2.1471 epoch_time=2.7s total=1.8m


  Saved best EMA weights to model_tc_fold2.pth


  ep39 step 24/24 lr=3.65e-05 loss=0.5902 ce=0.5745 tc=0.0784 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 39 train_loss=0.5902 train_ce=0.5745 train_tc=0.0784 val_ce=2.1451 epoch_time=2.7s total=1.8m


  Saved best EMA weights to model_tc_fold2.pth


  ep40 step 24/24 lr=3.00e-05 loss=0.5914 ce=0.5757 tc=0.0789 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 40 train_loss=0.5914 train_ce=0.5757 train_tc=0.0789 val_ce=2.1440 epoch_time=2.7s total=1.8m


  Saved best EMA weights to model_tc_fold2.pth


Fold 2 CE+TC done. Best val CE=2.1440. Model -> model_tc_fold2.pth


[Overwrite] Removing existing model_tc_fold2_s1.pth to retrain CE+TC...
=== Train CE+TC fold 2 (model_tc_fold2_s1.pth) : train_n=197 val_n=100 ===


  ep1 step 24/24 lr=5.75e-04 loss=3.8922 ce=3.8774 tc=0.3689 lam_tc=0.040 elapsed=1.9s


[Fold 2 CE+TC] Epoch 1 train_loss=3.8922 train_ce=3.8774 train_tc=0.3689 val_ce=5.1073 epoch_time=2.7s total=0.0m


  Saved best EMA weights to model_tc_fold2_s1.pth


  ep2 step 24/24 lr=1.18e-03 loss=2.5656 ce=2.5520 tc=0.1695 lam_tc=0.080 elapsed=1.8s


[Fold 2 CE+TC] Epoch 2 train_loss=2.5656 train_ce=2.5520 train_tc=0.1695 val_ce=4.8708 epoch_time=2.7s total=0.1m


  Saved best EMA weights to model_tc_fold2_s1.pth


  ep3 step 24/24 lr=1.78e-03 loss=1.9598 ce=1.9348 tc=0.2081 lam_tc=0.120 elapsed=1.8s


[Fold 2 CE+TC] Epoch 3 train_loss=1.9598 train_ce=1.9348 train_tc=0.2081 val_ce=4.6621 epoch_time=2.6s total=0.1m


  Saved best EMA weights to model_tc_fold2_s1.pth


  ep4 step 24/24 lr=2.37e-03 loss=1.6334 ce=1.6080 tc=0.1588 lam_tc=0.160 elapsed=1.8s


[Fold 2 CE+TC] Epoch 4 train_loss=1.6334 train_ce=1.6080 train_tc=0.1588 val_ce=4.4672 epoch_time=2.6s total=0.2m


  Saved best EMA weights to model_tc_fold2_s1.pth


  ep5 step 24/24 lr=2.98e-03 loss=1.5173 ce=1.4876 tc=0.1485 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 5 train_loss=1.5173 train_ce=1.4876 train_tc=0.1485 val_ce=4.2858 epoch_time=2.7s total=0.2m


  Saved best EMA weights to model_tc_fold2_s1.pth


  ep6 step 24/24 lr=2.99e-03 loss=1.3984 ce=1.3709 tc=0.1376 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 6 train_loss=1.3984 train_ce=1.3709 train_tc=0.1376 val_ce=4.1042 epoch_time=2.6s total=0.3m


  Saved best EMA weights to model_tc_fold2_s1.pth


  ep7 step 24/24 lr=2.98e-03 loss=1.3002 ce=1.2737 tc=0.1323 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 7 train_loss=1.3002 train_ce=1.2737 train_tc=0.1323 val_ce=3.9278 epoch_time=2.7s total=0.3m


  Saved best EMA weights to model_tc_fold2_s1.pth


  ep8 step 24/24 lr=2.95e-03 loss=1.1770 ce=1.1508 tc=0.1311 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 8 train_loss=1.1770 train_ce=1.1508 train_tc=0.1311 val_ce=3.7659 epoch_time=2.7s total=0.4m


  Saved best EMA weights to model_tc_fold2_s1.pth


  ep9 step 24/24 lr=2.91e-03 loss=1.1144 ce=1.0878 tc=0.1332 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 9 train_loss=1.1144 train_ce=1.0878 train_tc=0.1332 val_ce=3.6112 epoch_time=2.7s total=0.4m


  Saved best EMA weights to model_tc_fold2_s1.pth


  ep10 step 24/24 lr=2.86e-03 loss=1.0503 ce=1.0249 tc=0.1267 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 10 train_loss=1.0503 train_ce=1.0249 train_tc=0.1267 val_ce=3.4652 epoch_time=2.6s total=0.5m


  Saved best EMA weights to model_tc_fold2_s1.pth


  ep11 step 24/24 lr=2.79e-03 loss=1.0053 ce=0.9806 tc=0.1237 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 11 train_loss=1.0053 train_ce=0.9806 train_tc=0.1237 val_ce=3.3314 epoch_time=2.6s total=0.5m


  Saved best EMA weights to model_tc_fold2_s1.pth


  ep12 step 24/24 lr=2.72e-03 loss=0.9753 ce=0.9505 tc=0.1239 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 12 train_loss=0.9753 train_ce=0.9505 train_tc=0.1239 val_ce=3.2017 epoch_time=2.7s total=0.6m


  Saved best EMA weights to model_tc_fold2_s1.pth


  ep13 step 24/24 lr=2.64e-03 loss=0.9476 ce=0.9236 tc=0.1203 lam_tc=0.200 elapsed=1.9s


[Fold 2 CE+TC] Epoch 13 train_loss=0.9476 train_ce=0.9236 train_tc=0.1203 val_ce=3.0752 epoch_time=2.7s total=0.6m


  Saved best EMA weights to model_tc_fold2_s1.pth


  ep14 step 24/24 lr=2.55e-03 loss=0.9213 ce=0.8969 tc=0.1219 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 14 train_loss=0.9213 train_ce=0.8969 train_tc=0.1219 val_ce=2.9584 epoch_time=2.6s total=0.6m


  Saved best EMA weights to model_tc_fold2_s1.pth


  ep15 step 24/24 lr=2.45e-03 loss=0.8674 ce=0.8445 tc=0.1146 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 15 train_loss=0.8674 train_ce=0.8445 train_tc=0.1146 val_ce=2.8497 epoch_time=2.7s total=0.7m


  Saved best EMA weights to model_tc_fold2_s1.pth


  ep16 step 24/24 lr=2.34e-03 loss=0.8572 ce=0.8351 tc=0.1105 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 16 train_loss=0.8572 train_ce=0.8351 train_tc=0.1105 val_ce=2.7498 epoch_time=2.6s total=0.7m


  Saved best EMA weights to model_tc_fold2_s1.pth


  ep17 step 24/24 lr=2.22e-03 loss=0.8110 ce=0.7892 tc=0.1089 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 17 train_loss=0.8110 train_ce=0.7892 train_tc=0.1089 val_ce=2.6620 epoch_time=2.6s total=0.8m


  Saved best EMA weights to model_tc_fold2_s1.pth


  ep18 step 24/24 lr=2.10e-03 loss=0.7908 ce=0.7699 tc=0.1043 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 18 train_loss=0.7908 train_ce=0.7699 train_tc=0.1043 val_ce=2.5852 epoch_time=2.6s total=0.8m


  Saved best EMA weights to model_tc_fold2_s1.pth


  ep19 step 24/24 lr=1.98e-03 loss=0.7805 ce=0.7596 tc=0.1044 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 19 train_loss=0.7805 train_ce=0.7596 train_tc=0.1044 val_ce=2.5168 epoch_time=2.7s total=0.9m


  Saved best EMA weights to model_tc_fold2_s1.pth


  ep20 step 24/24 lr=1.85e-03 loss=0.7524 ce=0.7323 tc=0.1004 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 20 train_loss=0.7524 train_ce=0.7323 train_tc=0.1004 val_ce=2.4571 epoch_time=2.6s total=0.9m


  Saved best EMA weights to model_tc_fold2_s1.pth


  ep21 step 24/24 lr=1.72e-03 loss=0.7340 ce=0.7144 tc=0.0981 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 21 train_loss=0.7340 train_ce=0.7144 train_tc=0.0981 val_ce=2.4070 epoch_time=2.6s total=1.0m


  Saved best EMA weights to model_tc_fold2_s1.pth


  ep22 step 24/24 lr=1.59e-03 loss=0.7160 ce=0.6962 tc=0.0988 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 22 train_loss=0.7160 train_ce=0.6962 train_tc=0.0988 val_ce=2.3650 epoch_time=2.6s total=1.0m


  Saved best EMA weights to model_tc_fold2_s1.pth


  ep23 step 24/24 lr=1.45e-03 loss=0.6998 ce=0.6807 tc=0.0958 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 23 train_loss=0.6998 train_ce=0.6807 train_tc=0.0958 val_ce=2.3286 epoch_time=2.6s total=1.1m


  Saved best EMA weights to model_tc_fold2_s1.pth


  ep24 step 24/24 lr=1.32e-03 loss=0.6769 ce=0.6587 tc=0.0911 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 24 train_loss=0.6769 train_ce=0.6587 train_tc=0.0911 val_ce=2.2974 epoch_time=2.6s total=1.1m


  Saved best EMA weights to model_tc_fold2_s1.pth


  ep25 step 24/24 lr=1.19e-03 loss=0.6718 ce=0.6543 tc=0.0873 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 25 train_loss=0.6718 train_ce=0.6543 train_tc=0.0873 val_ce=2.2716 epoch_time=2.6s total=1.2m


  Saved best EMA weights to model_tc_fold2_s1.pth


  ep26 step 24/24 lr=1.06e-03 loss=0.6579 ce=0.6405 tc=0.0873 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 26 train_loss=0.6579 train_ce=0.6405 train_tc=0.0873 val_ce=2.2499 epoch_time=2.6s total=1.2m


  Saved best EMA weights to model_tc_fold2_s1.pth


  ep27 step 24/24 lr=9.36e-04 loss=0.6394 ce=0.6221 tc=0.0868 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 27 train_loss=0.6394 train_ce=0.6221 train_tc=0.0868 val_ce=2.2319 epoch_time=2.6s total=1.3m


  Saved best EMA weights to model_tc_fold2_s1.pth


  ep28 step 24/24 lr=8.16e-04 loss=0.6305 ce=0.6136 tc=0.0842 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 28 train_loss=0.6305 train_ce=0.6136 train_tc=0.0842 val_ce=2.2157 epoch_time=2.6s total=1.3m


  Saved best EMA weights to model_tc_fold2_s1.pth


  ep29 step 24/24 lr=7.02e-04 loss=0.6282 ce=0.6119 tc=0.0818 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 29 train_loss=0.6282 train_ce=0.6119 train_tc=0.0818 val_ce=2.2028 epoch_time=2.6s total=1.3m


  Saved best EMA weights to model_tc_fold2_s1.pth


  ep30 step 24/24 lr=5.93e-04 loss=0.6197 ce=0.6033 tc=0.0819 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 30 train_loss=0.6197 train_ce=0.6033 train_tc=0.0819 val_ce=2.1902 epoch_time=2.6s total=1.4m


  Saved best EMA weights to model_tc_fold2_s1.pth


  ep31 step 24/24 lr=4.93e-04 loss=0.6070 ce=0.5909 tc=0.0802 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 31 train_loss=0.6070 train_ce=0.5909 train_tc=0.0802 val_ce=2.1807 epoch_time=2.6s total=1.4m


  Saved best EMA weights to model_tc_fold2_s1.pth


  ep32 step 24/24 lr=4.00e-04 loss=0.6060 ce=0.5900 tc=0.0801 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 32 train_loss=0.6060 train_ce=0.5900 train_tc=0.0801 val_ce=2.1719 epoch_time=2.6s total=1.5m


  Saved best EMA weights to model_tc_fold2_s1.pth


  ep33 step 24/24 lr=3.17e-04 loss=0.6050 ce=0.5888 tc=0.0810 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 33 train_loss=0.6050 train_ce=0.5888 train_tc=0.0810 val_ce=2.1644 epoch_time=2.7s total=1.5m


  Saved best EMA weights to model_tc_fold2_s1.pth


  ep34 step 24/24 lr=2.43e-04 loss=0.5971 ce=0.5813 tc=0.0788 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 34 train_loss=0.5971 train_ce=0.5813 train_tc=0.0788 val_ce=2.1580 epoch_time=2.7s total=1.6m


  Saved best EMA weights to model_tc_fold2_s1.pth


  ep35 step 24/24 lr=1.79e-04 loss=0.5925 ce=0.5768 tc=0.0788 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 35 train_loss=0.5925 train_ce=0.5768 train_tc=0.0788 val_ce=2.1522 epoch_time=2.6s total=1.6m


  Saved best EMA weights to model_tc_fold2_s1.pth


  ep36 step 24/24 lr=1.27e-04 loss=0.5929 ce=0.5770 tc=0.0792 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 36 train_loss=0.5929 train_ce=0.5770 train_tc=0.0792 val_ce=2.1474 epoch_time=2.6s total=1.7m


  Saved best EMA weights to model_tc_fold2_s1.pth


  ep37 step 24/24 lr=8.50e-05 loss=0.5907 ce=0.5751 tc=0.0780 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 37 train_loss=0.5907 train_ce=0.5751 train_tc=0.0780 val_ce=2.1430 epoch_time=2.6s total=1.7m


  Saved best EMA weights to model_tc_fold2_s1.pth


  ep38 step 24/24 lr=5.49e-05 loss=0.5880 ce=0.5724 tc=0.0779 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 38 train_loss=0.5880 train_ce=0.5724 train_tc=0.0779 val_ce=2.1397 epoch_time=2.7s total=1.8m


  Saved best EMA weights to model_tc_fold2_s1.pth


  ep39 step 24/24 lr=3.65e-05 loss=0.5874 ce=0.5719 tc=0.0776 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 39 train_loss=0.5874 train_ce=0.5719 train_tc=0.0776 val_ce=2.1373 epoch_time=2.6s total=1.8m


  Saved best EMA weights to model_tc_fold2_s1.pth


  ep40 step 24/24 lr=3.00e-05 loss=0.5916 ce=0.5760 tc=0.0781 lam_tc=0.200 elapsed=1.8s


[Fold 2 CE+TC] Epoch 40 train_loss=0.5916 train_ce=0.5760 train_tc=0.0781 val_ce=2.1350 epoch_time=2.6s total=1.9m


  Saved best EMA weights to model_tc_fold2_s1.pth


Fold 2 CE+TC done. Best val CE=2.1350. Model -> model_tc_fold2_s1.pth


All folds CE+TC processed.


In [9]:
# Cache OOF for TC models, sweep decoder on TC6 and CE6+TC6, and build test submission
import os, json, time, math, gc
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('CUDA:', torch.cuda.is_available(), flush=True)

feat_tr_dir = Path('features3d_v2')/'train'
feat_te_dir = Path('features3d_v2')/'test'
lab_tr_dir  = Path('labels3d_v2')/'train'
probs_cache = Path('probs_cache'); probs_cache.mkdir(exist_ok=True)
folds = json.load(open('folds_archive_cv.json','r'))
train_df = pd.read_csv('training.csv')
id2seq = {int(r.Id): [int(x) for x in str(r.Sequence).strip().split()] for _, r in train_df.iterrows()}

def load_feat(split, sid:int):
    d = np.load((feat_tr_dir if split=='train' else feat_te_dir)/f"{sid}.npz");
    return d['X'].astype(np.float32)

def compute_fold_scaler(id_list):
    n = 0; mean=None; M2=None
    for sid in id_list:
        X = load_feat('train', int(sid)); n_i = X.shape[0]
        if mean is None:
            mean = X.mean(axis=0); M2 = ((X - mean)**2).sum(axis=0); n = n_i
        else:
            mean_i = X.mean(axis=0); n_new = n + n_i; delta = mean_i - mean
            mean = mean + delta * (n_i / max(1, n_new))
            M2 = M2 + ((X - mean_i)**2).sum(axis=0) + (delta**2) * (n * n_i / max(1, n_new))
            n = n_new
    var = M2 / max(1, (n - 1)); std = np.sqrt(np.clip(var, 1e-8, None))
    return mean.astype(np.float32), std.astype(np.float32)

def compute_class_median_durations_for_ids(id_list):
    dur_by_c = {c: [] for c in range(1,21)}
    for sid in id_list:
        y = np.load(lab_tr_dir/f"{sid}.npy").astype(np.int16)
        for c in range(1,21):
            cnt = int((y==c).sum());
            if cnt>0: dur_by_c[c].append(cnt)
    med = {}
    for c in range(1,21):
        m = np.median(dur_by_c[c]) if len(dur_by_c[c])>0 else 13
        med[c] = int(np.clip(m, 9, 25))
    return med

class DilatedResBlock(nn.Module):
    def __init__(self, ch, dilation, drop=0.35, groups=8, k=3):
        super().__init__();
        self.conv1 = nn.Conv1d(ch, ch, k, padding=dilation, dilation=dilation)
        self.gn1 = nn.GroupNorm(groups, ch); self.drop = nn.Dropout(drop)
        self.conv2 = nn.Conv1d(ch, ch, 1); self.gn2 = nn.GroupNorm(groups, ch)
    def forward(self, x):
        h = self.conv1(x); h = self.gn1(h); h = F.relu(h, inplace=True); h = self.drop(h);
        h = self.conv2(h); h = self.gn2(h); h = F.relu(h, inplace=True);
        return x + h

class DilatedTCN(nn.Module):
    def __init__(self, d_in, channels=128, layers=12, num_classes=21, dropout=0.35):
        super().__init__();
        self.inp = nn.Conv1d(d_in, channels, 1); blocks=[]; dil=1
        for _ in range(layers):
            blocks.append(DilatedResBlock(channels, dil, drop=dropout, groups=8, k=3));
            dil = min(dil*2, 512)
        self.blocks = nn.ModuleList(blocks); self.head = nn.Conv1d(channels, num_classes, 1)
    def forward(self, x_b_t_d):
        x = x_b_t_d.transpose(1,2); h = self.inp(x);
        for b in self.blocks: h = b(h);
        out = self.head(h); return out.transpose(1,2)

def time_warp_probs(p_t_c: torch.Tensor, factor: float) -> torch.Tensor:
    T, C = p_t_c.shape; tgt_len = max(1, int(round(T*factor)));
    x = p_t_c.T.unsqueeze(0);
    y = F.interpolate(x, size=tgt_len, mode='linear', align_corners=False);
    y2 = F.interpolate(y, size=T, mode='linear', align_corners=False)[0].T;
    return y2 / (y2.sum(dim=-1, keepdim=True) + 1e-8)

def apply_tta_timewarp(p_t_c: torch.Tensor, factors=(0.9,1.0,1.1)) -> torch.Tensor:
    acc=None
    for s in factors:
        ps = time_warp_probs(p_t_c, s);
        acc = ps if acc is None else (acc + ps)
    out = acc / float(len(factors))
    return out / (out.sum(dim=-1, keepdim=True) + 1e-8)

def avg_pool_probs(p_t_c: torch.Tensor, k:int) -> torch.Tensor:
    x = p_t_c.unsqueeze(0).transpose(1,2);
    y = F.avg_pool1d(x, kernel_size=k, stride=1, padding=k//2);
    return y.transpose(1,2).squeeze(0)

def duration_integral_single(p_t: torch.Tensor, k:int) -> torch.Tensor:
    k = max(1, int(k)); x = p_t.view(1,1,-1);
    w = torch.ones(1,1,k, device=p_t.device, dtype=p_t.dtype) / float(k);
    pad = (k-1)//2; y = F.conv1d(x, w, padding=pad).view(-1);
    T = p_t.shape[0]
    if y.shape[0] < T: y = F.pad(y, (0, T - y.shape[0]))
    elif y.shape[0] > T: y = y[:T]
    return y

def refine_com(p: torch.Tensor, t_star:int, w:int=5) -> float:
    T = p.shape[0]; a = max(0, t_star - w); b = min(T-1, t_star + w);
    idx = torch.arange(a, b+1, device=p.device, dtype=p.dtype);
    seg = p[a:b+1]; s = seg.sum() + 1e-8;
    return float(((idx * seg).sum() / s).item())

def topk_candidates_per_class(p_s: torch.Tensor, scores: torch.Tensor, c: int, k_c: int, temp: float, K: int = 3):
    T = p_s.shape[0]; s = scores[:, c]; vals, idxs = torch.topk(s, k=min(K, T));
    cand = []; w_com = max(5, k_c//3); radius = max(10, k_c//2)
    for v, t_star in zip(vals.tolist(), idxs.tolist()):
        t_ref = refine_com(p_s[:,c], int(t_star), w=w_com);
        t_idx = int(round(max(0, min(t_ref, T-1))));
        local_mean = p_s[max(0,t_idx-radius):min(T,t_idx+radius+1), c].mean().item();
        pooled_at_ref = p_s[t_idx, c].item();
        cand.append((t_ref, float(v), float(local_mean), float(pooled_at_ref)))
    cand.sort(key=lambda x: (x[0], -x[1], -x[2], -x[3]))
    return cand

def decode_peaks_improved(p_t_c: torch.Tensor, med_k: dict, gamma: float = 1.0, pool_k=13, temp=0.9, min_sep=2, K=3, k_delta=4):
    if temp != 1.0:
        p_t_c = (p_t_c ** (1.0/temp)); p_t_c = p_t_c / (p_t_c.sum(dim=-1, keepdim=True) + 1e-8)
    p_s = avg_pool_probs(p_t_c, k=pool_k); T, C = p_s.shape
    scores = torch.zeros_like(p_s); ks=[13]*C
    for c in range(C):
        if c == 0:
            scores[:, c] = p_s[:, c]; ks[c]=13; continue
        base_k = med_k.get(c, 13)
        k_c = int(np.clip(round(gamma * base_k), 9, 25))
        if k_c % 2 == 0: k_c = min(25, k_c + 1)
        ks[c] = k_c
        ks_multi = sorted(set([int(np.clip(k_c - k_delta, 9, 25)), k_c, int(np.clip(k_c + k_delta, 9, 25))]))
        ks_multi = [k if (k % 2)==1 else min(25, k+1) for k in ks_multi]
        acc=None
        for k in ks_multi:
            di = duration_integral_single(p_s[:, c], k=k).unsqueeze(1)
            acc = di if acc is None else (acc + di)
        scores[:, c] = (acc / float(len(ks_multi))).squeeze(1)
    all_cand = []
    for c in range(1,21):
        cand = topk_candidates_per_class(p_s, scores, c, ks[c], temp=temp, K=K)
        if len(cand)==0:
            all_cand.append((c, 0.0, -1e9, -1e9, -1e9))
        else:
            for (t_ref, v, lm, pr) in cand: all_cand.append((c, t_ref, v, lm, pr))
    all_cand.sort(key=lambda x: (x[1], -x[2], -x[3], -x[4]))
    chosen = {}; last_t = -1e9
    for c, t_ref, v, lm, pr in all_cand:
        if c in chosen: continue
        if t_ref <= last_t + float(min_sep):
            t_ref = last_t + float(min_sep)
        last_t = min(t_ref, float(T-1))
        chosen[c] = (last_t, v, lm, pr)
        if len(chosen)==20: break
    if len(chosen) < 20:
        missing = [c for c in range(1,21) if c not in chosen]
        t = max(last_t, 0.0)
        for c in missing:
            t = min(t + float(min_sep), float(T-1))
            chosen[c] = (t, -1e9, -1e9, -1e9)
    seq = [c for c,_ in sorted(chosen.items(), key=lambda kv: kv[1][0])]
    return seq

def gamma_with_length(gamma_cv: float, T: int, med_k: dict):
    L_est = float(sum(med_k.get(c,13) for c in range(1,21)))
    if L_est <= 0: return gamma_cv
    ratio = float(T) / L_est
    gamma_s = float(np.clip(ratio, 0.85, 1.15))
    return float(gamma_cv * gamma_s)

# 1) Cache OOF probs for TC models (per fold) with TTA; save {sid}_tc.npy and {sid}_tc_s1.npy
def cache_fold_val_probs_tc(fold, seed_suffix: str):
    fold_idx = int(fold['fold'])
    ckpt = Path(f"model_tc_fold{fold_idx}{seed_suffix}.pth")
    assert ckpt.exists(), f"Missing {ckpt}; ensure TC training is finished"
    D_in = np.load(next(iter((feat_tr_dir).glob('*.npz'))))['X'].shape[1]
    model = DilatedTCN(d_in=D_in, channels=128, layers=12, num_classes=21, dropout=0.35).to(device)
    model.load_state_dict(torch.load(ckpt, map_location=device)); model.eval()
    mean,std = compute_fold_scaler(fold['train_ids'])
    mean_t = torch.from_numpy(mean).float().to(device); std_t = torch.from_numpy(std).float().to(device)
    vids = fold['val_ids']; t0=time.time()
    for i, sid in enumerate(vids, 1):
        sid=int(sid); outp = probs_cache/f"{sid}_tc{seed_suffix}.npy"
        # Force refresh after retrain: remove existing cache if present
        if outp.exists():
            try: outp.unlink()
            except Exception as e: pass
        X = load_feat('train', sid); xb = torch.from_numpy(X).float().to(device);
        xb = (xb - mean_t) / (std_t + 1e-6); xb = xb.unsqueeze(0)
        with torch.no_grad(), torch.amp.autocast('cuda' if device.type=='cuda' else 'cpu'):
            probs = model(xb)[0].softmax(dim=-1)
            probs = apply_tta_timewarp(probs, factors=(0.9,1.0,1.1))
        np.save(outp, probs.cpu().numpy())
        if (i%25)==0 or i==len(vids):
            print(f"  [fold {fold_idx} TC{seed_suffix}] cached {i}/{len(vids)} elapsed {time.time()-t0:.1f}s", flush=True)

print('Caching OOF probs for TC models (both seeds)...', flush=True)
for f in folds:
    for suf in ['', '_s1']:
        ckpt = Path(f"model_tc_fold{int(f['fold'])}{suf}.pth")
        if not ckpt.exists():
            print(f"  [skip fold {f['fold']}{suf}] checkpoint not found yet")
            continue
        cache_fold_val_probs_tc(f, suf)

# 2) Evaluate decoder on TC6 and CE6+TC6 averaged OOF
def load_cached_prob(path):
    return torch.from_numpy(np.load(path)).to(device)

def load_oof_avg(seed_paths):
    ps = [load_cached_prob(p) for p in seed_paths]
    p = sum(ps) / float(len(ps))
    return p / (p.sum(dim=-1, keepdim=True) + 1e-8)

def eval_cfg_on_fold_with_loader(fold, loader_fn, pool_k, temp, gamma, sep):
    fi = int(fold['fold'])
    med_k = compute_class_median_durations_for_ids(fold['train_ids'])
    vids = fold['val_ids']; tot=0; cnt=0
    for sid in vids:
        sid=int(sid); p = loader_fn(sid); T = p.shape[0]
        gamma_eff = gamma_with_length(gamma, T, med_k)
        seq = decode_peaks_improved(p, med_k=med_k, gamma=gamma_eff, pool_k=pool_k, temp=temp, min_sep=sep, K=3, k_delta=4)
        tot += levenshtein(seq, id2seq[sid]); cnt += 1
    return tot/max(cnt,1)

def levenshtein(a,b):
    n,m=len(a),len(b)
    if n==0: return m
    if m==0: return n
    dp=list(range(m+1))
    for i in range(1,n+1):
        prev=dp[0]; dp[0]=i; ai=a[i-1]
        for j in range(1,m+1):
            tmp=dp[j]; dp[j]=min(dp[j]+1, dp[j-1]+1, prev + (0 if ai==b[j-1] else 1)); prev=tmp
    return dp[m]

# Loader for TC6 averaged OOF
def loader_tc6(sid:int):
    paths = [probs_cache/f"{sid}_tc.npy", probs_cache/f"{sid}_tc_s1.npy"]
    paths = [str(p) for p in paths if Path(p).exists()]
    assert len(paths)>0, f"Missing TC OOF for sid={sid}"
    return load_oof_avg(paths)

# Loader for CE6+TC6 averaged OOF (equal weights across 12)
def loader_ce_tc_12(sid:int):
    paths = []
    # CE OOF (seed0 + seed1)
    p0 = probs_cache/f"{sid}_ce_new.npy"; p1 = probs_cache/f"{sid}_ce_new_s1.npy"
    if p0.exists(): paths.append(str(p0))
    if p1.exists(): paths.append(str(p1))
    # TC OOF (seed0 + seed1)
    pt0 = probs_cache/f"{sid}_tc.npy"; pt1 = probs_cache/f"{sid}_tc_s1.npy"
    if pt0.exists(): paths.append(str(pt0))
    if pt1.exists(): paths.append(str(pt1))
    assert len(paths)>0, f"Missing CE/TC OOF for sid={sid}"
    return load_oof_avg(paths)

pool_ks=[11,13,15]; temps=[0.90,0.95,1.00]; gammas=[0.90,0.95,0.975,1.00,1.025,1.05]; seps=[3,4,5]

def sweep_with_loader(name, loader_fn):
    print(f'Sweeping {name}...', flush=True)
    results=[]
    for pool_k in pool_ks:
        for temp in temps:
            for gamma in gammas:
                for sep in seps:
                    per_fold=[]
                    ok=True
                    for f in folds:
                        try:
                            lev = eval_cfg_on_fold_with_loader(f, loader_fn, pool_k, temp, gamma, sep)
                            per_fold.append(lev)
                        except AssertionError as e:
                            ok=False; break
                    if ok and len(per_fold)==len(folds):
                        results.append((np.mean(per_fold), np.max(per_fold), {'pool_k':pool_k,'temp':temp,'gamma':gamma,'sep':sep}))
    results.sort(key=lambda x: (x[1], x[0]))
    return results

res_tc = sweep_with_loader('TC6 (avg OOF)', loader_tc6)
if len(res_tc)>0:
    print('Top TC6 (mean, worst, cfg):')
    for r in res_tc[:5]: print(r)
    pd.DataFrame([{'mean':m,'worst':w, **cfg} for m,w,cfg in res_tc]).to_csv('cv_sweep_tc_6x_improved.csv', index=False)
else:
    print('TC6 sweep skipped (missing OOF).')

res_ce_tc = sweep_with_loader('CE6+TC6 (avg OOF)', loader_ce_tc_12)
if len(res_ce_tc)>0:
    print('Top CE6+TC6 (mean, worst, cfg):')
    for r in res_ce_tc[:5]: print(r)
    pd.DataFrame([{'mean':m,'worst':w, **cfg} for m,w,cfg in res_ce_tc]).to_csv('cv_sweep_ce_tc_12x_improved.csv', index=False)
else:
    print('CE6+TC6 sweep skipped (missing OOF).')

# 3) Build test submission using the best between TC6 and CE6+TC6 by worst-fold then mean
def choose_best_cfg():
    cand=[]
    if Path('cv_sweep_tc_6x_improved.csv').exists():
        df = pd.read_csv('cv_sweep_tc_6x_improved.csv').sort_values(['worst','mean'])
        if len(df): cand.append(('tc6', df.iloc[0].to_dict()))
    if Path('cv_sweep_ce_tc_12x_improved.csv').exists():
        df = pd.read_csv('cv_sweep_ce_tc_12x_improved.csv').sort_values(['worst','mean'])
        if len(df): cand.append(('ce_tc_12', df.iloc[0].to_dict()))
    if not cand:
        return None, {'pool_k':13,'temp':0.95,'gamma':1.0,'sep':3}
    # pick by lowest worst, then mean
    cand.sort(key=lambda kv: (kv[1]['worst'], kv[1]['mean']))
    return cand[0][0], cand[0][1]

blend, cfg_best = choose_best_cfg()
print('Chosen blend:', blend, 'cfg:', cfg_best)

test_ids = pd.read_csv('test.csv')['Id'].astype(int).tolist()
med_k_train_all = compute_class_median_durations_for_ids(pd.read_csv('training.csv')['Id'].astype(int).tolist())
pool_k=int(cfg_best.get('pool_k',13)); temp=float(cfg_best.get('temp',0.95)); gamma=float(cfg_best.get('gamma',1.0)); sep=int(cfg_best.get('sep',3))

D_in = np.load(next(iter((feat_tr_dir).glob('*.npz'))))['X'].shape[1]

def infer_tc6():
    # lazy-load each fold's two TC models per sample to control VRAM
    scalers=[compute_fold_scaler(folds[fi]['train_ids']) for fi in range(3)]
    scalers=[(torch.from_numpy(m).float().to(device), torch.from_numpy(s).float().to(device)) for (m,s) in scalers]
    rows=[]; t0=time.time()
    for i, sid in enumerate(test_ids, 1):
        X = load_feat('test', int(sid)); T = X.shape[0]
        acc=None
        with torch.no_grad(), torch.amp.autocast('cuda' if device.type=='cuda' else 'cpu'):
            for fi in range(3):
                mean_t, std_t = scalers[fi]
                for s in (0,1):
                    ckpt = Path(f"model_tc_fold{fi}{'_s1' if s==1 else ''}.pth")
                    if not ckpt.exists():
                        continue
                    m = DilatedTCN(d_in=D_in, channels=128, layers=12, num_classes=21, dropout=0.35).to(device)
                    m.load_state_dict(torch.load(ckpt, map_location=device)); m.eval()
                    xb = torch.from_numpy(X).float().to(device);
                    xb = (xb - mean_t) / (std_t + 1e-6); xb = xb.unsqueeze(0)
                    p = m(xb)[0].softmax(dim=-1);
                    p = apply_tta_timewarp(p, factors=(0.9,1.0,1.1));
                    acc = p if acc is None else (acc + p)
                    del m
            probs = acc / float(6); probs = probs / (probs.sum(dim=-1, keepdim=True) + 1e-8)
        gamma_eff = gamma_with_length(gamma, T, med_k_train_all)
        seq = decode_peaks_improved(probs, med_k=med_k_train_all, gamma=gamma_eff, pool_k=pool_k, temp=temp, min_sep=sep, K=3, k_delta=4)
        rows.append({'Id': int(sid), 'Sequence': ' '.join(str(x) for x in seq)})
        if (i%10)==0 or i==len(test_ids):
            print(f"  [infer TC-6x] {i}/{len(test_ids)} elapsed={(time.time()-t0)/60:.1f}m", flush=True)
    return pd.DataFrame(rows, columns=['Id','Sequence'])

def infer_ce_tc_12():
    # lazy-load all 12 models (6 CE + 6 TC) per sample
    scalers=[compute_fold_scaler(folds[fi]['train_ids']) for fi in range(3)]
    scalers=[(torch.from_numpy(m).float().to(device), torch.from_numpy(s).float().to(device)) for (m,s) in scalers]
    rows=[]; t0=time.time()
    for i, sid in enumerate(test_ids, 1):
        X = load_feat('test', int(sid)); T = X.shape[0]
        acc=None
        with torch.no_grad(), torch.amp.autocast('cuda' if device.type=='cuda' else 'cpu'):
            for fi in range(3):
                mean_t, std_t = scalers[fi]
                # CE two seeds
                for s in (0,1):
                    ckpt = Path(f"model_ce_fold{fi}{'_s1' if s==1 else ''}.pth")
                    if ckpt.exists():
                        m = DilatedTCN(d_in=D_in, channels=128, layers=12, num_classes=21, dropout=0.35).to(device)
                        m.load_state_dict(torch.load(ckpt, map_location=device)); m.eval()
                        xb = torch.from_numpy(X).float().to(device);
                        xb = (xb - mean_t) / (std_t + 1e-6); xb = xb.unsqueeze(0)
                        p = m(xb)[0].softmax(dim=-1);
                        p = apply_tta_timewarp(p, factors=(0.9,1.0,1.1));
                        acc = p if acc is None else (acc + p)
                        del m
                # TC two seeds
                for s in (0,1):
                    ckpt = Path(f"model_tc_fold{fi}{'_s1' if s==1 else ''}.pth")
                    if ckpt.exists():
                        m = DilatedTCN(d_in=D_in, channels=128, layers=12, num_classes=21, dropout=0.35).to(device)
                        m.load_state_dict(torch.load(ckpt, map_location=device)); m.eval()
                        xb = torch.from_numpy(X).float().to(device);
                        xb = (xb - mean_t) / (std_t + 1e-6); xb = xb.unsqueeze(0)
                        p = m(xb)[0].softmax(dim=-1);
                        p = apply_tta_timewarp(p, factors=(0.9,1.0,1.1));
                        acc = p if acc is None else (acc + p)
                        del m
            denom = float(max(1, 12))
            probs = acc / denom; probs = probs / (probs.sum(dim=-1, keepdim=True) + 1e-8)
        gamma_eff = gamma_with_length(gamma, T, med_k_train_all)
        seq = decode_peaks_improved(probs, med_k=med_k_train_all, gamma=gamma_eff, pool_k=pool_k, temp=temp, min_sep=sep, K=3, k_delta=4)
        rows.append({'Id': int(sid), 'Sequence': ' '.join(str(x) for x in seq)})
        if (i%10)==0 or i==len(test_ids):
            print(f"  [infer CE+TC-12x] {i}/{len(test_ids)} elapsed={(time.time()-t0)/60:.1f}m", flush=True)
    return pd.DataFrame(rows, columns=['Id','Sequence'])

sub=None
if blend == 'tc6':
    sub = infer_tc6()
elif blend == 'ce_tc_12':
    sub = infer_ce_tc_12()
else:
    print('No blend selected or missing sweeps; defaulting to TC6 if available else abort')
    if Path('model_tc_fold0.pth').exists():
        sub = infer_tc6()

if sub is not None:
    assert len(sub)==95
    assert all(len(s.split())==20 and len(set(s.split()))==20 and all(1<=int(t)<=20 for t in s.split()) for s in sub.Sequence), 'Submission format invalid'
    outp = 'submission_primary_tc_6x.csv' if blend=='tc6' else 'submission_primary_ce_tc_12x.csv'
    sub.to_csv(outp, index=False); sub.to_csv('submission.csv', index=False)
    print(f'Wrote {outp} and submission.csv; head:\n', sub.head(), flush=True)
else:
    print('Submission not created (waiting for TC training to finish and OOF to be cached).')

CUDA: True


Caching OOF probs for TC models (both seeds)...


  model.load_state_dict(torch.load(ckpt, map_location=device)); model.eval()


  [fold 0 TC] cached 25/98 elapsed 0.2s


  [fold 0 TC] cached 50/98 elapsed 0.5s


  [fold 0 TC] cached 75/98 elapsed 0.7s


  [fold 0 TC] cached 98/98 elapsed 1.0s


  [fold 0 TC_s1] cached 25/98 elapsed 0.2s


  [fold 0 TC_s1] cached 50/98 elapsed 0.5s


  [fold 0 TC_s1] cached 75/98 elapsed 0.7s


  [fold 0 TC_s1] cached 98/98 elapsed 1.0s


  [fold 1 TC] cached 25/99 elapsed 0.2s


  [fold 1 TC] cached 50/99 elapsed 0.5s


  [fold 1 TC] cached 75/99 elapsed 0.7s


  [fold 1 TC] cached 99/99 elapsed 0.9s


  [fold 1 TC_s1] cached 25/99 elapsed 0.2s


  [fold 1 TC_s1] cached 50/99 elapsed 0.5s


  [fold 1 TC_s1] cached 75/99 elapsed 0.7s


  [fold 1 TC_s1] cached 99/99 elapsed 0.9s


  [fold 2 TC] cached 25/100 elapsed 0.2s


  [fold 2 TC] cached 50/100 elapsed 0.5s


  [fold 2 TC] cached 75/100 elapsed 0.7s


  [fold 2 TC] cached 100/100 elapsed 0.9s


  [fold 2 TC_s1] cached 25/100 elapsed 0.2s


  [fold 2 TC_s1] cached 50/100 elapsed 0.5s


  [fold 2 TC_s1] cached 75/100 elapsed 0.7s


  [fold 2 TC_s1] cached 100/100 elapsed 0.9s


Sweeping TC6 (avg OOF)...


Top TC6 (mean, worst, cfg):
(4.855605717034288, 5.63, {'pool_k': 11, 'temp': 0.9, 'gamma': 0.9, 'sep': 3})
(4.855605717034288, 5.63, {'pool_k': 11, 'temp': 0.9, 'gamma': 0.9, 'sep': 4})
(4.855605717034288, 5.63, {'pool_k': 11, 'temp': 0.9, 'gamma': 0.9, 'sep': 5})
(4.855605717034288, 5.63, {'pool_k': 11, 'temp': 0.9, 'gamma': 0.95, 'sep': 3})
(4.855605717034288, 5.63, {'pool_k': 11, 'temp': 0.9, 'gamma': 0.95, 'sep': 4})
Sweeping CE6+TC6 (avg OOF)...


Top CE6+TC6 (mean, worst, cfg):
(4.579729265443551, 5.3, {'pool_k': 11, 'temp': 0.9, 'gamma': 0.9, 'sep': 3})
(4.579729265443551, 5.3, {'pool_k': 11, 'temp': 0.9, 'gamma': 0.9, 'sep': 4})
(4.579729265443551, 5.3, {'pool_k': 11, 'temp': 0.9, 'gamma': 0.9, 'sep': 5})
(4.579729265443551, 5.3, {'pool_k': 11, 'temp': 0.9, 'gamma': 0.95, 'sep': 3})
(4.579729265443551, 5.3, {'pool_k': 11, 'temp': 0.9, 'gamma': 0.95, 'sep': 4})
Chosen blend: ce_tc_12 cfg: {'mean': 4.579729265443551, 'worst': 5.3, 'pool_k': 11.0, 'temp': 0.9, 'gamma': 0.9, 'sep': 3.0}


  m.load_state_dict(torch.load(ckpt, map_location=device)); m.eval()
  m.load_state_dict(torch.load(ckpt, map_location=device)); m.eval()


  [infer CE+TC-12x] 10/95 elapsed=0.0m


  [infer CE+TC-12x] 20/95 elapsed=0.1m


  [infer CE+TC-12x] 30/95 elapsed=0.1m


  [infer CE+TC-12x] 40/95 elapsed=0.2m


  [infer CE+TC-12x] 50/95 elapsed=0.2m


  [infer CE+TC-12x] 60/95 elapsed=0.3m


  [infer CE+TC-12x] 70/95 elapsed=0.3m


  [infer CE+TC-12x] 80/95 elapsed=0.4m


  [infer CE+TC-12x] 90/95 elapsed=0.4m


  [infer CE+TC-12x] 95/95 elapsed=0.4m


Wrote submission_primary_ce_tc_12x.csv and submission.csv; head:
     Id                                           Sequence
0  300  5 9 1 2 18 3 8 4 20 13 12 15 14 11 6 16 19 7 1...
1  301  10 12 1 5 4 20 6 2 11 15 13 19 7 9 8 18 14 3 1...
2  302  1 17 16 12 5 19 7 13 20 18 11 3 4 6 15 8 14 10...
3  303  13 4 12 1 10 5 19 15 20 17 11 16 8 18 7 3 6 2 ...
4  304  8 1 12 14 18 13 9 7 2 11 3 20 19 5 10 6 15 17 ...


In [11]:
# P1: Per-class temperature calibration and CE/TC reliability blending on OOF; normalized LD diagnostics
import os, json, time, math
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
probs_cache = Path('probs_cache'); probs_cache.mkdir(exist_ok=True)
lab_tr_dir  = Path('labels3d_v2')/'train'
folds = json.load(open('folds_archive_cv.json','r'))
train_df = pd.read_csv('training.csv')
id2seq = {int(r.Id): [int(x) for x in str(r.Sequence).strip().split()] for _, r in train_df.iterrows()}

# Utilities: load OOF probs
def load_oof_prob(sid:int, kind:str):
    # kind in {'ce0','ce1','tc0','tc1'} mapping to filename suffixes
    m = {'ce0': f"{sid}_ce_new.npy", 'ce1': f"{sid}_ce_new_s1.npy", 'tc0': f"{sid}_tc.npy", 'tc1': f"{sid}_tc_s1.npy"}
    p = probs_cache/m[kind]; assert p.exists(), f"Missing {p}"
    return torch.from_numpy(np.load(p)).to(device)

def avg_seed(oof_a: torch.Tensor, oof_b: torch.Tensor):
    p = (oof_a + oof_b) * 0.5
    return p / (p.sum(dim=-1, keepdim=True) + 1e-8)

# Temperature helpers
def apply_temp_prob(p_t_c: torch.Tensor, T: float):
    # global temperature on all classes
    if abs(T-1.0) < 1e-6: return p_t_c
    q = torch.pow(torch.clamp(p_t_c, 1e-8, 1.0), 1.0/float(T))
    return q / (q.sum(dim=-1, keepdim=True) + 1e-8)

def apply_one_class_temp(p_t_c: torch.Tensor, c: int, T: float):
    # adjust only class c by T, renormalize rows
    if abs(T-1.0) < 1e-6: return p_t_c
    q = p_t_c.clone()
    qc = torch.pow(torch.clamp(q[:, c], 1e-8, 1.0), 1.0/float(T))
    q[:, c] = qc
    return q / (q.sum(dim=-1, keepdim=True) + 1e-8)

def apply_per_class_temps(p_t_c: torch.Tensor, T_vec: np.ndarray):
    # elementwise power per class then renormalize rows
    T = torch.from_numpy(T_vec.astype(np.float32)).to(device)  # shape [C]
    exps = 1.0 / (T + 1e-8)
    q = torch.pow(torch.clamp(p_t_c, 1e-8, 1.0), exps.unsqueeze(0))
    return q / (q.sum(dim=-1, keepdim=True) + 1e-8)

# Geometric blend per class alpha: p ∝ (p_ce**alpha_c) * (p_tc**(1-alpha_c))
def blend_ce_tc_perclass(p_ce: torch.Tensor, p_tc: torch.Tensor, alpha_c: np.ndarray):
    al = torch.from_numpy(alpha_c.astype(np.float32)).to(device)
    log_ce = torch.log(torch.clamp(p_ce, 1e-8, 1.0))
    log_tc = torch.log(torch.clamp(p_tc, 1e-8, 1.0))
    comb = torch.exp(log_ce * al + log_tc * (1.0 - al))
    return comb / (comb.sum(dim=-1, keepdim=True) + 1e-8)

# Per-frame NLL for temperature and alpha fitting
def per_frame_nll(p_t_c: torch.Tensor, y_t: torch.Tensor):
    m = (y_t >= 0)
    if not torch.any(m):
        return 0.0
    idx = y_t[m].long()
    picked = p_t_c[m, idx]
    return float((-torch.log(torch.clamp(picked, 1e-8, 1.0))).mean().item())

def load_labels(sid:int):
    y = np.load(lab_tr_dir/f"{sid}.npy").astype(np.int64)
    return torch.from_numpy(y).to(device)

# Build fold-out datasets of per-frame probs and labels for calibration without leakage
def collect_oof_for_ids(id_list, stream: str):
    data = []  # list of (p_t_c, y_t)
    for sid in id_list:
        sid = int(sid)
        y = load_labels(sid)
        if stream == 'ce':
            p0 = load_oof_prob(sid, 'ce0'); p1 = load_oof_prob(sid, 'ce1') if (probs_cache/f"{sid}_ce_new_s1.npy").exists() else None
        else:
            p0 = load_oof_prob(sid, 'tc0'); p1 = load_oof_prob(sid, 'tc1') if (probs_cache/f"{sid}_tc_s1.npy").exists() else None
        p = avg_seed(p0, p1) if p1 is not None else p0
        data.append((p, y))
    return data

# Fit per-class temperature T_c using one-class adjustment and NLL on frames of that class
def fit_per_class_temperature(fold_idx: int):
    val_ids_all = []
    for f in folds:
        if int(f['fold']) != int(fold_idx):
            val_ids_all.extend(f['val_ids'])
    ce_data = collect_oof_for_ids(val_ids_all, 'ce')
    tc_data = collect_oof_for_ids(val_ids_all, 'tc')
    T_grid = np.round(np.arange(0.85, 1.1501, 0.02), 2)
    C = ce_data[0][0].shape[1]
    T_ce = np.ones(C, dtype=np.float32)
    T_tc = np.ones(C, dtype=np.float32)
    for c in range(C):
        if c == 0:
            T_ce[c] = 1.0; T_tc[c] = 1.0
            continue
        best_nll_ce, best_T_ce = 1e9, 1.0
        best_nll_tc, best_T_tc = 1e9, 1.0
        for T in T_grid:
            nll_ce_c = 0.0; cnt_ce = 0
            for p, y in ce_data:
                m = (y == c)
                if not torch.any(m):
                    continue
                q = apply_one_class_temp(p, c, float(T))
                nll_ce_c += per_frame_nll(q[m], y[m]) * int(m.sum().item()); cnt_ce += int(m.sum().item())
            if cnt_ce > 0 and (nll_ce_c / cnt_ce) < best_nll_ce:
                best_nll_ce = nll_ce_c / max(1, cnt_ce); best_T_ce = float(T)
            nll_tc_c = 0.0; cnt_tc = 0
            for p, y in tc_data:
                m = (y == c)
                if not torch.any(m):
                    continue
                q = apply_one_class_temp(p, c, float(T))
                nll_tc_c += per_frame_nll(q[m], y[m]) * int(m.sum().item()); cnt_tc += int(m.sum().item())
            if cnt_tc > 0 and (nll_tc_c / cnt_tc) < best_nll_tc:
                best_nll_tc = nll_tc_c / max(1, cnt_tc); best_T_tc = float(T)
        T_ce[c] = best_T_ce; T_tc[c] = best_T_tc
    return T_ce, T_tc

# Fit per-class alpha reliability on non-fold data using per-class temperature vectors
def fit_per_class_alpha(fold_idx: int, T_ce: np.ndarray, T_tc: np.ndarray):
    val_ids_all = []
    for f in folds:
        if int(f['fold']) != int(fold_idx):
            val_ids_all.extend(f['val_ids'])
    ce_data = collect_oof_for_ids(val_ids_all, 'ce')
    tc_data = collect_oof_for_ids(val_ids_all, 'tc')
    A_grid = [0.0, 0.25, 0.5, 0.75, 1.0]
    C = ce_data[0][0].shape[1]
    alpha = np.full(C, 0.8, dtype=np.float32)  # CE-heavy default
    # precompute temperature-calibrated probs per sample (vector temps)
    ce_calib = [apply_per_class_temps(p_ce, T_ce) for (p_ce, _) in ce_data]
    tc_calib = [apply_per_class_temps(p_tc, T_tc) for (p_tc, _) in tc_data]
    for c in range(C):
        if c == 0:
            alpha[c] = 0.8
            continue
        best_nll, best_a = 1e9, 0.8
        for a in A_grid:
            nll_c = 0.0; cnt = 0
            for i, ((_, y),) in enumerate(zip(ce_data)):
                q_ce = ce_calib[i]; q_tc = tc_calib[i]
                m = (y == c)
                if not torch.any(m):
                    continue
                a_vec = np.full(C, 0.8, dtype=np.float32); a_vec[c] = float(a)
                q = blend_ce_tc_perclass(q_ce, q_tc, a_vec)
                nll_c += per_frame_nll(q[m], y[m]) * int(m.sum().item()); cnt += int(m.sum().item())
            if cnt > 0 and (nll_c / cnt) < best_nll:
                best_nll = nll_c / max(1, cnt); best_a = float(a)
        alpha[c] = best_a
    return alpha

# Apply per-class temps (vector) and alpha to a given sid's OOF and return blended probs
def calibrated_blend_for_sid(sid:int, T_ce: np.ndarray, T_tc: np.ndarray, alpha: np.ndarray):
    p_ce0 = load_oof_prob(sid, 'ce0'); p_ce1 = load_oof_prob(sid, 'ce1') if (probs_cache/f"{sid}_ce_new_s1.npy").exists() else None
    p_tc0 = load_oof_prob(sid, 'tc0'); p_tc1 = load_oof_prob(sid, 'tc1') if (probs_cache/f"{sid}_tc_s1.npy").exists() else None
    p_ce = avg_seed(p_ce0, p_ce1) if p_ce1 is not None else p_ce0
    p_tc = avg_seed(p_tc0, p_tc1) if p_tc1 is not None else p_tc0
    q_ce = apply_per_class_temps(p_ce, T_ce)
    q_tc = apply_per_class_temps(p_tc, T_tc)
    q = blend_ce_tc_perclass(q_ce, q_tc, alpha)
    return q

# Decoder imports from Cell 10/13 context
def avg_pool_probs(p_t_c: torch.Tensor, k:int) -> torch.Tensor:
    x = p_t_c.unsqueeze(0).transpose(1,2); y = F.avg_pool1d(x, kernel_size=k, stride=1, padding=k//2);
    return y.transpose(1,2).squeeze(0)
def duration_integral_single(p_t: torch.Tensor, k:int) -> torch.Tensor:
    k = max(1, int(k)); x = p_t.view(1,1,-1); w = torch.ones(1,1,k, device=p_t.device, dtype=p_t.dtype) / float(k);
    pad = (k-1)//2; y = F.conv1d(x, w, padding=pad).view(-1); T = p_t.shape[0]
    if y.shape[0] < T: y = F.pad(y, (0, T - y.shape[0]));
    elif y.shape[0] > T: y = y[:T];
    return y
def refine_com(p: torch.Tensor, t_star:int, w:int=5) -> float:
    T = p.shape[0]; a = max(0, t_star - w); b = min(T-1, t_star + w); idx = torch.arange(a, b+1, device=p.device, dtype=p.dtype);
    seg = p[a:b+1]; s = seg.sum() + 1e-8; return float(((idx * seg).sum() / s).item())
def topk_candidates_per_class(p_s: torch.Tensor, scores: torch.Tensor, c: int, k_c: int, temp: float, K: int = 3):
    T = p_s.shape[0]; s = scores[:, c]; vals, idxs = torch.topk(s, k=min(K, T));
    cand = []; w_com = max(5, k_c//3); radius = max(10, k_c//2)
    for v, t_star in zip(vals.tolist(), idxs.tolist()):
        t_ref = refine_com(p_s[:,c], int(t_star), w=w_com);
        t_idx = int(round(max(0, min(t_ref, T-1))));
        local_mean = p_s[max(0,t_idx-radius):min(T,t_idx+radius+1), c].mean().item();
        pooled_at_ref = p_s[t_idx, c].item();
        cand.append((t_ref, float(v), float(local_mean), float(pooled_at_ref)))
    cand.sort(key=lambda x: (x[0], -x[1], -x[2], -x[3])); return cand
def decode_peaks_improved(p_t_c: torch.Tensor, med_k: dict, gamma: float = 1.0, pool_k=13, temp=0.9, min_sep=3, K=3, k_delta=4):
    if temp != 1.0:
        p_t_c = (torch.clamp(p_t_c, 1e-8, 1.0) ** (1.0/temp)); p_t_c = p_t_c / (p_t_c.sum(dim=-1, keepdim=True) + 1e-8)
    p_s = avg_pool_probs(p_t_c, k=pool_k); T, C = p_s.shape; scores = torch.zeros_like(p_s); ks=[13]*C
    for c in range(C):
        if c == 0: scores[:, c] = p_s[:, c]; ks[c]=13; continue
        base_k = med_k.get(c, 13); k_c = int(np.clip(round(gamma * base_k), 9, 25));
        if k_c % 2 == 0: k_c = min(25, k_c + 1); ks[c] = k_c
        ks_multi = sorted(set([int(np.clip(k_c - k_delta, 9, 25)), k_c, int(np.clip(k_c + k_delta, 9, 25))]));
        ks_multi = [k if (k % 2)==1 else min(25, k+1) for k in ks_multi]; acc=None
        for k in ks_multi:
            di = duration_integral_single(p_s[:, c], k=k).unsqueeze(1); acc = di if acc is None else (acc + di)
        scores[:, c] = (acc / float(len(ks_multi))).squeeze(1)
    all_cand=[]; last_t=-1e9; chosen={}
    for c in range(1,21):
        for (t_ref, v, lm, pr) in topk_candidates_per_class(p_s, scores, c, ks[c], temp=temp, K=3):
            all_cand.append((c, t_ref, v, lm, pr))
    all_cand.sort(key=lambda x: (x[1], -x[2], -x[3], -x[4]))
    for c, t_ref, v, lm, pr in all_cand:
        if c in chosen: continue
        if t_ref <= last_t + float(min_sep): t_ref = last_t + float(min_sep)
        last_t = min(t_ref, float(T-1)); chosen[c] = (last_t, v, lm, pr)
        if len(chosen)==20: break
    if len(chosen) < 20:
        missing = [c for c in range(1,21) if c not in chosen]; t = max(last_t, 0.0)
        for c in missing: t = min(t + float(min_sep), float(T-1)); chosen[c] = (t, -1e9, -1e9, -1e9)
    seq = [c for c,_ in sorted(chosen.items(), key=lambda kv: kv[1][0])]; return seq

def compute_class_median_durations_for_ids(id_list):
    dur_by_c = {c: [] for c in range(1,21)}
    for sid in id_list:
        y = np.load(lab_tr_dir/f"{sid}.npy").astype(np.int16)
        for c in range(1,21):
            cnt = int((y==c).sum());
            if cnt>0: dur_by_c[c].append(cnt)
    med = {};
    for c in range(1,21):
        m = np.median(dur_by_c[c]) if len(dur_by_c[c])>0 else 13
        med[c] = int(np.clip(m, 9, 25))
    return med

def gamma_with_length(gamma_cv: float, T: int, med_k: dict):
    L_est = float(sum(med_k.get(c,13) for c in range(1,21)))
    if L_est <= 0: return gamma_cv
    ratio = float(T) / L_est; gamma_s = float(np.clip(ratio, 0.85, 1.15)); return float(gamma_cv * gamma_s)

def levenshtein(a,b):
    n,m=len(a),len(b)
    if n==0: return m
    if m==0: return n
    dp=list(range(m+1))
    for i in range(1,n+1):
        prev=dp[0]; dp[0]=i; ai=a[i-1]
        for j in range(1,m+1):
            tmp=dp[j]; dp[j]=min(dp[j]+1, dp[j-1]+1, prev + (0 if ai==b[j-1] else 1)); prev=tmp
    return dp[m]

# Metric diagnostics: mean LD and normalized LD (total edits / total gt length) per setting
def eval_fold_sequences(fold, seqs_by_sid):
    vids = [int(s) for s in fold['val_ids']];
    tot_edits=0; tot_len=0; per_s=[]
    for sid in vids:
        gt = id2seq[sid]; pred = seqs_by_sid[sid];
        ld = levenshtein(pred, gt); tot_edits += ld; tot_len += len(gt); per_s.append(ld)
    mean_ld = float(np.mean(per_s)) if per_s else 0.0
    norm_ld = float(tot_edits / max(1, tot_len))
    return mean_ld, norm_ld

# Run fold-out calibration, then evaluate calibrated CE/TC blend on each fold using improved decoder; log normalized LD
pool_ks=[11,13,15]; temps=[0.90,0.95,1.00]; gammas=[0.90,0.95,0.975,1.00,1.025,1.05]; seps=[3]
results=[]
print('Starting per-class calibration and reliability blending (fold-out) ...', flush=True)
for f in folds:
    fi = int(f['fold'])
    print(f'  Calibrating using folds != {fi} ...', flush=True)
    T_ce, T_tc = fit_per_class_temperature(fi)
    alpha = fit_per_class_alpha(fi, T_ce, T_tc)
    Path(f'calib_fold{fi}.json').write_text(json.dumps({'T_ce': T_ce.tolist(), 'T_tc': T_tc.tolist(), 'alpha': alpha.tolist()}))

print('Sweeping decoder on calibrated CE+TC OOF ...', flush=True)
rows=[]
for pool_k in pool_ks:
    for temp in temps:
        for gamma in gammas:
            for sep in seps:
                worsts=[]; means=[]; norms=[]
                ok=True
                for f in folds:
                    fi = int(f['fold'])
                    calib = json.loads(Path(f'calib_fold{fi}.json').read_text())
                    T_ce = np.array(calib['T_ce'], dtype=np.float32); T_tc = np.array(calib['T_tc'], dtype=np.float32); alpha = np.array(calib['alpha'], dtype=np.float32)
                    med_k = compute_class_median_durations_for_ids(f['train_ids'])
                    seqs_by_sid = {}
                    for sid in f['val_ids']:
                        sid = int(sid)
                        try:
                            q = calibrated_blend_for_sid(sid, T_ce, T_tc, alpha)
                        except AssertionError:
                            ok=False; break
                        Tlen = q.shape[0]; gamma_eff = gamma_with_length(gamma, Tlen, med_k)
                        seq = decode_peaks_improved(q, med_k=med_k, gamma=gamma_eff, pool_k=pool_k, temp=temp, min_sep=sep, K=3, k_delta=4)
                        seqs_by_sid[sid] = seq
                    if not ok: break
                    mean_ld, norm_ld = eval_fold_sequences(f, seqs_by_sid)
                    means.append(mean_ld); norms.append(norm_ld)
                if ok and len(means)==len(folds):
                    rows.append({'pool_k':pool_k,'temp':temp,'gamma':gamma,'sep':sep,'mean':float(np.mean(means)), 'worst':float(np.max(means)), 'norm_mean':float(np.mean(norms)), 'norm_worst': float(np.max(norms))})
df = pd.DataFrame(rows)
if len(df):
    df.sort_values(['worst','mean'], inplace=True)
    df.to_csv('cv_sweep_ce_tc_calibrated.csv', index=False)
    print('Top calibrated (by worst then mean):')
    print(df.head())
else:
    print('Calibration sweep produced no results (missing OOF?)')

print('Done calibration + sweep. Next: implement test-time inference applying per-class T_c and alpha refit on all OOF.', flush=True)

Starting per-class calibration and reliability blending (fold-out) ...


  Calibrating using folds != 0 ...


  Calibrating using folds != 1 ...


  Calibrating using folds != 2 ...


Sweeping decoder on calibrated CE+TC OOF ...


Top calibrated (by worst then mean):
    pool_k  temp  gamma  sep      mean  worst  norm_mean  norm_worst
48      15   1.0  0.900    3  4.667622    5.4   0.235703    0.275792
49      15   1.0  0.950    3  4.667622    5.4   0.235703    0.275792
50      15   1.0  0.975    3  4.667622    5.4   0.235703    0.275792
51      15   1.0  1.000    3  4.667622    5.4   0.235703    0.275792
52      15   1.0  1.025    3  4.667622    5.4   0.235703    0.275792
Done calibration + sweep. Next: implement test-time inference applying per-class T_c and alpha refit on all OOF.


In [12]:
# P1-test: Refit per-class calibration on all OOF, apply at test-time for CE+TC-12x, build calibrated submission
import os, json, time, math, gc
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
probs_cache = Path('probs_cache'); probs_cache.mkdir(exist_ok=True)
feat_tr_dir = Path('features3d_v2')/'train'
feat_te_dir = Path('features3d_v2')/'test'
lab_tr_dir  = Path('labels3d_v2')/'train'
folds = json.load(open('folds_archive_cv.json','r'))
train_df = pd.read_csv('training.csv')
test_ids = pd.read_csv('test.csv')['Id'].astype(int).tolist()

# Utilities from calibration cell
def load_oof_prob(sid:int, kind:str):
    m = {'ce0': f"{sid}_ce_new.npy", 'ce1': f"{sid}_ce_new_s1.npy", 'tc0': f"{sid}_tc.npy", 'tc1': f"{sid}_tc_s1.npy"}
    p = probs_cache/m[kind]; assert p.exists(), f"Missing {p}"
    return torch.from_numpy(np.load(p)).to(device)
def avg_seed(oof_a: torch.Tensor, oof_b: torch.Tensor):
    p = (oof_a + oof_b) * 0.5
    return p / (p.sum(dim=-1, keepdim=True) + 1e-8)
def apply_per_class_temps(p_t_c: torch.Tensor, T_vec: np.ndarray):
    T = torch.from_numpy(T_vec.astype(np.float32)).to(device)
    exps = 1.0 / (T + 1e-8)
    q = torch.pow(torch.clamp(p_t_c, 1e-8, 1.0), exps.unsqueeze(0))
    return q / (q.sum(dim=-1, keepdim=True) + 1e-8)
def blend_ce_tc_perclass(p_ce: torch.Tensor, p_tc: torch.Tensor, alpha_c: np.ndarray):
    al = torch.from_numpy(alpha_c.astype(np.float32)).to(device)
    log_ce = torch.log(torch.clamp(p_ce, 1e-8, 1.0))
    log_tc = torch.log(torch.clamp(p_tc, 1e-8, 1.0))
    comb = torch.exp(log_ce * al + log_tc * (1.0 - al))
    return comb / (comb.sum(dim=-1, keepdim=True) + 1e-8)
def per_frame_nll(p_t_c: torch.Tensor, y_t: torch.Tensor):
    m = (y_t >= 0)
    if not torch.any(m):
        return 0.0
    idx = y_t[m].long()
    picked = p_t_c[m, idx]
    return float((-torch.log(torch.clamp(picked, 1e-8, 1.0))).mean().item())
def load_labels(sid:int):
    y = np.load(lab_tr_dir/f"{sid}.npy").astype(np.int64)
    return torch.from_numpy(y).to(device)
def collect_oof_for_ids(id_list, stream: str):
    data = []
    for sid in id_list:
        sid = int(sid)
        y = load_labels(sid)
        if stream == 'ce':
            p0 = load_oof_prob(sid, 'ce0'); p1 = load_oof_prob(sid, 'ce1') if (probs_cache/f"{sid}_ce_new_s1.npy").exists() else None
        else:
            p0 = load_oof_prob(sid, 'tc0'); p1 = load_oof_prob(sid, 'tc1') if (probs_cache/f"{sid}_tc_s1.npy").exists() else None
        p = avg_seed(p0, p1) if p1 is not None else p0
        data.append((p, y))
    return data

# Refit per-class T_ce, T_tc on ALL OOF (train) and per-class alpha on ALL OOF
def refit_per_class_temperature_all():
    all_ids = train_df['Id'].astype(int).tolist()
    ce_data = collect_oof_for_ids(all_ids, 'ce')
    tc_data = collect_oof_for_ids(all_ids, 'tc')
    T_grid = np.round(np.arange(0.85, 1.1501, 0.02), 2)
    C = ce_data[0][0].shape[1]
    T_ce = np.ones(C, dtype=np.float32)
    T_tc = np.ones(C, dtype=np.float32)
    for c in range(C):
        if c == 0: continue
        best_nll_ce, best_T_ce = 1e9, 1.0
        best_nll_tc, best_T_tc = 1e9, 1.0
        for T in T_grid:
            nll_ce_c = 0.0; cnt_ce = 0
            for p, y in ce_data:
                m = (y == c)
                if not torch.any(m): continue
                # one-class adjust and renormalize
                q = p.clone(); qc = torch.pow(torch.clamp(q[:, c], 1e-8, 1.0), 1.0/float(T)); q[:, c] = qc; q = q/(q.sum(dim=-1, keepdim=True)+1e-8)
                nll_ce_c += per_frame_nll(q[m], y[m]) * int(m.sum().item()); cnt_ce += int(m.sum().item())
            if cnt_ce > 0 and (nll_ce_c / cnt_ce) < best_nll_ce:
                best_nll_ce = nll_ce_c / max(1, cnt_ce); best_T_ce = float(T)
            nll_tc_c = 0.0; cnt_tc = 0
            for p, y in tc_data:
                m = (y == c)
                if not torch.any(m): continue
                q = p.clone(); qc = torch.pow(torch.clamp(q[:, c], 1e-8, 1.0), 1.0/float(T)); q[:, c] = qc; q = q/(q.sum(dim=-1, keepdim=True)+1e-8)
                nll_tc_c += per_frame_nll(q[m], y[m]) * int(m.sum().item()); cnt_tc += int(m.sum().item())
            if cnt_tc > 0 and (nll_tc_c / cnt_tc) < best_nll_tc:
                best_nll_tc = nll_tc_c / max(1, cnt_tc); best_T_tc = float(T)
        T_ce[c] = best_T_ce; T_tc[c] = best_T_tc
    return T_ce, T_tc

def refit_per_class_alpha_all(T_ce: np.ndarray, T_tc: np.ndarray):
    all_ids = train_df['Id'].astype(int).tolist()
    ce_data = collect_oof_for_ids(all_ids, 'ce')
    tc_data = collect_oof_for_ids(all_ids, 'tc')
    C = ce_data[0][0].shape[1]
    A_grid = [0.0, 0.25, 0.5, 0.75, 1.0]
    alpha = np.full(C, 0.8, dtype=np.float32)
    # precompute per-class temp calibration
    ce_calib = [apply_per_class_temps(p_ce, T_ce) for (p_ce, _) in ce_data]
    tc_calib = [apply_per_class_temps(p_tc, T_tc) for (p_tc, _) in tc_data]
    for c in range(C):
        if c == 0: alpha[c] = 0.8; continue
        best_nll, best_a = 1e9, 0.8
        for a in A_grid:
            nll_c = 0.0; cnt = 0
            for i, ((_, y),) in enumerate(zip(ce_data)):
                q_ce = ce_calib[i]; q_tc = tc_calib[i]
                m = (y == c)
                if not torch.any(m): continue
                a_vec = np.full(C, 0.8, dtype=np.float32); a_vec[c] = float(a)
                q = blend_ce_tc_perclass(q_ce, q_tc, a_vec)
                nll_c += per_frame_nll(q[m], y[m]) * int(m.sum().item()); cnt += int(m.sum().item())
            if cnt > 0 and (nll_c / cnt) < best_nll:
                best_nll = nll_c / max(1, cnt); best_a = float(a)
        alpha[c] = best_a
    return alpha

# Decoder helpers (reuse from earlier cells)
def avg_pool_probs(p_t_c: torch.Tensor, k:int) -> torch.Tensor:
    x = p_t_c.unsqueeze(0).transpose(1,2); y = F.avg_pool1d(x, kernel_size=k, stride=1, padding=k//2);
    return y.transpose(1,2).squeeze(0)
def duration_integral_single(p_t: torch.Tensor, k:int) -> torch.Tensor:
    k = max(1, int(k)); x = p_t.view(1,1,-1); w = torch.ones(1,1,k, device=p_t.device, dtype=p_t.dtype) / float(k);
    pad = (k-1)//2; y = F.conv1d(x, w, padding=pad).view(-1); T = p_t.shape[0]
    if y.shape[0] < T: y = F.pad(y, (0, T - y.shape[0]));
    elif y.shape[0] > T: y = y[:T];
    return y
def refine_com(p: torch.Tensor, t_star:int, w:int=5) -> float:
    T = p.shape[0]; a = max(0, t_star - w); b = min(T-1, t_star + w); idx = torch.arange(a, b+1, device=p.device, dtype=p.dtype);
    seg = p[a:b+1]; s = seg.sum() + 1e-8; return float(((idx * seg).sum() / s).item())
def topk_candidates_per_class(p_s: torch.Tensor, scores: torch.Tensor, c: int, k_c: int, temp: float, K: int = 3):
    T = p_s.shape[0]; s = scores[:, c]; vals, idxs = torch.topk(s, k=min(K, T));
    cand = []; w_com = max(5, k_c//3); radius = max(10, k_c//2)
    for v, t_star in zip(vals.tolist(), idxs.tolist()):
        t_ref = refine_com(p_s[:,c], int(t_star), w=w_com);
        t_idx = int(round(max(0, min(t_ref, T-1))));
        local_mean = p_s[max(0,t_idx-radius):min(T,t_idx+radius+1), c].mean().item();
        pooled_at_ref = p_s[t_idx, c].item();
        cand.append((t_ref, float(v), float(local_mean), float(pooled_at_ref)))
    cand.sort(key=lambda x: (x[0], -x[1], -x[2], -x[3])); return cand
def decode_peaks_improved(p_t_c: torch.Tensor, med_k: dict, gamma: float = 1.0, pool_k=13, temp=0.9, min_sep=3, K=3, k_delta=4):
    if temp != 1.0:
        p_t_c = (torch.clamp(p_t_c, 1e-8, 1.0) ** (1.0/temp)); p_t_c = p_t_c / (p_t_c.sum(dim=-1, keepdim=True) + 1e-8)
    p_s = avg_pool_probs(p_t_c, k=pool_k); T, C = p_s.shape; scores = torch.zeros_like(p_s); ks=[13]*C
    for c in range(C):
        if c == 0: scores[:, c] = p_s[:, c]; ks[c]=13; continue
        base_k = med_k.get(c, 13); k_c = int(np.clip(round(gamma * base_k), 9, 25));
        if k_c % 2 == 0: k_c = min(25, k_c + 1); ks[c] = k_c
        ks_multi = sorted(set([int(np.clip(k_c - k_delta, 9, 25)), k_c, int(np.clip(k_c + k_delta, 9, 25))]));
        ks_multi = [k if (k % 2)==1 else min(25, k+1) for k in ks_multi]; acc=None
        for k in ks_multi:
            di = duration_integral_single(p_s[:, c], k=k).unsqueeze(1); acc = di if acc is None else (acc + di)
        scores[:, c] = (acc / float(len(ks_multi))).squeeze(1)
    all_cand=[]; last_t=-1e9; chosen={}
    for c in range(1,21):
        for (t_ref, v, lm, pr) in topk_candidates_per_class(p_s, scores, c, ks[c], temp=temp, K=3):
            all_cand.append((c, t_ref, v, lm, pr))
    all_cand.sort(key=lambda x: (x[1], -x[2], -x[3], -x[4]))
    for c, t_ref, v, lm, pr in all_cand:
        if c in chosen: continue
        if t_ref <= last_t + float(min_sep): t_ref = last_t + float(min_sep)
        last_t = min(t_ref, float(T-1)); chosen[c] = (last_t, v, lm, pr)
        if len(chosen)==20: break
    if len(chosen) < 20:
        missing = [c for c in range(1,21) if c not in chosen]; t = max(last_t, 0.0)
        for c in missing: t = min(t + float(min_sep), float(T-1)); chosen[c] = (t, -1e9, -1e9, -1e9)
    seq = [c for c,_ in sorted(chosen.items(), key=lambda kv: kv[1][0])]; return seq
def compute_class_median_durations_for_ids(id_list):
    dur_by_c = {c: [] for c in range(1,21)}
    for sid in id_list:
        y = np.load(lab_tr_dir/f"{sid}.npy").astype(np.int16)
        for c in range(1,21):
            cnt = int((y==c).sum());
            if cnt>0: dur_by_c[c].append(cnt)
    med = {};
    for c in range(1,21):
        m = np.median(dur_by_c[c]) if len(dur_by_c[c])>0 else 13
        med[c] = int(np.clip(m, 9, 25))
    return med
def gamma_with_length(gamma_cv: float, T: int, med_k: dict):
    L_est = float(sum(med_k.get(c,13) for c in range(1,21)))
    if L_est <= 0: return gamma_cv
    ratio = float(T) / L_est; gamma_s = float(np.clip(ratio, 0.85, 1.15)); return float(gamma_cv * gamma_s)

# Model for inference (TCN architecture used for CE/TC checkpoints)
D_in = np.load(next(iter((feat_tr_dir).glob('*.npz'))))['X'].shape[1]
class DilatedResBlock(nn.Module):
    def __init__(self, ch, dilation, drop=0.35, groups=8, k=3):
        super().__init__()
        self.conv1 = nn.Conv1d(ch, ch, k, padding=dilation, dilation=dilation)
        self.gn1 = nn.GroupNorm(groups, ch); self.drop = nn.Dropout(drop)
        self.conv2 = nn.Conv1d(ch, ch, 1); self.gn2 = nn.GroupNorm(groups, ch)
    def forward(self, x):
        h = self.conv1(x); h = self.gn1(h); h = F.relu(h, inplace=True); h = self.drop(h);
        h = self.conv2(h); h = self.gn2(h); h = F.relu(h, inplace=True);
        return x + h
class DilatedTCN(nn.Module):
    def __init__(self, d_in, channels=128, layers=12, num_classes=21, dropout=0.35):
        super().__init__()
        self.inp = nn.Conv1d(d_in, channels, 1); blocks=[]; dil=1
        for _ in range(layers):
            blocks.append(DilatedResBlock(channels, dil, drop=dropout, groups=8, k=3));
            dil = min(dil*2, 512)
        self.blocks = nn.ModuleList(blocks); self.head = nn.Conv1d(channels, num_classes, 1)
    def forward(self, x_b_t_d):
        x = x_b_t_d.transpose(1,2); h = self.inp(x);
        for b in self.blocks: h = b(h);
        out = self.head(h); return out.transpose(1,2)

def load_feat(split, sid:int):
    d = np.load((feat_tr_dir if split=='train' else feat_te_dir)/f"{sid}.npz");
    return d['X'].astype(np.float32)
def compute_fold_scaler(id_list):
    n = 0; mean=None; M2=None
    for sid in id_list:
        X = load_feat('train', int(sid)); n_i = X.shape[0]
        if mean is None:
            mean = X.mean(axis=0); M2 = ((X - mean)**2).sum(axis=0); n = n_i
        else:
            mean_i = X.mean(axis=0); n_new = n + n_i; delta = mean_i - mean
            mean = mean + delta * (n_i / max(1, n_new))
            M2 = M2 + ((X - mean_i)**2).sum(axis=0) + (delta**2) * (n * n_i / max(1, n_new)); n = n_new
    var = M2 / max(1, (n - 1)); std = np.sqrt(np.clip(var, 1e-8, None))
    return mean.astype(np.float32), std.astype(np.float32)
def apply_tta_timewarp(p_t_c: torch.Tensor, factors=(0.9,1.0,1.1)) -> torch.Tensor:
    acc=None
    for s in factors:
        T, C = p_t_c.shape; tgt_len = max(1, int(round(T*s)))
        x = p_t_c.T.unsqueeze(0);
        y = F.interpolate(x, size=tgt_len, mode='linear', align_corners=False);
        y2 = F.interpolate(y, size=T, mode='linear', align_corners=False)[0].T;
        y2 = y2 / (y2.sum(dim=-1, keepdim=True) + 1e-8)
        acc = y2 if acc is None else (acc + y2)
    out = acc / float(len(factors))
    return out / (out.sum(dim=-1, keepdim=True) + 1e-8)

# 1) Refit calibration on all OOF and save
print('Refitting per-class calibration (T_ce, T_tc, alpha) on ALL OOF ...', flush=True)
T_ce_all, T_tc_all = refit_per_class_temperature_all()
alpha_all = refit_per_class_alpha_all(T_ce_all, T_tc_all)
Path('calib_all.json').write_text(json.dumps({'T_ce': T_ce_all.tolist(), 'T_tc': T_tc_all.tolist(), 'alpha': alpha_all.tolist()}))
print('Saved calib_all.json', flush=True)

# 2) Read calibrated OOF sweep to pick decoder cfg (by worst then mean)
cfg_df = pd.read_csv('cv_sweep_ce_tc_calibrated.csv').sort_values(['worst','mean']) if Path('cv_sweep_ce_tc_calibrated.csv').exists() else None
if cfg_df is None or len(cfg_df)==0:
    pool_k, temp, gamma, sep = 15, 1.0, 0.95, 3
else:
    best = cfg_df.iloc[0].to_dict()
    pool_k = int(best.get('pool_k', 15)); temp = float(best.get('temp', 1.0)); gamma = float(best.get('gamma', 0.95)); sep = int(best.get('sep', 3))
print('Using calibrated decoder cfg:', {'pool_k':pool_k,'temp':temp,'gamma':gamma,'sep':sep}, flush=True)

# 3) Test-time inference: CE+TC 12-model average into two streams, apply per-class temps and per-class alpha blend, then decode
med_k_train_all = compute_class_median_durations_for_ids(train_df['Id'].astype(int).tolist())
D_in = np.load(next(iter((feat_tr_dir).glob('*.npz'))))['X'].shape[1]

def infer_calibrated_ce_tc_12():
    scalers=[compute_fold_scaler(folds[fi]['train_ids']) for fi in range(3)]
    scalers=[(torch.from_numpy(m).float().to(device), torch.from_numpy(s).float().to(device)) for (m,s) in scalers]
    rows=[]; t0=time.time()
    for i, sid in enumerate(test_ids, 1):
        X = load_feat('test', int(sid)); Tlen = X.shape[0]
        acc_ce=None; acc_tc=None; cnt_ce=0; cnt_tc=0
        with torch.no_grad(), torch.amp.autocast('cuda' if device.type=='cuda' else 'cpu'):
            for fi in range(3):
                mean_t, std_t = scalers[fi]
                for s in (0,1):
                    # CE stream
                    ckpt = Path(f"model_ce_fold{fi}{'_s1' if s==1 else ''}.pth")
                    if ckpt.exists():
                        m = DilatedTCN(d_in=D_in, channels=128, layers=12, num_classes=21, dropout=0.35).to(device)
                        m.load_state_dict(torch.load(ckpt, map_location=device)); m.eval()
                        xb = torch.from_numpy(X).float().to(device); xb = (xb - mean_t) / (std_t + 1e-6); xb = xb.unsqueeze(0)
                        p = m(xb)[0].softmax(dim=-1); p = apply_tta_timewarp(p, factors=(0.9,1.0,1.1))
                        acc_ce = p if acc_ce is None else (acc_ce + p); cnt_ce += 1; del m
                for s in (0,1):
                    # TC stream
                    ckpt = Path(f"model_tc_fold{fi}{'_s1' if s==1 else ''}.pth")
                    if ckpt.exists():
                        m = DilatedTCN(d_in=D_in, channels=128, layers=12, num_classes=21, dropout=0.35).to(device)
                        m.load_state_dict(torch.load(ckpt, map_location=device)); m.eval()
                        xb = torch.from_numpy(X).float().to(device); xb = (xb - mean_t) / (std_t + 1e-6); xb = xb.unsqueeze(0)
                        p = m(xb)[0].softmax(dim=-1); p = apply_tta_timewarp(p, factors=(0.9,1.0,1.1))
                        acc_tc = p if acc_tc is None else (acc_tc + p); cnt_tc += 1; del m
        ce_prob = (acc_ce / float(max(1, cnt_ce))) if acc_ce is not None else None
        tc_prob = (acc_tc / float(max(1, cnt_tc))) if acc_tc is not None else None
        # Apply per-class temps then per-class alpha blend
        if ce_prob is None and tc_prob is None:
            raise RuntimeError('No CE/TC models found for inference')
        if ce_prob is None:
            probs = apply_per_class_temps(tc_prob, T_tc_all)
        elif tc_prob is None:
            probs = apply_per_class_temps(ce_prob, T_ce_all)
        else:
            q_ce = apply_per_class_temps(ce_prob, T_ce_all)
            q_tc = apply_per_class_temps(tc_prob, T_tc_all)
            probs = blend_ce_tc_perclass(q_ce, q_tc, alpha_all)
        probs = probs / (probs.sum(dim=-1, keepdim=True) + 1e-8)
        gamma_eff = gamma_with_length(gamma, Tlen, med_k_train_all)
        seq = decode_peaks_improved(probs, med_k=med_k_train_all, gamma=gamma_eff, pool_k=pool_k, temp=temp, min_sep=sep, K=3, k_delta=4)
        rows.append({'Id': int(sid), 'Sequence': ' '.join(str(x) for x in seq)})
        if (i%10)==0 or i==len(test_ids):
            print(f"  [infer CE+TC-12x calibrated] {i}/{len(test_ids)} elapsed={(time.time()-t0)/60:.1f}m", flush=True)
    return pd.DataFrame(rows, columns=['Id','Sequence'])

print('Building calibrated CE+TC-12x submission...', flush=True)
sub = infer_calibrated_ce_tc_12()
assert len(sub)==95
assert all(len(s.split())==20 and len(set(s.split()))==20 and all(1<=int(t)<=20 for t in s.split()) for s in sub.Sequence), 'Submission format invalid'
sub.to_csv('submission_primary_ce_tc_12x_calibrated.csv', index=False)
sub.to_csv('submission.csv', index=False)
print('Wrote submission_primary_ce_tc_12x_calibrated.csv and submission.csv; head:\n', sub.head(), flush=True)

Refitting per-class calibration (T_ce, T_tc, alpha) on ALL OOF ...


Saved calib_all.json


Using calibrated decoder cfg: {'pool_k': 15, 'temp': 1.0, 'gamma': 0.9, 'sep': 3}


Building calibrated CE+TC-12x submission...


  m.load_state_dict(torch.load(ckpt, map_location=device)); m.eval()
  m.load_state_dict(torch.load(ckpt, map_location=device)); m.eval()


  [infer CE+TC-12x calibrated] 10/95 elapsed=0.0m


  [infer CE+TC-12x calibrated] 20/95 elapsed=0.1m


  [infer CE+TC-12x calibrated] 30/95 elapsed=0.1m


  [infer CE+TC-12x calibrated] 40/95 elapsed=0.2m


  [infer CE+TC-12x calibrated] 50/95 elapsed=0.2m


  [infer CE+TC-12x calibrated] 60/95 elapsed=0.3m


  [infer CE+TC-12x calibrated] 70/95 elapsed=0.3m


  [infer CE+TC-12x calibrated] 80/95 elapsed=0.4m


  [infer CE+TC-12x calibrated] 90/95 elapsed=0.4m


  [infer CE+TC-12x calibrated] 95/95 elapsed=0.4m


Wrote submission_primary_ce_tc_12x_calibrated.csv and submission.csv; head:
     Id                                           Sequence
0  300  5 9 1 2 18 3 8 4 20 13 12 15 14 11 6 16 19 7 1...
1  301  10 12 1 5 4 20 6 2 11 15 13 19 7 9 8 18 14 3 1...
2  302  1 17 16 12 5 19 7 13 20 18 11 3 4 6 15 8 14 10...
3  303  13 4 12 10 5 19 15 20 17 11 16 8 18 7 3 1 6 2 ...
4  304  8 1 12 14 18 13 9 7 2 11 3 20 19 5 10 6 15 17 ...


In [14]:
# P2: Duration-aware DP decoder over top-K candidates (on CE6 OOF), sweep (reduced grid with progress logs) and build CE6-DP submission
import os, json, time, math
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
probs_cache = Path('probs_cache')
feat_tr_dir = Path('features3d_v2')/'train'
feat_te_dir = Path('features3d_v2')/'test'
lab_tr_dir  = Path('labels3d_v2')/'train'
folds = json.load(open('folds_archive_cv.json','r'))
train_df = pd.read_csv('training.csv')
id2seq = {int(r.Id): [int(x) for x in str(r.Sequence).strip().split()] for _, r in train_df.iterrows()}

# Helpers reused
def load_feat(split, sid:int):
    d = np.load((feat_tr_dir if split=='train' else feat_te_dir)/f"{sid}.npz");
    return d['X'].astype(np.float32)

def avg_pool_probs(p_t_c: torch.Tensor, k:int) -> torch.Tensor:
    x = p_t_c.unsqueeze(0).transpose(1,2); y = F.avg_pool1d(x, kernel_size=k, stride=1, padding=k//2);
    return y.transpose(1,2).squeeze(0)

def duration_integral_single(p_t: torch.Tensor, k:int) -> torch.Tensor:
    k = max(1, int(k)); x = p_t.view(1,1,-1); w = torch.ones(1,1,k, device=p_t.device, dtype=p_t.dtype) / float(k);
    pad = (k-1)//2; y = F.conv1d(x, w, padding=pad).view(-1); T = p_t.shape[0]
    if y.shape[0] < T: y = F.pad(y, (0, T - y.shape[0]));
    elif y.shape[0] > T: y = y[:T];
    return y

def refine_com(p: torch.Tensor, t_star:int, w:int=5) -> float:
    T = p.shape[0]; a = max(0, t_star - w); b = min(T-1, t_star + w); idx = torch.arange(a, b+1, device=p.device, dtype=p.dtype);
    seg = p[a:b+1]; s = seg.sum() + 1e-8; return float(((idx * seg).sum() / s).item())

def compute_class_median_durations_for_ids(id_list):
    dur_by_c = {c: [] for c in range(1,21)}
    for sid in id_list:
        y = np.load(lab_tr_dir/f"{sid}.npy").astype(np.int16)
        for c in range(1,21):
            cnt = int((y==c).sum());
            if cnt>0: dur_by_c[c].append(cnt)
    med = {};
    for c in range(1,21):
        m = np.median(dur_by_c[c]) if len(dur_by_c[c])>0 else 13
        med[c] = int(np.clip(m, 9, 25))
    return med

def gamma_with_length(gamma_cv: float, T: int, med_k: dict):
    L_est = float(sum(med_k.get(c,13) for c in range(1,21)))
    if L_est <= 0: return gamma_cv
    ratio = float(T) / L_est; gamma_s = float(np.clip(ratio, 0.85, 1.15)); return float(gamma_cv * gamma_s)

def levenshtein(a,b):
    n,m=len(a),len(b)
    if n==0: return m
    if m==0: return n
    dp=list(range(m+1))
    for i in range(1,n+1):
        prev=dp[0]; dp[0]=i; ai=a[i-1]
        for j in range(1,m+1):
            tmp=dp[j]; dp[j]=min(dp[j]+1, dp[j-1]+1, prev + (0 if ai==b[j-1] else 1)); prev=tmp
    return dp[m]

# Load averaged CE OOF probs (seed0+seed1)
def load_oof_ce_avg(sid:int):
    p0 = torch.from_numpy(np.load(probs_cache/f"{sid}_ce_new.npy")).to(device)
    p1 = torch.from_numpy(np.load(probs_cache/f"{sid}_ce_new_s1.npy")).to(device)
    p = (p0 + p1) * 0.5
    return p / (p.sum(dim=-1, keepdim=True) + 1e-8)

# Build candidates per class using multi-scale duration-integral scores and COM refine
def build_candidates(p_t_c: torch.Tensor, med_k: dict, pool_k:int, gamma: float, temp: float, K_top:int, k_delta:int=4):
    if temp != 1.0:
        p_t_c = (torch.clamp(p_t_c, 1e-8, 1.0) ** (1.0/temp)); p_t_c = p_t_c/(p_t_c.sum(dim=-1, keepdim=True)+1e-8)
    p_s = avg_pool_probs(p_t_c, k=pool_k); T, C = p_s.shape
    scores = torch.zeros_like(p_s); k_eff=[13]*C
    for c in range(C):
        if c == 0:
            scores[:, c] = p_s[:, c]; k_eff[c]=13; continue
        base_k = med_k.get(c,13); k_c = int(np.clip(round(gamma * base_k), 9, 25));
        if k_c % 2 == 0: k_c = min(25, k_c+1); k_eff[c]=k_c
        ks_multi = sorted(set([int(np.clip(k_c - k_delta, 9, 25)), k_c, int(np.clip(k_c + k_delta, 9, 25))]));
        ks_multi = [k if (k % 2)==1 else min(25, k+1) for k in ks_multi]
        acc=None
        for k in ks_multi:
            di = duration_integral_single(p_s[:, c], k=k).unsqueeze(1); acc = di if acc is None else (acc + di)
        scores[:, c] = (acc / float(len(ks_multi))).squeeze(1)
    cand_by_c = {}
    for c in range(1,21):
        s = scores[:, c]; T = s.shape[0]
        k = min(K_top, T)
        vals, idxs = torch.topk(s, k=k)
        w_com = max(5, k_eff[c]//3); radius = max(10, k_eff[c]//2)
        cand=[]
        for v, t_star in zip(vals.tolist(), idxs.tolist()):
            t_ref = refine_com(p_s[:,c], int(t_star), w=w_com);
            t_idx = int(round(max(0, min(t_ref, T-1))));
            local_mean = p_s[max(0,t_idx-radius):min(T,t_idx+radius+1), c].mean().item();
            pooled_at_ref = p_s[t_idx, c].item();
            cand.append((t_ref, float(v), float(local_mean), float(pooled_at_ref)))
        cand.sort(key=lambda x: (-x[1], -x[2], -x[3], x[0]))
        cand_by_c[c] = cand
    return cand_by_c

# Duration-aware DP (beam search) over ordered classes 1..20
def dp_decode_from_candidates(cand_by_c: dict, med_k: dict, gamma: float, min_sep: float, lambda_dur: float, T_len: int, beam_width: int = 80):
    beams = [ (0.0, -1e9, []) ]
    for i, c in enumerate(range(1,21), start=1):
        new_beams = []
        exp_gap = float(np.clip(round(gamma * med_k.get(c,13)), 3, 30))
        cand_list = cand_by_c.get(c, [])
        if len(cand_list)==0:
            for sc, last_t, path in beams:
                t = last_t + max(min_sep, exp_gap)
                t = min(t, float(T_len-1))
                new_beams.append((sc - 1e6, t, path + [c]))
        else:
            for (sc, last_t, path) in beams:
                for (t_ref, score_v, local_m, pooled) in cand_list:
                    t_use = t_ref
                    if t_use <= last_t + float(min_sep):
                        t_use = last_t + float(min_sep)
                    t_use = min(t_use, float(T_len-1))
                    gap = t_use - last_t if last_t > -1e8 else exp_gap
                    pen = lambda_dur * abs(gap - exp_gap) / max(1.0, exp_gap)
                    new_score = sc + (score_v - pen)
                    new_beams.append((new_score, t_use, path + [c]))
        new_beams.sort(key=lambda x: (-x[0], x[1]))
        beams = new_beams[:beam_width]
    best = max(beams, key=lambda x: (x[0], -x[1]))
    return best[2] if len(best)>=3 else [c for c in range(1,21)]

# Evaluate DP-decoder cfg on a fold, using CE OOF avg
def eval_dp_cfg_on_fold(fold, pool_k:int, temp:float, gamma: float, min_sep:int, K_top:int, lambda_dur: float, beam_width:int):
    fi = int(fold['fold'])
    med_k = compute_class_median_durations_for_ids(fold['train_ids'])
    vids = fold['val_ids']; tot=0; cnt=0
    for sid in vids:
        sid = int(sid)
        p = load_oof_ce_avg(sid); T = p.shape[0]
        g_eff = gamma_with_length(gamma, T, med_k)
        cand = build_candidates(p, med_k=med_k, pool_k=pool_k, gamma=g_eff, temp=temp, K_top=K_top, k_delta=4)
        seq = dp_decode_from_candidates(cand, med_k=med_k, gamma=g_eff, min_sep=float(min_sep), lambda_dur=lambda_dur, T_len=T, beam_width=beam_width)
        tot += levenshtein(seq, id2seq[sid]); cnt += 1
    return tot/max(cnt,1)

# Reduced sweep grid per expert guidance + progress logging
pool_ks=[11,15]; temps=[0.90]; gammas=[0.90,0.95]; seps=[3,4]; K_tops=[20,25]; lambdas=[0.2]; beams=[60]
total_cfg = len(pool_ks)*len(temps)*len(gammas)*len(seps)*len(K_tops)*len(lambdas)*len(beams)
print(f'Sweeping DP decoder on CE6 averaged OOF (reduced grid) ... total_cfg={total_cfg}', flush=True)
res=[]; cfg_idx=0; t0=time.time()
for pool_k in pool_ks:
    for temp in temps:
        for gamma in gammas:
            for sep in seps:
                for K_top in K_tops:
                    for lam in lambdas:
                        for bw in beams:
                            cfg_idx += 1
                            per_fold=[]
                            for f in folds:
                                lev = eval_dp_cfg_on_fold(f, pool_k, temp, gamma, sep, K_top, lam, bw)
                                per_fold.append(lev)
                            res.append((np.mean(per_fold), np.max(per_fold), {'pool_k':pool_k,'temp':temp,'gamma':gamma,'sep':sep,'K':K_top,'lambda_dur':lam,'beam':bw}))
                            if (cfg_idx % 5)==0 or cfg_idx==total_cfg:
                                elapsed = (time.time()-t0)/60.0
                                print(f'  [sweep DP] cfg {cfg_idx}/{total_cfg} elapsed={elapsed:.1f}m', flush=True)
res.sort(key=lambda x: (x[1], x[0]))
print('Top CE6+DP (mean, worst, cfg):')
for r in res[:5]:
    print(r)
pd.DataFrame([{'mean':m,'worst':w, **cfg} for m,w,cfg in res]).to_csv('cv_sweep_ce_6x_dp.csv', index=False)
print('Saved cv_sweep_ce_6x_dp.csv', flush=True)

# Build test submission using CE 6x models + DP decoder best cfg
print('Building CE-6x DP submission...', flush=True)
cfg_best = pd.read_csv('cv_sweep_ce_6x_dp.csv').sort_values(['worst','mean']).iloc[0].to_dict() if Path('cv_sweep_ce_6x_dp.csv').exists() else {'pool_k':13,'temp':0.95,'gamma':1.0,'sep':3,'K':20,'lambda_dur':0.2,'beam':80}
pool_k=int(cfg_best['pool_k']); temp=float(cfg_best['temp']); gamma=float(cfg_best.get('gamma',1.0)); sep=int(cfg_best['sep']); K_top=int(cfg_best.get('K',20)); lam=float(cfg_best.get('lambda_dur',0.2)); bw=int(cfg_best.get('beam',80))
med_k_train_all = compute_class_median_durations_for_ids(train_df['Id'].astype(int).tolist())

# Minimal CE model def to load checkpoints
D_in = np.load(next(iter((feat_tr_dir).glob('*.npz'))))['X'].shape[1]
class DilatedResBlock(nn.Module):
    def __init__(self, ch, dilation, drop=0.35, groups=8, k=3):
        super().__init__();
        self.conv1 = nn.Conv1d(ch, ch, k, padding=dilation, dilation=dilation)
        self.gn1 = nn.GroupNorm(groups, ch); self.drop = nn.Dropout(drop)
        self.conv2 = nn.Conv1d(ch, ch, 1); self.gn2 = nn.GroupNorm(groups, ch)
    def forward(self, x):
        h = self.conv1(x); h = self.gn1(h); h = F.relu(h, inplace=True); h = self.drop(h);
        h = self.conv2(h); h = self.gn2(h); h = F.relu(h, inplace=True);
        return x + h
class DilatedTCN(nn.Module):
    def __init__(self, d_in, channels=128, layers=12, num_classes=21, dropout=0.35):
        super().__init__();
        self.inp = nn.Conv1d(d_in, channels, 1); blocks=[]; dil=1
        for _ in range(layers):
            blocks.append(DilatedResBlock(channels, dil, drop=dropout, groups=8, k=3));
            dil = min(dil*2, 512)
        self.blocks = nn.ModuleList(blocks); self.head = nn.Conv1d(channels, num_classes, 1)
    def forward(self, x_b_t_d):
        x = x_b_t_d.transpose(1,2); h = self.inp(x);
        for b in self.blocks: h = b(h);
        out = self.head(h); return out.transpose(1,2)

def compute_fold_scaler(id_list):
    n=0; mean=None; M2=None
    for sid in id_list:
        X = load_feat('train', int(sid)); n_i = X.shape[0]
        if mean is None:
            mean = X.mean(axis=0); M2 = ((X - mean)**2).sum(axis=0); n = n_i
        else:
            mean_i = X.mean(axis=0); n_new = n + n_i; delta = mean_i - mean
            mean = mean + delta * (n_i / max(1, n_new))
            M2 = M2 + ((X - mean_i)**2).sum(axis=0) + (delta**2) * (n * n_i / max(1, n_new)); n = n_new
    var = M2 / max(1, (n - 1)); std = np.sqrt(np.clip(var, 1e-8, None))
    return mean.astype(np.float32), std.astype(np.float32)

def apply_tta_timewarp(p_t_c: torch.Tensor, factors=(0.9,1.0,1.1)) -> torch.Tensor:
    acc=None
    for s in factors:
        T, C = p_t_c.shape; tgt_len = max(1, int(round(T*s)))
        x = p_t_c.T.unsqueeze(0);
        y = F.interpolate(x, size=tgt_len, mode='linear', align_corners=False);
        y2 = F.interpolate(y, size=T, mode='linear', align_corners=False)[0].T;
        y2 = y2 / (y2.sum(dim=-1, keepdim=True) + 1e-8)
        acc = y2 if acc is None else (acc + y2)
    out = acc / float(len(factors))
    return out / (out.sum(dim=-1, keepdim=True) + 1e-8)

def infer_ce6_probs_for_sid(sid:int, models_info, scalers):
    X = load_feat('test', int(sid));
    acc=None
    with torch.no_grad(), torch.amp.autocast('cuda' if device.type=='cuda' else 'cpu'):
        for fi in range(3):
            mean_t, std_t = scalers[fi]
            for s in (0,1):
                ckpt = Path(f"model_ce_fold{fi}{'_s1' if s==1 else ''}.pth")
                if not ckpt.exists():
                    continue
                m = DilatedTCN(d_in=D_in, channels=128, layers=12, num_classes=21, dropout=0.35).to(device)
                m.load_state_dict(torch.load(ckpt, map_location=device)); m.eval()
                xb = torch.from_numpy(X).float().to(device);
                xb = (xb - mean_t) / (std_t + 1e-6); xb = xb.unsqueeze(0)
                p = m(xb)[0].softmax(dim=-1); p = apply_tta_timewarp(p, factors=(0.9,1.0,1.1))
                acc = p if acc is None else (acc + p)
                del m
    probs = acc / float(6)
    return probs / (probs.sum(dim=-1, keepdim=True) + 1e-8)

def build_submission_ce6_dp():
    test_ids = pd.read_csv('test.csv')['Id'].astype(int).tolist()
    scalers=[compute_fold_scaler(folds[fi]['train_ids']) for fi in range(3)]
    scalers=[(torch.from_numpy(m).float().to(device), torch.from_numpy(s).float().to(device)) for (m,s) in scalers]
    rows=[]; t0=time.time()
    for i, sid in enumerate(test_ids, 1):
        p = infer_ce6_probs_for_sid(int(sid), None, scalers); T = p.shape[0]
        g_eff = gamma_with_length(gamma, T, med_k_train_all)
        cand = build_candidates(p, med_k=med_k_train_all, pool_k=pool_k, gamma=g_eff, temp=temp, K_top=K_top, k_delta=4)
        seq = dp_decode_from_candidates(cand, med_k=med_k_train_all, gamma=g_eff, min_sep=float(sep), lambda_dur=lam, T_len=T, beam_width=bw)
        rows.append({'Id': int(sid), 'Sequence': ' '.join(str(x) for x in seq)})
        if (i%10)==0 or i==len(test_ids):
            print(f"  [infer CE-6x DP] {i}/{len(test_ids)} elapsed={(time.time()-t0)/60:.1f}m", flush=True)
    sub = pd.DataFrame(rows, columns=['Id','Sequence'])
    return sub

med_k_train_all = compute_class_median_durations_for_ids(train_df['Id'].astype(int).tolist())
sub = build_submission_ce6_dp()
assert len(sub)==95
assert all(len(s.split())==20 and len(set(s.split()))==20 and all(1<=int(t)<=20 for t in s.split()) for s in sub.Sequence), 'Submission format invalid'
sub.to_csv('submission_primary_ce_6x_dp.csv', index=False)
sub.to_csv('submission.csv', index=False)
print('Wrote submission_primary_ce_6x_dp.csv and submission.csv; head:\n', sub.head(), flush=True)

Sweeping DP decoder on CE6 averaged OOF (reduced grid) ... total_cfg=16


  [sweep DP] cfg 5/16 elapsed=1.9m


  [sweep DP] cfg 10/16 elapsed=3.9m


  [sweep DP] cfg 15/16 elapsed=5.9m


  [sweep DP] cfg 16/16 elapsed=6.3m


Top CE6+DP (mean, worst, cfg):
(18.1293808836666, 18.306122448979593, {'pool_k': 11, 'temp': 0.9, 'gamma': 0.9, 'sep': 3, 'K': 20, 'lambda_dur': 0.2, 'beam': 60})
(18.1293808836666, 18.306122448979593, {'pool_k': 11, 'temp': 0.9, 'gamma': 0.9, 'sep': 3, 'K': 25, 'lambda_dur': 0.2, 'beam': 60})
(18.1293808836666, 18.306122448979593, {'pool_k': 11, 'temp': 0.9, 'gamma': 0.9, 'sep': 4, 'K': 20, 'lambda_dur': 0.2, 'beam': 60})
(18.1293808836666, 18.306122448979593, {'pool_k': 11, 'temp': 0.9, 'gamma': 0.9, 'sep': 4, 'K': 25, 'lambda_dur': 0.2, 'beam': 60})
(18.1293808836666, 18.306122448979593, {'pool_k': 11, 'temp': 0.9, 'gamma': 0.95, 'sep': 3, 'K': 20, 'lambda_dur': 0.2, 'beam': 60})
Saved cv_sweep_ce_6x_dp.csv


Building CE-6x DP submission...


  m.load_state_dict(torch.load(ckpt, map_location=device)); m.eval()


  [infer CE-6x DP] 10/95 elapsed=0.0m


  [infer CE-6x DP] 20/95 elapsed=0.1m


  [infer CE-6x DP] 30/95 elapsed=0.1m


  [infer CE-6x DP] 40/95 elapsed=0.1m


  [infer CE-6x DP] 50/95 elapsed=0.2m


  [infer CE-6x DP] 60/95 elapsed=0.2m


  [infer CE-6x DP] 70/95 elapsed=0.2m


  [infer CE-6x DP] 80/95 elapsed=0.3m


  [infer CE-6x DP] 90/95 elapsed=0.3m


  [infer CE-6x DP] 95/95 elapsed=0.3m


Wrote submission_primary_ce_6x_dp.csv and submission.csv; head:
     Id                                           Sequence
0  300  1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1  301  1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
2  302  1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
3  303  1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
4  304  1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...


In [19]:
# Set submission.csv to the best available prior submission (prefer improved decoders)
import os, shutil, pandas as pd
candidates = [
    'submission_primary_ce_6x_v2.csv',  # improved decoder (best OOF: mean 4.486, worst 5.15)
    'submission_primary_ce_6x_localsrch.csv',  # local-search decoder (OOF: mean 4.503, worst 5.18)
    'submission_primary_ce_6x.csv',
    'submission_primary_ce_tc_12x_calibrated.csv',
    'submission_primary_ce_tc_12x.csv',
    'submission_primary_ce_ms.csv',
    'submission_backup_ce_only.csv',
]
chosen = None
for c in candidates:
    if os.path.exists(c):
        chosen = c
        break
assert chosen is not None, 'No candidate submission files found'
shutil.copyfile(chosen, 'submission.csv')
print('submission.csv set from:', chosen)
print(pd.read_csv('submission.csv').head())

submission.csv set from: submission_primary_ce_6x_v2.csv
    Id                                           Sequence
0  300  5 9 1 2 18 3 8 4 20 13 12 15 14 11 6 16 19 7 1...
1  301  10 12 1 5 4 20 6 2 11 15 13 19 7 9 8 18 14 3 1...
2  302  1 17 16 12 5 19 7 13 20 18 11 3 4 6 15 8 14 10...
3  303  13 4 12 1 10 14 5 19 15 20 17 11 16 8 18 7 3 6...
4  304  8 1 12 14 18 13 9 7 2 11 3 20 19 5 10 6 15 17 ...


In [17]:
# P2-alt: Candidate assignment + adjacent-swap hill-climb decoder on CE6 OOF; tiny grid; build submission
import os, json, time, math, gc
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
probs_cache = Path('probs_cache'); lab_tr_dir = Path('labels3d_v2')/'train'
feat_tr_dir = Path('features3d_v2')/'train'; feat_te_dir = Path('features3d_v2')/'test'
folds = json.load(open('folds_archive_cv.json','r'))
train_df = pd.read_csv('training.csv'); id2seq = {int(r.Id): [int(x) for x in str(r.Sequence).strip().split()] for _, r in train_df.iterrows()}

def load_oof_ce_avg(sid:int):
    p0 = torch.from_numpy(np.load(probs_cache/f"{sid}_ce_new.npy")).to(device)
    p1 = torch.from_numpy(np.load(probs_cache/f"{sid}_ce_new_s1.npy")).to(device)
    p = (p0 + p1) * 0.5
    return p / (p.sum(dim=-1, keepdim=True) + 1e-8)

def avg_pool_probs(p_t_c: torch.Tensor, k:int) -> torch.Tensor:
    x = p_t_c.unsqueeze(0).transpose(1,2); y = F.avg_pool1d(x, kernel_size=k, stride=1, padding=k//2);
    return y.transpose(1,2).squeeze(0)

def duration_integral_single(p_t: torch.Tensor, k:int) -> torch.Tensor:
    k = max(1, int(k)); x = p_t.view(1,1,-1); w = torch.ones(1,1,k, device=p_t.device, dtype=p_t.dtype) / float(k);
    pad = (k-1)//2; y = F.conv1d(x, w, padding=pad).view(-1); T = p_t.shape[0]
    if y.shape[0] < T: y = F.pad(y, (0, T - y.shape[0]))
    elif y.shape[0] > T: y = y[:T]
    return y

def refine_com(p: torch.Tensor, t_star:int, w:int=5) -> float:
    T = p.shape[0]; a=max(0,t_star-w); b=min(T-1,t_star+w); idx=torch.arange(a,b+1, device=p.device, dtype=p.dtype);
    seg=p[a:b+1]; s=seg.sum() + 1e-8; return float(((idx*seg).sum()/s).item())

def compute_class_median_durations_for_ids(id_list):
    dur_by_c = {c: [] for c in range(1,21)}
    for sid in id_list:
        y = np.load(lab_tr_dir/f"{sid}.npy").astype(np.int16)
        for c in range(1,21):
            cnt = int((y==c).sum());
            if cnt>0: dur_by_c[c].append(cnt)
    med = {};
    for c in range(1,21):
        m = np.median(dur_by_c[c]) if len(dur_by_c[c])>0 else 13
        med[c] = int(np.clip(m, 9, 25))
    return med

def gamma_with_length(gamma_cv: float, T: int, med_k: dict):
    L_est = float(sum(med_k.get(c,13) for c in range(1,21)))
    if L_est <= 0: return gamma_cv
    ratio = float(T) / L_est; gamma_s = float(np.clip(ratio, 0.85, 1.15)); return float(gamma_cv * gamma_s)

def levenshtein(a,b):
    n,m=len(a),len(b)
    if n==0: return m
    if m==0: return n
    dp=list(range(m+1))
    for i in range(1,n+1):
        prev=dp[0]; dp[0]=i; ai=a[i-1]
        for j in range(1,m+1):
            tmp=dp[j]; dp[j]=min(dp[j]+1, dp[j-1]+1, prev + (0 if ai==b[j-1] else 1)); prev=tmp
    return dp[m]

# Build simple pairwise order prior P[a,b] = Pr(a before b) from training sequences (robust to missing classes)
def build_order_prior(train_df):
    cnt = np.zeros((21,21), dtype=np.int64)
    tot = np.zeros((21,21), dtype=np.int64)
    for seq in train_df['Sequence'].astype(str).tolist():
        s = [int(x) for x in seq.strip().split() if x.isdigit()]
        s = [x for x in s if 1 <= x <= 20]
        n = len(s)
        for i in range(n):
            a = s[i]
            for j in range(i+1, n):
                b = s[j]
                if a == b: continue
                cnt[a, b] += 1
                tot[a, b] += 1
    P = np.zeros((21,21), dtype=np.float32)
    with np.errstate(divide='ignore', invalid='ignore'):
        P = np.where(tot>0, cnt / np.maximum(1, tot), 0.5)
    np.fill_diagonal(P, 0.5)
    return P
P_order = build_order_prior(train_df)

# Candidate extraction and per-(c,t) score s(c,t) = log(pooled_prob) + beta * z(di_score)
def build_scoring_and_candidates(p_t_c: torch.Tensor, med_k: dict, pool_k:int, gamma: float, temp: float, K:int, k_delta:int=4):
    if temp != 1.0:
        p_t_c = (torch.clamp(p_t_c, 1e-8, 1.0) ** (1.0/temp)); p_t_c = p_t_c/(p_t_c.sum(dim=-1, keepdim=True)+1e-8)
    p_s = avg_pool_probs(p_t_c, k=pool_k); T, C = p_s.shape
    di = torch.zeros_like(p_s); ks=[13]*C
    for c in range(C):
        if c==0: di[:,c]=p_s[:,c]; ks[c]=13; continue
        base_k = med_k.get(c,13); k_c = int(np.clip(round(gamma*base_k), 9, 25));
        if k_c % 2 == 0: k_c = min(25, k_c+1); ks[c]=k_c
        ks_multi = sorted(set([int(np.clip(k_c - k_delta, 9, 25)), k_c, int(np.clip(k_c + k_delta, 9, 25))]));
        ks_multi = [k if (k % 2)==1 else min(25, k+1) for k in ks_multi]
        acc=None
        for k in ks_multi:
            x = duration_integral_single(p_s[:,c], k=k).unsqueeze(1); acc = x if acc is None else (acc + x)
        di[:,c] = (acc/float(len(ks_multi))).squeeze(1)
    # z-score di per class over time
    mu = di.mean(dim=0, keepdim=True); sd = di.std(dim=0, keepdim=True) + 1e-8
    z = (di - mu) / sd
    # score grid
    logp = torch.log(torch.clamp(p_s, 1e-8, 1.0))
    return p_s, di, z, logp, ks

def initial_assignment(p_s, z, logp, ks, beta: float, K:int, min_sep:int):
    T, C = p_s.shape
    # independent best per class by s(c,t), then time-sort and enforce min_sep
    items=[]
    for c in range(1,21):
        s = logp[:,c] + beta * z[:,c]
        t_star = int(torch.argmax(s).item())
        # refine with COM on pooled prob to stabilize
        w_com = max(5, ks[c]//3);
        t_ref = refine_com(p_s[:,c], t_star, w=w_com)
        t_idx = max(0, min(int(round(t_ref)), T-1))
        score_ct = float(s[t_idx].item())
        items.append([float(t_ref), int(c), score_ct])
    items.sort(key=lambda x: x[0])
    # enforce min_sep monotonic times
    last_t = -1e9
    for it in items:
        if it[0] <= last_t + float(min_sep):
            it[0] = last_t + float(min_sep)
        last_t = min(it[0], float(T-1))
    return items  # list of [t, c, s]

def objective_S(items, beta: float, lambda_ord: float):
    # items: list of [t, c, s_ct]; s_ct precomputed = logp+beta*z at that (c,t)
    S = 0.0
    # main score
    for _, _, sct in items:
        S += float(sct)
    # order prior penalty: for each i<j, if c_i after c_j then penalty proportional to (1 - P[c_i,c_j])
    if lambda_ord > 0:
        n = len(items)
        for i in range(n):
            ci = items[i][1]
            for j in range(i+1, n):
                cj = items[j][1]
                pij = float(P_order[ci, cj]) if 1 <= ci <= 20 and 1 <= cj <= 20 else 0.5
                # in sequence, i comes before j; if pij is low, penalize
                S -= lambda_ord * (1.0 - pij)
    return S

def hill_climb_adjacent(items, p_s, z, logp, ks, beta: float, lambda_ord: float, max_passes:int=5):
    # We keep times fixed to positions; swapping classes swaps which score s(c,t) we pick.
    # Recompute s_ct on-demand for swapped pairs.
    improved = True; passes = 0
    # Precompute s(c,t) accessor
    def s_at(c:int, t:float):
        T = p_s.shape[0]
        t_idx = max(0, min(int(round(t)), T-1))
        return float((logp[t_idx, c] + beta * z[t_idx, c]).item())
    while improved and passes < max_passes:
        improved = False; passes += 1
        i = 0
        while i < len(items)-1:
            t_i, c_i, s_i = items[i]
            t_j, c_j, s_j = items[i+1]
            # score before
            S_before = objective_S(items, beta, lambda_ord)
            # try swap c_i and c_j (times stay t_i, t_j)
            s_i_new = s_at(c_j, t_i); s_j_new = s_at(c_i, t_j)
            items[i][1] = c_j; items[i][2] = s_i_new
            items[i+1][1] = c_i; items[i+1][2] = s_j_new
            S_after = objective_S(items, beta, lambda_ord)
            if S_after + 1e-9 >= S_before:
                improved = improved or (S_after > S_before + 1e-6)
                # keep swap
            else:
                # revert
                items[i][1] = c_i; items[i][2] = s_i
                items[i+1][1] = c_j; items[i+1][2] = s_j
            i += 1
    return items

def decode_localsrch(p_t_c: torch.Tensor, med_k: dict, pool_k:int, temp: float, gamma: float, min_sep:int, beta: float, lambda_ord: float, K:int=3):
    T = p_t_c.shape[0]
    g_eff = gamma
    p_s, di, z, logp, ks = build_scoring_and_candidates(p_t_c, med_k, pool_k, g_eff, temp, K)
    items = initial_assignment(p_s, z, logp, ks, beta=beta, K=K, min_sep=min_sep)
    items = hill_climb_adjacent(items, p_s, z, logp, ks, beta=beta, lambda_ord=lambda_ord, max_passes=4)
    # return sequence by positions (already increasing time)
    seq = [int(c) for (_, c, _) in items]
    # uniqueness enforcement (rare): if duplicates, fallback to unique by first occurrence
    if len(set(seq)) < 20:
        seen=set(); seq2=[]
        for c in seq:
            if c in seen: continue
            seen.add(c); seq2.append(c)
        # append any missing in order of not-seen
        for c in range(1,21):
            if c not in seen: seq2.append(c)
        seq = seq2[:20]
    return seq

# Evaluate on CE6 averaged OOF with small grid; pick by worst-fold then mean
pool_ks=[11,15]; temps=[0.90,1.00]; gammas=[0.90,0.95,1.00]; seps=[2,3]; betas=[0.3,0.5]; lords=[0.03,0.05]
print('Sweeping local-search decoder on CE6 averaged OOF (tiny grid)...', flush=True)
med_cache={}
def eval_cfg_on_fold_localsrch(fold, pool_k, temp, gamma, sep, beta, l_ord):
    fi = int(fold['fold'])
    if fi not in med_cache: med_cache[fi] = compute_class_median_durations_for_ids(fold['train_ids'])
    med_k = med_cache[fi]
    vids = fold['val_ids']; tot=0; cnt=0
    for sid in vids:
        sid=int(sid); p = load_oof_ce_avg(sid); T = p.shape[0]
        g_eff = gamma_with_length(gamma, T, med_k)
        seq = decode_localsrch(p, med_k=med_k, pool_k=pool_k, temp=temp, gamma=g_eff, min_sep=sep, beta=beta, lambda_ord=l_ord, K=3)
        tot += levenshtein(seq, id2seq[sid]); cnt += 1
    return tot/max(cnt,1)

res=[]; t0=time.time(); cfg_idx=0; total_cfg=len(pool_ks)*len(temps)*len(gammas)*len(seps)*len(betas)*len(lords)
for pool_k in pool_ks:
    for temp in temps:
        for gamma in gammas:
            for sep in seps:
                for beta in betas:
                    for l_ord in lords:
                        cfg_idx += 1
                        per_fold=[]
                        for f in folds:
                            lev = eval_cfg_on_fold_localsrch(f, pool_k, temp, gamma, sep, beta, l_ord)
                            per_fold.append(lev)
                        res.append((np.mean(per_fold), np.max(per_fold), {'pool_k':pool_k,'temp':temp,'gamma':gamma,'sep':sep,'beta':beta,'lambda_ord':l_ord}))
                        if (cfg_idx % 6)==0 or cfg_idx==total_cfg:
                            print(f"  [sweep local] cfg {cfg_idx}/{total_cfg} elapsed={(time.time()-t0)/60:.1f}m", flush=True)
res.sort(key=lambda x: (x[1], x[0]))
print('Top CE6 local-search (mean, worst, cfg):')
for r in res[:5]: print(r)
pd.DataFrame([{'mean':m,'worst':w, **cfg} for m,w,cfg in res]).to_csv('cv_sweep_ce_6x_localsrch.csv', index=False)
print('Saved cv_sweep_ce_6x_localsrch.csv', flush=True)

# Test-time inference with 6 CE models + local-search decoder
D_in = np.load(next(iter((feat_tr_dir).glob('*.npz'))))['X'].shape[1]
class DilatedResBlock(nn.Module):
    def __init__(self, ch, dilation, drop=0.35, groups=8, k=3):
        super().__init__(); self.conv1 = nn.Conv1d(ch, ch, k, padding=dilation, dilation=dilation)
        self.gn1 = nn.GroupNorm(groups, ch); self.drop = nn.Dropout(drop); self.conv2 = nn.Conv1d(ch, ch, 1); self.gn2 = nn.GroupNorm(groups, ch)
    def forward(self, x):
        h = self.conv1(x); h = self.gn1(h); h = F.relu(h, inplace=True); h = self.drop(h); h = self.conv2(h); h = self.gn2(h); h = F.relu(h, inplace=True); return x + h
class DilatedTCN(nn.Module):
    def __init__(self, d_in, channels=128, layers=12, num_classes=21, dropout=0.35):
        super().__init__(); self.inp = nn.Conv1d(d_in, channels, 1); blocks=[]; dil=1
        for _ in range(layers): blocks.append(DilatedResBlock(channels, dil, drop=dropout, groups=8, k=3)); dil = min(dil*2, 512)
        self.blocks = nn.ModuleList(blocks); self.head = nn.Conv1d(channels, num_classes, 1)
    def forward(self, x_b_t_d):
        x = x_b_t_d.transpose(1,2); h = self.inp(x);
        for b in self.blocks: h = b(h); out = self.head(h); return out.transpose(1,2)

def load_feat(split, sid:int):
    d = np.load((feat_tr_dir if split=='train' else feat_te_dir)/f"{sid}.npz"); return d['X'].astype(np.float32)

def compute_fold_scaler(id_list):
    n=0; mean=None; M2=None
    for sid in id_list:
        X = load_feat('train', int(sid)); n_i = X.shape[0]
        if mean is None: mean = X.mean(axis=0); M2 = ((X - mean)**2).sum(axis=0); n = n_i
        else:
            mean_i = X.mean(axis=0); n_new = n + n_i; delta = mean_i - mean
            mean = mean + delta * (n_i / max(1, n_new));
            M2 = M2 + ((X - mean_i)**2).sum(axis=0) + (delta**2) * (n * n_i / max(1, n_new)); n = n_new
    var = M2 / max(1, (n - 1)); std = np.sqrt(np.clip(var, 1e-8, None))
    return mean.astype(np.float32), std.astype(np.float32)

def apply_tta_timewarp(p_t_c: torch.Tensor, factors=(0.9,1.0,1.1)) -> torch.Tensor:
    acc=None
    for s in factors:
        T, C = p_t_c.shape; tgt_len = max(1, int(round(T*s)))
        x = p_t_c.T.unsqueeze(0); y = F.interpolate(x, size=tgt_len, mode='linear', align_corners=False);
        y2 = F.interpolate(y, size=T, mode='linear', align_corners=False)[0].T; y2 = y2 / (y2.sum(dim=-1, keepdim=True) + 1e-8)
        acc = y2 if acc is None else (acc + y2)
    out = acc / float(len(factors)); return out / (out.sum(dim=-1, keepdim=True) + 1e-8)

def infer_ce6_probs_for_sid(sid:int, scalers):
    X = load_feat('test', int(sid)); acc=None
    with torch.no_grad(), torch.amp.autocast('cuda' if device.type=='cuda' else 'cpu'):
        for fi in range(3):
            mean_t, std_t = scalers[fi]
            for s in (0,1):
                ckpt = Path(f"model_ce_fold{fi}{'_s1' if s==1 else ''}.pth")
                if not ckpt.exists(): continue
                m = DilatedTCN(d_in=D_in, channels=128, layers=12, num_classes=21, dropout=0.35).to(device)
                m.load_state_dict(torch.load(ckpt, map_location=device)); m.eval()
                xb = torch.from_numpy(X).float().to(device); xb = (xb - mean_t) / (std_t + 1e-6); xb = xb.unsqueeze(0)
                p = m(xb)[0].softmax(dim=-1); p = apply_tta_timewarp(p, factors=(0.9,1.0,1.1))
                acc = p if acc is None else (acc + p); del m
    probs = acc / float(6); return probs / (probs.sum(dim=-1, keepdim=True) + 1e-8)

def build_submission_ce6_localsrch():
    cfg = pd.read_csv('cv_sweep_ce_6x_localsrch.csv').sort_values(['worst','mean']).iloc[0].to_dict() if Path('cv_sweep_ce_6x_localsrch.csv').exists() else {'pool_k':11,'temp':0.9,'gamma':0.95,'sep':2,'beta':0.5,'lambda_ord':0.05}
    pool_k=int(cfg['pool_k']); temp=float(cfg['temp']); gamma=float(cfg.get('gamma',1.0)); sep=int(cfg['sep']); beta=float(cfg.get('beta',0.5)); l_ord=float(cfg.get('lambda_ord',0.05))
    med_k_all = compute_class_median_durations_for_ids(train_df['Id'].astype(int).tolist())
    scalers=[compute_fold_scaler(folds[fi]['train_ids']) for fi in range(3)]
    scalers=[(torch.from_numpy(m).float().to(device), torch.from_numpy(s).float().to(device)) for (m,s) in scalers]
    test_ids = pd.read_csv('test.csv')['Id'].astype(int).tolist()
    rows=[]; t0=time.time()
    for i, sid in enumerate(test_ids, 1):
        p = infer_ce6_probs_for_sid(int(sid), scalers)
        T = p.shape[0]; g_eff = gamma_with_length(gamma, T, med_k_all)
        seq = decode_localsrch(p, med_k=med_k_all, pool_k=pool_k, temp=temp, gamma=g_eff, min_sep=sep, beta=beta, lambda_ord=l_ord, K=3)
        rows.append({'Id': int(sid), 'Sequence': ' '.join(str(x) for x in seq)})
        if (i%10)==0 or i==len(test_ids):
            print(f"  [infer CE-6x local] {i}/{len(test_ids)} elapsed={(time.time()-t0)/60:.1f}m", flush=True)
    sub = pd.DataFrame(rows, columns=['Id','Sequence'])
    assert len(sub)==95
    assert all(len(s.split())==20 and len(set(s.split()))==20 and all(1<=int(t)<=20 for t in s.split()) for s in sub.Sequence), 'Submission format invalid'
    sub.to_csv('submission_primary_ce_6x_localsrch.csv', index=False); sub.to_csv('submission.csv', index=False)
    print('Wrote submission_primary_ce_6x_localsrch.csv and submission.csv; head:\n', sub.head(), flush=True)

# After sweep finishes, run test-time build
print('Local-search decoder cell ready. Execute this cell to sweep and build submission.', flush=True)

Sweeping local-search decoder on CE6 averaged OOF (tiny grid)...


  [sweep local] cfg 6/96 elapsed=0.3m


  [sweep local] cfg 12/96 elapsed=0.6m


  [sweep local] cfg 18/96 elapsed=0.9m


  [sweep local] cfg 24/96 elapsed=1.2m


  [sweep local] cfg 30/96 elapsed=1.5m


  [sweep local] cfg 36/96 elapsed=1.8m


  [sweep local] cfg 42/96 elapsed=2.1m


  [sweep local] cfg 48/96 elapsed=2.4m


  [sweep local] cfg 54/96 elapsed=2.7m


  [sweep local] cfg 60/96 elapsed=3.0m


  [sweep local] cfg 66/96 elapsed=3.4m


  [sweep local] cfg 72/96 elapsed=3.7m


  [sweep local] cfg 78/96 elapsed=4.0m


  [sweep local] cfg 84/96 elapsed=4.3m


  [sweep local] cfg 90/96 elapsed=4.6m


  [sweep local] cfg 96/96 elapsed=4.9m


Top CE6 local-search (mean, worst, cfg):
(4.5029670858242286, 5.18, {'pool_k': 11, 'temp': 1.0, 'gamma': 0.9, 'sep': 2, 'beta': 0.5, 'lambda_ord': 0.03})
(4.5029670858242286, 5.18, {'pool_k': 11, 'temp': 1.0, 'gamma': 0.9, 'sep': 2, 'beta': 0.5, 'lambda_ord': 0.05})
(4.5029670858242286, 5.18, {'pool_k': 11, 'temp': 1.0, 'gamma': 0.9, 'sep': 3, 'beta': 0.5, 'lambda_ord': 0.03})
(4.5029670858242286, 5.18, {'pool_k': 11, 'temp': 1.0, 'gamma': 0.9, 'sep': 3, 'beta': 0.5, 'lambda_ord': 0.05})
(4.5029670858242286, 5.18, {'pool_k': 11, 'temp': 1.0, 'gamma': 0.95, 'sep': 2, 'beta': 0.5, 'lambda_ord': 0.03})
Saved cv_sweep_ce_6x_localsrch.csv


Local-search decoder cell ready. Execute this cell to sweep and build submission.


In [18]:
# Build submission using best local-search cfg
build_submission_ce6_localsrch()

  m.load_state_dict(torch.load(ckpt, map_location=device)); m.eval()


  [infer CE-6x local] 10/95 elapsed=0.0m


  [infer CE-6x local] 20/95 elapsed=0.0m


  [infer CE-6x local] 30/95 elapsed=0.1m


  [infer CE-6x local] 40/95 elapsed=0.1m


  [infer CE-6x local] 50/95 elapsed=0.1m


  [infer CE-6x local] 60/95 elapsed=0.1m


  [infer CE-6x local] 70/95 elapsed=0.2m


  [infer CE-6x local] 80/95 elapsed=0.2m


  [infer CE-6x local] 90/95 elapsed=0.2m


  [infer CE-6x local] 95/95 elapsed=0.2m


Wrote submission_primary_ce_6x_localsrch.csv and submission.csv; head:
     Id                                           Sequence
0  300  9 2 1 15 18 3 12 8 10 11 4 20 13 5 14 19 6 16 ...
1  301  12 10 1 11 4 6 15 13 5 19 9 7 8 18 2 14 3 16 2...
2  302  12 1 17 20 16 19 13 5 18 3 10 4 6 8 14 15 7 9 ...
3  303  18 13 5 10 4 11 20 12 17 14 16 8 3 9 7 19 1 6 ...
4  304  1 12 10 2 14 18 13 5 9 7 11 3 19 6 15 8 17 16 ...


In [20]:
# P3: Build features3d_v3 by augmenting v2 with motion features (velocity, acceleration, |v|, EMA|v|)
import os, json, time
from pathlib import Path
import numpy as np

src_tr = Path('features3d_v2')/'train'
src_te = Path('features3d_v2')/'test'
dst_tr = Path('features3d_v3')/'train'
dst_te = Path('features3d_v3')/'test'
dst_tr.mkdir(parents=True, exist_ok=True)
dst_te.mkdir(parents=True, exist_ok=True)

def five_point_derivative(x_t_d: np.ndarray):
    # x: [T, D]; compute 5-point symmetric first derivative per feature with edge replication
    T, D = x_t_d.shape
    if T < 5:
        # fallback to simple 1-step diff with pad
        dx = np.zeros_like(x_t_d, dtype=np.float32)
        dx[1:] = x_t_d[1:] - x_t_d[:-1]
        dx[0] = dx[1]
        return dx.astype(np.float32)
    x = x_t_d.astype(np.float32)
    x_m2 = np.vstack([x[0:1], x[0:1], x[:-2]])
    x_m1 = np.vstack([x[0:1], x[:-1]])
    x_p1 = np.vstack([x[1:], x[-1:]])
    x_p2 = np.vstack([x[2:], x[-1:], x[-1:]])
    v = (-x_p2 + 8.0*x_p1 - 8.0*x_m1 + x_m2) / 12.0
    return v.astype(np.float32)

def five_point_second_derivative(x_t_d: np.ndarray):
    # approximate second derivative via 5-tap stencil
    T, D = x_t_d.shape
    x = x_t_d.astype(np.float32)
    if T < 5:
        # fallback: second diff of simple diff
        d1 = np.zeros_like(x); d1[1:] = x[1:] - x[:-1]; d1[0] = d1[1] if T>1 else 0.0
        a = np.zeros_like(x); a[1:] = d1[1:] - d1[:-1]; a[0] = a[1] if T>1 else 0.0
        return a.astype(np.float32)
    x_m2 = np.vstack([x[0:1], x[0:1], x[:-2]])
    x_m1 = np.vstack([x[0:1], x[:-1]])
    x_p1 = np.vstack([x[1:], x[-1:]])
    x_p2 = np.vstack([x[2:], x[-1:], x[-1:]])
    a = (-x_p2 + 16.0*x_p1 - 30.0*x + 16.0*x_m1 - x_m2) / 12.0
    return a.astype(np.float32)

def ema(arr: np.ndarray, alpha: float = 0.9):
    # arr: [T, D]
    out = np.empty_like(arr, dtype=np.float32)
    if arr.shape[0] == 0:
        return out
    out[0] = arr[0]
    for t in range(1, arr.shape[0]):
        out[t] = alpha * out[t-1] + (1.0 - alpha) * arr[t]
    return out

def build_v3_from_v2_file(src_path: Path, dst_path: Path):
    d = np.load(src_path)
    X = d['X'].astype(np.float32)  # [T, D]
    v = five_point_derivative(X)               # [T, D]
    a = five_point_second_derivative(X)        # [T, D]
    abs_v = np.abs(v).astype(np.float32)       # [T, D]
    ema_abs_v = ema(abs_v, alpha=0.9)          # [T, D]
    Xv3 = np.concatenate([X, v, a, abs_v, ema_abs_v], axis=1).astype(np.float32)
    np.savez_compressed(dst_path, X=Xv3)

def process_split(src_dir: Path, dst_dir: Path, tag: str):
    paths = sorted(src_dir.glob('*.npz'))
    t0 = time.time()
    for i, p in enumerate(paths, 1):
        outp = dst_dir / p.name
        if outp.exists():
            continue
        build_v3_from_v2_file(p, outp)
        if (i % 25) == 0 or i == len(paths):
            print(f"  [{tag}] {i}/{len(paths)} elapsed={(time.time()-t0)/60:.1f}m", flush=True)

print('Building features3d_v3 train/test from features3d_v2 ...', flush=True)
process_split(src_tr, dst_tr, 'train')
process_split(src_te, dst_te, 'test')
print('Done features3d_v3. Example shapes:', flush=True)
ex = next(iter(sorted(dst_tr.glob('*.npz'))))
d_ex = np.load(ex)
print('Sample:', ex.name, 'X shape:', d_ex['X'].shape, flush=True)

Building features3d_v3 train/test from features3d_v2 ...


  [train] 25/297 elapsed=0.1m


  [train] 50/297 elapsed=0.2m


  [train] 75/297 elapsed=0.2m


  [train] 100/297 elapsed=0.3m


  [train] 125/297 elapsed=0.4m


  [train] 150/297 elapsed=0.5m


  [train] 175/297 elapsed=0.5m


  [train] 200/297 elapsed=0.6m


  [train] 225/297 elapsed=0.7m


  [train] 250/297 elapsed=0.8m


  [train] 275/297 elapsed=0.9m


  [train] 297/297 elapsed=0.9m


  [test] 25/95 elapsed=0.1m


  [test] 50/95 elapsed=0.2m


  [test] 75/95 elapsed=0.2m


  [test] 95/95 elapsed=0.3m


Done features3d_v3. Example shapes:


Sample: 1.npz X shape: (1254, 1095)


In [21]:
# P3/P4: Train CE on features3d_v3 (motion-augmented), cache OOF, sweep improved decoder, build 6x submission
import os, json, math, time, random, gc
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('CUDA available:', torch.cuda.is_available(), flush=True)
assert torch.cuda.is_available(), 'GPU required'
torch.backends.cudnn.benchmark = True
try: torch.set_float32_matmul_precision('high')
except Exception: pass

feat_tr_dir = Path('features3d_v3')/'train'
feat_te_dir = Path('features3d_v3')/'test'
lab_tr_dir  = Path('labels3d_v2')/'train'
probs_cache = Path('probs_cache'); probs_cache.mkdir(exist_ok=True)
folds = json.load(open('folds_archive_cv.json','r'))
train_df = pd.read_csv('training.csv')
id2seq = {int(r.Id): [int(x) for x in str(r.Sequence).strip().split()] for _, r in train_df.iterrows()}

class DilatedResBlock(nn.Module):
    def __init__(self, ch, dilation, drop=0.35, groups=8, k=3):
        super().__init__()
        self.conv1 = nn.Conv1d(ch, ch, k, padding=dilation, dilation=dilation)
        self.gn1 = nn.GroupNorm(groups, ch); self.drop = nn.Dropout(drop)
        self.conv2 = nn.Conv1d(ch, ch, 1); self.gn2 = nn.GroupNorm(groups, ch)
    def forward(self, x):
        h = self.conv1(x); h = self.gn1(h); h = F.relu(h, inplace=True); h = self.drop(h)
        h = self.conv2(h); h = self.gn2(h); h = F.relu(h, inplace=True)
        return x + h

class DilatedTCN(nn.Module):
    def __init__(self, d_in, channels=128, layers=12, num_classes=21, dropout=0.35):
        super().__init__()
        self.inp = nn.Conv1d(d_in, channels, 1)
        blocks=[]; dil=1
        for _ in range(layers):
            blocks.append(DilatedResBlock(channels, dil, drop=dropout, groups=8, k=3));
            dil = min(dil*2, 512)
        self.blocks = nn.ModuleList(blocks)
        self.head = nn.Conv1d(channels, num_classes, 1)
    def forward(self, x_b_t_d):
        x = x_b_t_d.transpose(1,2);
        h = self.inp(x)
        for b in self.blocks: h = b(h)
        out = self.head(h)
        return out.transpose(1,2)

class EMA:
    def __init__(self, model, decay=0.999):
        self.decay = decay; self.shadow = {n: p.detach().clone() for n,p in model.named_parameters() if p.requires_grad}
    @torch.no_grad()
    def update(self, model):
        for n,p in model.named_parameters():
            if p.requires_grad: self.shadow[n].mul_(self.decay).add_(p.detach(), alpha=1.0-self.decay)
    def apply_to(self, model):
        self.backup={}
        for n,p in model.named_parameters():
            if p.requires_grad: self.backup[n]=p.detach().clone(); p.data.copy_(self.shadow[n].data)
    def restore(self, model):
        for n,p in model.named_parameters():
            if p.requires_grad: p.data.copy_(self.backup[n].data)

def load_feat_full(sample_id:int):
    d = np.load((feat_tr_dir/f"{sample_id}.npz")); return d['X'].astype(np.float32)
def load_feat(split, sid:int):
    d = np.load((feat_tr_dir if split=='train' else feat_te_dir)/f"{sid}.npz"); return d['X'].astype(np.float32)
def load_labels(sample_id:int):
    return np.load(lab_tr_dir/f"{sample_id}.npy").astype(np.int64)

def compute_fold_scaler(id_list):
    n=0; mean=None; M2=None
    for sid in id_list:
        X = load_feat_full(int(sid)); n_i = X.shape[0]
        if mean is None:
            mean = X.mean(axis=0); M2 = ((X - mean)**2).sum(axis=0); n = n_i
        else:
            mean_i = X.mean(axis=0); n_new = n + n_i; delta = mean_i - mean
            mean = mean + delta * (n_i / max(1, n_new))
            M2 = M2 + ((X - mean_i)**2).sum(axis=0) + (delta**2) * (n * n_i / max(1, n_new)); n = n_new
    var = M2 / max(1, (n - 1)); std = np.sqrt(np.clip(var, 1e-8, None))
    return mean.astype(np.float32), std.astype(np.float32)

def compute_class_weights(train_ids):
    counts = np.zeros(21, dtype=np.int64)
    for sid in train_ids:
        y = load_labels(int(sid)); vals, cnts = np.unique(y, return_counts=True)
        for v,c in zip(vals, cnts):
            if 0 <= v <= 20: counts[v] += int(c)
    freq = counts / max(1, counts.sum())
    w = 1.0 / np.sqrt(np.clip(freq, 1e-12, None)); w = w / w.mean()
    w[0] = min(w[0], 0.7 * w.mean())
    return torch.tensor(w, dtype=torch.float32, device=device)

class SeqDataset(Dataset):
    def __init__(self, ids, mean, std, train=True, crop_min=1600, crop_max=4096, time_masks=(3,5), mask_len=(5,15), noise_std=0.01, seed=42, time_stretch=(0.95,1.05)):
        self.ids=list(ids); self.mean=torch.from_numpy(mean).float(); self.std=torch.from_numpy(std).float()
        self.train=train; self.crop_min=crop_min; self.crop_max=crop_max
        self.tmask_lo, self.tmask_hi = time_masks; self.mlen_lo, self.mlen_hi = mask_len
        self.noise_std=noise_std; self.ts=time_stretch; self.rng=random.Random(seed)
    def __len__(self): return len(self.ids)
    def _rand_crop(self, X, y):
        T=X.shape[0];
        if not self.train: return X,y
        tgt=self.rng.randint(self.crop_min, min(self.crop_max, max(self.crop_min,T)))
        if T<=tgt: return X,y
        s=self.rng.randint(0, T - tgt); e=s+tgt; return X[s:e], y[s:e]
    def _time_mask(self, X):
        if not self.train: return X
        T=X.shape[0]; m=self.rng.randint(self.tmask_lo, self.tmask_hi)
        for _ in range(m):
            L=self.rng.randint(self.mlen_lo, self.mlen_hi)
            if T<=L: continue
            s=self.rng.randint(0, T-L); e=s+L
            seg_mean = X[max(0,s-5):min(T,e+5)].mean(axis=0, keepdims=True)
            X[s:e] = seg_mean
        return X
    def _time_stretch(self, X, y):
        if not self.train or self.ts is None: return X,y
        lo,hi = self.ts; s = self.rng.uniform(lo, hi)
        if abs(s-1.0) < 1e-3: return X,y
        T = X.shape[0]; tgt = max(1, int(round(T*s)))
        # linear interp for X
        x_t = torch.from_numpy(X).float().unsqueeze(0).transpose(1,2)
        Xs = F.interpolate(x_t, size=tgt, mode='linear', align_corners=False).transpose(1,2)[0].numpy()
        # nearest for y
        y_t = torch.from_numpy(y).long().unsqueeze(0).unsqueeze(0).float()
        ys = F.interpolate(y_t, size=tgt, mode='nearest')[0,0].long().numpy()
        return Xs, ys
    def __getitem__(self, idx):
        sid = int(self.ids[idx])
        X = load_feat_full(sid); y = load_labels(sid)
        X, y = self._rand_crop(X, y)
        X, y = self._time_stretch(X, y)
        X = (torch.from_numpy(X).float() - self.mean) / (self.std + 1e-6)
        if self.train:
            if self.noise_std>0: X = X + torch.randn_like(X) * self.noise_std
            Xn = X.numpy(); Xn = self._time_mask(Xn); X = torch.from_numpy(Xn).float()
        y = torch.from_numpy(y).long()
        return X, y

def collate_pad(batch):
    xs, ys = zip(*batch); T_max = max(x.shape[0] for x in xs); D = xs[0].shape[1]
    xb = torch.zeros((len(xs), T_max, D), dtype=torch.float32)
    yb = torch.full((len(xs), T_max), -100, dtype=torch.long)
    for i,(x,y) in enumerate(zip(xs,ys)):
        T = x.shape[0]; xb[i,:T]=x; yb[i,:T]=y
    return xb, yb

def cosine_with_warmup(step, total_steps, warmup_steps, base_lr, min_lr):
    if step < warmup_steps: return base_lr * (step / max(1, warmup_steps))
    t = (step - warmup_steps) / max(1, (total_steps - warmup_steps))
    return min_lr + 0.5*(base_lr - min_lr)*(1 + math.cos(math.pi * t))

def train_fold(fold_idx, train_ids, val_ids, out_name, ds_seed, epochs=35, batch_size=8, base_lr=3e-3, min_lr=3e-5, wd=0.01, label_smooth=0.05):
    print(f"=== Train v3 fold {fold_idx} ({out_name}) ===", flush=True)
    mean, std = compute_fold_scaler(train_ids); class_w = compute_class_weights(train_ids)
    D_in = np.load(next(iter((feat_tr_dir).glob('*.npz'))))['X'].shape[1]
    model = DilatedTCN(d_in=D_in, channels=128, layers=12, num_classes=21, dropout=0.35).to(device)
    # seeds
    torch.manual_seed(1337 + ds_seed); np.random.seed(4242 + ds_seed); random.seed(9001 + ds_seed)
    ema = EMA(model, decay=0.999); scaler = torch.amp.GradScaler('cuda', enabled=True)
    opt = torch.optim.AdamW(model.parameters(), lr=base_lr, weight_decay=wd, betas=(0.9,0.999))
    tr_ds = SeqDataset(train_ids, mean, std, train=True, seed=ds_seed, time_stretch=(0.95,1.05))
    va_ds = SeqDataset(val_ids, mean, std, train=False, seed=ds_seed+777, time_stretch=None)
    tr_ld = DataLoader(tr_ds, batch_size=batch_size, shuffle=True, drop_last=True, num_workers=0, collate_fn=collate_pad, pin_memory=True)
    va_ld = DataLoader(va_ds, batch_size=1, shuffle=False, drop_last=False, num_workers=0, collate_fn=collate_pad, pin_memory=True)
    steps_per_epoch = max(1, len(tr_ld)); total_steps = steps_per_epoch * epochs; warmup_steps = 5 * steps_per_epoch
    crit = nn.CrossEntropyLoss(weight=class_w, label_smoothing=label_smooth, ignore_index=-100)
    best_val = float('inf'); bad=0; patience=6; t0=time.time()
    for ep in range(1, epochs+1):
        model.train(); tr_loss=0.0; seen=0; t_ep=time.time(); opt.zero_grad(set_to_none=True)
        for step,(xb,yb) in enumerate(tr_ld):
            xb=xb.to(device, non_blocking=True); yb=yb.to(device, non_blocking=True); C=21
            lr = cosine_with_warmup((ep-1)*steps_per_epoch + step, total_steps, warmup_steps, base_lr, min_lr)
            for pg in opt.param_groups: pg['lr']=lr
            with torch.amp.autocast('cuda'):
                logits = model(xb); loss = crit(logits.reshape(-1, C), yb.reshape(-1))
            scaler.scale(loss).backward()
            scaler.unscale_(opt); torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(opt); scaler.update(); opt.zero_grad(set_to_none=True); ema.update(model)
            tr_loss += loss.item() * xb.shape[0]; seen += xb.shape[0]
        # val
        model.eval(); ema.apply_to(model); val_loss=0.0; vseen=0
        with torch.no_grad(), torch.amp.autocast('cuda'):
            for xb,yb in va_ld:
                xb=xb.to(device, non_blocking=True); yb=yb.to(device, non_blocking=True); C=21
                logits = model(xb); loss = crit(logits.reshape(-1, C), yb.reshape(-1)); val_loss += loss.item(); vseen += 1
        ema.restore(model); val_loss = val_loss / max(1, vseen)
        print(f"[v3 fold {fold_idx}] ep{ep} tr={tr_loss/max(1,seen):.4f} va={val_loss:.4f} elapsed={(time.time()-t_ep):.1f}s total={(time.time()-t0)/60:.1f}m", flush=True)
        if val_loss < best_val - 1e-4:
            best_val = val_loss; bad=0; ema.apply_to(model); torch.save(model.state_dict(), out_name); ema.restore(model)
        else:
            bad += 1
            if bad >= patience: print(f"  Early stop at ep{ep}", flush=True); break
        torch.cuda.empty_cache(); gc.collect()
    print(f"Fold {fold_idx} v3 done. Best CE={best_val:.4f} -> {out_name}")

# Train v3 models: two seeds x 3 folds
for f in folds:
    fi = int(f['fold'])
    for si, suf in enumerate(['', '_s1']):
        outp = Path(f"model_ce_v3_fold{fi}{suf}.pth")
        if outp.exists():
            try: outp.unlink()
            except Exception: pass
        ds_seed = 2027 + fi*13 + si*101
        train_fold(fi, f['train_ids'], f['val_ids'], out_name=str(outp), ds_seed=ds_seed, epochs=35, batch_size=8)
        torch.cuda.empty_cache(); gc.collect()

# Cache OOF probs for v3 models (both seeds), with TTA=(0.9,1.0,1.1)
def apply_tta_timewarp(p_t_c: torch.Tensor, factors=(0.9,1.0,1.1)) -> torch.Tensor:
    acc=None
    for s in factors:
        T,C = p_t_c.shape; tgt_len = max(1, int(round(T*s)))
        x = p_t_c.T.unsqueeze(0)
        y = F.interpolate(x, size=tgt_len, mode='linear', align_corners=False)
        y2 = F.interpolate(y, size=T, mode='linear', align_corners=False)[0].T
        y2 = y2 / (y2.sum(dim=-1, keepdim=True) + 1e-8)
        acc = y2 if acc is None else (acc + y2)
    out = acc / float(len(factors))
    return out / (out.sum(dim=-1, keepdim=True) + 1e-8)

def cache_fold_val_probs_v3(fold, seed_suffix: str):
    fi = int(fold['fold'])
    ckpt = Path(f"model_ce_v3_fold{fi}{seed_suffix}.pth"); assert ckpt.exists(), f"Missing {ckpt}"
    D_in = np.load(next(iter((feat_tr_dir).glob('*.npz'))))['X'].shape[1]
    model = DilatedTCN(d_in=D_in, channels=128, layers=12, num_classes=21, dropout=0.35).to(device)
    model.load_state_dict(torch.load(ckpt, map_location=device)); model.eval()
    mean,std = compute_fold_scaler(fold['train_ids'])
    mean_t = torch.from_numpy(mean).float().to(device); std_t = torch.from_numpy(std).float().to(device)
    vids = fold['val_ids']; t0=time.time()
    for i, sid in enumerate(vids, 1):
        sid=int(sid); outp = probs_cache/f"{sid}_ce_v3{seed_suffix}.npy"
        X = load_feat('train', sid); xb = torch.from_numpy(X).float().to(device); xb = (xb - mean_t)/(std_t+1e-6); xb = xb.unsqueeze(0)
        with torch.no_grad(), torch.amp.autocast('cuda'):
            p = model(xb)[0].softmax(dim=-1); p = apply_tta_timewarp(p, factors=(0.9,1.0,1.1))
        np.save(outp, p.cpu().numpy())
        if (i%25)==0 or i==len(vids): print(f"  [v3 fold {fi}{seed_suffix}] cached {i}/{len(vids)}", flush=True)

print('Caching v3 OOF probs ...', flush=True)
for f in folds:
    for suf in ['', '_s1']:
        if Path(f"model_ce_v3_fold{int(f['fold'])}{suf}.pth").exists():
            cache_fold_val_probs_v3(f, suf)

# Improved decoder utilities (reuse from earlier)
def avg_pool_probs(p_t_c: torch.Tensor, k:int) -> torch.Tensor:
    x = p_t_c.unsqueeze(0).transpose(1,2); y = F.avg_pool1d(x, kernel_size=k, stride=1, padding=k//2);
    return y.transpose(1,2).squeeze(0)
def duration_integral_single(p_t: torch.Tensor, k:int) -> torch.Tensor:
    k = max(1, int(k)); x = p_t.view(1,1,-1); w = torch.ones(1,1,k, device=p_t.device, dtype=p_t.dtype) / float(k);
    pad = (k-1)//2; y = F.conv1d(x, w, padding=pad).view(-1); T = p_t.shape[0]
    if y.shape[0] < T: y = F.pad(y, (0, T - y.shape[0]))
    elif y.shape[0] > T: y = y[:T]
    return y
def refine_com(p: torch.Tensor, t_star:int, w:int=5) -> float:
    T=p.shape[0]; a=max(0,t_star-w); b=min(T-1,t_star+w); idx=torch.arange(a,b+1, device=p.device, dtype=p.dtype);
    seg=p[a:b+1]; s=seg.sum()+1e-8; return float(((idx*seg).sum()/s).item())
def decode_peaks_improved(p_t_c: torch.Tensor, med_k: dict, gamma: float = 1.0, pool_k=13, temp=0.9, min_sep=2, K=3, k_delta=4):
    if temp != 1.0:
        p_t_c = (p_t_c ** (1.0/temp)); p_t_c = p_t_c/(p_t_c.sum(dim=-1, keepdim=True)+1e-8)
    p_s = avg_pool_probs(p_t_c, k=pool_k); T,C = p_s.shape
    scores = torch.zeros_like(p_s); ks=[13]*C
    for c in range(C):
        if c==0: scores[:,c]=p_s[:,c]; ks[c]=13; continue
        base_k = med_k.get(c,13); k_c = int(np.clip(round(gamma*base_k), 9, 25));
        if k_c % 2 == 0: k_c = min(25, k_c+1); ks[c]=k_c
        ks_multi = sorted(set([int(np.clip(k_c-4,9,25)), k_c, int(np.clip(k_c+4,9,25))]));
        ks_multi = [k if (k%2)==1 else min(25, k+1) for k in ks_multi]
        acc=None
        for k in ks_multi:
            di = duration_integral_single(p_s[:,c], k=k).unsqueeze(1); acc = di if acc is None else (acc + di)
        scores[:,c] = (acc/float(len(ks_multi))).squeeze(1)
    # candidates per class (best only) with COM refine
    peaks=[]
    for c in range(1,21):
        k=ks[c]; s=scores[:,c]; t_star=int(torch.argmax(s).item())
        w_com=max(5,k//3); radius=max(10,k//2); t_ref = refine_com(p_s[:,c], t_star, w=w_com)
        t_idx=int(round(max(0,min(t_ref,T-1))))
        local_mean = p_s[max(0,t_idx-radius):min(T,t_idx+radius+1), c].mean().item()
        pooled_at_ref = p_s[t_idx, c].item()
        peaks.append([c, t_ref, float(scores[t_idx,c].item()), float(local_mean), float(pooled_at_ref)])
    peaks.sort(key=lambda x: (x[1], -x[2], -x[3], -x[4]))
    last_t=-1e9
    for i in range(len(peaks)):
        if peaks[i][1] <= last_t: peaks[i][1] = last_t + min_sep
        last_t = min(peaks[i][1], float(T-1))
    return [int(c) for c,_,_,_,_ in peaks]

def compute_class_median_durations_for_ids(id_list):
    dur_by_c = {c: [] for c in range(1,21)}
    for sid in id_list:
        y = np.load(lab_tr_dir/f"{sid}.npy").astype(np.int16)
        for c in range(1,21):
            cnt = int((y==c).sum());
            if cnt>0: dur_by_c[c].append(cnt)
    med = {};
    for c in range(1,21):
        m = np.median(dur_by_c[c]) if len(dur_by_c[c])>0 else 13
        med[c] = int(np.clip(m, 9, 25))
    return med

def gamma_with_length(gamma_cv: float, T: int, med_k: dict):
    L_est = float(sum(med_k.get(c,13) for c in range(1,21)))
    if L_est <= 0: return gamma_cv
    ratio = float(T) / L_est; gamma_s = float(np.clip(ratio, 0.85, 1.15)); return float(gamma_cv * gamma_s)

def levenshtein(a,b):
    n,m=len(a),len(b)
    if n==0: return m
    if m==0: return n
    dp=list(range(m+1))
    for i in range(1,n+1):
        prev=dp[0]; dp[0]=i; ai=a[i-1]
        for j in range(1,m+1):
            tmp=dp[j]; dp[j]=min(dp[j]+1, dp[j-1]+1, prev + (0 if ai==b[j-1] else 1)); prev=tmp
    return dp[m]

# Sweep decoder on averaged v3 OOF (seed0+seed1) with small grid; select by worst-fold then mean
def load_oof_v3_avg(sid:int):
    p0 = torch.from_numpy(np.load(probs_cache/f"{sid}_ce_v3.npy")).to(device)
    p1 = torch.from_numpy(np.load(probs_cache/f"{sid}_ce_v3_s1.npy")).to(device) if (probs_cache/f"{sid}_ce_v3_s1.npy").exists() else None
    p = p0 if p1 is None else (p0 + p1) * 0.5
    return p / (p.sum(dim=-1, keepdim=True) + 1e-8)

pool_ks=[11,13,15]; temps=[0.90,0.95,1.00]; gammas=[0.90,0.95,0.975,1.00,1.025]; seps=[2,3]
print('Sweeping improved decoder on v3 averaged OOF...', flush=True)
med_cache={}; res=[]
for pool_k in pool_ks:
    for temp in temps:
        for gamma in gammas:
            for sep in seps:
                per_fold=[]
                for f in folds:
                    fi=int(f['fold'])
                    if fi not in med_cache: med_cache[fi]=compute_class_median_durations_for_ids(f['train_ids'])
                    med_k = med_cache[fi]
                    vids=f['val_ids']; tot=0; cnt=0
                    for sid in vids:
                        sid=int(sid); p = load_oof_v3_avg(sid); T = p.shape[0]
                        g_eff = gamma_with_length(gamma, T, med_k)
                        seq = decode_peaks_improved(p, med_k=med_k, gamma=g_eff, pool_k=pool_k, temp=temp, min_sep=sep, K=3, k_delta=4)
                        tot += levenshtein(seq, id2seq[sid]); cnt += 1
                    per_fold.append(tot/max(cnt,1))
                res.append((np.mean(per_fold), np.max(per_fold), {'pool_k':pool_k,'temp':temp,'gamma':gamma,'sep':sep}))
res.sort(key=lambda x: (x[1], x[0]))
print('Top v3 improved decoder (mean, worst, cfg):')
for r in res[:5]: print(r)
pd.DataFrame([{'mean':m,'worst':w, **cfg} for m,w,cfg in res]).to_csv('cv_sweep_ce_v3_6x_improved.csv', index=False)
print('Saved cv_sweep_ce_v3_6x_improved.csv', flush=True)

# Test-time inference: 6 v3 models (3 folds x 2 seeds), TTA, improved decoder, write submission
print('Building CE v3 6-model ensemble test submission...', flush=True)
test_ids = pd.read_csv('test.csv')['Id'].astype(int).tolist()
cfg_best = pd.read_csv('cv_sweep_ce_v3_6x_improved.csv').sort_values(['worst','mean']).iloc[0].to_dict() if Path('cv_sweep_ce_v3_6x_improved.csv').exists() else {'pool_k':13,'temp':0.95,'gamma':1.0,'sep':2}
pool_k=int(cfg_best['pool_k']); temp=float(cfg_best['temp']); gamma=float(cfg_best.get('gamma',1.0)); sep=int(cfg_best.get('sep',2))
med_k_all = compute_class_median_durations_for_ids(train_df['Id'].astype(int).tolist())
D_in = np.load(next(iter((feat_tr_dir).glob('*.npz'))))['X'].shape[1]

def load_fold_scalers_v3():
    sc=[]
    for fi in range(3):
        m,s = compute_fold_scaler(folds[fi]['train_ids'])
        sc.append((torch.from_numpy(m).float().to(device), torch.from_numpy(s).float().to(device)))
    return sc

scalers = load_fold_scalers_v3()
rows=[]; t0=time.time()
for i, sid in enumerate(test_ids, 1):
    X = load_feat('test', int(sid)); acc=None
    with torch.no_grad(), torch.amp.autocast('cuda'):
        for fi in range(3):
            mean_t, std_t = scalers[fi]
            for s in (0,1):
                ckpt = Path(f"model_ce_v3_fold{fi}{'_s1' if s==1 else ''}.pth")
                if not ckpt.exists(): continue
                m = DilatedTCN(d_in=D_in, channels=128, layers=12, num_classes=21, dropout=0.35).to(device)
                m.load_state_dict(torch.load(ckpt, map_location=device)); m.eval()
                xb = torch.from_numpy(X).float().to(device); xb = (xb - mean_t)/(std_t+1e-6); xb = xb.unsqueeze(0)
                p = m(xb)[0].softmax(dim=-1); p = apply_tta_timewarp(p, factors=(0.9,1.0,1.1))
                acc = p if acc is None else (acc + p); del m
    probs = acc / float(6); probs = probs/(probs.sum(dim=-1, keepdim=True)+1e-8)
    T = probs.shape[0]; g_eff = gamma_with_length(gamma, T, med_k_all)
    seq = decode_peaks_improved(probs, med_k=med_k_all, gamma=g_eff, pool_k=pool_k, temp=temp, min_sep=sep, K=3, k_delta=4)
    rows.append({'Id': int(sid), 'Sequence': ' '.join(str(x) for x in seq)})
    if (i%10)==0 or i==len(test_ids):
        print(f"  [infer CE-v3 6x] {i}/{len(test_ids)} elapsed={(time.time()-t0)/60:.1f}m", flush=True)
sub = pd.DataFrame(rows, columns=['Id','Sequence'])
assert len(sub)==95
assert all(len(s.split())==20 and len(set(s.split()))==20 and all(1<=int(t)<=20 for t in s.split()) for s in sub.Sequence), 'Submission format invalid'
sub.to_csv('submission_primary_ce_v3_6x_v2.csv', index=False); sub.to_csv('submission.csv', index=False)
print('Wrote submission_primary_ce_v3_6x_v2.csv and submission.csv; head:\n', sub.head(), flush=True)

CUDA available: True


=== Train v3 fold 0 (model_ce_v3_fold0.pth) ===


[v3 fold 0] ep1 tr=3.6552 va=4.8023 elapsed=20.4s total=0.3m


[v3 fold 0] ep2 tr=2.8409 va=4.5217 elapsed=19.7s total=0.7m


[v3 fold 0] ep3 tr=2.3003 va=4.2815 elapsed=20.7s total=1.0m


[v3 fold 0] ep4 tr=1.9365 va=4.0719 elapsed=20.3s total=1.4m


[v3 fold 0] ep5 tr=1.7407 va=3.8802 elapsed=19.0s total=1.7m


[v3 fold 0] ep6 tr=1.5912 va=3.6986 elapsed=19.4s total=2.0m


[v3 fold 0] ep7 tr=1.4666 va=3.5324 elapsed=18.3s total=2.3m


[v3 fold 0] ep8 tr=1.3781 va=3.3824 elapsed=18.5s total=2.6m


[v3 fold 0] ep9 tr=1.2774 va=3.2427 elapsed=19.1s total=2.9m


[v3 fold 0] ep10 tr=1.2287 va=3.1076 elapsed=19.0s total=3.3m


[v3 fold 0] ep11 tr=1.1766 va=2.9764 elapsed=18.0s total=3.6m


[v3 fold 0] ep12 tr=1.1075 va=2.8513 elapsed=18.2s total=3.9m


[v3 fold 0] ep13 tr=1.0555 va=2.7375 elapsed=18.3s total=4.2m


[v3 fold 0] ep14 tr=1.0100 va=2.6286 elapsed=18.1s total=4.5m


[v3 fold 0] ep15 tr=0.9431 va=2.5365 elapsed=18.2s total=4.8m


[v3 fold 0] ep16 tr=0.9316 va=2.4528 elapsed=18.3s total=5.1m


[v3 fold 0] ep17 tr=0.9035 va=2.3782 elapsed=18.5s total=5.4m


[v3 fold 0] ep18 tr=0.8474 va=2.3139 elapsed=18.2s total=5.7m


[v3 fold 0] ep19 tr=0.8160 va=2.2597 elapsed=18.3s total=6.0m


[v3 fold 0] ep20 tr=0.7827 va=2.2156 elapsed=18.4s total=6.3m


[v3 fold 0] ep21 tr=0.7729 va=2.1780 elapsed=17.9s total=6.6m


[v3 fold 0] ep22 tr=0.7355 va=2.1474 elapsed=17.9s total=6.9m


[v3 fold 0] ep23 tr=0.6992 va=2.1226 elapsed=18.0s total=7.2m


[v3 fold 0] ep24 tr=0.6780 va=2.1021 elapsed=17.8s total=7.5m


[v3 fold 0] ep25 tr=0.6611 va=2.0843 elapsed=18.2s total=7.8m


[v3 fold 0] ep26 tr=0.6434 va=2.0692 elapsed=17.8s total=8.1m


[v3 fold 0] ep27 tr=0.6263 va=2.0557 elapsed=18.0s total=8.4m


[v3 fold 0] ep28 tr=0.6243 va=2.0444 elapsed=17.6s total=8.7m


[v3 fold 0] ep29 tr=0.6062 va=2.0351 elapsed=17.5s total=9.0m


[v3 fold 0] ep30 tr=0.6009 va=2.0265 elapsed=17.9s total=9.3m


[v3 fold 0] ep31 tr=0.5964 va=2.0194 elapsed=17.5s total=9.6m


[v3 fold 0] ep32 tr=0.5900 va=2.0127 elapsed=17.8s total=9.9m


[v3 fold 0] ep33 tr=0.5894 va=2.0083 elapsed=17.8s total=10.2m


[v3 fold 0] ep34 tr=0.5886 va=2.0035 elapsed=18.0s total=10.5m


[v3 fold 0] ep35 tr=0.5787 va=1.9997 elapsed=17.9s total=10.8m


Fold 0 v3 done. Best CE=1.9997 -> model_ce_v3_fold0.pth


=== Train v3 fold 0 (model_ce_v3_fold0_s1.pth) ===


[v3 fold 0] ep1 tr=3.9658 va=5.5504 elapsed=17.8s total=0.3m


[v3 fold 0] ep2 tr=2.8496 va=5.0544 elapsed=17.5s total=0.6m


[v3 fold 0] ep3 tr=2.3604 va=4.6543 elapsed=17.4s total=0.9m


[v3 fold 0] ep4 tr=1.9452 va=4.3339 elapsed=17.6s total=1.2m


[v3 fold 0] ep5 tr=1.6879 va=4.0681 elapsed=18.1s total=1.5m


[v3 fold 0] ep6 tr=1.5766 va=3.8328 elapsed=17.9s total=1.8m


[v3 fold 0] ep7 tr=1.4535 va=3.6257 elapsed=18.1s total=2.1m


[v3 fold 0] ep8 tr=1.3592 va=3.4524 elapsed=17.6s total=2.4m


[v3 fold 0] ep9 tr=1.3022 va=3.2881 elapsed=18.0s total=2.7m


[v3 fold 0] ep10 tr=1.2247 va=3.1398 elapsed=17.5s total=3.0m


[v3 fold 0] ep11 tr=1.1424 va=2.9964 elapsed=18.0s total=3.3m


[v3 fold 0] ep12 tr=1.1067 va=2.8586 elapsed=17.7s total=3.6m


[v3 fold 0] ep13 tr=1.0519 va=2.7272 elapsed=17.6s total=3.9m


[v3 fold 0] ep14 tr=0.9855 va=2.6011 elapsed=17.8s total=4.2m


[v3 fold 0] ep15 tr=0.9540 va=2.4812 elapsed=18.0s total=4.5m


[v3 fold 0] ep16 tr=0.9198 va=2.3735 elapsed=17.3s total=4.8m


[v3 fold 0] ep17 tr=0.8770 va=2.2805 elapsed=17.7s total=5.1m


[v3 fold 0] ep18 tr=0.8321 va=2.2001 elapsed=17.3s total=5.4m


[v3 fold 0] ep19 tr=0.8252 va=2.1313 elapsed=17.6s total=5.7m


[v3 fold 0] ep20 tr=0.7812 va=2.0742 elapsed=17.5s total=5.9m


[v3 fold 0] ep21 tr=0.7418 va=2.0304 elapsed=17.4s total=6.2m


[v3 fold 0] ep22 tr=0.7243 va=1.9933 elapsed=17.3s total=6.5m


[v3 fold 0] ep23 tr=0.6965 va=1.9627 elapsed=17.2s total=6.8m


[v3 fold 0] ep24 tr=0.6727 va=1.9388 elapsed=17.2s total=7.1m


[v3 fold 0] ep25 tr=0.6635 va=1.9196 elapsed=17.7s total=7.4m


[v3 fold 0] ep26 tr=0.6479 va=1.9047 elapsed=17.3s total=7.7m


[v3 fold 0] ep27 tr=0.6304 va=1.8933 elapsed=17.6s total=8.0m


[v3 fold 0] ep28 tr=0.6132 va=1.8842 elapsed=17.4s total=8.3m


[v3 fold 0] ep29 tr=0.6076 va=1.8782 elapsed=17.3s total=8.6m


[v3 fold 0] ep30 tr=0.5927 va=1.8737 elapsed=17.4s total=8.9m


[v3 fold 0] ep31 tr=0.5905 va=1.8705 elapsed=17.3s total=9.2m


[v3 fold 0] ep32 tr=0.5876 va=1.8687 elapsed=17.4s total=9.4m


[v3 fold 0] ep33 tr=0.5855 va=1.8673 elapsed=17.6s total=9.7m


[v3 fold 0] ep34 tr=0.5800 va=1.8672 elapsed=17.3s total=10.0m


[v3 fold 0] ep35 tr=0.5780 va=1.8671 elapsed=17.8s total=10.3m


Fold 0 v3 done. Best CE=1.8672 -> model_ce_v3_fold0_s1.pth


=== Train v3 fold 1 (model_ce_v3_fold1.pth) ===


[v3 fold 1] ep1 tr=3.5768 va=4.0807 elapsed=19.1s total=0.3m


[v3 fold 1] ep2 tr=2.8262 va=3.9053 elapsed=19.0s total=0.6m


[v3 fold 1] ep3 tr=2.3220 va=3.7453 elapsed=19.2s total=1.0m


[v3 fold 1] ep4 tr=1.9018 va=3.6062 elapsed=18.6s total=1.3m


[v3 fold 1] ep5 tr=1.7112 va=3.4706 elapsed=19.4s total=1.6m


[v3 fold 1] ep6 tr=1.5473 va=3.3417 elapsed=18.6s total=1.9m


[v3 fold 1] ep7 tr=1.4130 va=3.2170 elapsed=18.7s total=2.2m


[v3 fold 1] ep8 tr=1.2879 va=3.0935 elapsed=19.1s total=2.5m


[v3 fold 1] ep9 tr=1.2328 va=2.9717 elapsed=19.1s total=2.9m


[v3 fold 1] ep10 tr=1.1917 va=2.8442 elapsed=18.8s total=3.2m


[v3 fold 1] ep11 tr=1.0971 va=2.7107 elapsed=19.3s total=3.5m


[v3 fold 1] ep12 tr=1.0251 va=2.5779 elapsed=19.1s total=3.8m


[v3 fold 1] ep13 tr=0.9920 va=2.4476 elapsed=19.3s total=4.1m


[v3 fold 1] ep14 tr=0.9527 va=2.3226 elapsed=19.3s total=4.5m


[v3 fold 1] ep15 tr=0.8992 va=2.2084 elapsed=18.8s total=4.8m


[v3 fold 1] ep16 tr=0.8735 va=2.1065 elapsed=18.6s total=5.1m


[v3 fold 1] ep17 tr=0.8369 va=2.0201 elapsed=18.4s total=5.4m


[v3 fold 1] ep18 tr=0.8266 va=1.9449 elapsed=18.7s total=5.7m


[v3 fold 1] ep19 tr=0.7638 va=1.8827 elapsed=18.2s total=6.0m


[v3 fold 1] ep20 tr=0.7414 va=1.8311 elapsed=18.5s total=6.3m


[v3 fold 1] ep21 tr=0.7003 va=1.7885 elapsed=18.3s total=6.6m


[v3 fold 1] ep22 tr=0.6845 va=1.7536 elapsed=18.3s total=7.0m


[v3 fold 1] ep23 tr=0.6621 va=1.7247 elapsed=19.0s total=7.3m


[v3 fold 1] ep24 tr=0.6437 va=1.7006 elapsed=18.5s total=7.6m


[v3 fold 1] ep25 tr=0.6302 va=1.6803 elapsed=18.4s total=7.9m


[v3 fold 1] ep26 tr=0.6123 va=1.6635 elapsed=18.7s total=8.2m


[v3 fold 1] ep27 tr=0.6026 va=1.6496 elapsed=18.6s total=8.5m


[v3 fold 1] ep28 tr=0.5952 va=1.6375 elapsed=18.5s total=8.8m


[v3 fold 1] ep29 tr=0.5826 va=1.6278 elapsed=18.2s total=9.1m


[v3 fold 1] ep30 tr=0.5748 va=1.6200 elapsed=18.5s total=9.4m


[v3 fold 1] ep31 tr=0.5716 va=1.6134 elapsed=18.0s total=9.7m


[v3 fold 1] ep32 tr=0.5687 va=1.6081 elapsed=18.0s total=10.1m


[v3 fold 1] ep33 tr=0.5629 va=1.6038 elapsed=18.3s total=10.4m


[v3 fold 1] ep34 tr=0.5629 va=1.6006 elapsed=18.6s total=10.7m


[v3 fold 1] ep35 tr=0.5631 va=1.5981 elapsed=17.8s total=11.0m


Fold 1 v3 done. Best CE=1.5981 -> model_ce_v3_fold1.pth


=== Train v3 fold 1 (model_ce_v3_fold1_s1.pth) ===


[v3 fold 1] ep1 tr=3.8627 va=4.9465 elapsed=17.9s total=0.3m


[v3 fold 1] ep2 tr=2.8492 va=4.5739 elapsed=18.7s total=0.6m


[v3 fold 1] ep3 tr=2.3798 va=4.2638 elapsed=18.5s total=0.9m


[v3 fold 1] ep4 tr=1.9935 va=4.0199 elapsed=18.7s total=1.2m


[v3 fold 1] ep5 tr=1.7532 va=3.8139 elapsed=18.4s total=1.5m


[v3 fold 1] ep6 tr=1.6284 va=3.6274 elapsed=18.4s total=1.9m


[v3 fold 1] ep7 tr=1.4479 va=3.4602 elapsed=17.9s total=2.2m


[v3 fold 1] ep8 tr=1.3500 va=3.3103 elapsed=18.5s total=2.5m


[v3 fold 1] ep9 tr=1.2536 va=3.1731 elapsed=18.3s total=2.8m


[v3 fold 1] ep10 tr=1.1745 va=3.0452 elapsed=18.3s total=3.1m


[v3 fold 1] ep11 tr=1.1198 va=2.9211 elapsed=18.3s total=3.4m


[v3 fold 1] ep12 tr=1.0487 va=2.7978 elapsed=18.1s total=3.7m


[v3 fold 1] ep13 tr=1.0060 va=2.6733 elapsed=18.3s total=4.0m


[v3 fold 1] ep14 tr=0.9648 va=2.5509 elapsed=18.3s total=4.3m


[v3 fold 1] ep15 tr=0.9313 va=2.4322 elapsed=18.6s total=4.6m


[v3 fold 1] ep16 tr=0.8799 va=2.3227 elapsed=18.1s total=4.9m


[v3 fold 1] ep17 tr=0.8447 va=2.2229 elapsed=18.8s total=5.2m


[v3 fold 1] ep18 tr=0.8396 va=2.1347 elapsed=18.0s total=5.5m


[v3 fold 1] ep19 tr=0.7859 va=2.0596 elapsed=18.6s total=5.9m


[v3 fold 1] ep20 tr=0.7482 va=1.9941 elapsed=18.3s total=6.2m


[v3 fold 1] ep21 tr=0.7301 va=1.9399 elapsed=18.2s total=6.5m


[v3 fold 1] ep22 tr=0.7018 va=1.8931 elapsed=18.9s total=6.8m


[v3 fold 1] ep23 tr=0.6816 va=1.8534 elapsed=18.0s total=7.1m


[v3 fold 1] ep24 tr=0.6576 va=1.8197 elapsed=18.3s total=7.4m


[v3 fold 1] ep25 tr=0.6414 va=1.7908 elapsed=18.7s total=7.7m


[v3 fold 1] ep26 tr=0.6222 va=1.7665 elapsed=18.4s total=8.0m


[v3 fold 1] ep27 tr=0.6104 va=1.7459 elapsed=18.4s total=8.3m


[v3 fold 1] ep28 tr=0.5957 va=1.7288 elapsed=18.8s total=8.6m


[v3 fold 1] ep29 tr=0.5903 va=1.7139 elapsed=18.6s total=9.0m


[v3 fold 1] ep30 tr=0.5795 va=1.7013 elapsed=18.0s total=9.3m


[v3 fold 1] ep31 tr=0.5741 va=1.6905 elapsed=18.4s total=9.6m


[v3 fold 1] ep32 tr=0.5763 va=1.6815 elapsed=18.6s total=9.9m


[v3 fold 1] ep33 tr=0.5722 va=1.6738 elapsed=18.6s total=10.2m


[v3 fold 1] ep34 tr=0.5715 va=1.6671 elapsed=18.1s total=10.5m


[v3 fold 1] ep35 tr=0.5674 va=1.6616 elapsed=18.4s total=10.8m


Fold 1 v3 done. Best CE=1.6616 -> model_ce_v3_fold1_s1.pth


=== Train v3 fold 2 (model_ce_v3_fold2.pth) ===


[v3 fold 2] ep1 tr=3.8128 va=5.1485 elapsed=19.8s total=0.3m


[v3 fold 2] ep2 tr=2.6740 va=4.7698 elapsed=18.1s total=0.6m


[v3 fold 2] ep3 tr=2.0562 va=4.4569 elapsed=19.0s total=1.0m


[v3 fold 2] ep4 tr=1.6664 va=4.2153 elapsed=18.7s total=1.3m


[v3 fold 2] ep5 tr=1.5031 va=4.0110 elapsed=18.3s total=1.6m


[v3 fold 2] ep6 tr=1.4180 va=3.8303 elapsed=18.6s total=1.9m


[v3 fold 2] ep7 tr=1.3343 va=3.6682 elapsed=18.8s total=2.2m


[v3 fold 2] ep8 tr=1.2402 va=3.5284 elapsed=18.0s total=2.5m


[v3 fold 2] ep9 tr=1.1112 va=3.4096 elapsed=18.2s total=2.8m


[v3 fold 2] ep10 tr=1.0504 va=3.3013 elapsed=18.4s total=3.1m


[v3 fold 2] ep11 tr=1.0322 va=3.1948 elapsed=18.5s total=3.4m


[v3 fold 2] ep12 tr=0.9849 va=3.0900 elapsed=18.4s total=3.7m


[v3 fold 2] ep13 tr=0.9735 va=2.9892 elapsed=18.5s total=4.0m


[v3 fold 2] ep14 tr=0.9045 va=2.8923 elapsed=18.3s total=4.4m


[v3 fold 2] ep15 tr=0.8852 va=2.8047 elapsed=18.3s total=4.7m


[v3 fold 2] ep16 tr=0.8587 va=2.7231 elapsed=18.1s total=5.0m


[v3 fold 2] ep17 tr=0.8362 va=2.6514 elapsed=18.1s total=5.3m


[v3 fold 2] ep18 tr=0.7948 va=2.5885 elapsed=18.6s total=5.6m


[v3 fold 2] ep19 tr=0.7645 va=2.5365 elapsed=18.6s total=5.9m


[v3 fold 2] ep20 tr=0.7635 va=2.4914 elapsed=18.3s total=6.2m


[v3 fold 2] ep21 tr=0.7129 va=2.4554 elapsed=18.5s total=6.5m


[v3 fold 2] ep22 tr=0.6944 va=2.4255 elapsed=18.3s total=6.8m


[v3 fold 2] ep23 tr=0.6831 va=2.4015 elapsed=18.4s total=7.1m


[v3 fold 2] ep24 tr=0.6708 va=2.3809 elapsed=18.8s total=7.4m


[v3 fold 2] ep25 tr=0.6487 va=2.3633 elapsed=18.2s total=7.7m


[v3 fold 2] ep26 tr=0.6352 va=2.3483 elapsed=18.0s total=8.1m


[v3 fold 2] ep27 tr=0.6247 va=2.3357 elapsed=18.7s total=8.4m


[v3 fold 2] ep28 tr=0.6184 va=2.3273 elapsed=17.9s total=8.7m


[v3 fold 2] ep29 tr=0.6118 va=2.3182 elapsed=18.5s total=9.0m


[v3 fold 2] ep30 tr=0.6060 va=2.3129 elapsed=18.4s total=9.3m


[v3 fold 2] ep31 tr=0.6002 va=2.3071 elapsed=18.6s total=9.6m


[v3 fold 2] ep32 tr=0.5992 va=2.3026 elapsed=18.0s total=9.9m


[v3 fold 2] ep33 tr=0.5955 va=2.2993 elapsed=18.1s total=10.2m


[v3 fold 2] ep34 tr=0.5958 va=2.2972 elapsed=18.3s total=10.5m


[v3 fold 2] ep35 tr=0.5957 va=2.2948 elapsed=18.2s total=10.8m


Fold 2 v3 done. Best CE=2.2948 -> model_ce_v3_fold2.pth


=== Train v3 fold 2 (model_ce_v3_fold2_s1.pth) ===


[v3 fold 2] ep1 tr=3.7127 va=4.8984 elapsed=18.1s total=0.3m


[v3 fold 2] ep2 tr=2.5949 va=4.5837 elapsed=18.4s total=0.6m


[v3 fold 2] ep3 tr=1.9953 va=4.3230 elapsed=18.7s total=0.9m


[v3 fold 2] ep4 tr=1.6737 va=4.1047 elapsed=18.2s total=1.2m


[v3 fold 2] ep5 tr=1.5184 va=3.9213 elapsed=18.2s total=1.5m


[v3 fold 2] ep6 tr=1.3975 va=3.7586 elapsed=18.3s total=1.8m


[v3 fold 2] ep7 tr=1.2728 va=3.6187 elapsed=18.3s total=2.2m


[v3 fold 2] ep8 tr=1.2083 va=3.4946 elapsed=17.9s total=2.5m


[v3 fold 2] ep9 tr=1.1289 va=3.3798 elapsed=18.3s total=2.8m


[v3 fold 2] ep10 tr=1.0963 va=3.2703 elapsed=18.4s total=3.1m


[v3 fold 2] ep11 tr=1.0303 va=3.1649 elapsed=18.7s total=3.4m


[v3 fold 2] ep12 tr=0.9859 va=3.0582 elapsed=17.9s total=3.7m


[v3 fold 2] ep13 tr=0.9470 va=2.9535 elapsed=17.7s total=4.0m


[v3 fold 2] ep14 tr=0.8960 va=2.8506 elapsed=18.5s total=4.3m


[v3 fold 2] ep15 tr=0.8631 va=2.7482 elapsed=18.2s total=4.6m


[v3 fold 2] ep16 tr=0.8252 va=2.6512 elapsed=18.2s total=4.9m


[v3 fold 2] ep17 tr=0.8134 va=2.5612 elapsed=18.5s total=5.2m


[v3 fold 2] ep18 tr=0.7836 va=2.4780 elapsed=18.3s total=5.5m


[v3 fold 2] ep19 tr=0.7546 va=2.4059 elapsed=17.9s total=5.8m


[v3 fold 2] ep20 tr=0.7325 va=2.3446 elapsed=18.2s total=6.1m


[v3 fold 2] ep21 tr=0.7155 va=2.2908 elapsed=18.4s total=6.4m


[v3 fold 2] ep22 tr=0.6939 va=2.2483 elapsed=18.0s total=6.7m


[v3 fold 2] ep23 tr=0.6718 va=2.2114 elapsed=18.4s total=7.0m


[v3 fold 2] ep24 tr=0.6609 va=2.1817 elapsed=18.1s total=7.4m


[v3 fold 2] ep25 tr=0.6472 va=2.1564 elapsed=18.4s total=7.7m


[v3 fold 2] ep26 tr=0.6344 va=2.1379 elapsed=18.2s total=8.0m


[v3 fold 2] ep27 tr=0.6260 va=2.1235 elapsed=18.0s total=8.3m


[v3 fold 2] ep28 tr=0.6162 va=2.1129 elapsed=18.2s total=8.6m


[v3 fold 2] ep29 tr=0.6109 va=2.1040 elapsed=18.5s total=8.9m


[v3 fold 2] ep30 tr=0.6054 va=2.0974 elapsed=18.0s total=9.2m


[v3 fold 2] ep31 tr=0.6022 va=2.0922 elapsed=18.4s total=9.5m


[v3 fold 2] ep32 tr=0.5950 va=2.0890 elapsed=18.4s total=9.8m


[v3 fold 2] ep33 tr=0.5934 va=2.0868 elapsed=18.5s total=10.1m


[v3 fold 2] ep34 tr=0.5938 va=2.0862 elapsed=18.0s total=10.4m


[v3 fold 2] ep35 tr=0.5898 va=2.0858 elapsed=18.3s total=10.7m


Fold 2 v3 done. Best CE=2.0858 -> model_ce_v3_fold2_s1.pth


Caching v3 OOF probs ...


  model.load_state_dict(torch.load(ckpt, map_location=device)); model.eval()


  [v3 fold 0] cached 25/98


  [v3 fold 0] cached 50/98


  [v3 fold 0] cached 75/98


  [v3 fold 0] cached 98/98


  [v3 fold 0_s1] cached 25/98


  [v3 fold 0_s1] cached 50/98


  [v3 fold 0_s1] cached 75/98


  [v3 fold 0_s1] cached 98/98


  [v3 fold 1] cached 25/99


  [v3 fold 1] cached 50/99


  [v3 fold 1] cached 75/99


  [v3 fold 1] cached 99/99


  [v3 fold 1_s1] cached 25/99


  [v3 fold 1_s1] cached 50/99


  [v3 fold 1_s1] cached 75/99


  [v3 fold 1_s1] cached 99/99


  [v3 fold 2] cached 25/100


  [v3 fold 2] cached 50/100


  [v3 fold 2] cached 75/100


  [v3 fold 2] cached 100/100


  [v3 fold 2_s1] cached 25/100


  [v3 fold 2_s1] cached 50/100


  [v3 fold 2_s1] cached 75/100


  [v3 fold 2_s1] cached 100/100


Sweeping improved decoder on v3 averaged OOF...


Top v3 improved decoder (mean, worst, cfg):
(4.916285301999587, 5.7, {'pool_k': 15, 'temp': 1.0, 'gamma': 0.9, 'sep': 2})
(4.916285301999587, 5.7, {'pool_k': 15, 'temp': 1.0, 'gamma': 0.9, 'sep': 3})
(4.916285301999587, 5.7, {'pool_k': 15, 'temp': 1.0, 'gamma': 0.95, 'sep': 2})
(4.916285301999587, 5.7, {'pool_k': 15, 'temp': 1.0, 'gamma': 0.95, 'sep': 3})
(4.916285301999587, 5.7, {'pool_k': 15, 'temp': 1.0, 'gamma': 0.975, 'sep': 2})
Saved cv_sweep_ce_v3_6x_improved.csv


Building CE v3 6-model ensemble test submission...


  m.load_state_dict(torch.load(ckpt, map_location=device)); m.eval()


  [infer CE-v3 6x] 10/95 elapsed=0.0m


  [infer CE-v3 6x] 20/95 elapsed=0.1m


  [infer CE-v3 6x] 30/95 elapsed=0.1m


  [infer CE-v3 6x] 40/95 elapsed=0.1m


  [infer CE-v3 6x] 50/95 elapsed=0.2m


  [infer CE-v3 6x] 60/95 elapsed=0.2m


  [infer CE-v3 6x] 70/95 elapsed=0.2m


  [infer CE-v3 6x] 80/95 elapsed=0.3m


  [infer CE-v3 6x] 90/95 elapsed=0.3m


  [infer CE-v3 6x] 95/95 elapsed=0.3m


Wrote submission_primary_ce_v3_6x_v2.csv and submission.csv; head:
     Id                                           Sequence
0  300  5 9 1 2 18 3 8 4 20 13 12 15 14 11 6 16 19 7 1...
1  301  10 12 1 5 4 20 6 2 11 15 13 19 7 9 8 18 14 3 1...
2  302  1 17 16 12 5 19 7 13 20 18 11 3 4 6 15 8 14 10...
3  303  13 4 12 10 15 5 19 7 20 17 11 16 8 18 9 1 3 6 ...
4  304  8 1 12 14 18 13 9 7 2 11 3 20 19 15 5 10 6 17 ...


In [22]:
# P3b: Blend CE-6x (features3d_v2) with CE-v3-6x (features3d_v3) at OOF, pick best, then build blended test submission
import os, json, time
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
probs_cache = Path('probs_cache')
lab_tr_dir = Path('labels3d_v2')/'train'
folds = json.load(open('folds_archive_cv.json','r'))
train_df = pd.read_csv('training.csv')
id2seq = {int(r.Id): [int(x) for x in str(r.Sequence).strip().split()] for _, r in train_df.iterrows()}

# Load averaged OOF for v2 (ce_new) and v3 (ce_v3)
def load_oof_v2_avg(sid:int):
    p0 = torch.from_numpy(np.load(probs_cache/f"{sid}_ce_new.npy")).to(device)
    p1p = probs_cache/f"{sid}_ce_new_s1.npy"
    if p1p.exists():
        p1 = torch.from_numpy(np.load(p1p)).to(device); p = (p0 + p1) * 0.5
    else:
        p = p0
    return p / (p.sum(dim=-1, keepdim=True) + 1e-8)

def load_oof_v3_avg(sid:int):
    p0p = probs_cache/f"{sid}_ce_v3.npy"
    assert p0p.exists(), f"Missing v3 OOF for {sid}: {p0p}"
    p0 = torch.from_numpy(np.load(p0p)).to(device)
    p1p = probs_cache/f"{sid}_ce_v3_s1.npy"
    if p1p.exists():
        p1 = torch.from_numpy(np.load(p1p)).to(device); p = (p0 + p1) * 0.5
    else:
        p = p0
    return p / (p.sum(dim=-1, keepdim=True) + 1e-8)

def blend_probs_linear(p2: torch.Tensor, p3: torch.Tensor, w: float):
    q = w * p2 + (1.0 - w) * p3
    return q / (q.sum(dim=-1, keepdim=True) + 1e-8)

# Decoder helpers (reuse improved peak-time decoder with gamma length scaling)
def avg_pool_probs(p_t_c: torch.Tensor, k:int) -> torch.Tensor:
    x = p_t_c.unsqueeze(0).transpose(1,2); y = F.avg_pool1d(x, kernel_size=k, stride=1, padding=k//2);
    return y.transpose(1,2).squeeze(0)
def duration_integral_single(p_t: torch.Tensor, k:int) -> torch.Tensor:
    k = max(1, int(k)); x = p_t.view(1,1,-1); w = torch.ones(1,1,k, device=p_t.device, dtype=p_t.dtype) / float(k);
    pad = (k-1)//2; y = F.conv1d(x, w, padding=pad).view(-1); T = p_t.shape[0]
    if y.shape[0] < T: y = F.pad(y, (0, T - y.shape[0]))
    elif y.shape[0] > T: y = y[:T]
    return y
def refine_com(p: torch.Tensor, t_star:int, w:int=5) -> float:
    T=p.shape[0]; a=max(0,t_star-w); b=min(T-1,t_star+w); idx=torch.arange(a,b+1, device=p.device, dtype=p.dtype);
    seg=p[a:b+1]; s=seg.sum()+1e-8; return float(((idx*seg).sum()/s).item())
def decode_peaks_improved(p_t_c: torch.Tensor, med_k: dict, gamma: float = 1.0, pool_k=13, temp=0.9, min_sep=2, K=3, k_delta=4):
    if temp != 1.0:
        p_t_c = (p_t_c ** (1.0/temp)); p_t_c = p_t_c/(p_t_c.sum(dim=-1, keepdim=True)+1e-8)
    p_s = avg_pool_probs(p_t_c, k=pool_k); T,C = p_s.shape
    scores = torch.zeros_like(p_s); ks=[13]*C
    for c in range(C):
        if c==0: scores[:,c]=p_s[:,c]; ks[c]=13; continue
        base_k = med_k.get(c,13); k_c = int(np.clip(round(gamma*base_k), 9, 25));
        if k_c % 2 == 0: k_c = min(25, k_c+1); ks[c]=k_c
        ks_multi = sorted(set([int(np.clip(k_c-4,9,25)), k_c, int(np.clip(k_c+4,9,25))]));
        ks_multi = [k if (k%2)==1 else min(25, k+1) for k in ks_multi]
        acc=None
        for k in ks_multi:
            di = duration_integral_single(p_s[:,c], k=k).unsqueeze(1); acc = di if acc is None else (acc + di)
        scores[:,c] = (acc/float(len(ks_multi))).squeeze(1)
    peaks=[]
    for c in range(1,21):
        k=ks[c]; s=scores[:,c]; t_star=int(torch.argmax(s).item())
        w_com=max(5,k//3); radius=max(10,k//2); t_ref = refine_com(p_s[:,c], t_star, w=w_com)
        t_idx=int(round(max(0,min(t_ref,T-1))))
        local_mean = p_s[max(0,t_idx-radius):min(T,t_idx+radius+1), c].mean().item()
        pooled_at_ref = p_s[t_idx, c].item()
        peaks.append([c, t_ref, float(scores[t_idx,c].item()), float(local_mean), float(pooled_at_ref)])
    peaks.sort(key=lambda x: (x[1], -x[2], -x[3], -x[4]))
    last_t=-1e9
    for i in range(len(peaks)):
        if peaks[i][1] <= last_t: peaks[i][1] = last_t + min_sep
        last_t = min(peaks[i][1], float(T-1))
    return [int(c) for c,_,_,_,_ in peaks]
def compute_class_median_durations_for_ids(id_list):
    dur_by_c = {c: [] for c in range(1,21)}
    for sid in id_list:
        y = np.load(lab_tr_dir/f"{sid}.npy").astype(np.int16)
        for c in range(1,21):
            cnt = int((y==c).sum());
            if cnt>0: dur_by_c[c].append(cnt)
    med = {};
    for c in range(1,21):
        m = np.median(dur_by_c[c]) if len(dur_by_c[c])>0 else 13
        med[c] = int(np.clip(m, 9, 25))
    return med
def gamma_with_length(gamma_cv: float, T: int, med_k: dict):
    L_est = float(sum(med_k.get(c,13) for c in range(1,21)))
    if L_est <= 0: return gamma_cv
    ratio = float(T) / L_est; gamma_s = float(np.clip(ratio, 0.85, 1.15)); return float(gamma_cv * gamma_s)
def levenshtein(a,b):
    n,m=len(a),len(b)
    if n==0: return m
    if m==0: return n
    dp=list(range(m+1))
    for i in range(1,n+1):
        prev=dp[0]; dp[0]=i; ai=a[i-1]
        for j in range(1,m+1):
            tmp=dp[j]; dp[j]=min(dp[j]+1, dp[j-1]+1, prev + (0 if ai==b[j-1] else 1)); prev=tmp
    return dp[m]

# OOF sweep: small grid over weights and decoder params; select by worst-fold then mean
w_list=[0.3, 0.5, 0.7]
pool_ks=[11,15]; temps=[0.90,1.00]; gammas=[0.90,0.95]; seps=[2,3]
print('Sweeping v2-v3 OOF blend...', flush=True)
med_cache={}; res=[]; t0=time.time(); cfg_idx=0; total=len(w_list)*len(pool_ks)*len(temps)*len(gammas)*len(seps)
for w in w_list:
    for pool_k in pool_ks:
        for temp in temps:
            for gamma in gammas:
                for sep in seps:
                    cfg_idx += 1
                    per_fold=[]
                    for f in folds:
                        fi = int(f['fold'])
                        if fi not in med_cache: med_cache[fi]=compute_class_median_durations_for_ids(f['train_ids'])
                        med_k = med_cache[fi]
                        vids = f['val_ids']; tot=0; cnt=0
                        for sid in vids:
                            sid=int(sid); p2 = load_oof_v2_avg(sid); p3 = load_oof_v3_avg(sid); p = blend_probs_linear(p2, p3, float(w))
                            T = p.shape[0]; g_eff = gamma_with_length(gamma, T, med_k)
                            seq = decode_peaks_improved(p, med_k=med_k, gamma=g_eff, pool_k=pool_k, temp=temp, min_sep=sep, K=3, k_delta=4)
                            tot += levenshtein(seq, id2seq[sid]); cnt += 1
                        per_fold.append(tot/max(cnt,1))
                    res.append((np.mean(per_fold), np.max(per_fold), {'w':w,'pool_k':pool_k,'temp':temp,'gamma':gamma,'sep':sep}))
                    if (cfg_idx % 8)==0 or cfg_idx==total:
                        print(f"  [sweep v2v3] cfg {cfg_idx}/{total} elapsed={(time.time()-t0)/60:.1f}m", flush=True)
res.sort(key=lambda x: (x[1], x[0]))
print('Top v2-v3 blend OOF (mean,worst,cfg):')
for r in res[:5]: print(r)
pd.DataFrame([{'mean':m,'worst':wst, **cfg} for m,wst,cfg in res]).to_csv('cv_sweep_ce_v2v3_blend.csv', index=False)
print('Saved cv_sweep_ce_v2v3_blend.csv', flush=True)

# Test-time blended inference: load v2 CE-6x and v3 CE-6x models lazily per sample, combine probs with chosen w, decode
feat_v2_tr = Path('features3d_v2')/'train'; feat_v2_te = Path('features3d_v2')/'test'
feat_v3_tr = Path('features3d_v3')/'train'; feat_v3_te = Path('features3d_v3')/'test'

class DilatedResBlock(nn.Module):
    def __init__(self, ch, dilation, drop=0.35, groups=8, k=3):
        super().__init__(); self.conv1 = nn.Conv1d(ch, ch, k, padding=dilation, dilation=dilation)
        self.gn1 = nn.GroupNorm(groups, ch); self.drop = nn.Dropout(drop); self.conv2 = nn.Conv1d(ch, ch, 1); self.gn2 = nn.GroupNorm(groups, ch)
    def forward(self, x):
        h = self.conv1(x); h = self.gn1(h); h = F.relu(h, inplace=True); h = self.drop(h); h = self.conv2(h); h = self.gn2(h); h = F.relu(h, inplace=True); return x + h
class DilatedTCN(nn.Module):
    def __init__(self, d_in, channels=128, layers=12, num_classes=21, dropout=0.35):
        super().__init__(); self.inp = nn.Conv1d(d_in, channels, 1); blocks=[]; dil=1
        for _ in range(layers): blocks.append(DilatedResBlock(channels, dil, drop=dropout, groups=8, k=3)); dil = min(dil*2, 512)
        self.blocks = nn.ModuleList(blocks); self.head = nn.Conv1d(channels, num_classes, 1)
    def forward(self, x_b_t_d):
        x = x_b_t_d.transpose(1,2); h = self.inp(x);
        for b in self.blocks: h = b(h); out = self.head(h); return out.transpose(1,2)

def compute_fold_scaler_from_dir(id_list, feat_dir: Path):
    n=0; mean=None; M2=None
    for sid in id_list:
        X = np.load(feat_dir/f"{int(sid)}.npz")['X'].astype(np.float32); n_i = X.shape[0]
        if mean is None: mean = X.mean(axis=0); M2 = ((X - mean)**2).sum(axis=0); n = n_i
        else:
            mean_i = X.mean(axis=0); n_new = n + n_i; delta = mean_i - mean
            mean = mean + delta * (n_i / max(1, n_new))
            M2 = M2 + ((X - mean_i)**2).sum(axis=0) + (delta**2) * (n * n_i / max(1, n_new)); n = n_new
    var = M2 / max(1, (n - 1)); std = np.sqrt(np.clip(var, 1e-8, None))
    return mean.astype(np.float32), std.astype(np.float32)

def apply_tta_timewarp(p_t_c: torch.Tensor, factors=(0.9,1.0,1.1)) -> torch.Tensor:
    acc=None
    for s in factors:
        T,C = p_t_c.shape; tgt_len = max(1, int(round(T*s)))
        x = p_t_c.T.unsqueeze(0); y = F.interpolate(x, size=tgt_len, mode='linear', align_corners=False);
        y2 = F.interpolate(y, size=T, mode='linear', align_corners=False)[0].T; y2 = y2 / (y2.sum(dim=-1, keepdim=True) + 1e-8)
        acc = y2 if acc is None else (acc + y2)
    out = acc / float(len(factors)); return out / (out.sum(dim=-1, keepdim=True) + 1e-8)

def infer_probs_for_sid_from_stack(sid:int, feat_dir: Path, folds_info, model_prefix: str):
    # folds_info: list of (train_ids, ) used to compute scalers per fold index
    # model_prefix like 'model_ce_fold' or 'model_ce_v3_fold'
    X = np.load((feat_dir.parent/'test'/f"{sid}.npz"))['X'].astype(np.float32)
    acc=None
    with torch.no_grad(), torch.amp.autocast('cuda' if device.type=='cuda' else 'cpu'):
        for fi in range(3):
            mean,std = compute_fold_scaler_from_dir(folds[fi]['train_ids'], feat_dir)
            mean_t = torch.from_numpy(mean).float().to(device); std_t = torch.from_numpy(std).float().to(device)
            for s in (0,1):
                ckpt = Path(f"{model_prefix}{fi}{'_s1' if s==1 else ''}.pth")
                if not ckpt.exists(): continue
                D_in = np.load(next(iter((feat_dir).glob('*.npz'))))['X'].shape[1]
                m = DilatedTCN(d_in=D_in, channels=128, layers=12, num_classes=21, dropout=0.35).to(device)
                m.load_state_dict(torch.load(ckpt, map_location=device)); m.eval()
                xb = torch.from_numpy(X).float().to(device); xb = (xb - mean_t)/(std_t+1e-6); xb = xb.unsqueeze(0)
                p = m(xb)[0].softmax(dim=-1); p = apply_tta_timewarp(p, factors=(0.9,1.0,1.1))
                acc = p if acc is None else (acc + p); del m
    probs = acc / float(6); return probs / (probs.sum(dim=-1, keepdim=True) + 1e-8)

print('Building blended v2+v3 CE-6x submission...', flush=True)
cfg = pd.read_csv('cv_sweep_ce_v2v3_blend.csv').sort_values(['worst','mean']).iloc[0].to_dict() if Path('cv_sweep_ce_v2v3_blend.csv').exists() else {'w':0.7,'pool_k':15,'temp':0.9,'gamma':0.9,'sep':2}
w_best=float(cfg['w']); pool_k=int(cfg['pool_k']); temp=float(cfg['temp']); gamma=float(cfg.get('gamma',1.0)); sep=int(cfg.get('sep',2))
med_k_all = compute_class_median_durations_for_ids(train_df['Id'].astype(int).tolist())
test_ids = pd.read_csv('test.csv')['Id'].astype(int).tolist()
rows=[]; t0=time.time()
for i, sid in enumerate(test_ids, 1):
    sid=int(sid)
    p2 = infer_probs_for_sid_from_stack(sid, feat_v2_tr, folds, 'model_ce_fold')
    p3 = infer_probs_for_sid_from_stack(sid, feat_v3_tr, folds, 'model_ce_v3_fold')
    p = blend_probs_linear(p2, p3, w_best)
    T = p.shape[0]; g_eff = gamma_with_length(gamma, T, med_k_all)
    seq = decode_peaks_improved(p, med_k=med_k_all, gamma=g_eff, pool_k=pool_k, temp=temp, min_sep=sep, K=3, k_delta=4)
    rows.append({'Id': sid, 'Sequence': ' '.join(str(x) for x in seq)})
    if (i%10)==0 or i==len(test_ids):
        print(f"  [infer CE v2+v3 blend] {i}/{len(test_ids)} elapsed={(time.time()-t0)/60:.1f}m", flush=True)
sub = pd.DataFrame(rows, columns=['Id','Sequence'])
assert len(sub)==95
assert all(len(s.split())==20 and len(set(s.split()))==20 and all(1<=int(t)<=20 for t in s.split()) for s in sub.Sequence), 'Submission format invalid'
sub.to_csv('submission_primary_ce_v2v3_blend.csv', index=False); sub.to_csv('submission.csv', index=False)
print('Wrote submission_primary_ce_v2v3_blend.csv and submission.csv; head:\n', sub.head(), flush=True)

Sweeping v2-v3 OOF blend...


  [sweep v2v3] cfg 8/48 elapsed=0.3m


  [sweep v2v3] cfg 16/48 elapsed=0.6m


  [sweep v2v3] cfg 24/48 elapsed=1.0m


  [sweep v2v3] cfg 32/48 elapsed=1.3m


  [sweep v2v3] cfg 40/48 elapsed=1.6m


  [sweep v2v3] cfg 48/48 elapsed=1.9m


Top v2-v3 blend OOF (mean,worst,cfg):
(4.300865800865801, 5.0, {'w': 0.7, 'pool_k': 15, 'temp': 0.9, 'gamma': 0.9, 'sep': 2})
(4.300865800865801, 5.0, {'w': 0.7, 'pool_k': 15, 'temp': 0.9, 'gamma': 0.9, 'sep': 3})
(4.300865800865801, 5.0, {'w': 0.7, 'pool_k': 15, 'temp': 0.9, 'gamma': 0.95, 'sep': 2})
(4.300865800865801, 5.0, {'w': 0.7, 'pool_k': 15, 'temp': 0.9, 'gamma': 0.95, 'sep': 3})
(4.307463753178038, 5.02, {'w': 0.7, 'pool_k': 11, 'temp': 0.9, 'gamma': 0.9, 'sep': 2})
Saved cv_sweep_ce_v2v3_blend.csv


Building blended v2+v3 CE-6x submission...


  m.load_state_dict(torch.load(ckpt, map_location=device)); m.eval()


  [infer CE v2+v3 blend] 10/95 elapsed=2.9m


  [infer CE v2+v3 blend] 20/95 elapsed=5.7m


  [infer CE v2+v3 blend] 30/95 elapsed=8.6m


  [infer CE v2+v3 blend] 40/95 elapsed=11.5m


  [infer CE v2+v3 blend] 50/95 elapsed=14.4m


  [infer CE v2+v3 blend] 60/95 elapsed=17.3m


  [infer CE v2+v3 blend] 70/95 elapsed=20.1m


  [infer CE v2+v3 blend] 80/95 elapsed=23.0m


  [infer CE v2+v3 blend] 90/95 elapsed=25.9m


  [infer CE v2+v3 blend] 95/95 elapsed=27.3m


Wrote submission_primary_ce_v2v3_blend.csv and submission.csv; head:
     Id                                           Sequence
0  300  3 5 9 19 2 11 18 12 8 10 4 20 13 14 6 16 7 15 ...
1  301  10 1 5 11 4 6 2 13 19 9 15 7 12 3 18 14 16 20 ...
2  302  19 1 12 17 16 5 15 13 20 18 3 10 4 6 8 14 7 9 ...
3  303  11 18 3 13 10 4 5 15 20 1 17 12 16 8 9 7 19 6 ...
4  304  19 1 12 14 18 10 13 9 7 2 11 3 5 6 15 8 17 16 ...


In [23]:
# P1 (v2-v3): Per-class meta-blend calibration on OOF (fold-safe), tiny grids; refit on all; build test submission
import os, json, time, math
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
probs_cache = Path('probs_cache'); probs_cache.mkdir(exist_ok=True)
lab_tr_dir  = Path('labels3d_v2')/'train'
folds = json.load(open('folds_archive_cv.json','r'))
train_df = pd.read_csv('training.csv')
id2seq = {int(r.Id): [int(x) for x in str(r.Sequence).strip().split()] for _, r in train_df.iterrows()}

# Load averaged OOF probs for v2 (ce_new) and v3 (ce_v3)
def load_oof_v2_avg(sid:int):
    p0 = torch.from_numpy(np.load(probs_cache/f"{sid}_ce_new.npy")).to(device)
    p1p = probs_cache/f"{sid}_ce_new_s1.npy"
    p = p0 if not p1p.exists() else (p0 + torch.from_numpy(np.load(p1p)).to(device)) * 0.5
    return p / (p.sum(dim=-1, keepdim=True) + 1e-8)

def load_oof_v3_avg(sid:int):
    p0 = torch.from_numpy(np.load(probs_cache/f"{sid}_ce_v3.npy")).to(device)
    p1p = probs_cache/f"{sid}_ce_v3_s1.npy"
    p = p0 if not p1p.exists() else (p0 + torch.from_numpy(np.load(p1p)).to(device)) * 0.5
    return p / (p.sum(dim=-1, keepdim=True) + 1e-8)

# Per-class operations
def apply_per_class_temps(p_t_c: torch.Tensor, T_vec: np.ndarray):
    T = torch.from_numpy(T_vec.astype(np.float32)).to(device)  # [C]
    exps = 1.0 / (T + 1e-8)
    q = torch.pow(torch.clamp(p_t_c, 1e-8, 1.0), exps.unsqueeze(0))
    return q / (q.sum(dim=-1, keepdim=True) + 1e-8)

def blend_geom_perclass(p2: torch.Tensor, p3: torch.Tensor, alpha: np.ndarray):
    a = torch.from_numpy(alpha.astype(np.float32)).to(device)
    log2 = torch.log(torch.clamp(p2, 1e-8, 1.0)); log3 = torch.log(torch.clamp(p3, 1e-8, 1.0))
    q = torch.exp(log2 * a + log3 * (1.0 - a))
    return q / (q.sum(dim=-1, keepdim=True) + 1e-8)

def load_labels(sid:int):
    y = np.load(lab_tr_dir/f"{sid}.npy").astype(np.int64)
    return torch.from_numpy(y).to(device)

def per_frame_nll(p_t_c: torch.Tensor, y_t: torch.Tensor):
    m = (y_t >= 0)
    if not torch.any(m): return 0.0
    idx = y_t[m].long()
    return float((-torch.log(torch.clamp(p_t_c[m, idx], 1e-8, 1.0))).mean().item())

# Fold-safe fit of per-class T2[c], T3[c], alpha[c] on data excluding the fold
T_grid = np.array([0.9, 1.0, 1.1], dtype=np.float32)
A_grid = np.array([0.3, 0.5, 0.7], dtype=np.float32)

def collect_stream_data(ids, loader_fn):
    data = []  # list of (p_t_c, y_t)
    for sid in ids:
        sid = int(sid)
        p = loader_fn(sid); y = load_labels(sid)
        data.append((p, y))
    return data

def fit_per_class_params_excluding_fold(fold_idx:int):
    # build training ids = union of other folds' val ids
    val_ids_rest = []
    for f in folds:
        if int(f['fold']) != int(fold_idx):
            val_ids_rest.extend(f['val_ids'])
    v2_data = collect_stream_data(val_ids_rest, load_oof_v2_avg)
    v3_data = collect_stream_data(val_ids_rest, load_oof_v3_avg)
    C = v2_data[0][0].shape[1]
    T2 = np.ones(C, dtype=np.float32); T3 = np.ones(C, dtype=np.float32); alpha = np.full(C, 0.7, dtype=np.float32)
    for c in range(1, C):
        # fit T2[c]
        best_nll, best_T = 1e9, 1.0
        for T in T_grid:
            nll_sum = 0.0; cnt = 0
            for p, y in v2_data:
                m = (y == c)
                if not torch.any(m):
                    continue
                q = p.clone()
                qc = torch.pow(torch.clamp(q[:, c], 1e-8, 1.0), 1.0/float(T)); q[:, c] = qc
                q = q / (q.sum(dim=-1, keepdim=True) + 1e-8)
                nll_sum += per_frame_nll(q[m], y[m]) * int(m.sum().item()); cnt += int(m.sum().item())
            if cnt > 0:
                nll = nll_sum / max(1, cnt)
                if nll < best_nll: best_nll, best_T = nll, float(T)
        T2[c] = best_T
        # fit T3[c]
        best_nll, best_T = 1e9, 1.0
        for T in T_grid:
            nll_sum = 0.0; cnt = 0
            for p, y in v3_data:
                m = (y == c)
                if not torch.any(m):
                    continue
                q = p.clone()
                qc = torch.pow(torch.clamp(q[:, c], 1e-8, 1.0), 1.0/float(T)); q[:, c] = qc
                q = q / (q.sum(dim=-1, keepdim=True) + 1e-8)
                nll_sum += per_frame_nll(q[m], y[m]) * int(m.sum().item()); cnt += int(m.sum().item())
            if cnt > 0:
                nll = nll_sum / max(1, cnt)
                if nll < best_nll: best_nll, best_T = nll, float(T)
        T3[c] = best_T
        # fit alpha[c]
        best_nll, best_a = 1e9, 0.7
        # preapply temps once per sample for speed
        v2_cal = [apply_per_class_temps(p, T2) for (p, _) in v2_data]
        v3_cal = [apply_per_class_temps(p, T3) for (p, _) in v3_data]
        for a in A_grid:
            nll_sum = 0.0; cnt = 0
            for i in range(len(v2_data)):
                p2c, y = v2_cal[i], v2_data[i][1]
                p3c = v3_cal[i]
                m = (y == c)
                if not torch.any(m):
                    continue
                a_vec = np.full(C, 0.7, dtype=np.float32); a_vec[c] = float(a)
                q = blend_geom_perclass(p2c, p3c, a_vec)
                nll_sum += per_frame_nll(q[m], y[m]) * int(m.sum().item()); cnt += int(m.sum().item())
            if cnt > 0:
                nll = nll_sum / max(1, cnt)
                if nll < best_nll: best_nll, best_a = nll, float(a)
        alpha[c] = best_a
    return T2, T3, alpha

# Decoder helpers (reuse improved peak-time) + gamma-with-length
def avg_pool_probs(p_t_c: torch.Tensor, k:int) -> torch.Tensor:
    x = p_t_c.unsqueeze(0).transpose(1,2); y = F.avg_pool1d(x, kernel_size=k, stride=1, padding=k//2);
    return y.transpose(1,2).squeeze(0)
def duration_integral_single(p_t: torch.Tensor, k:int) -> torch.Tensor:
    k = max(1, int(k)); x = p_t.view(1,1,-1); w = torch.ones(1,1,k, device=p_t.device, dtype=p_t.dtype) / float(k);
    pad = (k-1)//2; y = F.conv1d(x, w, padding=pad).view(-1); T = p_t.shape[0]
    if y.shape[0] < T: y = F.pad(y, (0, T - y.shape[0]))
    elif y.shape[0] > T: y = y[:T]
    return y
def refine_com(p: torch.Tensor, t_star:int, w:int=5) -> float:
    T=p.shape[0]; a=max(0,t_star-w); b=min(T-1,t_star+w); idx=torch.arange(a,b+1, device=p.device, dtype=p.dtype);
    seg=p[a:b+1]; s=seg.sum()+1e-8; return float(((idx*seg).sum()/s).item())
def decode_peaks_improved(p_t_c: torch.Tensor, med_k: dict, gamma: float = 1.0, pool_k=13, temp=0.9, min_sep=2, K=3, k_delta=4):
    if temp != 1.0:
        p_t_c = (p_t_c ** (1.0/temp)); p_t_c = p_t_c/(p_t_c.sum(dim=-1, keepdim=True)+1e-8)
    p_s = avg_pool_probs(p_t_c, k=pool_k); T,C = p_s.shape
    scores = torch.zeros_like(p_s); ks=[13]*C
    for c in range(C):
        if c==0: scores[:,c]=p_s[:,c]; ks[c]=13; continue
        base_k = med_k.get(c,13); k_c = int(np.clip(round(gamma*base_k), 9, 25));
        if k_c % 2 == 0: k_c = min(25, k_c+1); ks[c]=k_c
        ks_multi = sorted(set([int(np.clip(k_c-4,9,25)), k_c, int(np.clip(k_c+4,9,25))]));
        ks_multi = [k if (k%2)==1 else min(25, k+1) for k in ks_multi]
        acc=None
        for k in ks_multi:
            di = duration_integral_single(p_s[:,c], k=k).unsqueeze(1); acc = di if acc is None else (acc + di)
        scores[:,c] = (acc/float(len(ks_multi))).squeeze(1)
    peaks=[]; last_t=-1e9
    for c in range(1,21):
        k=ks[c]; s=scores[:,c]; t_star=int(torch.argmax(s).item());
        w_com=max(5,k//3); radius=max(10,k//2); t_ref = refine_com(p_s[:,c], t_star, w=w_com)
        t_idx=int(round(max(0,min(t_ref,T-1))))
        local_mean = p_s[max(0,t_idx-radius):min(T,t_idx+radius+1), c].mean().item()
        pooled_at_ref = p_s[t_idx, c].item()
        peaks.append([c, t_ref, float(scores[t_idx,c].item()), float(local_mean), float(pooled_at_ref)])
    peaks.sort(key=lambda x: (x[1], -x[2], -x[3], -x[4]))
    for i in range(len(peaks)):
        if peaks[i][1] <= last_t + float(min_sep): peaks[i][1] = last_t + float(min_sep)
        last_t = min(peaks[i][1], float(T-1))
    return [int(c) for c,_,_,_,_ in peaks]

def compute_class_median_durations_for_ids(id_list):
    dur_by_c = {c: [] for c in range(1,21)}
    for sid in id_list:
        y = np.load(lab_tr_dir/f"{sid}.npy").astype(np.int16)
        for c in range(1,21):
            cnt = int((y==c).sum());
            if cnt>0: dur_by_c[c].append(cnt)
    med = {};
    for c in range(1,21):
        m = np.median(dur_by_c[c]) if len(dur_by_c[c])>0 else 13
        med[c] = int(np.clip(m, 9, 25))
    return med

def gamma_with_length(gamma_cv: float, T: int, med_k: dict):
    L_est = float(sum(med_k.get(c,13) for c in range(1,21)))
    if L_est <= 0: return gamma_cv
    ratio = float(T) / L_est; gamma_s = float(np.clip(ratio, 0.85, 1.15)); return float(gamma_cv * gamma_s)

def levenshtein(a,b):
    n,m=len(a),len(b)
    if n==0: return m
    if m==0: return n
    dp=list(range(m+1))
    for i in range(1,n+1):
        prev=dp[0]; dp[0]=i; ai=a[i-1]
        for j in range(1,m+1):
            tmp=dp[j]; dp[j]=min(dp[j]+1, dp[j-1]+1, prev + (0 if ai==b[j-1] else 1)); prev=tmp
    return dp[m]

# Sweep tiny decoder grid on calibrated per-fold OOF with per-class meta-blend
pool_ks=[13,15]; temps=[0.90]; gammas=[0.90,0.95]; seps=[2]
print('Fitting per-class T2/T3/alpha per fold (fold-safe)...', flush=True)
calib_by_fold = {}  # fold_idx -> dict
for f in folds:
    fi = int(f['fold'])
    T2, T3, A = fit_per_class_params_excluding_fold(fi)
    calib_by_fold[fi] = {'T2': T2.tolist(), 'T3': T3.tolist(), 'A': A.tolist()}

print('Sweeping decoder on calibrated v2+v3 OOF meta-blend...', flush=True)
res=[]; med_cache={}
for pool_k in pool_ks:
    for temp in temps:
        for gamma in gammas:
            for sep in seps:
                per_fold=[]
                for f in folds:
                    fi = int(f['fold'])
                    if fi not in med_cache: med_cache[fi] = compute_class_median_durations_for_ids(f['train_ids'])
                    med_k = med_cache[fi]
                    T2 = np.array(calib_by_fold[fi]['T2'], dtype=np.float32)
                    T3 = np.array(calib_by_fold[fi]['T3'], dtype=np.float32)
                    A  = np.array(calib_by_fold[fi]['A'],  dtype=np.float32)
                    vids = f['val_ids']; tot=0; cnt=0
                    for sid in vids:
                        sid=int(sid); p2 = load_oof_v2_avg(sid); p3 = load_oof_v3_avg(sid)
                        q2 = apply_per_class_temps(p2, T2); q3 = apply_per_class_temps(p3, T3)
                        q = blend_geom_perclass(q2, q3, A)
                        Tlen = q.shape[0]; g_eff = gamma_with_length(gamma, Tlen, med_k)
                        seq = decode_peaks_improved(q, med_k=med_k, gamma=g_eff, pool_k=pool_k, temp=temp, min_sep=sep, K=3, k_delta=4)
                        tot += levenshtein(seq, id2seq[sid]); cnt += 1
                    per_fold.append(tot/max(cnt,1))
                res.append((float(np.mean(per_fold)), float(np.max(per_fold)), {'pool_k':pool_k,'temp':temp,'gamma':gamma,'sep':sep}))
res.sort(key=lambda x: (x[1], x[0]))
print('Top v2-v3 meta-blend OOF (mean,worst,cfg):')
for r in res[:5]: print(r)
pd.DataFrame([{'mean':m,'worst':w, **cfg} for m,w,cfg in res]).to_csv('cv_sweep_ce_v2v3_meta.csv', index=False)
print('Saved cv_sweep_ce_v2v3_meta.csv', flush=True)

# Refit per-class T2/T3/alpha on ALL OOF (train) for test-time
def refit_on_all():
    all_ids = train_df['Id'].astype(int).tolist()
    v2_data = collect_stream_data(all_ids, load_oof_v2_avg)
    v3_data = collect_stream_data(all_ids, load_oof_v3_avg)
    C = v2_data[0][0].shape[1]
    T2 = np.ones(C, dtype=np.float32); T3 = np.ones(C, dtype=np.float32); A = np.full(C, 0.7, dtype=np.float32)
    for c in range(1, C):
        best_nll, best_T = 1e9, 1.0
        for T in T_grid:
            nll_sum=0.0; cnt=0
            for p,y in v2_data:
                m = (y==c);
                if not torch.any(m): continue
                q = p.clone(); qc = torch.pow(torch.clamp(q[:, c], 1e-8, 1.0), 1.0/float(T)); q[:, c] = qc; q = q/(q.sum(dim=-1, keepdim=True)+1e-8)
                nll_sum += per_frame_nll(q[m], y[m]) * int(m.sum().item()); cnt += int(m.sum().item())
            if cnt>0 and (nll_sum/max(1,cnt)) < best_nll: best_nll, best_T = nll_sum/max(1,cnt), float(T)
        T2[c] = best_T
        best_nll, best_T = 1e9, 1.0
        for T in T_grid:
            nll_sum=0.0; cnt=0
            for p,y in v3_data:
                m = (y==c);
                if not torch.any(m): continue
                q = p.clone(); qc = torch.pow(torch.clamp(q[:, c], 1e-8, 1.0), 1.0/float(T)); q[:, c] = qc; q = q/(q.sum(dim=-1, keepdim=True)+1e-8)
                nll_sum += per_frame_nll(q[m], y[m]) * int(m.sum().item()); cnt += int(m.sum().item())
            if cnt>0 and (nll_sum/max(1,cnt)) < best_nll: best_nll, best_T = nll_sum/max(1,cnt), float(T)
        T3[c] = best_T
        # preapply
        v2_cal = [apply_per_class_temps(p2, T2) for (p2,_) in v2_data]
        v3_cal = [apply_per_class_temps(p3, T3) for (p3,_) in v3_data]
        best_nll, best_a = 1e9, 0.7
        for a in A_grid:
            nll_sum=0.0; cnt=0
            for i in range(len(v2_data)):
                y = v2_data[i][1]; m = (y==c)
                if not torch.any(m): continue
                a_vec = np.full(C, 0.7, dtype=np.float32); a_vec[c] = float(a)
                q = blend_geom_perclass(v2_cal[i], v3_cal[i], a_vec)
                nll_sum += per_frame_nll(q[m], y[m]) * int(m.sum().item()); cnt += int(m.sum().item())
            if cnt>0 and (nll_sum/max(1,cnt)) < best_nll: best_nll, best_a = nll_sum/max(1,cnt), float(a)
        A[c] = best_a
    return T2, T3, A

# Test-time inference: infer probs for v2 and v3 stacks, apply per-class temps and alpha, then decode
from math import ceil
feat_v2_tr = Path('features3d_v2')/'train'; feat_v3_tr = Path('features3d_v3')/'train'
feat_v2_te = Path('features3d_v2')/'test'; feat_v3_te = Path('features3d_v3')/'test'

class DilatedResBlock(nn.Module):
    def __init__(self, ch, dilation, drop=0.35, groups=8, k=3):
        super().__init__(); self.conv1 = nn.Conv1d(ch, ch, k, padding=dilation, dilation=dilation);
        self.gn1 = nn.GroupNorm(groups, ch); self.drop = nn.Dropout(drop); self.conv2 = nn.Conv1d(ch, ch, 1); self.gn2 = nn.GroupNorm(groups, ch)
    def forward(self, x):
        h = self.conv1(x); h = self.gn1(h); h = F.relu(h, inplace=True); h = self.drop(h); h = self.conv2(h); h = self.gn2(h); h = F.relu(h, inplace=True); return x + h
class DilatedTCN(nn.Module):
    def __init__(self, d_in, channels=128, layers=12, num_classes=21, dropout=0.35):
        super().__init__(); self.inp = nn.Conv1d(d_in, channels, 1); blocks=[]; dil=1
        for _ in range(layers): blocks.append(DilatedResBlock(channels, dil, drop=dropout, groups=8, k=3)); dil = min(dil*2, 512)
        self.blocks = nn.ModuleList(blocks); self.head = nn.Conv1d(channels, num_classes, 1)
    def forward(self, x_b_t_d):
        x = x_b_t_d.transpose(1,2); h = self.inp(x);
        for b in self.blocks: h = b(h); out = self.head(h); return out.transpose(1,2)

def compute_fold_scaler_from_dir(id_list, feat_dir: Path):
    n=0; mean=None; M2=None
    for sid in id_list:
        X = np.load(feat_dir/f"{int(sid)}.npz")["X"].astype(np.float32); n_i = X.shape[0]
        if mean is None: mean = X.mean(axis=0); M2 = ((X - mean)**2).sum(axis=0); n = n_i
        else:
            mean_i = X.mean(axis=0); n_new = n + n_i; delta = mean_i - mean
            mean = mean + delta * (n_i / max(1, n_new));
            M2 = M2 + ((X - mean_i)**2).sum(axis=0) + (delta**2) * (n * n_i / max(1, n_new)); n = n_new
    var = M2 / max(1, (n - 1)); std = np.sqrt(np.clip(var, 1e-8, None))
    return mean.astype(np.float32), std.astype(np.float32)

def apply_tta_timewarp(p_t_c: torch.Tensor, factors=(0.9,1.0,1.1)) -> torch.Tensor:
    acc=None
    for s in factors:
        T,C = p_t_c.shape; tgt_len = max(1, int(round(T*s)))
        x = p_t_c.T.unsqueeze(0); y = F.interpolate(x, size=tgt_len, mode='linear', align_corners=False); y2 = F.interpolate(y, size=T, mode='linear', align_corners=False)[0].T
        y2 = y2 / (y2.sum(dim=-1, keepdim=True) + 1e-8); acc = y2 if acc is None else (acc + y2)
    out = acc / float(len(factors)); return out / (out.sum(dim=-1, keepdim=True) + 1e-8)

def infer_probs_for_sid_from_stack(sid:int, feat_tr_dir: Path, model_prefix: str):
    X = np.load((feat_tr_dir.parent/'test'/f"{sid}.npz"))['X'].astype(np.float32)
    acc=None
    with torch.no_grad(), torch.amp.autocast('cuda' if device.type=='cuda' else 'cpu'):
        for fi in range(3):
            mean,std = compute_fold_scaler_from_dir(folds[fi]['train_ids'], feat_tr_dir)
            mean_t = torch.from_numpy(mean).float().to(device); std_t = torch.from_numpy(std).float().to(device)
            D_in = np.load(next(iter((feat_tr_dir).glob('*.npz'))))['X'].shape[1]
            for s in (0,1):
                ckpt = Path(f"{model_prefix}{fi}{'_s1' if s==1 else ''}.pth")
                if not ckpt.exists(): continue
                m = DilatedTCN(d_in=D_in, channels=128, layers=12, num_classes=21, dropout=0.35).to(device)
                m.load_state_dict(torch.load(ckpt, map_location=device)); m.eval()
                xb = torch.from_numpy(X).float().to(device); xb = (xb - mean_t)/(std_t+1e-6); xb = xb.unsqueeze(0)
                p = m(xb)[0].softmax(dim=-1); p = apply_tta_timewarp(p, factors=(0.9,1.0,1.1))
                acc = p if acc is None else (acc + p); del m
    probs = acc / float(6); return probs / (probs.sum(dim=-1, keepdim=True) + 1e-8)

# Choose best decoder cfg by worst-fold then mean
cfg_df = pd.read_csv('cv_sweep_ce_v2v3_meta.csv').sort_values(['worst','mean']) if Path('cv_sweep_ce_v2v3_meta.csv').exists() else None
if cfg_df is None or len(cfg_df)==0:
    best_cfg = {'pool_k':15,'temp':0.90,'gamma':0.90,'sep':2}
else:
    best_cfg = cfg_df.iloc[0].to_dict()
pool_k=int(best_cfg.get('pool_k',15)); temp=float(best_cfg.get('temp',0.90)); gamma=float(best_cfg.get('gamma',0.90)); sep=int(best_cfg.get('sep',2))
print('Chosen meta-blend decoder cfg:', {'pool_k':pool_k,'temp':temp,'gamma':gamma,'sep':sep}, flush=True)

print('Refitting per-class T2/T3/alpha on ALL OOF ...', flush=True)
T2_all, T3_all, A_all = refit_on_all()
Path('calib_all_v2v3_meta.json').write_text(json.dumps({'T2': T2_all.tolist(), 'T3': T3_all.tolist(), 'A': A_all.tolist()}))

print('Building v2+v3 per-class meta-blend submission...', flush=True)
med_k_all = compute_class_median_durations_for_ids(train_df['Id'].astype(int).tolist())
test_ids = pd.read_csv('test.csv')['Id'].astype(int).tolist()
rows=[]; t0=time.time()
for i, sid in enumerate(test_ids, 1):
    sid=int(sid)
    p2 = infer_probs_for_sid_from_stack(sid, feat_v2_tr, 'model_ce_fold')
    p3 = infer_probs_for_sid_from_stack(sid, feat_v3_tr, 'model_ce_v3_fold')
    q2 = apply_per_class_temps(p2, T2_all); q3 = apply_per_class_temps(p3, T3_all)
    q = blend_geom_perclass(q2, q3, A_all)
    Tlen = q.shape[0]; g_eff = gamma_with_length(gamma, Tlen, med_k_all)
    seq = decode_peaks_improved(q, med_k=med_k_all, gamma=g_eff, pool_k=pool_k, temp=temp, min_sep=sep, K=3, k_delta=4)
    rows.append({'Id': sid, 'Sequence': ' '.join(str(x) for x in seq)})
    if (i%10)==0 or i==len(test_ids):
        print(f"  [infer v2v3 meta] {i}/{len(test_ids)} elapsed={(time.time()-t0)/60:.1f}m", flush=True)
sub = pd.DataFrame(rows, columns=['Id','Sequence'])
assert len(sub)==95
assert all(len(s.split())==20 and len(set(s.split()))==20 and all(1<=int(t)<=20 for t in s.split()) for s in sub.Sequence), 'Submission format invalid'
sub.to_csv('submission_primary_ce_v2v3_meta.csv', index=False)
sub.to_csv('submission.csv', index=False)
print('Wrote submission_primary_ce_v2v3_meta.csv and submission.csv; head:\n', sub.head(), flush=True)

Fitting per-class T2/T3/alpha per fold (fold-safe)...


Sweeping decoder on calibrated v2+v3 OOF meta-blend...


Top v2-v3 meta-blend OOF (mean,worst,cfg):
(4.3069511440940005, 5.06, {'pool_k': 15, 'temp': 0.9, 'gamma': 0.9, 'sep': 2})
(4.3069511440940005, 5.06, {'pool_k': 15, 'temp': 0.9, 'gamma': 0.95, 'sep': 2})
(4.310318147461004, 5.06, {'pool_k': 13, 'temp': 0.9, 'gamma': 0.9, 'sep': 2})
(4.310318147461004, 5.06, {'pool_k': 13, 'temp': 0.9, 'gamma': 0.95, 'sep': 2})
Saved cv_sweep_ce_v2v3_meta.csv


Chosen meta-blend decoder cfg: {'pool_k': 15, 'temp': 0.9, 'gamma': 0.9, 'sep': 2}


Refitting per-class T2/T3/alpha on ALL OOF ...


Building v2+v3 per-class meta-blend submission...


  m.load_state_dict(torch.load(ckpt, map_location=device)); m.eval()


  [infer v2v3 meta] 10/95 elapsed=2.9m


  [infer v2v3 meta] 20/95 elapsed=5.7m


  [infer v2v3 meta] 30/95 elapsed=8.6m


  [infer v2v3 meta] 40/95 elapsed=11.5m


  [infer v2v3 meta] 50/95 elapsed=14.3m


  [infer v2v3 meta] 60/95 elapsed=17.2m


  [infer v2v3 meta] 70/95 elapsed=20.1m


  [infer v2v3 meta] 80/95 elapsed=22.9m


  [infer v2v3 meta] 90/95 elapsed=25.8m


  [infer v2v3 meta] 95/95 elapsed=27.2m


Wrote submission_primary_ce_v2v3_meta.csv and submission.csv; head:
     Id                                           Sequence
0  300  3 5 9 19 2 11 18 12 8 10 4 20 13 14 6 16 7 15 ...
1  301  10 1 5 11 4 6 2 13 19 9 15 7 12 3 18 14 16 20 ...
2  302  19 1 12 17 16 5 15 13 20 18 3 10 4 6 8 14 7 9 ...
3  303  11 18 3 13 10 4 5 15 20 1 17 12 16 8 9 7 19 6 ...
4  304  3 1 12 14 18 10 13 9 7 2 11 19 5 6 15 8 17 16 ...


In [27]:
# P1+Decoder++: v2-v3 per-class meta-blend + local-search with reinsertion; tiny OOF sweep; refit-all; build test submission
import os, json, time, math
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
probs_cache = Path('probs_cache'); probs_cache.mkdir(exist_ok=True)
lab_tr_dir  = Path('labels3d_v2')/'train'
train_df = pd.read_csv('training.csv')
folds = json.load(open('folds_archive_cv.json','r'))
id2seq = {int(r.Id): [int(x) for x in str(r.Sequence).strip().split()] for _, r in train_df.iterrows()}

# Streams: OOF loaders
def load_oof_v2_avg(sid:int):
    p0 = torch.from_numpy(np.load(probs_cache/f"{sid}_ce_new.npy")).to(device)
    p1p = probs_cache/f"{sid}_ce_new_s1.npy"
    p = p0 if not p1p.exists() else (p0 + torch.from_numpy(np.load(p1p)).to(device)) * 0.5
    return p / (p.sum(dim=-1, keepdim=True) + 1e-8)
def load_oof_v3_avg(sid:int):
    p0 = torch.from_numpy(np.load(probs_cache/f"{sid}_ce_v3.npy")).to(device)
    p1p = probs_cache/f"{sid}_ce_v3_s1.npy"
    p = p0 if not p1p.exists() else (p0 + torch.from_numpy(np.load(p1p)).to(device)) * 0.5
    return p / (p.sum(dim=-1, keepdim=True) + 1e-8)

# Per-class temps and geometric per-class blend
def apply_per_class_temps(p_t_c: torch.Tensor, T_vec: np.ndarray):
    T = torch.from_numpy(T_vec.astype(np.float32)).to(device)
    q = torch.pow(torch.clamp(p_t_c, 1e-8, 1.0), (1.0/(T+1e-8)).unsqueeze(0))
    return q / (q.sum(dim=-1, keepdim=True) + 1e-8)
def blend_geom_perclass(p2: torch.Tensor, p3: torch.Tensor, alpha: np.ndarray):
    a = torch.from_numpy(alpha.astype(np.float32)).to(device)
    q = torch.exp(torch.log(torch.clamp(p2,1e-8,1.0))*a + torch.log(torch.clamp(p3,1e-8,1.0))*(1.0-a))
    return q / (q.sum(dim=-1, keepdim=True) + 1e-8)

def load_labels(sid:int):
    return torch.from_numpy(np.load(lab_tr_dir/f"{sid}.npy").astype(np.int64)).to(device)

def per_frame_nll(p_t_c: torch.Tensor, y_t: torch.Tensor):
    m = (y_t >= 0)
    if not torch.any(m): return 0.0
    idx = y_t[m].long()
    return float((-torch.log(torch.clamp(p_t_c[m, idx], 1e-8, 1.0))).mean().item())

T_grid = np.array([0.9, 1.0, 1.1], dtype=np.float32)
A_grid = np.array([0.3, 0.5, 0.7], dtype=np.float32)

def collect_stream_data(ids, loader_fn):
    data=[]
    for sid in ids:
        sid=int(sid); p=loader_fn(sid); y=load_labels(sid); data.append((p,y))
    return data

def fit_per_class_params_excluding_fold(fold_idx:int):
    ids=[]
    for f in folds:
        if int(f['fold'])!=int(fold_idx): ids.extend(f['val_ids'])
    v2 = collect_stream_data(ids, load_oof_v2_avg)
    v3 = collect_stream_data(ids, load_oof_v3_avg)
    C = v2[0][0].shape[1]
    T2=np.ones(C, np.float32); T3=np.ones(C, np.float32); A=np.full(C, 0.7, np.float32)
    for c in range(1, C):
        # T2
        best=(1e9,1.0)
        for T in T_grid:
            s=0.0; n=0
            for p,y in v2:
                m = (y == c)
                if not torch.any(m):
                    continue
                q=p.clone(); q[:,c]=torch.pow(torch.clamp(q[:,c],1e-8,1.0), 1.0/float(T)); q=q/(q.sum(dim=-1,keepdim=True)+1e-8)
                s += per_frame_nll(q[m], y[m]) * int(m.sum().item()); n += int(m.sum().item())
            if n>0:
                val = s/max(1,n)
                if val<best[0]:
                    best=(val,float(T))
        T2[c]=best[1]
        # T3
        best=(1e9,1.0)
        for T in T_grid:
            s=0.0; n=0
            for p,y in v3:
                m = (y == c)
                if not torch.any(m):
                    continue
                q=p.clone(); q[:,c]=torch.pow(torch.clamp(q[:,c],1e-8,1.0), 1.0/float(T)); q=q/(q.sum(dim=-1,keepdim=True)+1e-8)
                s += per_frame_nll(q[m], y[m]) * int(m.sum().item()); n += int(m.sum().item())
            if n>0:
                val = s/max(1,n)
                if val<best[0]:
                    best=(val,float(T))
        T3[c]=best[1]
        # A[c]
        v2c=[apply_per_class_temps(p,T2) for (p,_) in v2]
        v3c=[apply_per_class_temps(p,T3) for (p,_) in v3]
        best=(1e9,0.7)
        for a in A_grid:
            s=0.0; n=0
            for i in range(len(v2)):
                y = v2[i][1]
                m = (y == c)
                if not torch.any(m):
                    continue
                a_vec=np.full(C,0.7,np.float32); a_vec[c]=float(a)
                q=blend_geom_perclass(v2c[i], v3c[i], a_vec)
                s += per_frame_nll(q[m], y[m]) * int(m.sum().item()); n += int(m.sum().item())
            if n>0:
                val = s/max(1,n)
                if val<best[0]:
                    best=(val,float(a))
        A[c]=best[1]
    return T2,T3,A

# Decoder helpers
def avg_pool_probs(p_t_c: torch.Tensor, k:int) -> torch.Tensor:
    x=p_t_c.unsqueeze(0).transpose(1,2); y=F.avg_pool1d(x, kernel_size=k, stride=1, padding=k//2);
    return y.transpose(1,2).squeeze(0)
def duration_integral_single(p_t: torch.Tensor, k:int) -> torch.Tensor:
    k=max(1,int(k)); x=p_t.view(1,1,-1); w=torch.ones(1,1,k, device=p_t.device, dtype=p_t.dtype)/float(k);
    pad=(k-1)//2; y=F.conv1d(x,w,padding=pad).view(-1); T=p_t.shape[0]
    if y.shape[0]<T: y=F.pad(y,(0,T-y.shape[0]))
    elif y.shape[0]>T: y=y[:T]
    return y
def refine_com(p: torch.Tensor, t_star:int, w:int=5) -> float:
    T=p.shape[0]; a=max(0,t_star-w); b=min(T-1,t_star+w); idx=torch.arange(a,b+1, device=p.device, dtype=p.dtype);
    seg=p[a:b+1]; s=seg.sum()+1e-8; return float(((idx*seg).sum()/s).item())
def compute_class_median_durations_for_ids(id_list):
    dur={c:[] for c in range(1,21)}
    for sid in id_list:
        y=np.load(lab_tr_dir/f"{sid}.npy").astype(np.int16)
        for c in range(1,21):
            cnt=int((y==c).sum());
            if cnt>0: dur[c].append(cnt)
    med={}
    for c in range(1,21):
        m=np.median(dur[c]) if len(dur[c])>0 else 13; med[c]=int(np.clip(m,9,25))
    return med
def gamma_with_length(gamma_cv: float, T: int, med_k: dict):
    L=float(sum(med_k.get(c,13) for c in range(1,21)));
    if L<=0: return gamma_cv
    ratio=float(T)/L; g=float(np.clip(ratio,0.85,1.15)); return float(gamma_cv*g)
def levenshtein(a,b):
    n,m=len(a),len(b)
    if n==0: return m
    if m==0: return n
    dp=list(range(m+1))
    for i in range(1,n+1):
        prev=dp[0]; dp[0]=i; ai=a[i-1]
        for j in range(1,m+1):
            tmp=dp[j]; dp[j]=min(dp[j]+1, dp[j-1]+1, prev + (0 if ai==b[j-1] else 1)); prev=tmp
    return dp[m]

# Order prior from training sequences (robust to missing classes)
def build_order_prior(train_df):
    cnt=np.zeros((21,21),dtype=np.int64); tot=np.zeros((21,21),dtype=np.int64)
    for seq in train_df['Sequence'].astype(str).tolist():
        s=[int(x) for x in seq.strip().split() if x.isdigit()]; s=[x for x in s if 1<=x<=20]
        n=len(s)
        for i in range(n):
            a=s[i]
            for j in range(i+1,n):
                b=s[j]
                if a==b: continue
                cnt[a,b]+=1; tot[a,b]+=1
    P=np.zeros((21,21),dtype=np.float32)
    with np.errstate(divide='ignore', invalid='ignore'):
        P=np.where(tot>0, cnt/np.maximum(1, tot), 0.5)
    np.fill_diagonal(P,0.5)
    return P
P_order = build_order_prior(train_df)

# Build blended calibrated probs for a sid given fold-safe T2/T3/A
def blended_q_for_sid(sid:int, T2: np.ndarray, T3: np.ndarray, A: np.ndarray):
    p2 = load_oof_v2_avg(sid); p3 = load_oof_v3_avg(sid)
    q2 = apply_per_class_temps(p2, T2); q3 = apply_per_class_temps(p3, T3)
    return blend_geom_perclass(q2, q3, A)

# Local-search with adjacent swap + reinsertion
def build_scoring(p_t_c: torch.Tensor, med_k: dict, pool_k:int, gamma: float, temp: float, k_delta:int=4):
    if temp!=1.0:
        p_t_c=(torch.clamp(p_t_c,1e-8,1.0)**(1.0/temp)); p_t_c=p_t_c/(p_t_c.sum(dim=-1,keepdim=True)+1e-8)
    p_s=avg_pool_probs(p_t_c, k=pool_k); T,C=p_s.shape
    di=torch.zeros_like(p_s); ks=[13]*C
    for c in range(C):
        if c==0: di[:,c]=p_s[:,c]; ks[c]=13; continue
        base=med_k.get(c,13); k_c=int(np.clip(round(gamma*base),9,25));
        if (k_c%2)==0: k_c=min(25,k_c+1); ks[c]=k_c
        ks_multi=sorted(set([int(np.clip(k_c-k_delta,9,25)), k_c, int(np.clip(k_c+k_delta,9,25))]));
        ks_multi=[k if (k%2)==1 else min(25,k+1) for k in ks_multi]
        acc=None
        for k in ks_multi:
            x=duration_integral_single(p_s[:,c], k=k).unsqueeze(1); acc=x if acc is None else (acc+x)
        di[:,c]=(acc/float(len(ks_multi))).squeeze(1)
    mu=di.mean(dim=0,keepdim=True); sd=di.std(dim=0,keepdim=True)+1e-8; z=(di-mu)/sd
    logp=torch.log(torch.clamp(p_s,1e-8,1.0))
    return p_s, di, z, logp, ks

def initial_assignment(p_s, z, logp, ks, beta: float, min_sep:int):
    T,C=p_s.shape; items=[]
    for c in range(1,21):
        s_vec = logp[:,c] + beta*z[:,c]
        t_star=int(torch.argmax(s_vec).item())
        w_com=max(5, ks[c]//3); t_ref=refine_com(p_s[:,c], t_star, w=w_com)
        t_idx=max(0,min(int(round(t_ref)), T-1))
        score=float(s_vec[t_idx].item())
        items.append([float(t_ref), int(c), score])
    items.sort(key=lambda x: x[0])
    last=-1e9
    for it in items:
        if it[0] <= last + float(min_sep): it[0]=last+float(min_sep)
        last=min(it[0], float(T-1))
    return items  # list of [t, c, s]

def objective_S(items, beta: float, lambda_ord: float, lambda_len: float, exp_gap: dict):
    S=0.0
    n=len(items)
    # main per-(t,c) score already stored
    for _,_,sct in items: S += float(sct)
    # order prior
    if lambda_ord>0:
        for i in range(n):
            ci=items[i][1]
            for j in range(i+1,n):
                cj=items[j][1]
                pij=float(P_order[ci, cj]) if 1<=ci<=20 and 1<=cj<=20 else 0.5
                S -= lambda_ord * (1.0 - pij)
    # duration (gap) penalty
    if lambda_len>0 and n>1:
        for i in range(1,n):
            t_i=items[i][0]; t_im1=items[i-1][0]; ci=items[i][1]
            gap=max(1.0, float(t_i - t_im1))
            eg=max(1.0, float(exp_gap.get(ci, 13.0)))
            S -= lambda_len * abs(gap - eg) / eg
    return S

def s_at(c:int, t:float, logp, z, beta: float):
    T=logp.shape[0]; t_idx=max(0,min(int(round(t)), T-1))
    return float((logp[t_idx, c] + beta * z[t_idx, c]).item())

def hill_climb_with_reinsertion(items, p_s, z, logp, ks, beta: float, lambda_ord: float, lambda_len: float, exp_gap: dict, max_passes:int=4):
    improved=True; passes=0
    while improved and passes<max_passes:
        improved=False; passes+=1
        # 1) adjacent swaps pass
        i=0
        S_base = objective_S(items, beta, lambda_ord, lambda_len, exp_gap)
        while i < len(items)-1:
            t_i, c_i, s_i = items[i]; t_j, c_j, s_j = items[i+1]
            # swap classes (times fixed)
            s_i_new = s_at(c_j, t_i, logp, z, beta); s_j_new = s_at(c_i, t_j, logp, z, beta)
            items[i][1]=c_j; items[i][2]=s_i_new
            items[i+1][1]=c_i; items[i+1][2]=s_j_new
            S_new = objective_S(items, beta, lambda_ord, lambda_len, exp_gap)
            if S_new + 1e-9 >= S_base:
                improved = improved or (S_new > S_base + 1e-6); S_base = S_new
            else:
                # revert
                items[i][1]=c_i; items[i][2]=s_i
                items[i+1][1]=c_j; items[i+1][2]=s_j
            i+=1
        # 2) reinsertion moves (i -> i±1, i±2)
        changed=True; iter_lim=2
        while changed and iter_lim>0:
            changed=False; iter_lim-=1
            S_base = objective_S(items, beta, lambda_ord, lambda_len, exp_gap)
            n=len(items)
            for i in range(n):
                for d in (-2,-1,1,2):
                    j=i+d
                    if j<0 or j>=n or j==i: continue
                    # remove item i and insert at j
                    it=items.pop(i)
                    items.insert(j, it)
                    # recompute s at affected positions i..j range
                    a=min(i,j); b=max(i,j)
                    for k in range(a, b+1):
                        t_k, c_k, _ = items[k]
                        items[k][2] = s_at(c_k, t_k, logp, z, beta)
                    S_new = objective_S(items, beta, lambda_ord, lambda_len, exp_gap)
                    if S_new + 1e-9 >= S_base:
                        changed = changed or (S_new > S_base + 1e-6); S_base = S_new
                    else:
                        # revert reinsertion
                        it2=items.pop(j); items.insert(i, it2)
            improved = improved or changed
    return items

def decode_localsrch_meta(q: torch.Tensor, med_k: dict, gamma: float, pool_k:int, temp: float, min_sep:int, beta: float, lambda_ord: float, lambda_len: float):
    p_s, di, z, logp, ks = build_scoring(q, med_k, pool_k, gamma, temp, k_delta=4)
    items = initial_assignment(p_s, z, logp, ks, beta=beta, min_sep=min_sep)
    # expected gaps by class
    exp_gap={c: float(np.clip(round(gamma*med_k.get(c,13)), 3, 30)) for c in range(1,21)}
    items = hill_climb_with_reinsertion(items, p_s, z, logp, ks, beta, lambda_ord, lambda_len, exp_gap, max_passes=4)
    seq = [int(c) for (_,c,_) in items]
    if len(set(seq))<20:
        seen=set(); out=[]
        for c in seq:
            if c in seen: continue
            seen.add(c); out.append(c)
        for c in range(1,21):
            if c not in seen: out.append(c)
        seq=out[:20]
    return seq

# Tiny OOF sweep over localsrch params (fold-safe T2/T3/A), select by worst then mean
print('Fitting per-class T2/T3/alpha per fold (fold-safe) for local-search...', flush=True)
calib_by_fold={}
for f in folds:
    fi=int(f['fold'])
    T2,T3,A = fit_per_class_params_excluding_fold(fi)
    calib_by_fold[fi]={'T2':T2.tolist(),'T3':T3.tolist(),'A':A.tolist()}

pool_k=15; temps=[0.90]; gammas=[0.90,0.95]; seps=[2,3]; betas=[0.4,0.5]; lords=[0.03]; llens=[0.15,0.25]
print('Sweeping localsrch (reinsertion) on v2+v3 calibrated OOF...', flush=True)
res=[]
for temp in temps:
  for gamma in gammas:
    for sep in seps:
      for beta in betas:
        for l_ord in lords:
          for l_len in llens:
            per_fold=[]
            for f in folds:
                fi=int(f['fold'])
                T2=np.array(calib_by_fold[fi]['T2'], np.float32); T3=np.array(calib_by_fold[fi]['T3'], np.float32); A=np.array(calib_by_fold[fi]['A'], np.float32)
                med_k = compute_class_median_durations_for_ids(f['train_ids'])
                vids=f['val_ids']; tot=0; cnt=0
                for sid in vids:
                    sid=int(sid); q = blended_q_for_sid(sid, T2, T3, A); Tlen=q.shape[0]
                    g_eff = gamma_with_length(gamma, Tlen, med_k)
                    seq = decode_localsrch_meta(q, med_k=med_k, gamma=g_eff, pool_k=pool_k, temp=temp, min_sep=sep, beta=beta, lambda_ord=l_ord, lambda_len=l_len)
                    tot += levenshtein(seq, id2seq[sid]); cnt += 1
                per_fold.append(tot/max(cnt,1))
            res.append((float(np.mean(per_fold)), float(np.max(per_fold)), {'pool_k':pool_k,'temp':temp,'gamma':gamma,'sep':sep,'beta':beta,'lambda_ord':l_ord,'lambda_len':l_len}))
res.sort(key=lambda x: (x[1], x[0]))
print('Top localsrch-meta (mean,worst,cfg):')
for r in res[:5]: print(r)
pd.DataFrame([{'mean':m,'worst':w, **cfg} for m,w,cfg in res]).to_csv('cv_sweep_ce_v2v3_meta_localsrch.csv', index=False)
print('Saved cv_sweep_ce_v2v3_meta_localsrch.csv', flush=True)

# Refit T2/T3/A on ALL OOF (train) for test-time
def refit_on_all():
    ids=train_df['Id'].astype(int).tolist()
    v2=collect_stream_data(ids, load_oof_v2_avg); v3=collect_stream_data(ids, load_oof_v3_avg)
    C=v2[0][0].shape[1]
    T2=np.ones(C,np.float32); T3=np.ones(C,np.float32); A=np.full(C,0.7,np.float32)
    for c in range(1,C):
        best=(1e9,1.0)
        for T in T_grid:
            s=0.0; n=0
            for p,y in v2:
                m = (y == c)
                if not torch.any(m):
                    continue
                q=p.clone(); q[:,c]=torch.pow(torch.clamp(q[:,c],1e-8,1.0), 1.0/float(T)); q=q/(q.sum(dim=-1,keepdim=True)+1e-8)
                s += per_frame_nll(q[m], y[m]) * int(m.sum().item()); n += int(m.sum().item())
            if n>0:
                val = s/max(1,n)
                if val<best[0]:
                    best=(val,float(T))
        T2[c]=best[1]
        best=(1e9,1.0)
        for T in T_grid:
            s=0.0; n=0
            for p,y in v3:
                m = (y == c)
                if not torch.any(m):
                    continue
                q=p.clone(); q[:,c]=torch.pow(torch.clamp(q[:,c],1e-8,1.0), 1.0/float(T)); q=q/(q.sum(dim=-1,keepdim=True)+1e-8)
                s += per_frame_nll(q[m], y[m]) * int(m.sum().item()); n += int(m.sum().item())
            if n>0:
                val = s/max(1,n)
                if val<best[0]:
                    best=(val,float(T))
        T3[c]=best[1]
        v2c=[apply_per_class_temps(p,T2) for (p,_) in v2]; v3c=[apply_per_class_temps(p,T3) for (p,_) in v3]
        best=(1e9,0.7)
        for a in A_grid:
            s=0.0; n=0
            for i in range(len(v2)):
                y = v2[i][1]
                m = (y == c)
                if not torch.any(m):
                    continue
                a_vec=np.full(C,0.7,np.float32); a_vec[c]=float(a)
                q=blend_geom_perclass(v2c[i], v3c[i], a_vec)
                s += per_frame_nll(q[m], y[m]) * int(m.sum().item()); n += int(m.sum().item())
            if n>0:
                val = s/max(1,n)
                if val<best[0]:
                    best=(val,float(a))
        A[c]=best[1]
    return T2,T3,A

cfg = pd.read_csv('cv_sweep_ce_v2v3_meta_localsrch.csv').sort_values(['worst','mean']).iloc[0].to_dict() if Path('cv_sweep_ce_v2v3_meta_localsrch.csv').exists() else {'pool_k':15,'temp':0.90,'gamma':0.90,'sep':2,'beta':0.5,'lambda_ord':0.03,'lambda_len':0.15}
pool_k=int(cfg['pool_k']); temp=float(cfg['temp']); gamma=float(cfg.get('gamma',0.90)); sep=int(cfg['sep']); beta=float(cfg.get('beta',0.5)); l_ord=float(cfg.get('lambda_ord',0.03)); l_len=float(cfg.get('lambda_len',0.15))
print('Chosen localsrch-meta cfg:', {'pool_k':pool_k,'temp':temp,'gamma':gamma,'sep':sep,'beta':beta,'lambda_ord':l_ord,'lambda_len':l_len}, flush=True)

print('Refitting T2/T3/alpha on ALL OOF for test...', flush=True)
T2_all, T3_all, A_all = refit_on_all()
Path('calib_all_v2v3_meta.json').write_text(json.dumps({'T2':T2_all.tolist(),'T3':T3_all.tolist(),'A':A_all.tolist()}))

# Test-time inference: reuse stack inference from previous cells
feat_v2_tr = Path('features3d_v2')/'train'; feat_v3_tr = Path('features3d_v3')/'train'
class DilatedResBlock(nn.Module):
    def __init__(self, ch, dilation, drop=0.35, groups=8, k=3):
        super().__init__(); self.conv1=nn.Conv1d(ch,ch,k,padding=dilation,dilation=dilation); self.gn1=nn.GroupNorm(groups,ch); self.drop=nn.Dropout(drop); self.conv2=nn.Conv1d(ch,ch,1); self.gn2=nn.GroupNorm(groups,ch)
    def forward(self,x):
        h=self.conv1(x); h=self.gn1(h); h=F.relu(h, inplace=True); h=self.drop(h); h=self.conv2(h); h=self.gn2(h); h=F.relu(h,inplace=True); return x+h
class DilatedTCN(nn.Module):
    def __init__(self, d_in, channels=128, layers=12, num_classes=21, dropout=0.35):
        super().__init__(); self.inp=nn.Conv1d(d_in,channels,1); blks=[]; dil=1
        for _ in range(layers): blks.append(DilatedResBlock(channels,dil,drop=dropout,groups=8,k=3)); dil=min(dil*2,512)
        self.blocks=nn.ModuleList(blks); self.head=nn.Conv1d(channels, num_classes, 1)
    def forward(self, x_b_t_d):
        x = x_b_t_d.transpose(1,2)
        h = self.inp(x)
        for b in self.blocks:
            h = b(h)
        out = self.head(h)
        return out.transpose(1,2)

def compute_fold_scaler_from_dir(id_list, feat_dir: Path):
    n=0; mean=None; M2=None
    for sid in id_list:
        X=np.load(feat_dir/f"{int(sid)}.npz")["X"].astype(np.float32); n_i=X.shape[0]
        if mean is None: mean=X.mean(axis=0); M2=((X-mean)**2).sum(axis=0); n=n_i
        else:
            mean_i=X.mean(axis=0); n_new=n+n_i; delta=mean_i-mean; mean=mean+delta*(n_i/max(1,n_new)); M2=M2+((X-mean_i)**2).sum(axis=0)+(delta**2)*(n*n_i/max(1,n_new)); n=n_new
    var=M2/max(1,(n-1)); std=np.sqrt(np.clip(var,1e-8,None)); return mean.astype(np.float32), std.astype(np.float32)
def apply_tta_timewarp(p_t_c: torch.Tensor, factors=(0.9,1.0,1.1)) -> torch.Tensor:
    acc=None
    for s in factors:
        T,C = p_t_c.shape; tgt=max(1,int(round(T*s))); x=p_t_c.T.unsqueeze(0); y=F.interpolate(x,size=tgt,mode='linear',align_corners=False); y2=F.interpolate(y,size=T,mode='linear',align_corners=False)[0].T; y2=y2/(y2.sum(dim=-1,keepdim=True)+1e-8); acc=y2 if acc is None else (acc+y2)
    out=acc/float(len(factors)); return out/(out.sum(dim=-1,keepdim=True)+1e-8)
def infer_probs_for_sid_from_stack(sid:int, feat_tr_dir: Path, model_prefix: str):
    X=np.load((feat_tr_dir.parent/'test'/f"{sid}.npz"))['X'].astype(np.float32); acc=None
    with torch.no_grad(), torch.amp.autocast('cuda' if device.type=='cuda' else 'cpu'):
        for fi in range(3):
            mean,std = compute_fold_scaler_from_dir(folds[fi]['train_ids'], feat_tr_dir)
            mean_t=torch.from_numpy(mean).float().to(device); std_t=torch.from_numpy(std).float().to(device)
            D_in=np.load(next(iter((feat_tr_dir).glob('*.npz'))))['X'].shape[1]
            for s in (0,1):
                ckpt=Path(f"{model_prefix}{fi}{'_s1' if s==1 else ''}.pth")
                if not ckpt.exists(): continue
                m=DilatedTCN(d_in=D_in, channels=128, layers=12, num_classes=21, dropout=0.35).to(device);
                m.load_state_dict(torch.load(ckpt, map_location=device)); m.eval()
                xb=torch.from_numpy(X).float().to(device); xb=(xb - mean_t)/(std_t+1e-6); xb=xb.unsqueeze(0)
                p=m(xb)[0].softmax(dim=-1); p=apply_tta_timewarp(p, factors=(0.9,1.0,1.1)); acc=p if acc is None else (acc+p); del m
    probs=acc/float(6); return probs/(probs.sum(dim=-1,keepdim=True)+1e-8)

print('Building v2+v3 meta-blend with localsrch+reinsertion submission...', flush=True)
med_k_all = compute_class_median_durations_for_ids(train_df['Id'].astype(int).tolist())
test_ids = pd.read_csv('test.csv')['Id'].astype(int).tolist()
rows=[]; t0=time.time()
for i,sid in enumerate(test_ids,1):
    sid=int(sid)
    p2 = infer_probs_for_sid_from_stack(sid, Path('features3d_v2')/'train', 'model_ce_fold')
    p3 = infer_probs_for_sid_from_stack(sid, Path('features3d_v3')/'train', 'model_ce_v3_fold')
    q2 = apply_per_class_temps(p2, T2_all); q3 = apply_per_class_temps(p3, T3_all)
    q  = blend_geom_perclass(q2, q3, A_all)
    Tlen=q.shape[0]; g_eff=gamma_with_length(gamma, Tlen, med_k_all)
    seq = decode_localsrch_meta(q, med_k=med_k_all, gamma=g_eff, pool_k=pool_k, temp=temp, min_sep=sep, beta=beta, lambda_ord=l_ord, lambda_len=l_len)
    rows.append({'Id': sid, 'Sequence': ' '.join(str(x) for x in seq)})
    if (i%10)==0 or i==len(test_ids):
        print(f"  [infer v2v3 meta-localsrch] {i}/{len(test_ids)} elapsed={(time.time()-t0)/60:.1f}m", flush=True)
sub=pd.DataFrame(rows, columns=['Id','Sequence'])
assert len(sub)==95
assert all(len(s.split())==20 and len(set(s.split()))==20 and all(1<=int(t)<=20 for t in s.split()) for s in sub.Sequence), 'Submission format invalid'
sub.to_csv('submission_primary_ce_v2v3_meta_localsrch.csv', index=False)
sub.to_csv('submission.csv', index=False)
print('Wrote submission_primary_ce_v2v3_meta_localsrch.csv and submission.csv; head:\n', sub.head(), flush=True)

Fitting per-class T2/T3/alpha per fold (fold-safe) for local-search...


Sweeping localsrch (reinsertion) on v2+v3 calibrated OOF...


Top localsrch-meta (mean,worst,cfg):
(4.290319521748093, 5.02, {'pool_k': 15, 'temp': 0.9, 'gamma': 0.9, 'sep': 3, 'beta': 0.4, 'lambda_ord': 0.03, 'lambda_len': 0.25})
(4.293652855081427, 5.03, {'pool_k': 15, 'temp': 0.9, 'gamma': 0.9, 'sep': 3, 'beta': 0.4, 'lambda_ord': 0.03, 'lambda_len': 0.15})
(4.293652855081427, 5.03, {'pool_k': 15, 'temp': 0.9, 'gamma': 0.95, 'sep': 2, 'beta': 0.4, 'lambda_ord': 0.03, 'lambda_len': 0.15})
(4.293652855081427, 5.03, {'pool_k': 15, 'temp': 0.9, 'gamma': 0.95, 'sep': 2, 'beta': 0.4, 'lambda_ord': 0.03, 'lambda_len': 0.25})
(4.293652855081427, 5.03, {'pool_k': 15, 'temp': 0.9, 'gamma': 0.95, 'sep': 3, 'beta': 0.4, 'lambda_ord': 0.03, 'lambda_len': 0.15})
Saved cv_sweep_ce_v2v3_meta_localsrch.csv


Chosen localsrch-meta cfg: {'pool_k': 15, 'temp': 0.9, 'gamma': 0.9, 'sep': 3, 'beta': 0.4, 'lambda_ord': 0.03, 'lambda_len': 0.25}


Refitting T2/T3/alpha on ALL OOF for test...


Building v2+v3 meta-blend with localsrch+reinsertion submission...


  m.load_state_dict(torch.load(ckpt, map_location=device)); m.eval()


  [infer v2v3 meta-localsrch] 10/95 elapsed=2.9m


  [infer v2v3 meta-localsrch] 20/95 elapsed=5.8m


  [infer v2v3 meta-localsrch] 30/95 elapsed=8.6m


  [infer v2v3 meta-localsrch] 40/95 elapsed=11.5m


  [infer v2v3 meta-localsrch] 50/95 elapsed=14.3m


  [infer v2v3 meta-localsrch] 60/95 elapsed=17.2m


  [infer v2v3 meta-localsrch] 70/95 elapsed=20.1m


  [infer v2v3 meta-localsrch] 80/95 elapsed=23.0m


  [infer v2v3 meta-localsrch] 90/95 elapsed=25.8m


  [infer v2v3 meta-localsrch] 95/95 elapsed=27.3m


Wrote submission_primary_ce_v2v3_meta_localsrch.csv and submission.csv; head:
     Id                                           Sequence
0  300  5 9 1 2 18 3 8 4 20 13 12 15 14 11 6 16 19 7 1...
1  301  10 12 1 5 4 20 6 2 11 15 13 19 7 9 8 18 14 3 1...
2  302  1 17 16 12 5 19 7 13 20 18 11 3 4 6 15 8 14 10...
3  303  13 4 12 10 5 19 15 20 17 11 16 8 18 7 3 1 6 2 ...
4  304  8 1 12 14 18 13 9 7 2 11 3 20 19 5 10 6 15 17 ...
