In [2]:
# Entropy-adaptive PoE fusion using cached probabilities; OOF tune and test submission
import numpy as np, pandas as pd, json, time, os
from pathlib import Path
from collections import defaultdict

probs_cache = Path('probs_cache')
labels_dir = Path('labels3d_v2/train')

# Load fold splits
folds = json.load(open('folds_archive_cv.json','r'))
all_train_ids = sorted({int(x) for fd in folds for x in fd['train_ids']})

# Calibration for skeleton v2/v3 blend
calib = json.load(open('calib_all_v2v3_meta.json','r'))
T2 = np.array(calib['T2'], dtype=np.float32)
T3 = np.array(calib['T3'], dtype=np.float32)
A  = np.array(calib.get('A', [0.7]*len(T2)), dtype=np.float32)  # per-class weight for v2

def temp_scale(p, T):
    T = np.asarray(T, dtype=np.float32).reshape(-1)
    p = np.clip(p, 1e-8, 1.0)
    logp = np.log(p)
    if p.shape[0] == T.shape[0]:
        logp = logp / np.maximum(T[:, None], 1e-6)
        q = np.exp(logp); q /= (q.sum(axis=0, keepdims=True) + 1e-8)
        return q.astype(np.float32)
    elif p.shape[-1] == T.shape[0]:
        logp = logp / np.maximum(T[None, :], 1e-6)
        q = np.exp(logp); q /= (q.sum(axis=1, keepdims=True) + 1e-8)
        return q.T.astype(np.float32)
    else:
        raise ValueError('T length mismatch')

def ensure_CxT(p, C=21):
    if p.shape[0] == C: return p
    if p.shape[1] == C: return p.T
    raise ValueError('Bad probs shape')

def load_skeleton_probs(seq_id: int) -> np.ndarray:
    p2 = np.load(probs_cache/f"{seq_id}_ce.npy").astype(np.float32)
    p3 = np.load(probs_cache/f"{seq_id}_ce_v3.npy").astype(np.float32)
    p2 = ensure_CxT(temp_scale(p2, T2))
    p3 = ensure_CxT(temp_scale(p3, T3))
    Tm = min(p2.shape[1], p3.shape[1])
    p2 = p2[:, :Tm]; p3 = p3[:, :Tm]
    a = A.reshape(-1,1)
    p = a*p2 + (1.0-a)*p3
    p /= (p.sum(axis=0, keepdims=True) + 1e-8)
    return p.astype(np.float32)

def load_rgb_probs(seq_id: int) -> np.ndarray | None:
    pth = probs_cache/f"{seq_id}_rgb.npy"
    if not pth.exists(): return None
    p = np.load(pth).astype(np.float32)
    return ensure_CxT(p)

def entropy(p: np.ndarray) -> np.ndarray:
    q = np.clip(p, 1e-8, 1.0)
    return -np.sum(q*np.log(q), axis=0)

def align_by_entropy_corr(p_src: np.ndarray, p_ref: np.ndarray, max_shift: int = 15):
    # Align p_src to p_ref by maximizing entropy correlation.
    # Returns (src_aligned, ref_cropped) with equal time length.
    hs = entropy(p_src); hr = entropy(p_ref)
    best = (-1e9, 0)
    for sh in range(-max_shift, max_shift+1):
        if sh >= 0:
            L = min(hs.shape[0] - sh, hr.shape[0])
            if L < 16: continue
            s = hs[sh:sh+L]; r = hr[:L]
        else:
            L = min(hs.shape[0], hr.shape[0] + sh)  # sh negative
            if L < 16: continue
            s = hs[:L]; r = hr[-sh:-sh+L]
        # robust corr (handle constant segments)
        if s.std() < 1e-8 or r.std() < 1e-8:
            corr = -1.0
        else:
            corr = float(np.corrcoef(s, r)[0,1])
        if corr > best[0]: best = (corr, sh)
    sh = best[1]
    if sh >= 0:
        L = min(p_src.shape[1] - sh, p_ref.shape[1])
        return p_src[:, sh:sh+L], p_ref[:, :L]
    else:
        L = min(p_src.shape[1], p_ref.shape[1] + sh)
        return p_src[:, :L], p_ref[:, -sh:-sh+L]

def smooth_probs_box(p: np.ndarray, k: int = 5) -> np.ndarray:
    if k<=1: return p
    C,T = p.shape
    pad = k//2
    x = np.pad(p, ((0,0),(pad,pad)), mode='edge')
    cs = np.cumsum(x, axis=1, dtype=np.float64)
    out = (cs[:, k:] - cs[:, :-k]) / k
    out = out.astype(np.float32)
    out /= (out.sum(axis=0, keepdims=True) + 1e-8)
    return out

def decode_minseg(p: np.ndarray, min_dur: np.ndarray) -> np.ndarray:
    y = p.argmax(axis=0).astype(np.int32)
    T = y.shape[0]; i=0
    while i < T:
        c = y[i]; j=i+1
        while j<T and y[j]==c: j+=1
        L = j-i
        if c!=0 and L < int(min_dur[c]):
            lc = y[i-1] if i>0 else None
            rc = y[j] if j<T else None
            ls = float(p[lc, i:j].mean()) if lc is not None else -1e9
            rs = float(p[rc, i:j].mean()) if rc is not None else -1e9
            if rs >= ls: y[i:j] = rc if rc is not None else 0
            else:        y[i:j] = lc if lc is not None else 0
            i = max(0, i-1); continue
        i = j
    return y

def aba_collapse(y: np.ndarray, max_len: int = 2, ratio: float = 1.04, p: np.ndarray | None = None) -> np.ndarray:
    # collapse short ABA islands
    T = len(y); i=1
    while i < T-1:
        if y[i-1]==y[i+1] and y[i]!=y[i-1]:
            L=1; j=i+1
            while j<T-1 and y[j-1]==y[j+1] and y[j]!=y[j-1]:
                L+=1; j+=1
            if L<=max_len:
                y[i:j] = y[i-1]
                i = max(1, i-1); continue
            i = j
        i+=1
    return y

def compress_to_sequence(y_frames):
    seq=[]; last=-1
    for c in y_frames:
        if c==0: continue
        if c!=last: seq.append(int(c)); last=int(c)
    return seq

def make_perm20(seq_raw, p: np.ndarray):
    # keep first occurrence, then append missing classes by total mass desc
    seen=set(); seq=[]
    for c in seq_raw:
        if 1<=c<=20 and c not in seen:
            seen.add(c); seq.append(c)
    if len(seq)<20:
        masses = [(c, float(p[c].sum())) for c in range(1,21) if c not in seen]
        masses.sort(key=lambda x: x[1], reverse=True)
        for c,_ in masses:
            if len(seq)==20: break
            seq.append(c)
    if len(seq)>20: seq = seq[:20]
    return seq

def segment_lengths(y):
    lens=defaultdict(list); cur=None; run=0
    for c in y:
        if c==0:
            if cur is not None: lens[cur].append(run); cur=None; run=0
            continue
        if cur is None: cur=int(c); run=1
        elif c==cur: run+=1
        else: lens[cur].append(run); cur=int(c); run=1
    if cur is not None: lens[cur].append(run)
    return lens

def compute_min_dur_from_ids(ids):
    agg=defaultdict(list)
    for sid in ids:
        y = np.load(labels_dir/f"{sid}.npy").astype(np.int32)
        for c,ls in segment_lengths(y).items():
            if c!=0: agg[c].extend(ls)
    med = np.zeros(21, dtype=np.float32)
    for c in range(1,21):
        ls = agg.get(c, [])
        med[c] = float(np.median(ls)) if ls else 1.0
    return med

def fuse_entropy_adaptive(ps: np.ndarray, pr: np.ndarray | None, alpha0: float, beta: float, a_min: float, a_max: float) -> np.ndarray:
    if pr is None:
        return ps
    # Align
    pr_a, ps_a = align_by_entropy_corr(pr, ps, max_shift=15)
    # Crop to common
    Tm = min(ps_a.shape[1], pr_a.shape[1])
    ps_a = ps_a[:, :Tm]; pr_a = pr_a[:, :Tm]
    Hs = entropy(ps_a); Hr = entropy(pr_a)
    a_t = alpha0 + beta*(Hr - Hs)
    a_t = np.clip(a_t, a_min, a_max).astype(np.float32)
    logp = (1.0 - a_t)[None,:]*np.log(np.clip(ps_a,1e-8,1.0)) + a_t[None,:]*np.log(np.clip(pr_a,1e-8,1.0))
    q = np.exp(logp); q /= (q.sum(axis=0, keepdims=True) + 1e-8)
    return q.astype(np.float32)

def oof_eval_entropy_adaptive(grid_alpha0, grid_beta, bounds_list, smooth_k=5, min_mult=0.7):
    results = {}
    for fd in folds:
        tr = list(map(int, fd['train_ids'])); va = list(map(int, fd['val_ids']))
        med = compute_min_dur_from_ids(tr); min_dur = np.floor(med*min_mult+0.5).astype(np.int32); min_dur[0]=0
        for a0 in grid_alpha0:
            for b in grid_beta:
                for (a_min, a_max) in bounds_list:
                    key=(a0,b,a_min,a_max,fd['fold'])
                    dists=[]
                    for sid in va:
                        ps = load_skeleton_probs(int(sid))
                        pr = load_rgb_probs(int(sid))
                        pf = fuse_entropy_adaptive(ps, pr, a0, b, a_min, a_max)
                        pf = smooth_probs_box(pf, k=smooth_k)
                        y = decode_minseg(pf, min_dur)
                        y = aba_collapse(y, max_len=2, ratio=1.04, p=pf)
                        seq = compress_to_sequence(y)
                        y_true = np.load(labels_dir/f"{sid}.npy").astype(np.int32)
                        seq_true = compress_to_sequence(y_true)
                        # Levenshtein
                        n=len(seq); m=len(seq_true);
                        if n==0: d=m; dists.append(d); continue
                        dp=list(range(m+1))
                        for i in range(1,n+1):
                            prev=dp[0]; dp[0]=i
                            for j in range(1,m+1):
                                tmp=dp[j]; cost=0 if seq[i-1]==seq_true[j-1] else 1
                                dp[j]=min(dp[j]+1, dp[j-1]+1, prev+cost); prev=tmp
                        dists.append(dp[m])
                    results.setdefault((a0,b,a_min,a_max), []).append(float(np.mean(dists)))
    # summarize by worst-fold then mean
    summary=[]
    for k, arr in results.items():
        worst=max(arr); mean=float(np.mean(arr))
        summary.append((worst, mean, k))
    summary.sort(key=lambda x: (x[0], x[1]))
    return summary

def decode_test_with_best(a0, b, a_min, a_max, smooth_k=5, min_mult=0.7, out_csv='submission_entropy_adapt.csv'):
    med = compute_min_dur_from_ids(all_train_ids); min_dur = np.floor(med*min_mult+0.5).astype(np.int32); min_dur[0]=0
    test_ids = sorted(pd.read_csv('test.csv')['Id'].astype(int).tolist())
    ids=[]; rows=[]; t0=time.time()
    n=0
    for sid in test_ids:
        if not (probs_cache/f"{sid}_ce.npy").exists() or not (probs_cache/f"{sid}_ce_v3.npy").exists():
            continue
        ps = load_skeleton_probs(int(sid))
        pr = load_rgb_probs(int(sid))
        pf = fuse_entropy_adaptive(ps, pr, a0, b, a_min, a_max)
        pf = smooth_probs_box(pf, k=smooth_k)
        y = decode_minseg(pf, min_dur)
        y = aba_collapse(y, max_len=2, ratio=1.04, p=pf)
        seq_raw = compress_to_sequence(y)
        seq = make_perm20(seq_raw, pf)
        ids.append(sid); rows.append(' '.join(map(str, seq))); n+=1
        if (n%20)==0 or n==95: print(f'Decoded {n}/95 elapsed={time.time()-t0:.1f}s', flush=True)
    sub = pd.DataFrame({'Id': ids, 'Sequence': rows}).sort_values('Id')
    sub.to_csv(out_csv, index=False); print('Wrote', out_csv, 'rows=', len(sub))
    assert len(sub)==95, f'Expected 95 rows, got {len(sub)}'
    sub.to_csv('submission.csv', index=False); print('submission.csv written ->', out_csv)

# Grid and run
grid_alpha0 = [0.24, 0.26]
grid_beta   = [0.15, 0.30]
bounds_list = [(0.15, 0.40), (0.20, 0.45)]
print('OOF tuning entropy-adaptive fusion...', flush=True)
summary = oof_eval_entropy_adaptive(grid_alpha0, grid_beta, bounds_list, smooth_k=5, min_mult=0.7)
best = summary[0]
print('Best (worst,mean,params)=', best[:2], best[2])
a0,b,a_min,a_max = best[2]
out_csv = f"submission_entropy_adapt_a{str(a0).replace('.','')}_b{str(b).replace('.','')}_l{str(a_min).replace('.','')}_u{str(a_max).replace('.','')}.csv"
print('Decoding TEST with best params...', flush=True)
decode_test_with_best(a0, b, a_min, a_max, smooth_k=5, min_mult=0.7, out_csv=out_csv)

OOF tuning entropy-adaptive fusion...


Best (worst,mean,params)= (4.69, 4.055534254105683) (0.24, 0.15, 0.15, 0.4)
Decoding TEST with best params...


Decoded 20/95 elapsed=0.2s


Decoded 40/95 elapsed=0.3s


Decoded 60/95 elapsed=0.5s


Decoded 80/95 elapsed=0.6s


Decoded 95/95 elapsed=0.7s


Wrote submission_entropy_adapt_a024_b015_l015_u04.csv rows= 95
submission.csv written -> submission_entropy_adapt_a024_b015_l015_u04.csv


In [28]:
# Restore best-known baseline submission
import shutil, os
src = 'submission_fused_rgb_audio_g025.csv'
dst = 'submission.csv'
assert os.path.exists(src), f'Missing {src}'
shutil.copyfile(src, dst)
print(f'Restored baseline submission: {src} -> {dst}')

Restored baseline submission: submission_fused_rgb_audio_g025.csv -> submission.csv


In [5]:
# Depth+User visual hedge: average visual (rgb/depth/user), PoE with skeleton+audio; small OOF grid and test decode
import numpy as np, pandas as pd, json, time, os
from pathlib import Path
from collections import defaultdict

probs_cache = Path('probs_cache')
labels_dir = Path('labels3d_v2/train')

# Reuse folds and skeleton calibration from cell 0 if available; otherwise reload
try:
    folds
except NameError:
    folds = json.load(open('folds_archive_cv.json','r'))
try:
    T2
except NameError:
    calib = json.load(open('calib_all_v2v3_meta.json','r'))
    T2 = np.array(calib['T2'], dtype=np.float32)
    T3 = np.array(calib['T3'], dtype=np.float32)
    A  = np.array(calib.get('A', [0.7]*len(T2)), dtype=np.float32)

def ensure_CxT(p, C=21):
    if p is None: return None
    if p.ndim==2 and p.shape[0]==C: return p
    if p.ndim==2 and p.shape[1]==C: return p.T
    raise ValueError('Bad probs shape')

def load_skeleton_probs(seq_id: int) -> np.ndarray:
    p2 = np.load(probs_cache/f"{seq_id}_ce.npy").astype(np.float32)
    p3 = np.load(probs_cache/f"{seq_id}_ce_v3.npy").astype(np.float32)
    # temp_scale already applied in upstream pipelines; calib here was used previously; trust cached probs to be calibrated
    p2 = ensure_CxT(p2); p3 = ensure_CxT(p3)
    Tm = min(p2.shape[1], p3.shape[1])
    p2 = p2[:, :Tm]; p3 = p3[:, :Tm]
    a = A.reshape(-1,1).astype(np.float32)
    p = a*p2 + (1.0-a)*p3
    p /= (p.sum(axis=0, keepdims=True) + 1e-8)
    return p.astype(np.float32)

def load_probs_generic(seq_id: int, suffix: str) -> np.ndarray | None:
    pth = probs_cache/f"{seq_id}_{suffix}.npy"
    if not pth.exists(): return None
    p = np.load(pth).astype(np.float32)
    return ensure_CxT(p)

def entropy(p: np.ndarray) -> np.ndarray:
    q = np.clip(p, 1e-8, 1.0)
    return -np.sum(q*np.log(q), axis=0)

def align_by_entropy_corr(p_src: np.ndarray, p_ref: np.ndarray, max_shift: int = 15):
    hs = entropy(p_src); hr = entropy(p_ref)
    best = (-1e9, 0)
    for sh in range(-max_shift, max_shift+1):
        if sh >= 0:
            L = min(hs.shape[0] - sh, hr.shape[0])
            if L < 16: continue
            s = hs[sh:sh+L]; r = hr[:L]
        else:
            L = min(hs.shape[0], hr.shape[0] + sh)
            if L < 16: continue
            s = hs[:L]; r = hr[-sh:-sh+L]
        if s.std() < 1e-8 or r.std() < 1e-8:
            corr = -1.0
        else:
            corr = float(np.corrcoef(s, r)[0,1])
        if corr > best[0]: best = (corr, sh)
    sh = best[1]
    if sh >= 0:
        L = min(p_src.shape[1] - sh, p_ref.shape[1])
        return p_src[:, sh:sh+L], p_ref[:, :L]
    else:
        L = min(p_src.shape[1], p_ref.shape[1] + sh)
        return p_src[:, :L], p_ref[:, -sh:-sh+L]

def smooth_probs_box(p: np.ndarray, k: int = 5) -> np.ndarray:
    if k<=1: return p
    C,T = p.shape
    pad = k//2
    x = np.pad(p, ((0,0),(pad,pad)), mode='edge')
    cs = np.cumsum(x, axis=1, dtype=np.float64)
    out = (cs[:, k:] - cs[:, :-k]) / k
    out = out.astype(np.float32)
    out /= (out.sum(axis=0, keepdims=True) + 1e-8)
    return out

def decode_minseg(p: np.ndarray, min_dur: np.ndarray) -> np.ndarray:
    y = p.argmax(axis=0).astype(np.int32)
    T = y.shape[0]; i=0
    while i < T:
        c = y[i]; j=i+1
        while j<T and y[j]==c: j+=1
        L = j-i
        if c!=0 and L < int(min_dur[c]):
            lc = y[i-1] if i>0 else None
            rc = y[j] if j<T else None
            ls = float(p[lc, i:j].mean()) if lc is not None else -1e9
            rs = float(p[rc, i:j].mean()) if rc is not None else -1e9
            if rs >= ls: y[i:j] = rc if rc is not None else 0
            else:        y[i:j] = lc if lc is not None else 0
            i = max(0, i-1); continue
        i = j
    return y

def aba_collapse(y: np.ndarray, max_len: int = 2, ratio: float = 1.04, p: np.ndarray | None = None) -> np.ndarray:
    T = len(y); i=1
    while i < T-1:
        if y[i-1]==y[i+1] and y[i]!=y[i-1]:
            L=1; j=i+1
            while j<T-1 and y[j-1]==y[j+1] and y[j]!=y[j-1]:
                L+=1; j+=1
            if L<=max_len:
                y[i:j] = y[i-1]
                i = max(1, i-1); continue
            i = j
        i+=1
    return y

def compress_to_sequence(y_frames):
    seq=[]; last=-1
    for c in y_frames:
        if c==0: continue
        if c!=last: seq.append(int(c)); last=int(c)
    return seq

def make_perm20(seq_raw, p: np.ndarray):
    seen=set(); seq=[]
    for c in seq_raw:
        if 1<=c<=20 and c not in seen:
            seen.add(c); seq.append(c)
    if len(seq)<20:
        masses = [(c, float(p[c].sum())) for c in range(1,21) if c not in seen]
        masses.sort(key=lambda x: x[1], reverse=True)
        for c,_ in masses:
            if len(seq)==20: break
            seq.append(c)
    if len(seq)>20: seq = seq[:20]
    return seq

def segment_lengths(y):
    lens=defaultdict(list); cur=None; run=0
    for c in y:
        if c==0:
            if cur is not None: lens[cur].append(run); cur=None; run=0
            continue
        if cur is None: cur=int(c); run=1
        elif c==cur: run+=1
        else: lens[cur].append(run); cur=int(c); run=1
    if cur is not None: lens[cur].append(run)
    return lens

def compute_min_dur_from_ids(ids):
    agg=defaultdict(list)
    for sid in ids:
        y = np.load(labels_dir/f"{sid}.npy").astype(np.int32)
        for c,ls in segment_lengths(y).items():
            if c!=0: agg[c].extend(ls)
    med = np.zeros(21, dtype=np.float32)
    for c in range(1,21):
        ls = agg.get(c, [])
        med[c] = float(np.median(ls)) if ls else 1.0
    return med

def visual_average_aligned(ps: np.ndarray, pr: np.ndarray | None, pdepth: np.ndarray | None, pu: np.ndarray | None) -> np.ndarray | None:
    streams=[]
    for pv in (pr, pdepth, pu):
        if pv is None: continue
        pv_a, ps_a = align_by_entropy_corr(pv, ps, max_shift=15)
        # crop to common length with skeleton-aligned reference
        Tm = min(pv_a.shape[1], ps_a.shape[1])
        streams.append(pv_a[:, :Tm])
    if not streams: return None
    # average and renormalize
    L = min(s.shape[1] for s in streams)
    streams = [s[:, :L] for s in streams]
    v = np.mean(streams, axis=0)
    v /= (v.sum(axis=0, keepdims=True) + 1e-8)
    return v.astype(np.float32)

def fuse_poe(ps: np.ndarray, pvis: np.ndarray | None, pa: np.ndarray | None, alpha_vis: float, gamma_a: float) -> np.ndarray:
    # Align each non-skeleton to skeleton independently, crop to common T
    parts = [ps]
    if pvis is not None:
        pvis_a, ps_a = align_by_entropy_corr(pvis, ps, max_shift=15)
        parts.append(pvis_a); ps = ps_a
    if pa is not None:
        pa_a, ps_a2 = align_by_entropy_corr(pa, ps, max_shift=15)
        parts.append(pa_a); ps = ps_a2
    Tm = min(p.shape[1] for p in parts)
    ps = ps[:, :Tm]
    pvis_c = parts[1][:, :Tm] if (pvis is not None) else None
    pa_c   = parts[-1][:Tm] if False else None  # placeholder to keep lints calm
    pa_c = parts[-1][:, :Tm] if (pa is not None) else None
    w_s = max(0.0, 1.0 - (alpha_vis if pvis is not None else 0.0) - (gamma_a if pa is not None else 0.0))
    logp = w_s*np.log(np.clip(ps,1e-8,1.0))
    if pvis is not None:
        logp += alpha_vis*np.log(np.clip(pvis_c,1e-8,1.0))
    if pa is not None:
        logp += gamma_a*np.log(np.clip(pa_c,1e-8,1.0))
    q = np.exp(logp); q /= (q.sum(axis=0, keepdims=True) + 1e-8)
    return q.astype(np.float32)

def oof_eval_visual_audio_hedge(alpha_list, gamma_list, smooth_k=5, min_mult=0.7):
    results = {}
    for fd in folds:
        tr = list(map(int, fd['train_ids'])); va = list(map(int, fd['val_ids']))
        med = compute_min_dur_from_ids(tr); min_dur = np.floor(med*min_mult+0.5).astype(np.int32); min_dur[0]=0
        for av in alpha_list:
            for ga in gamma_list:
                key=(av,ga,fd['fold'])
                dists=[]
                for sid in va:
                    ps = load_skeleton_probs(int(sid))
                    pr = load_probs_generic(int(sid), 'rgb')
                    pdepth = load_probs_generic(int(sid), 'depth')
                    pu = load_probs_generic(int(sid), 'user')
                    pa = load_probs_generic(int(sid), 'audio')
                    pvis = visual_average_aligned(ps, pr, pdepth, pu)
                    pf = fuse_poe(ps, pvis, pa, av, ga)
                    pf = smooth_probs_box(pf, k=smooth_k)
                    y = decode_minseg(pf, min_dur)
                    y = aba_collapse(y, max_len=2, ratio=1.04, p=pf)
                    seq = compress_to_sequence(y)
                    y_true = np.load(labels_dir/f"{sid}.npy").astype(np.int32)
                    seq_true = compress_to_sequence(y_true)
                    # Levenshtein
                    n=len(seq); m=len(seq_true);
                    if n==0: d=m; dists.append(d); continue
                    dp=list(range(m+1))
                    for i in range(1,n+1):
                        prev=dp[0]; dp[0]=i
                        for j in range(1,m+1):
                            tmp=dp[j]; cost=0 if seq[i-1]==seq_true[j-1] else 1
                            dp[j]=min(dp[j]+1, dp[j-1]+1, prev+cost); prev=tmp
                    dists.append(dp[m])
                results.setdefault((av,ga), []).append(float(np.mean(dists)))
    summary=[]
    for k, arr in results.items():
        worst=max(arr); mean=float(np.mean(arr))
        summary.append((worst, mean, k))
    summary.sort(key=lambda x: (x[0], x[1]))
    return summary

def decode_test_visual_audio_hedge(av, ga, smooth_k=5, min_mult=0.7, out_csv='submission_va_hedge.csv'):
    all_train_ids = sorted({int(x) for fd in folds for x in fd['train_ids']})
    med = compute_min_dur_from_ids(all_train_ids); min_dur = np.floor(med*min_mult+0.5).astype(np.int32); min_dur[0]=0
    test_ids = sorted(pd.read_csv('test.csv')['Id'].astype(int).tolist())
    ids=[]; rows=[]; t0=time.time(); n=0
    for sid in test_ids:
        if not (probs_cache/f"{sid}_ce.npy").exists() or not (probs_cache/f"{sid}_ce_v3.npy").exists():
            continue
        ps = load_skeleton_probs(int(sid))
        pr = load_probs_generic(int(sid), 'rgb')
        pdepth = load_probs_generic(int(sid), 'depth')
        pu = load_probs_generic(int(sid), 'user')
        pa = load_probs_generic(int(sid), 'audio')
        pvis = visual_average_aligned(ps, pr, pdepth, pu)
        pf = fuse_poe(ps, pvis, pa, av, ga)
        pf = smooth_probs_box(pf, k=smooth_k)
        y = decode_minseg(pf, min_dur)
        y = aba_collapse(y, max_len=2, ratio=1.04, p=pf)
        seq_raw = compress_to_sequence(y)
        seq = make_perm20(seq_raw, pf)
        ids.append(sid); rows.append(' '.join(map(str, seq))); n+=1
        if (n%20)==0 or n==95: print(f'Decoded {n}/95 elapsed={time.time()-t0:.1f}s', flush=True)
    sub = pd.DataFrame({'Id': ids, 'Sequence': rows}).sort_values('Id')
    sub.to_csv(out_csv, index=False); print('Wrote', out_csv, 'rows=', len(sub))
    assert len(sub)==95, f'Expected 95 rows, got {len(sub)}'
    sub.to_csv('submission.csv', index=False); print('submission.csv written ->', out_csv)

# Run small grid
alpha_list = [0.20, 0.25, 0.30]
gamma_list = [0.20, 0.25, 0.30]
print('OOF tuning visual+audio hedge...', flush=True)
summary = oof_eval_visual_audio_hedge(alpha_list, gamma_list, smooth_k=5, min_mult=0.7)
best = summary[0]
print('Best (worst,mean,params)=', best[:2], best[2])
av,ga = best[2]
out_csv = f"submission_visavg_poe_av{str(av).replace('.','')}_ga{str(ga).replace('.','')}.csv"
print('Decoding TEST with best params...', flush=True)
decode_test_visual_audio_hedge(av, ga, smooth_k=5, min_mult=0.7, out_csv=out_csv)

OOF tuning visual+audio hedge...


Best (worst,mean,params)= (4.57, 3.8699285370713947) (0.25, 0.2)
Decoding TEST with best params...


Decoded 20/95 elapsed=0.3s


Decoded 40/95 elapsed=0.7s


Decoded 60/95 elapsed=1.0s


Decoded 80/95 elapsed=1.3s


Decoded 95/95 elapsed=1.5s


Wrote submission_visavg_poe_av025_ga02.csv rows= 95
submission.csv written -> submission_visavg_poe_av025_ga02.csv


In [9]:
# Segmental Viterbi/HSMM decoder with duration and sparse transitions; OOF tiny grid and test decode
import numpy as np, pandas as pd, json, time, os
from pathlib import Path
from collections import defaultdict

probs_cache = Path('probs_cache')
labels_dir = Path('labels3d_v2/train')

try:
    folds
except NameError:
    folds = json.load(open('folds_archive_cv.json','r'))

# Utilities reused from previous cells if present; define minimal fallbacks
def ensure_CxT(p, C=21):
    if p is None: return None
    if p.ndim==2 and p.shape[0]==C: return p
    if p.ndim==2 and p.shape[1]==C: return p.T
    raise ValueError('Bad probs shape')

def load_skeleton_probs(seq_id: int) -> np.ndarray:
    p2 = np.load(probs_cache/f"{seq_id}_ce.npy").astype(np.float32)
    p3 = np.load(probs_cache/f"{seq_id}_ce_v3.npy").astype(np.float32)
    p2 = ensure_CxT(p2); p3 = ensure_CxT(p3)
    Tm = min(p2.shape[1], p3.shape[1])
    p2 = p2[:, :Tm]; p3 = p3[:, :Tm]
    # Per-class blend weight A loaded in other cells; fall back to 0.7 if missing
    try:
        a = A.reshape(-1,1).astype(np.float32)
    except NameError:
        a = np.full((21,1), 0.7, dtype=np.float32)
    p = a*p2 + (1.0-a)*p3
    p /= (p.sum(axis=0, keepdims=True) + 1e-8)
    return p.astype(np.float32)

def load_probs_generic(seq_id: int, suffix: str) -> np.ndarray | None:
    pth = probs_cache/f"{seq_id}_{suffix}.npy"
    if not pth.exists(): return None
    p = np.load(pth).astype(np.float32)
    return ensure_CxT(p)

def entropy(p: np.ndarray) -> np.ndarray:
    q = np.clip(p, 1e-8, 1.0)
    return -np.sum(q*np.log(q), axis=0)

def align_by_entropy_corr(p_src: np.ndarray, p_ref: np.ndarray, max_shift: int = 15):
    hs = entropy(p_src); hr = entropy(p_ref)
    best = (-1e9, 0)
    for sh in range(-max_shift, max_shift+1):
        if sh >= 0:
            L = min(hs.shape[0] - sh, hr.shape[0])
            if L < 16: continue
            s = hs[sh:sh+L]; r = hr[:L]
        else:
            L = min(hs.shape[0], hr.shape[0] + sh)
            if L < 16: continue
            s = hs[:L]; r = hr[-sh:-sh+L]
        if s.std() < 1e-8 or r.std() < 1e-8:
            corr = -1.0
        else:
            corr = float(np.corrcoef(s, r)[0,1])
        if corr > best[0]: best = (corr, sh)
    sh = best[1]
    if sh >= 0:
        L = min(p_src.shape[1] - sh, p_ref.shape[1])
        return p_src[:, sh:sh+L], p_ref[:, :L]
    else:
        L = min(p_src.shape[1], p_ref.shape[1] + sh)
        return p_src[:, :L], p_ref[:, -sh:-sh+L]

def smooth_probs_box(p: np.ndarray, k: int = 5) -> np.ndarray:
    if k<=1: return p
    C,T = p.shape
    pad = k//2
    x = np.pad(p, ((0,0),(pad,pad)), mode='edge')
    cs = np.cumsum(x, axis=1, dtype=np.float64)
    out = (cs[:, k:] - cs[:, :-k]) / k
    out = out.astype(np.float32)
    out /= (out.sum(axis=0, keepdims=True) + 1e-8)
    return out

# Visual/audio fusion: average visual streams then PoE with skeleton and audio
def visual_average_aligned(ps: np.ndarray, pr: np.ndarray | None, pdepth: np.ndarray | None, pu: np.ndarray | None) -> np.ndarray | None:
    streams=[]
    for pv in (pr, pdepth, pu):
        if pv is None: continue
        pv_a, ps_a = align_by_entropy_corr(pv, ps, max_shift=15)
        Tm = min(pv_a.shape[1], ps_a.shape[1])
        streams.append(pv_a[:, :Tm])
    if not streams: return None
    L = min(s.shape[1] for s in streams)
    streams = [s[:, :L] for s in streams]
    v = np.mean(streams, axis=0)
    v /= (v.sum(axis=0, keepdims=True) + 1e-8)
    return v.astype(np.float32)

def fuse_poe_fixed(ps: np.ndarray, pvis: np.ndarray | None, pa: np.ndarray | None, alpha_vis: float = 0.26, gamma_a: float = 0.25) -> np.ndarray:
    parts = [ps]
    if pvis is not None:
        pvis_a, ps_a = align_by_entropy_corr(pvis, ps, max_shift=15)
        parts.append(pvis_a); ps = ps_a
    if pa is not None:
        pa_a, ps_a2 = align_by_entropy_corr(pa, ps, max_shift=15)
        parts.append(pa_a); ps = ps_a2
    Tm = min(p.shape[1] for p in parts)
    ps = ps[:, :Tm]
    pvis_c = parts[1][:, :Tm] if (pvis is not None) else None
    pa_c = parts[-1][:, :Tm] if (pa is not None) else None
    w_s = max(0.0, 1.0 - (alpha_vis if pvis is not None else 0.0) - (gamma_a if pa is not None else 0.0))
    logp = w_s*np.log(np.clip(ps,1e-8,1.0))
    if pvis is not None: logp += alpha_vis*np.log(np.clip(pvis_c,1e-8,1.0))
    if pa is not None:   logp += gamma_a*np.log(np.clip(pa_c,1e-8,1.0))
    q = np.exp(logp); q /= (q.sum(axis=0, keepdims=True) + 1e-8)
    return q.astype(np.float32)

# Duration priors from labels3d_v2
def segment_lengths(y):
    lens=defaultdict(list); cur=None; run=0
    for c in y:
        if c==0:
            if cur is not None: lens[cur].append(run); cur=None; run=0
            continue
        if cur is None: cur=int(c); run=1
        elif c==cur: run+=1
        else: lens[cur].append(run); cur=int(c); run=1
    if cur is not None: lens[cur].append(run)
    return lens

def compute_duration_pmf(ids, max_dur=150):
    counts = np.ones((21, max_dur+1), dtype=np.float64)  # Laplace +1, index by duration d (1..max_dur)
    for sid in ids:
        y = np.load(labels_dir/f"{sid}.npy").astype(np.int32)
        lens = segment_lengths(y)
        for c, arr in lens.items():
            if c==0: continue
            for L in arr:
                d = int(min(L, max_dur))
                counts[c, d] += 1.0
    # class 0: keep Laplace baseline only (uniform-ish after norm)
    pmf = counts / counts.sum(axis=1, keepdims=True)
    log_pmf = -np.log(np.clip(pmf, 1e-12, 1.0))
    return log_pmf.astype(np.float32)

# Transition priors: allow only gesture<->silence and silence->silence
def compute_transition_cost(ids):
    trans = np.full((21,21), 1e-9, dtype=np.float64)  # smoothing
    for sid in ids:
        y = np.load(labels_dir/f"{sid}.npy").astype(np.int32)
        # traverse segments
        cur=None; run=0; segs=[]
        for c in y:
            if cur is None: cur=int(c); run=1
            elif c==cur: run+=1
            else: segs.append(cur); cur=int(c); run=1
        if cur is not None: segs.append(cur)
        for i in range(1, len(segs)):
            a = int(segs[i-1]); b = int(segs[i])
            trans[a, b] += 1.0
    # enforce sparsity: gesture->gesture (i!=j) forbidden by huge cost
    allowed = np.zeros_like(trans, dtype=bool)
    allowed[0, :] = True  # silence to any
    allowed[:, 0] = True  # any to silence
    # keep silence->silence allowed; gestures->gestures disallowed
    probs = trans / trans.sum(axis=1, keepdims=True)
    cost = -np.log(np.clip(probs, 1e-12, 1.0))
    big = 1e6
    for i in range(21):
        for j in range(21):
            if not (allowed[i,j] or (i==j==0)):
                cost[i,j] = big
    return cost.astype(np.float32)

def smooth1d_keep_len(x: np.ndarray, k: int) -> np.ndarray:
    if k<=1: return x.astype(np.float32, copy=False)
    kernel = np.ones(k, dtype=np.float32)/k
    pad = k//2
    xpad = np.pad(x, (pad, pad), mode='edge')
    y = np.convolve(xpad, kernel, mode='valid')  # length T
    return y.astype(np.float32)

# Segmental Viterbi
def seg_viterbi(p: np.ndarray, log_dur: np.ndarray, trans_cost: np.ndarray, w_dur=1.0, w_tr=1.0, max_dur=150, per_class_smooth=False):
    C,T = p.shape
    # optional per-class smoothing (preserve length exactly)
    if per_class_smooth:
        ks = np.clip((0.5*np.ones(C)*7).astype(int), 3, 15)
        ps = np.empty_like(p)
        for c in range(C):
            k = int(ks[c])
            ps[c] = smooth1d_keep_len(p[c], k)
        p = ps; p /= (p.sum(axis=0, keepdims=True)+1e-8)
    # emission prefix sums
    nll = -np.log(np.clip(p, 1e-8, 1.0)).astype(np.float32)
    pref = np.cumsum(nll, axis=1, dtype=np.float64)
    def seg_cost(c, t0, t1):
        if t0<=0: return float(pref[c, t1-1])
        return float(pref[c, t1-1] - pref[c, t0-1])
    # DP arrays
    INF = 1e18
    dp = np.full((C, T+1), INF, dtype=np.float64)  # dp[c, t] best cost ending exactly at t with class c segment
    bp_prev_c = -np.ones((C, T+1), dtype=np.int16)
    bp_prev_t = -np.ones((C, T+1), dtype=np.int32)
    # base: start at t=0 coming from silence
    dp[:,0] = INF; dp[0,0] = 0.0
    # iterate end time t
    for t in range(1, T+1):
        # consider class c for the segment ending at t with duration d
        maxd = min(max_dur, t)
        for c in range(C):
            best_cost = INF; best_prev_c = -1; best_prev_t = -1
            # allowed previous class
            if c==0:
                prev_classes = list(range(C))  # any to silence
            else:
                prev_classes = [0]  # only silence to gesture
            for d in range(1, maxd+1):
                t0 = t - d
                emis = seg_cost(c, t0, t)
                durc = 0.0 if c==0 else float(log_dur[c, min(d, log_dur.shape[1]-1)])
                for pc in prev_classes:
                    prev_cost = dp[pc, t0]
                    if prev_cost >= INF: continue
                    tc = float(trans_cost[pc, c])
                    cost = prev_cost + emis + w_dur*durc + w_tr*tc
                    if cost < best_cost:
                        best_cost = cost; best_prev_c = pc; best_prev_t = t0
            dp[c, t] = best_cost; bp_prev_c[c, t] = best_prev_c; bp_prev_t[c, t] = best_prev_t
    # best ending state at T
    c_end = int(np.argmin(dp[:, T]))
    # backtrack
    segs=[]; c=c_end; t=T
    while t>0 and c>=0:
        t0 = int(bp_prev_t[c, t]); pc = int(bp_prev_c[c, t])
        if t0<0 or pc<0: break
        segs.append((c, t0, t))
        c = pc; t = t0
    segs.reverse()
    # build framewise labels
    y = np.zeros(T, dtype=np.int32)
    for c, t0, t1 in segs:
        if c==0: continue
        y[t0:t1] = int(c)
    return y

# OOF evaluation
def build_fused_probs_for_id(sid: int, alpha_vis=0.26, gamma_a=0.25, smooth_k=5):
    ps = load_skeleton_probs(sid)
    pr = load_probs_generic(sid, 'rgb')
    pdepth = load_probs_generic(sid, 'depth')
    pu = load_probs_generic(sid, 'user')
    pa = load_probs_generic(sid, 'audio')
    pvis = visual_average_aligned(ps, pr, pdepth, pu)
    pf = fuse_poe_fixed(ps, pvis, pa, alpha_vis=alpha_vis, gamma_a=gamma_a)
    pf = smooth_probs_box(pf, k=smooth_k)
    return pf

def lev_dist(a, b):
    n=len(a); m=len(b)
    if n==0: return m
    dp=list(range(m+1))
    for i in range(1,n+1):
        prev=dp[0]; dp[0]=i
        for j in range(1,m+1):
            tmp=dp[j]; cost=0 if a[i-1]==b[j-1] else 1
            dp[j]=min(dp[j]+1, dp[j-1]+1, prev+cost); prev=tmp
    return dp[m]

def oof_eval_hsmm(grid_wdur=(0.5,1.0), grid_wtr=(0.5,1.0), grid_maxd=(100,150), per_class_smooth_opts=(False, True)):
    print('OOF HSMM tuning...', flush=True)
    results = {}
    for fd in folds:
        tr = list(map(int, fd['train_ids'])); va = list(map(int, fd['val_ids']))
        log_dur = compute_duration_pmf(tr, max_dur=max(grid_maxd))
        trans_cost = compute_transition_cost(tr)
        for wd in grid_wdur:
            for wt in grid_wtr:
                for md in grid_maxd:
                    for pcs in per_class_smooth_opts:
                        key=(wd,wt,md,pcs); dists=[]
                        t0=time.time()
                        for sid in va:
                            pf = build_fused_probs_for_id(int(sid), alpha_vis=0.26, gamma_a=0.25, smooth_k=5)
                            y = seg_viterbi(pf, log_dur, trans_cost, w_dur=wd, w_tr=wt, max_dur=md, per_class_smooth=pcs)
                            # post
                            y = aba_collapse(y, max_len=2, ratio=1.04, p=pf)
                            seq = [int(c) for i,c in enumerate(y) if c!=0 and (i==0 or y[i-1]!=c)]
                            y_true = np.load(labels_dir/f"{sid}.npy").astype(np.int32)
                            seq_true = [int(c) for i,c in enumerate(y_true) if c!=0 and (i==0 or y_true[i-1]!=c)]
                            dists.append(lev_dist(seq, seq_true))
                        results.setdefault(key, []).append(float(np.mean(dists)))
                        print(f"fold={fd['fold']} wd={wd} wt={wt} md={md} pcs={pcs} mean={np.mean(dists):.3f} elapsed={time.time()-t0:.1f}s", flush=True)
    summary=[]
    for k, arr in results.items():
        worst=max(arr); mean=float(np.mean(arr))
        summary.append((worst, mean, k))
    summary.sort(key=lambda x: (x[0], x[1]))
    return summary

def decode_test_hsmm(best_params, out_csv='submission_hsmm_poe.csv'):
    wd,wt,md,pcs = best_params
    all_train_ids = sorted({int(x) for fd in folds for x in fd['train_ids']})
    log_dur = compute_duration_pmf(all_train_ids, max_dur=md)
    trans_cost = compute_transition_cost(all_train_ids)
    test_ids = sorted(pd.read_csv('test.csv')['Id'].astype(int).tolist())
    ids=[]; rows=[]; t0=time.time(); n=0
    for sid in test_ids:
        if not (probs_cache/f"{sid}_ce.npy").exists() or not (probs_cache/f"{sid}_ce_v3.npy").exists():
            continue
        pf = build_fused_probs_for_id(int(sid), alpha_vis=0.26, gamma_a=0.25, smooth_k=5)
        y = seg_viterbi(pf, log_dur, trans_cost, w_dur=wd, w_tr=wt, max_dur=md, per_class_smooth=pcs)
        y = aba_collapse(y, max_len=2, ratio=1.04, p=pf)
        seq_raw = [int(c) for i,c in enumerate(y) if c!=0 and (i==0 or y[i-1]!=c)]
        seq = make_perm20(seq_raw, pf)
        ids.append(sid); rows.append(' '.join(map(str, seq))); n+=1
        if (n%20)==0 or n==95: print(f'Decoded {n}/95 elapsed={time.time()-t0:.1f}s', flush=True)
    sub = pd.DataFrame({'Id': ids, 'Sequence': rows}).sort_values('Id')
    sub.to_csv(out_csv, index=False); print('Wrote', out_csv, 'rows=', len(sub))
    assert len(sub)==95, f'Expected 95 rows, got {len(sub)}'
    sub.to_csv('submission.csv', index=False); print('submission.csv written ->', out_csv)

# Run narrowed grid OOF then decode test
summary = oof_eval_hsmm(grid_wdur=(1.0,), grid_wtr=(0.5,), grid_maxd=(100,), per_class_smooth_opts=(False,))
best = summary[0]
print('HSMM Best (worst,mean,params)=', best[:2], best[2])
wd,wt,md,pcs = best[2]
out_csv = f"submission_hsmm_poe_wd{str(wd).replace('.','')}_wt{str(wt).replace('.','')}_md{md}_pcs{int(pcs)}.csv"
print('Decoding TEST with best params...', flush=True)
decode_test_hsmm((wd,wt,md,pcs), out_csv=out_csv)

OOF HSMM tuning...


KeyboardInterrupt: 

In [13]:
# Minimal duration-aware Local Search decoder (fast) with tiny OOF grid and test decode
import numpy as np, pandas as pd, json, time, os
from pathlib import Path
from collections import defaultdict

probs_cache = Path('probs_cache')
labels_dir = Path('labels3d_v2/train')

try:
    folds
except NameError:
    folds = json.load(open('folds_archive_cv.json','r'))

# Reuse utilities from previous cells
def ensure_CxT(p, C=21):
    if p is None: return None
    if p.ndim==2 and p.shape[0]==C: return p
    if p.ndim==2 and p.shape[1]==C: return p.T
    raise ValueError('Bad probs shape')

def load_skeleton_probs(seq_id: int) -> np.ndarray:
    p2 = np.load(probs_cache/f"{seq_id}_ce.npy").astype(np.float32)
    p3 = np.load(probs_cache/f"{seq_id}_ce_v3.npy").astype(np.float32)
    p2 = ensure_CxT(p2); p3 = ensure_CxT(p3)
    Tm = min(p2.shape[1], p3.shape[1])
    p2 = p2[:, :Tm]; p3 = p3[:, :Tm]
    try:
        a = A.reshape(-1,1).astype(np.float32)
    except NameError:
        a = np.full((21,1), 0.7, dtype=np.float32)
    p = a*p2 + (1.0-a)*p3
    p /= (p.sum(axis=0, keepdims=True) + 1e-8)
    return p.astype(np.float32)

def load_probs_generic(seq_id: int, suffix: str) -> np.ndarray | None:
    pth = probs_cache/f"{seq_id}_{suffix}.npy"
    if not pth.exists(): return None
    p = np.load(pth).astype(np.float32)
    return ensure_CxT(p)

def entropy(p: np.ndarray) -> np.ndarray:
    q = np.clip(p, 1e-8, 1.0)
    return -np.sum(q*np.log(q), axis=0)

def align_by_entropy_corr(p_src: np.ndarray, p_ref: np.ndarray, max_shift: int = 15):
    hs = entropy(p_src); hr = entropy(p_ref)
    best = (-1e9, 0)
    for sh in range(-max_shift, max_shift+1):
        if sh >= 0:
            L = min(hs.shape[0] - sh, hr.shape[0])
            if L < 16: continue
            s = hs[sh:sh+L]; r = hr[:L]
        else:
            L = min(hs.shape[0], hr.shape[0] + sh)
            if L < 16: continue
            s = hs[:L]; r = hr[-sh:-sh+L]
        if s.std() < 1e-8 or r.std() < 1e-8:
            corr = -1.0
        else:
            corr = float(np.corrcoef(s, r)[0,1])
        if corr > best[0]: best = (corr, sh)
    sh = best[1]
    if sh >= 0:
        L = min(p_src.shape[1] - sh, p_ref.shape[1])
        return p_src[:, sh:sh+L], p_ref[:, :L]
    else:
        L = min(p_src.shape[1], p_ref.shape[1] + sh)
        return p_src[:, :L], p_ref[:, -sh:-sh+L]

def smooth_probs_box(p: np.ndarray, k: int = 5) -> np.ndarray:
    if k<=1: return p
    C,T = p.shape
    pad = k//2
    x = np.pad(p, ((0,0),(pad,pad)), mode='edge')
    cs = np.cumsum(x, axis=1, dtype=np.float64)
    out = (cs[:, k:] - cs[:, :-k]) / k
    out = out.astype(np.float32)
    out /= (out.sum(axis=0, keepdims=True) + 1e-8)
    return out

def decode_minseg(p: np.ndarray, min_dur: np.ndarray) -> np.ndarray:
    y = p.argmax(axis=0).astype(np.int32)
    T = y.shape[0]; i=0
    while i < T:
        c = y[i]; j=i+1
        while j<T and y[j]==c: j+=1
        L = j-i
        if c!=0 and L < int(min_dur[c]):
            lc = y[i-1] if i>0 else None
            rc = y[j] if j<T else None
            ls = float(p[lc, i:j].mean()) if lc is not None else -1e9
            rs = float(p[rc, i:j].mean()) if rc is not None else -1e9
            if rs >= ls: y[i:j] = rc if rc is not None else 0
            else:        y[i:j] = lc if lc is not None else 0
            i = max(0, i-1); continue
        i = j
    return y

def aba_collapse(y: np.ndarray, max_len: int = 2, ratio: float = 1.04, p: np.ndarray | None = None) -> np.ndarray:
    T = len(y); i=1
    while i < T-1:
        if y[i-1]==y[i+1] and y[i]!=y[i-1]:
            L=1; j=i+1
            while j<T-1 and y[j-1]==y[j+1] and y[j]!=y[j-1]:
                L+=1; j+=1
            if L<=max_len:
                y[i:j] = y[i-1]
                i = max(1, i-1); continue
            i = j
        i+=1
    return y

def compress_to_sequence(y_frames):
    seq=[]; last=-1
    for c in y_frames:
        if c==0: continue
        if c!=last: seq.append(int(c)); last=int(c)
    return seq

def make_perm20(seq_raw, p: np.ndarray):
    seen=set(); seq=[]
    for c in seq_raw:
        if 1<=c<=20 and c not in seen:
            seen.add(c); seq.append(c)
    if len(seq)<20:
        masses = [(c, float(p[c].sum())) for c in range(1,21) if c not in seen]
        masses.sort(key=lambda x: x[1], reverse=True)
        for c,_ in masses:
            if len(seq)==20: break
            seq.append(c)
    if len(seq)>20: seq = seq[:20]
    return seq

def segment_lengths(y):
    lens=defaultdict(list); cur=None; run=0
    for c in y:
        if c==0:
            if cur is not None: lens[cur].append(run); cur=None; run=0
            continue
        if cur is None: cur=int(c); run=1
        elif c==cur: run+=1
        else: lens[cur].append(run); cur=int(c); run=1
    if cur is not None: lens[cur].append(run)
    return lens

def compute_duration_stats(ids):
    agg=defaultdict(list)
    for sid in ids:
        y = np.load(labels_dir/f"{sid}.npy").astype(np.int32)
        for c,ls in segment_lengths(y).items():
            if c!=0: agg[c].extend(ls)
    med = np.zeros(21, dtype=np.float32); q95 = np.zeros(21, dtype=np.float32)
    for c in range(1,21):
        ls = agg.get(c, [])
        if ls:
            arr = np.array(ls, dtype=np.float32)
            med[c] = float(np.median(arr))
            q95[c] = float(np.quantile(arr, 0.95))
        else:
            med[c] = 5.0; q95[c] = 50.0
    q95 = np.clip(q95, 5.0, 150.0)
    return med, q95

# --- Temperature scaling helpers ---
def temp_scale_scalar(p: np.ndarray, T: float) -> np.ndarray:
    if p is None or T is None: return p
    T = float(T)
    logp = np.log(np.clip(p, 1e-8, 1.0)) / max(T, 1e-6)
    q = np.exp(logp); q /= (q.sum(axis=0, keepdims=True)+1e-8)
    return q.astype(np.float32)

def _load_temp_from_json(path: Path) -> float | None:
    if not path.exists(): return None
    try:
        obj = json.load(open(path,'r'))
        if isinstance(obj, dict):
            for k in ('T','temp','temperature'):
                if k in obj: return float(obj[k])
        if isinstance(obj, (int,float)):
            return float(obj)
    except Exception:
        try:
            txt = open(path,'r').read().strip()
            return float(txt)
        except Exception:
            return None
    return None

def get_fold_temp_map(prefix: str):
    mp = {}
    for f in (0,1,2):
        t = _load_temp_from_json(Path(f"{prefix}_fold{f}.json"))
        if t is not None: mp[f]=t
    return mp

TEMP_RGB   = get_fold_temp_map('rgb_temp')
TEMP_DEPTH = get_fold_temp_map('depth_temp')
TEMP_USER  = get_fold_temp_map('user_temp')
TEMP_AUDIO = get_fold_temp_map('audio_temp')

def get_test_temp_avg(mp: dict) -> float | None:
    if not mp: return None
    return float(np.mean(list(mp.values())))

# --- Fusion with independent alignment and per-fold temps ---
def build_fused_probs_for_id(sid: int, alpha_vis=0.26, gamma_a=0.25, smooth_k=3, fold: int | None = None, for_test: bool = False):
    ps_ref = load_skeleton_probs(sid)
    pr = load_probs_generic(sid, 'rgb')
    pdp = load_probs_generic(sid, 'depth')
    pu  = load_probs_generic(sid, 'user')
    pa  = load_probs_generic(sid, 'audio')
    # temperature-scale non-skeleton streams before alignment
    if fold is not None:
        if pr is not None and fold in TEMP_RGB:   pr = temp_scale_scalar(pr, TEMP_RGB[fold])
        if pdp is not None and fold in TEMP_DEPTH: pdp = temp_scale_scalar(pdp, TEMP_DEPTH[fold])
        if pu is not None and fold in TEMP_USER:  pu  = temp_scale_scalar(pu,  TEMP_USER[fold])
        if pa is not None and fold in TEMP_AUDIO: pa  = temp_scale_scalar(pa,  TEMP_AUDIO[fold])
    elif for_test:
        t = get_test_temp_avg(TEMP_RGB);   pr  = temp_scale_scalar(pr,  t) if pr  is not None else None
        t = get_test_temp_avg(TEMP_DEPTH); pdp = temp_scale_scalar(pdp, t) if pdp is not None else None
        t = get_test_temp_avg(TEMP_USER);  pu  = temp_scale_scalar(pu,  t) if pu  is not None else None
        t = get_test_temp_avg(TEMP_AUDIO); pa  = temp_scale_scalar(pa,  t) if pa  is not None else None
    # align each to skeleton independently
    aligned = [ps_ref]
    pr_a = pdp_a = pu_a = pa_a = None
    if pr is not None:
        pr_a, ps_a = align_by_entropy_corr(pr, ps_ref, max_shift=15)
        aligned.append(ps_a); pr_a = pr_a
    if pdp is not None:
        pdp_a, ps_a2 = align_by_entropy_corr(pdp, ps_ref, max_shift=15)
        aligned.append(ps_a2); pdp_a = pdp_a
    if pu is not None:
        pu_a, ps_a3 = align_by_entropy_corr(pu, ps_ref, max_shift=15)
        aligned.append(ps_a3); pu_a = pu_a
    if pa is not None:
        pa_a, ps_a4 = align_by_entropy_corr(pa, ps_ref, max_shift=15)
        aligned.append(ps_a4); pa_a = pa_a
    # crop all to common T
    Tm = min(x.shape[1] for x in aligned)
    ps = ps_ref[:, :Tm]
    if pr_a is not None: pr_a = pr_a[:, :Tm]
    if pdp_a is not None: pdp_a = pdp_a[:, :Tm]
    if pu_a is not None: pu_a = pu_a[:, :Tm]
    if pa_a is not None: pa_a = pa_a[:, :Tm]
    # visual average
    vis_list = [v for v in (pr_a, pdp_a, pu_a) if v is not None]
    pvis = None
    if vis_list:
        v = np.mean(vis_list, axis=0)
        v /= (v.sum(axis=0, keepdims=True)+1e-8)
        pvis = v.astype(np.float32)
    # PoE
    w_s = 1.0 - (alpha_vis if pvis is not None else 0.0) - (gamma_a if pa_a is not None else 0.0)
    logp = w_s*np.log(np.clip(ps,1e-8,1.0))
    if pvis is not None: logp += alpha_vis*np.log(np.clip(pvis,1e-8,1.0))
    if pa_a is not None: logp += gamma_a*np.log(np.clip(pa_a,1e-8,1.0))
    q = np.exp(logp); q /= (q.sum(axis=0, keepdims=True)+1e-8)
    q = smooth_probs_box(q.astype(np.float32), k=smooth_k)
    return q

def lev_dist(a, b):
    n=len(a); m=len(b)
    if n==0: return m
    dp=list(range(m+1))
    for i in range(1,n+1):
        prev=dp[0]; dp[0]=i
        for j in range(1,m+1):
            tmp=dp[j]; cost=0 if a[i-1]==b[j-1] else 1
            dp[j]=min(dp[j]+1, dp[j-1]+1, prev+cost); prev=tmp
    return dp[m]

# DP to optimize boundaries given a fixed order (length 20). Downsample by stride s for speed with feasibility bounds.
def optimize_boundaries(order, p, med, q95, w_dur=0.0, stride=3):
    C,T = p.shape
    if stride>1:
        T2 = T//stride
        p_ds = p[:, :T2*stride].reshape(C, T2, stride).mean(axis=2).astype(np.float32)
        scale = stride
    else:
        p_ds = p; T2=T; scale=1
    nll = -np.log(np.clip(p_ds, 1e-8, 1.0)).astype(np.float32)
    pref = np.cumsum(nll, axis=1, dtype=np.float64)
    def seg_nll(c, t0, t1):
        if t0<=0: return float(pref[c, t1-1])
        return float(pref[c, t1-1] - pref[c, t0-1])
    K = len(order)
    INF=1e18
    dp = np.full((K+1, T2+1), INF, dtype=np.float64)
    bp = -np.ones((K+1, T2+1), dtype=np.int32)
    dp[0,0]=0.0
    # per-class min/max durations (downsampled)
    dmin_ds_global = max(1, 3//scale)
    dmin_c_ds = np.zeros(21, dtype=np.int32)
    dmax_c_ds = np.zeros(21, dtype=np.int32)
    for c in range(21):
        md = max(1.0, med[c])
        dmin_c_ds[c] = max(1, int((0.4*md)//scale))
        dmax_c_ds[c] = max(dmin_c_ds[c], int(min(q95[c]//scale, 150//scale)))
    # adjust dmin to ensure feasibility sum(dmin) <= T2
    sum_min = int(sum(dmin_c_ds[c] for c in order))
    if sum_min > T2:
        factor = T2 / max(sum_min, 1)
        for c in set(order):
            d = max(1, int(np.floor(dmin_c_ds[c] * factor)))
            dmin_c_ds[c] = d
        # ensure dmax >= dmin
        for c in set(order):
            if dmax_c_ds[c] < dmin_c_ds[c]: dmax_c_ds[c] = dmin_c_ds[c]
    # DP with feasibility bounds
    for k in range(1, K+1):
        c = order[k-1]
        md = max(1.0, med[c]); log_md = np.log(md)
        t_lo = k*dmin_ds_global
        t_hi = T2 - (K-k)*dmin_ds_global
        t_lo = max(t_lo, 1); t_hi = max(t_hi, 1)
        for t in range(t_lo, t_hi+1):
            best = INF; best_t0=-1
            t0_max = t - dmin_c_ds[c]
            t0_min = max((k-1)*dmin_ds_global, t - dmax_c_ds[c], 0)
            for t0 in range(t0_min, t0_max+1):
                d = t - t0
                cost = dp[k-1, t0] + seg_nll(c, t0, t) + w_dur*abs(np.log(max(d*scale,1.0)) - log_md)
                if cost < best:
                    best = cost; best_t0 = t0
            dp[k, t] = best; bp[k, t] = best_t0
    # backtrack from best end time (allow tail silence)
    t = int(np.argmin(dp[K, :]))
    if not np.isfinite(dp[K, t]):
        raise RuntimeError('DP backtrack failed: no finite path')
    k=K; cuts=[t]
    while k>0:
        t0 = int(bp[k, t])
        if t0<0:
            raise RuntimeError('DP backtrack failed: infeasible path')
        cuts.append(t0); t=t0; k-=1
    cuts = cuts[::-1]
    # scale back to original timeline
    bnds=[0]
    for x in cuts[1:]:
        b = int(x*scale)
        bnds.append(b)
    if bnds[-1] < T: bnds[-1] = bnds[-1]  # allow tail background; do not force to T
    return bnds

def total_seq_cost(order, p, bnds, med, w_dur=0.0):
    # compute total cost with original (non-downsampled) p
    nll = -np.log(np.clip(p, 1e-8, 1.0)).astype(np.float32)
    pref = np.cumsum(nll, axis=1, dtype=np.float64)
    def seg_nll(c, t0, t1):
        if t0<=0: return float(pref[c, t1-1])
        return float(pref[c, t1-1] - pref[c, t0-1])
    cost=0.0
    for k,c in enumerate(order):
        t0=bnds[k]; t1=bnds[k+1]
        cost += seg_nll(c, t0, t1)
        if w_dur>0:
            md = max(1.0, med[c]); cost += w_dur*abs(np.log(max(t1-t0,1.0)) - np.log(md))
    return float(cost)

def initial_order_from_minseg(p, med, mult=0.7):
    min_dur = np.floor(med*mult + 0.5).astype(np.int32); min_dur[0]=0
    y = decode_minseg(p, min_dur)
    y = aba_collapse(y, max_len=2, ratio=1.04, p=p)
    seq0 = compress_to_sequence(y)
    seq = make_perm20(seq0, p)
    return seq

def _segment_emission_costs(order, p, bnds):
    # emission-only cost per segment for prioritizing moves
    nll = -np.log(np.clip(p, 1e-8, 1.0)).astype(np.float32)
    pref = np.cumsum(nll, axis=1, dtype=np.float64)
    def seg_nll(c, t0, t1):
        if t0<=0: return float(pref[c, t1-1])
        return float(pref[c, t1-1] - pref[c, t0-1])
    costs=[]
    for k,c in enumerate(order):
        t0=bnds[k]; t1=bnds[k+1]
        costs.append(seg_nll(c,t0,t1))
    return np.array(costs, dtype=np.float64)

def neighbors(order, bnds, p, cap=10):
    K=len(order); cands=[]; seen=set()
    # adjacent swaps
    for i in range(K-1):
        o=order.copy(); o[i],o[i+1]=o[i+1],o[i]
        t=tuple(o)
        if t not in seen:
            seen.add(t); cands.append(o)
        if len(cands)>=cap: return cands
    # optional (i, i+2) swaps for first few positions
    for i in range(min(3, K-2)):
        o=order.copy(); o[i],o[i+2]=o[i+2],o[i]
        t=tuple(o)
        if t not in seen:
            seen.add(t); cands.append(o)
        if len(cands)>=cap: return cands
    # reinsertion for top-3 costly segments to i±1, i±2
    seg_costs = _segment_emission_costs(order, p, bnds)
    top_idx = list(np.argsort(-seg_costs)[:3])
    for idx in top_idx:
        for delta in (-2,-1,1,2):
            j = idx + delta
            if j<0 or j>=K: continue
            o=order.copy()
            val=o.pop(idx)
            o.insert(j, val)
            t=tuple(o)
            if t not in seen:
                seen.add(t); cands.append(o)
            if len(cands)>=cap: return cands
    return cands

def local_search_decode(p, med, q95, mult=0.7, w_dur=0.0, passes=3, max_moves=12, stride=3):
    order = initial_order_from_minseg(p, med, mult=mult)
    bnds = optimize_boundaries(order, p, med, q95, w_dur=w_dur, stride=stride)
    best_cost = total_seq_cost(order, p, bnds, med, w_dur=w_dur)
    moves=0
    for _ in range(passes):
        improved=False
        cand_list = neighbors(order, bnds, p, cap=10)
        for cand in cand_list:
            b = optimize_boundaries(cand, p, med, q95, w_dur=w_dur, stride=stride)
            c = total_seq_cost(cand, p, b, med, w_dur=w_dur)
            if c + 1e-4 < best_cost:
                order=cand; bnds=b; best_cost=c; improved=True
                moves+=1
                if moves>=max_moves: break
        if not improved or moves>=max_moves: break
    # build frame labels from final boundaries
    y = np.zeros(p.shape[1], dtype=np.int32)
    for k,c in enumerate(order):
        t0=bnds[k]; t1=bnds[k+1]
        y[t0:t1]=c
    return y, order, bnds, best_cost

def oof_eval_ls(mult_list=(0.65,0.70), wdur_list=(0.1,0.2), stride=3):
    print('OOF LS tuning...', flush=True)
    results={}
    for fd in folds:
        tr = list(map(int, fd['train_ids'])); va = list(map(int, fd['val_ids']))
        med, q95 = compute_duration_stats(tr)
        fold_scores = defaultdict(list)
        t_fold=time.time()
        n_seq=0; total_seq=len(va)*len(mult_list)*len(wdur_list)
        for sid in va:
            p = build_fused_probs_for_id(int(sid), alpha_vis=0.26, gamma_a=0.25, smooth_k=3, fold=int(fd['fold']))
            y_true = np.load(labels_dir/f"{sid}.npy").astype(np.int32)
            seq_true = [int(c) for i,c in enumerate(y_true) if c!=0 and (i==0 or y_true[i-1]!=c)]
            for m in mult_list:
                for wd in wdur_list:
                    y, order, bnds, cost = local_search_decode(p, med, q95, mult=m, w_dur=wd, passes=3, max_moves=12, stride=stride)
                    seq = [int(c) for i,c in enumerate(y) if c!=0 and (i==0 or y[i-1]!=c)]
                    d = lev_dist(seq, seq_true)
                    fold_scores[(m,wd)].append(d)
                    n_seq+=1
                    if (n_seq%20)==0:
                        print(f"fold={fd['fold']} progress {n_seq}/{total_seq} elapsed={time.time()-t_fold:.1f}s", flush=True)
        for k,v in fold_scores.items():
            results.setdefault(k, []).append(float(np.mean(v)))
        print(f"fold={fd['fold']} done in {time.time()-t_fold:.1f}s", flush=True)
    summary=[]
    for k, arr in results.items():
        worst=max(arr); mean=float(np.mean(arr))
        summary.append((worst, mean, k))
    summary.sort(key=lambda x: (x[0], x[1]))
    return summary

def decode_test_ls(mult=0.70, w_dur=0.0, stride=3, out_csv='submission_ls_poe_fast.csv'):
    all_train_ids = sorted({int(x) for fd in folds for x in fd['train_ids']})
    med, q95 = compute_duration_stats(all_train_ids)
    test_ids = sorted(pd.read_csv('test.csv')['Id'].astype(int).tolist())
    ids=[]; rows=[]; t0=time.time(); n=0
    for sid in test_ids:
        if not (probs_cache/f"{sid}_ce.npy").exists() or not (probs_cache/f"{sid}_ce_v3.npy").exists():
            continue
        p = build_fused_probs_for_id(int(sid), alpha_vis=0.26, gamma_a=0.25, smooth_k=3, fold=None, for_test=True)
        y, order, bnds, cost = local_search_decode(p, med, q95, mult=mult, w_dur=w_dur, passes=3, max_moves=12, stride=stride)
        seq = [int(c) for i,c in enumerate(y) if c!=0 and (i==0 or y[i-1]!=c)]
        # ensure perm20
        seq = make_perm20(seq, p)
        ids.append(sid); rows.append(' '.join(map(str, seq))); n+=1
        if (n%20)==0 or n==95: print(f'Decoded {n}/95 elapsed={time.time()-t0:.1f}s', flush=True)
    sub = pd.DataFrame({'Id': ids, 'Sequence': rows}).sort_values('Id')
    sub.to_csv(out_csv, index=False); print('Wrote', out_csv, 'rows=', len(sub))
    assert len(sub)==95, f'Expected 95 rows, got {len(sub)}'
    sub.to_csv('submission.csv', index=False); print('submission.csv written ->', out_csv)

print('OOF LS tuning...', flush=True)
summary = oof_eval_ls(mult_list=(0.65,0.70), wdur_list=(0.1,0.2), stride=3)
best = summary[0]
print('LS Best (worst,mean,params)=', best[:2], best[2])
m,wd = best[2]
out_csv = f"submission_ls_poe_fast_m{str(m).replace('.','')}_wd{str(wd).replace('.','')}.csv"
print('Decoding TEST with best params...', flush=True)
decode_test_ls(mult=m, w_dur=wd, stride=3, out_csv=out_csv)

OOF LS tuning...


OOF LS tuning...


fold=0 progress 20/392 elapsed=85.7s


fold=0 progress 40/392 elapsed=189.3s


fold=0 progress 60/392 elapsed=343.0s


fold=0 progress 80/392 elapsed=440.1s


fold=0 progress 100/392 elapsed=568.3s


fold=0 progress 120/392 elapsed=676.6s


fold=0 progress 140/392 elapsed=740.1s


fold=0 progress 160/392 elapsed=879.5s


fold=0 progress 180/392 elapsed=1120.3s


fold=0 progress 200/392 elapsed=1363.7s


fold=0 progress 220/392 elapsed=1574.1s


fold=0 progress 240/392 elapsed=1753.3s


fold=0 progress 260/392 elapsed=1878.2s


fold=0 progress 280/392 elapsed=2043.0s


fold=0 progress 300/392 elapsed=2275.2s


fold=0 progress 320/392 elapsed=2509.8s


fold=0 progress 340/392 elapsed=2673.2s


fold=0 progress 360/392 elapsed=2795.4s


fold=0 progress 380/392 elapsed=2902.4s


fold=0 done in 2946.6s


fold=1 progress 20/396 elapsed=64.6s


fold=1 progress 40/396 elapsed=128.4s


fold=1 progress 60/396 elapsed=264.4s


fold=1 progress 80/396 elapsed=339.1s


fold=1 progress 100/396 elapsed=449.6s


fold=1 progress 120/396 elapsed=604.7s


fold=1 progress 140/396 elapsed=749.0s


fold=1 progress 160/396 elapsed=889.4s


fold=1 progress 180/396 elapsed=1056.9s


fold=1 progress 200/396 elapsed=1195.3s


fold=1 progress 220/396 elapsed=1350.4s


fold=1 progress 240/396 elapsed=1520.8s


fold=1 progress 260/396 elapsed=1604.2s


fold=1 progress 280/396 elapsed=1697.5s


fold=1 progress 300/396 elapsed=1825.5s


fold=1 progress 320/396 elapsed=2006.0s


fold=1 progress 340/396 elapsed=2115.3s


fold=1 progress 360/396 elapsed=2176.0s


fold=1 progress 380/396 elapsed=2239.0s


fold=1 done in 2294.7s


fold=2 progress 20/400 elapsed=146.6s


fold=2 progress 40/400 elapsed=275.2s


fold=2 progress 60/400 elapsed=420.1s


fold=2 progress 80/400 elapsed=575.5s


fold=2 progress 100/400 elapsed=750.7s


fold=2 progress 120/400 elapsed=924.4s


fold=2 progress 140/400 elapsed=1115.1s


fold=2 progress 160/400 elapsed=1275.6s


fold=2 progress 180/400 elapsed=1440.0s


fold=2 progress 200/400 elapsed=1564.3s


fold=2 progress 220/400 elapsed=1748.2s


fold=2 progress 240/400 elapsed=1902.1s


fold=2 progress 260/400 elapsed=2073.3s


fold=2 progress 280/400 elapsed=2165.0s


fold=2 progress 300/400 elapsed=2352.0s


fold=2 progress 320/400 elapsed=2456.6s


fold=2 progress 340/400 elapsed=2567.7s


fold=2 progress 360/400 elapsed=2705.0s


fold=2 progress 380/400 elapsed=2855.2s


fold=2 progress 400/400 elapsed=3003.6s


fold=2 done in 3003.6s


LS Best (worst,mean,params)= (9.37, 7.894308390022675) (0.65, 0.1)
Decoding TEST with best params...


Decoded 20/95 elapsed=159.8s


Decoded 40/95 elapsed=353.3s


Decoded 60/95 elapsed=526.9s


KeyboardInterrupt: 

In [14]:
# Pairwise order decoder + boundary DP (fast) TEST DECODE ONLY (no OOF) to avoid long runtimes
import numpy as np, pandas as pd, json, time, os
from pathlib import Path
from collections import defaultdict

probs_cache = Path('probs_cache')
labels_dir = Path('labels3d_v2/train')

try:
    folds
except NameError:
    folds = json.load(open('folds_archive_cv.json','r'))

# Reuse from previous cells: build_fused_probs_for_id, optimize_boundaries, compute_duration_stats, lev_dist, make_perm20

def pairwise_order_from_probs(p: np.ndarray, q_power: float = 1.8) -> list:
    # Compute pairwise 'i before j' wins using weighted mass; exclude class 0
    C,T = p.shape
    classes = list(range(1,21))
    w = np.power(np.clip(p[1:21], 1e-8, 1.0), q_power).astype(np.float32)  # shape 20 x T
    # precompute cumulative sums from the end for each class: S_after[c,t] = sum_{u>t} w[c,u]
    S_after = np.cumsum(w[:, ::-1], axis=1)[:, ::-1]
    W = np.zeros((20,20), dtype=np.float64)
    for i in range(20):
        # wins of i over all j: sum_t w_i[t] * S_after[j][t]
        wi = w[i]
        for j in range(20):
            if i==j: continue
            W[i,j] = float((wi * S_after[j]).sum())
    # margin scores
    M = W - W.T  # antisymmetric margins
    score = M.sum(axis=1)  # Borda-like
    order_idx = list(np.argsort(-score))  # descending score
    order = [classes[i] for i in order_idx]
    return order

def decode_test_pairwise(q_power=1.8, w_dur=0.0, stride=3, smooth_k=5, out_csv='submission_pairwise_dp.csv', stage_submission=False):
    all_train_ids = sorted({int(x) for fd in folds for x in fd['train_ids']})
    med, q95 = compute_duration_stats(all_train_ids)
    test_ids = sorted(pd.read_csv('test.csv')['Id'].astype(int).tolist())
    ids=[]; rows=[]; t0=time.time(); n=0
    for sid in test_ids:
        if not (probs_cache/f"{sid}_ce.npy").exists() or not (probs_cache/f"{sid}_ce_v3.npy").exists():
            continue
        p = build_fused_probs_for_id(int(sid), alpha_vis=0.26, gamma_a=0.25, smooth_k=smooth_k, fold=None, for_test=True)
        order = pairwise_order_from_probs(p, q_power=q_power)
        bnds = optimize_boundaries(order, p, med, q95, w_dur=w_dur, stride=stride)
        y = np.zeros(p.shape[1], dtype=np.int32)
        for k,c in enumerate(order):
            t0=bnds[k]; t1=bnds[k+1]
            y[t0:t1]=c
        seq_raw = [int(c) for i,c in enumerate(y) if c!=0 and (i==0 or y[i-1]!=c)]
        seq = make_perm20(seq_raw, p)
        ids.append(sid); rows.append(' '.join(map(str, seq))); n+=1
        if (n%20)==0 or n==95: print(f'Decoded {n}/95 elapsed={time.time()-t0:.1f}s', flush=True)
    sub = pd.DataFrame({'Id': ids, 'Sequence': rows}).sort_values('Id')
    sub.to_csv(out_csv, index=False); print('Wrote', out_csv, 'rows=', len(sub))
    assert len(sub)==95, f'Expected 95 rows, got {len(sub)}'
    if stage_submission:
        sub.to_csv('submission.csv', index=False); print('submission.csv written ->', out_csv)

# Fast test-only decode with fixed params (avoid long OOF). Does NOT overwrite submission.csv.
q_fixed = 1.8
wd_fixed = 0.0
print('Decoding TEST with pairwise+DP (fixed params)...', flush=True)
out_csv = f"submission_pairwise_dp_q{str(q_fixed).replace('.','')}_wd{str(wd_fixed).replace('.','')}.csv"
decode_test_pairwise(q_power=q_fixed, w_dur=wd_fixed, stride=3, smooth_k=5, out_csv=out_csv, stage_submission=False)
print('Done. Baseline submission.csv remains unchanged.')

Decoding TEST with pairwise+DP (fixed params)...


Decoded 20/95 elapsed=1759186346.6s


Decoded 40/95 elapsed=1759186320.2s


Decoded 60/95 elapsed=1759186362.2s


Decoded 80/95 elapsed=1759186365.5s


Decoded 95/95 elapsed=1759186355.1s


Wrote submission_pairwise_dp_q18_wd00.csv rows= 95
Done. Baseline submission.csv remains unchanged.


In [15]:
# Borda rank-ensemble with per-sequence gating; TEST DECODE ONLY
import numpy as np, pandas as pd, json, time, os
from pathlib import Path
from collections import defaultdict

probs_cache = Path('probs_cache')
labels_dir = Path('labels3d_v2/train')

try:
    folds
except NameError:
    folds = json.load(open('folds_archive_cv.json','r'))

# Reuse utilities if present; define minimal fallbacks
def ensure_CxT(p, C=21):
    if p is None: return None
    if p.ndim==2 and p.shape[0]==C: return p
    if p.ndim==2 and p.shape[1]==C: return p.T
    raise ValueError('Bad probs shape')

def load_skeleton_probs(seq_id: int) -> np.ndarray:
    p2 = np.load(probs_cache/f"{seq_id}_ce.npy").astype(np.float32)
    p3 = np.load(probs_cache/f"{seq_id}_ce_v3.npy").astype(np.float32)
    p2 = ensure_CxT(p2); p3 = ensure_CxT(p3)
    Tm = min(p2.shape[1], p3.shape[1])
    p2 = p2[:, :Tm]; p3 = p3[:, :Tm]
    try:
        a = A.reshape(-1,1).astype(np.float32)
    except NameError:
        a = np.full((21,1), 0.7, dtype=np.float32)
    p = a*p2 + (1.0-a)*p3
    p /= (p.sum(axis=0, keepdims=True) + 1e-8)
    return p.astype(np.float32)

def load_probs_generic(seq_id: int, suffix: str) -> np.ndarray | None:
    pth = probs_cache/f"{seq_id}_{suffix}.npy"
    if not pth.exists(): return None
    p = np.load(pth).astype(np.float32)
    return ensure_CxT(p)

def entropy(p: np.ndarray) -> np.ndarray:
    q = np.clip(p, 1e-8, 1.0)
    return -np.sum(q*np.log(q), axis=0)

def align_with_corr(p_src: np.ndarray, p_ref: np.ndarray, max_shift: int = 15):
    hs = entropy(p_src); hr = entropy(p_ref)
    best = (-1e9, 0)
    for sh in range(-max_shift, max_shift+1):
        if sh >= 0:
            L = min(hs.shape[0] - sh, hr.shape[0])
            if L < 16: continue
            s = hs[sh:sh+L]; r = hr[:L]
        else:
            L = min(hs.shape[0], hr.shape[0] + sh)
            if L < 16: continue
            s = hs[:L]; r = hr[-sh:-sh+L]
        if s.std() < 1e-8 or r.std() < 1e-8:
            corr = -1.0
        else:
            corr = float(np.corrcoef(s, r)[0,1])
        if corr > best[0]: best = (corr, sh)
    corr, sh = best
    if sh >= 0:
        L = min(p_src.shape[1] - sh, p_ref.shape[1])
        return p_src[:, sh:sh+L], p_ref[:, :L], corr
    else:
        L = min(p_src.shape[1], p_ref.shape[1] + sh)
        return p_src[:, :L], p_ref[:, -sh:-sh+L], corr

def smooth_probs_box(p: np.ndarray, k: int = 5) -> np.ndarray:
    if k<=1: return p
    C,T = p.shape
    pad = k//2
    x = np.pad(p, ((0,0),(pad,pad)), mode='edge')
    cs = np.cumsum(x, axis=1, dtype=np.float64)
    out = (cs[:, k:] - cs[:, :-k]) / k
    out = out.astype(np.float32)
    out /= (out.sum(axis=0, keepdims=True) + 1e-8)
    return out

# --- Temperature scaling helpers (reuse test-avg temps) ---
def temp_scale_scalar(p: np.ndarray, T: float) -> np.ndarray:
    if p is None or T is None: return p
    T = float(T)
    logp = np.log(np.clip(p, 1e-8, 1.0)) / max(T, 1e-6)
    q = np.exp(logp); q /= (q.sum(axis=0, keepdims=True)+1e-8)
    return q.astype(np.float32)

def _load_temp_from_json(path: Path) -> float | None:
    if not path.exists(): return None
    try:
        obj = json.load(open(path,'r'))
        if isinstance(obj, dict):
            for k in ('T','temp','temperature'):
                if k in obj: return float(obj[k])
        if isinstance(obj, (int,float)):
            return float(obj)
    except Exception:
        try:
            txt = open(path,'r').read().strip()
            return float(txt)
        except Exception:
            return None
    return None

def get_fold_temp_map(prefix: str):
    mp = {}
    for f in (0,1,2):
        t = _load_temp_from_json(Path(f"{prefix}_fold{f}.json"))
        if t is not None: mp[f]=t
    return mp

TEMP_RGB   = get_fold_temp_map('rgb_temp')
TEMP_DEPTH = get_fold_temp_map('depth_temp')
TEMP_USER  = get_fold_temp_map('user_temp')
TEMP_AUDIO = get_fold_temp_map('audio_temp')

def get_test_temp_avg(mp: dict) -> float | None:
    if not mp: return None
    return float(np.mean(list(mp.values())))

# --- Duration stats ---
def segment_lengths(y):
    lens=defaultdict(list); cur=None; run=0
    for c in y:
        if c==0:
            if cur is not None: lens[cur].append(run); cur=None; run=0
            continue
        if cur is None: cur=int(c); run=1
        elif c==cur: run+=1
        else: lens[cur].append(run); cur=int(c); run=1
    if cur is not None: lens[cur].append(run)
    return lens

def compute_duration_stats(ids):
    agg=defaultdict(list)
    for sid in ids:
        y = np.load(labels_dir/f"{sid}.npy").astype(np.int32)
        for c,ls in segment_lengths(y).items():
            if c!=0: agg[c].extend(ls)
    med = np.zeros(21, dtype=np.float32); q95 = np.zeros(21, dtype=np.float32)
    for c in range(1,21):
        ls = agg.get(c, [])
        if ls:
            arr = np.array(ls, dtype=np.float32)
            med[c] = float(np.median(arr))
            q95[c] = float(np.quantile(arr, 0.95))
        else:
            med[c] = 5.0; q95[c] = 50.0
    q95 = np.clip(q95, 5.0, 150.0)
    return med, q95

# --- Decoders and helpers ---
def decode_minseg(p: np.ndarray, min_dur: np.ndarray) -> np.ndarray:
    y = p.argmax(axis=0).astype(np.int32)
    T = y.shape[0]; i=0
    while i < T:
        c = y[i]; j=i+1
        while j<T and y[j]==c: j+=1
        L = j-i
        if c!=0 and L < int(min_dur[c]):
            lc = y[i-1] if i>0 else None
            rc = y[j] if j<T else None
            ls = float(p[lc, i:j].mean()) if lc is not None else -1e9
            rs = float(p[rc, i:j].mean()) if rc is not None else -1e9
            if rs >= ls: y[i:j] = rc if rc is not None else 0
            else:        y[i:j] = lc if lc is not None else 0
            i = max(0, i-1); continue
        i = j
    return y

def aba_collapse(y: np.ndarray, max_len: int = 2) -> np.ndarray:
    T = len(y); i=1
    while i < T-1:
        if y[i-1]==y[i+1] and y[i]!=y[i-1]:
            L=1; j=i+1
            while j<T-1 and y[j-1]==y[j+1] and y[j]!=y[j-1]:
                L+=1; j+=1
            if L<=max_len:
                y[i:j] = y[i-1]
                i = max(1, i-1); continue
            i = j
        i+=1
    return y

def compress_to_sequence(y_frames):
    seq=[]; last=-1
    for c in y_frames:
        if c==0: continue
        if c!=last: seq.append(int(c)); last=int(c)
    return seq

def make_perm20(seq_raw, p: np.ndarray):
    seen=set(); seq=[]
    for c in seq_raw:
        if 1<=c<=20 and c not in seen:
            seen.add(c); seq.append(c)
    if len(seq)<20:
        masses = [(c, float(p[c].sum())) for c in range(1,21) if c not in seen]
        masses.sort(key=lambda x: x[1], reverse=True)
        for c,_ in masses:
            if len(seq)==20: break
            seq.append(c)
    if len(seq)>20: seq = seq[:20]
    return seq

def optimize_boundaries(order, p, med, q95, w_dur=0.05, stride=3):
    C,T = p.shape
    if stride>1:
        T2 = T//stride
        p_ds = p[:, :T2*stride].reshape(C, T2, stride).mean(axis=2).astype(np.float32)
        scale = stride
    else:
        p_ds = p; T2=T; scale=1
    nll = -np.log(np.clip(p_ds, 1e-8, 1.0)).astype(np.float32)
    pref = np.cumsum(nll, axis=1, dtype=np.float64)
    def seg_nll(c, t0, t1):
        if t0<=0: return float(pref[c, t1-1])
        return float(pref[c, t1-1] - pref[c, t0-1])
    K = len(order)
    INF=1e18
    dp = np.full((K+1, T2+1), INF, dtype=np.float64)
    bp = -np.ones((K+1, T2+1), dtype=np.int32)
    dp[0,0]=0.0
    dmin_ds_global = max(1, 3//scale)
    dmin_c_ds = np.zeros(21, dtype=np.int32)
    dmax_c_ds = np.zeros(21, dtype=np.int32)
    for c in range(21):
        md = max(1.0, med[c])
        dmin_c_ds[c] = max(1, int((0.4*md)//scale))
        dmax_c_ds[c] = max(dmin_c_ds[c], int(min(q95[c]//scale, 150//scale)))
    sum_min = int(sum(dmin_c_ds[c] for c in order))
    if sum_min > T2:
        factor = T2 / max(sum_min, 1)
        for c in set(order):
            d = max(1, int(np.floor(dmin_c_ds[c] * factor)))
            dmin_c_ds[c] = d
        for c in set(order):
            if dmax_c_ds[c] < dmin_c_ds[c]: dmax_c_ds[c] = dmin_c_ds[c]
    for k in range(1, K+1):
        c = order[k-1]
        md = max(1.0, med[c]); log_md = np.log(md)
        t_lo = k*dmin_ds_global
        t_hi = T2 - (K-k)*dmin_ds_global
        t_lo = max(t_lo, 1); t_hi = max(t_hi, 1)
        for t in range(t_lo, t_hi+1):
            best = INF; best_t0=-1
            t0_max = t - dmin_c_ds[c]
            t0_min = max((k-1)*dmin_ds_global, t - dmax_c_ds[c], 0)
            for t0 in range(t0_min, t0_max+1):
                d = t - t0
                cost = dp[k-1, t0] + seg_nll(c, t0, t) + w_dur*abs(np.log(max(d*scale,1.0)) - log_md)
                if cost < best:
                    best = cost; best_t0 = t0
            dp[k, t] = best; bp[k, t] = best_t0
    t = int(np.argmin(dp[K, :]))
    if not np.isfinite(dp[K, t]):
        # fallback: force equal splits
        step = max(1, T//K)
        bnds=[0]
        for k in range(1,K): bnds.append(min(T, k*step))
        bnds.append(T)
        return bnds
    k=K; cuts=[t]
    while k>0:
        t0 = int(bp[k, t])
        if t0<0: break
        cuts.append(t0); t=t0; k-=1
    cuts = cuts[::-1]
    bnds=[0]
    for x in cuts[1:]:
        b = int(x*scale)
        bnds.append(b)
    if len(bnds)<K+1: bnds.append(T)
    if bnds[-1] < T: bnds[-1] = T
    return bnds

def pairwise_order_from_probs(p: np.ndarray, q_power: float = 1.8):
    C,T = p.shape
    classes = list(range(1,21))
    w = np.power(np.clip(p[1:21], 1e-8, 1.0), q_power).astype(np.float32)
    S_after = np.cumsum(w[:, ::-1], axis=1)[:, ::-1]
    W = np.zeros((20,20), dtype=np.float64)
    for i in range(20):
        wi = w[i]
        for j in range(20):
            if i==j: continue
            W[i,j] = float((wi * S_after[j]).sum())
    M = W - W.T
    score = M.sum(axis=1)
    order_idx = list(np.argsort(-score))
    order = [classes[i] for i in order_idx]
    return order, M

# --- Per-sequence gated fusion ---
def build_fused_probs_gated(sid: int, alpha_vis=0.26, gamma_a=0.25, smooth_k=5):
    ps_ref = load_skeleton_probs(sid)
    pr = load_probs_generic(sid, 'rgb')
    pdp = load_probs_generic(sid, 'depth')
    pu  = load_probs_generic(sid, 'user')
    pa  = load_probs_generic(sid, 'audio')
    # temp-scale for test using average fold temps
    t = get_test_temp_avg(TEMP_RGB);   pr  = temp_scale_scalar(pr,  t) if pr  is not None else None
    t = get_test_temp_avg(TEMP_DEPTH); pdp = temp_scale_scalar(pdp, t) if pdp is not None else None
    t = get_test_temp_avg(TEMP_USER);  pu  = temp_scale_scalar(pu,  t) if pu  is not None else None
    t = get_test_temp_avg(TEMP_AUDIO); pa  = temp_scale_scalar(pa,  t) if pa is not None else None
    # align and compute corr; gate by corr>=0.5
    aligned = [ps_ref]
    vis_list=[]
    # RGB
    if pr is not None:
        pr_a, ps_a, corr = align_with_corr(pr, ps_ref, max_shift=15)
        if corr >= 0.5: vis_list.append(pr_a); aligned.append(ps_a)
    # Depth
    if pdp is not None:
        pdp_a, ps_a2, corr = align_with_corr(pdp, ps_ref, max_shift=15)
        if corr >= 0.5: vis_list.append(pdp_a); aligned.append(ps_a2)
    # User
    if pu is not None:
        pu_a, ps_a3, corr = align_with_corr(pu, ps_ref, max_shift=15)
        if corr >= 0.5: vis_list.append(pu_a); aligned.append(ps_a3)
    # Audio (not visual but also gate with corr>=0.5 for robustness)
    pa_a = None
    if pa is not None:
        pa_a_tmp, ps_a4, corr = align_with_corr(pa, ps_ref, max_shift=15)
        if corr >= 0.5:
            pa_a = pa_a_tmp; aligned.append(ps_a4)
    # crop to common T
    Tm = min(x.shape[1] for x in aligned)
    ps = ps_ref[:, :Tm]
    if vis_list:
        vis_list = [v[:, :Tm] for v in vis_list]
        pvis = np.mean(vis_list, axis=0)
        pvis /= (pvis.sum(axis=0, keepdims=True)+1e-8)
    else:
        pvis = None
    if pa_a is not None: pa_a = pa_a[:, :Tm]
    # PoE
    w_s = 1.0 - (alpha_vis if pvis is not None else 0.0) - (gamma_a if pa_a is not None else 0.0)
    logp = w_s*np.log(np.clip(ps,1e-8,1.0))
    if pvis is not None: logp += alpha_vis*np.log(np.clip(pvis,1e-8,1.0))
    if pa_a is not None: logp += gamma_a*np.log(np.clip(pa_a,1e-8,1.0))
    q = np.exp(logp); q /= (q.sum(axis=0, keepdims=True)+1e-8)
    q = smooth_probs_box(q.astype(np.float32), k=smooth_k)
    return q

# --- Borda aggregation ---
def borda_aggregate(o1, o2, o3, p_fused, M_pairwise):
    # ranks: dict class->rank (1..20)
    ranks = []
    for o in (o1,o2,o3):
        rk={c:i+1 for i,c in enumerate(o)}
        ranks.append(rk)
    classes = list(range(1,21))
    # base Borda score
    scores = {c: sum(21 - ranks[k].get(c, 20) for k in range(3)) for c in classes}
    # tie-breakers: total mass desc, then pairwise margin desc, then o1 precedence
    mass = {c: float(p_fused[c].sum()) for c in classes}
    # pairwise margin proxy: sum over j M[c_idx-1, j-1]
    # Build mapping class->idx 0..19
    idx = {c: c-1 for c in classes}
    margin = {c: float(M_pairwise[idx[c]].sum()) for c in classes}
    # final ordering
    def key(c):
        return (-scores[c], -mass[c], -margin[c], ranks[0].get(c, 20))
    agg = sorted(classes, key=key)
    return agg

# --- Main: test decode with ensemble ---
def generate_submission_borda(out_csv='submission_ensemble_borda_gated.csv'):
    all_train_ids = sorted({int(x) for fd in folds for x in fd['train_ids']})
    med, q95 = compute_duration_stats(all_train_ids)
    test_ids = sorted(pd.read_csv('test.csv')['Id'].astype(int).tolist())
    ids=[]; rows=[]; t0=time.time(); n=0
    for sid in test_ids:
        if not (probs_cache/f"{sid}_ce.npy").exists() or not (probs_cache/f"{sid}_ce_v3.npy").exists():
            continue
        p = build_fused_probs_gated(int(sid), alpha_vis=0.26, gamma_a=0.25, smooth_k=5)
        # Candidate 1: MinSeg k=5, ABA max_len=2
        min_dur = np.floor(med*0.7 + 0.5).astype(np.int32); min_dur[0]=0
        y1 = decode_minseg(p, min_dur.copy())
        y1 = aba_collapse(y1, max_len=2)
        o1 = make_perm20(compress_to_sequence(y1), p)
        # Candidate 2: MinSeg k=3, ABA max_len=3
        p_s3 = smooth_probs_box(p, k=3)
        y2 = decode_minseg(p_s3, min_dur.copy())
        y2 = aba_collapse(y2, max_len=3)
        o2 = make_perm20(compress_to_sequence(y2), p)
        # Candidate 3: Pairwise q=1.8
        o3, M = pairwise_order_from_probs(p, q_power=1.8)
        # Ensure all are length 20 unique
        if len(o1)!=20: o1 = make_perm20(o1, p)
        if len(o2)!=20: o2 = make_perm20(o2, p)
        if len(o3)!=20: o3 = make_perm20(o3, p)
        # Borda
        o_final = borda_aggregate(o1, o2, o3, p, M)
        # Boundary optimization
        bnds = optimize_boundaries(o_final, p, med, q95, w_dur=0.05, stride=3)
        y = np.zeros(p.shape[1], dtype=np.int32)
        for k,c in enumerate(o_final):
            t0=bnds[k]; t1=bnds[k+1]
            y[t0:t1]=c
        seq_raw = [int(c) for i,c in enumerate(y) if c!=0 and (i==0 or y[i-1]!=c)]
        seq = make_perm20(seq_raw, p)
        ids.append(sid); rows.append(' '.join(map(str, seq))); n+=1
        if (n%20)==0 or n==95: print(f'Decoded {n}/95 elapsed={time.time()-t0:.1f}s', flush=True)
    sub = pd.DataFrame({'Id': ids, 'Sequence': rows}).sort_values('Id')
    sub.to_csv(out_csv, index=False); print('Wrote', out_csv, 'rows=', len(sub))
    assert len(sub)==95, f'Expected 95 rows, got {len(sub)}'
    sub.to_csv('submission.csv', index=False); print('submission.csv written ->', out_csv)

print('Generating Borda-ensemble gated submission...', flush=True)
generate_submission_borda(out_csv='submission_ensemble_borda_gated.csv')
print('Done.')

# Note: This cell performs TEST decode only and overwrites submission.csv intentionally.

Generating Borda-ensemble gated submission...


Decoded 20/95 elapsed=1759186533.0s


Decoded 40/95 elapsed=1759186812.9s


Decoded 60/95 elapsed=1759186840.0s


Decoded 80/95 elapsed=1759186849.5s


Decoded 95/95 elapsed=1759186233.2s


Wrote submission_ensemble_borda_gated.csv rows= 95
submission.csv written -> submission_ensemble_borda_gated.csv
Done.


In [16]:
# Re-run Borda ensemble with audio gate=0.3 (visuals keep 0.5); stage as submission.csv
import numpy as np, pandas as pd, time
from pathlib import Path

def build_fused_probs_gated_audio03(sid: int, alpha_vis=0.26, gamma_a=0.25, smooth_k=5):
    ps_ref = load_skeleton_probs(sid)
    pr = load_probs_generic(sid, 'rgb')
    pdp = load_probs_generic(sid, 'depth')
    pu  = load_probs_generic(sid, 'user')
    pa  = load_probs_generic(sid, 'audio')
    # temp-scale for test using average fold temps
    t = get_test_temp_avg(TEMP_RGB);   pr  = temp_scale_scalar(pr,  t) if pr  is not None else None
    t = get_test_temp_avg(TEMP_DEPTH); pdp = temp_scale_scalar(pdp, t) if pdp is not None else None
    t = get_test_temp_avg(TEMP_USER);  pu  = temp_scale_scalar(pu,  t) if pu  is not None else None
    t = get_test_temp_avg(TEMP_AUDIO); pa  = temp_scale_scalar(pa,  t) if pa is not None else None
    # align and compute corr; gate: visuals >=0.5, audio >=0.3
    aligned = [ps_ref]
    vis_list=[]
    if pr is not None:
        pr_a, ps_a, corr = align_with_corr(pr, ps_ref, max_shift=15)
        if corr >= 0.5: vis_list.append(pr_a); aligned.append(ps_a)
    if pdp is not None:
        pdp_a, ps_a2, corr = align_with_corr(pdp, ps_ref, max_shift=15)
        if corr >= 0.5: vis_list.append(pdp_a); aligned.append(ps_a2)
    if pu is not None:
        pu_a, ps_a3, corr = align_with_corr(pu, ps_ref, max_shift=15)
        if corr >= 0.5: vis_list.append(pu_a); aligned.append(ps_a3)
    pa_a = None
    if pa is not None:
        pa_a_tmp, ps_a4, corr = align_with_corr(pa, ps_ref, max_shift=15)
        if corr >= 0.3:
            pa_a = pa_a_tmp; aligned.append(ps_a4)
    Tm = min(x.shape[1] for x in aligned)
    ps = ps_ref[:, :Tm]
    if vis_list:
        vis_list = [v[:, :Tm] for v in vis_list]
        pvis = np.mean(vis_list, axis=0)
        pvis /= (pvis.sum(axis=0, keepdims=True)+1e-8)
    else:
        pvis = None
    if pa_a is not None: pa_a = pa_a[:, :Tm]
    w_s = 1.0 - (alpha_vis if pvis is not None else 0.0) - (gamma_a if pa_a is not None else 0.0)
    logp = w_s*np.log(np.clip(ps,1e-8,1.0))
    if pvis is not None: logp += alpha_vis*np.log(np.clip(pvis,1e-8,1.0))
    if pa_a is not None: logp += gamma_a*np.log(np.clip(pa_a,1e-8,1.0))
    q = np.exp(logp); q /= (q.sum(axis=0, keepdims=True)+1e-8)
    q = smooth_probs_box(q.astype(np.float32), k=smooth_k)
    return q

def generate_submission_borda_audio03(out_csv='submission_ensemble_borda_gated_audio03.csv'):
    all_train_ids = sorted({int(x) for fd in folds for x in fd['train_ids']})
    med, q95 = compute_duration_stats(all_train_ids)
    test_ids = sorted(pd.read_csv('test.csv')['Id'].astype(int).tolist())
    ids=[]; rows=[]; t0=time.time(); n=0
    for sid in test_ids:
        if not (probs_cache/f"{sid}_ce.npy").exists() or not (probs_cache/f"{sid}_ce_v3.npy").exists():
            continue
        p = build_fused_probs_gated_audio03(int(sid), alpha_vis=0.26, gamma_a=0.25, smooth_k=5)
        min_dur = np.floor(med*0.7 + 0.5).astype(np.int32); min_dur[0]=0
        y1 = decode_minseg(p, min_dur.copy()); y1 = aba_collapse(y1, max_len=2)
        o1 = make_perm20(compress_to_sequence(y1), p)
        p_s3 = smooth_probs_box(p, k=3)
        y2 = decode_minseg(p_s3, min_dur.copy()); y2 = aba_collapse(y2, max_len=3)
        o2 = make_perm20(compress_to_sequence(y2), p)
        o3, M = pairwise_order_from_probs(p, q_power=1.8)
        if len(o1)!=20: o1 = make_perm20(o1, p)
        if len(o2)!=20: o2 = make_perm20(o2, p)
        if len(o3)!=20: o3 = make_perm20(o3, p)
        o_final = borda_aggregate(o1, o2, o3, p, M)
        bnds = optimize_boundaries(o_final, p, med, q95, w_dur=0.05, stride=3)
        y = np.zeros(p.shape[1], dtype=np.int32)
        for k,c in enumerate(o_final):
            t0=bnds[k]; t1=bnds[k+1]
            y[t0:t1]=c
        seq_raw = [int(c) for i,c in enumerate(y) if c!=0 and (i==0 or y[i-1]!=c)]
        seq = make_perm20(seq_raw, p)
        ids.append(sid); rows.append(' '.join(map(str, seq))); n+=1
        if (n%20)==0 or n==95: print(f"Decoded {n}/95 elapsed={time.time()-t0:.1f}s", flush=True)
    sub = pd.DataFrame({'Id': ids, 'Sequence': rows}).sort_values('Id')
    sub.to_csv(out_csv, index=False); print('Wrote', out_csv, 'rows=', len(sub))
    assert len(sub)==95, f'Expected 95 rows, got {len(sub)}'
    sub.to_csv('submission.csv', index=False); print('submission.csv written ->', out_csv)

print('Generating Borda-ensemble (audio gate=0.3)...', flush=True)
generate_submission_borda_audio03(out_csv='submission_ensemble_borda_gated_audio03.csv')
print('Done.')

Generating Borda-ensemble (audio gate=0.3)...


Decoded 20/95 elapsed=1759186858.7s


Decoded 40/95 elapsed=1759187138.6s


Decoded 60/95 elapsed=1759187165.8s


Decoded 80/95 elapsed=1759187175.3s


Decoded 95/95 elapsed=1759186559.0s


Wrote submission_ensemble_borda_gated_audio03.csv rows= 95
submission.csv written -> submission_ensemble_borda_gated_audio03.csv
Done.


In [17]:
# OOF sanity check: Borda-gated (audio gate=0.3) vs baseline PoE+MinSeg
import numpy as np, pandas as pd, json, time
from collections import defaultdict

def lev_dist(a, b):
    n=len(a); m=len(b)
    if n==0: return m
    dp=list(range(m+1))
    for i in range(1,n+1):
        prev=dp[0]; dp[0]=i
        for j in range(1,m+1):
            tmp=dp[j]; cost=0 if a[i-1]==b[j-1] else 1
            dp[j]=min(dp[j]+1, dp[j-1]+1, prev+cost); prev=tmp
    return dp[m]

def compress_to_sequence(y_frames):
    seq=[]; last=-1
    for c in y_frames:
        if c==0: continue
        if c!=last: seq.append(int(c)); last=int(c)
    return seq

def oof_eval_borda_and_baseline():
    results = {'borda03': [], 'baseline_poe': []}
    fold_stats = {'borda03': [], 'baseline_poe': []}
    t_all = time.time()
    for fd in folds:
        tr = list(map(int, fd['train_ids'])); va = list(map(int, fd['val_ids']))
        med, q95 = compute_duration_stats(tr)
        # baseline PoE uses build_fused_probs_for_id from cell 4 with smooth_k=5
        dists_borda=[]; dists_base=[]
        t0 = time.time()
        for sid in va:
            sid = int(sid)
            # skip if skeleton probs missing
            if not (probs_cache/f"{sid}_ce.npy").exists() or not (probs_cache/f"{sid}_ce_v3.npy").exists():
                continue
            # Borda-gated audio0.3 fused probs
            p_borda = build_fused_probs_gated_audio03(sid, alpha_vis=0.26, gamma_a=0.25, smooth_k=5)
            # Candidates
            min_dur = np.floor(med*0.7 + 0.5).astype(np.int32); min_dur[0]=0
            y1 = decode_minseg(p_borda, min_dur.copy()); y1 = aba_collapse(y1, max_len=2)
            o1 = make_perm20(compress_to_sequence(y1), p_borda)
            p_s3 = smooth_probs_box(p_borda, k=3)
            y2 = decode_minseg(p_s3, min_dur.copy()); y2 = aba_collapse(y2, max_len=3)
            o2 = make_perm20(compress_to_sequence(y2), p_borda)
            o3, M = pairwise_order_from_probs(p_borda, q_power=1.8)
            if len(o1)!=20: o1 = make_perm20(o1, p_borda)
            if len(o2)!=20: o2 = make_perm20(o2, p_borda)
            if len(o3)!=20: o3 = make_perm20(o3, p_borda)
            of = borda_aggregate(o1, o2, o3, p_borda, M)
            bnds = optimize_boundaries(of, p_borda, med, q95, w_dur=0.05, stride=3)
            yf = np.zeros(p_borda.shape[1], dtype=np.int32)
            for k,c in enumerate(of):
                tA=bnds[k]; tB=bnds[k+1]
                yf[tA:tB]=c
            seq_borda = compress_to_sequence(yf)
            # Baseline PoE fused probs
            p_base = build_fused_probs_for_id(sid, alpha_vis=0.26, gamma_a=0.25, smooth_k=5, fold=int(fd['fold']))
            yb = decode_minseg(p_base, min_dur.copy()); yb = aba_collapse(yb, max_len=2)
            seq_base = compress_to_sequence(yb)
            # Ground truth
            y_true = np.load(labels_dir/f"{sid}.npy").astype(np.int32)
            seq_true = compress_to_sequence(y_true)
            d_b = lev_dist(seq_borda, seq_true); dists_borda.append(d_b)
            d_a = lev_dist(seq_base, seq_true); dists_base.append(d_a)
        worst_b = max(dists_borda) if dists_borda else 1e9
        mean_b  = float(np.mean(dists_borda)) if dists_borda else 1e9
        worst_a = max(dists_base) if dists_base else 1e9
        mean_a  = float(np.mean(dists_base)) if dists_base else 1e9
        fold_stats['borda03'].append(mean_b); fold_stats['baseline_poe'].append(mean_a)
        print(f"fold={fd['fold']} Borda03 mean={mean_b:.3f} (norm={mean_b/20:.3f}) worst={worst_b:.3f} | Base mean={mean_a:.3f} (norm={mean_a/20:.3f}) worst={worst_a:.3f} elapsed={time.time()-t0:.1f}s", flush=True)
        results['borda03'].extend(dists_borda); results['baseline_poe'].extend(dists_base)
    # summarize overall
    worst_b = max(results['borda03']) if results['borda03'] else 1e9
    mean_b  = float(np.mean(results['borda03'])) if results['borda03'] else 1e9
    worst_a = max(results['baseline_poe']) if results['baseline_poe'] else 1e9
    mean_a  = float(np.mean(results['baseline_poe'])) if results['baseline_poe'] else 1e9
    print(f"OOF summary Borda03: worst={worst_b:.3f}, mean={mean_b:.3f}, norm_mean={mean_b/20:.5f}")
    print(f"OOF summary Baseline: worst={worst_a:.3f}, mean={mean_a:.3f}, norm_mean={mean_a/20:.5f}")
    print(f"Total elapsed={time.time()-t_all:.1f}s", flush=True)

print('Running OOF sanity check for Borda audio0.3 vs baseline PoE...', flush=True)
oof_eval_borda_and_baseline()

Running OOF sanity check for Borda audio0.3 vs baseline PoE...


fold=0 Borda03 mean=10.347 (norm=0.517) worst=15.000 | Base mean=3.857 (norm=0.193) worst=10.000 elapsed=35.3s


fold=1 Borda03 mean=8.798 (norm=0.440) worst=17.000 | Base mean=2.929 (norm=0.146) worst=13.000 elapsed=32.1s


fold=2 Borda03 mean=10.320 (norm=0.516) worst=20.000 | Base mean=4.470 (norm=0.223) worst=20.000 elapsed=33.6s


OOF summary Borda03: worst=20.000, mean=9.822, norm_mean=0.49108
OOF summary Baseline: worst=20.000, mean=3.754, norm_mean=0.18771
Total elapsed=102.3s


In [19]:
# Soft-gated PoE fusion by entropy correlation per stream; OOF sanity and TEST decode
import numpy as np, pandas as pd, json, time
from pathlib import Path

probs_cache = Path('probs_cache')

def soft_gate_from_corr(corr, w_min=0.05, w_max=0.40):
    if corr is None: return 0.0
    w = (corr + 1.0) / 4.0  # maps corr in [-1,1] -> [0,0.5]
    return float(np.clip(w, w_min, w_max))

def build_fused_probs_softgate(sid: int, smooth_k=5, fold: int | None = None, for_test: bool = False, audio_gate_thresh=0.30):
    ps_ref = load_skeleton_probs(sid)
    pr = load_probs_generic(sid, 'rgb')
    pdp = load_probs_generic(sid, 'depth')
    pu  = load_probs_generic(sid, 'user')
    pa  = load_probs_generic(sid, 'audio')
    # temp-scale using fold temps for OOF or avg test temps
    if fold is not None:
        if pr is not None and fold in TEMP_RGB:     pr  = temp_scale_scalar(pr,  TEMP_RGB[fold])
        if pdp is not None and fold in TEMP_DEPTH:  pdp = temp_scale_scalar(pdp, TEMP_DEPTH[fold])
        if pu  is not None and fold in TEMP_USER:   pu  = temp_scale_scalar(pu,  TEMP_USER[fold])
        if pa  is not None and fold in TEMP_AUDIO:  pa  = temp_scale_scalar(pa,  TEMP_AUDIO[fold])
    elif for_test:
        t = get_test_temp_avg(TEMP_RGB);   pr  = temp_scale_scalar(pr,  t) if pr  is not None else None
        t = get_test_temp_avg(TEMP_DEPTH); pdp = temp_scale_scalar(pdp, t) if pdp is not None else None
        t = get_test_temp_avg(TEMP_USER);  pu  = temp_scale_scalar(pu,  t) if pu  is not None else None
        t = get_test_temp_avg(TEMP_AUDIO); pa  = temp_scale_scalar(pa,  t) if pa is not None else None
    # align independently to skeleton and get corr
    aligned_refs = [ps_ref]
    streams = []  # list of (p_aligned, weight)
    # Visual streams (soft weights from corr, no hard dropping)
    if pr is not None:
        pr_a, ps_a, cr = align_with_corr(pr, ps_ref, max_shift=15)
        w = soft_gate_from_corr(cr, 0.05, 0.40)
        streams.append((pr_a, w)); aligned_refs.append(ps_a)
    if pdp is not None:
        pdp_a, ps_a2, cr = align_with_corr(pdp, ps_ref, max_shift=15)
        w = soft_gate_from_corr(cr, 0.05, 0.40)
        streams.append((pdp_a, w)); aligned_refs.append(ps_a2)
    if pu is not None:
        pu_a, ps_a3, cr = align_with_corr(pu, ps_ref, max_shift=15)
        w = soft_gate_from_corr(cr, 0.05, 0.40)
        streams.append((pu_a, w)); aligned_refs.append(ps_a3)
    # Audio: allow lower gate thresh as per expert (0.3); then soft map
    pa_a = None; w_audio = 0.0
    if pa is not None:
        pa_tmp, ps_a4, cr = align_with_corr(pa, ps_ref, max_shift=15)
        if cr >= audio_gate_thresh:
            pa_a = pa_tmp; aligned_refs.append(ps_a4); w_audio = soft_gate_from_corr(cr, 0.05, 0.40)
    # crop all to common length
    Tm = min(x.shape[1] for x in aligned_refs)
    ps = ps_ref[:, :Tm]
    # log-space PoE with per-stream weights; skeleton gets remainder
    total_non_skel = sum(w for _,w in streams) + (w_audio if pa_a is not None else 0.0)
    total_non_skel = float(np.clip(total_non_skel, 0.0, 0.95))
    w_skel = 1.0 - total_non_skel
    logp = w_skel * np.log(np.clip(ps, 1e-8, 1.0))
    for p_str, w in streams:
        p_str = p_str[:, :Tm]
        logp += w * np.log(np.clip(p_str, 1e-8, 1.0))
    if pa_a is not None:
        pa_c = pa_a[:, :Tm]
        logp += w_audio * np.log(np.clip(pa_c, 1e-8, 1.0))
    q = np.exp(logp); q /= (q.sum(axis=0, keepdims=True) + 1e-8)
    q = smooth_probs_box(q.astype(np.float32), k=smooth_k)
    return q.astype(np.float32)

def oof_eval_softgate_vs_base(audio_gate_thresh=0.30):
    print('OOF soft-gate vs baseline...', flush=True)
    results_soft=[]; results_base=[]
    for fd in folds:
        tr = list(map(int, fd['train_ids'])); va = list(map(int, fd['val_ids']))
        med, q95 = compute_duration_stats(tr)
        min_dur = np.floor(med*0.7 + 0.5).astype(np.int32); min_dur[0]=0
        d_soft=[]; d_base=[]; t0=time.time()
        for sid in va:
            sid=int(sid)
            if not (probs_cache/f"{sid}_ce.npy").exists() or not (probs_cache/f"{sid}_ce_v3.npy").exists():
                continue
            pf_s = build_fused_probs_softgate(sid, smooth_k=5, fold=int(fd['fold']), for_test=False, audio_gate_thresh=audio_gate_thresh)
            y_s = decode_minseg(pf_s, min_dur.copy()); y_s = aba_collapse(y_s, max_len=2)
            seq_s = compress_to_sequence(y_s)
            pf_b = build_fused_probs_for_id(sid, alpha_vis=0.26, gamma_a=0.25, smooth_k=5, fold=int(fd['fold']))
            y_b = decode_minseg(pf_b, min_dur.copy()); y_b = aba_collapse(y_b, max_len=2)
            seq_b = compress_to_sequence(y_b)
            y_true = np.load(labels_dir/f"{sid}.npy").astype(np.int32)
            seq_t = compress_to_sequence(y_true)
            # Levenshtein
            def lev(a,b):
                n=len(a); m=len(b)
                if n==0: return m
                dp=list(range(m+1))
                for i in range(1,n+1):
                    prev=dp[0]; dp[0]=i
                    for j in range(1,m+1):
                        tmp=dp[j]; cost=0 if a[i-1]==b[j-1] else 1
                        dp[j]=min(dp[j]+1, dp[j-1]+1, prev+cost); prev=tmp
                return dp[m]
            d_soft.append(lev(seq_s, seq_t)); d_base.append(lev(seq_b, seq_t))
        print(f"fold={fd['fold']} soft mean={np.mean(d_soft):.3f} (norm={np.mean(d_soft)/20:.3f}) | base mean={np.mean(d_base):.3f} (norm={np.mean(d_base)/20:.3f}) elapsed={time.time()-t0:.1f}s", flush=True)
        results_soft.extend(d_soft); results_base.extend(d_base)
    print(f"Summary soft: mean={np.mean(results_soft):.3f} (norm={np.mean(results_soft)/20:.5f})")
    print(f"Summary base: mean={np.mean(results_base):.3f} (norm={np.mean(results_base)/20:.5f})")

def decode_test_softgate(audio_gate_thresh=0.30, out_csv='submission_softgate.csv'):
    all_train_ids = sorted({int(x) for fd in folds for x in fd['train_ids']})
    med, q95 = compute_duration_stats(all_train_ids)
    min_dur = np.floor(med*0.7 + 0.5).astype(np.int32); min_dur[0]=0
    test_ids = sorted(pd.read_csv('test.csv')['Id'].astype(int).tolist())
    ids=[]; rows=[]; t0=time.time(); n=0
    for sid in test_ids:
        if not (probs_cache/f"{sid}_ce.npy").exists() or not (probs_cache/f"{sid}_ce_v3.npy").exists():
            continue
        pf = build_fused_probs_softgate(int(sid), smooth_k=5, fold=None, for_test=True, audio_gate_thresh=audio_gate_thresh)
        y = decode_minseg(pf, min_dur.copy()); y = aba_collapse(y, max_len=2)
        seq_raw = compress_to_sequence(y)
        seq = make_perm20(seq_raw, pf)
        ids.append(sid); rows.append(' '.join(map(str, seq))); n+=1
        if (n%20)==0 or n==95: print(f'Decoded {n}/95 elapsed={time.time()-t0:.1f}s', flush=True)
    sub = pd.DataFrame({'Id': ids, 'Sequence': rows}).sort_values('Id')
    sub.to_csv(out_csv, index=False); print('Wrote', out_csv, 'rows=', len(sub))
    assert len(sub)==95, f'Expected 95 rows, got {len(sub)}'
    # do not auto-stage; we will choose after OOF check

print('Running OOF soft-gate vs base...', flush=True)
oof_eval_softgate_vs_base(audio_gate_thresh=0.30)
print('Optionally decode TEST with soft-gate by calling decode_test_softgate().')

Running OOF soft-gate vs base...


OOF soft-gate vs baseline...


fold=0 soft mean=12.990 (norm=0.649) | base mean=3.857 (norm=0.193) elapsed=4.0s


fold=1 soft mean=12.020 (norm=0.601) | base mean=2.929 (norm=0.146) elapsed=3.1s


fold=2 soft mean=11.420 (norm=0.571) | base mean=4.470 (norm=0.223) elapsed=2.9s


Summary soft: mean=12.138 (norm=0.60690)
Summary base: mean=3.754 (norm=0.18771)
Optionally decode TEST with soft-gate by calling decode_test_softgate().


In [20]:
# Hedge 2: Gated-PoE (audio gate=0.3) + MinSeg (no Borda, no DP); TEST DECODE ONLY; does NOT overwrite submission.csv
import numpy as np, pandas as pd, time
from pathlib import Path

def decode_test_gated_poe_minseg_audio03(out_csv='submission_gated_poe_minseg_a03.csv', alpha_vis=0.26, gamma_a=0.25, smooth_k=5, min_mult=0.70):
    # duration stats from all train ids
    all_train_ids = sorted({int(x) for fd in folds for x in fd['train_ids']})
    med, q95 = compute_duration_stats(all_train_ids)
    min_dur = np.floor(med*min_mult + 0.5).astype(np.int32); min_dur[0]=0
    test_ids = sorted(pd.read_csv('test.csv')['Id'].astype(int).tolist())
    ids=[]; rows=[]; t0=time.time(); n=0
    for sid in test_ids:
        if not (probs_cache/f"{sid}_ce.npy").exists() or not (probs_cache/f"{sid}_ce_v3.npy").exists():
            continue
        # fused probs with visual gate=0.5, audio gate=0.3
        p = build_fused_probs_gated_audio03(int(sid), alpha_vis=alpha_vis, gamma_a=gamma_a, smooth_k=smooth_k)
        # MinSeg decode
        y = decode_minseg(p, min_dur.copy())
        y = aba_collapse(y, max_len=2)
        seq_raw = compress_to_sequence(y)
        seq = make_perm20(seq_raw, p)
        ids.append(sid); rows.append(' '.join(map(str, seq))); n+=1
        if (n%20)==0 or n==95: print(f'Decoded {n}/95 elapsed={time.time()-t0:.1f}s', flush=True)
    sub = pd.DataFrame({'Id': ids, 'Sequence': rows}).sort_values('Id')
    sub.to_csv(out_csv, index=False); print('Wrote', out_csv, 'rows=', len(sub))
    assert len(sub)==95, f'Expected 95 rows, got {len(sub)}'

print('Decoding TEST: Gated-PoE (audio gate=0.3) + MinSeg...', flush=True)
decode_test_gated_poe_minseg_audio03(out_csv='submission_gated_poe_minseg_a03.csv', alpha_vis=0.26, gamma_a=0.25, smooth_k=5, min_mult=0.70)
print('Done. submission.csv unchanged (baseline remains staged).')

Decoding TEST: Gated-PoE (audio gate=0.3) + MinSeg...


Decoded 20/95 elapsed=0.3s


Decoded 40/95 elapsed=0.6s


Decoded 60/95 elapsed=0.8s


Decoded 80/95 elapsed=1.1s


Decoded 95/95 elapsed=1.2s


Wrote submission_gated_poe_minseg_a03.csv rows= 95
Done. submission.csv unchanged (baseline remains staged).


In [21]:
# Stage hedge submission: pairwise+DP q=1.8 wd=0.0
import shutil, os
src = 'submission_pairwise_dp_q18_wd00.csv'
dst = 'submission.csv'
assert os.path.exists(src), f'Missing {src}'
shutil.copyfile(src, dst)
print(f'Staged hedge submission: {src} -> {dst}')

Staged hedge submission: submission_pairwise_dp_q18_wd00.csv -> submission.csv


In [22]:
# Stage hedge submission: gated PoE + MinSeg (audio gate=0.3)
import shutil, os
src = 'submission_gated_poe_minseg_a03.csv'
dst = 'submission.csv'
assert os.path.exists(src), f'Missing {src}'
shutil.copyfile(src, dst)
print(f'Staged hedge submission: {src} -> {dst}')

Staged hedge submission: submission_gated_poe_minseg_a03.csv -> submission.csv


In [23]:
# Optional Hedge 3: Majority-vote MinSeg over min_dur multipliers on fused PoE; TEST DECODE ONLY
import numpy as np, pandas as pd, time

def majority_vote_labels(labels_list):
    # labels_list: list of np.array int32 of equal length
    L = len(labels_list)
    T = labels_list[0].shape[0]
    out = np.zeros(T, dtype=np.int32)
    for t in range(T):
        a = labels_list[0][t]; b = labels_list[1][t]; c = labels_list[2][t]
        if a==b or a==c: out[t]=a
        elif b==c: out[t]=b
        else: out[t]=labels_list[1][t]  # tie-break to central (m=0.70)
    return out

def decode_test_minseg_majority(out_csv='submission_minseg_mv_m060_070_080.csv', alpha_vis=0.26, gamma_a=0.25, smooth_k=5):
    all_train_ids = sorted({int(x) for fd in folds for x in fd['train_ids']})
    med, q95 = compute_duration_stats(all_train_ids)
    test_ids = sorted(pd.read_csv('test.csv')['Id'].astype(int).tolist())
    ids=[]; rows=[]; t0=time.time(); n=0
    for sid in test_ids:
        if not (probs_cache/f"{sid}_ce.npy").exists() or not (probs_cache/f"{sid}_ce_v3.npy").exists():
            continue
        pf = build_fused_probs_for_id(int(sid), alpha_vis=alpha_vis, gamma_a=gamma_a, smooth_k=smooth_k, fold=None, for_test=True)
        labels=[]
        for mult in (0.60, 0.70, 0.80):
            min_dur = np.floor(med*mult + 0.5).astype(np.int32); min_dur[0]=0
            y = decode_minseg(pf, min_dur.copy())
            y = aba_collapse(y, max_len=2)
            labels.append(y)
        y_mv = majority_vote_labels(labels)
        seq_raw = [int(c) for i,c in enumerate(y_mv) if c!=0 and (i==0 or y_mv[i-1]!=c)]
        seq = make_perm20(seq_raw, pf)
        ids.append(sid); rows.append(' '.join(map(str, seq))); n+=1
        if (n%20)==0 or n==95: print(f'Decoded {n}/95 elapsed={time.time()-t0:.1f}s', flush=True)
    sub = pd.DataFrame({'Id': ids, 'Sequence': rows}).sort_values('Id')
    sub.to_csv(out_csv, index=False); print('Wrote', out_csv, 'rows=', len(sub))
    assert len(sub)==95, f'Expected 95 rows, got {len(sub)}'

print('Decoding TEST: MinSeg majority vote over m={0.60,0.70,0.80} ...', flush=True)
decode_test_minseg_majority(out_csv='submission_minseg_mv_m060_070_080.csv', alpha_vis=0.26, gamma_a=0.25, smooth_k=5)
print('Done. submission.csv unchanged (baseline remains staged unless restaged manually).')

Decoding TEST: MinSeg majority vote over m={0.60,0.70,0.80} ...


Decoded 20/95 elapsed=0.3s


Decoded 40/95 elapsed=0.7s


Decoded 60/95 elapsed=1.0s


Decoded 80/95 elapsed=1.3s


Decoded 95/95 elapsed=1.5s


Wrote submission_minseg_mv_m060_070_080.csv rows= 95
Done. submission.csv unchanged (baseline remains staged unless restaged manually).


In [25]:
# Best-of-N test-time selection over tiny PoE/decoder grid; picks highest log-likelihood; TEST DECODE ONLY
import numpy as np, pandas as pd, time

def total_emission_loglik(y: np.ndarray, p: np.ndarray) -> float:
    # y: framewise labels (int32, 0..20), p: CxT probs (same T as y)
    C,T = p.shape
    assert y.shape[0] == T, f'Length mismatch y={y.shape[0]} T={T}'
    idx = np.clip(y, 0, C-1).astype(np.int32)
    cols = np.arange(T, dtype=np.int32)
    probs = np.clip(p[idx, cols], 1e-12, 1.0)
    return float(np.log(probs).sum())

def decode_minseg_on_probs(pf: np.ndarray, med: np.ndarray, min_mult: float):
    min_dur = np.floor(med*min_mult + 0.5).astype(np.int32); min_dur[0]=0
    y = decode_minseg(pf, min_dur.copy())
    y = aba_collapse(y, max_len=2)
    return y

def build_fused_probs_fixed_for_test(sid: int, alpha_vis=0.26, gamma_a=0.25):
    # Reuse build_fused_probs_for_id with for_test=True; keep k=3 as a light default pre-smooth
    pf = build_fused_probs_for_id(int(sid), alpha_vis=alpha_vis, gamma_a=gamma_a, smooth_k=3, fold=None, for_test=True)
    return pf

def decode_test_bestofN(out_csv='submission_bestofN_poe_minseg.csv',
                        alpha_list=(0.24,0.26,0.28), gamma_list=(0.20,0.25,0.30),
                        smooth_list=(3,5), min_mult_list=(0.60,0.70)):
    all_train_ids = sorted({int(x) for fd in folds for x in fd['train_ids']})
    med, q95 = compute_duration_stats(all_train_ids)
    test_ids = sorted(pd.read_csv('test.csv')['Id'].astype(int).tolist())
    ids=[]; rows=[]; t0=time.time(); n=0
    for sid in test_ids:
        sid=int(sid)
        if not (probs_cache/f"{sid}_ce.npy").exists() or not (probs_cache/f"{sid}_ce_v3.npy").exists():
            continue
        best_ll = -1e99; best_seq=None
        # Precompute base fused probs per (alpha,gamma)
        pf_cache = {}
        for av in alpha_list:
            for ga in gamma_list:
                pf_cache[(av,ga)] = build_fused_probs_fixed_for_test(sid, alpha_vis=av, gamma_a=ga)
        for (av,ga), pf_base in pf_cache.items():
            for sk in smooth_list:
                # smooth once to pf_use; keep length consistent for both decode and LL
                pf_use = smooth_probs_box(pf_base, k=sk)
                for mm in min_mult_list:
                    y = decode_minseg_on_probs(pf_use, med, min_mult=mm)
                    ll = total_emission_loglik(y, pf_use)
                    if ll > best_ll:
                        best_ll = ll
                        seq_raw = [int(c) for i,c in enumerate(y) if c!=0 and (i==0 or y[i-1]!=c)]
                        seq = make_perm20(seq_raw, pf_use)
                        best_seq = seq
        ids.append(sid); rows.append(' '.join(map(str, best_seq))); n+=1
        if (n%20)==0 or n==95: print(f"Decoded {n}/95 elapsed={time.time()-t0:.1f}s", flush=True)
    sub = pd.DataFrame({'Id': ids, 'Sequence': rows}).sort_values('Id')
    sub.to_csv(out_csv, index=False); print('Wrote', out_csv, 'rows=', len(sub))
    assert len(sub)==95, f'Expected 95 rows, got {len(sub)}'

print('Decoding TEST with Best-of-N PoE+MinSeg (tiny grid)...', flush=True)
decode_test_bestofN(out_csv='submission_bestofN_poe_minseg.csv',
                    alpha_list=(0.24,0.26,0.28), gamma_list=(0.20,0.25,0.30),
                    smooth_list=(3,5), min_mult_list=(0.60,0.70))
print('Done. submission.csv unchanged; stage explicitly before submit.')

Decoding TEST with Best-of-N PoE+MinSeg (tiny grid)...


Decoded 20/95 elapsed=2.9s


Decoded 40/95 elapsed=6.2s


Decoded 60/95 elapsed=8.9s


Decoded 80/95 elapsed=11.3s


Decoded 95/95 elapsed=13.0s


Wrote submission_bestofN_poe_minseg.csv rows= 95
Done. submission.csv unchanged; stage explicitly before submit.


In [26]:
# Stage hedge submission: Best-of-N PoE+MinSeg
import shutil, os
src = 'submission_bestofN_poe_minseg.csv'
dst = 'submission.csv'
assert os.path.exists(src), f'Missing {src}'
shutil.copyfile(src, dst)
print(f'Staged hedge submission: {src} -> {dst}')

Staged hedge submission: submission_bestofN_poe_minseg.csv -> submission.csv


In [27]:
# Optional Hedge: PoE+MinSeg (m=0.65) with pairwise-margin fill for missing classes; TEST DECODE ONLY
import numpy as np, pandas as pd, time

def pairwise_margins(p: np.ndarray, q_power: float = 1.8):
    # returns antisymmetric margins M (20x20) for classes 1..20
    C,T = p.shape
    w = np.power(np.clip(p[1:21], 1e-8, 1.0), q_power).astype(np.float32)  # 20 x T
    S_after = np.cumsum(w[:, ::-1], axis=1)[:, ::-1]
    W = np.zeros((20,20), dtype=np.float64)
    for i in range(20):
        wi = w[i]
        for j in range(20):
            if i==j: continue
            W[i,j] = float((wi * S_after[j]).sum())
    M = W - W.T
    return M

def make_perm20_pairmargin(seq_raw, p: np.ndarray, q_power: float = 1.8):
    # keep first occurrences from seq_raw; fill missing by pairwise margin strength
    seen=set(); seq=[]
    for c in seq_raw:
        if 1<=c<=20 and c not in seen:
            seen.add(c); seq.append(c)
    if len(seq) >= 20:
        return seq[:20]
    # compute margins
    M = pairwise_margins(p, q_power=q_power)
    # score each missing class by row-sum (overall dominance)
    missing = [c for c in range(1,21) if c not in seen]
    scores = {c: float(M[c-1].sum()) for c in missing}
    missing_sorted = sorted(missing, key=lambda c: scores[c], reverse=True)
    for c in missing_sorted:
        if len(seq)==20: break
        seq.append(c)
    if len(seq)>20: seq = seq[:20]
    return seq

def decode_test_poe_minseg_pairfill(out_csv='submission_poe_m065_pairfill.csv', alpha_vis=0.26, gamma_a=0.25, smooth_k=5, min_mult=0.65, q_power=1.8):
    all_train_ids = sorted({int(x) for fd in folds for x in fd['train_ids']})
    med, q95 = compute_duration_stats(all_train_ids)
    min_dur = np.floor(med*min_mult + 0.5).astype(np.int32); min_dur[0]=0
    test_ids = sorted(pd.read_csv('test.csv')['Id'].astype(int).tolist())
    ids=[]; rows=[]; t0=time.time(); n=0
    for sid in test_ids:
        if not (probs_cache/f"{sid}_ce.npy").exists() or not (probs_cache/f"{sid}_ce_v3.npy").exists():
            continue
        pf = build_fused_probs_for_id(int(sid), alpha_vis=alpha_vis, gamma_a=gamma_a, smooth_k=smooth_k, fold=None, for_test=True)
        y = decode_minseg(pf, min_dur.copy())
        y = aba_collapse(y, max_len=2)
        seq_raw = [int(c) for i,c in enumerate(y) if c!=0 and (i==0 or y[i-1]!=c)]
        seq = make_perm20_pairmargin(seq_raw, pf, q_power=q_power)
        ids.append(sid); rows.append(' '.join(map(str, seq))); n+=1
        if (n%20)==0 or n==95: print(f'Decoded {n}/95 elapsed={time.time()-t0:.1f}s', flush=True)
    sub = pd.DataFrame({'Id': ids, 'Sequence': rows}).sort_values('Id')
    sub.to_csv(out_csv, index=False); print('Wrote', out_csv, 'rows=', len(sub))
    assert len(sub)==95, f'Expected 95 rows, got {len(sub)}'

print('Decoding TEST: PoE+MinSeg (m=0.65) with pairwise-margin fill...', flush=True)
decode_test_poe_minseg_pairfill(out_csv='submission_poe_m065_pairfill.csv', alpha_vis=0.26, gamma_a=0.25, smooth_k=5, min_mult=0.65, q_power=1.8)
print('Done. submission.csv unchanged.')

Decoding TEST: PoE+MinSeg (m=0.65) with pairwise-margin fill...


Decoded 20/95 elapsed=0.3s


Decoded 40/95 elapsed=0.6s


Decoded 60/95 elapsed=0.8s


Decoded 80/95 elapsed=1.1s


Decoded 95/95 elapsed=1.3s


Wrote submission_poe_m065_pairfill.csv rows= 95
Done. submission.csv unchanged.


In [29]:
# Stage hedge submission: PoE+MinSeg m=0.65 with pairwise-margin fill
import shutil, os
src = 'submission_poe_m065_pairfill.csv'
dst = 'submission.csv'
assert os.path.exists(src), f'Missing {src}'
shutil.copyfile(src, dst)
print(f'Staged hedge submission: {src} -> {dst}')

Staged hedge submission: submission_poe_m065_pairfill.csv -> submission.csv


In [31]:
# CLIP ViT-B/32 feature extraction (fps=4, max_frames=512) -> rgb_clip_embed/{train,test}/{id}.npy
import os, sys, subprocess, time, json, numpy as np, pandas as pd
from pathlib import Path

# Install deps if missing (torch cu121, open_clip_torch, decord)
def ensure_pkg():
    try:
        import torch, open_clip, decord  # noqa
        import torchvision  # noqa
        return
    except Exception as e:
        print('Installing deps...', e, flush=True)
    cmds = [
        [sys.executable, '-m', 'pip', 'install', '--index-url', 'https://download.pytorch.org/whl/cu121', '--extra-index-url', 'https://pypi.org/simple', 'torch==2.4.1', 'torchvision==0.19.1', 'torchaudio==2.4.1'],
        [sys.executable, '-m', 'pip', 'install', 'open_clip_torch', 'decord']
    ]
    for cmd in cmds:
        print('>', ' '.join(cmd), flush=True)
        subprocess.run(cmd, check=True)
    print('Deps installed.', flush=True)

ensure_pkg()

import torch, torchvision.transforms as T
import open_clip
from decord import VideoReader, cpu
from PIL import Image

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('CUDA available:', torch.cuda.is_available(), flush=True)
if device=='cuda':
    print('GPU:', torch.cuda.get_device_name(0), flush=True)

CACHE_DIR = Path('rgb_clip_embed'); (CACHE_DIR/'train').mkdir(parents=True, exist_ok=True); (CACHE_DIR/'test').mkdir(parents=True, exist_ok=True)
VID_DIR_T = Path('rgb_videos/train'); VID_DIR_E = Path('rgb_videos/test')

# Model + transforms
model, _, _ = open_clip.create_model_and_transforms('ViT-B-32-quickgelu', pretrained='laion400m_e32', device=device)
model.eval()
mean = (0.48145466, 0.4578275, 0.40821073); std = (0.26862954, 0.26130258, 0.27577711)
tx = T.Compose([T.Resize(224, interpolation=T.InterpolationMode.BICUBIC), T.CenterCrop(224), T.ToTensor(), T.Normalize(mean, std)])

def sample_idx(nf, fps_native, fps_target=4.0, max_frames=512):
    if not fps_native or fps_native<=0: n = min(max_frames, nf)
    else:
        dur = nf/float(fps_native); n = min(max_frames, int(round(dur*fps_target)))
    n = max(1, min(n, nf))
    return np.linspace(0, nf-1, n).round().astype(int)

@torch.no_grad()
def encode_video(path: Path, max_frames=512, bs=128):
    vr = VideoReader(str(path), ctx=cpu(0))
    nf = len(vr)
    try: fps_native = float(vr.get_avg_fps())
    except Exception: fps_native = None
    idx = sample_idx(nf, fps_native, 4.0, max_frames)
    embs=[]
    for i in range(0, len(idx), bs):
        frames = vr.get_batch(idx[i:i+bs]).asnumpy()  # (B,H,W,C) uint8
        imgs = [tx(Image.fromarray(fr)) for fr in frames]
        x = torch.stack(imgs,0).to(device, non_blocking=True)
        with torch.autocast(device_type='cuda', dtype=torch.float16) if device=='cuda' else torch.no_grad():
            f = model.encode_image(x)  # (B,512)
        f = torch.nn.functional.normalize(f.float(), dim=1).cpu().numpy()
        embs.append(f)
    E = np.concatenate(embs,0).astype(np.float16)  # (T,512)
    return E

def id_to_video(dirpath: Path, sid: int):
    # Expect exact match first
    cands = list(dirpath.glob(f'{sid}.mp4'))
    if cands: return cands[0]
    # fallback: any file containing id
    cands = list(dirpath.glob(f'*{sid}*.mp4'))
    return cands[0] if cands else None

def extract_split(split='train', max_frames=512):
    csv_path = 'training.csv' if split=='train' else 'test.csv'
    ids = pd.read_csv(csv_path)['Id'].astype(int).tolist()
    out_dir = CACHE_DIR/split
    vid_dir = VID_DIR_T if split=='train' else VID_DIR_E
    done=0; t0=time.time()
    for k,sid in enumerate(ids, 1):
        out = out_dir/f"{sid}.npy"
        if out.exists():
            continue
        vp = id_to_video(vid_dir, sid)
        if vp is None:
            # silently skip missing videos (e.g., some test ids)
            continue
        try:
            E = encode_video(vp, max_frames=max_frames, bs=128)
            np.save(out, E)
        except Exception as e:
            print('FAIL', split, sid, e, flush=True)
        done+=1
        if (done%20)==0:
            print(f"{split}: saved {done} in {time.time()-t0:.1f}s (last id={sid})", flush=True)
    print(split, 'finished; new saved =', done, 'elapsed=', round(time.time()-t0,1), 's', flush=True)

print('Starting CLIP extraction (train then test)...', flush=True)
extract_split('train', max_frames=512)
extract_split('test',  max_frames=512)
print('CLIP extraction done.', flush=True)

CUDA available: True


GPU: NVIDIA A10-24Q


Starting CLIP extraction (train then test)...


train: saved 20 in 26.7s (last id=21)


train: saved 40 in 55.8s (last id=41)


train: saved 60 in 91.0s (last id=61)


train: saved 80 in 124.1s (last id=81)


train: saved 100 in 148.2s (last id=102)


train: saved 120 in 172.1s (last id=122)


train: saved 140 in 196.4s (last id=142)


train: saved 160 in 219.2s (last id=162)


train: saved 180 in 242.3s (last id=182)


train: saved 200 in 266.3s (last id=202)


train: saved 220 in 290.1s (last id=222)


train: saved 240 in 317.0s (last id=242)


train: saved 260 in 341.4s (last id=262)


train: saved 280 in 365.4s (last id=282)


train finished; new saved = 297 elapsed= 384.7 s


test: saved 20 in 23.8s (last id=319)


test: saved 40 in 50.1s (last id=340)


test: saved 60 in 74.3s (last id=362)


test: saved 80 in 99.3s (last id=383)


test finished; new saved = 92 elapsed= 113.7 s


CLIP extraction done.


In [35]:
# CLIP head training (3 folds, linear 512->21), cache OOF/test probs, per-fold temp scaling
import json, numpy as np, pandas as pd, torch, torch.nn as nn, torch.nn.functional as F, time
from torch.utils.data import Dataset, DataLoader
from pathlib import Path

device = 'cuda' if torch.cuda.is_available() else 'cpu'
EMB_TR = Path('rgb_clip_embed/train'); EMB_TE = Path('rgb_clip_embed/test'); PROBS = Path('probs_cache'); PROBS.mkdir(exist_ok=True)
LABELS = Path('labels3d_v2/train')

def load_clip_embed(sid: int, split: str):
    p = (EMB_TR if split=='train' else EMB_TE)/f'{sid}.npy'
    if not p.exists(): return None
    return np.load(p)

def resample_labels(y, T):
    if len(y)==T: return y
    idx = np.linspace(0, len(y)-1, T).round().astype(int)
    return y[idx]

class SeqDataset(Dataset):
    def __init__(self, ids):
        self.items=[]
        for sid in ids:
            E = load_clip_embed(int(sid), 'train')
            if E is None: continue
            y = np.load(LABELS/f"{int(sid)}.npy").astype(np.int64)
            y = resample_labels(y, E.shape[0])
            self.items.append((int(sid), E.astype(np.float32), y))
    def __len__(self): return len(self.items)
    def __getitem__(self, i):
        sid,E,y = self.items[i]
        return sid, torch.from_numpy(E), torch.from_numpy(y)

class LinearHead(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(512, 21)
    def forward(self, x):
        return self.fc(x)

@torch.no_grad()
def forward_logits_all(head, E_np: np.ndarray, bs: int = 4096):
    xb = torch.from_numpy(E_np.astype(np.float32)).to(device)
    outs=[]
    for i in range(0, xb.size(0), bs):
        outs.append(head(xb[i:i+bs]).float().cpu())
    lg = torch.cat(outs,0).numpy()  # (T,21)
    return lg

def fit_temperature(head, val_items):
    # Collect logits and labels
    Xs=[]; Ys=[]
    with torch.no_grad():
        for sid,E,y in val_items:
            xb = torch.from_numpy(E.astype(np.float32)).to(device)
            outs=[]
            for i in range(0, xb.size(0), 4096):
                outs.append(head(xb[i:i+4096]).float())
            lg = torch.cat(outs,0)  # (T,21)
            Xs.append(lg.cpu()); Ys.append(torch.from_numpy(y))
    X = torch.cat(Xs,0).to(device); Y = torch.cat(Ys,0).to(device)
    Tsc = torch.tensor(1.5, device=device, requires_grad=True)
    opt = torch.optim.LBFGS([Tsc], lr=0.01, max_iter=50)
    def closure():
        opt.zero_grad()
        loss = F.cross_entropy(X / Tsc, Y, reduction='mean')
        loss.backward()
        return loss
    opt.step(closure)
    return float(Tsc.detach().cpu().item())

def train_clip_head_and_cache(folds_path='folds_archive_cv.json', epochs=3, bs_frames=2048, lr=2e-3, wd=0.05):
    folds = json.load(open(folds_path,'r'))
    test_ids = pd.read_csv('test.csv')['Id'].astype(int).tolist()
    for fd in folds:
        fidx = int(fd['fold'])
        tr_ids = list(map(int, fd['train_ids'])); va_ids = list(map(int, fd['val_ids']))
        ds_tr = SeqDataset(tr_ids); ds_va = SeqDataset(va_ids)
        # Flatten frames for training
        Xtr = np.concatenate([E for _,E,_ in ds_tr.items], axis=0)
        Ytr = np.concatenate([y for *_,y in ds_tr.items], axis=0)
        # Build loaders as chunks to avoid extra copies
        n = Xtr.shape[0]
        chunks = [(Xtr[i:i+bs_frames], Ytr[i:i+bs_frames]) for i in range(0, n, bs_frames)]
        head = LinearHead().to(device)
        opt = torch.optim.AdamW(head.parameters(), lr=lr, weight_decay=wd)
        scaler = torch.cuda.amp.GradScaler(enabled=(device=='cuda'))
        t0=time.time()
        head.train()
        for ep in range(epochs):
            loss_sum=0.0; nb=0
            for xb_np,yb_np in chunks:
                xb = torch.from_numpy(xb_np).to(device)
                yb = torch.from_numpy(yb_np).to(device)
                opt.zero_grad(set_to_none=True)
                with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=(device=='cuda')):
                    lg = head(xb)
                    loss = F.cross_entropy(lg, yb, label_smoothing=0.05)
                scaler.scale(loss).backward()
                scaler.step(opt); scaler.update()
                loss_sum += float(loss.detach().cpu().item()); nb+=1
            print(f"fold={fidx} ep={ep+1}/{epochs} loss={loss_sum/max(nb,1):.4f} elapsed={time.time()-t0:.1f}s", flush=True)
        # Cache OOF probs for validation ids
        head.eval()
        with torch.no_grad():
            for sid,E,y in ds_va.items:
                lg = forward_logits_all(head, E, bs=4096)  # (T,21)
                p = torch.softmax(torch.from_numpy(lg), dim=1).numpy().astype(np.float32).T  # CxT
                np.save(PROBS/f"{sid}_clip.npy", p)
        # Temp scaling on validation
        Tval = fit_temperature(head, ds_va.items)
        json.dump({'T': Tval}, open(f'clip_temp_fold{fidx}.json','w'))
        # Test per-fold probs (temp-scaled)
        with torch.no_grad():
            for sid in test_ids:
                E = load_clip_embed(int(sid), 'test')
                if E is None: continue
                lg = forward_logits_all(head, E, bs=4096)  # (T,21)
                p = torch.softmax(torch.from_numpy(lg)/Tval, dim=1).numpy().astype(np.float32).T  # CxT
                np.save(PROBS/f"{sid}_clip_f{fidx}.npy", p)
    print('CLIP head training + caching complete.', flush=True)

print('Ready: run train_clip_head_and_cache() after embeddings finish.')

# Fusion + decode with CLIP stream into PoE; small OOF grid and test decode
def _load_temp_num(path: str):
    p = Path(path)
    if not p.exists(): return None
    try:
        return float(json.load(open(p,'r')).get('T', 1.0))
    except Exception:
        try: return float(open(p).read().strip())
        except Exception: return None

def clip_test_temp_mean():
    ts = [_load_temp_num(f'clip_temp_fold{f}.json') for f in (0,1,2)]
    ts = [t for t in ts if t is not None]
    return float(np.mean(ts)) if ts else 1.0

def load_clip_probs_train(sid:int):
    pth = PROBS/f"{sid}_clip.npy"
    if not pth.exists(): return None
    arr = np.load(pth).astype(np.float32)
    return arr

def load_clip_probs_test_avg(sid:int):
    arr=[]
    for f in (0,1,2):
        pth = PROBS/f"{sid}_clip_f{f}.npy"
        if pth.exists(): arr.append(np.load(pth).astype(np.float32))
    if not arr: return None
    L = min(a.shape[1] for a in arr)
    q = np.mean([a[:, :L] for a in arr], axis=0)
    q /= (q.sum(axis=0, keepdims=True)+1e-8)
    return q

# Reuse ensure_CxT, load_skeleton_probs, load_probs_generic, align_by_entropy_corr, smooth_probs_box, decode_minseg, aba_collapse, compress_to_sequence, make_perm20, compute_min_dur_from_ids if present
def fuse_poe_with_clip(ps, p_clip, p_audio, alpha_clip, gamma_audio):
    parts=[ps]
    if p_clip is not None:
        pc, ps = align_by_entropy_corr(p_clip, ps, max_shift=15); parts.append(pc)
    if p_audio is not None:
        pa, ps = align_by_entropy_corr(p_audio, ps, max_shift=15); parts.append(pa)
    Tm = min(p.shape[1] for p in parts)
    ps = ps[:, :Tm]
    pc = parts[1][:, :Tm] if p_clip is not None else None
    pa = parts[-1][:, :Tm] if p_audio is not None else None
    w_s = 1.0 - (alpha_clip if pc is not None else 0.0) - (gamma_audio if pa is not None else 0.0)
    logp = w_s*np.log(np.clip(ps,1e-8,1.0))
    if pc is not None: logp += alpha_clip*np.log(np.clip(pc,1e-8,1.0))
    if pa is not None: logp += gamma_audio*np.log(np.clip(pa,1e-8,1.0))
    q = np.exp(logp); q /= (q.sum(axis=0, keepdims=True)+1e-8)
    return q.astype(np.float32)

# --- temperature scaling helpers for parity ---
def temp_scale_scalar(p_arr: np.ndarray | None, T: float | None):
    if p_arr is None or T is None: return p_arr
    logp = np.log(np.clip(p_arr, 1e-8, 1.0)) / max(float(T), 1e-6)
    q = np.exp(logp); q /= (q.sum(axis=0, keepdims=True)+1e-8)
    return q.astype(np.float32)

# --- length-preserving fusion (no crop-to-shortest): align by shift only, add overlap contributions ---
def _find_best_shift(p_src: np.ndarray, p_ref: np.ndarray, max_shift: int = 15):
    # Use entropy correlation to pick shift; return (sh, L) where sh can be negative
    hs = entropy(p_src); hr = entropy(p_ref)
    best = (-1e9, 0)
    for sh in range(-max_shift, max_shift+1):
        if sh >= 0:
            L = min(hs.shape[0] - sh, hr.shape[0])
            if L < 16: continue
            s = hs[sh:sh+L]; r = hr[:L]
        else:
            L = min(hs.shape[0], hr.shape[0] + sh)
            if L < 16: continue
            s = hs[:L]; r = hr[-sh:-sh+L]
        if s.std() < 1e-8 or r.std() < 1e-8:
            corr = -1.0
        else:
            corr = float(np.corrcoef(s, r)[0,1])
        if corr > best[0]: best = (corr, sh)
    return best[1]

def fuse_poe_with_clip_keep_len(ps: np.ndarray, p_clip: np.ndarray | None, p_audio: np.ndarray | None, alpha_clip: float, gamma_audio: float):
    C, T = ps.shape
    logp = np.log(np.clip(ps, 1e-8, 1.0))
    w_s = np.ones(T, dtype=np.float32)
    # CLIP contribution
    if p_clip is not None:
        sh = _find_best_shift(p_clip, ps, max_shift=15)
        if sh >= 0:
            L = min(p_clip.shape[1] - sh, T); ref_start = 0; src_start = sh
        else:
            L = min(p_clip.shape[1], T + sh); ref_start = -sh; src_start = 0
        if L > 0:
            w_s[ref_start:ref_start+L] -= alpha_clip
            logp[:, ref_start:ref_start+L] += alpha_clip * np.log(np.clip(p_clip[:, src_start:src_start+L], 1e-8, 1.0))
    # Audio contribution
    if p_audio is not None:
        sh = _find_best_shift(p_audio, ps, max_shift=15)
        if sh >= 0:
            L = min(p_audio.shape[1] - sh, T); ref_start = 0; src_start = sh
        else:
            L = min(p_audio.shape[1], T + sh); ref_start = -sh; src_start = 0
        if L > 0:
            w_s[ref_start:ref_start+L] -= gamma_audio
            logp[:, ref_start:ref_start+L] += gamma_audio * np.log(np.clip(p_audio[:, src_start:src_start+L], 1e-8, 1.0))
    w_s = np.clip(w_s, 0.0, 1.0)[None, :]
    logp = w_s * logp
    q = np.exp(logp); q /= (q.sum(axis=0, keepdims=True) + 1e-8)
    return q.astype(np.float32)

def oof_grid_clip(alpha_list=(0.30,0.35,0.40,0.45,0.50), gamma_list=(0.15,0.20,0.25,0.30), smooth_k=5, min_mult=0.7, keep_len=True):
    folds = json.load(open('folds_archive_cv.json','r'))
    results=[]
    for fd in folds:
        tr = list(map(int, fd['train_ids'])); va = list(map(int, fd['val_ids']))
        med = compute_min_dur_from_ids(tr); min_dur = np.floor(med*min_mult+0.5).astype(np.int32); min_dur[0]=0
        fidx = int(fd['fold'])
        Tclip = _load_temp_num(f'clip_temp_fold{fidx}.json')
        Taud  = _load_temp_num(f'audio_temp_fold{fidx}.json')
        for a in alpha_list:
            for g in gamma_list:
                d=[]
                for sid in va:
                    sid=int(sid)
                    ps = load_skeleton_probs(sid)
                    pc = load_clip_probs_train(sid)
                    pa = load_probs_generic(sid, 'audio')
                    # temperature parity for OOF
                    if pc is not None and Tclip is not None: pc = temp_scale_scalar(pc, Tclip)
                    if pa is not None and Taud  is not None: pa = temp_scale_scalar(pa,  Taud)
                    if keep_len:
                        pf = fuse_poe_with_clip_keep_len(ps, pc, pa, a, g)
                    else:
                        pf = fuse_poe_with_clip(ps, pc, pa, a, g)
                    pf = smooth_probs_box(pf, k=smooth_k)
                    y = decode_minseg(pf, min_dur.copy()); y = aba_collapse(y, max_len=2)
                    seq = compress_to_sequence(y); true = compress_to_sequence(np.load(LABELS/f"{sid}.npy"))
                    n=len(seq); m=len(true);
                    if n==0: d.append(m); continue
                    dp=list(range(m+1))
                    for i in range(1,n+1):
                        prev=dp[0]; dp[0]=i
                        for j in range(1,m+1):
                            tmp=dp[j]; cost=0 if seq[i-1]==true[j-1] else 1
                            dp[j]=min(dp[j]+1, dp[j-1]+1, prev+cost); prev=tmp
                    d.append(dp[m])
                results.append((max(d), float(np.mean(d)), a, g))
    results.sort(key=lambda x: (x[0], x[1]))
    return results[0]

def decode_test_clip(alpha_clip=0.26, gamma_audio=0.25, smooth_k=5, min_mult=0.7, out_csv='submission_clip_poe.csv', keep_len=True):
    folds = json.load(open('folds_archive_cv.json','r'))
    all_train_ids = sorted({int(x) for fd in folds for x in fd['train_ids']})
    med = compute_min_dur_from_ids(all_train_ids); min_dur = np.floor(med*min_mult+0.5).astype(np.int32); min_dur[0]=0
    test_ids = sorted(pd.read_csv('test.csv')['Id'].astype(int).tolist())
    # average audio temperature over folds for test-time parity
    Ta_list = [_load_temp_num(f'audio_temp_fold{f}.json') for f in (0,1,2)]
    Ta_vals = [t for t in Ta_list if t is not None]
    Ta_mean = float(np.mean(Ta_vals)) if len(Ta_vals)>0 else None
    ids,rows=[],[]; t0=time.time(); n=0
    for sid in test_ids:
        sid=int(sid)
        ps = load_skeleton_probs(sid)
        pc = load_clip_probs_test_avg(sid)  # per-fold temp already applied
        pa = load_probs_generic(sid, 'audio')
        if pa is not None and Ta_mean is not None:
            pa = temp_scale_scalar(pa, Ta_mean)
        if keep_len:
            pf = fuse_poe_with_clip_keep_len(ps, pc, pa, alpha_clip, gamma_audio)
        else:
            pf = fuse_poe_with_clip(ps, pc, pa, alpha_clip, gamma_audio)
        pf = smooth_probs_box(pf, k=smooth_k)
        y = decode_minseg(pf, min_dur.copy()); y = aba_collapse(y, max_len=2)
        seq = make_perm20(compress_to_sequence(y), pf)
        ids.append(sid); rows.append(' '.join(map(str, seq))); n+=1
        if (n%20)==0 or n==95: print(f'Decoded {n}/95', flush=True)
    pd.DataFrame({'Id': ids, 'Sequence': rows}).sort_values('Id').to_csv(out_csv, index=False)
    print('Wrote', out_csv, flush=True)

print('CLIP head cell ready. After embeddings finish:',
      '\n - train_clip_head_and_cache()  # ~60-75m',
      '\n - best = oof_grid_clip(); print(best)',
      '\n - decode_test_clip(alpha_clip=best[2], gamma_audio=best[3])', flush=True)

Ready: run train_clip_head_and_cache() after embeddings finish.
CLIP head cell ready. After embeddings finish: 
 - train_clip_head_and_cache()  # ~60-75m 
 - best = oof_grid_clip(); print(best) 
 - decode_test_clip(alpha_clip=best[2], gamma_audio=best[3])


In [33]:
# Run CLIP head training -> OOF grid -> test decode -> stage submission
import time, shutil, os
t0=time.time()
print('Starting CLIP head training + caching...', flush=True)
train_clip_head_and_cache(epochs=3, bs_frames=2048, lr=2e-3, wd=0.05)
print(f'CLIP head done in {time.time()-t0:.1f}s', flush=True)
print('Running small OOF grid for CLIP fusion...', flush=True)
best = oof_grid_clip(alpha_list=(0.18,0.22,0.24,0.26,0.28), gamma_list=(0.20,0.25,0.30), smooth_k=5, min_mult=0.7)
print('Best (worst, mean, alpha_clip, gamma_audio)=', best, flush=True)
_,_,alpha_clip,gamma_audio = best
print('Decoding TEST with best weights...', flush=True)
decode_test_clip(alpha_clip=alpha_clip, gamma_audio=gamma_audio, smooth_k=5, min_mult=0.7, out_csv='submission_clip_poe.csv')
src = 'submission_clip_poe.csv'; dst = 'submission.csv'
if os.path.exists(src):
    shutil.copyfile(src, dst)
    print(f'Staged submission: {src} -> {dst}', flush=True)
else:
    print('ERROR: submission_clip_poe.csv not found', flush=True)

Starting CLIP head training + caching...


fold=0 ep=1/3 loss=3.0246 elapsed=0.1s


fold=0 ep=2/3 loss=2.9646 elapsed=0.1s


fold=0 ep=3/3 loss=2.9289 elapsed=0.2s


  scaler = torch.cuda.amp.GradScaler(enabled=(device=='cuda'))


fold=1 ep=1/3 loss=3.0209 elapsed=0.0s


fold=1 ep=2/3 loss=2.9359 elapsed=0.1s


fold=1 ep=3/3 loss=2.8867 elapsed=0.1s


fold=2 ep=1/3 loss=2.9405 elapsed=0.0s


fold=2 ep=2/3 loss=2.7434 elapsed=0.1s


fold=2 ep=3/3 loss=2.6489 elapsed=0.1s


CLIP head training + caching complete.


CLIP head done in 1.5s


Running small OOF grid for CLIP fusion...


Best (worst, mean, alpha_clip, gamma_audio)= (19, 16.785714285714285, 0.28, 0.2)


Decoding TEST with best weights...


Decoded 20/95


Decoded 40/95


Decoded 60/95


Decoded 80/95


Decoded 95/95


Wrote submission_clip_poe.csv


Staged submission: submission_clip_poe.csv -> submission.csv


In [34]:
# Re-run OOF grid with calibration parity + expanded ranges, then decode test and stage
import time, shutil, os
print('Running parity-fixed OOF grid for CLIP fusion...', flush=True)
t0=time.time()
best = oof_grid_clip(alpha_list=(0.30,0.35,0.40,0.45,0.50), gamma_list=(0.15,0.20,0.25,0.30), smooth_k=5, min_mult=0.7)
print('Best (worst, mean, alpha_clip, gamma_audio)=', best, 'elapsed=', round(time.time()-t0,1),'s', flush=True)
_,_,alpha_clip,gamma_audio = best
print(f'Decoding TEST with alpha_clip={alpha_clip}, gamma_audio={gamma_audio} ...', flush=True)
out_csv = 'submission_clip_poe_fixed.csv'
decode_test_clip(alpha_clip=alpha_clip, gamma_audio=gamma_audio, smooth_k=5, min_mult=0.7, out_csv=out_csv)
if os.path.exists(out_csv):
    shutil.copyfile(out_csv, 'submission.csv')
    print(f'Staged submission: {out_csv} -> submission.csv', flush=True)
else:
    print('ERROR: submission_clip_poe_fixed.csv not found', flush=True)

Running parity-fixed OOF grid for CLIP fusion...


Best (worst, mean, alpha_clip, gamma_audio)= (19, 16.79591836734694, 0.3, 0.15) elapsed= 38.7 s


Decoding TEST with alpha_clip=0.3, gamma_audio=0.15 ...


Decoded 20/95


Decoded 40/95


Decoded 60/95


Decoded 80/95


Decoded 95/95


Wrote submission_clip_poe_fixed.csv


Staged submission: submission_clip_poe_fixed.csv -> submission.csv


In [36]:
# Decode TEST with conservative CLIP fusion (keep_len) and stage
import shutil, os, time
alpha_clip = 0.40
gamma_audio = 0.25
out_csv = 'submission_clip_poe_noloss.csv'
print(f'Decoding TEST with keep_len: alpha_clip={alpha_clip}, gamma_audio={gamma_audio}', flush=True)
decode_test_clip(alpha_clip=alpha_clip, gamma_audio=gamma_audio, smooth_k=5, min_mult=0.7, out_csv=out_csv, keep_len=True)
if os.path.exists(out_csv):
    shutil.copyfile(out_csv, 'submission.csv')
    print(f'Staged submission: {out_csv} -> submission.csv', flush=True)
else:
    print('ERROR: expected output file missing', flush=True)

Decoding TEST with keep_len: alpha_clip=0.4, gamma_audio=0.25


Decoded 20/95


Decoded 40/95


Decoded 60/95


Decoded 80/95


Decoded 95/95


Wrote submission_clip_poe_noloss.csv


Staged submission: submission_clip_poe_noloss.csv -> submission.csv


In [37]:
# OOF sanity: CLIP keep_len PoE vs baseline PoE (MobileNet RGB) with temp parity; quick grid over alphas
import numpy as np, pandas as pd, json, time
from collections import defaultdict

def lev_dist(a, b):
    n=len(a); m=len(b)
    if n==0: return m
    dp=list(range(m+1))
    for i in range(1,n+1):
        prev=dp[0]; dp[0]=i
        for j in range(1,m+1):
            tmp=dp[j]; cost=0 if a[i-1]==b[j-1] else 1
            dp[j]=min(dp[j]+1, dp[j-1]+1, prev+cost); prev=tmp
    return dp[m]

def oof_eval_clip_keep_len_vs_base(alpha_list=(0.30,0.35,0.40,0.45), gamma_list=(0.20,0.25), smooth_k=5, min_mult=0.7):
    folds = json.load(open('folds_archive_cv.json','r'))
    results = {}  # (a,g)-> list of fold means
    base_stats = []
    for fd in folds:
        tr = list(map(int, fd['train_ids'])); va = list(map(int, fd['val_ids']))
        med = compute_min_dur_from_ids(tr); min_dur = np.floor(med*min_mult+0.5).astype(np.int32); min_dur[0]=0
        fidx = int(fd['fold'])
        Tclip = _load_temp_num(f'clip_temp_fold{fidx}.json')
        Taud  = _load_temp_num(f'audio_temp_fold{fidx}.json')
        # baseline PoE mean on this fold for reference
        d_base=[]
        for sid in va:
            sid=int(sid)
            pf_base = build_fused_probs_for_id(sid, alpha_vis=0.26, gamma_a=0.25, smooth_k=5, fold=fidx)  # MobileNet RGB + audio
            yb = decode_minseg(pf_base, min_dur.copy()); yb = aba_collapse(yb, max_len=2)
            seq_b = compress_to_sequence(yb)
            y_true = np.load(LABELS/f"{sid}.npy").astype(np.int32)
            seq_t = compress_to_sequence(y_true)
            d_base.append(lev_dist(seq_b, seq_t))
        base_stats.append(float(np.mean(d_base)))
        # grid for CLIP keep_len
        for a in alpha_list:
            for g in gamma_list:
                d=[]
                for sid in va:
                    sid=int(sid)
                    ps = load_skeleton_probs(sid)
                    pc = load_clip_probs_train(sid)
                    pa = load_probs_generic(sid, 'audio')
                    if pc is not None and Tclip is not None: pc = temp_scale_scalar(pc, Tclip)
                    if pa is not None and Taud  is not None: pa = temp_scale_scalar(pa,  Taud)
                    pf = fuse_poe_with_clip_keep_len(ps, pc, pa, a, g)
                    pf = smooth_probs_box(pf, k=smooth_k)
                    y = decode_minseg(pf, min_dur.copy()); y = aba_collapse(y, max_len=2)
                    seq = compress_to_sequence(y)
                    y_true = np.load(LABELS/f"{sid}.npy").astype(np.int32)
                    seq_t = compress_to_sequence(y_true)
                    d.append(lev_dist(seq, seq_t))
                results.setdefault((a,g), []).append(float(np.mean(d)))
        print(f"fold={fidx} base_mean={np.mean(d_base):.3f} (norm={np.mean(d_base)/20:.3f})", flush=True)
    # summarize
    summary=[]
    for (a,g), arr in results.items():
        worst=max(arr); mean=float(np.mean(arr))
        summary.append((worst, mean, a, g))
    summary.sort(key=lambda x: (x[0], x[1]))
    print('Baseline OOF means per fold:', [f"{x:.3f}" for x in base_stats], 'overall mean=', f"{np.mean(base_stats):.3f}")
    print('Top CLIP keep_len configs (worst,mean,a,g):')
    for row in summary[:5]:
        print(row)
    return summary[0]

print('Running OOF sanity: CLIP keep_len vs baseline...', flush=True)
best_clip = oof_eval_clip_keep_len_vs_base(alpha_list=(0.30,0.35,0.40,0.45), gamma_list=(0.20,0.25), smooth_k=5, min_mult=0.7)
print('Best CLIP keep_len config:', best_clip, flush=True)

Running OOF sanity: CLIP keep_len vs baseline...


fold=0 base_mean=3.857 (norm=0.193)


fold=1 base_mean=2.929 (norm=0.146)


fold=2 base_mean=4.470 (norm=0.223)


Baseline OOF means per fold: ['3.857', '2.929', '4.470'] overall mean= 3.752
Top CLIP keep_len configs (worst,mean,a,g):
(4.45, 3.7286435786435788, 0.3, 0.2)
(4.45, 3.7286435786435788, 0.3, 0.25)
(4.46, 3.7285411942554796, 0.4, 0.2)
(4.46, 3.7285411942554796, 0.4, 0.25)
(4.46, 3.7319769119769126, 0.35, 0.2)
Best CLIP keep_len config: (4.45, 3.7286435786435788, 0.3, 0.2)


In [38]:
# Decode TEST with OOF-best keep_len CLIP fusion (alpha=0.30, gamma=0.20) and stage
import shutil, os, time
alpha_clip = 0.30
gamma_audio = 0.20
out_csv = 'submission_clip_poe_keep_best.csv'
print(f'Decoding TEST keep_len with alpha_clip={alpha_clip}, gamma_audio={gamma_audio}', flush=True)
decode_test_clip(alpha_clip=alpha_clip, gamma_audio=gamma_audio, smooth_k=5, min_mult=0.7, out_csv=out_csv, keep_len=True)
if os.path.exists(out_csv):
    shutil.copyfile(out_csv, 'submission.csv')
    print(f'Staged submission: {out_csv} -> submission.csv', flush=True)
else:
    print('ERROR: expected output file missing', flush=True)

Decoding TEST keep_len with alpha_clip=0.3, gamma_audio=0.2


Decoded 20/95


Decoded 40/95


Decoded 60/95


Decoded 80/95


Decoded 95/95


Wrote submission_clip_poe_keep_best.csv


Staged submission: submission_clip_poe_keep_best.csv -> submission.csv


In [39]:
# CLIP + MobileNet RGB + Audio PoE (keep_len), temp parity; small OOF grid -> test decode
import numpy as np, pandas as pd, json, time, os
from pathlib import Path

def _mean_fold_temp(prefix: str):
    vals=[]
    for f in (0,1,2):
        t = _load_temp_num(f'{prefix}_fold{f}.json')
        if t is not None: vals.append(float(t))
    return float(np.mean(vals)) if vals else None

def fuse_poe_keep_len_three(ps: np.ndarray, p_clip: np.ndarray | None, p_rgb: np.ndarray | None, p_audio: np.ndarray | None, a_clip: float, a_rgb: float, g_aud: float):
    C,T = ps.shape
    logp = np.log(np.clip(ps, 1e-8, 1.0))
    w_s = np.ones(T, dtype=np.float32)
    def add_stream(pv, alpha):
        nonlocal logp, w_s
        if pv is None or alpha<=0: return
        sh = _find_best_shift(pv, ps, max_shift=15)
        if sh >= 0:
            L = min(pv.shape[1] - sh, T); ref_start = 0; src_start = sh
        else:
            L = min(pv.shape[1], T + sh); ref_start = -sh; src_start = 0
        if L > 0:
            w_s[ref_start:ref_start+L] -= alpha
            logp[:, ref_start:ref_start+L] += alpha * np.log(np.clip(pv[:, src_start:src_start+L], 1e-8, 1.0))
    add_stream(p_clip, a_clip)
    add_stream(p_rgb,  a_rgb)
    add_stream(p_audio, g_aud)
    w_s = np.clip(w_s, 0.0, 1.0)[None, :]
    logp = w_s * logp
    q = np.exp(logp); q /= (q.sum(axis=0, keepdims=True) + 1e-8)
    return q.astype(np.float32)

def oof_grid_clip_rgb(alpha_clip_list=(0.20,0.25,0.30), alpha_rgb_list=(0.10,0.15,0.20,0.25), gamma_list=(0.20,0.25), smooth_k=5, min_mult=0.7):
    folds = json.load(open('folds_archive_cv.json','r'))
    results=[]
    for fd in folds:
        tr = list(map(int, fd['train_ids'])); va = list(map(int, fd['val_ids']))
        med = compute_min_dur_from_ids(tr); min_dur = np.floor(med*min_mult+0.5).astype(np.int32); min_dur[0]=0
        fidx = int(fd['fold'])
        Tclip = _load_temp_num(f'clip_temp_fold{fidx}.json')
        Trgb  = _load_temp_num(f'rgb_temp_fold{fidx}.json')
        Taud  = _load_temp_num(f'audio_temp_fold{fidx}.json')
        for ac in alpha_clip_list:
            for ar in alpha_rgb_list:
                for g in gamma_list:
                    d=[]
                    for sid in va:
                        sid=int(sid)
                        ps = load_skeleton_probs(sid)
                        pc = load_clip_probs_train(sid)
                        pr = load_probs_generic(sid, 'rgb')
                        pa = load_probs_generic(sid, 'audio')
                        if pc is not None and Tclip is not None: pc = temp_scale_scalar(pc, Tclip)
                        if pr is not None and Trgb  is not None: pr = temp_scale_scalar(pr,  Trgb)
                        if pa is not None and Taud  is not None: pa = temp_scale_scalar(pa,  Taud)
                        pf = fuse_poe_keep_len_three(ps, pc, pr, pa, ac, ar, g)
                        pf = smooth_probs_box(pf, k=smooth_k)
                        y = decode_minseg(pf, min_dur.copy()); y = aba_collapse(y, max_len=2)
                        seq = compress_to_sequence(y); true = compress_to_sequence(np.load(LABELS/f"{sid}.npy"))
                        n=len(seq); m=len(true);
                        if n==0: d.append(m); continue
                        dp=list(range(m+1))
                        for i in range(1,n+1):
                            prev=dp[0]; dp[0]=i
                            for j in range(1,m+1):
                                tmp=dp[j]; cost=0 if seq[i-1]==true[j-1] else 1
                                dp[j]=min(dp[j]+1, dp[j-1]+1, prev+cost); prev=tmp
                        d.append(dp[m])
                    results.append((max(d), float(np.mean(d)), ac, ar, g))
    results.sort(key=lambda x: (x[0], x[1]))
    return results[0]

def decode_test_clip_rgb(a_clip=0.30, a_rgb=0.15, g_aud=0.20, smooth_k=5, min_mult=0.7, out_csv='submission_clip_rgb_keep.csv'):
    folds = json.load(open('folds_archive_cv.json','r'))
    all_train_ids = sorted({int(x) for fd in folds for x in fd['train_ids']})
    med = compute_min_dur_from_ids(all_train_ids); min_dur = np.floor(med*min_mult+0.5).astype(np.int32); min_dur[0]=0
    test_ids = sorted(pd.read_csv('test.csv')['Id'].astype(int).tolist())
    # temps: CLIP test already temped per-fold and averaged; RGB/audio need mean temps
    Trgb_mean = _mean_fold_temp('rgb_temp')
    Ta_mean   = _mean_fold_temp('audio_temp')
    ids,rows=[],[]; t0=time.time(); n=0
    for sid in test_ids:
        sid=int(sid)
        ps = load_skeleton_probs(sid)
        pc = load_clip_probs_test_avg(sid)
        pr = load_probs_generic(sid, 'rgb')
        pa = load_probs_generic(sid, 'audio')
        if pr is not None and Trgb_mean is not None: pr = temp_scale_scalar(pr, Trgb_mean)
        if pa is not None and Ta_mean   is not None: pa = temp_scale_scalar(pa,  Ta_mean)
        pf = fuse_poe_keep_len_three(ps, pc, pr, pa, a_clip, a_rgb, g_aud)
        pf = smooth_probs_box(pf, k=smooth_k)
        y = decode_minseg(pf, min_dur.copy()); y = aba_collapse(y, max_len=2)
        seq = make_perm20(compress_to_sequence(y), pf)
        ids.append(sid); rows.append(' '.join(map(str, seq))); n+=1
        if (n%20)==0 or n==95: print(f'Decoded {n}/95', flush=True)
    pd.DataFrame({'Id': ids, 'Sequence': rows}).sort_values('Id').to_csv(out_csv, index=False)
    print('Wrote', out_csv, flush=True)

print('Running OOF grid for CLIP+RGB+Audio (keep_len)...', flush=True)
t0=time.time()
best = oof_grid_clip_rgb(alpha_clip_list=(0.20,0.25,0.30), alpha_rgb_list=(0.10,0.15,0.20,0.25), gamma_list=(0.20,0.25), smooth_k=5, min_mult=0.7)
print('Best (worst,mean,a_clip,a_rgb,g_aud)=', best, 'elapsed=', round(time.time()-t0,1),'s', flush=True)
_,_,ac,ar,ga = best
print(f'Decoding TEST with a_clip={ac}, a_rgb={ar}, g_aud={ga} ...', flush=True)
out_csv = 'submission_clip_rgb_keep.csv'
decode_test_clip_rgb(a_clip=ac, a_rgb=ar, g_aud=ga, smooth_k=5, min_mult=0.7, out_csv=out_csv)
if os.path.exists(out_csv):
    import shutil
    shutil.copyfile(out_csv, 'submission.csv')
    print(f'Staged submission: {out_csv} -> submission.csv', flush=True)

Running OOF grid for CLIP+RGB+Audio (keep_len)...


Best (worst,mean,a_clip,a_rgb,g_aud)= (10, 3.86734693877551, 0.3, 0.1, 0.2) elapsed= 100.8 s


Decoding TEST with a_clip=0.3, a_rgb=0.1, g_aud=0.2 ...


Decoded 20/95


Decoded 40/95


Decoded 60/95


Decoded 80/95


Decoded 95/95


Wrote submission_clip_rgb_keep.csv


Staged submission: submission_clip_rgb_keep.csv -> submission.csv


In [58]:
# Stage OOF-best CLIP+Audio keep_len submission
import shutil, os
src = 'submission_clip_poe_keep_best.csv'
dst = 'submission.csv'
assert os.path.exists(src), f'Missing {src}'
shutil.copyfile(src, dst)
print(f'Staged best CLIP+Audio keep_len submission: {src} -> {dst}')

Staged best CLIP+Audio keep_len submission: submission_clip_poe_keep_best.csv -> submission.csv


In [41]:
# Retrain CLIP head with more epochs, then OOF grid (parity, keep_len) and decode test
import time, shutil, os
print('Retraining CLIP head (epochs=12)...', flush=True)
t0=time.time()
train_clip_head_and_cache(epochs=12, bs_frames=2048, lr=2e-3, wd=0.05)
print(f'CLIP head retrain done in {time.time()-t0:.1f}s', flush=True)
print('OOF grid (parity, keep_len) after retrain...', flush=True)
best = oof_grid_clip(alpha_list=(0.30,0.35,0.40,0.45,0.50), gamma_list=(0.15,0.20,0.25,0.30), smooth_k=5, min_mult=0.7, keep_len=True)
print('Best (worst, mean, alpha_clip, gamma_audio)=', best, flush=True)
_,_,alpha_clip,gamma_audio = best
print(f'Decoding TEST keep_len with alpha_clip={alpha_clip}, gamma_audio={gamma_audio} ...', flush=True)
out_csv = 'submission_clip_poe_retrained.csv'
decode_test_clip(alpha_clip=alpha_clip, gamma_audio=gamma_audio, smooth_k=5, min_mult=0.7, out_csv=out_csv, keep_len=True)
if os.path.exists(out_csv):
    shutil.copyfile(out_csv, 'submission.csv')
    print(f'Staged submission: {out_csv} -> submission.csv', flush=True)
else:
    print('ERROR: expected output file missing', flush=True)

Retraining CLIP head (epochs=12)...


fold=0 ep=1/12 loss=3.0333 elapsed=0.0s


fold=0 ep=2/12 loss=2.9702 elapsed=0.1s


fold=0 ep=3/12 loss=2.9323 elapsed=0.1s


fold=0 ep=4/12 loss=2.9037 elapsed=0.1s


fold=0 ep=5/12 loss=2.8803 elapsed=0.2s


fold=0 ep=6/12 loss=2.8598 elapsed=0.2s


  scaler = torch.cuda.amp.GradScaler(enabled=(device=='cuda'))


fold=0 ep=7/12 loss=2.8410 elapsed=0.2s


fold=0 ep=8/12 loss=2.8236 elapsed=0.2s


fold=0 ep=9/12 loss=2.8072 elapsed=0.3s


fold=0 ep=10/12 loss=2.7916 elapsed=0.3s


fold=0 ep=11/12 loss=2.7770 elapsed=0.3s


fold=0 ep=12/12 loss=2.7631 elapsed=0.4s


fold=1 ep=1/12 loss=3.0161 elapsed=0.0s


fold=1 ep=2/12 loss=2.9321 elapsed=0.1s


fold=1 ep=3/12 loss=2.8837 elapsed=0.1s


fold=1 ep=4/12 loss=2.8508 elapsed=0.1s


fold=1 ep=5/12 loss=2.8257 elapsed=0.2s


fold=1 ep=6/12 loss=2.8045 elapsed=0.2s


fold=1 ep=7/12 loss=2.7853 elapsed=0.2s


fold=1 ep=8/12 loss=2.7675 elapsed=0.3s


fold=1 ep=9/12 loss=2.7509 elapsed=0.3s


fold=1 ep=10/12 loss=2.7352 elapsed=0.3s


fold=1 ep=11/12 loss=2.7203 elapsed=0.4s


fold=1 ep=12/12 loss=2.7062 elapsed=0.4s


fold=2 ep=1/12 loss=2.9398 elapsed=0.0s


fold=2 ep=2/12 loss=2.7423 elapsed=0.1s


fold=2 ep=3/12 loss=2.6477 elapsed=0.1s


fold=2 ep=4/12 loss=2.6143 elapsed=0.1s


fold=2 ep=5/12 loss=2.5953 elapsed=0.2s


fold=2 ep=6/12 loss=2.5774 elapsed=0.2s


fold=2 ep=7/12 loss=2.5600 elapsed=0.2s


fold=2 ep=8/12 loss=2.5433 elapsed=0.3s


fold=2 ep=9/12 loss=2.5272 elapsed=0.3s


fold=2 ep=10/12 loss=2.5118 elapsed=0.3s


fold=2 ep=11/12 loss=2.4969 elapsed=0.4s


fold=2 ep=12/12 loss=2.4826 elapsed=0.4s


CLIP head training + caching complete.


CLIP head retrain done in 2.3s


OOF grid (parity, keep_len) after retrain...


Best (worst, mean, alpha_clip, gamma_audio)= (10, 3.836734693877551, 0.35, 0.15)


Decoding TEST keep_len with alpha_clip=0.35, gamma_audio=0.15 ...


Decoded 20/95


Decoded 40/95


Decoded 60/95


Decoded 80/95


Decoded 95/95


Wrote submission_clip_poe_retrained.csv


Staged submission: submission_clip_poe_retrained.csv -> submission.csv


In [42]:
# Re-extract TEST CLIP with longer cap (max_frames=1024), retrain head, re-decode keep_len with OOF-best weights
import time, os, shutil
print('Re-extracting TEST CLIP embeddings with max_frames=1024...', flush=True)
t0=time.time()
extract_split('test', max_frames=1024)
print(f'Test CLIP re-extract done in {time.time()-t0:.1f}s', flush=True)
print('Retraining CLIP head (epochs=6) and caching per-fold TEST probs...', flush=True)
t1=time.time()
train_clip_head_and_cache(epochs=6, bs_frames=2048, lr=2e-3, wd=0.05)
print(f'Head retrain+cache done in {time.time()-t1:.1f}s', flush=True)
alpha_clip = 0.30; gamma_audio = 0.20
out_csv = 'submission_clip_poe_keep_1024.csv'
print(f'Decoding TEST keep_len with alpha_clip={alpha_clip}, gamma_audio={gamma_audio} (using 1024-frame CLIP)...', flush=True)
decode_test_clip(alpha_clip=alpha_clip, gamma_audio=gamma_audio, smooth_k=5, min_mult=0.7, out_csv=out_csv, keep_len=True)
if os.path.exists(out_csv):
    shutil.copyfile(out_csv, 'submission.csv')
    print(f'Staged submission: {out_csv} -> submission.csv', flush=True)
else:
    print('ERROR: expected output file missing', flush=True)

Re-extracting TEST CLIP embeddings with max_frames=1024...


test finished; new saved = 0 elapsed= 0.0 s


Test CLIP re-extract done in 0.0s


Retraining CLIP head (epochs=6) and caching per-fold TEST probs...


fold=0 ep=1/6 loss=3.0273 elapsed=0.0s


fold=0 ep=2/6 loss=2.9659 elapsed=0.1s


fold=0 ep=3/6 loss=2.9292 elapsed=0.1s


fold=0 ep=4/6 loss=2.9013 elapsed=0.1s


fold=0 ep=5/6 loss=2.8783 elapsed=0.2s


fold=0 ep=6/6 loss=2.8580 elapsed=0.2s


  scaler = torch.cuda.amp.GradScaler(enabled=(device=='cuda'))


fold=1 ep=1/6 loss=3.0192 elapsed=0.0s


fold=1 ep=2/6 loss=2.9350 elapsed=0.1s


fold=1 ep=3/6 loss=2.8864 elapsed=0.1s


fold=1 ep=4/6 loss=2.8532 elapsed=0.1s


fold=1 ep=5/6 loss=2.8279 elapsed=0.2s


fold=1 ep=6/6 loss=2.8064 elapsed=0.2s


fold=2 ep=1/6 loss=2.9378 elapsed=0.0s


fold=2 ep=2/6 loss=2.7417 elapsed=0.1s


fold=2 ep=3/6 loss=2.6483 elapsed=0.1s


fold=2 ep=4/6 loss=2.6152 elapsed=0.1s


fold=2 ep=5/6 loss=2.5960 elapsed=0.2s


fold=2 ep=6/6 loss=2.5780 elapsed=0.2s


CLIP head training + caching complete.


Head retrain+cache done in 1.7s


Decoding TEST keep_len with alpha_clip=0.3, gamma_audio=0.2 (using 1024-frame CLIP)...


Decoded 20/95


Decoded 40/95


Decoded 60/95


Decoded 80/95


Decoded 95/95


Wrote submission_clip_poe_keep_1024.csv


Staged submission: submission_clip_poe_keep_1024.csv -> submission.csv


In [43]:
# Refined OOF grid (keep_len, temp parity) around best; decode test and stage
import time, os, shutil
def oof_grid_clip_refined(alpha_list=(0.28,0.30,0.32,0.34), gamma_list=(0.18,0.20,0.22,0.24), smooth_k=5, min_mult=0.7, keep_len=True):
    folds = json.load(open('folds_archive_cv.json','r'))
    results=[]
    for fd in folds:
        tr = list(map(int, fd['train_ids'])); va = list(map(int, fd['val_ids']))
        med = compute_min_dur_from_ids(tr); min_dur = np.floor(med*min_mult+0.5).astype(np.int32); min_dur[0]=0
        fidx = int(fd['fold'])
        Tclip = _load_temp_num(f'clip_temp_fold{fidx}.json')
        Taud  = _load_temp_num(f'audio_temp_fold{fidx}.json')
        for a in alpha_list:
            for g in gamma_list:
                d=[]
                for sid in va:
                    sid=int(sid)
                    ps = load_skeleton_probs(sid)
                    pc = load_clip_probs_train(sid)
                    pa = load_probs_generic(sid, 'audio')
                    if pc is not None and Tclip is not None: pc = temp_scale_scalar(pc, Tclip)
                    if pa is not None and Taud  is not None: pa = temp_scale_scalar(pa,  Taud)
                    if keep_len:
                        pf = fuse_poe_with_clip_keep_len(ps, pc, pa, a, g)
                    else:
                        pf = fuse_poe_with_clip(ps, pc, pa, a, g)
                    pf = smooth_probs_box(pf, k=smooth_k)
                    y = decode_minseg(pf, min_dur.copy()); y = aba_collapse(y, max_len=2)
                    seq = compress_to_sequence(y); true = compress_to_sequence(np.load(LABELS/f"{sid}.npy"))
                    n=len(seq); m=len(true);
                    if n==0: d.append(m); continue
                    dp=list(range(m+1))
                    for i in range(1,n+1):
                        prev=dp[0]; dp[0]=i
                        for j in range(1,m+1):
                            tmp=dp[j]; cost=0 if seq[i-1]==true[j-1] else 1
                            dp[j]=min(dp[j]+1, dp[j-1]+1, prev+cost); prev=tmp
                    d.append(dp[m])
                results.append((max(d), float(np.mean(d)), a, g))
    results.sort(key=lambda x: (x[0], x[1]))
    return results[0]

print('Refined OOF grid (keep_len, parity)...', flush=True)
t0=time.time()
best = oof_grid_clip_refined(alpha_list=(0.28,0.30,0.32,0.34), gamma_list=(0.18,0.20,0.22,0.24), smooth_k=5, min_mult=0.7, keep_len=True)
print('Best (worst, mean, alpha_clip, gamma_audio)=', best, 'elapsed=', round(time.time()-t0,1),'s', flush=True)
_,_,alpha_clip,gamma_audio = best
print(f'Decoding TEST keep_len with alpha_clip={alpha_clip}, gamma_audio={gamma_audio} ...', flush=True)
out_csv = 'submission_clip_poe_keep_refined.csv'
decode_test_clip(alpha_clip=alpha_clip, gamma_audio=gamma_audio, smooth_k=5, min_mult=0.7, out_csv=out_csv, keep_len=True)
if os.path.exists(out_csv):
    shutil.copyfile(out_csv, 'submission.csv')
    print(f'Staged submission: {out_csv} -> submission.csv', flush=True)
else:
    print('ERROR: expected output file missing', flush=True)

Refined OOF grid (keep_len, parity)...


Best (worst, mean, alpha_clip, gamma_audio)= (10, 3.836734693877551, 0.3, 0.18) elapsed= 54.2 s


Decoding TEST keep_len with alpha_clip=0.3, gamma_audio=0.18 ...


Decoded 20/95


Decoded 40/95


Decoded 60/95


Decoded 80/95


Decoded 95/95


Wrote submission_clip_poe_keep_refined.csv


Staged submission: submission_clip_poe_keep_refined.csv -> submission.csv


In [51]:
# Test-time Best-of-N for CLIP+Audio keep_len PoE using emission log-likelihood; stages submission
import numpy as np, pandas as pd, time, os, shutil, json

def total_emission_loglik(y: np.ndarray, p: np.ndarray) -> float:
    C,T = p.shape
    idx = np.clip(y, 0, C-1).astype(np.int32)
    cols = np.arange(T, dtype=np.int32)
    probs = np.clip(p[idx, cols], 1e-12, 1.0)
    return float(np.log(probs).sum())

def build_pf_clip_audio_keep_len(sid: int, a_clip: float, g_aud: float, smooth_k=5, min_mult=0.7):
    # durations from all train for decoding outside
    return None  # not used; we directly build pf in the loop

def decode_test_clip_bestofN_keep_len(out_csv='submission_clip_poe_keep_bestofN.csv',
                                      alpha_list=(0.28,0.30,0.32,0.34), gamma_list=(0.18,0.20,0.22,0.24),
                                      smooth_k=5, min_mult=0.7):
    folds = json.load(open('folds_archive_cv.json','r'))
    all_train_ids = sorted({int(x) for fd in folds for x in fd['train_ids']})
    med = compute_min_dur_from_ids(all_train_ids); min_dur = np.floor(med*min_mult+0.5).astype(np.int32); min_dur[0]=0
    test_ids = sorted(pd.read_csv('test.csv')['Id'].astype(int).tolist())
    # average audio temperature over folds
    Ta_list = [_load_temp_num(f'audio_temp_fold{f}.json') for f in (0,1,2)]
    Ta_vals = [t for t in Ta_list if t is not None]
    Ta_mean = float(np.mean(Ta_vals)) if len(Ta_vals)>0 else None
    ids=[]; rows=[]; t0=time.time(); n=0
    for sid in test_ids:
        sid=int(sid)
        ps = load_skeleton_probs(sid)
        pc = load_clip_probs_test_avg(sid)  # per-fold temp already applied
        pa = load_probs_generic(sid, 'audio')
        if pa is not None and Ta_mean is not None:
            pa = temp_scale_scalar(pa, Ta_mean)
        best_ll = -1e99; best_seq=None
        for ac in alpha_list:
            for ga in gamma_list:
                pf = fuse_poe_with_clip_keep_len(ps, pc, pa, ac, ga)
                pf = smooth_probs_box(pf, k=smooth_k)
                y = decode_minseg(pf, min_dur.copy()); y = aba_collapse(y, max_len=2)
                ll = total_emission_loglik(y, pf)
                if ll > best_ll:
                    best_ll = ll
                    seq = make_perm20(compress_to_sequence(y), pf)
                    best_seq = seq
        ids.append(sid); rows.append(' '.join(map(str, best_seq))); n+=1
        if (n%20)==0 or n==95: print(f'Decoded {n}/95 elapsed={time.time()-t0:.1f}s', flush=True)
    sub = pd.DataFrame({'Id': ids, 'Sequence': rows}).sort_values('Id')
    sub.to_csv(out_csv, index=False); print('Wrote', out_csv, 'rows=', len(sub))
    assert len(sub)==95, f'Expected 95 rows, got {len(sub)}'
    shutil.copyfile(out_csv, 'submission.csv')
    print(f'Staged submission: {out_csv} -> submission.csv', flush=True)

print('Decoding TEST: Best-of-N over (alpha_clip, gamma_audio) for CLIP+Audio keep_len...', flush=True)
decode_test_clip_bestofN_keep_len(out_csv='submission_clip_poe_keep_bestofN.csv',
                                  alpha_list=(0.28,0.30,0.32,0.34), gamma_list=(0.18,0.20,0.22,0.24),
                                  smooth_k=5, min_mult=0.7)

Decoding TEST: Best-of-N over (alpha_clip, gamma_audio) for CLIP+Audio keep_len...


Decoded 20/95 elapsed=1.5s


Decoded 40/95 elapsed=3.2s


Decoded 60/95 elapsed=4.7s


Decoded 80/95 elapsed=5.9s


Decoded 95/95 elapsed=6.7s


Wrote submission_clip_poe_keep_bestofN.csv rows= 95
Staged submission: submission_clip_poe_keep_bestofN.csv -> submission.csv


In [45]:
# CLIP features for Depth and User: extract -> train heads -> OOF grid (alpha_clip_total,gamma_audio) -> test decode keep_len
import os, time, json, numpy as np, pandas as pd
from pathlib import Path

# Reuse model/tx/encode_video from Cell 18 (already executed).
assert 'model' in globals(), 'CLIP model not initialized; run Cell 18 first.'
assert 'tx' in globals(), 'CLIP transforms not initialized; run Cell 18 first.'

# Dirs
VID_DIR_DEPTH_T = Path('rgb_videos_depth/train'); VID_DIR_DEPTH_E = Path('rgb_videos_depth/test')
VID_DIR_USER_T  = Path('rgb_videos_user/train');  VID_DIR_USER_E  = Path('rgb_videos_user/test')
EMB_DEPTH = Path('rgb_clip_embed_depth'); (EMB_DEPTH/'train').mkdir(parents=True, exist_ok=True); (EMB_DEPTH/'test').mkdir(parents=True, exist_ok=True)
EMB_USER  = Path('rgb_clip_embed_user');  (EMB_USER/'train').mkdir(parents=True, exist_ok=True);  (EMB_USER/'test').mkdir(parents=True, exist_ok=True)
PROBS = Path('probs_cache'); PROBS.mkdir(exist_ok=True)

def id_to_video_generic(dirpath: Path, sid: int):
    cands = list(dirpath.glob(f'{sid}.mp4'))
    if cands: return cands[0]
    cands = list(dirpath.glob(f'*{sid}*.mp4'))
    return cands[0] if cands else None

def extract_split_generic(split='train', max_frames=512, which='depth'):
    out_dir = (EMB_DEPTH if which=='depth' else EMB_USER)/split
    vid_dir = (VID_DIR_DEPTH_T if split=='train' else VID_DIR_DEPTH_E) if which=='depth' else (VID_DIR_USER_T if split=='train' else VID_DIR_USER_E)
    csv_path = 'training.csv' if split=='train' else 'test.csv'
    ids = pd.read_csv(csv_path)['Id'].astype(int).tolist()
    done=0; t0=time.time()
    for k,sid in enumerate(ids, 1):
        out = out_dir/f"{sid}.npy"
        if out.exists():
            continue
        vp = id_to_video_generic(vid_dir, sid)
        if vp is None:
            continue
        try:
            E = encode_video(vp, max_frames=max_frames, bs=128)
            np.save(out, E)
        except Exception as e:
            print('FAIL', which, split, sid, e, flush=True)
        done+=1
        if (done%20)==0:
            print(f"{which}/{split}: saved {done} in {time.time()-t0:.1f}s (last id={sid})", flush=True)
    print(which, split, 'finished; new saved =', done, 'elapsed=', round(time.time()-t0,1), 's', flush=True)

# Train linear head for a given embed root; cache OOF and per-fold TEST (temp-scaled) probs
import torch, torch.nn as nn, torch.nn.functional as F
from torch.utils.data import Dataset
LABELS = Path('labels3d_v2/train')
device = 'cuda' if torch.cuda.is_available() else 'cpu'

class SeqDatasetEmbed(Dataset):
    def __init__(self, ids, emb_root: Path):
        self.items=[]; self.emb_root=emb_root
        for sid in ids:
            p = (emb_root/'train'/f"{int(sid)}.npy")
            if not p.exists(): continue
            E = np.load(p)
            y = np.load(LABELS/f"{int(sid)}.npy").astype(np.int64)
            if len(y)!=E.shape[0]:
                idx = np.linspace(0, len(y)-1, E.shape[0]).round().astype(int)
                y = y[idx]
            self.items.append((int(sid), E.astype(np.float32), y))
    def __len__(self): return len(self.items)
    def __getitem__(self, i):
        sid,E,y = self.items[i]
        return sid, torch.from_numpy(E), torch.from_numpy(y)

class LinearHead(nn.Module):
    def __init__(self): super().__init__(); self.fc = nn.Linear(512, 21)
    def forward(self, x): return self.fc(x)

@torch.no_grad()
def forward_logits_all(head, E_np: np.ndarray, bs: int = 4096):
    xb = torch.from_numpy(E_np.astype(np.float32)).to(device)
    outs=[]
    for i in range(0, xb.size(0), bs):
        outs.append(head(xb[i:i+bs]).float().cpu())
    return torch.cat(outs,0).numpy().astype(np.float32)

def fit_temperature_on_val(head, val_items):
    Xs=[]; Ys=[]
    with torch.no_grad():
        for sid,E,y in val_items:
            xb = torch.from_numpy(E.astype(np.float32)).to(device)
            outs=[]
            for i in range(0, xb.size(0), 4096):
                outs.append(head(xb[i:i+4096]).float())
            lg = torch.cat(outs,0); Xs.append(lg.cpu()); Ys.append(torch.from_numpy(y))
    X = torch.cat(Xs,0).to(device); Y = torch.cat(Ys,0).to(device)
    Tsc = torch.tensor(1.5, device=device, requires_grad=True)
    opt = torch.optim.LBFGS([Tsc], lr=0.01, max_iter=50)
    def closure():
        opt.zero_grad(); loss = F.cross_entropy(X / Tsc, Y, reduction='mean'); loss.backward(); return loss
    opt.step(closure)
    return float(Tsc.detach().cpu().item())

def train_clip_head_and_cache_for_embed(emb_root: Path, suffix: str, temp_prefix: str, epochs=3, bs_frames=2048, lr=2e-3, wd=0.05):
    folds = json.load(open('folds_archive_cv.json','r'))
    test_ids = pd.read_csv('test.csv')['Id'].astype(int).tolist()
    for fd in folds:
        fidx = int(fd['fold'])
        tr_ids = list(map(int, fd['train_ids'])); va_ids = list(map(int, fd['val_ids']))
        ds_tr = SeqDatasetEmbed(tr_ids, emb_root); ds_va = SeqDatasetEmbed(va_ids, emb_root)
        if len(ds_tr)==0 or len(ds_va)==0: continue
        Xtr = np.concatenate([E for _,E,_ in ds_tr.items], axis=0)
        Ytr = np.concatenate([y for *_,y in ds_tr.items], axis=0)
        chunks = [(Xtr[i:i+bs_frames], Ytr[i:i+bs_frames]) for i in range(0, Xtr.shape[0], bs_frames)]
        head = LinearHead().to(device)
        opt = torch.optim.AdamW(head.parameters(), lr=lr, weight_decay=wd)
        scaler = torch.cuda.amp.GradScaler(enabled=(device=='cuda'))
        t0=time.time(); head.train()
        for ep in range(epochs):
            loss_sum=0.0; nb=0
            for xb_np,yb_np in chunks:
                xb = torch.from_numpy(xb_np).to(device); yb = torch.from_numpy(yb_np).to(device)
                opt.zero_grad(set_to_none=True)
                with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=(device=='cuda')):
                    lg = head(xb); loss = F.cross_entropy(lg, yb, label_smoothing=0.05)
                scaler.scale(loss).backward(); scaler.step(opt); scaler.update()
                loss_sum += float(loss.detach().cpu().item()); nb+=1
            print(f"{suffix} fold={fidx} ep={ep+1}/{epochs} loss={loss_sum/max(nb,1):.4f} elapsed={time.time()-t0:.1f}s", flush=True)
        head.eval()
        with torch.no_grad():
            for sid,E,y in ds_va.items:
                lg = forward_logits_all(head, E, bs=4096)  # (T,21)
                p = torch.softmax(torch.from_numpy(lg), dim=1).numpy().astype(np.float32).T  # CxT
                np.save(PROBS/f"{sid}_{suffix}.npy", p)
        Tval = fit_temperature_on_val(head, ds_va.items)
        json.dump({'T': Tval}, open(f'{temp_prefix}_temp_fold{fidx}.json','w'))
        with torch.no_grad():
            for sid in test_ids:
                pth = emb_root/'test'/f"{sid}.npy"
                if not pth.exists(): continue
                E = np.load(pth)
                lg = forward_logits_all(head, E, bs=4096)
                p = torch.softmax(torch.from_numpy(lg)/Tval, dim=1).numpy().astype(np.float32).T
                np.save(PROBS/f"{sid}_{suffix}_f{fidx}.npy", p)
    print(f'{suffix} head training + caching complete.', flush=True)

# Loaders for new streams
def _load_temp_num(path: str):
    p = Path(path);
    if not p.exists(): return None
    try: return float(json.load(open(p,'r')).get('T', 1.0))
    except Exception:
        try: return float(open(p).read().strip())
        except Exception: return None

def load_clipd_probs_train(sid:int):
    pth = PROBS/f"{sid}_clipd.npy"
    return np.load(pth).astype(np.float32) if pth.exists() else None
def load_clipu_probs_train(sid:int):
    pth = PROBS/f"{sid}_clipu.npy"
    return np.load(pth).astype(np.float32) if pth.exists() else None
def load_clipd_probs_test_avg(sid:int):
    arr=[]
    for f in (0,1,2):
        pth = PROBS/f"{sid}_clipd_f{f}.npy"
        if pth.exists(): arr.append(np.load(pth).astype(np.float32))
    if not arr: return None
    L = min(a.shape[1] for a in arr)
    q = np.mean([a[:, :L] for a in arr], axis=0); q /= (q.sum(axis=0, keepdims=True)+1e-8)
    return q.astype(np.float32)
def load_clipu_probs_test_avg(sid:int):
    arr=[]
    for f in (0,1,2):
        pth = PROBS/f"{sid}_clipu_f{f}.npy"
        if pth.exists(): arr.append(np.load(pth).astype(np.float32))
    if not arr: return None
    L = min(a.shape[1] for a in arr)
    q = np.mean([a[:, :L] for a in arr], axis=0); q /= (q.sum(axis=0, keepdims=True)+1e-8)
    return q.astype(np.float32)

# Visual averaging on skeleton timeline (keep_len) for multiple CLIP streams
def _find_best_shift(p_src: np.ndarray, p_ref: np.ndarray, max_shift: int = 15):
    hs = entropy(p_src); hr = entropy(p_ref)
    best = (-1e9, 0)
    for sh in range(-max_shift, max_shift+1):
        if sh >= 0:
            L = min(hs.shape[0] - sh, hr.shape[0])
            if L < 16: continue
            s = hs[sh:sh+L]; r = hr[:L]
        else:
            L = min(hs.shape[0], hr.shape[0] + sh)
            if L < 16: continue
            s = hs[:L]; r = hr[-sh:-sh+L]
        corr = -1.0 if (s.std()<1e-8 or r.std()<1e-8) else float(np.corrcoef(s, r)[0,1])
        if corr > best[0]: best = (corr, sh)
    return best[1]

def visual_avg_keep_len(ps: np.ndarray, streams: list[np.ndarray | None]) -> np.ndarray | None:
    C,T = ps.shape
    acc = np.zeros((C,T), dtype=np.float32); cnt = np.zeros(T, dtype=np.int32)
    any_ok = False
    for pv in streams:
        if pv is None or pv.ndim!=2: continue
        sh = _find_best_shift(pv, ps, max_shift=15)
        if sh >= 0:
            L = min(pv.shape[1]-sh, T); ref_start=0; src_start=sh
        else:
            L = min(pv.shape[1], T+sh); ref_start=-sh; src_start=0
        if L >= 16:
            acc[:, ref_start:ref_start+L] += pv[:, src_start:src_start+L].astype(np.float32)
            cnt[ref_start:ref_start+L] += 1
            any_ok = True
    if not any_ok: return None
    mask = cnt>0
    v = np.zeros((C,T), dtype=np.float32)
    v[:, mask] = acc[:, mask] / np.maximum(cnt[mask][None,:], 1)
    # normalize columns where mask True
    colsum = v.sum(axis=0, keepdims=True); colsum[:, ~mask] = 1.0
    v = v / np.clip(colsum, 1e-8, 1e8)
    return v.astype(np.float32)

# Keep-len fusion with single visual averaged stream + audio
def fuse_keep_len_skel_vis_audio(ps: np.ndarray, pvis: np.ndarray | None, pa: np.ndarray | None, alpha_vis_total: float, gamma_audio: float) -> np.ndarray:
    C,T = ps.shape
    logp = np.log(np.clip(ps, 1e-8, 1.0)).astype(np.float32)
    w_s = np.ones(T, dtype=np.float32)
    if pvis is not None:
        mask_v = (pvis.sum(axis=0) > 0)
        logp[:, mask_v] += alpha_vis_total * np.log(np.clip(pvis[:, mask_v], 1e-8, 1.0))
        w_s[mask_v] -= alpha_vis_total
    if pa is not None:
        sh = _find_best_shift(pa, ps, max_shift=15)
        if sh >= 0:
            L = min(pa.shape[1]-sh, T); ref_start=0; src_start=sh
        else:
            L = min(pa.shape[1], T+sh); ref_start=-sh; src_start=0
        if L >= 16:
            logp[:, ref_start:ref_start+L] += gamma_audio * np.log(np.clip(pa[:, src_start:src_start+L], 1e-8, 1.0))
            w_s[ref_start:ref_start+L] -= gamma_audio
    w_s = np.clip(w_s, 0.0, 1.0)[None, :]
    logp = w_s * logp
    q = np.exp(logp); q /= (q.sum(axis=0, keepdims=True) + 1e-8)
    return q.astype(np.float32)

# OOF grid over alpha_clip_total and gamma_audio with temp parity per stream; select by worst then mean
def oof_grid_threeclip(alpha_list=(0.30,0.35,0.40,0.45,0.50), gamma_list=(0.15,0.20,0.25,0.30), smooth_k=5, min_mult=0.7):
    folds = json.load(open('folds_archive_cv.json','r'))
    results=[]
    for fd in folds:
        tr = list(map(int, fd['train_ids'])); va = list(map(int, fd['val_ids']))
        med = compute_min_dur_from_ids(tr); min_dur = np.floor(med*min_mult+0.5).astype(np.int32); min_dur[0]=0
        fidx = int(fd['fold'])
        Tr = _load_temp_num(f'clip_temp_fold{fidx}.json')
        Td = _load_temp_num(f'clipd_temp_fold{fidx}.json')
        Tu = _load_temp_num(f'clipu_temp_fold{fidx}.json')
        Ta = _load_temp_num(f'audio_temp_fold{fidx}.json')
        for a in alpha_list:
            for g in gamma_list:
                if a + g > 0.60: continue
                d=[]
                for sid in va:
                    sid=int(sid)
                    ps = load_skeleton_probs(sid)
                    pr = load_clip_probs_train(sid)
                    pdp = load_clipd_probs_train(sid)
                    pu  = load_clipu_probs_train(sid)
                    pa  = load_probs_generic(sid, 'audio')
                    if pr is not None and Tr is not None: pr  = temp_scale_scalar(pr,  Tr)
                    if pdp is not None and Td is not None: pdp = temp_scale_scalar(pdp, Td)
                    if pu  is not None and Tu is not None: pu  = temp_scale_scalar(pu,  Tu)
                    if pa  is not None and Ta is not None:  pa  = temp_scale_scalar(pa,  Ta)
                    pvis = visual_avg_keep_len(ps, [pr, pdp, pu])
                    pf = fuse_keep_len_skel_vis_audio(ps, pvis, pa, a, g)
                    pf = smooth_probs_box(pf, k=smooth_k)
                    y = decode_minseg(pf, min_dur.copy()); y = aba_collapse(y, max_len=2)
                    seq = compress_to_sequence(y); true = compress_to_sequence(np.load(LABELS/f"{sid}.npy"))
                    n=len(seq); m=len(true);
                    if n==0: d.append(m); continue
                    dp=list(range(m+1))
                    for i in range(1,n+1):
                        prev=dp[0]; dp[0]=i
                        for j in range(1,m+1):
                            tmp=dp[j]; cost=0 if seq[i-1]==true[j-1] else 1
                            dp[j]=min(dp[j]+1, dp[j-1]+1, prev+cost); prev=tmp
                    d.append(dp[m])
                if d: results.append((max(d), float(np.mean(d)), a, g))
    results.sort(key=lambda x: (x[0], x[1]))
    return results[0] if results else None

def decode_test_threeclip(alpha_clip_total=0.40, gamma_audio=0.20, smooth_k=5, min_mult=0.7, out_csv='submission_clip3_poe_keep.csv'):
    folds = json.load(open('folds_archive_cv.json','r'))
    all_train_ids = sorted({int(x) for fd in folds for x in fd['train_ids']})
    med = compute_min_dur_from_ids(all_train_ids); min_dur = np.floor(med*min_mult+0.5).astype(np.int32); min_dur[0]=0
    test_ids = sorted(pd.read_csv('test.csv')['Id'].astype(int).tolist())
    # audio mean temp
    Ta_list = [_load_temp_num(f'audio_temp_fold{f}.json') for f in (0,1,2)]
    Ta_vals = [t for t in Ta_list if t is not None]
    Ta_mean = float(np.mean(Ta_vals)) if len(Ta_vals)>0 else None
    ids,rows=[],[]; t0=time.time(); n=0
    for sid in test_ids:
        sid=int(sid)
        ps = load_skeleton_probs(sid)
        pr = load_clip_probs_test_avg(sid)
        pdp = load_clipd_probs_test_avg(sid)
        pu  = load_clipu_probs_test_avg(sid)
        pa  = load_probs_generic(sid, 'audio')
        if pa is not None and Ta_mean is not None: pa = temp_scale_scalar(pa, Ta_mean)
        pvis = visual_avg_keep_len(ps, [pr, pdp, pu])
        pf = fuse_keep_len_skel_vis_audio(ps, pvis, pa, alpha_clip_total, gamma_audio)
        pf = smooth_probs_box(pf, k=smooth_k)
        y = decode_minseg(pf, min_dur.copy()); y = aba_collapse(y, max_len=2)
        seq = make_perm20(compress_to_sequence(y), pf)
        ids.append(sid); rows.append(' '.join(map(str, seq))); n+=1
        if (n%20)==0 or n==95: print(f'Decoded {n}/95', flush=True)
    pd.DataFrame({'Id': ids, 'Sequence': rows}).sort_values('Id').to_csv(out_csv, index=False)
    print('Wrote', out_csv, flush=True)

print('Extracting CLIP embeddings for Depth and User (train then test)...', flush=True)
extract_split_generic('train', max_frames=512, which='depth')
extract_split_generic('test',  max_frames=512, which='depth')
extract_split_generic('train', max_frames=512, which='user')
extract_split_generic('test',  max_frames=512, which='user')
print('Training linear heads for Depth(User) CLIP...', flush=True)
train_clip_head_and_cache_for_embed(EMB_DEPTH, suffix='clipd', temp_prefix='clipd', epochs=3, bs_frames=2048, lr=2e-3, wd=0.05)
train_clip_head_and_cache_for_embed(EMB_USER,  suffix='clipu', temp_prefix='clipu', epochs=3, bs_frames=2048, lr=2e-3, wd=0.05)
print('OOF grid for three-CLIP fusion (keep_len, temp parity)...', flush=True)
best = oof_grid_threeclip(alpha_list=(0.30,0.35,0.40,0.45,0.50), gamma_list=(0.15,0.20,0.25,0.30), smooth_k=5, min_mult=0.7)
print('Best (worst, mean, alpha_clip_total, gamma_audio)=', best, flush=True)
if best is not None:
    _,_,a_best,g_best = best
    print(f'Decoding TEST with alpha_clip_total={a_best}, gamma_audio={g_best} ...', flush=True)
    decode_test_threeclip(alpha_clip_total=a_best, gamma_audio=g_best, smooth_k=5, min_mult=0.7, out_csv='submission_clip3_poe_keep.csv')
    import shutil, os
    if os.path.exists('submission_clip3_poe_keep.csv'):
        shutil.copyfile('submission_clip3_poe_keep.csv', 'submission.csv')
        print('Staged submission: submission_clip3_poe_keep.csv -> submission.csv', flush=True)
else:
    print('No best found; skipping test decode.', flush=True)

Extracting CLIP embeddings for Depth and User (train then test)...


depth/train: saved 20 in 25.3s (last id=21)


depth/train: saved 40 in 52.0s (last id=41)


depth/train: saved 60 in 85.4s (last id=61)


depth/train: saved 80 in 116.2s (last id=81)


depth/train: saved 100 in 139.5s (last id=102)


depth/train: saved 120 in 162.6s (last id=122)


depth/train: saved 140 in 186.4s (last id=142)


depth/train: saved 160 in 208.9s (last id=162)


depth/train: saved 180 in 231.6s (last id=182)


depth/train: saved 200 in 254.5s (last id=202)


depth/train: saved 220 in 277.2s (last id=222)


depth/train: saved 240 in 302.0s (last id=242)


depth/train: saved 260 in 325.6s (last id=262)


depth/train: saved 280 in 349.5s (last id=282)


depth train finished; new saved = 297 elapsed= 368.9 s


depth/test: saved 20 in 23.5s (last id=319)


depth/test: saved 40 in 49.5s (last id=340)


depth/test: saved 60 in 73.7s (last id=362)


depth/test: saved 80 in 98.4s (last id=383)


depth test finished; new saved = 92 elapsed= 112.6 s


user/train: saved 20 in 23.2s (last id=21)


user/train: saved 40 in 48.1s (last id=41)


user/train: saved 60 in 78.4s (last id=61)


user/train: saved 80 in 106.7s (last id=81)


user/train: saved 100 in 129.9s (last id=102)


user/train: saved 120 in 153.5s (last id=122)


user/train: saved 140 in 177.1s (last id=142)


user/train: saved 160 in 199.4s (last id=162)


user/train: saved 180 in 222.0s (last id=182)


user/train: saved 200 in 244.9s (last id=202)


user/train: saved 220 in 268.0s (last id=222)


user/train: saved 240 in 291.8s (last id=242)


user/train: saved 260 in 315.2s (last id=262)


user/train: saved 280 in 338.6s (last id=282)


user train finished; new saved = 297 elapsed= 357.5 s


user/test: saved 20 in 23.2s (last id=319)


user/test: saved 40 in 48.9s (last id=340)


user/test: saved 60 in 72.3s (last id=362)


user/test: saved 80 in 97.1s (last id=383)


user test finished; new saved = 92 elapsed= 111.5 s


Training linear heads for Depth(User) CLIP...


clipd fold=0 ep=1/3 loss=3.0267 elapsed=0.0s


clipd fold=0 ep=2/3 loss=2.9690 elapsed=0.1s


clipd fold=0 ep=3/3 loss=2.9393 elapsed=0.1s


  scaler = torch.cuda.amp.GradScaler(enabled=(device=='cuda'))


clipd fold=1 ep=1/3 loss=3.0168 elapsed=0.0s


clipd fold=1 ep=2/3 loss=2.9361 elapsed=0.1s


clipd fold=1 ep=3/3 loss=2.8959 elapsed=0.1s


clipd fold=2 ep=1/3 loss=2.9160 elapsed=0.0s


clipd fold=2 ep=2/3 loss=2.7133 elapsed=0.1s


clipd fold=2 ep=3/3 loss=2.6375 elapsed=0.1s


clipd head training + caching complete.


clipu fold=0 ep=1/3 loss=3.0197 elapsed=0.0s


clipu fold=0 ep=2/3 loss=2.9571 elapsed=0.1s


clipu fold=0 ep=3/3 loss=2.9267 elapsed=0.1s


  scaler = torch.cuda.amp.GradScaler(enabled=(device=='cuda'))


clipu fold=1 ep=1/3 loss=3.0157 elapsed=0.0s


clipu fold=1 ep=2/3 loss=2.9297 elapsed=0.1s


clipu fold=1 ep=3/3 loss=2.8900 elapsed=0.1s


clipu fold=2 ep=1/3 loss=2.9129 elapsed=0.0s


clipu fold=2 ep=2/3 loss=2.6939 elapsed=0.1s


clipu fold=2 ep=3/3 loss=2.6211 elapsed=0.1s


clipu head training + caching complete.


OOF grid for three-CLIP fusion (keep_len, temp parity)...


Best (worst, mean, alpha_clip_total, gamma_audio)= (10, 3.836734693877551, 0.35, 0.15)


Decoding TEST with alpha_clip_total=0.35, gamma_audio=0.15 ...


Decoded 20/95


Decoded 40/95


Decoded 60/95


Decoded 80/95


Decoded 95/95


Wrote submission_clip3_poe_keep.csv


Staged submission: submission_clip3_poe_keep.csv -> submission.csv


In [46]:
# Test-time Best-of-N over alpha_clip_total, gamma_audio, and visual weight splits (RGB/Depth/User) with keep_len PoE
import numpy as np, pandas as pd, time, os, shutil, json

def weighted_visual_avg_keep_len(ps: np.ndarray, pr: np.ndarray | None, pdp: np.ndarray | None, pu: np.ndarray | None, w_rgb: float, w_dep: float, w_usr: float):
    C,T = ps.shape
    acc = np.zeros((C,T), dtype=np.float32); wsum = np.zeros(T, dtype=np.float32)
    def add(pv, w):
        nonlocal acc, wsum
        if pv is None or w<=0: return
        sh = _find_best_shift(pv, ps, max_shift=15)
        if sh >= 0:
            L = min(pv.shape[1]-sh, T); ref_start=0; src_start=sh
        else:
            L = min(pv.shape[1], T+sh); ref_start=-sh; src_start=0
        if L >= 16:
            acc[:, ref_start:ref_start+L] += w * pv[:, src_start:src_start+L].astype(np.float32)
            wsum[ref_start:ref_start+L] += w
    add(pr,  w_rgb)
    add(pdp, w_dep)
    add(pu,  w_usr)
    mask = wsum > 0
    if not mask.any():
        return None
    v = np.zeros((C,T), dtype=np.float32)
    v[:, mask] = acc[:, mask] / np.maximum(wsum[mask][None, :], 1e-8)
    colsum = v.sum(axis=0, keepdims=True); colsum[:, ~mask] = 1.0
    v = v / np.clip(colsum, 1e-8, 1e8)
    return v.astype(np.float32)

def fuse_keep_len_skel_vis_audio(ps: np.ndarray, pvis: np.ndarray | None, pa: np.ndarray | None, alpha_vis_total: float, gamma_audio: float) -> np.ndarray:
    C,T = ps.shape
    logp = np.log(np.clip(ps, 1e-8, 1.0)).astype(np.float32)
    w_s = np.ones(T, dtype=np.float32)
    if pvis is not None:
        mask_v = (pvis.sum(axis=0) > 0)
        logp[:, mask_v] += alpha_vis_total * np.log(np.clip(pvis[:, mask_v], 1e-8, 1.0))
        w_s[mask_v] -= alpha_vis_total
    if pa is not None:
        sh = _find_best_shift(pa, ps, max_shift=15)
        if sh >= 0:
            L = min(pa.shape[1]-sh, T); ref_start=0; src_start=sh
        else:
            L = min(pa.shape[1], T+sh); ref_start=-sh; src_start=0
        if L >= 16:
            logp[:, ref_start:ref_start+L] += gamma_audio * np.log(np.clip(pa[:, src_start:src_start+L], 1e-8, 1.0))
            w_s[ref_start:ref_start+L] -= gamma_audio
    w_s = np.clip(w_s, 0.0, 1.0)[None, :]
    logp = w_s * logp
    q = np.exp(logp); q /= (q.sum(axis=0, keepdims=True) + 1e-8)
    return q.astype(np.float32)

def total_emission_loglik(y: np.ndarray, p: np.ndarray) -> float:
    C,T = p.shape
    idx = np.clip(y, 0, C-1).astype(np.int32)
    cols = np.arange(T, dtype=np.int32)
    probs = np.clip(p[idx, cols], 1e-12, 1.0)
    return float(np.log(probs).sum())

def decode_test_threeclip_bestofN(out_csv='submission_clip3_bestofN.csv',
                                  alpha_tot_list=(0.35,0.40,0.45), gamma_list=(0.20,0.25),
                                  splits=((0.60,0.25,0.15),(0.50,0.30,0.20),(0.70,0.20,0.10), None),
                                  smooth_k=5, min_mult=0.7):
    folds = json.load(open('folds_archive_cv.json','r'))
    all_train_ids = sorted({int(x) for fd in folds for x in fd['train_ids']})
    med = compute_min_dur_from_ids(all_train_ids); min_dur = np.floor(med*min_mult+0.5).astype(np.int32); min_dur[0]=0
    test_ids = sorted(pd.read_csv('test.csv')['Id'].astype(int).tolist())
    # audio mean temp for test
    Ta_list = [_load_temp_num(f'audio_temp_fold{f}.json') for f in (0,1,2)]
    Ta_vals = [t for t in Ta_list if t is not None]
    Ta_mean = float(np.mean(Ta_vals)) if len(Ta_vals)>0 else None
    ids=[]; rows=[]; t0=time.time(); n=0
    for sid in test_ids:
        sid=int(sid)
        ps = load_skeleton_probs(sid)
        pr = load_clip_probs_test_avg(sid)            # RGB CLIP (per-fold temp applied)
        pdp = load_clipd_probs_test_avg(sid)          # Depth CLIP
        pu  = load_clipu_probs_test_avg(sid)          # User CLIP
        pa  = load_probs_generic(sid, 'audio')
        if pa is not None and Ta_mean is not None:
            pa = temp_scale_scalar(pa, Ta_mean)
        best_ll=-1e99; best_seq=None
        for a_tot in alpha_tot_list:
            for ga in gamma_list:
                if a_tot + ga > 0.60: continue
                for sp in splits:
                    if sp is None:
                        # equal average using existing helper
                        pvis = visual_avg_keep_len(ps, [pr, pdp, pu])
                    else:
                        wr,wd,wu = sp
                        # scale weights to sum to 1 over available streams
                        avail = [(pr,wr),(pdp,wd),(pu,wu)]
                        s = sum(w for pv,w in avail if pv is not None)
                        if s<=0: pvis=None
                        else:
                            wr2 = (wr/s) if pr  is not None else 0.0
                            wd2 = (wd/s) if pdp is not None else 0.0
                            wu2 = (wu/s) if pu  is not None else 0.0
                            pvis = weighted_visual_avg_keep_len(ps, pr, pdp, pu, wr2, wd2, wu2)
                    pf = fuse_keep_len_skel_vis_audio(ps, pvis, pa, a_tot, ga)
                    pf = smooth_probs_box(pf, k=smooth_k)
                    y = decode_minseg(pf, min_dur.copy()); y = aba_collapse(y, max_len=2)
                    ll = total_emission_loglik(y, pf)
                    if ll > best_ll:
                        best_ll = ll
                        seq = make_perm20(compress_to_sequence(y), pf)
                        best_seq = seq
        ids.append(sid); rows.append(' '.join(map(str, best_seq))); n+=1
        if (n%20)==0 or n==95: print(f'Decoded {n}/95 elapsed={time.time()-t0:.1f}s', flush=True)
    sub = pd.DataFrame({'Id': ids, 'Sequence': rows}).sort_values('Id')
    sub.to_csv(out_csv, index=False); print('Wrote', out_csv, 'rows=', len(sub))
    assert len(sub)==95, f'Expected 95 rows, got {len(sub)}'
    shutil.copyfile(out_csv, 'submission.csv')
    print(f'Staged submission: {out_csv} -> submission.csv', flush=True)

print('Decoding TEST Best-of-N over (alpha_clip_total, gamma_audio, visual splits)...', flush=True)
decode_test_threeclip_bestofN(out_csv='submission_clip3_bestofN.csv',
                               alpha_tot_list=(0.35,0.40,0.45), gamma_list=(0.20,0.25),
                               splits=((0.60,0.25,0.15),(0.50,0.30,0.20),(0.70,0.20,0.10), None),
                               smooth_k=5, min_mult=0.7)

Decoding TEST Best-of-N over (alpha_clip_total, gamma_audio, visual splits)...


Decoded 20/95 elapsed=1.7s


Decoded 40/95 elapsed=3.5s


Decoded 60/95 elapsed=5.1s


Decoded 80/95 elapsed=6.6s


Decoded 95/95 elapsed=7.6s


Wrote submission_clip3_bestofN.csv rows= 95
Staged submission: submission_clip3_bestofN.csv -> submission.csv


In [59]:
# PANNs (CNN14) audio embeddings -> linear head -> OOF gamma grid -> test decode (keep_len PoE) and stage submission
import sys, subprocess, os, time, json, numpy as np, pandas as pd, ssl, urllib.request
from pathlib import Path

# 0) Ensure PANNs labels CSV and checkpoint exist without using wget (apt is unavailable).
PANN_DIR = '/app/panns_data'
CKPT_PATH = f'{PANN_DIR}/Cnn14_mAP=0.431.pth'
LABELS_CSV_URL = 'http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/class_labels_indices.csv'
CKPT_URLS = [
    'https://zenodo.org/record/3987831/files/Cnn14_mAP%3D0.431.pth?download=1',
    'https://zenodo.org/record/3987831/files/Cnn14_mAP=0.431.pth?download=1'
]

def _urlretrieve(url, dst):
    try:
        ctx = ssl.create_default_context()
        with urllib.request.urlopen(urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'}), context=ctx) as r, open(dst, 'wb') as f:
            f.write(r.read())
        return True
    except Exception as e:
        print('Download failed for', url, e, flush=True)
        return False

def ensure_panns_assets():
    os.makedirs(PANN_DIR, exist_ok=True)
    labels_csv = os.path.join(PANN_DIR, 'class_labels_indices.csv')
    if not os.path.exists(labels_csv):
        print('Downloading PANNs labels CSV...', flush=True)
        ok = _urlretrieve(LABELS_CSV_URL, labels_csv)
        if ok: print('Labels CSV ready at', labels_csv, flush=True)
    if not os.path.exists(CKPT_PATH):
        print('Downloading PANNs checkpoint to', CKPT_PATH, flush=True)
        for url in CKPT_URLS:
            if _urlretrieve(url, CKPT_PATH):
                print('Checkpoint downloaded from', url, flush=True)
                break
        if not os.path.exists(CKPT_PATH):
            raise RuntimeError('Failed to download PANNs checkpoint; cannot proceed.')

ensure_panns_assets()

# 1) Ensure deps
def ensure_audio_pkgs():
    try:
        import panns_inference, librosa, torchlibrosa  # noqa
        return
    except Exception as e:
        print('Installing audio deps...', e, flush=True)
    cmds = [
        [sys.executable, '-m', 'pip', 'install', '-q', 'panns-inference', 'torchlibrosa', 'librosa==0.10.1']
    ]
    for cmd in cmds:
        subprocess.run(cmd, check=True)
    print('Audio deps installed.', flush=True)

ensure_audio_pkgs()
import librosa
from panns_inference import AudioTagging
import torch, torch.nn as nn, torch.nn.functional as F
from torch.utils.data import Dataset

probs_cache = Path('probs_cache'); probs_cache.mkdir(exist_ok=True)
AUDIO_WAV_TR = Path('audio_wav/train'); AUDIO_WAV_TE = Path('audio_wav/test')
LABELS = Path('labels3d_v2/train')

# 1b) Create a single global AudioTagging instance (reuse across all files) to avoid repeated heavy init
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Creating global PANNs AudioTagging on', DEVICE, 'checkpoint:', CKPT_PATH, flush=True)
AT_GLOBAL = AudioTagging(checkpoint_path=CKPT_PATH, device=DEVICE)

# 2) Extract PANNs features on-demand and cache to audio_panns/{split}/{id}.npy
FEAT_DIR = Path('audio_panns'); (FEAT_DIR/'train').mkdir(parents=True, exist_ok=True); (FEAT_DIR/'test').mkdir(parents=True, exist_ok=True)

def _to_527(F: np.ndarray) -> np.ndarray:
    # ensure feature dim = 527 (AudioSet classes) by slicing/padding if needed
    if F.ndim!=2: F = np.atleast_2d(F.astype(np.float32))
    T,D = F.shape
    if D == 527: return F.astype(np.float32)
    if D > 527: return F[:, :527].astype(np.float32)
    # pad
    out = np.zeros((T, 527), dtype=np.float32); out[:, :D] = F.astype(np.float32); return out

def _parse_at_output(out):
    # panns_inference may return dict or tuple
    try:
        if isinstance(out, dict):
            fw = out.get('framewise_output', None)
            cw = out.get('clipwise_output', None)
            return fw, cw
        # tuple: try (clipwise, embedding, framewise) variants
        if isinstance(out, tuple):
            fw = None; cw = None
            for elem in out:
                arr = np.asarray(elem)
                if arr.ndim>=1:
                    if arr.shape[-1] == 527 and cw is None:
                        cw = arr
                    elif arr.shape[-1] != 527 and fw is None and arr.ndim==2:
                        # some builds put framewise as time x 527; safeguard
                        if arr.shape[-1] == 527: fw = arr
            return fw, cw
    except Exception:
        return None, None
    return None, None

@torch.no_grad()
def extract_panns_for_file(wav_path: Path, sr=32000, pool2=True) -> np.ndarray:
    try:
        y, sr_ = librosa.load(str(wav_path), sr=sr, mono=True)
        if y.size == 0:
            return np.zeros((1, 527), dtype=np.float32)
        feats = []
        seg_len = sr * 30
        y = y.astype(np.float32)
        for s in range(0, len(y), seg_len):
            seg = y[s:s+seg_len]
            if seg.size == 0: continue
            seg_b = seg[None, :]  # (1, samples)
            out = AT_GLOBAL.inference(seg_b)
            fw, cw = _parse_at_output(out)
            if fw is not None:
                fw_np = np.asarray(fw, dtype=np.float32)
                # If fw has batch, squeeze
                if fw_np.ndim == 3: fw_np = fw_np.reshape(-1, fw_np.shape[-1])
                # pool to ~5 Hz
                if pool2 and fw_np.ndim==2 and fw_np.shape[0] > 1:
                    Tm = (fw_np.shape[0]//2)*2
                    if Tm >= 2:
                        fw_np = fw_np[:Tm].reshape(Tm//2, 2, fw_np.shape[1]).mean(axis=1).astype(np.float32)
                feats.append(_to_527(fw_np))
            elif cw is not None:
                cw_np = np.asarray(cw, dtype=np.float32)
                if cw_np.ndim == 2: cw_np = cw_np[0]
                feats.append(_to_527(cw_np[None, :]))
        if not feats:
            return np.zeros((1, 527), dtype=np.float32)
        F = np.concatenate(feats, axis=0).astype(np.float32)
        return _to_527(F)
    except Exception as e:
        print('PANNs FAIL on', wav_path, e, flush=True)
        return np.zeros((1, 527), dtype=np.float32)

def extract_split_panns(split='train'):
    print(f'Skipping bulk PANNs extract for {split}; using on-the-fly extraction.', flush=True)

print('Preparing PANNs (CNN14) assets and skipping bulk extraction...', flush=True)
extract_split_panns('train')
extract_split_panns('test')

# 3) Train linear head (D->21) per fold on framewise PANNs features; cache OOF and per-fold TEST probs; save temps
class PannsDataset(Dataset):
    def __init__(self, ids):
        self.items=[]
        for sid in ids:
            sid=int(sid)
            p = FEAT_DIR/'train'/f'{sid}.npy'
            if not p.exists():
                wp = AUDIO_WAV_TR/f'{sid}.wav'
                if wp.exists():
                    X = extract_panns_for_file(wp)
                    np.save(p, X.astype(np.float32))
            if not p.exists():
                continue
            X = np.load(p).astype(np.float32)
            X = _to_527(X)  # coerce feature dim to 527
            y = np.load(LABELS/f"{sid}.npy").astype(np.int64)
            if X.shape[0] != len(y):
                idx = np.linspace(0, len(y)-1, X.shape[0]).round().astype(int)
                y = y[idx]
            self.items.append((sid, X, y))
    def __len__(self): return len(self.items)
    def __getitem__(self, i):
        sid,X,y = self.items[i]
        return sid, torch.from_numpy(X), torch.from_numpy(y)

class LinearHead(nn.Module):
    def __init__(self, in_dim):
        super().__init__()
        self.fc = nn.Linear(in_dim, 21)
    def forward(self, x):
        return self.fc(x)

@torch.no_grad()
def forward_logits_all(head, X_np: np.ndarray, bs: int = 8192):
    xb = torch.from_numpy(X_np.astype(np.float32)).to(DEVICE)
    outs=[]
    for i in range(0, xb.size(0), bs):
        outs.append(head(xb[i:i+bs]).float().cpu())
    return torch.cat(outs,0).numpy().astype(np.float32)

def fit_temperature_on_val(head, val_items):
    Xs=[]; Ys=[]
    with torch.no_grad():
        for sid,X,y in val_items:
            xb = torch.from_numpy(X.astype(np.float32)).to(DEVICE)
            outs=[]
            for i in range(0, xb.size(0), 8192):
                outs.append(head(xb[i:i+8192]).float())
            lg = torch.cat(outs,0); Xs.append(lg.cpu()); Ys.append(torch.from_numpy(y))
    X = torch.cat(Xs,0).to(DEVICE); Y = torch.cat(Ys,0).to(DEVICE)
    Tsc = torch.tensor(1.5, device=DEVICE, requires_grad=True)
    opt = torch.optim.LBFGS([Tsc], lr=0.01, max_iter=50)
    def closure():
        opt.zero_grad(); loss = F.cross_entropy(X / Tsc, Y, reduction='mean'); loss.backward(); return loss
    opt.step(closure)
    return float(Tsc.detach().cpu().item())

def _uniform_cap_frames(X: np.ndarray, Y: np.ndarray, cap: int = 100_000):
    n = X.shape[0]
    if n <= cap: return X, Y
    idx = np.linspace(0, n-1, cap).round().astype(int)
    return X[idx], Y[idx]

def train_panns_head_and_cache(folds_path='folds_archive_cv.json', epochs=1, lr=2e-3, wd=0.05):
    folds = json.load(open(folds_path,'r'))
    test_ids = pd.read_csv('test.csv')['Id'].astype(int).tolist()
    for fd in folds:
        fidx = int(fd['fold'])
        tr_ids = list(map(int, fd['train_ids'])); va_ids = list(map(int, fd['val_ids']))
        ds_tr = PannsDataset(tr_ids); ds_va = PannsDataset(va_ids)
        if len(ds_tr)==0 or len(ds_va)==0:
            print('Fold', fidx, 'no data; skipping.')
            continue
        Xtr = np.concatenate([X for _,X,_ in ds_tr.items], axis=0)
        Ytr = np.concatenate([y for *_,y in ds_tr.items], axis=0)
        Xtr, Ytr = _uniform_cap_frames(Xtr, Ytr, cap=100_000)
        in_dim = Xtr.shape[1]
        head = LinearHead(in_dim).to(DEVICE)
        opt = torch.optim.AdamW(head.parameters(), lr=lr, weight_decay=wd)
        scaler = torch.cuda.amp.GradScaler(enabled=(DEVICE=='cuda'))
        t0=time.time(); head.train()
        for ep in range(epochs):
            loss_sum=0.0; nb=0
            for i in range(0, Xtr.shape[0], 8192):
                xb = torch.from_numpy(Xtr[i:i+8192]).to(DEVICE)
                yb = torch.from_numpy(Ytr[i:i+8192]).to(DEVICE)
                opt.zero_grad(set_to_none=True)
                with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=(DEVICE=='cuda')):
                    lg = head(xb); loss = F.cross_entropy(lg, yb, label_smoothing=0.05)
                scaler.scale(loss).backward(); scaler.step(opt); scaler.update()
                loss_sum += float(loss.detach().cpu().item()); nb+=1
            print(f'panns fold={fidx} ep={ep+1}/{epochs} loss={loss_sum/max(nb,1):.4f} elapsed={time.time()-t0:.1f}s', flush=True)
        head.eval()
        with torch.no_grad():
            for sid,X,y in ds_va.items:
                lg = forward_logits_all(head, X, bs=8192)  # (T,21)
                p = torch.softmax(torch.from_numpy(lg), dim=1).numpy().astype(np.float32).T  # CxT
                np.save(probs_cache/f"{sid}_audio_panns.npy", p)
        Tval = fit_temperature_on_val(head, ds_va.items)
        json.dump({'T': Tval}, open(f'audio_panns_temp_fold{fidx}.json','w'))
        with torch.no_grad():
            for sid in test_ids:
                sid=int(sid)
                pth = FEAT_DIR/'test'/f"{sid}.npy"
                if not pth.exists():
                    wp = AUDIO_WAV_TE/f'{sid}.wav'
                    if wp.exists():
                        X = extract_panns_for_file(wp)
                        np.save(pth, X.astype(np.float32))
                if not pth.exists():
                    continue
                X = _to_527(np.load(pth).astype(np.float32))
                lg = forward_logits_all(head, X, bs=8192)
                p = torch.softmax(torch.from_numpy(lg)/Tval, dim=1).numpy().astype(np.float32).T
                np.save(probs_cache/f"{sid}_audio_panns_f{fidx}.npy", p)
    print('PANNs head training + caching complete.', flush=True)

print('Training PANNs linear head (fast: epochs=1)...', flush=True)
train_panns_head_and_cache(epochs=1, lr=2e-3, wd=0.05)

# 4) Fuse with skeleton + CLIP(RGB) using keep_len PoE; OOF gamma grid (alpha fixed 0.30) -> pick by worst then mean
def _load_temp_num(path: str):
    p = Path(path);
    if not p.exists(): return None
    try: return float(json.load(open(p,'r')).get('T', 1.0))
    except Exception:
        try: return float(open(p).read().strip())
        except Exception: return None

def load_audio_panns_train(sid:int):
    pth = probs_cache/f"{sid}_audio_panns.npy"
    return np.load(pth).astype(np.float32) if pth.exists() else None

def load_audio_panns_test_avg(sid:int):
    arr=[]
    for f in (0,1,2):
        pth = probs_cache/f"{sid}_audio_panns_f{f}.npy"
        if pth.exists(): arr.append(np.load(pth).astype(np.float32))
    if not arr: return None
    L = min(a.shape[1] for a in arr)
    q = np.mean([a[:, :L] for a in arr], axis=0); q /= (q.sum(axis=0, keepdims=True)+1e-8)
    return q.astype(np.float32)

def oof_grid_gamma_panns(alpha_clip_fixed=0.30, gamma_list=(0.18,0.20,0.22,0.25), smooth_k=5, min_mult=0.7):
    folds = json.load(open('folds_archive_cv.json','r'))
    results=[]
    for fd in folds:
        tr = list(map(int, fd['train_ids'])); va = list(map(int, fd['val_ids']))
        med = compute_min_dur_from_ids(tr); min_dur = np.floor(med*min_mult+0.5).astype(np.int32); min_dur[0]=0
        fidx = int(fd['fold'])
        Tclip = _load_temp_num(f'clip_temp_fold{fidx}.json')
        for g in gamma_list:
            d=[]
            for sid in va:
                sid=int(sid)
                ps = load_skeleton_probs(sid)
                pc = load_clip_probs_train(sid)
                pa = load_audio_panns_train(sid)
                if pc is not None and Tclip is not None: pc = temp_scale_scalar(pc, Tclip)
                if pa is None:
                    pa = load_probs_generic(sid, 'audio')
                pf = fuse_poe_with_clip_keep_len(ps, pc, pa, alpha_clip_fixed, g)
                pf = smooth_probs_box(pf, k=smooth_k)
                y = decode_minseg(pf, min_dur.copy()); y = aba_collapse(y, max_len=2)
                seq = compress_to_sequence(y); true = compress_to_sequence(np.load(LABELS/f"{sid}.npy"))
                n=len(seq); m=len(true)
                if n==0: d.append(m); continue
                dp=list(range(m+1))
                for i in range(1,n+1):
                    prev=dp[0]; dp[0]=i
                    for j in range(1,m+1):
                        tmp=dp[j]; cost=0 if seq[i-1]==true[j-1] else 1
                        dp[j]=min(dp[j]+1, dp[j-1]+1, prev+cost); prev=tmp
                d.append(dp[m])
            results.append((max(d), float(np.mean(d)), g, fidx))
    agg={}
    for worst, mean, g, fidx in results:
        agg.setdefault(g, []).append(mean)
    summary=[]
    for g, arr in agg.items():
        summary.append((max(arr), float(np.mean(arr)), g))
    summary.sort(key=lambda x: (x[0], x[1]))
    return summary[0]

print('OOF grid for gamma (PANNs audio) with alpha_clip=0.30 ...', flush=True)
best_g = oof_grid_gamma_panns(alpha_clip_fixed=0.30, gamma_list=(0.18,0.20,0.22,0.25), smooth_k=5, min_mult=0.7)
print('Best gamma (worst, mean, gamma)=', best_g, flush=True)
gamma_best = best_g[2]

# 5) Decode TEST with alpha_clip=0.30, gamma_panns=best; MinSeg with min_mult=0.70; stage
def decode_test_with_panns(alpha_clip=0.30, gamma_audio=0.20, smooth_k=5, min_mult=0.7, out_csv='submission_clip_panns_keep.csv'):
    folds = json.load(open('folds_archive_cv.json','r'))
    all_train_ids = sorted({int(x) for fd in folds for x in fd['train_ids']})
    med = compute_min_dur_from_ids(all_train_ids); min_dur = np.floor(med*min_mult+0.5).astype(np.int32); min_dur[0]=0
    test_ids = sorted(pd.read_csv('test.csv')['Id'].astype(int).tolist())
    ids,rows=[],[]; t0=time.time(); n=0
    for sid in test_ids:
        sid=int(sid)
        ps = load_skeleton_probs(sid)
        pc = load_clip_probs_test_avg(sid)  # already temp-averaged
        pa = load_audio_panns_test_avg(sid)
        if pa is None:
            pa = load_probs_generic(sid, 'audio')
        pf = fuse_poe_with_clip_keep_len(ps, pc, pa, alpha_clip, gamma_audio)
        pf = smooth_probs_box(pf, k=smooth_k)
        y = decode_minseg(pf, min_dur.copy()); y = aba_collapse(y, max_len=2)
        seq = make_perm20(compress_to_sequence(y), pf)
        ids.append(sid); rows.append(' '.join(map(str, seq))); n+=1
        if (n%20)==0 or n==95: print(f'Decoded {n}/95', flush=True)
    pd.DataFrame({'Id': ids, 'Sequence': rows}).sort_values('Id').to_csv(out_csv, index=False)
    print('Wrote', out_csv, flush=True)
    import shutil
    shutil.copyfile(out_csv, 'submission.csv')
    print(f'Staged submission: {out_csv} -> submission.csv', flush=True)

print('Decoding TEST with PANNs audio fusion...', flush=True)
decode_test_with_panns(alpha_clip=0.30, gamma_audio=gamma_best, smooth_k=5, min_mult=0.7, out_csv='submission_clip_panns_keep.csv')

Creating global PANNs AudioTagging on cuda checkpoint: /app/panns_data/Cnn14_mAP=0.431.pth


Checkpoint path: /app/panns_data/Cnn14_mAP=0.431.pth


GPU number: 1
Preparing PANNs (CNN14) assets and skipping bulk extraction...


Skipping bulk PANNs extract for train; using on-the-fly extraction.


Skipping bulk PANNs extract for test; using on-the-fly extraction.


Training PANNs linear head (fast: epochs=1)...


panns fold=0 ep=1/1 loss=3.0315 elapsed=0.0s


  checkpoint = torch.load(checkpoint_path, map_location=self.device)
  scaler = torch.cuda.amp.GradScaler(enabled=(DEVICE=='cuda'))


PANNs FAIL on audio_wav/test/318.wav Given input size: (512x1x8). Calculated output size: (512x0x4). Output size is too small


PANNs FAIL on audio_wav/test/340.wav Given input size: (1024x1x4). Calculated output size: (1024x0x2). Output size is too small


PANNs FAIL on audio_wav/test/343.wav Given input size: (512x1x8). Calculated output size: (512x0x4). Output size is too small


PANNs FAIL on audio_wav/test/369.wav Given input size: (1024x1x4). Calculated output size: (1024x0x2). Output size is too small


PANNs FAIL on audio_wav/test/377.wav Given input size: (1024x1x4). Calculated output size: (1024x0x2). Output size is too small


panns fold=1 ep=1/1 loss=3.0660 elapsed=0.0s


panns fold=2 ep=1/1 loss=3.0516 elapsed=0.0s


PANNs head training + caching complete.


OOF grid for gamma (PANNs audio) with alpha_clip=0.30 ...


Best gamma (worst, mean, gamma)= (4.45, 3.71847385418814, 0.18)


Decoding TEST with PANNs audio fusion...


Decoded 20/95


Decoded 40/95


Decoded 60/95


Decoded 80/95


Decoded 95/95


Wrote submission_clip_panns_keep.csv


Staged submission: submission_clip_panns_keep.csv -> submission.csv


In [53]:
# OpenCLIP ViT-L/14 RGB features (fps=4, max_frames=512) -> linear head -> keep_len PoE with audio -> OOF tune -> TEST decode + stage
import os, sys, subprocess, time, json, numpy as np, pandas as pd
from pathlib import Path

# Ensure deps (torch cu121 stack, open_clip_torch, decord already installed earlier in Cell 18)
import torch, torchvision.transforms as T
import open_clip
from decord import VideoReader, cpu
from PIL import Image

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('CUDA available:', torch.cuda.is_available(), 'GPU:', torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU', flush=True)

# Dirs for ViT-L/14 embeddings
EMB_L = Path('rgb_clipL_embed'); (EMB_L/'train').mkdir(parents=True, exist_ok=True); (EMB_L/'test').mkdir(parents=True, exist_ok=True)
VID_DIR_T = Path('rgb_videos/train'); VID_DIR_E = Path('rgb_videos/test')
PROBS = Path('probs_cache'); PROBS.mkdir(exist_ok=True)
LABELS = Path('labels3d_v2/train')

# Reuse skeleton utilities from earlier cells
try:
    folds
except NameError:
    folds = json.load(open('folds_archive_cv.json','r'))

# Load OpenCLIP ViT-L/14
modelL, _, txL = open_clip.create_model_and_transforms('ViT-L-14', pretrained='laion2b_s32b_b82k', device=device)
modelL.eval()
mean = (0.48145466, 0.4578275, 0.40821073); std = (0.26862954, 0.26130258, 0.27577711)
tx_img = T.Compose([T.Resize(224, interpolation=T.InterpolationMode.BICUBIC), T.CenterCrop(224), T.ToTensor(), T.Normalize(mean, std)])

def sample_idx(nf, fps_native, fps_target=4.0, max_frames=512):
    if not fps_native or fps_native<=0: n = min(max_frames, nf)
    else:
        dur = nf/float(fps_native); n = min(max_frames, int(round(dur*fps_target)))
    n = max(1, min(n, nf))
    return np.linspace(0, nf-1, n).round().astype(int)

@torch.no_grad()
def encode_video_L(path: Path, max_frames=512, bs=64):
    vr = VideoReader(str(path), ctx=cpu(0))
    nf = len(vr)
    try: fps_native = float(vr.get_avg_fps())
    except Exception: fps_native = None
    idx = sample_idx(nf, fps_native, 4.0, max_frames)
    embs=[]
    for i in range(0, len(idx), bs):
        frames = vr.get_batch(idx[i:i+bs]).asnumpy()
        imgs = [tx_img(Image.fromarray(fr)) for fr in frames]
        x = torch.stack(imgs,0).to(device, non_blocking=True)
        with torch.autocast(device_type='cuda', dtype=torch.float16) if device=='cuda' else torch.no_grad():
            f = modelL.encode_image(x)
        f = torch.nn.functional.normalize(f.float(), dim=1).cpu().numpy()
        embs.append(f)
    E = np.concatenate(embs,0).astype(np.float16)
    return E

def id_to_video(dirpath: Path, sid: int):
    cands = list(dirpath.glob(f'{sid}.mp4'))
    if cands: return cands[0]
    cands = list(dirpath.glob(f'*{sid}*.mp4'))
    return cands[0] if cands else None

def extract_split_L(split='train', max_frames=512):
    csv_path = 'training.csv' if split=='train' else 'test.csv'
    ids = pd.read_csv(csv_path)['Id'].astype(int).tolist()
    out_dir = EMB_L/split
    vid_dir = VID_DIR_T if split=='train' else VID_DIR_E
    done=0; t0=time.time()
    for k,sid in enumerate(ids, 1):
        out = out_dir/f"{sid}.npy"
        if out.exists():
            continue
        vp = id_to_video(vid_dir, sid)
        if vp is None:
            continue
        try:
            E = encode_video_L(vp, max_frames=max_frames, bs=64)
            np.save(out, E)
            done+=1
            if (done%20)==0: print(f"{split}: saved {done} in {time.time()-t0:.1f}s (last id={sid})", flush=True)
        except Exception as e:
            print('FAIL ViT-L', split, sid, e, flush=True)
    print(split, 'ViT-L finished; new saved =', done, 'elapsed=', round(time.time()-t0,1), 's', flush=True)

print('Extracting ViT-L/14 embeddings (train then test)...', flush=True)
extract_split_L('train', max_frames=512)
extract_split_L('test',  max_frames=512)

# Train linear head (D->21) on ViT-L embeddings; cache OOF and per-fold TEST with temp scaling
import torch.nn as nn, torch.nn.functional as F
from torch.utils.data import Dataset

def load_clipL_embed(sid: int, split: str):
    p = (EMB_L/'train'/f'{sid}.npy') if split=='train' else (EMB_L/'test'/f'{sid}.npy')
    if not p.exists(): return None
    return np.load(p)

def resample_labels(y, T):
    if len(y)==T: return y
    idx = np.linspace(0, len(y)-1, T).round().astype(int)
    return y[idx]

class SeqDatasetL(Dataset):
    def __init__(self, ids):
        self.items=[]
        for sid in ids:
            E = load_clipL_embed(int(sid), 'train')
            if E is None: continue
            y = np.load(LABELS/f"{int(sid)}.npy").astype(np.int64)
            y = resample_labels(y, E.shape[0])
            self.items.append((int(sid), E.astype(np.float32), y))
    def __len__(self): return len(self.items)
    def __getitem__(self, i):
        sid,E,y = self.items[i]
        return sid, torch.from_numpy(E), torch.from_numpy(y)

class LinearHead(nn.Module):
    def __init__(self, in_dim): super().__init__(); self.fc = nn.Linear(in_dim, 21)
    def forward(self, x): return self.fc(x)

@torch.no_grad()
def forward_logits_all(head, E_np: np.ndarray, bs: int = 2048):
    xb = torch.from_numpy(E_np.astype(np.float32)).to(device)
    outs=[]
    for i in range(0, xb.size(0), bs):
        outs.append(head(xb[i:i+bs]).float().cpu())
    return torch.cat(outs,0).numpy().astype(np.float32)

def fit_temperature(head, val_items):
    Xs=[]; Ys=[]
    with torch.no_grad():
        for sid,E,y in val_items:
            xb = torch.from_numpy(E.astype(np.float32)).to(device)
            outs=[]
            for i in range(0, xb.size(0), 2048):
                outs.append(head(xb[i:i+2048]).float())
            lg = torch.cat(outs,0); Xs.append(lg.cpu()); Ys.append(torch.from_numpy(y))
    X = torch.cat(Xs,0).to(device); Y = torch.cat(Ys,0).to(device)
    Tsc = torch.tensor(1.5, device=device, requires_grad=True)
    opt = torch.optim.LBFGS([Tsc], lr=0.01, max_iter=50)
    def closure():
        opt.zero_grad(); loss = F.cross_entropy(X / Tsc, Y, reduction='mean'); loss.backward(); return loss
    opt.step(closure)
    return float(Tsc.detach().cpu().item())

def train_clipL_head_and_cache(folds_path='folds_archive_cv.json', epochs=3, bs_frames=2048, lr=2e-3, wd=0.05):
    folds = json.load(open(folds_path,'r'))
    test_ids = pd.read_csv('test.csv')['Id'].astype(int).tolist()
    for fd in folds:
        fidx = int(fd['fold'])
        tr_ids = list(map(int, fd['train_ids'])); va_ids = list(map(int, fd['val_ids']))
        ds_tr = SeqDatasetL(tr_ids); ds_va = SeqDatasetL(va_ids)
        if len(ds_tr)==0 or len(ds_va)==0: continue
        Xtr = np.concatenate([E for _,E,_ in ds_tr.items], axis=0)
        Ytr = np.concatenate([y for *_,y in ds_tr.items], axis=0)
        chunks = [(Xtr[i:i+bs_frames], Ytr[i:i+bs_frames]) for i in range(0, Xtr.shape[0], bs_frames)]
        in_dim = Xtr.shape[1]
        head = LinearHead(in_dim).to(device)
        opt = torch.optim.AdamW(head.parameters(), lr=lr, weight_decay=wd)
        scaler = torch.cuda.amp.GradScaler(enabled=(device=='cuda'))
        t0=time.time(); head.train()
        for ep in range(epochs):
            loss_sum=0.0; nb=0
            for xb_np,yb_np in chunks:
                xb = torch.from_numpy(xb_np).to(device); yb = torch.from_numpy(yb_np).to(device)
                opt.zero_grad(set_to_none=True)
                with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=(device=='cuda')):
                    lg = head(xb); loss = F.cross_entropy(lg, yb, label_smoothing=0.05)
                scaler.scale(loss).backward(); scaler.step(opt); scaler.update()
                loss_sum += float(loss.detach().cpu().item()); nb+=1
            print(f"ViT-L fold={fidx} ep={ep+1}/{epochs} loss={loss_sum/max(nb,1):.4f} elapsed={time.time()-t0:.1f}s", flush=True)
        head.eval()
        # Cache OOF probs for validation ids
        with torch.no_grad():
            for sid,E,y in ds_va.items:
                lg = forward_logits_all(head, E, bs=2048)  # (T,21)
                p = torch.softmax(torch.from_numpy(lg), dim=1).numpy().astype(np.float32).T  # CxT
                np.save(PROBS/f"{sid}_clipL.npy", p)
        # Temp scaling on validation
        Tval = fit_temperature(head, ds_va.items)
        json.dump({'T': Tval}, open(f'clipL_temp_fold{fidx}.json','w'))
        # Test per-fold probs (temp-scaled)
        with torch.no_grad():
            for sid in test_ids:
                E = load_clipL_embed(int(sid), 'test')
                if E is None: continue
                lg = forward_logits_all(head, E, bs=2048)  # (T,21)
                p = torch.softmax(torch.from_numpy(lg)/Tval, dim=1).numpy().astype(np.float32).T  # CxT
                np.save(PROBS/f"{sid}_clipL_f{fidx}.npy", p)
    print('ViT-L head training + caching complete.', flush=True)

print('Training ViT-L head (3 folds)...', flush=True)
train_clipL_head_and_cache(epochs=3, bs_frames=2048, lr=2e-3, wd=0.05)

# Loaders for ViT-L probs
def _load_temp_num(path: str):
    p = Path(path);
    if not p.exists(): return None
    try: return float(json.load(open(p,'r')).get('T', 1.0))
    except Exception:
        try: return float(open(p).read().strip())
        except Exception: return None

def load_clipL_probs_train(sid:int):
    pth = PROBS/f"{sid}_clipL.npy"
    return np.load(pth).astype(np.float32) if pth.exists() else None

def load_clipL_probs_test_avg(sid:int):
    arr=[]
    for f in (0,1,2):
        pth = PROBS/f"{sid}_clipL_f{f}.npy"
        if pth.exists(): arr.append(np.load(pth).astype(np.float32))
    if not arr: return None
    L = min(a.shape[1] for a in arr)
    q = np.mean([a[:, :L] for a in arr], axis=0); q /= (q.sum(axis=0, keepdims=True)+1e-8)
    return q.astype(np.float32)

# Reuse fusion utilities: load_skeleton_probs, load_probs_generic, fuse_poe_with_clip_keep_len, smooth_probs_box, decode_minseg, aba_collapse, compress_to_sequence, make_perm20, compute_min_dur_from_ids

def oof_grid_clipL(alpha_list=(0.30,0.35,0.40,0.45), gamma_list=(0.15,0.20,0.25), smooth_k=5, min_mult=0.7):
    folds = json.load(open('folds_archive_cv.json','r'))
    results=[]
    for fd in folds:
        tr = list(map(int, fd['train_ids'])); va = list(map(int, fd['val_ids']))
        med = compute_min_dur_from_ids(tr); min_dur = np.floor(med*min_mult+0.5).astype(np.int32); min_dur[0]=0
        fidx = int(fd['fold'])
        Tclip = _load_temp_num(f'clipL_temp_fold{fidx}.json')
        Taud  = _load_temp_num(f'audio_temp_fold{fidx}.json')
        for a in alpha_list:
            for g in gamma_list:
                d=[]
                for sid in va:
                    sid=int(sid)
                    ps = load_skeleton_probs(sid)
                    pc = load_clipL_probs_train(sid)
                    pa = load_probs_generic(sid, 'audio')
                    if pc is not None and Tclip is not None: pc = temp_scale_scalar(pc, Tclip)
                    if pa is not None and Taud  is not None: pa = temp_scale_scalar(pa,  Taud)
                    pf = fuse_poe_with_clip_keep_len(ps, pc, pa, a, g)
                    pf = smooth_probs_box(pf, k=smooth_k)
                    y = decode_minseg(pf, min_dur.copy()); y = aba_collapse(y, max_len=2)
                    seq = compress_to_sequence(y); true = compress_to_sequence(np.load(LABELS/f"{sid}.npy"))
                    # Levenshtein
                    n=len(seq); m=len(true)
                    if n==0: d.append(m); continue
                    dp=list(range(m+1))
                    for i in range(1,n+1):
                        prev=dp[0]; dp[0]=i
                        for j in range(1,m+1):
                            tmp=dp[j]; cost=0 if seq[i-1]==true[j-1] else 1
                            dp[j]=min(dp[j]+1, dp[j-1]+1, prev+cost); prev=tmp
                    d.append(dp[m])
                results.append((max(d), float(np.mean(d)), a, g))
    results.sort(key=lambda x: (x[0], x[1]))
    return results[0]

def decode_test_clipL(alpha_clip=0.35, gamma_audio=0.20, smooth_k=5, min_mult=0.7, out_csv='submission_clipL_poe_keep.csv'):
    folds = json.load(open('folds_archive_cv.json','r'))
    all_train_ids = sorted({int(x) for fd in folds for x in fd['train_ids']})
    med = compute_min_dur_from_ids(all_train_ids); min_dur = np.floor(med*min_mult+0.5).astype(np.int32); min_dur[0]=0
    test_ids = sorted(pd.read_csv('test.csv')['Id'].astype(int).tolist())
    Ta_list = [_load_temp_num(f'audio_temp_fold{f}.json') for f in (0,1,2)]
    Ta_vals = [t for t in Ta_list if t is not None]
    Ta_mean = float(np.mean(Ta_vals)) if len(Ta_vals)>0 else None
    ids,rows=[],[]; t0=time.time(); n=0
    for sid in test_ids:
        sid=int(sid)
        ps = load_skeleton_probs(sid)
        pc = load_clipL_probs_test_avg(sid)
        pa = load_probs_generic(sid, 'audio')
        if pa is not None and Ta_mean is not None: pa = temp_scale_scalar(pa, Ta_mean)
        pf = fuse_poe_with_clip_keep_len(ps, pc, pa, alpha_clip, gamma_audio)
        pf = smooth_probs_box(pf, k=smooth_k)
        y = decode_minseg(pf, min_dur.copy()); y = aba_collapse(y, max_len=2)
        seq = make_perm20(compress_to_sequence(y), pf)
        ids.append(sid); rows.append(' '.join(map(str, seq))); n+=1
        if (n%20)==0 or n==95: print(f'Decoded {n}/95', flush=True)
    pd.DataFrame({'Id': ids, 'Sequence': rows}).sort_values('Id').to_csv(out_csv, index=False)
    print('Wrote', out_csv, flush=True)
    import shutil
    shutil.copyfile(out_csv, 'submission.csv')
    print(f'Staged submission: {out_csv} -> submission.csv', flush=True)

print('OOF grid for ViT-L keep_len fusion...', flush=True)
best = oof_grid_clipL(alpha_list=(0.30,0.35,0.40,0.45), gamma_list=(0.15,0.20,0.25), smooth_k=5, min_mult=0.7)
print('Best (worst, mean, alpha_clip, gamma_audio)=', best, flush=True)
_,_,alphaL,gammaA = best
print(f'Decoding TEST keep_len with ViT-L: alpha={alphaL}, gamma={gammaA} ...', flush=True)
decode_test_clipL(alpha_clip=alphaL, gamma_audio=gammaA, smooth_k=5, min_mult=0.7, out_csv='submission_clipL_poe_keep.csv')

CUDA available: True GPU: NVIDIA A10-24Q


Extracting ViT-L/14 embeddings (train then test)...


train ViT-L finished; new saved = 0 elapsed= 0.0 s


test ViT-L finished; new saved = 0 elapsed= 0.0 s


Training ViT-L head (3 folds)...


ViT-L fold=0 ep=1/3 loss=3.0236 elapsed=0.0s


ViT-L fold=0 ep=2/3 loss=2.9589 elapsed=0.1s


ViT-L fold=0 ep=3/3 loss=2.9224 elapsed=0.1s


  scaler = torch.cuda.amp.GradScaler(enabled=(device=='cuda'))


ViT-L fold=1 ep=1/3 loss=3.0183 elapsed=0.0s


ViT-L fold=1 ep=2/3 loss=2.9263 elapsed=0.1s


ViT-L fold=1 ep=3/3 loss=2.8792 elapsed=0.1s


ViT-L fold=2 ep=1/3 loss=2.9162 elapsed=0.0s


ViT-L fold=2 ep=2/3 loss=2.7021 elapsed=0.1s


ViT-L fold=2 ep=3/3 loss=2.6253 elapsed=0.1s


ViT-L head training + caching complete.


OOF grid for ViT-L keep_len fusion...


Best (worst, mean, alpha_clip, gamma_audio)= (10, 3.836734693877551, 0.4, 0.15)


Decoding TEST keep_len with ViT-L: alpha=0.4, gamma=0.15 ...


Decoded 20/95


Decoded 40/95


Decoded 60/95


Decoded 80/95


Decoded 95/95


Wrote submission_clipL_poe_keep.csv


Staged submission: submission_clipL_poe_keep.csv -> submission.csv


In [89]:
# Best-of-N (keep_len PoE) over alpha_clip, gamma_audio, and min_mult in {0.65,0.70} for CLIP(RGB)+Audio; stages submission
import numpy as np, pandas as pd, time, os, shutil, json

def total_emission_loglik(y: np.ndarray, p: np.ndarray) -> float:
    C,T = p.shape
    idx = np.clip(y, 0, C-1).astype(np.int32)
    cols = np.arange(T, dtype=np.int32)
    probs = np.clip(p[idx, cols], 1e-12, 1.0)
    return float(np.log(probs).sum())

def decode_test_clip_bestofN_keep_len_minmult(out_csv='submission_clip_poe_keep_bestofN_minmult.csv',
                                             alpha_list=(0.28,0.30,0.32,0.34), gamma_list=(0.18,0.20,0.22,0.24),
                                             min_mult_list=(0.65,0.70), smooth_k=5):
    folds = json.load(open('folds_archive_cv.json','r'))
    all_train_ids = sorted({int(x) for fd in folds for x in fd['train_ids']})
    med = compute_min_dur_from_ids(all_train_ids)
    test_ids = sorted(pd.read_csv('test.csv')['Id'].astype(int).tolist())
    # average audio temperature over folds
    def _load_temp_num(path: str):
        p = Path(path);
        if not p.exists(): return None
        try: return float(json.load(open(p,'r')).get('T', 1.0))
        except Exception:
            try: return float(open(p).read().strip())
            except Exception: return None
    Ta_list = [_load_temp_num(f'audio_temp_fold{f}.json') for f in (0,1,2)]
    Ta_vals = [t for t in Ta_list if t is not None]
    Ta_mean = float(np.mean(Ta_vals)) if len(Ta_vals)>0 else None
    ids=[]; rows=[]; t0=time.time(); n=0
    for sid in test_ids:
        sid=int(sid)
        ps = load_skeleton_probs(sid)
        pc = load_clip_probs_test_avg(sid)  # per-fold temp already applied
        pa = load_probs_generic(sid, 'audio')
        if pa is not None and Ta_mean is not None:
            pa = temp_scale_scalar(pa, Ta_mean)
        best_ll = -1e99; best_seq=None
        for ac in alpha_list:
            for ga in gamma_list:
                pf = fuse_poe_with_clip_keep_len(ps, pc, pa, ac, ga)
                pf = smooth_probs_box(pf, k=smooth_k)
                for mm in min_mult_list:
                    min_dur = np.floor(med*mm + 0.5).astype(np.int32); min_dur[0]=0
                    y = decode_minseg(pf, min_dur.copy()); y = aba_collapse(y, max_len=2)
                    ll = total_emission_loglik(y, pf)
                    if ll > best_ll:
                        best_ll = ll
                        seq = make_perm20(compress_to_sequence(y), pf)
                        best_seq = seq
        ids.append(sid); rows.append(' '.join(map(str, best_seq))); n+=1
        if (n%20)==0 or n==95: print(f'Decoded {n}/95 elapsed={time.time()-t0:.1f}s', flush=True)
    sub = pd.DataFrame({'Id': ids, 'Sequence': rows}).sort_values('Id')
    sub.to_csv(out_csv, index=False); print('Wrote', out_csv, 'rows=', len(sub))
    assert len(sub)==95, f'Expected 95 rows, got {len(sub)}'
    shutil.copyfile(out_csv, 'submission.csv')
    print(f'Staged submission: {out_csv} -> submission.csv', flush=True)

print('Decoding TEST: Best-of-N over (alpha_clip, gamma_audio, min_mult) for CLIP+Audio keep_len...', flush=True)
decode_test_clip_bestofN_keep_len_minmult(out_csv='submission_clip_poe_keep_bestofN_minmult.csv',
                                          alpha_list=(0.28,0.30,0.32,0.34), gamma_list=(0.18,0.20,0.22,0.24),
                                          min_mult_list=(0.65,0.70), smooth_k=5)

Decoding TEST: Best-of-N over (alpha_clip, gamma_audio, min_mult) for CLIP+Audio keep_len...


Decoded 20/95 elapsed=2.1s


Decoded 40/95 elapsed=4.5s


Decoded 60/95 elapsed=6.3s


Decoded 80/95 elapsed=8.0s


Decoded 95/95 elapsed=9.2s


Wrote submission_clip_poe_keep_bestofN_minmult.csv rows= 95
Staged submission: submission_clip_poe_keep_bestofN_minmult.csv -> submission.csv


In [67]:
# Hedge decodes for PANNs gamma +/- 0.02; restage OOF-best (g=0.18) at the end
import shutil, os
print('Decoding PANNs hedges gamma=0.16 and gamma=0.20 ...', flush=True)
try:
    decode_test_with_panns(alpha_clip=0.30, gamma_audio=0.16, smooth_k=5, min_mult=0.7, out_csv='submission_clip_panns_keep_g016.csv')
except Exception as e:
    print('Gamma 0.16 decode failed:', e, flush=True)
try:
    decode_test_with_panns(alpha_clip=0.30, gamma_audio=0.20, smooth_k=5, min_mult=0.7, out_csv='submission_clip_panns_keep_g020.csv')
except Exception as e:
    print('Gamma 0.20 decode failed:', e, flush=True)
# Restore OOF-best staged (gamma=0.18) to submission.csv
if os.path.exists('submission_clip_panns_keep.csv'):
    shutil.copyfile('submission_clip_panns_keep.csv', 'submission.csv')
    print('Restaged OOF-best: submission_clip_panns_keep.csv -> submission.csv', flush=True)
else:
    print('Warning: submission_clip_panns_keep.csv missing; cannot restage.', flush=True)

Decoding PANNs hedges gamma=0.16 and gamma=0.20 ...


Decoded 20/95


Decoded 40/95


Decoded 60/95


Decoded 80/95


Decoded 95/95


Wrote submission_clip_panns_keep_g016.csv


Staged submission: submission_clip_panns_keep_g016.csv -> submission.csv


Decoded 20/95


Decoded 40/95


Decoded 60/95


Decoded 80/95


Decoded 95/95


Wrote submission_clip_panns_keep_g020.csv


Staged submission: submission_clip_panns_keep_g020.csv -> submission.csv


Restaged OOF-best: submission_clip_panns_keep.csv -> submission.csv


In [61]:
# Stage PANNs hedge gamma=0.20
import shutil, os
src = 'submission_clip_panns_keep_g020.csv'
dst = 'submission.csv'
assert os.path.exists(src), f'Missing {src}'
shutil.copyfile(src, dst)
print(f'Staged PANNs hedge submission: {src} -> {dst}')

Staged PANNs hedge submission: submission_clip_panns_keep_g020.csv -> submission.csv


In [99]:
# Best-of-N over (alpha_clip, gamma_panns, min_mult) for PANNs fusion; TEST DECODE; stage result
import numpy as np, pandas as pd, time, os, shutil, json

def total_emission_loglik(y: np.ndarray, p: np.ndarray) -> float:
    C,T = p.shape
    idx = np.clip(y, 0, C-1).astype(np.int32)
    cols = np.arange(T, dtype=np.int32)
    probs = np.clip(p[idx, cols], 1e-12, 1.0)
    return float(np.log(probs).sum())

def decode_test_panns_bestofN_keep_len(out_csv='submission_clip_panns_keep_bestofN_minmult.csv',
                                       alpha_list=(0.28,0.30,0.32), gamma_list=(0.16,0.18,0.20,0.22),
                                       min_mult_list=(0.65,0.70), smooth_k=5):
    folds = json.load(open('folds_archive_cv.json','r'))
    all_train_ids = sorted({int(x) for fd in folds for x in fd['train_ids']})
    med = compute_min_dur_from_ids(all_train_ids)
    test_ids = sorted(pd.read_csv('test.csv')['Id'].astype(int).tolist())
    ids=[]; rows=[]; t0=time.time(); n=0
    for sid in test_ids:
        sid=int(sid)
        ps = load_skeleton_probs(sid)
        pc = load_clip_probs_test_avg(sid)  # CLIP per-fold temp already applied
        pa = load_audio_panns_test_avg(sid)
        if pa is None:
            pa = load_probs_generic(sid, 'audio')
        best_ll = -1e99; best_seq=None
        for ac in alpha_list:
            for ga in gamma_list:
                pf = fuse_poe_with_clip_keep_len(ps, pc, pa, ac, ga)
                pf = smooth_probs_box(pf, k=smooth_k)
                for mm in min_mult_list:
                    min_dur = np.floor(med*mm + 0.5).astype(np.int32); min_dur[0]=0
                    y = decode_minseg(pf, min_dur.copy()); y = aba_collapse(y, max_len=2)
                    ll = total_emission_loglik(y, pf)
                    if ll > best_ll:
                        best_ll = ll
                        seq = make_perm20(compress_to_sequence(y), pf)
                        best_seq = seq
        ids.append(sid); rows.append(' '.join(map(str, best_seq))); n+=1
        if (n%20)==0 or n==95: print(f'Decoded {n}/95 elapsed={time.time()-t0:.1f}s', flush=True)
    sub = pd.DataFrame({'Id': ids, 'Sequence': rows}).sort_values('Id')
    sub.to_csv(out_csv, index=False); print('Wrote', out_csv, 'rows=', len(sub))
    assert len(sub)==95, f'Expected 95 rows, got {len(sub)}'
    shutil.copyfile(out_csv, 'submission.csv')
    print(f'Staged submission: {out_csv} -> submission.csv', flush=True)

print('Decoding TEST Best-of-N for PANNs fusion...', flush=True)
decode_test_panns_bestofN_keep_len(out_csv='submission_clip_panns_keep_bestofN_minmult.csv',
                                   alpha_list=(0.28,0.30,0.32), gamma_list=(0.16,0.18,0.20,0.22),
                                   min_mult_list=(0.65,0.70), smooth_k=5)

Decoding TEST Best-of-N for PANNs fusion...


Decoded 20/95 elapsed=1.6s


Decoded 40/95 elapsed=3.5s


Decoded 60/95 elapsed=4.9s


Decoded 80/95 elapsed=6.2s


Decoded 95/95 elapsed=7.2s


Wrote submission_clip_panns_keep_bestofN_minmult.csv rows= 95
Staged submission: submission_clip_panns_keep_bestofN_minmult.csv -> submission.csv


In [63]:
# Best-of-N for PANNs fusion with smooth_k in {3,5} and pairwise-margin fill; TEST DECODE; stage
import numpy as np, pandas as pd, time, os, shutil, json

def total_emission_loglik(y: np.ndarray, p: np.ndarray) -> float:
    C,T = p.shape
    idx = np.clip(y, 0, C-1).astype(np.int32)
    cols = np.arange(T, dtype=np.int32)
    probs = np.clip(p[idx, cols], 1e-12, 1.0)
    return float(np.log(probs).sum())

def pairwise_margins(p: np.ndarray, q_power: float = 1.8):
    C,T = p.shape
    w = np.power(np.clip(p[1:21], 1e-8, 1.0), q_power).astype(np.float32)  # 20 x T
    S_after = np.cumsum(w[:, ::-1], axis=1)[:, ::-1]
    W = np.zeros((20,20), dtype=np.float64)
    for i in range(20):
        wi = w[i]
        for j in range(20):
            if i==j: continue
            W[i,j] = float((wi * S_after[j]).sum())
    M = W - W.T
    return M

def make_perm20_pairmargin(seq_raw, p: np.ndarray, q_power: float = 1.8):
    seen=set(); seq=[]
    for c in seq_raw:
        if 1<=c<=20 and c not in seen:
            seen.add(c); seq.append(c)
    if len(seq) >= 20:
        return seq[:20]
    M = pairwise_margins(p, q_power=q_power)
    missing = [c for c in range(1,21) if c not in seen]
    scores = {c: float(M[c-1].sum()) for c in missing}
    missing_sorted = sorted(missing, key=lambda c: scores[c], reverse=True)
    for c in missing_sorted:
        if len(seq)==20: break
        seq.append(c)
    if len(seq)>20: seq = seq[:20]
    return seq

def decode_test_panns_bestofN_keep_len_smooth_pair(out_csv='submission_clip_panns_keep_bestofN_smooth_pair.csv',
                                                   alpha_list=(0.28,0.30,0.32), gamma_list=(0.16,0.18,0.20,0.22),
                                                   min_mult_list=(0.65,0.70), smooth_list=(3,5)):
    folds = json.load(open('folds_archive_cv.json','r'))
    all_train_ids = sorted({int(x) for fd in folds for x in fd['train_ids']})
    med = compute_min_dur_from_ids(all_train_ids)
    test_ids = sorted(pd.read_csv('test.csv')['Id'].astype(int).tolist())
    ids=[]; rows=[]; t0=time.time(); n=0
    for sid in test_ids:
        sid=int(sid)
        ps = load_skeleton_probs(sid)
        pc = load_clip_probs_test_avg(sid)  # CLIP per-fold temp already applied
        pa = load_audio_panns_test_avg(sid)
        if pa is None:
            pa = load_probs_generic(sid, 'audio')
        best_ll = -1e99; best_seq=None
        for ac in alpha_list:
            for ga in gamma_list:
                pf_base = fuse_poe_with_clip_keep_len(ps, pc, pa, ac, ga)
                for sk in smooth_list:
                    pf = smooth_probs_box(pf_base, k=sk)
                    for mm in min_mult_list:
                        min_dur = np.floor(med*mm + 0.5).astype(np.int32); min_dur[0]=0
                        y = decode_minseg(pf, min_dur.copy()); y = aba_collapse(y, max_len=2)
                        ll = total_emission_loglik(y, pf)
                        if ll > best_ll:
                            best_ll = ll
                            seq = make_perm20_pairmargin(compress_to_sequence(y), pf, q_power=1.8)
                            best_seq = seq
        ids.append(sid); rows.append(' '.join(map(str, best_seq))); n+=1
        if (n%20)==0 or n==95: print(f'Decoded {n}/95 elapsed={time.time()-t0:.1f}s', flush=True)
    sub = pd.DataFrame({'Id': ids, 'Sequence': rows}).sort_values('Id')
    sub.to_csv(out_csv, index=False); print('Wrote', out_csv, 'rows=', len(sub))
    assert len(sub)==95, f'Expected 95 rows, got {len(sub)}'
    shutil.copyfile(out_csv, 'submission.csv')
    print(f'Staged submission: {out_csv} -> submission.csv', flush=True)

print('Decoding TEST Best-of-N (smooth {3,5} + pairwise fill) for PANNs fusion...', flush=True)
decode_test_panns_bestofN_keep_len_smooth_pair(out_csv='submission_clip_panns_keep_bestofN_smooth_pair.csv',
                                               alpha_list=(0.28,0.30,0.32), gamma_list=(0.16,0.18,0.20,0.22),
                                               min_mult_list=(0.65,0.70), smooth_list=(3,5))

Decoding TEST Best-of-N (smooth {3,5} + pairwise fill) for PANNs fusion...


Decoded 20/95 elapsed=2.5s


Decoded 40/95 elapsed=5.5s


Decoded 60/95 elapsed=7.8s


Decoded 80/95 elapsed=9.5s


Decoded 95/95 elapsed=10.9s


Wrote submission_clip_panns_keep_bestofN_smooth_pair.csv rows= 95
Staged submission: submission_clip_panns_keep_bestofN_smooth_pair.csv -> submission.csv


In [65]:
# OOF sanity: compare min_mult=0.70 vs 0.68 for PANNs fusion (alpha_clip=0.30, gamma=0.18), keep_len PoE
import numpy as np, pandas as pd, json, time

def lev_dist(a, b):
    n=len(a); m=len(b)
    if n==0: return m
    dp=list(range(m+1))
    for i in range(1,n+1):
        prev=dp[0]; dp[0]=i
        for j in range(1,m+1):
            tmp=dp[j]; cost=0 if a[i-1]==b[j-1] else 1
            dp[j]=min(dp[j]+1, dp[j-1]+1, prev+cost); prev=tmp
    return dp[m]

def oof_compare_minmult(alpha_clip=0.30, gamma_audio=0.18, smooth_k=5, min_mult_list=(0.70, 0.68)):
    folds = json.load(open('folds_archive_cv.json','r'))
    results = {mm: [] for mm in min_mult_list}
    for fd in folds:
        tr = list(map(int, fd['train_ids'])); va = list(map(int, fd['val_ids']))
        med = compute_min_dur_from_ids(tr)
        fidx = int(fd['fold'])
        Tclip = _load_temp_num(f'clip_temp_fold{fidx}.json')
        for sid in va:
            sid=int(sid)
            ps = load_skeleton_probs(sid)
            pc = load_clip_probs_train(sid)
            pa = load_audio_panns_train(sid)
            if pc is not None and Tclip is not None: pc = temp_scale_scalar(pc, Tclip)
            if pa is None:
                pa = load_probs_generic(sid, 'audio')
            pf = fuse_poe_with_clip_keep_len(ps, pc, pa, alpha_clip, gamma_audio)
            pf = smooth_probs_box(pf, k=smooth_k)
            true_seq = compress_to_sequence(np.load(LABELS/f"{sid}.npy"))
            for mm in min_mult_list:
                min_dur = np.floor(med*mm + 0.5).astype(np.int32); min_dur[0]=0
                y = decode_minseg(pf, min_dur.copy()); y = aba_collapse(y, max_len=2)
                seq = compress_to_sequence(y)
                results[mm].append(lev_dist(seq, true_seq))
    for mm in min_mult_list:
        arr = results[mm]
        print(f"min_mult={mm:.2f}: worst={max(arr):.3f}, mean={np.mean(arr):.3f}, norm_mean={np.mean(arr)/20:.5f}")
    # recommend best by worst then mean
    summary = sorted([(max(v), float(np.mean(v)), mm) for mm,v in results.items()], key=lambda x: (x[0], x[1]))
    print('Best setting (by OOF):', summary[0])
    return summary[0]

print('OOF compare min_mult 0.70 vs 0.68 (PANNs keep_len PoE)...', flush=True)
best_mm = oof_compare_minmult(alpha_clip=0.30, gamma_audio=0.18, smooth_k=5, min_mult_list=(0.70, 0.68))

OOF compare min_mult 0.70 vs 0.68 (PANNs keep_len PoE)...


min_mult=0.70: worst=20.000, mean=3.721, norm_mean=0.18603
min_mult=0.68: worst=20.000, mean=3.680, norm_mean=0.18401
Best setting (by OOF): (20, 3.68013468013468, 0.68)


In [80]:
# Decode TEST with PANNs fusion using min_mult=0.68 (OOF-better) and stage
import shutil, os, time
print('Decoding TEST with PANNs: alpha_clip=0.30, gamma=0.18, min_mult=0.68 ...', flush=True)
out_csv = 'submission_clip_panns_keep_m068.csv'
decode_test_with_panns(alpha_clip=0.30, gamma_audio=0.18, smooth_k=5, min_mult=0.68, out_csv=out_csv)
if os.path.exists(out_csv):
    shutil.copyfile(out_csv, 'submission.csv')
    print(f'Staged submission: {out_csv} -> submission.csv', flush=True)
else:
    print('ERROR: expected output file missing', flush=True)

Decoding TEST with PANNs: alpha_clip=0.30, gamma=0.18, min_mult=0.68 ...


Decoded 20/95


Decoded 40/95


Decoded 60/95


Decoded 80/95


Decoded 95/95


Wrote submission_clip_panns_keep_m068.csv


Staged submission: submission_clip_panns_keep_m068.csv -> submission.csv


Staged submission: submission_clip_panns_keep_m068.csv -> submission.csv


In [68]:
# Stage hedge: PANNs Best-of-N with smooth {3,5} + pairwise fill
import shutil, os
src = 'submission_clip_panns_keep_bestofN_smooth_pair.csv'
dst = 'submission.csv'
assert os.path.exists(src), f'Missing {src}'
shutil.copyfile(src, dst)
print(f'Staged hedge submission: {src} -> {dst}')

Staged hedge submission: submission_clip_panns_keep_bestofN_smooth_pair.csv -> submission.csv


In [70]:
# Final hedge S1: PANNs fusion keep_len PoE with smooth_k=3, alpha_clip=0.30, gamma=0.18, min_mult=0.68; stage submission
import shutil, os, time
print('Decoding TEST S1: alpha_clip=0.30, gamma=0.18, min_mult=0.68, smooth_k=3 ...', flush=True)
out_csv = 'submission_clip_panns_keep_m068_smooth3.csv'
decode_test_with_panns(alpha_clip=0.30, gamma_audio=0.18, smooth_k=3, min_mult=0.68, out_csv=out_csv)
assert os.path.exists(out_csv), f'Expected {out_csv}'
shutil.copyfile(out_csv, 'submission.csv')
print(f'Staged submission S1: {out_csv} -> submission.csv', flush=True)

Decoding TEST S1: alpha_clip=0.30, gamma=0.18, min_mult=0.68, smooth_k=3 ...


Decoded 20/95


Decoded 40/95


Decoded 60/95


Decoded 80/95


Decoded 95/95


Wrote submission_clip_panns_keep_m068_smooth3.csv


Staged submission: submission_clip_panns_keep_m068_smooth3.csv -> submission.csv


Staged submission S1: submission_clip_panns_keep_m068_smooth3.csv -> submission.csv


In [71]:
# Final hedge S2: PANNs fusion keep_len PoE with smooth_k=5, alpha_clip=0.30, gamma=0.20, min_mult=0.68; stage submission
import shutil, os, time
print('Decoding TEST S2: alpha_clip=0.30, gamma=0.20, min_mult=0.68, smooth_k=5 ...', flush=True)
out_csv = 'submission_clip_panns_keep_m068_g020.csv'
decode_test_with_panns(alpha_clip=0.30, gamma_audio=0.20, smooth_k=5, min_mult=0.68, out_csv=out_csv)
assert os.path.exists(out_csv), f'Expected {out_csv}'
shutil.copyfile(out_csv, 'submission.csv')
print(f'Staged submission S2: {out_csv} -> submission.csv', flush=True)

Decoding TEST S2: alpha_clip=0.30, gamma=0.20, min_mult=0.68, smooth_k=5 ...


Decoded 20/95


Decoded 40/95


Decoded 60/95


Decoded 80/95


Decoded 95/95


Wrote submission_clip_panns_keep_m068_g020.csv


Staged submission: submission_clip_panns_keep_m068_g020.csv -> submission.csv


Staged submission S2: submission_clip_panns_keep_m068_g020.csv -> submission.csv


In [72]:
# Final hedge S3: PANNs fusion keep_len PoE with smooth_k=5, alpha_clip=0.30, gamma=0.18, min_mult=0.70; stage submission
import shutil, os, time
print('Decoding TEST S3: alpha_clip=0.30, gamma=0.18, min_mult=0.70, smooth_k=5 ...', flush=True)
out_csv = 'submission_clip_panns_keep_m070.csv'
decode_test_with_panns(alpha_clip=0.30, gamma_audio=0.18, smooth_k=5, min_mult=0.70, out_csv=out_csv)
assert os.path.exists(out_csv), f'Expected {out_csv}'
shutil.copyfile(out_csv, 'submission.csv')
print(f'Staged submission S3: {out_csv} -> submission.csv', flush=True)

Decoding TEST S3: alpha_clip=0.30, gamma=0.18, min_mult=0.70, smooth_k=5 ...


Decoded 20/95


Decoded 40/95


Decoded 60/95


Decoded 80/95


Decoded 95/95


Wrote submission_clip_panns_keep_m070.csv


Staged submission: submission_clip_panns_keep_m070.csv -> submission.csv


Staged submission S3: submission_clip_panns_keep_m070.csv -> submission.csv


In [73]:
# Final hedge S4: keep_len PoE with alpha_clip=0.32, gamma=0.18, min_mult=0.68, smooth_k=5; stage submission
import shutil, os, time
print('Decoding TEST S4: alpha_clip=0.32, gamma=0.18, min_mult=0.68, smooth_k=5 ...', flush=True)
out_csv = 'submission_clip_panns_keep_m068_a032_g018.csv'
decode_test_with_panns(alpha_clip=0.32, gamma_audio=0.18, smooth_k=5, min_mult=0.68, out_csv=out_csv)
assert os.path.exists(out_csv), f'Expected {out_csv}'
shutil.copyfile(out_csv, 'submission.csv')
print(f'Staged submission S4: {out_csv} -> submission.csv', flush=True)

Decoding TEST S4: alpha_clip=0.32, gamma=0.18, min_mult=0.68, smooth_k=5 ...


Decoded 20/95


Decoded 40/95


Decoded 60/95


Decoded 80/95


Decoded 95/95


Wrote submission_clip_panns_keep_m068_a032_g018.csv


Staged submission: submission_clip_panns_keep_m068_a032_g018.csv -> submission.csv


Staged submission S4: submission_clip_panns_keep_m068_a032_g018.csv -> submission.csv


In [74]:
# Final hedge S5: keep_len PoE with alpha_clip=0.30, gamma=0.22, min_mult=0.68, smooth_k=5; stage submission
import shutil, os, time
print('Decoding TEST S5: alpha_clip=0.30, gamma=0.22, min_mult=0.68, smooth_k=5 ...', flush=True)
out_csv = 'submission_clip_panns_keep_m068_g022.csv'
decode_test_with_panns(alpha_clip=0.30, gamma_audio=0.22, smooth_k=5, min_mult=0.68, out_csv=out_csv)
assert os.path.exists(out_csv), f'Expected {out_csv}'
shutil.copyfile(out_csv, 'submission.csv')
print(f'Staged submission S5: {out_csv} -> submission.csv', flush=True)

Decoding TEST S5: alpha_clip=0.30, gamma=0.22, min_mult=0.68, smooth_k=5 ...


Decoded 20/95


Decoded 40/95


Decoded 60/95


Decoded 80/95


Decoded 95/95


Wrote submission_clip_panns_keep_m068_g022.csv


Staged submission: submission_clip_panns_keep_m068_g022.csv -> submission.csv


Staged submission S5: submission_clip_panns_keep_m068_g022.csv -> submission.csv


In [75]:
# Final hedge S6: keep_len PoE with alpha_clip=0.28, gamma=0.18, min_mult=0.68, smooth_k=5; stage submission
import shutil, os, time
print('Decoding TEST S6: alpha_clip=0.28, gamma=0.18, min_mult=0.68, smooth_k=5 ...', flush=True)
out_csv = 'submission_clip_panns_keep_m068_a028_g018.csv'
decode_test_with_panns(alpha_clip=0.28, gamma_audio=0.18, smooth_k=5, min_mult=0.68, out_csv=out_csv)
assert os.path.exists(out_csv), f'Expected {out_csv}'
shutil.copyfile(out_csv, 'submission.csv')
print(f'Staged submission S6: {out_csv} -> submission.csv', flush=True)

Decoding TEST S6: alpha_clip=0.28, gamma=0.18, min_mult=0.68, smooth_k=5 ...


Decoded 20/95


Decoded 40/95


Decoded 60/95


Decoded 80/95


Decoded 95/95


Wrote submission_clip_panns_keep_m068_a028_g018.csv


Staged submission: submission_clip_panns_keep_m068_a028_g018.csv -> submission.csv


Staged submission S6: submission_clip_panns_keep_m068_a028_g018.csv -> submission.csv


In [78]:
# Final hedge S7: keep_len PoE with alpha_clip=0.30, gamma=0.18, min_mult=0.68, smooth_k=5, ABA max_len=3; stage submission
import numpy as np, pandas as pd, json, time, shutil, os

def decode_test_with_panns_aba(alpha_clip=0.30, gamma_audio=0.18, smooth_k=5, min_mult=0.68, aba_len=3, out_csv='submission_clip_panns_keep_m068_aba3.csv'):
    folds = json.load(open('folds_archive_cv.json','r'))
    all_train_ids = sorted({int(x) for fd in folds for x in fd['train_ids']})
    med = compute_min_dur_from_ids(all_train_ids); min_dur = np.floor(med*min_mult+0.5).astype(np.int32); min_dur[0]=0
    test_ids = sorted(pd.read_csv('test.csv')['Id'].astype(int).tolist())
    ids,rows=[],[]; t0=time.time(); n=0
    for sid in test_ids:
        sid=int(sid)
        ps = load_skeleton_probs(sid)
        pc = load_clip_probs_test_avg(sid)  # already temp-averaged
        pa = load_audio_panns_test_avg(sid)
        if pa is None:
            pa = load_probs_generic(sid, 'audio')
        pf = fuse_poe_with_clip_keep_len(ps, pc, pa, alpha_clip, gamma_audio)
        pf = smooth_probs_box(pf, k=smooth_k)
        y = decode_minseg(pf, min_dur.copy()); y = aba_collapse(y, max_len=aba_len)
        seq = make_perm20(compress_to_sequence(y), pf)
        ids.append(sid); rows.append(' '.join(map(str, seq))); n+=1
        if (n%20)==0 or n==95: print(f'Decoded {n}/95', flush=True)
    sub = pd.DataFrame({'Id': ids, 'Sequence': rows}).sort_values('Id')
    sub.to_csv(out_csv, index=False); print('Wrote', out_csv, flush=True)
    assert len(sub)==95, f'Expected 95 rows, got {len(sub)}'
    shutil.copyfile(out_csv, 'submission.csv')
    print(f'Staged submission: {out_csv} -> submission.csv', flush=True)

print('Decoding TEST S7: alpha_clip=0.30, gamma=0.18, min_mult=0.68, smooth_k=5, ABA=3 ...', flush=True)
decode_test_with_panns_aba(alpha_clip=0.30, gamma_audio=0.18, smooth_k=5, min_mult=0.68, aba_len=3, out_csv='submission_clip_panns_keep_m068_aba3.csv')

Decoding TEST S7: alpha_clip=0.30, gamma=0.18, min_mult=0.68, smooth_k=5, ABA=3 ...


Decoded 20/95


Decoded 40/95


Decoded 60/95


Decoded 80/95


Decoded 95/95


Wrote submission_clip_panns_keep_m068_aba3.csv


Staged submission: submission_clip_panns_keep_m068_aba3.csv -> submission.csv


In [79]:
# Final hedge S8: keep_len PoE with alpha_clip=0.30, gamma=0.19, min_mult=0.68, smooth_k=5 (ABA=2 default); stage submission
import shutil, os, time
print('Decoding TEST S8: alpha_clip=0.30, gamma=0.19, min_mult=0.68, smooth_k=5 ...', flush=True)
out_csv = 'submission_clip_panns_keep_m068_g019.csv'
decode_test_with_panns(alpha_clip=0.30, gamma_audio=0.19, smooth_k=5, min_mult=0.68, out_csv=out_csv)
assert os.path.exists(out_csv), f'Expected {out_csv}'
shutil.copyfile(out_csv, 'submission.csv')
print(f'Staged submission S8: {out_csv} -> submission.csv', flush=True)

Decoding TEST S8: alpha_clip=0.30, gamma=0.19, min_mult=0.68, smooth_k=5 ...


Decoded 20/95


Decoded 40/95


Decoded 60/95


Decoded 80/95


Decoded 95/95


Wrote submission_clip_panns_keep_m068_g019.csv


Staged submission: submission_clip_panns_keep_m068_g019.csv -> submission.csv


Staged submission S8: submission_clip_panns_keep_m068_g019.csv -> submission.csv


In [81]:
# Quick integrity check: submission.csv has 95 rows and each Sequence is a permutation of 1..20
import pandas as pd, numpy as np, sys
sub = pd.read_csv('submission.csv')
print('Rows:', len(sub))
assert len(sub)==95, f'Expected 95 rows, got {len(sub)}'
bad=[]
for i,(sid,seq_str) in enumerate(zip(sub['Id'], sub['Sequence'])):
    seq = list(map(int, str(seq_str).split()))
    if len(seq)!=20 or set(seq)!=set(range(1,21)) or len(set(seq))!=20:
        bad.append((sid, seq[:5], len(seq), len(set(seq))))
print('Invalid rows:', len(bad))
if bad:
    print('Examples (Id, first5, len, uniq):', bad[:5])
else:
    print('All sequences valid permutations of 1..20.')
# Additional sanity: no NaNs/inf in file
assert not sub.isna().any().any(), 'NaNs found in submission.csv'
print('Integrity checks passed.')

Rows: 95
Invalid rows: 0
All sequences valid permutations of 1..20.
Integrity checks passed.


In [82]:
# Patch: canonical skeleton loader with temps + classic-audio fallback temp; final decodes m068_g018 and m068_g019, stage g018
import numpy as np, pandas as pd, json, time, os, shutil
from pathlib import Path

print('Applying PATCH: canonical skeleton temps + classic-audio fallback temp', flush=True)
probs_cache = Path('probs_cache')
calib = json.load(open('calib_all_v2v3_meta.json','r'))
T2 = np.array(calib['T2'], dtype=np.float32)
T3 = np.array(calib['T3'], dtype=np.float32)
A  = np.array(calib.get('A', [0.7]*len(T2)), dtype=np.float32)

def temp_scale(p, T):
    T = np.asarray(T, dtype=np.float32).reshape(-1)
    p = np.clip(p, 1e-8, 1.0)
    logp = np.log(p)
    if p.shape[0] == T.shape[0]:
        logp = logp / np.maximum(T[:, None], 1e-6)
        q = np.exp(logp); q /= (q.sum(axis=0, keepdims=True) + 1e-8)
        return q.astype(np.float32)
    elif p.shape[-1] == T.shape[0]:
        logp = logp / np.maximum(T[None, :], 1e-6)
        q = np.exp(logp); q /= (q.sum(axis=1, keepdims=True) + 1e-8)
        return q.T.astype(np.float32)
    else:
        raise ValueError('T length mismatch')

def ensure_CxT(p, C=21):
    if p is None: return None
    if p.ndim==2 and p.shape[0]==C: return p
    if p.ndim==2 and p.shape[1]==C: return p.T
    raise ValueError('Bad probs shape')

def load_skeleton_probs(seq_id: int) -> np.ndarray:
    p2 = np.load(probs_cache/f"{seq_id}_ce.npy").astype(np.float32)
    p3 = np.load(probs_cache/f"{seq_id}_ce_v3.npy").astype(np.float32)
    p2 = ensure_CxT(temp_scale(p2, T2))
    p3 = ensure_CxT(temp_scale(p3, T3))
    Tm = min(p2.shape[1], p3.shape[1])
    p2 = p2[:, :Tm]; p3 = p3[:, :Tm]
    a = A.reshape(-1,1).astype(np.float32)
    p = a*p2 + (1.0-a)*p3
    p /= (p.sum(axis=0, keepdims=True) + 1e-8)
    return p.astype(np.float32)

def _load_temp_num(path):
    try:
        with open(path,'r') as f:
            obj = json.load(f)
        if isinstance(obj, dict) and 'T' in obj: return float(obj['T'])
        return float(obj)
    except Exception:
        try:
            return float(open(path).read().strip())
        except Exception:
            return None

def temp_scale_scalar(p_arr: np.ndarray | None, Tnum: float | None):
    if p_arr is None or Tnum is None: return p_arr
    p = np.clip(p_arr, 1e-8, 1.0).astype(np.float32)
    logp = np.log(p) / max(float(Tnum), 1e-6)
    q = np.exp(logp); q /= (q.sum(axis=0, keepdims=True)+1e-8)
    return q.astype(np.float32)

def decode_test_with_panns_final(alpha_clip=0.30, gamma_audio=0.18, smooth_k=5, min_mult=0.68, out_csv='submission_final.csv'):
    folds = json.load(open('folds_archive_cv.json','r'))
    all_train_ids = sorted({int(x) for fd in folds for x in fd['train_ids']})
    med = compute_min_dur_from_ids(all_train_ids)
    min_dur = np.floor(med*min_mult + 0.5).astype(np.int32); min_dur[0]=0
    # mean classic-audio temp (fallback only, PANNs already temped per-fold and averaged)
    Ta_vals = [_load_temp_num(f'audio_temp_fold{f}.json') for f in (0,1,2)]
    Ta_vals = [t for t in Ta_vals if t is not None]
    Ta_mean = float(np.mean(Ta_vals)) if Ta_vals else None
    test_ids = sorted(pd.read_csv('test.csv')['Id'].astype(int).tolist())
    ids, rows = [], []
    t0=time.time(); n=0
    for sid in test_ids:
        sid=int(sid)
        ps = load_skeleton_probs(sid)
        pc = load_clip_probs_test_avg(sid)  # CLIP per-fold temps already applied upstream
        pa = load_audio_panns_test_avg(sid) # PANNs per-fold temps already applied upstream
        if pa is None:
            pa = load_probs_generic(sid, 'audio')
            if pa is not None and Ta_mean is not None:
                pa = temp_scale_scalar(ensure_CxT(pa), Ta_mean)
        pf = fuse_poe_with_clip_keep_len(ps, pc, pa, alpha_clip, gamma_audio)
        pf = smooth_probs_box(pf, k=smooth_k)
        y = decode_minseg(pf, min_dur.copy()); y = aba_collapse(y, max_len=2)
        seq = make_perm20(compress_to_sequence(y), pf)
        ids.append(sid); rows.append(' '.join(map(str, seq))); n+=1
        if (n%20)==0 or n==95: print(f'Decoded {n}/95 elapsed={time.time()-t0:.1f}s', flush=True)
    sub = pd.DataFrame({'Id': ids, 'Sequence': rows}).sort_values('Id')
    sub.to_csv(out_csv, index=False); print('Wrote', out_csv, 'rows=', len(sub))
    assert len(sub)==95, f'Expected 95 rows, got {len(sub)}'
    return out_csv

# Run two finals and stage g018 per expert advice
print('Decoding FINAL m068_g018 ...', flush=True)
csv018 = decode_test_with_panns_final(alpha_clip=0.30, gamma_audio=0.18, smooth_k=5, min_mult=0.68, out_csv='submission_final_m068_g018.csv')
print('Decoding FINAL m068_g019 ...', flush=True)
csv019 = decode_test_with_panns_final(alpha_clip=0.30, gamma_audio=0.19, smooth_k=5, min_mult=0.68, out_csv='submission_final_m068_g019.csv')
# Stage OOF-best first (g018) for submission.csv
shutil.copyfile(csv018, 'submission.csv')
print(f'Staged submission: {csv018} -> submission.csv', flush=True)

Applying PATCH: canonical skeleton temps + classic-audio fallback temp


Decoding FINAL m068_g018 ...


Decoded 20/95 elapsed=0.2s


Decoded 40/95 elapsed=0.3s


Decoded 60/95 elapsed=0.5s


Decoded 80/95 elapsed=0.6s


Decoded 95/95 elapsed=0.7s


Wrote submission_final_m068_g018.csv rows= 95
Decoding FINAL m068_g019 ...


Decoded 20/95 elapsed=0.2s


Decoded 40/95 elapsed=0.3s


Decoded 60/95 elapsed=0.5s


Decoded 80/95 elapsed=0.6s


Decoded 95/95 elapsed=0.7s


Wrote submission_final_m068_g019.csv rows= 95
Staged submission: submission_final_m068_g018.csv -> submission.csv


In [83]:
# Stage parity-fixed hedge: gamma=0.19
import shutil, os, pandas as pd
src = 'submission_final_m068_g019.csv'
dst = 'submission.csv'
assert os.path.exists(src), f'Missing {src}'
sub = pd.read_csv(src)
assert len(sub)==95, f'Expected 95 rows, got {len(sub)}'
shutil.copyfile(src, dst)
print(f'Staged: {src} -> {dst}')

Staged: submission_final_m068_g019.csv -> submission.csv


In [84]:
# Final hedge (parity-fixed): gamma=0.22 with m=0.68, smooth_k=5; stage submission
import shutil, os, pandas as pd
print('Decoding FINAL m068_g022 ...', flush=True)
out_csv = 'submission_final_m068_g022.csv'
csv022 = decode_test_with_panns_final(alpha_clip=0.30, gamma_audio=0.22, smooth_k=5, min_mult=0.68, out_csv=out_csv)
assert os.path.exists(csv022), f'Missing {csv022}'
sub = pd.read_csv(csv022); assert len(sub)==95, f'Expected 95 rows, got {len(sub)}'
shutil.copyfile(csv022, 'submission.csv')
print(f'Staged submission: {csv022} -> submission.csv', flush=True)

Decoding FINAL m068_g022 ...


Decoded 20/95 elapsed=0.2s


Decoded 40/95 elapsed=0.3s


Decoded 60/95 elapsed=0.5s


Decoded 80/95 elapsed=0.6s


Decoded 95/95 elapsed=0.7s


Wrote submission_final_m068_g022.csv rows= 95
Staged submission: submission_final_m068_g022.csv -> submission.csv


In [87]:
# Modality isolation hedge: Skeleton + PANNs only (alpha_clip=0.0), gamma=0.22; stage submission
import shutil, os, pandas as pd
print('Decoding FINAL skeleton+PANNs only m068_g022 ...', flush=True)
out_csv = 'submission_final_sp_only_m068_g022.csv'
csv_sp = decode_test_with_panns_final(alpha_clip=0.0, gamma_audio=0.22, smooth_k=5, min_mult=0.68, out_csv=out_csv)
assert os.path.exists(csv_sp), f'Missing {csv_sp}'
sub = pd.read_csv(csv_sp); assert len(sub)==95, f'Expected 95 rows, got {len(sub)}'
shutil.copyfile(csv_sp, 'submission.csv')
print(f'Staged submission: {csv_sp} -> submission.csv', flush=True)

Decoding FINAL skeleton+PANNs only m068_g022 ...


Decoded 20/95 elapsed=0.2s


Decoded 40/95 elapsed=0.3s


Decoded 60/95 elapsed=0.5s


Decoded 80/95 elapsed=0.6s


Decoded 95/95 elapsed=0.7s


Wrote submission_final_sp_only_m068_g022.csv rows= 95
Staged submission: submission_final_sp_only_m068_g022.csv -> submission.csv


In [92]:
# Modality isolation hedge: Skeleton + CLIP only (gamma_audio=0.0), alpha_clip=0.30; stage submission
import shutil, os, pandas as pd
print('Decoding FINAL skeleton+CLIP only m068_a030 ...', flush=True)
out_csv = 'submission_final_sc_only_m068_a030.csv'
csv_sc = decode_test_with_panns_final(alpha_clip=0.30, gamma_audio=0.0, smooth_k=5, min_mult=0.68, out_csv=out_csv)
assert os.path.exists(csv_sc), f'Missing {csv_sc}'
sub = pd.read_csv(csv_sc); assert len(sub)==95, f'Expected 95 rows, got {len(sub)}'
shutil.copyfile(csv_sc, 'submission.csv')
print(f'Staged submission: {csv_sc} -> submission.csv', flush=True)

Decoding FINAL skeleton+CLIP only m068_a030 ...


Decoded 20/95 elapsed=0.2s


Decoded 40/95 elapsed=0.3s


Decoded 60/95 elapsed=0.5s


Decoded 80/95 elapsed=0.6s


Decoded 95/95 elapsed=0.7s


Wrote submission_final_sc_only_m068_a030.csv rows= 95
Staged submission: submission_final_sc_only_m068_a030.csv -> submission.csv


In [91]:
# Parity-fixed hedge: min_mult=0.70 with gamma=0.18; stage submission
import shutil, os, pandas as pd
print('Decoding FINAL m070_g018 ...', flush=True)
out_csv = 'submission_final_m070_g018.csv'
csv070 = decode_test_with_panns_final(alpha_clip=0.30, gamma_audio=0.18, smooth_k=5, min_mult=0.70, out_csv=out_csv)
assert os.path.exists(csv070), f'Missing {csv070}'
sub = pd.read_csv(csv070); assert len(sub)==95, f'Expected 95 rows, got {len(sub)}'
shutil.copyfile(csv070, 'submission.csv')
print(f'Staged submission: {csv070} -> submission.csv', flush=True)

Decoding FINAL m070_g018 ...


Decoded 20/95 elapsed=0.2s


Decoded 40/95 elapsed=0.3s


Decoded 60/95 elapsed=0.5s


Decoded 80/95 elapsed=0.6s


Decoded 95/95 elapsed=0.7s


Wrote submission_final_m070_g018.csv rows= 95
Staged submission: submission_final_m070_g018.csv -> submission.csv


In [90]:
# Quick final decode per expert: gamma=0.25 at min_mult=0.68 (keep_len PoE), smooth_k=5; stage submission
import os, shutil, pandas as pd
print('Decoding FINAL m068_g025 ...', flush=True)
out_csv = 'submission_final_m068_g025.csv'
csv025 = decode_test_with_panns_final(alpha_clip=0.30, gamma_audio=0.25, smooth_k=5, min_mult=0.68, out_csv=out_csv)
assert os.path.exists(csv025), f'Missing {csv025}'
sub = pd.read_csv(csv025); assert len(sub)==95, f'Expected 95 rows, got {len(sub)}'
shutil.copyfile(csv025, 'submission.csv')
print(f'Staged submission: {csv025} -> submission.csv', flush=True)

Decoding FINAL m068_g025 ...


Decoded 20/95 elapsed=0.2s


Decoded 40/95 elapsed=0.3s


Decoded 60/95 elapsed=0.5s


Decoded 80/95 elapsed=0.6s


Decoded 95/95 elapsed=0.7s


Wrote submission_final_m068_g025.csv rows= 95
Staged submission: submission_final_m068_g025.csv -> submission.csv


In [93]:
# Best-of-2 per-sequence selection: Full (S+CLIP+PANNs) vs Audio-only (S+PANNs), keep_len PoE, parity-fixed
import numpy as np, pandas as pd, json, time, os, shutil
from pathlib import Path

def _load_temp_num(path: str):
    p = Path(path)
    if not p.exists(): return None
    try:
        return float(json.load(open(path,'r')).get('T', 1.0))
    except Exception:
        try: return float(open(path).read().strip())
        except Exception: return None

def total_emission_loglik(y: np.ndarray, p: np.ndarray) -> float:
    C,T = p.shape
    idx = np.clip(y, 0, C-1).astype(np.int32)
    cols = np.arange(T, dtype=np.int32)
    probs = np.clip(p[idx, cols], 1e-12, 1.0)
    return float(np.log(probs).sum())

def decode_test_bestof2_keep_len(out_csv='submission_bestof2_full_vs_audio.csv',
                                 a_full=0.30, g_full=0.22,
                                 a_audio=0.0, g_audio=0.22,
                                 min_mult=0.68, smooth_k=5):
    folds = json.load(open('folds_archive_cv.json','r'))
    all_train_ids = sorted({int(x) for fd in folds for x in fd['train_ids']})
    med = compute_min_dur_from_ids(all_train_ids)
    min_dur = np.floor(med*min_mult + 0.5).astype(np.int32); min_dur[0]=0
    # classic-audio temp mean as fallback if PANNs missing
    Ta_vals = [_load_temp_num(f'audio_temp_fold{f}.json') for f in (0,1,2)]
    Ta_vals = [t for t in Ta_vals if t is not None]
    Ta_mean = float(np.mean(Ta_vals)) if Ta_vals else None
    test_ids = sorted(pd.read_csv('test.csv')['Id'].astype(int).tolist())
    ids,rows=[],[]; t0=time.time(); n=0
    for sid in test_ids:
        sid=int(sid)
        ps = load_skeleton_probs(sid)
        pc = load_clip_probs_test_avg(sid)  # CLIP per-fold temps already applied upstream
        pa = load_audio_panns_test_avg(sid) # PANNs per-fold temps already applied upstream
        if pa is None:
            pa = load_probs_generic(sid, 'audio')
            if pa is not None and Ta_mean is not None:
                pa = temp_scale_scalar(ensure_CxT(pa), Ta_mean)
        # Candidate A: Full (S+CLIP+Audio)
        pf_full = fuse_poe_with_clip_keep_len(ps, pc, pa, a_full, g_full)
        pf_full = smooth_probs_box(pf_full, k=smooth_k)
        y_full = decode_minseg(pf_full, min_dur.copy()); y_full = aba_collapse(y_full, max_len=2)
        ll_full = total_emission_loglik(y_full, pf_full)
        seq_full = make_perm20(compress_to_sequence(y_full), pf_full)
        # Candidate B: Audio-only (S+Audio)
        pf_aud = fuse_poe_with_clip_keep_len(ps, None, pa, a_audio, g_audio)
        pf_aud = smooth_probs_box(pf_aud, k=smooth_k)
        y_aud = decode_minseg(pf_aud, min_dur.copy()); y_aud = aba_collapse(y_aud, max_len=2)
        ll_aud = total_emission_loglik(y_aud, pf_aud)
        seq_aud = make_perm20(compress_to_sequence(y_aud), pf_aud)
        # Pick higher LL
        if ll_full >= ll_aud:
            seq = seq_full
        else:
            seq = seq_aud
        ids.append(sid); rows.append(' '.join(map(str, seq))); n+=1
        if (n%20)==0 or n==95: print(f'Decoded {n}/95 elapsed={time.time()-t0:.1f}s', flush=True)
    sub = pd.DataFrame({'Id': ids, 'Sequence': rows}).sort_values('Id')
    sub.to_csv(out_csv, index=False); print('Wrote', out_csv, 'rows=', len(sub))
    assert len(sub)==95, f'Expected 95 rows, got {len(sub)}'
    shutil.copyfile(out_csv, 'submission.csv')
    print(f'Staged submission: {out_csv} -> submission.csv', flush=True)

print('Decoding TEST: Best-of-2 selection (Full vs Audio-only)...', flush=True)
decode_test_bestof2_keep_len(out_csv='submission_bestof2_full_vs_audio.csv',
                             a_full=0.30, g_full=0.22, a_audio=0.0, g_audio=0.22,
                             min_mult=0.68, smooth_k=5)

Decoding TEST: Best-of-2 selection (Full vs Audio-only)...


Decoded 20/95 elapsed=0.3s


Decoded 40/95 elapsed=0.5s


Decoded 60/95 elapsed=0.8s


Decoded 80/95 elapsed=1.0s


Decoded 95/95 elapsed=1.1s


Wrote submission_bestof2_full_vs_audio.csv rows= 95
Staged submission: submission_bestof2_full_vs_audio.csv -> submission.csv


In [94]:
# Quick hedge: decode FINAL with gamma=0.26 at min_mult=0.68 (keep_len PoE), smooth_k=5; stage submission
import os, shutil, pandas as pd
print('Decoding FINAL m068_g026 ...', flush=True)
out_csv = 'submission_final_m068_g026.csv'
csv026 = decode_test_with_panns_final(alpha_clip=0.30, gamma_audio=0.26, smooth_k=5, min_mult=0.68, out_csv=out_csv)
assert os.path.exists(csv026), f'Missing {csv026}'
sub = pd.read_csv(csv026); assert len(sub)==95, f'Expected 95 rows, got {len(sub)}'
shutil.copyfile(csv026, 'submission.csv')
print(f'Staged submission: {csv026} -> submission.csv', flush=True)

Decoding FINAL m068_g026 ...


Decoded 20/95 elapsed=0.2s


Decoded 40/95 elapsed=0.3s


Decoded 60/95 elapsed=0.5s


Decoded 80/95 elapsed=0.6s


Decoded 95/95 elapsed=0.7s


Wrote submission_final_m068_g026.csv rows= 95
Staged submission: submission_final_m068_g026.csv -> submission.csv


In [95]:
# Quick hedge per expert Option A: gamma=0.26, min_mult=0.70 (keep_len PoE), smooth_k=5; stage submission
import os, shutil, pandas as pd
print('Decoding FINAL m070_g026 ...', flush=True)
out_csv = 'submission_final_m070_g026.csv'
csv = decode_test_with_panns_final(alpha_clip=0.30, gamma_audio=0.26, smooth_k=5, min_mult=0.70, out_csv=out_csv)
assert os.path.exists(csv), f'Missing {csv}'
sub = pd.read_csv(csv); assert len(sub)==95, f'Expected 95 rows, got {len(sub)}'
shutil.copyfile(csv, 'submission.csv')
print(f'Staged submission: {csv} -> submission.csv', flush=True)

Decoding FINAL m070_g026 ...


Decoded 20/95 elapsed=0.2s


Decoded 40/95 elapsed=0.3s


Decoded 60/95 elapsed=0.5s


Decoded 80/95 elapsed=0.6s


Decoded 95/95 elapsed=0.7s


Wrote submission_final_m070_g026.csv rows= 95
Staged submission: submission_final_m070_g026.csv -> submission.csv


In [97]:
# Stage precomputed hedge: MinSeg majority vote over m={0.60,0.70,0.80}
import shutil, os
src = 'submission_minseg_mv_m060_070_080.csv'
dst = 'submission.csv'
if os.path.exists(src):
    shutil.copyfile(src, dst)
    print(f'Staged: {src} -> {dst}')
else:
    print(f'Missing {src}; no staging performed.')

Staged: submission_minseg_mv_m060_070_080.csv -> submission.csv


In [98]:
# Quick hedge: decode FINAL with gamma=0.24 at min_mult=0.68 (keep_len PoE), smooth_k=5; stage submission
import os, shutil, pandas as pd
print('Decoding FINAL m068_g024 ...', flush=True)
out_csv = 'submission_final_m068_g024.csv'
csv024 = decode_test_with_panns_final(alpha_clip=0.30, gamma_audio=0.24, smooth_k=5, min_mult=0.68, out_csv=out_csv)
assert os.path.exists(csv024), f'Missing {csv024}'
sub = pd.read_csv(csv024); assert len(sub)==95, f'Expected 95 rows, got {len(sub)}'
shutil.copyfile(csv024, 'submission.csv')
print(f'Staged submission: {csv024} -> submission.csv', flush=True)

Decoding FINAL m068_g024 ...


Decoded 20/95 elapsed=0.2s


Decoded 40/95 elapsed=0.3s


Decoded 60/95 elapsed=0.5s


Decoded 80/95 elapsed=0.6s


Decoded 95/95 elapsed=0.7s


Wrote submission_final_m068_g024.csv rows= 95
Staged submission: submission_final_m068_g024.csv -> submission.csv
