
# C3A: Multi-Crop Embedding TTA (K=5) — Curated Only Stage

This staged notebook implements K=5 deterministic multi-crop TTA at the embedding level using PANNs CNN14 for curated data only, computes OOF LWLRAP, and caches artifacts.
- Crops per clip (target T=10s @ 32kHz): begin, center, end, 25% offset, 75% offset. Clips no longer than T yield five identical crops (zero-padded to T when shorter).
- Aggregation: mean across the 5 embeddings (no pre-normalization).
- CV: fixed 5-fold multilabel stratified K-fold (MLSKF) assignment read from the `fold` column of train_curated_folds.csv (seed=42).
- Outputs: embeddings_curated_mc5.npy, metadata_c3a.json; OOF predictions (oof_tta.npy); OOF and per-fold LWLRAP (metrics_c3a_curated.json).
- Test MC5 extraction and submission are deferred to a separate follow-up stage after OOF verification.


In [1]:

import sys, subprocess, os, json, time, warnings
from pathlib import Path
warnings.filterwarnings('ignore')
def pip_install(pkg):
    """Install ``pkg`` with pip using the interpreter that runs this notebook.

    The outcome is reported with a printed message. Any exception raised while
    installing is caught and printed instead of being propagated, so a failed
    install does not abort the cell.

    Args:
        pkg: Requirement string handed to ``pip install``.
    """
    command = [sys.executable, '-m', 'pip', 'install', '--quiet', pkg]
    try:
        subprocess.check_call(command)
        print(f'Installed: {pkg}')
    except Exception as err:
        print(f'Failed to install {pkg}: {err}')

# pip distribution name -> importable module name, for the packages where the two
# differ. Deriving the module name by swapping '-' for '_' fails for these, which
# made the availability probe miss and triggered a reinstall on every run.
_IMPORT_NAMES = {
    'iterative-stratification': 'iterstrat',
    'scikit-learn': 'sklearn',
}

for pkg in ['soundfile', 'librosa', 'iterative-stratification', 'joblib', 'scikit-learn', 'panns-inference']:
    dist_name = pkg.split('==')[0]  # drop an optional version pin
    module_name = _IMPORT_NAMES.get(dist_name, dist_name.replace('-', '_'))
    try:
        __import__(module_name)
        print(f"{dist_name} already available")
    except Exception:
        # Import can fail for reasons other than ImportError, so any failure
        # falls back to installing the package.
        pip_install(pkg)

import numpy as np
import pandas as pd
import soundfile as sf
import librosa
import torch
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import joblib
# Root directory for inputs/outputs and the global NumPy seed for this notebook.
BASE = Path('.')
np.random.seed(42)
# Report the versions of the key libraries; 'n/a' when a module has no __version__.
print(
    'Versions:',
    {k: getattr(__import__(k), '__version__', 'n/a') for k in ['librosa', 'torch', 'sklearn']},
)


soundfile already available
librosa already available


Installed: iterative-stratification


joblib already available


Installed: scikit-learn


panns-inference already available


Versions: {'librosa': '0.11.0', 'torch': '2.4.1+cu121', 'sklearn': '1.5.2'}


In [2]:

from pathlib import Path
import urllib.request
# Directory holding the PANNs assets (label table and CNN14 checkpoint).
assets_dir = Path('/app/panns_data')
assets_dir.mkdir(parents=True, exist_ok=True)
labels_csv = assets_dir / 'class_labels_indices.csv'
if labels_csv.exists():
    print('Labels CSV exists at', labels_csv)
else:
    import csv
    # Write a placeholder label table: a header plus 527 synthetic rows.
    # NOTE(review): presumably panns_inference expects this file to exist — confirm.
    with open(labels_csv, 'w', newline='') as f:
        w = csv.writer(f)
        w.writerow(['index', 'mid', 'display_name'])
        for i in range(527):
            w.writerow([i, f'/m/{i}', f'class_{i:03d}'])
    print('Created labels stub at', labels_csv)

# Fetch the CNN14 checkpoint unless a non-empty copy is already present.
ckpt_path = assets_dir / 'Cnn14_mAP=0.431.pth'
url = 'https://zenodo.org/record/3987831/files/Cnn14_mAP=0.431.pth?download=1'
if not ckpt_path.exists() or ckpt_path.stat().st_size == 0:
    print('Downloading CNN14 weights ...')
    # Stream into a sibling '.part' file and rename only after the transfer has
    # finished. Writing straight to ckpt_path would leave a non-empty partial
    # file after an interrupted download, which the check above would then
    # accept as a valid checkpoint on the next run.
    tmp_path = ckpt_path.with_name(ckpt_path.name + '.part')
    try:
        with urllib.request.urlopen(url) as resp, open(tmp_path, 'wb') as out:
            while True:
                chunk = resp.read(1<<20)  # 1 MiB at a time
                if not chunk:
                    break
                out.write(chunk)
        tmp_path.replace(ckpt_path)
    finally:
        # Only present here if the download or the rename failed.
        if tmp_path.exists():
            tmp_path.unlink()
    print('Saved CNN14 to', ckpt_path, 'size:', ckpt_path.stat().st_size)
else:
    print('CNN14 checkpoint present:', ckpt_path, 'size:', ckpt_path.stat().st_size)


Labels CSV exists at /app/panns_data/class_labels_indices.csv
CNN14 checkpoint present: /app/panns_data/Cnn14_mAP=0.431.pth size: 327428481


In [3]:

from pathlib import Path
import numpy as np, pandas as pd
import json
# Load the fold assignment table and derive the class vocabulary from the
# sample submission header (every column except 'fname' is a class).
BASE = Path('.')
df_cur = pd.read_csv(BASE / 'train_curated_folds.csv')
df_ss  = pd.read_csv(BASE / 'sample_submission.csv')
class_names = [col for col in df_ss.columns if col != 'fname']
n_classes = len(class_names)
# Class name -> column position in the label/score matrices.
label_to_idx = dict(zip(class_names, range(n_classes)))

def parse_labels_str(s):
    """Split a label string into a validated list of label tokens.

    Both ',' and ';' are accepted as separators; surrounding whitespace is
    stripped and empty tokens are dropped.

    Args:
        s: Label string. Any non-string value yields an empty list.

    Returns:
        The list of tokens, in order of appearance.

    Raises:
        ValueError: If any token is not a key of ``label_to_idx``. The message
            shows up to five offending tokens and the total count.
    """
    if not isinstance(s, str):
        return []
    stripped = (piece.strip() for piece in s.replace(';', ',').split(','))
    toks = [tok for tok in stripped if tok]
    unknown = [tok for tok in toks if tok not in label_to_idx]
    if unknown:
        raise ValueError(f'Unknown labels: {unknown[:5]} (total {len(unknown)})')
    return toks

def encode_tokens(toks):
    """Turn label tokens into a multi-hot float32 vector of length ``n_classes``.

    Args:
        toks: Iterable of label names; each must be a key of ``label_to_idx``.

    Returns:
        A 1-D float32 array with 1.0 at the position of every given label and
        0.0 elsewhere.

    Raises:
        KeyError: If a token is missing from ``label_to_idx``.
    """
    positions = np.array([label_to_idx[tok] for tok in toks], dtype=np.intp)
    y = np.zeros(n_classes, dtype=np.float32)
    y[positions] = 1.0
    return y

def encode_labels(s):
    """Parse a label string and return its multi-hot encoding.

    Combines ``parse_labels_str`` (tokenise and validate) with
    ``encode_tokens`` (multi-hot vector).
    """
    tokens = parse_labels_str(s)
    return encode_tokens(tokens)

def lwlrap_np(truth, scores):
    """Compute label-weighted label-ranking average precision (LWLRAP).

    Args:
        truth: 2-D array (samples x labels); entries > 0 mark positive labels.
        scores: Array of the same shape holding the predicted scores.

    Returns:
        A tuple ``(lwlrap, per_class)``: the overall score as a Python float
        and a 1-D array with the mean ranking precision of each label. Labels
        with no positives get a per-class value of 0.
    """
    assert truth.shape == scores.shape
    _, n_labels = truth.shape
    class_counts = truth.sum(axis=0)
    precision_sums = np.zeros(n_labels)
    for row_truth, row_scores in zip(truth, scores):
        # Samples without any positive label contribute nothing.
        if not (row_truth > 0).any():
            continue
        order = np.argsort(-row_scores)  # label indices, best score first
        sorted_truth = row_truth[order]
        hit_ranks = np.flatnonzero(sorted_truth > 0)
        # Precision at each positive's rank: positives seen so far / rank.
        hit_precisions = np.cumsum(sorted_truth)[hit_ranks] / (hit_ranks + 1)
        # order[hit_ranks] holds distinct labels, so one vectorised add suffices.
        precision_sums[order[hit_ranks]] += hit_precisions
    # Clamp the divisor so labels with no positives do not divide by zero.
    per_class = precision_sums / np.maximum(class_counts, 1)
    weights = class_counts / max(truth.sum(), 1)
    return float((per_class * weights).sum()), per_class

# Confirm the helpers are defined and show how many clips fall in each fold.
print(
    'Loaded folds and helpers. Fold counts:',
    df_cur['fold'].value_counts().sort_index().to_dict(),
)


Loaded folds and helpers. Fold counts: {0: 999, 1: 994, 2: 992, 3: 997, 4: 988}


In [4]:

from pathlib import Path
import numpy as np, pandas as pd, time, json, hashlib
import librosa, torch
from panns_inference import AudioTagging

# Root directory for input CSVs/audio and for the cached artifacts.
BASE = Path('.')
# Sample rate (Hz) that audio is resampled to when loaded.
SR = 32000
# Length of each crop in seconds, and the same length in samples.
T_SEC = 10.0
T = int(SR * T_SEC)
# Number of crops per clip. NOTE(review): the crop helpers visible in this cell
# hard-code five start offsets and do not read K.
K = 5
# Embedding width asserted on the model output during extraction.
EMB_DIM = 2048
# CNN14 checkpoint file; extraction cannot run without it.
CKPT_PATH = Path('/app/panns_data/Cnn14_mAP=0.431.pth')
assert CKPT_PATH.exists(), 'Checkpoint missing; run Cell 3.'

def load_audio(path, sr=SR):
    """Load an audio file as a mono float32 waveform.

    Args:
        path: Path of the audio file, forwarded to ``librosa.load``.
        sr: Sample rate requested from ``librosa.load`` (defaults to ``SR``).

    Returns:
        The waveform cast to ``np.float32``; the sample rate that
        ``librosa.load`` also returns is discarded.
    """
    waveform, _ = librosa.load(path, sr=sr, mono=True)
    return waveform.astype(np.float32)

def crop_starts(L, T):
    """Return the five crop start offsets for a clip of ``L`` samples.

    Args:
        L: Clip length in samples.
        T: Crop length in samples.

    Returns:
        A list of five start indices ordered begin, center, end, 25% and 75%
        of the available slack ``L - T``. When the clip is not longer than
        the crop, all five offsets are 0.
    """
    slack = L - T
    if slack <= 0:
        return [0] * 5
    candidates = (0, slack // 2, slack, int(0.25 * slack), int(0.75 * slack))
    # Clamp each offset into [0, slack] so every crop stays inside the clip.
    return [min(max(0, start), slack) for start in candidates]

def crops_for_wave(y, T):
    """Cut a waveform into the five fixed-length crops used for TTA.

    Args:
        y: 1-D waveform array.
        T: Crop length in samples.

    Returns:
        An array with one row per offset returned by ``crop_starts`` and ``T``
        columns. A waveform shorter than ``T`` is zero-padded at the end, so
        every row is the same padded clip.
    """
    n_samples = len(y)
    offsets = crop_starts(n_samples, T)
    if n_samples < T:
        # Too short to crop: pad once and repeat it for every offset.
        padded = np.pad(y, (0, T - n_samples))
        windows = [padded for _ in offsets]
    else:
        windows = [y[start:start + T] for start in offsets]
    return np.stack(windows, 0)  # (K, T)

def sha1_of_file(path: Path, block_size=1<<20):
    """Return the hexadecimal SHA-1 digest of a file's contents.

    The file is read in chunks of ``block_size`` bytes so it never has to be
    held in memory as a whole.

    Args:
        path: File to hash.
        block_size: Number of bytes read per chunk (default 1 MiB).

    Returns:
        The digest as a hexadecimal string.
    """
    digest = hashlib.sha1()
    with open(path, 'rb') as stream:
        # iter() with a sentinel stops at the first empty read, i.e. at EOF.
        for block in iter(lambda: stream.read(block_size), b''):
            digest.update(block)
    return digest.hexdigest()

def extract_mc5_embeddings_curated(file_list, root_dir, log_every=100):
    """Extract one mean-pooled multi-crop embedding per audio file.

    Each file is loaded at ``SR``, cut into crops by ``crops_for_wave``, run
    through ``AudioTagging`` as a single batch, and the per-crop embeddings
    are averaged into one vector.

    Args:
        file_list: Sequence of file names relative to ``root_dir``.
        root_dir: Directory containing the audio files.
        log_every: Print a progress line every this many files.

    Returns:
        A float32 array of shape ``(len(file_list), EMB_DIM)``.

    Raises:
        RuntimeError: If the model output is neither a 2-tuple nor a dict
            carrying an ``'embedding'`` entry.
        AssertionError: If an embedding batch or the final matrix has an
            unexpected shape.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tagger = AudioTagging(checkpoint_path=str(CKPT_PATH), device=device)
    n_files = len(file_list)
    X = np.zeros((n_files, EMB_DIM), dtype=np.float32)
    started = time.time()
    for row, fname in enumerate(file_list):
        wave = load_audio(str(Path(root_dir) / fname), sr=SR)
        batch = crops_for_wave(wave, T)  # (5, T)
        with torch.no_grad():
            result = tagger.inference(batch)  # batched inference on (5, T)
        # Accept both output layouts: (…, embedding) tuple or {'embedding': …} dict.
        if isinstance(result, tuple) and len(result)==2:
            raw_embeddings = result[1]
        elif isinstance(result, dict) and 'embedding' in result:
            raw_embeddings = result['embedding']
        else:
            raise RuntimeError('Unexpected AudioTagging output type')
        embs = np.asarray(raw_embeddings, dtype=np.float32)
        assert embs.ndim == 2 and embs.shape[1] == EMB_DIM, f'Bad embedding shape: {embs.shape}'
        # Aggregate the crops by a plain mean over the crop axis.
        X[row] = embs.mean(axis=0)
        done = row + 1
        if done % log_every == 0:
            dt = time.time() - started
            print(f'  Curated MC5: {done}/{n_files} in {dt/60:.1f} min')
    assert X.shape == (n_files, EMB_DIM), f'Output shape mismatch: {X.shape}'
    return X

# File order in train_curated_folds.csv defines the row order of the embeddings.
df_cur = pd.read_csv(BASE / 'train_curated_folds.csv')
train_files = df_cur['fname'].values

emb_cur_mc5_path = BASE / 'embeddings_curated_mc5.npy'
expected_shape = (len(train_files), EMB_DIM)
X_cur = None
if emb_cur_mc5_path.exists():
    cached = np.load(emb_cur_mc5_path)
    # Only trust the cache when it matches the current file list; a stale or
    # truncated array would silently misalign features and labels downstream.
    if cached.shape == expected_shape:
        X_cur = cached
        print('Loaded cached curated MC5 embeddings.')
    else:
        print(f'Cached curated MC5 embeddings have shape {cached.shape}, '
              f'expected {expected_shape}; re-extracting.')
if X_cur is None:
    print('Extracting curated MC5 embeddings ...')
    X_cur = extract_mc5_embeddings_curated(train_files, root_dir=BASE/'train_curated', log_every=100)
    np.save(emb_cur_mc5_path, X_cur)
    print('Saved curated MC5 embeddings.')

import torch as _torch, librosa as _librosa

# Fingerprint the checkpoint (size and SHA-1) so the embeddings can be traced
# back to the exact weights; both stay None if the file is missing.
if CKPT_PATH.exists():
    ckpt_size = CKPT_PATH.stat().st_size
    ckpt_sha1 = sha1_of_file(CKPT_PATH)
else:
    ckpt_size = None
    ckpt_sha1 = None

# Provenance record for this stage: TTA scheme, crop geometry, library versions
# and the checkpoint fingerprint.
meta = dict(
    tta='mc5',
    sr=SR,
    T_sec=T_SEC,
    T=T,
    crops='begin,center,end,25%,75%',
    aggregation='mean',
    stage='curated_only',
    versions={
        'torch': getattr(_torch, '__version__', 'n/a'),
        'librosa': getattr(_librosa, '__version__', 'n/a'),
    },
    checkpoint={'path': str(CKPT_PATH), 'size_bytes': ckpt_size, 'sha1': ckpt_sha1},
)
with open('metadata_c3a.json', 'w') as f:
    json.dump(meta, f)
print('metadata_c3a.json written with provenance.')


Loaded cached curated MC5 embeddings.


metadata_c3a.json written with provenance.


In [5]:

from pathlib import Path
import numpy as np, pandas as pd, json
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Reload the fold table and rebuild the class vocabulary so this cell can run
# without relying on variables from earlier cells.
BASE = Path('.')
df_cur = pd.read_csv(BASE / 'train_curated_folds.csv')
df_ss  = pd.read_csv(BASE / 'sample_submission.csv')
# Every submission column other than 'fname' is a class.
class_names = [col for col in df_ss.columns if col != 'fname']
n_classes = len(class_names)
label_to_idx = {name: pos for pos, name in enumerate(class_names)}

def parse_labels_str(s):
    """Split a label string into a list of non-empty, stripped tokens.

    Both ',' and ';' act as separators. No vocabulary check is done here.

    Args:
        s: Label string. Any non-string value yields an empty list.

    Returns:
        The list of tokens, in order of appearance.
    """
    if not isinstance(s, str):
        return []
    stripped = (piece.strip() for piece in s.replace(';', ',').split(','))
    return [tok for tok in stripped if tok]

def encode_labels(s):
    """Encode a label string as a multi-hot float32 vector of length ``n_classes``.

    Tokens come from ``parse_labels_str``. Tokens that are not keys of
    ``label_to_idx`` are skipped without raising.

    Args:
        s: Label string (non-strings encode to an all-zero vector).

    Returns:
        A 1-D float32 array with 1.0 at each recognised label's position.
    """
    y = np.zeros(n_classes, dtype=np.float32)
    for tok in parse_labels_str(s):
        pos = label_to_idx.get(tok)
        if pos is not None:
            y[pos] = 1.0
    return y

def lwlrap_np(truth, scores):
    """Compute label-weighted label-ranking average precision (LWLRAP).

    Args:
        truth: 2-D array (samples x labels); entries > 0 mark positive labels.
        scores: Array of the same shape holding the predicted scores.

    Returns:
        A tuple ``(lwlrap, per_class)``: the overall score as a Python float
        and a 1-D array with the mean ranking precision of each label. Labels
        with no positives get a per-class value of 0.
    """
    assert truth.shape == scores.shape
    n_samples, n_labels = truth.shape
    positives_per_label = truth.sum(axis=0)
    accumulated = np.zeros(n_labels)
    for idx in range(n_samples):
        sample_truth = truth[idx]
        # Skip samples that carry no positive label.
        if np.count_nonzero(sample_truth > 0) == 0:
            continue
        by_score = np.argsort(-scores[idx])  # label indices, best score first
        truth_by_rank = sample_truth[by_score]
        running_hits = np.cumsum(truth_by_rank)
        positive_ranks = np.nonzero(truth_by_rank > 0)[0]
        # Precision at a positive's rank = positives so far / (rank index + 1).
        precision_at_hit = running_hits[positive_ranks] / (positive_ranks + 1)
        np.add.at(accumulated, by_score[positive_ranks], precision_at_hit)
    # Clamp the divisor so labels with no positives do not divide by zero.
    per_class = accumulated / np.maximum(positives_per_label, 1)
    weights = positives_per_label / max(truth.sum(), 1)
    return float((per_class * weights).sum()), per_class

# Load the cached MC5 embeddings and build the multi-hot label matrix.
X_cur = np.load('embeddings_curated_mc5.npy')
assert X_cur.ndim == 2 and X_cur.shape[1] == 2048, f'Embeddings shape invalid: {X_cur.shape}'
# Rows are matched to df_cur by position, so the counts must agree; a stale
# cache with extra rows would otherwise train on misaligned features.
assert X_cur.shape[0] == len(df_cur), (
    f'Embeddings rows ({X_cur.shape[0]}) do not match train_curated_folds.csv rows '
    f'({len(df_cur)}); delete embeddings_curated_mc5.npy and re-extract.'
)
Y_cur = np.stack(df_cur['labels'].apply(encode_labels).values).astype(np.float32)

# Out-of-fold predictions: each row is filled by the model that did not see it.
oof = np.zeros((len(df_cur), n_classes), dtype=np.float32)
fold_scores = []
fold_ids = df_cur['fold'].values  # hoisted: does not change between folds
for k in range(5):
    trn_idx = np.where(fold_ids != k)[0]
    val_idx = np.where(fold_ids == k)[0]
    X_tr, X_va = X_cur[trn_idx], X_cur[val_idx]
    y_tr, y_va = Y_cur[trn_idx], Y_cur[val_idx]
    # One standardise+logistic-regression pipeline per class (one-vs-rest); the
    # scaler is fitted inside the pipeline, i.e. on the training fold only.
    base_lr = LogisticRegression(solver='lbfgs', max_iter=1000, C=2.0, n_jobs=16, verbose=0)
    clf = OneVsRestClassifier(make_pipeline(StandardScaler(with_mean=True, with_std=True), base_lr), n_jobs=-1)
    clf.fit(X_tr, y_tr)
    proba = clf.predict_proba(X_va)
    oof[val_idx] = proba.astype(np.float32)
    lw, _ = lwlrap_np(y_va, proba)
    fold_scores.append(lw)
    print(f'Fold {k} LWLRAP (MC5 curated)={lw:.4f}')
oof_lw, _ = lwlrap_np(Y_cur, oof)
print(f'OOF LWLRAP (MC5 curated)={oof_lw:.4f}; per-fold={fold_scores}')
np.save('oof_tta.npy', oof)

# Mandatory metrics persistence
# Reference score the OOF result is compared against.
# NOTE(review): presumably the OOF LWLRAP of the earlier C2 stage — confirm.
BASELINE = 0.8001
delta = float(oof_lw - BASELINE)
# The checkpoint hash is best-effort provenance: record None if it cannot be
# read, but say why instead of failing silently.
ck_sha1 = None
try:
    with open('metadata_c3a.json', 'r') as f:
        meta = json.load(f)
    ck_sha1 = meta.get('checkpoint', {}).get('sha1')
except Exception as exc:
    print(f'Could not read checkpoint sha1 from metadata_c3a.json ({exc!r}); recording None.')
metrics = {
    'stage': 'C3A_curated_mc5',
    'oof_lwlrap': float(oof_lw),
    'per_fold_lwlrap': [float(x) for x in fold_scores],
    'delta_vs_c2_baseline': delta,
    'tta_scheme': 'mc5_mean',
    'checkpoint_sha1': ck_sha1
}
with open('metrics_c3a_curated.json', 'w') as f:
    json.dump(metrics, f)
print('Persisted metrics to metrics_c3a_curated.json:', metrics)


Fold 0 LWLRAP (MC5 curated)=0.8125


Fold 1 LWLRAP (MC5 curated)=0.8058


Fold 2 LWLRAP (MC5 curated)=0.8127


Fold 3 LWLRAP (MC5 curated)=0.8160


Fold 4 LWLRAP (MC5 curated)=0.7810
OOF LWLRAP (MC5 curated)=0.8049; per-fold=[0.8125134119899252, 0.8058232379476366, 0.8127316131414057, 0.8160070710672407, 0.7809753987693859]
Persisted metrics to metrics_c3a_curated.json: {'stage': 'C3A_curated_mc5', 'oof_lwlrap': 0.8049036654978639, 'per_fold_lwlrap': [0.8125134119899252, 0.8058232379476366, 0.8127316131414057, 0.8160070710672407, 0.7809753987693859], 'delta_vs_c2_baseline': 0.004803665497863818, 'tta_scheme': 'mc5_mean', 'checkpoint_sha1': '5f73e32676afd7a763ddec6693d975be16859f90'}
