# C3A: Multi-Crop Embedding TTA (K=5)

This notebook implements K=5 deterministic multi-crop TTA at the embedding level using PANNs CNN14.
- Crops per clip (target T=10s @ 32kHz): begin, center, end, 25% offset, 75% offset.
- For clips no longer than T: zero-pad to T; all five crops then start at offset 0 and are identical.
- Aggregation: mean across the 5 embeddings (no pre-normalization).
- CV: fixed 5-fold MultilabelStratifiedKFold (MLSKF) assignment read from the `fold` column of train_curated_folds.csv (seed=42).
- Outputs: embeddings_curated_mc5.npy, embeddings_test_mc5.npy, metadata_c3a.json, oof_tta.npy (OOF predictions), submission.csv; per-fold and overall OOF LWLRAP are printed.
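- Environment: PANNs assets are read from /app/panns_data (the CNN14 checkpoint is downloaded from Zenodo only if it is missing); librosa/torch/sklearn versions are printed for reproducibility.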


In [None]:
import sys, subprocess, os, json, time, warnings

from pathlib import Path

warnings.filterwarnings('ignore')



def pip_install(pkg):
    """Install *pkg* with pip for the running interpreter, best-effort.

    Runs ``<sys.executable> -m pip install --quiet <pkg>`` and prints whether
    it succeeded. Any exception (non-zero pip exit status, launch failure, ...)
    is caught and reported on stdout instead of being raised.
    """
    command = [sys.executable, '-m', 'pip', 'install', '--quiet', pkg]
    try:
        subprocess.check_call(command)
    except Exception as e:
        print(f'Failed to install {pkg}: {e}')
    else:
        print(f'Installed: {pkg}')



# Distribution names whose importable module name is NOT simply the
# distribution name with '-' replaced by '_'. Without this mapping the probe
# below tries to import 'iterative_stratification' / 'scikit_learn', always
# fails, and re-runs pip on every execution even when the packages are present.
_IMPORT_NAMES = {
    'iterative-stratification': 'iterstrat',
    'scikit-learn': 'sklearn',
}

# Probe each dependency by importing it; install only the ones that are missing.
for pkg in ['soundfile', 'librosa', 'iterative-stratification', 'joblib', 'scikit-learn']:
    dist_name = pkg.split('==')[0]
    module_name = _IMPORT_NAMES.get(dist_name, dist_name.replace('-', '_'))
    try:
        __import__(module_name)
        print(f'{dist_name} already available')
    except Exception:
        # Deliberately broad: a package may be installed but still fail to
        # import for reasons other than ImportError; in every such case the
        # best-effort response is to (re)install it.
        pip_install(pkg)



import numpy as np

import pandas as pd

import soundfile as sf

import librosa

import torch

from sklearn.linear_model import LogisticRegression

from sklearn.multiclass import OneVsRestClassifier

from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import make_pipeline

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import joblib

BASE = Path('.')
np.random.seed(42)

# Report the installed version of each core library ('n/a' when the module
# exposes no __version__ attribute).
print('Versions:', {name: getattr(__import__(name), '__version__', 'n/a') for name in ['librosa', 'torch', 'sklearn']})



In [None]:
from pathlib import Path

import urllib.request

# Directory holding the PANNs assets; created on first use.
assets_dir = Path('/app/panns_data')
assets_dir.mkdir(parents=True, exist_ok=True)
labels_csv = assets_dir / 'class_labels_indices.csv'

if labels_csv.exists():
    print('Labels CSV exists at', labels_csv)
else:
    import csv

    # Write a placeholder label table: a header row plus 527 synthetic rows
    # (index, '/m/<index>', 'class_<index zero-padded to 3 digits>').
    # NOTE(review): 527 presumably matches the class count the CNN14
    # checkpoint expects -- confirm against the panns_inference package.
    with open(labels_csv, 'w', newline='') as f:
        w = csv.writer(f)
        w.writerow(['index','mid','display_name'])
        w.writerows([i, f'/m/{i}', f'class_{i:03d}'] for i in range(527))
    print('Created labels stub at', labels_csv)



# CNN14 checkpoint location and the URL it is fetched from when absent.
ckpt_path = assets_dir / 'Cnn14_mAP=0.431.pth'
url = 'https://zenodo.org/record/3987831/files/Cnn14_mAP=0.431.pth?download=1'

if not ckpt_path.exists() or ckpt_path.stat().st_size == 0:
    print('Downloading CNN14 weights ...')
    # Stream into a sibling '.part' file and rename it into place only after
    # the transfer finished. Writing directly to ckpt_path would leave a
    # truncated, non-empty file behind on an interrupted download, and the
    # existence/size check above would then accept it as a valid checkpoint.
    tmp_path = ckpt_path.with_name(ckpt_path.name + '.part')
    try:
        with urllib.request.urlopen(url) as resp, open(tmp_path, 'wb') as out:
            while True:
                chunk = resp.read(1<<20)  # 1 MiB per read
                if not chunk:
                    break
                out.write(chunk)
        tmp_path.replace(ckpt_path)
    finally:
        # After a successful rename the temp file no longer exists; this only
        # removes the partial file left behind by a failed transfer.
        if tmp_path.exists():
            tmp_path.unlink()
    print('Saved CNN14 to', ckpt_path, 'size:', ckpt_path.stat().st_size)
else:
    print('CNN14 checkpoint present:', ckpt_path, 'size:', ckpt_path.stat().st_size)



In [None]:
from pathlib import Path

import numpy as np, pandas as pd

import json

BASE = Path('.')

# Fold assignments for the curated training set and the submission template.
df_cur = pd.read_csv(BASE / 'train_curated_folds.csv')
df_ss = pd.read_csv(BASE / 'sample_submission.csv')

# Every submission column other than 'fname' is a class; a class's index is
# its position in that column order.
class_names = list(filter(lambda col: col != 'fname', df_ss.columns))
label_to_idx = dict(zip(class_names, range(len(class_names))))
n_classes = len(class_names)



def parse_labels_str(s):
    """Split a label string into a list of validated label names.

    Labels may be separated by ',' or ';'; surrounding whitespace is stripped
    and empty pieces are dropped. A non-string input (e.g. NaN/None) yields [].

    Raises:
        ValueError: if any label is not a key of ``label_to_idx``. The message
            shows at most the first five offending labels and their total count.
    """
    if not isinstance(s, str):
        return []
    tokens = []
    for piece in s.replace(';', ',').split(','):
        piece = piece.strip()
        if piece:
            tokens.append(piece)
    bad = [tok for tok in tokens if tok not in label_to_idx]
    if bad:
        raise ValueError(f'Unknown labels: {bad[:5]} (total {len(bad)})')
    return tokens



def encode_tokens(toks):
    """Return a float32 multi-hot vector of length ``n_classes`` for *toks*.

    Each token is looked up in ``label_to_idx`` and its position set to 1.0.
    An unknown token raises KeyError.
    """
    vec = np.zeros(n_classes, dtype=np.float32)
    hot = [label_to_idx[tok] for tok in toks]
    if hot:
        vec[hot] = 1.0
    return vec



def encode_labels(s):
    """Parse a raw label string and return its multi-hot encoding."""
    tokens = parse_labels_str(s)
    return encode_tokens(tokens)



def lwlrap_np(truth, scores):
    """Compute label-weighted label-ranking average precision (LWLRAP).

    Args:
        truth: 2-D array (samples x labels); entries > 0 mark positive labels.
        scores: array of the same shape holding the predicted scores.

    Returns:
        ``(lwlrap, per_class)``: the overall score as a Python float, and the
        per-class average precision array. Classes without positives get 0.
    """
    assert truth.shape == scores.shape
    _, n_labels = truth.shape
    positives_per_class = truth.sum(axis=0)
    precision_sums = np.zeros(n_labels)
    for row_truth, row_scores in zip(truth, scores):
        # Samples without any positive label contribute nothing.
        if not (row_truth > 0).any():
            continue
        order = np.argsort(-row_scores)  # label indices, best score first
        truth_in_order = row_truth[order]
        hit_ranks = np.flatnonzero(truth_in_order > 0)
        running_hits = np.cumsum(truth_in_order)
        # Precision at each positive's rank = positives seen so far / rank.
        np.add.at(precision_sums, order[hit_ranks], running_hits[hit_ranks] / (hit_ranks + 1))
    # Guard both divisions against zero when a class (or the whole set) has no positives.
    per_class = precision_sums / np.maximum(positives_per_class, 1)
    weights = positives_per_class / max(truth.sum(), 1)
    return float((per_class * weights).sum()), per_class



# Sanity check: show how many curated rows fall into each fold, ordered by fold id.
print('Loaded folds and helpers. Fold counts:', df_cur['fold'].value_counts().sort_index().to_dict())



In [None]:
from pathlib import Path

import numpy as np, pandas as pd, time, json

import librosa, torch

from panns_inference import AudioTagging



# Working directory that holds the CSVs, audio folders and cached embeddings.
BASE = Path('.')

# Sample rate (Hz) that every clip is resampled to when loaded.
SR = 32000

# Length of each crop, in seconds.
T_SEC = 10.0

# Length of each crop, in samples.
T = int(SR * T_SEC)

# Number of crops per clip. NOTE(review): not referenced by the functions in
# this cell -- crop_starts hard-codes five offsets; keep the two in sync.
K = 5

# Width of the per-file embedding rows allocated in extract_mc5_embeddings.
EMB_DIM = 2048

# CNN14 checkpoint handed to AudioTagging.
CKPT_PATH = Path('/app/panns_data/Cnn14_mAP=0.431.pth')

# Fail fast if the checkpoint has not been fetched yet.
assert CKPT_PATH.exists(), 'Checkpoint missing; run Cell 3.'



def load_audio(path, sr=SR):
    """Load *path* with librosa as a mono waveform at *sr* Hz, cast to float32."""
    waveform, _ = librosa.load(path, sr=sr, mono=True)
    return waveform.astype(np.float32)



def crop_starts(L, T):
    """Return the five crop start offsets for a clip of L samples and crops of T samples.

    Order: begin, center, end, 25% and 75% of the available slack (L - T),
    each clamped to [0, L - T]. When the clip is not longer than a crop,
    all five offsets are 0.
    """
    slack = L - T
    if slack <= 0:
        return [0] * 5
    offsets = (0, slack // 2, slack, int(0.25 * slack), int(0.75 * slack))
    return [min(max(0, off), slack) for off in offsets]



def crops_for_wave(y, T):
    """Cut waveform *y* into fixed-length crops of T samples.

    The crop offsets come from ``crop_starts``; one crop is produced per
    offset. A clip shorter than T is zero-padded at the end to T samples and
    that padded clip is used for every crop.

    Returns:
        Array of shape (number of offsets, T), stacked along axis 0.
    """
    L = len(y)
    starts = crop_starts(L, T)
    if L < T:
        # The padded clip is the same for every crop, so build it once
        # instead of re-padding inside the per-crop loop.
        padded = np.pad(y, (0, T - L))
        return np.stack([padded] * len(starts), 0)
    return np.stack([y[s:s + T] for s in starts], 0)  # (K, T)



def extract_mc5_embeddings(file_list, root_dir):
    """Compute one crop-averaged embedding per audio file.

    Each name in *file_list* is resolved under *root_dir*, loaded with
    ``load_audio`` and cut into crops by ``crops_for_wave``. Every crop is
    passed through ``AudioTagging.inference`` on its own (batch of one), and
    the per-crop embeddings are averaged. Progress is printed every 200 files.

    Returns:
        float32 array of shape ``(len(file_list), EMB_DIM)``.

    Raises:
        RuntimeError: if inference returns neither a 2-tuple nor a dict with
            an 'embedding' key.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    at = AudioTagging(checkpoint_path=str(CKPT_PATH), device=device)
    n_files = len(file_list)
    X = np.zeros((n_files, EMB_DIM), dtype=np.float32)
    audio_root = Path(root_dir)
    started = time.time()
    for done, fname in enumerate(file_list, start=1):
        wave = load_audio(str(audio_root / fname), sr=SR)
        crop_batch = crops_for_wave(wave, T)  # (5, T)
        crop_embs = []
        with torch.no_grad():
            for crop in crop_batch:
                out = at.inference(crop[np.newaxis, :])
                if isinstance(out, tuple) and len(out)==2:
                    # 2-tuple output: the embedding batch sits at index 1.
                    emb = out[1][0]
                elif isinstance(out, dict) and 'embedding' in out:
                    emb = np.asarray(out['embedding'], dtype=np.float32)[0]
                else:
                    raise RuntimeError('Unexpected AudioTagging output type')
                crop_embs.append(emb.astype(np.float32))
        X[done - 1] = np.stack(crop_embs, 0).mean(axis=0)
        if done % 200 == 0:
            elapsed = time.time() - started
            print(f'  {done}/{n_files} files processed in {elapsed/60:.1f} min')
    return X



# File lists: curated training clips and the test clips named in the
# submission template.
df_cur = pd.read_csv(BASE / 'train_curated_folds.csv')
df_ss = pd.read_csv(BASE / 'sample_submission.csv')
train_files, test_files = df_cur['fname'].values, df_ss['fname'].values

# On-disk cache locations for the extracted embeddings.
emb_cur_mc5_path = BASE / 'embeddings_curated_mc5.npy'
emb_test_mc5_path = BASE / 'embeddings_test_mc5.npy'



def _load_or_extract(cache_path, files, root_dir, split_name):
    """Return ``(embeddings, was_cached)`` for one split.

    Loads *cache_path* when it exists; otherwise extracts embeddings for
    *files* under *root_dir*, saves them to *cache_path* and returns them.
    *split_name* only appears in the progress message.
    """
    if cache_path.exists():
        return np.load(cache_path), True
    print(f'Extracting {split_name} MC5 embeddings ...')
    X = extract_mc5_embeddings(files, root_dir=root_dir)
    np.save(cache_path, X)
    return X, False


# Each split is cached independently, so a run that was interrupted after the
# curated embeddings were saved does not recompute them just because the test
# cache is still missing.
X_cur, cur_cached = _load_or_extract(emb_cur_mc5_path, train_files, BASE/'train_curated', 'curated')
X_test, test_cached = _load_or_extract(emb_test_mc5_path, test_files, BASE/'test', 'test')

if cur_cached and test_cached:
    print('Loaded cached MC5 embeddings.')
else:
    print('Saved MC5 embeddings.')



# Record the TTA configuration next to the embeddings.
meta = {
    'tta': 'mc5',
    'sr': SR,
    'T_sec': T_SEC,
    'T': T,
    'crops': 'begin,center,end,25%,75%',
    'aggregation': 'mean',
}
with open('metadata_c3a.json', 'w') as f:
    f.write(json.dumps(meta))



In [None]:
from pathlib import Path

import numpy as np, pandas as pd

from sklearn.linear_model import LogisticRegression

from sklearn.multiclass import OneVsRestClassifier

from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import make_pipeline



BASE = Path('.')

# Fold table for the curated set and the submission template.
df_cur = pd.read_csv(BASE / 'train_curated_folds.csv')
df_ss = pd.read_csv(BASE / 'sample_submission.csv')

# Class order follows the submission columns, skipping the 'fname' column.
class_names = [col for col in list(df_ss.columns) if col != 'fname']
label_to_idx = dict(zip(class_names, range(len(class_names))))
n_classes = len(class_names)



def parse_labels_str(s):
    """Split a label string on ',' or ';' into stripped, non-empty tokens.

    Non-string input (e.g. NaN/None) yields an empty list. Tokens are not
    validated against the known class names here.
    """
    if isinstance(s, str):
        pieces = (piece.strip() for piece in s.replace(';', ',').split(','))
        return [piece for piece in pieces if piece]
    return []



def encode_labels(s):
    """Return the float32 multi-hot vector (length ``n_classes``) for label string *s*.

    Tokens that are not keys of ``label_to_idx`` are silently skipped.
    NOTE(review): the helper-cell version of this function raises on unknown
    labels instead -- confirm the lenient behaviour here is intended.
    """
    y = np.zeros(n_classes, dtype=np.float32)
    known = [label_to_idx[tok] for tok in parse_labels_str(s) if tok in label_to_idx]
    if known:
        y[known] = 1.0
    return y



def lwlrap_np(truth, scores):
    """Compute label-weighted label-ranking average precision (LWLRAP).

    Args:
        truth: 2-D array (samples x labels); entries > 0 mark positive labels.
        scores: array of the same shape holding the predicted scores.

    Returns:
        ``(lwlrap, per_class)``: the overall score as a Python float, and the
        per-class average precision array. Classes without positives get 0.
    """
    assert truth.shape == scores.shape
    _, n_labels = truth.shape
    class_counts = truth.sum(axis=0)
    acc = np.zeros(n_labels)
    for y_row, s_row in zip(truth, scores):
        # Rows with no positive label are skipped entirely.
        if not (y_row > 0).any():
            continue
        order = np.argsort(-s_row)  # label indices, highest score first
        y_sorted = y_row[order]
        hit_pos = np.flatnonzero(y_sorted > 0)
        hits_so_far = np.cumsum(y_sorted)
        # Precision at each positive's rank = positives seen so far / rank.
        np.add.at(acc, order[hit_pos], hits_so_far[hit_pos] / (hit_pos + 1))
    # Guard both divisions against zero when a class (or the whole set) has no positives.
    per_class = acc / np.maximum(class_counts, 1)
    weights = class_counts / max(truth.sum(), 1)
    return float((per_class * weights).sum()), per_class



# Cached crop-averaged embeddings for the curated and test sets.
X_cur = np.load('embeddings_curated_mc5.npy')
X_test = np.load('embeddings_test_mc5.npy')

# One multi-hot target row per curated clip, in df_cur row order.
Y_cur = np.stack([encode_labels(lbl) for lbl in df_cur['labels']]).astype(np.float32)



# 5-fold CV: train a standardized one-vs-rest logistic regression on four
# folds, predict the held-out fold, and collect the out-of-fold probabilities.
oof = np.zeros((len(df_cur), n_classes), dtype=np.float32)
fold_scores = []
fold_of_row = df_cur['fold'].values
for k in range(5):
    val_idx = np.flatnonzero(fold_of_row == k)
    trn_idx = np.flatnonzero(fold_of_row != k)
    X_tr, y_tr = X_cur[trn_idx], Y_cur[trn_idx]
    X_va, y_va = X_cur[val_idx], Y_cur[val_idx]

    base_lr = LogisticRegression(solver='lbfgs', max_iter=1000, C=2.0, n_jobs=16, verbose=0)
    scaled_lr = make_pipeline(StandardScaler(with_mean=True, with_std=True), base_lr)
    clf = OneVsRestClassifier(scaled_lr, n_jobs=-1)
    clf.fit(X_tr, y_tr)

    proba = clf.predict_proba(X_va)
    oof[val_idx] = proba.astype(np.float32)
    lw, _ = lwlrap_np(y_va, proba)
    fold_scores.append(lw)
    print(f'Fold {k} LWLRAP={lw:.4f}')

# Score the assembled out-of-fold predictions and persist them.
oof_lw, _ = lwlrap_np(Y_cur, oof)
print(f'OOF LWLRAP (MC5)={oof_lw:.4f}; per-fold={fold_scores}')
np.save('oof_tta.npy', oof)



# Refit the same model on all curated data and predict the test set.
base_lr_full = LogisticRegression(solver='lbfgs', max_iter=1000, C=2.0, n_jobs=16, verbose=0)
scaled_lr_full = make_pipeline(StandardScaler(with_mean=True, with_std=True), base_lr_full)
clf_full = OneVsRestClassifier(scaled_lr_full, n_jobs=-1)
clf_full.fit(X_cur, Y_cur)
test_proba = clf_full.predict_proba(X_test).astype(np.float32)

# Submission layout: 'fname' first, then one probability column per class.
sub = pd.DataFrame(test_proba, columns=class_names)
sub.insert(0, 'fname', df_ss['fname'].values)
sub.to_csv('submission.csv', index=False)
print('Saved submission.csv (MC5). Shape:', sub.shape)

