In [3]:
# Tiny PANNs kNN sweep with LOSO and station-aware prior fusion
import os, sys, gc, time, re, json, math, warnings
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.neighbors import NearestNeighbors

warnings.filterwarnings('ignore')
DATA_DIR = Path('essential_data')

def load_species_list(path: Path):
    df = pd.read_csv(path)
    if {'class_id','code'}.issubset(df.columns):
        return df.sort_values('class_id')['code'].tolist()
    with open(path, 'r') as f:
        f.readline()
        out = []
        for line in f:
            s = line.strip()
            if not s: continue
            parts = s.split(',')
            out.append(parts[1] if len(parts)>1 else s)
    return out

def parse_rec_id2filename(path: Path):
    df = pd.read_csv(path)
    df = df.rename(columns={df.columns[0]:'rec_id', df.columns[1]:'filename'})
    df['rec_id'] = df['rec_id'].astype(int)
    df['station'] = df['filename'].str.extract(r'^(PC\d+)')
    return df[['rec_id','filename','station']]

def parse_labels(path: Path, C: int):
    rec_ids, flags, Y = [], [], []
    with open(path, 'r') as f:
        f.readline()
        for line in f:
            line = line.strip()
            if not line: continue
            parts = [tok.strip() for tok in line.split(',')]
            try: rid = int(parts[0])
            except: continue
            tokens = parts[1:] if len(parts)>1 else []
            is_test = any(tok=='?' for tok in tokens)
            y = np.zeros(C, dtype=int)
            if not is_test and tokens:
                for tok in tokens:
                    if tok in ('','?'): continue
                    try: idx = int(tok)
                    except: continue
                    if 0 <= idx < C: y[idx] = 1
            rec_ids.append(rid); flags.append(is_test); Y.append(y)
    lab_cols = [f'label_{i}' for i in range(C)]
    ydf = pd.DataFrame(np.vstack(Y), columns=lab_cols)
    df = pd.DataFrame({'rec_id': rec_ids, 'is_test': flags})
    return df.join(ydf), lab_cols

def macro_auc_np(P, Y):
    C = Y.shape[1]
    aucs = []
    for c in range(C):
        yt = Y[:,c]; yp = P[:,c]
        if yt.sum()==0 or yt.sum()==len(yt): continue
        try: aucs.append(roc_auc_score(yt, yp))
        except: pass
    return float(np.mean(aucs)) if aucs else np.nan

def l2_normalize_rows(A):
    A = A.astype(np.float32, copy=False)
    n = np.linalg.norm(A, axis=1, keepdims=True) + 1e-12
    return A / n

def load_panns_emb(path: Path, mask_train=None, mask_test=None):
    arr = np.load(path, allow_pickle=True)
    # Case 1: npz-like
    if hasattr(arr, 'files'):
        Xtr = arr['X_train'] if 'X_train' in arr.files else arr['train']
        Xte = arr['X_test'] if 'X_test' in arr.files else arr['test']
        id_tr = arr['train_ids'] if 'train_ids' in arr.files else None
        id_te = arr['test_ids'] if 'test_ids' in arr.files else None
        return Xtr, Xte, id_tr, id_te
    # Case 2: object ndarray storing dict
    if isinstance(arr, np.ndarray) and arr.dtype == object:
        try:
            obj = arr.item()
        except Exception:
            obj = arr.tolist() if hasattr(arr, 'tolist') else None
        if isinstance(obj, dict):
            Xtr = obj.get('X_train') or obj.get('train') or obj.get('train_X')
            Xte = obj.get('X_test') or obj.get('test') or obj.get('test_X')
            id_tr = obj.get('train_ids') or obj.get('ids_train') or obj.get('rec_ids_train')
            id_te = obj.get('test_ids') or obj.get('ids_test') or obj.get('rec_ids_test')
            return Xtr, Xte, id_tr, id_te
    # Case 3: single 2D array of shape (N_total, D) aligned to labels order
    if isinstance(arr, np.ndarray) and arr.ndim == 2 and mask_train is not None and mask_test is not None:
        assert arr.shape[0] == int(mask_train.sum() + mask_test.sum()), 'Embedding rows != N_total'
        return arr[mask_train], arr[mask_test], None, None
    raise RuntimeError('Unsupported panns_cnn14_emb.npy format')

def align_by_rec_ids(Xtr, Xte, id_tr, id_te, meta_train, meta_test):
    # Align embeddings rows to meta_train/meta_test rec_id order
    tr_order = meta_train['rec_id'].values.tolist()
    te_order = meta_test['rec_id'].values.tolist()
    def reindex(X, ids, order):
        if ids is None:
            # assume already aligned
            return X
        mp = {int(r): i for i, r in enumerate(ids)}
        idx = [mp[r] for r in order]
        return X[idx]
    return reindex(Xtr, id_tr, tr_order), reindex(Xte, id_te, te_order)

def knn_fold_predict(X_tr_emb, y_tr, X_va_emb, k=11, metric='cosine', weights='distance'):
    nn = NearestNeighbors(n_neighbors=min(k, len(X_tr_emb)), metric=metric, algorithm='auto')
    nn.fit(X_tr_emb)
    dists, idxs = nn.kneighbors(X_va_emb, return_distance=True)
    if weights == 'distance':
        w = 1.0 / (dists + 1e-6)
    else:
        w = np.ones_like(dists)
    Ytr = y_tr.astype(np.float32)
    C = Ytr.shape[1]
    P = np.zeros((len(X_va_emb), C), dtype=np.float32)
    for i in range(len(X_va_emb)):
        nbr_idx = idxs[i]
        wi = w[i][:, None]
        votes = (Ytr[nbr_idx] * wi).sum(axis=0)
        denom = wi.sum() + 1e-8
        P[i] = votes / denom
    return np.clip(P, 0.0, 1.0)

def knn_loso_oof_and_test(X_emb_tr, X_emb_te, y_train_df, groups, k=11, metric='cosine', weights='distance'):
    logo = LeaveOneGroupOut()
    idx = np.arange(len(groups))
    Y = y_train_df.values.astype(np.uint8)
    C = Y.shape[1]
    P_oof = np.zeros((len(Y), C), dtype=np.float32)
    fold = 0
    t0 = time.time()
    for tr, va in logo.split(idx, groups=groups):
        fold += 1
        print(f'[LOSO] fold {fold:02d} | tr={len(tr)} va={len(va)} | k={k} metric={metric} weights={weights} | elapsed={time.time()-t0:.1f}s')
        sys.stdout.flush()
        P_oof[va] = knn_fold_predict(X_emb_tr[tr], Y[tr], X_emb_tr[va], k=k, metric=metric, weights=weights)
    print('[Full-train NN] fitting for test...')
    P_test = knn_fold_predict(X_emb_tr, Y, X_emb_te, k=k, metric=metric, weights=weights)
    auc = macro_auc_np(P_oof, Y)
    return P_oof, P_test, auc

def compute_fulltrain_station_priors(meta_train, y_train_df, alpha=30.0):
    Y = y_train_df.values.astype(float)
    C = Y.shape[1]
    p_global = Y.mean(axis=0)
    df = pd.DataFrame(Y, columns=[f'c{i}' for i in range(C)])
    df['station'] = meta_train['station'].values
    grp = df.groupby('station')
    n = grp.size()
    pos = grp[[f'c{i}' for i in range(C)]].sum()
    eb = {}
    for st, cnt in n.items():
        pos_st = pos.loc[st].values
        eb[st] = (pos_st + alpha * p_global) / (cnt + alpha)
    st_arr = meta_train['station'].values
    prior_train = np.vstack([eb.get(s, p_global) for s in st_arr])
    return eb, p_global, prior_train

def logit_zscore_full(prior_train):
    P = np.clip(prior_train, 1e-6, 1-1e-6)
    L = np.log(P/(1-P))
    L = np.clip(L, -6, 6)
    mu = L.mean(axis=0)
    sd = L.std(axis=0) + 1e-6
    Z = (L - mu) / sd
    return Z, mu, sd

def build_test_Z_station(meta_test, eb_map, p_global, mu, sd):
    T = len(meta_test)
    C = len(mu)
    P = np.tile(p_global, (T, 1))
    st_vals = meta_test['station'].values
    for i, s in enumerate(st_vals):
        if s in eb_map:
            P[i] = eb_map[s]
    L = np.log(np.clip(P,1e-6,1-1e-6)/np.clip(1-P,1e-6,1))
    L = np.clip(L, -6, 6)
    return (L - mu)/sd

def logit(p):
    p = np.clip(p, 1e-6, 1-1e-6)
    return np.log(p/(1-p))
def sigmoid(x):
    return 1.0/(1.0+np.exp(-x))

# 1) Load core metadata and labels
species = load_species_list(DATA_DIR/'species_list.txt')
rec_map = parse_rec_id2filename(DATA_DIR/'rec_id2filename.txt')
labels_df, lab_cols_tmp = parse_labels(DATA_DIR/'rec_labels_test_hidden.txt', len(species))
df = rec_map.merge(labels_df, on='rec_id', how='right')
train_df = df[~df['is_test']].copy()
test_df = df[df['is_test']].copy()
y_train = train_df[[c for c in train_df.columns if c.startswith('label_')]].copy()
y_train.columns = [f'label_{s}' for s in species]
groups = train_df['station'].values
meta_train = train_df[['rec_id','filename','station']].copy()
meta_test = test_df[['rec_id','filename','station']].copy()
print(f'Train N={len(train_df)} Test N={len(test_df)} Classes={y_train.shape[1]} Stations={len(pd.unique(groups))}')
sys.stdout.flush()

# 2) Load embeddings and align (supports single 2D array of all rows)
mask_train = (~df['is_test']).values
mask_test = (df['is_test']).values
Xtr, Xte, id_tr, id_te = load_panns_emb(Path('panns_cnn14_emb.npy'), mask_train=mask_train, mask_test=mask_test)
Xtr, Xte = align_by_rec_ids(Xtr, Xte, id_tr, id_te, meta_train, meta_test)
Xtr = l2_normalize_rows(Xtr); Xte = l2_normalize_rows(Xte)
print('Embeddings:', Xtr.shape, Xte.shape)
sys.stdout.flush()

# 3) Sweep k and metrics
sweep = [
    {'k':7, 'metric':'cosine', 'weights':'distance'},
    {'k':9, 'metric':'cosine', 'weights':'distance'},
    {'k':13, 'metric':'cosine', 'weights':'distance'},
    {'k':11, 'metric':'euclidean', 'weights':'distance'},
]
best = {'auc': -1.0, 'cfg': None, 'P_oof': None, 'P_test': None}
for i, cfg in enumerate(sweep):
    t0 = time.time()
    P_oof, P_test, auc = knn_loso_oof_and_test(Xtr, Xte, y_train, groups, **cfg)
    print(f"[Sweep {i+1}/{len(sweep)}] k={cfg['k']} metric={cfg['metric']} -> pooled macro AUC={auc:.4f} | dt={time.time()-t0:.1f}s")
    sys.stdout.flush()
    if auc > best['auc']:
        best.update({'auc': auc, 'cfg': cfg.copy(), 'P_oof': P_oof, 'P_test': P_test})

print('Best sweep result:', best['cfg'], 'AUC=', f"{best['auc']:.4f}")
sys.stdout.flush()

# 4) Save plain kNN submission for best config
P_test_best = best['P_test']
rows = []
for i, rec_id in enumerate(meta_test['rec_id'].values.tolist()):
    for c in range(y_train.shape[1]):
        rows.append((rec_id*100 + c, float(P_test_best[i, c])))
sub_knn = pd.DataFrame(rows, columns=['Id','Probability']).sort_values('Id').reset_index(drop=True)
sub_knn.to_csv('submission_knn_sweep.csv', index=False)
print('Saved submission_knn_sweep.csv | rows=', len(sub_knn))

# 5) Station-aware prior fusion with lambda=0.25 (from weighted blend sweep)
eb_map, p_global, prior_train = compute_fulltrain_station_priors(meta_train, y_train, alpha=30.0)
prior_train_z, mu, sd = logit_zscore_full(prior_train)
Z_test = build_test_Z_station(meta_test, eb_map, p_global, mu, sd)
lam = 0.25
P_test_fused = sigmoid(np.clip(logit(P_test_best) + lam*Z_test, -12, 12))
rows2 = []
for i, rec_id in enumerate(meta_test['rec_id'].values.tolist()):
    for c in range(y_train.shape[1]):
        rows2.append((rec_id*100 + c, float(P_test_fused[i, c])))
sub_knn_pf = pd.DataFrame(rows2, columns=['Id','Probability']).sort_values('Id').reset_index(drop=True)
sub_knn_pf.to_csv('submission_knn_sweep_priorfusion_station.csv', index=False)
print('Saved submission_knn_sweep_priorfusion_station.csv | rows=', len(sub_knn_pf))

gc.collect();

Train N=258 Test N=64 Classes=19 Stations=13


Embeddings: (258, 2048) (64, 2048)


[LOSO] fold 01 | tr=231 va=27 | k=7 metric=cosine weights=distance | elapsed=0.0s


[LOSO] fold 02 | tr=234 va=24 | k=7 metric=cosine weights=distance | elapsed=0.0s


[LOSO] fold 03 | tr=232 va=26 | k=7 metric=cosine weights=distance | elapsed=0.0s


[LOSO] fold 04 | tr=244 va=14 | k=7 metric=cosine weights=distance | elapsed=0.0s


[LOSO] fold 05 | tr=233 va=25 | k=7 metric=cosine weights=distance | elapsed=0.0s


[LOSO] fold 06 | tr=233 va=25 | k=7 metric=cosine weights=distance | elapsed=0.0s


[LOSO] fold 07 | tr=236 va=22 | k=7 metric=cosine weights=distance | elapsed=0.0s


[LOSO] fold 08 | tr=247 va=11 | k=7 metric=cosine weights=distance | elapsed=0.1s


[LOSO] fold 09 | tr=243 va=15 | k=7 metric=cosine weights=distance | elapsed=0.1s


[LOSO] fold 10 | tr=243 va=15 | k=7 metric=cosine weights=distance | elapsed=0.1s


[LOSO] fold 11 | tr=238 va=20 | k=7 metric=cosine weights=distance | elapsed=0.1s


[LOSO] fold 12 | tr=234 va=24 | k=7 metric=cosine weights=distance | elapsed=0.1s


[LOSO] fold 13 | tr=248 va=10 | k=7 metric=cosine weights=distance | elapsed=0.1s


[Full-train NN] fitting for test...
[Sweep 1/4] k=7 metric=cosine -> pooled macro AUC=0.6317 | dt=0.2s


[LOSO] fold 01 | tr=231 va=27 | k=9 metric=cosine weights=distance | elapsed=0.0s


[LOSO] fold 02 | tr=234 va=24 | k=9 metric=cosine weights=distance | elapsed=0.0s


[LOSO] fold 03 | tr=232 va=26 | k=9 metric=cosine weights=distance | elapsed=0.0s


[LOSO] fold 04 | tr=244 va=14 | k=9 metric=cosine weights=distance | elapsed=0.0s


[LOSO] fold 05 | tr=233 va=25 | k=9 metric=cosine weights=distance | elapsed=0.0s


[LOSO] fold 06 | tr=233 va=25 | k=9 metric=cosine weights=distance | elapsed=0.0s


[LOSO] fold 07 | tr=236 va=22 | k=9 metric=cosine weights=distance | elapsed=0.0s


[LOSO] fold 08 | tr=247 va=11 | k=9 metric=cosine weights=distance | elapsed=0.0s


[LOSO] fold 09 | tr=243 va=15 | k=9 metric=cosine weights=distance | elapsed=0.0s


[LOSO] fold 10 | tr=243 va=15 | k=9 metric=cosine weights=distance | elapsed=0.0s


[LOSO] fold 11 | tr=238 va=20 | k=9 metric=cosine weights=distance | elapsed=0.0s


[LOSO] fold 12 | tr=234 va=24 | k=9 metric=cosine weights=distance | elapsed=0.1s


[LOSO] fold 13 | tr=248 va=10 | k=9 metric=cosine weights=distance | elapsed=0.1s


[Full-train NN] fitting for test...


[Sweep 2/4] k=9 metric=cosine -> pooled macro AUC=0.6496 | dt=0.1s


[LOSO] fold 01 | tr=231 va=27 | k=13 metric=cosine weights=distance | elapsed=0.0s


[LOSO] fold 02 | tr=234 va=24 | k=13 metric=cosine weights=distance | elapsed=0.0s


[LOSO] fold 03 | tr=232 va=26 | k=13 metric=cosine weights=distance | elapsed=0.0s


[LOSO] fold 04 | tr=244 va=14 | k=13 metric=cosine weights=distance | elapsed=0.0s


[LOSO] fold 05 | tr=233 va=25 | k=13 metric=cosine weights=distance | elapsed=0.0s


[LOSO] fold 06 | tr=233 va=25 | k=13 metric=cosine weights=distance | elapsed=0.0s


[LOSO] fold 07 | tr=236 va=22 | k=13 metric=cosine weights=distance | elapsed=0.0s


[LOSO] fold 08 | tr=247 va=11 | k=13 metric=cosine weights=distance | elapsed=0.0s


[LOSO] fold 09 | tr=243 va=15 | k=13 metric=cosine weights=distance | elapsed=0.0s


[LOSO] fold 10 | tr=243 va=15 | k=13 metric=cosine weights=distance | elapsed=0.0s


[LOSO] fold 11 | tr=238 va=20 | k=13 metric=cosine weights=distance | elapsed=0.0s


[LOSO] fold 12 | tr=234 va=24 | k=13 metric=cosine weights=distance | elapsed=0.1s


[LOSO] fold 13 | tr=248 va=10 | k=13 metric=cosine weights=distance | elapsed=0.1s


[Full-train NN] fitting for test...
[Sweep 3/4] k=13 metric=cosine -> pooled macro AUC=0.6615 | dt=0.1s


[LOSO] fold 01 | tr=231 va=27 | k=11 metric=euclidean weights=distance | elapsed=0.0s


[LOSO] fold 02 | tr=234 va=24 | k=11 metric=euclidean weights=distance | elapsed=0.2s


[LOSO] fold 03 | tr=232 va=26 | k=11 metric=euclidean weights=distance | elapsed=0.3s


[LOSO] fold 04 | tr=244 va=14 | k=11 metric=euclidean weights=distance | elapsed=0.4s


[LOSO] fold 05 | tr=233 va=25 | k=11 metric=euclidean weights=distance | elapsed=0.6s


[LOSO] fold 06 | tr=233 va=25 | k=11 metric=euclidean weights=distance | elapsed=0.7s


[LOSO] fold 07 | tr=236 va=22 | k=11 metric=euclidean weights=distance | elapsed=0.8s


[LOSO] fold 08 | tr=247 va=11 | k=11 metric=euclidean weights=distance | elapsed=0.9s


[LOSO] fold 09 | tr=243 va=15 | k=11 metric=euclidean weights=distance | elapsed=1.0s


[LOSO] fold 10 | tr=243 va=15 | k=11 metric=euclidean weights=distance | elapsed=1.2s


[LOSO] fold 11 | tr=238 va=20 | k=11 metric=euclidean weights=distance | elapsed=1.3s


[LOSO] fold 12 | tr=234 va=24 | k=11 metric=euclidean weights=distance | elapsed=1.4s


[LOSO] fold 13 | tr=248 va=10 | k=11 metric=euclidean weights=distance | elapsed=1.5s


[Full-train NN] fitting for test...


[Sweep 4/4] k=11 metric=euclidean -> pooled macro AUC=0.6549 | dt=1.8s


Best sweep result: {'k': 13, 'metric': 'cosine', 'weights': 'distance'} AUC= 0.6615


Saved submission_knn_sweep.csv | rows= 1216
Saved submission_knn_sweep_priorfusion_station.csv | rows= 1216


In [2]:
# Inspect panns_cnn14_emb.npy format to adapt loader
import numpy as np
from pathlib import Path
p = Path('panns_cnn14_emb.npy')
arr = np.load(p, allow_pickle=True)
print('Loaded type:', type(arr))
if hasattr(arr, 'files'):
    print('npz-like files:', arr.files)
else:
    print('No .files attribute; dtype:', getattr(arr, 'dtype', None), 'shape:', getattr(arr, 'shape', None))
    if isinstance(arr, np.ndarray) and arr.dtype == object:
        try:
            obj = arr.item()
            print('Top-level keys:', list(obj.keys()))
            for k, v in obj.items():
                if hasattr(v, 'shape'):
                    print(' key', k, '-> shape', v.shape, 'dtype', getattr(v, 'dtype', None))
                else:
                    print(' key', k, '-> type', type(v))
        except Exception as e:
            print('arr.item() failed:', e)

Loaded type: <class 'numpy.ndarray'>
No .files attribute; dtype: float32 shape: (322, 2048)


In [4]:
# Regenerate kNN station-aware prior fusion with safer lambda=0.20 and set as submission.csv
import numpy as np, pandas as pd
from pathlib import Path

# Rebuild minimal metadata and priors
DATA_DIR = Path('essential_data')
def load_species_list(path: Path):
    df = pd.read_csv(path)
    if {'class_id','code'}.issubset(df.columns):
        return df.sort_values('class_id')['code'].tolist()
    with open(path, 'r') as f:
        f.readline()
        out = []
        for line in f:
            s = line.strip()
            if not s: continue
            parts = s.split(',')
            out.append(parts[1] if len(parts)>1 else s)
    return out
def parse_rec_id2filename(path: Path):
    df = pd.read_csv(path)
    df = df.rename(columns={df.columns[0]:'rec_id', df.columns[1]:'filename'})
    df['rec_id'] = df['rec_id'].astype(int)
    df['station'] = df['filename'].str.extract(r'^(PC\d+)')
    return df[['rec_id','filename','station']]
def parse_labels(path: Path, C: int):
    rec_ids, flags, Y = [], [], []
    with open(path, 'r') as f:
        f.readline()
        for line in f:
            line = line.strip()
            if not line: continue
            parts = [tok.strip() for tok in line.split(',')]
            try: rid = int(parts[0])
            except: continue
            tokens = parts[1:] if len(parts)>1 else []
            is_test = any(tok=='?' for tok in tokens)
            y = np.zeros(C, dtype=int)
            if not is_test and tokens:
                for tok in tokens:
                    if tok in ('','?'): continue
                    try: idx = int(tok)
                    except: continue
                    if 0 <= idx < C: y[idx] = 1
            rec_ids.append(rid); flags.append(is_test); Y.append(y)
    lab_cols = [f'label_{i}' for i in range(C)]
    ydf = pd.DataFrame(np.vstack(Y), columns=lab_cols)
    df = pd.DataFrame({'rec_id': rec_ids, 'is_test': flags})
    return df.join(ydf), lab_cols
def compute_fulltrain_station_priors(meta_train, y_train_df, alpha=30.0):
    Y = y_train_df.values.astype(float)
    C = Y.shape[1]
    p_global = Y.mean(axis=0)
    df = pd.DataFrame(Y, columns=[f'c{i}' for i in range(C)])
    df['station'] = meta_train['station'].values
    grp = df.groupby('station')
    n = grp.size()
    pos = grp[[f'c{i}' for i in range(C)]].sum()
    eb = {}
    for st, cnt in n.items():
        pos_st = pos.loc[st].values
        eb[st] = (pos_st + alpha * p_global) / (cnt + alpha)
    st_arr = meta_train['station'].values
    prior_train = np.vstack([eb.get(s, p_global) for s in st_arr])
    return eb, p_global, prior_train
def logit_zscore_full(prior_train):
    P = np.clip(prior_train, 1e-6, 1-1e-6)
    L = np.log(P/(1-P))
    L = np.clip(L, -6, 6)
    mu = L.mean(axis=0)
    sd = L.std(axis=0) + 1e-6
    Z = (L - mu) / sd
    return Z, mu, sd
def build_test_Z_station(meta_test, eb_map, p_global, mu, sd):
    T = len(meta_test)
    P = np.tile(p_global, (T, 1))
    st_vals = meta_test['station'].values
    for i, s in enumerate(st_vals):
        if s in eb_map:
            P[i] = eb_map[s]
    L = np.log(np.clip(P,1e-6,1-1e-6)/np.clip(1-P,1e-6,1))
    L = np.clip(L, -6, 6)
    return (L - mu)/sd
def logit(p):
    p = np.clip(p, 1e-6, 1-1e-6)
    return np.log(p/(1-p))
def sigmoid(x):
    return 1.0/(1.0+np.exp(-x))

species = load_species_list(DATA_DIR/'species_list.txt')
rec_map = parse_rec_id2filename(DATA_DIR/'rec_id2filename.txt')
labels_df, _ = parse_labels(DATA_DIR/'rec_labels_test_hidden.txt', len(species))
df_all = rec_map.merge(labels_df, on='rec_id', how='right')
train_df = df_all[~df_all['is_test']].copy()
test_df = df_all[df_all['is_test']].copy()
y_train = train_df[[c for c in train_df.columns if c.startswith('label_')]].copy()
y_train.columns = [f'label_{s}' for s in species]
meta_train = train_df[['rec_id','filename','station']].copy()
meta_test = test_df[['rec_id','filename','station']].copy()

# Build station-aware Z_test
eb_map, p_global, prior_train = compute_fulltrain_station_priors(meta_train, y_train, alpha=30.0)
prior_train_z, mu, sd = logit_zscore_full(prior_train)
Z_test = build_test_Z_station(meta_test, eb_map, p_global, mu, sd)

# Load existing kNN submission and apply lambda=0.20 fusion
sub_knn = pd.read_csv('submission_knn.csv').sort_values('Id').reset_index(drop=True)
ids = sub_knn['Id'].values; probs = sub_knn['Probability'].values
rec_ids_order = meta_test['rec_id'].values.tolist()
T = len(rec_ids_order); C = y_train.shape[1]
P_knn = np.zeros((T, C), dtype=float)
id_to_prob = dict(zip(ids, probs))
for i, rid in enumerate(rec_ids_order):
    base = rid*100
    for cls in range(C):
        P_knn[i, cls] = float(id_to_prob.get(base+cls, 0.5))
lam = 0.20
P_knn_f = sigmoid(np.clip(logit(P_knn) + lam*Z_test, -12, 12))
rows = []
for i, rid in enumerate(rec_ids_order):
    for cls in range(C):
        rows.append((rid*100 + cls, float(P_knn_f[i, cls])))
out = pd.DataFrame(rows, columns=['Id','Probability']).sort_values('Id').reset_index(drop=True)
out.to_csv('submission_knn_priorfusion_station_lam020.csv', index=False)
out.to_csv('submission.csv', index=False)
print('Saved submission_knn_priorfusion_station_lam020.csv and overwrote submission.csv | rows=', len(out))

Saved submission_knn_priorfusion_station_lam020.csv and overwrote submission.csv | rows= 1216


In [11]:
# Set submission.csv to robust hedge: submission_sktrees.csv
import pandas as pd, os, numpy as np
path = 'submission_sktrees.csv'
assert os.path.exists(path), 'Missing submission_sktrees.csv'
df = pd.read_csv(path).sort_values('Id').reset_index(drop=True)
assert len(df)==1216 and {'Id','Probability'}.issubset(df.columns), 'Bad format/row count'
assert np.isfinite(df['Probability']).all() and df['Probability'].between(0,1).all(), 'Prob out of [0,1] or non-finite'
df.to_csv('submission.csv', index=False)
print('Overwrote submission.csv with submission_sktrees.csv | rows=', len(df))

Overwrote submission.csv with submission_sktrees.csv | rows= 1216


In [6]:
# Set submission.csv to plain weighted blend hedge
import pandas as pd, os, numpy as np
path = 'submission_weighted.csv'
assert os.path.exists(path), 'Missing submission_weighted.csv'
df = pd.read_csv(path).sort_values('Id').reset_index(drop=True)
assert len(df)==1216 and {'Id','Probability'}.issubset(df.columns), 'Bad format/row count'
assert np.isfinite(df['Probability']).all() and df['Probability'].between(0,1).all(), 'Probs out of [0,1] or non-finite'
df.to_csv('submission.csv', index=False)
print('Overwrote submission.csv with submission_weighted.csv | rows=', len(df))

Overwrote submission.csv with submission_weighted.csv | rows= 1216


In [10]:
# Set submission.csv to plain kNN (no prior fusion) per expert fallback
import pandas as pd, os, numpy as np
path = 'submission_knn.csv'
assert os.path.exists(path), 'Missing submission_knn.csv'
df = pd.read_csv(path).sort_values('Id').reset_index(drop=True)
assert len(df)==1216 and {'Id','Probability'}.issubset(df.columns), 'Bad format/row count'
assert np.isfinite(df['Probability']).all() and df['Probability'].between(0,1).all(), 'Prob out of [0,1] or non-finite'
df.to_csv('submission.csv', index=False)
print('Overwrote submission.csv with submission_knn.csv | rows=', len(df))

Overwrote submission.csv with submission_knn.csv | rows= 1216


In [8]:
# Set submission.csv to FS/PANNs blend per expert order (second submission)
import pandas as pd, os, numpy as np
path = 'submission_fs.csv'
assert os.path.exists(path), 'Missing submission_fs.csv'
df = pd.read_csv(path).sort_values('Id').reset_index(drop=True)
assert len(df)==1216 and {'Id','Probability'}.issubset(df.columns), 'Bad format/row count'
assert np.isfinite(df['Probability']).all() and df['Probability'].between(0,1).all(), 'Prob out of [0,1] or non-finite'
df.to_csv('submission.csv', index=False)
print('Overwrote submission.csv with submission_fs.csv | rows=', len(df))

Overwrote submission.csv with submission_fs.csv | rows= 1216


In [12]:
# Set submission.csv to 50/50 average hedge (submission_avg_skt_fs.csv)
import pandas as pd, os, numpy as np
path = 'submission_avg_skt_fs.csv'
assert os.path.exists(path), 'Missing submission_avg_skt_fs.csv'
df = pd.read_csv(path).sort_values('Id').reset_index(drop=True)
assert len(df)==1216 and {'Id','Probability'}.issubset(df.columns), 'Bad format/row count'
assert np.isfinite(df['Probability']).all() and df['Probability'].between(0,1).all(), 'Prob out of [0,1] or non-finite'
df.to_csv('submission.csv', index=False)
print('Overwrote submission.csv with submission_avg_skt_fs.csv | rows=', len(df))

Overwrote submission.csv with submission_avg_skt_fs.csv | rows= 1216


In [14]:
import pandas as pd, hashlib, os
target = 'submission_rankmean.csv'
assert os.path.exists(target), f'Missing {target}'
sub = pd.read_csv(target)
print('Loaded', target, 'shape=', sub.shape)
sample = pd.read_csv('sample_submission.csv')
print('Sample shape=', sample.shape, 'cols=', list(sample.columns))
assert list(sub.columns) == list(sample.columns), f'Columns do not match sample_submission: {list(sub.columns)} vs {list(sample.columns)}'
assert len(sub) == len(sample), f'Row count mismatch: {len(sub)} vs {len(sample)}'
sub = sub.sort_values('Id').reset_index(drop=True)
sample_sorted = sample.sort_values('Id').reset_index(drop=True)
assert (sub['Id'] == sample_sorted['Id']).all(), 'Id values do not match sample_submission after sort'
sub.to_csv('submission.csv', index=False)
h = hashlib.sha256(open('submission.csv','rb').read()).hexdigest()
print('Wrote submission.csv, sha256=', h)
print(sub.head(3))

Loaded submission_rankmean.csv shape= (1216, 2)
Sample shape= (1216, 2) cols= ['Id', 'Probability']
Wrote submission.csv, sha256= 99d8c93f92dfaffb0f851347f47b0c264540b1d00801b0b003c2667e2988febb
    Id  Probability
0  100     0.212500
1  101     0.346875
2  102     0.265625


In [15]:
# Set submission.csv to rebuild equal-weight blend with validation
import pandas as pd, os, hashlib, numpy as np
path = 'submission_rebuild_blend.csv'
assert os.path.exists(path), f'Missing {path}'
sub = pd.read_csv(path).sort_values('Id').reset_index(drop=True)
sample = pd.read_csv('sample_submission.csv').sort_values('Id').reset_index(drop=True)
assert list(sub.columns) == list(sample.columns), f'Bad columns: {list(sub.columns)}'
assert len(sub) == len(sample) == 1216, f'Row count mismatch: {len(sub)} vs {len(sample)}'
assert (sub['Id'] == sample['Id']).all(), 'Id alignment mismatch vs sample'
assert np.isfinite(sub['Probability']).all() and sub['Probability'].between(0,1).all(), 'Probabilities invalid'
sub.to_csv('submission.csv', index=False)
print('Overwrote submission.csv with', path, '| rows=', len(sub))
print('sha256=', hashlib.sha256(open('submission.csv','rb').read()).hexdigest())
print(sub.head(3))

Overwrote submission.csv with submission_rebuild_blend.csv | rows= 1216
sha256= dc08e95be74bc84c31703a168bdd58b9b2cb244bce16c2e76bd876ef419c9b77
    Id  Probability
0  100     0.000641
1  101     0.003674
2  102     0.004824


In [16]:
# Alignment rescue for PANNs kNN: try multiple plausible row orders and pick best by OOF
import numpy as np, pandas as pd, time, sys, gc
from pathlib import Path
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import roc_auc_score

def macro_auc_np(P, Y):
    C = Y.shape[1]
    aucs = []
    for c in range(C):
        yt = Y[:,c]; yp = P[:,c]
        if yt.sum()==0 or yt.sum()==len(yt):
            continue
        try: aucs.append(roc_auc_score(yt, yp))
        except: pass
    return float(np.mean(aucs)) if aucs else np.nan

def l2_normalize_rows(A):
    A = A.astype(np.float32, copy=False)
    n = np.linalg.norm(A, axis=1, keepdims=True) + 1e-12
    return A / n

def knn_predict_block(X_tr_emb, Y_tr, X_va_emb, k=11, metric='cosine'):
    nn = NearestNeighbors(n_neighbors=min(k, len(X_tr_emb)), metric=metric)
    nn.fit(X_tr_emb)
    dists, idxs = nn.kneighbors(X_va_emb, return_distance=True)
    w = 1.0 / (dists + 1e-6)
    C = Y_tr.shape[1]
    P = np.zeros((len(X_va_emb), C), dtype=np.float32)
    for i in range(len(X_va_emb)):
        nbr = idxs[i]; wi = w[i][:, None]
        votes = (Y_tr[nbr] * wi).sum(axis=0)
        P[i] = votes / (wi.sum() + 1e-8)
    return np.clip(P, 0, 1)

def knn_loso_oof_and_test(X_tr, X_te, y_train_df, groups, k=11, metric='cosine'):
    logo = LeaveOneGroupOut()
    idx = np.arange(len(groups))
    Y = y_train_df.values.astype(np.uint8)
    C = Y.shape[1]
    P_oof = np.zeros((len(Y), C), dtype=np.float32)
    fold = 0; t0 = time.time()
    for tr, va in logo.split(idx, groups=groups):
        fold += 1
        print(f'[ALIGN] fold {fold:02d} tr={len(tr)} va={len(va)} | elapsed={time.time()-t0:.1f}s'); sys.stdout.flush()
        P_oof[va] = knn_predict_block(X_tr[tr], Y[tr], X_tr[va], k=k, metric=metric)
    P_test = knn_predict_block(X_tr, Y, X_te, k=k, metric=metric)
    auc = macro_auc_np(P_oof, Y)
    return P_oof, P_test, auc

# 1) Load core meta/labels
DATA_DIR = Path('essential_data')
def load_species_list(path: Path):
    df = pd.read_csv(path)
    if {'class_id','code'}.issubset(df.columns):
        return df.sort_values('class_id')['code'].tolist()
    with open(path, 'r') as f:
        f.readline()
        out = []
        for line in f:
            s = line.strip()
            if not s: continue
            parts = s.split(',')
            out.append(parts[1] if len(parts)>1 else s)
    return out
def parse_rec_id2filename(path: Path):
    df = pd.read_csv(path)
    df = df.rename(columns={df.columns[0]:'rec_id', df.columns[1]:'filename'})
    df['rec_id'] = df['rec_id'].astype(int)
    df['station'] = df['filename'].str.extract(r'^(PC\d+)')
    return df[['rec_id','filename','station']]
def parse_labels(path: Path, C: int):
    rec_ids, flags, Y = [], [], []
    with open(path, 'r') as f:
        f.readline()
        for line in f:
            line = line.strip()
            if not line: continue
            parts = [tok.strip() for tok in line.split(',')]
            try: rid = int(parts[0])
            except: continue
            tokens = parts[1:] if len(parts)>1 else []
            is_test = any(tok=='?' for tok in tokens)
            y = np.zeros(C, dtype=int)
            if not is_test and tokens:
                for tok in tokens:
                    if tok in ('','?'): continue
                    try: idx = int(tok)
                    except: continue
                    if 0 <= idx < C: y[idx] = 1
            rec_ids.append(rid); flags.append(is_test); Y.append(y)
    lab_cols = [f'label_{i}' for i in range(C)]
    ydf = pd.DataFrame(np.vstack(Y), columns=lab_cols)
    df = pd.DataFrame({'rec_id': rec_ids, 'is_test': flags})
    return df.join(ydf), lab_cols

species = load_species_list(DATA_DIR/'species_list.txt')
assert len(species)==19, f'species count {len(species)} != 19'
rec_map = parse_rec_id2filename(DATA_DIR/'rec_id2filename.txt')
labels_df, _ = parse_labels(DATA_DIR/'rec_labels_test_hidden.txt', len(species))
df_all = rec_map.merge(labels_df, on='rec_id', how='right')
train_df = df_all[~df_all['is_test']].copy()
test_df = df_all[df_all['is_test']].copy()
y_train = train_df[[c for c in train_df.columns if c.startswith('label_')]].copy()
y_train.columns = [f'label_{s}' for s in species]
groups = train_df['station'].values
meta_train = train_df[['rec_id','filename','station']].copy()
meta_test = test_df[['rec_id','filename','station']].copy()
print(f'Meta: Ntr={len(train_df)} Nte={len(test_df)} C={y_train.shape[1]} stations={len(pd.unique(groups))}')
sys.stdout.flush()

# 2) Load raw embeddings array
E_all = np.load('panns_cnn14_emb.npy', allow_pickle=True)
assert isinstance(E_all, np.ndarray) and E_all.ndim==2 and E_all.shape[1]==2048, f'Bad emb shape {getattr(E_all,"shape",None)}'
assert E_all.shape[0] == len(df_all), f'Embedding rows {E_all.shape[0]} != N_all {len(df_all)}'

# Build three candidate alignments:
# A) Current df_all row order
order_A = df_all.index.values
E_A = E_all.copy()  # as-is
# B) Sort by rec_id ascending
df_sorted = df_all.sort_values('rec_id').reset_index(drop=True)
mpB = {rid:i for i, rid in enumerate(df_sorted['rec_id'].values.tolist())}
idxB = [mpB[rid] for rid in df_all['rec_id'].values.tolist()]
E_B = E_all[idxB]
# C) Use rec_map order (as read from file) to assign, then reindex to df_all
mpC_assign = {rid:i for i, rid in enumerate(rec_map['rec_id'].values.tolist())}
idxC_assign = [mpC_assign[rid] for rid in df_all['rec_id'].values.tolist()]
E_C = E_all[idxC_assign]

cands = {'A_df_order': E_A, 'B_sort_recid': E_B, 'C_recmap_order': E_C}
results = {}
best_key, best_auc = None, -1.0
for key, E in cands.items():
    # Split to train/test via df_all masks
    mask_tr = (~df_all['is_test']).values
    mask_te = (df_all['is_test']).values
    Xtr = l2_normalize_rows(E[mask_tr])
    Xte = l2_normalize_rows(E[mask_te])
    print(f'Trying alignment {key}: Xtr {Xtr.shape} Xte {Xte.shape}'); sys.stdout.flush()
    P_oof, P_test, auc = knn_loso_oof_and_test(Xtr, Xte, y_train, groups, k=11, metric='cosine')
    print(f'  -> AUC {auc:.4f}'); sys.stdout.flush()
    results[key] = (auc, P_oof, P_test)
    if auc > best_auc:
        best_auc, best_key = auc, key

print('Best alignment:', best_key, 'AUC=', f'{best_auc:.4f}')
P_test_best = results[best_key][2]

# 3) Build submission from best
rows = []
for i, rec_id in enumerate(meta_test['rec_id'].values.tolist()):
    for c in range(y_train.shape[1]):
        rows.append((rec_id*100 + c, float(P_test_best[i, c])))
sub = pd.DataFrame(rows, columns=['Id','Probability']).sort_values('Id').reset_index(drop=True)
sub.to_csv('submission_knn_alignfix.csv', index=False)
sub.to_csv('submission.csv', index=False)
print('Saved submission_knn_alignfix.csv and overwrote submission.csv | rows=', len(sub))
print(sub.head(3))
gc.collect();

Meta: Ntr=258 Nte=64 C=19 stations=13


Trying alignment A_df_order: Xtr (258, 2048) Xte (64, 2048)


[ALIGN] fold 01 tr=231 va=27 | elapsed=0.0s


[ALIGN] fold 02 tr=234 va=24 | elapsed=0.0s


[ALIGN] fold 03 tr=232 va=26 | elapsed=0.0s


[ALIGN] fold 04 tr=244 va=14 | elapsed=0.0s


[ALIGN] fold 05 tr=233 va=25 | elapsed=0.0s


[ALIGN] fold 06 tr=233 va=25 | elapsed=0.0s


[ALIGN] fold 07 tr=236 va=22 | elapsed=0.0s


[ALIGN] fold 08 tr=247 va=11 | elapsed=0.1s


[ALIGN] fold 09 tr=243 va=15 | elapsed=0.1s


[ALIGN] fold 10 tr=243 va=15 | elapsed=0.1s


[ALIGN] fold 11 tr=238 va=20 | elapsed=0.1s


[ALIGN] fold 12 tr=234 va=24 | elapsed=0.1s


[ALIGN] fold 13 tr=248 va=10 | elapsed=0.1s


  -> AUC 0.6547


Trying alignment B_sort_recid: Xtr (258, 2048) Xte (64, 2048)


[ALIGN] fold 01 tr=231 va=27 | elapsed=0.0s


[ALIGN] fold 02 tr=234 va=24 | elapsed=0.0s


[ALIGN] fold 03 tr=232 va=26 | elapsed=0.0s


[ALIGN] fold 04 tr=244 va=14 | elapsed=0.0s


[ALIGN] fold 05 tr=233 va=25 | elapsed=0.0s


[ALIGN] fold 06 tr=233 va=25 | elapsed=0.0s


[ALIGN] fold 07 tr=236 va=22 | elapsed=0.0s


[ALIGN] fold 08 tr=247 va=11 | elapsed=0.1s


[ALIGN] fold 09 tr=243 va=15 | elapsed=0.1s


[ALIGN] fold 10 tr=243 va=15 | elapsed=0.1s


[ALIGN] fold 11 tr=238 va=20 | elapsed=0.1s


[ALIGN] fold 12 tr=234 va=24 | elapsed=0.1s


[ALIGN] fold 13 tr=248 va=10 | elapsed=0.1s


  -> AUC 0.6547


Trying alignment C_recmap_order: Xtr (258, 2048) Xte (64, 2048)


[ALIGN] fold 01 tr=231 va=27 | elapsed=0.0s


[ALIGN] fold 02 tr=234 va=24 | elapsed=0.0s


[ALIGN] fold 03 tr=232 va=26 | elapsed=0.0s


[ALIGN] fold 04 tr=244 va=14 | elapsed=0.0s


[ALIGN] fold 05 tr=233 va=25 | elapsed=0.0s


[ALIGN] fold 06 tr=233 va=25 | elapsed=0.0s


[ALIGN] fold 07 tr=236 va=22 | elapsed=0.0s


[ALIGN] fold 08 tr=247 va=11 | elapsed=0.0s


[ALIGN] fold 09 tr=243 va=15 | elapsed=0.0s


[ALIGN] fold 10 tr=243 va=15 | elapsed=0.0s


[ALIGN] fold 11 tr=238 va=20 | elapsed=0.0s


[ALIGN] fold 12 tr=234 va=24 | elapsed=0.0s


[ALIGN] fold 13 tr=248 va=10 | elapsed=0.1s


  -> AUC 0.6547


Best alignment: A_df_order AUC= 0.6547


Saved submission_knn_alignfix.csv and overwrote submission.csv | rows= 1216
    Id  Probability
0  100          0.0
1  101          0.0
2  102          0.0


In [17]:
# Re-run kNN with strict settings from production: k=11, cosine, uniform weights
import numpy as np, pandas as pd, sys, time, gc
from pathlib import Path
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import roc_auc_score

def macro_auc_np(P, Y):
    C = Y.shape[1]; aucs = []
    for c in range(C):
        yt = Y[:,c]; yp = P[:,c]
        if yt.sum()==0 or yt.sum()==len(yt):
            continue
        try: aucs.append(roc_auc_score(yt, yp))
        except: pass
    return float(np.mean(aucs)) if aucs else np.nan

def l2_normalize_rows(A):
    A = A.astype(np.float32, copy=False)
    n = np.linalg.norm(A, axis=1, keepdims=True) + 1e-12
    return A / n

def load_species_list(path: Path):
    df = pd.read_csv(path)
    if {'class_id','code'}.issubset(df.columns):
        return df.sort_values('class_id')['code'].tolist()
    with open(path, 'r') as f:
        f.readline()
        out = []
        for line in f:
            s = line.strip()
            if not s: continue
            parts = s.split(',')
            out.append(parts[1] if len(parts)>1 else s)
    return out

def parse_rec_id2filename(path: Path):
    df = pd.read_csv(path)
    df = df.rename(columns={df.columns[0]:'rec_id', df.columns[1]:'filename'})
    df['rec_id'] = df['rec_id'].astype(int)
    df['station'] = df['filename'].str.extract(r'^(PC\d+)')
    return df[['rec_id','filename','station']]

def parse_labels(path: Path, C: int):
    rec_ids, flags, Y = [], [], []
    with open(path, 'r') as f:
        f.readline()
        for line in f:
            line = line.strip()
            if not line: continue
            parts = [tok.strip() for tok in line.split(',')]
            try: rid = int(parts[0])
            except: continue
            tokens = parts[1:] if len(parts)>1 else []
            is_test = any(tok=='?' for tok in tokens)
            y = np.zeros(C, dtype=int)
            if not is_test and tokens:
                for tok in tokens:
                    if tok in ('','?'): continue
                    try: idx = int(tok)
                    except: continue
                    if 0 <= idx < C: y[idx] = 1
            rec_ids.append(rid); flags.append(is_test); Y.append(y)
    lab_cols = [f'label_{i}' for i in range(C)]
    ydf = pd.DataFrame(np.vstack(Y), columns=lab_cols)
    df = pd.DataFrame({'rec_id': rec_ids, 'is_test': flags})
    return df.join(ydf), lab_cols

DATA_DIR = Path('essential_data')
species = load_species_list(DATA_DIR/'species_list.txt')
assert len(species)==19, 'species != 19'
rec_map = parse_rec_id2filename(DATA_DIR/'rec_id2filename.txt')
labels_df, _ = parse_labels(DATA_DIR/'rec_labels_test_hidden.txt', len(species))
df_all = rec_map.merge(labels_df, on='rec_id', how='right')
train_df = df_all[~df_all['is_test']].copy()
test_df = df_all[df_all['is_test']].copy()
y_train = train_df[[c for c in train_df.columns if c.startswith('label_')]].copy()
y_train.columns = [f'label_{s}' for s in species]
groups = train_df['station'].values
meta_test = test_df[['rec_id','filename','station']].copy()
print(f'Ntr={len(train_df)} Nte={len(test_df)} C={y_train.shape[1]} stations={len(pd.unique(groups))}')
sys.stdout.flush()

# Use as-is embedding row order split by is_test mask (previous best among tried alignments)
E_all = np.load('panns_cnn14_emb.npy', allow_pickle=True)
assert isinstance(E_all, np.ndarray) and E_all.ndim==2 and E_all.shape[1]==2048, 'Bad emb array'
mask_tr = (~df_all['is_test']).values
mask_te = (df_all['is_test']).values
Xtr = l2_normalize_rows(E_all[mask_tr])
Xte = l2_normalize_rows(E_all[mask_te])
assert np.allclose(np.linalg.norm(Xtr[:5], axis=1), 1.0, atol=1e-5)
print('Emb shapes:', Xtr.shape, Xte.shape)

def knn_predict_uniform(X_tr, Y_tr, X_va, k=11, metric='cosine'):
    nn = NearestNeighbors(n_neighbors=min(k, len(X_tr)), metric=metric)
    nn.fit(X_tr)
    _, idxs = nn.kneighbors(X_va, return_distance=True)
    C = Y_tr.shape[1]
    P = np.zeros((len(X_va), C), dtype=np.float32)
    for i in range(len(X_va)):
        nbr = idxs[i]
        P[i] = Y_tr[nbr].mean(axis=0)
    return np.clip(P, 0, 1)

logo = LeaveOneGroupOut()
idx = np.arange(len(groups))
Y = y_train.values.astype(np.uint8)
P_oof = np.zeros_like(Y, dtype=np.float32)
t0 = time.time()
for f, (tr, va) in enumerate(logo.split(idx, groups=groups), 1):
    print(f'[UNIF] fold {f:02d} tr={len(tr)} va={len(va)} | elapsed={time.time()-t0:.1f}s'); sys.stdout.flush()
    P_oof[va] = knn_predict_uniform(Xtr[tr], Y[tr], Xtr[va], k=11, metric='cosine')
auc = macro_auc_np(P_oof, Y)
print('Pooled macro AUC (k=11, cosine, uniform):', f'{auc:.4f}')

# Full-train for test
P_test = knn_predict_uniform(Xtr, Y, Xte, k=11, metric='cosine')
rows = []
rec_ids = meta_test['rec_id'].values.tolist()
C = Y.shape[1]
for i, rid in enumerate(rec_ids):
    for c in range(C):
        rows.append((rid*100 + c, float(P_test[i, c])))
sub = pd.DataFrame(rows, columns=['Id','Probability']).sort_values('Id').reset_index(drop=True)
sub.to_csv('submission_knn_uniform.csv', index=False)
sub.to_csv('submission.csv', index=False)
print('Saved submission_knn_uniform.csv and overwrote submission.csv | rows=', len(sub))
print(sub.head(3))
gc.collect();

Ntr=258 Nte=64 C=19 stations=13


Emb shapes: (258, 2048) (64, 2048)
[UNIF] fold 01 tr=231 va=27 | elapsed=0.0s


[UNIF] fold 02 tr=234 va=24 | elapsed=0.0s


[UNIF] fold 03 tr=232 va=26 | elapsed=0.0s


[UNIF] fold 04 tr=244 va=14 | elapsed=0.0s


[UNIF] fold 05 tr=233 va=25 | elapsed=0.0s


[UNIF] fold 06 tr=233 va=25 | elapsed=0.0s


[UNIF] fold 07 tr=236 va=22 | elapsed=0.0s


[UNIF] fold 08 tr=247 va=11 | elapsed=0.0s


[UNIF] fold 09 tr=243 va=15 | elapsed=0.0s


[UNIF] fold 10 tr=243 va=15 | elapsed=0.0s


[UNIF] fold 11 tr=238 va=20 | elapsed=0.0s


[UNIF] fold 12 tr=234 va=24 | elapsed=0.1s


[UNIF] fold 13 tr=248 va=10 | elapsed=0.1s


Pooled macro AUC (k=11, cosine, uniform): 0.6523
Saved submission_knn_uniform.csv and overwrote submission.csv | rows= 1216
    Id  Probability
0  100          0.0
1  101          0.0
2  102          0.0


In [20]:
# Regenerate PANNs CNN14 embeddings with explicit rec_id mapping and save as NPZ
import os, sys, time, gc, warnings
from pathlib import Path
import numpy as np
import pandas as pd

warnings.filterwarnings('ignore')
DATA_DIR = Path('essential_data')
WAV_DIR = DATA_DIR/'src_wavs'

def log(msg):
    print(time.strftime('%H:%M:%S'), msg, flush=True)

# Robust loaders
def load_species_list(path: Path):
    try:
        df = pd.read_csv(path)
        if {'class_id','code'}.issubset(df.columns):
            return df.sort_values('class_id')['code'].tolist()
    except Exception:
        pass
    lines = []
    with open(path, 'r') as f:
        for ln in f:
            s = ln.strip()
            if not s: continue
            if s.lower().startswith('species') or ',' in s or '\t' in s:
                continue
            lines.append(s)
    return lines

def parse_rec_id2filename(path: Path):
    df = pd.read_csv(path)
    df = df.rename(columns={df.columns[0]:'rec_id', df.columns[1]:'filename'})
    df['rec_id'] = df['rec_id'].astype(int)
    df['station'] = df['filename'].str.extract(r'^(PC\d+)')
    return df[['rec_id','filename','station']]

def parse_labels(path: Path, C: int):
    rec_ids, flags, Y = [], [], []
    with open(path, 'r') as f:
        _ = f.readline()
        for line in f:
            line = line.strip()
            if not line: continue
            parts = [tok.strip() for tok in line.split(',')]
            try: rid = int(parts[0])
            except: continue
            tokens = parts[1:] if len(parts)>1 else []
            is_test = any(tok=='?' for tok in tokens)
            y = np.zeros(C, dtype=np.uint8)
            if not is_test:
                for tok in tokens:
                    if tok in ('','?'): continue
                    try: idx = int(tok)
                    except: continue
                    if 0 <= idx < C: y[idx] = 1
            rec_ids.append(rid); flags.append(is_test); Y.append(y)
    lab_cols = [f'label_{i}' for i in range(C)]
    ydf = pd.DataFrame(np.vstack(Y), columns=lab_cols)
    df = pd.DataFrame({'rec_id': rec_ids, 'is_test': flags})
    return df.join(ydf), lab_cols

# Build meta
species = load_species_list(DATA_DIR/'species_list.txt')
assert len(species)==19, f'species {len(species)} != 19'
rec_map = parse_rec_id2filename(DATA_DIR/'rec_id2filename.txt')
labels_df, _ = parse_labels(DATA_DIR/'rec_labels_test_hidden.txt', len(species))
df_all = rec_map.merge(labels_df, on='rec_id', how='right')
train_df = df_all[~df_all['is_test']].copy()
test_df = df_all[df_all['is_test']].copy()
log(f'Meta ready: N_all={len(df_all)} Ntr={len(train_df)} Nte={len(test_df)} stations={df_all.station.nunique()}')

# Ensure deps
def ensure(pkgs):
    import importlib
    for name, pipname in pkgs:
        try:
            importlib.import_module(name)
        except Exception:
            log(f'Installing {pipname} ...')
            import subprocess
            subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', pipname], check=True)
ensure([('librosa','librosa==0.10.1'), ('soundfile','soundfile==0.12.1')])
try:
    import torch
except Exception:
    # Fallback to CPU torch if missing
    ensure([('torch','torch==2.2.2')])
    import torch

import librosa, soundfile as sf
sys.path.insert(0, str(Path('panns_repo')/'pytorch'))
from models import Cnn14

# Load PANNs CNN14 model and weights
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
log(f'Using device: {device}')
# IMPORTANT: Use window_size=512 and hop_size=160 to match Cnn14_32k.pth (257 FFT bins)
model = Cnn14(sample_rate=32000, window_size=512, hop_size=160, mel_bins=64, fmin=50, fmax=14000, classes_num=527)
ckpt = torch.load('Cnn14_32k.pth', map_location='cpu')
state = ckpt.get('model', ckpt)
missing, unexpected = model.load_state_dict(state, strict=False)
log(f'Loaded Cnn14 weights | missing={len(missing)} unexpected={len(unexpected)}')
model.to(device)
model.eval()

# Helper to load and prepare audio
SR = 32000
FIX_LEN = SR * 10  # 10 seconds
def load_audio(filepath: Path):
    y, sr = librosa.load(filepath, sr=SR, mono=True)
    if len(y) < FIX_LEN:
        y = np.pad(y, (0, FIX_LEN - len(y)))
    elif len(y) > FIX_LEN:
        y = y[:FIX_LEN]
    return y.astype(np.float32)

# Compute embeddings in df_all row order
N = len(df_all)
emb = np.zeros((N, 2048), dtype=np.float32)
t0 = time.time()
bs = 16
buf = []
buf_idx = []
def flush_batch():
    if not buf: return
    x = np.stack(buf, axis=0)
    with torch.no_grad():
        xt = torch.from_numpy(x).to(device)
        out = model(xt)  # expect dict with 'embedding'
        if isinstance(out, dict) and 'embedding' in out:
            z = out['embedding']
        elif hasattr(out, 'embedding'):
            z = out.embedding
        else:
            # Some versions return tuple (clipwise_output, embedding)
            try:
                z = out[1]
            except Exception:
                raise RuntimeError('Unexpected Cnn14 output structure')
        z = z.detach().cpu().numpy().astype(np.float32)
    for j, idx in enumerate(buf_idx):
        emb[idx] = z[j]
    buf.clear(); buf_idx.clear()

for i, row in enumerate(df_all.itertuples(index=False)):
    if (i % 25)==0 and i>0:
        log(f'Processed {i}/{N} | dt={time.time()-t0:.1f}s')
    fname = getattr(row, 'filename')
    fname = str(fname)
    if not fname.lower().endswith('.wav'):
        fname = fname + '.wav'
    wav_path = WAV_DIR / fname
    if not wav_path.exists():
        raise FileNotFoundError(f'Missing audio file: {wav_path}')
    y = load_audio(wav_path)
    buf.append(y)
    buf_idx.append(i)
    if len(buf) >= bs:
        flush_batch()
flush_batch()
log(f'Embeddings computed: shape={emb.shape} | total dt={time.time()-t0:.1f}s')

# Split to train/test aligned to meta order
mask_tr = (~df_all['is_test']).values
mask_te = (df_all['is_test']).values
X_train_emb = emb[mask_tr]
X_test_emb = emb[mask_te]
ids_train = train_df['rec_id'].values.astype(np.int64)
ids_test = test_df['rec_id'].values.astype(np.int64)
assert X_train_emb.shape[0]==len(ids_train) and X_test_emb.shape[0]==len(ids_test)

# Save NPZ with explicit ids
out_path = 'panns_cnn14_emb_v2.npz'
np.savez_compressed(out_path, X_train=X_train_emb, X_test=X_test_emb, train_ids=ids_train, test_ids=ids_test)
log(f'Saved {out_path} with shapes tr={X_train_emb.shape} te={X_test_emb.shape}')
gc.collect();

23:17:24 Meta ready: N_all=322 Ntr=258 Nte=64 stations=13


23:17:24 Using device: cuda


23:17:26 Loaded Cnn14 weights | missing=0 unexpected=0


23:17:26 Processed 25/322 | dt=0.6s


23:17:27 Processed 50/322 | dt=1.1s


23:17:27 Processed 75/322 | dt=1.4s


23:17:28 Processed 100/322 | dt=1.8s


23:17:28 Processed 125/322 | dt=2.1s


23:17:28 Processed 150/322 | dt=2.6s


23:17:29 Processed 175/322 | dt=2.9s


23:17:29 Processed 200/322 | dt=3.3s


23:17:30 Processed 225/322 | dt=3.8s


23:17:30 Processed 250/322 | dt=4.1s


23:17:30 Processed 275/322 | dt=4.6s


23:17:31 Processed 300/322 | dt=4.8s


23:17:31 Embeddings computed: shape=(322, 2048) | total dt=5.4s


23:17:31 Saved panns_cnn14_emb_v2.npz with shapes tr=(258, 2048) te=(64, 2048)


In [21]:
# Use regenerated embeddings (panns_cnn14_emb_v2.npz) with explicit ids for LOSO kNN and submission
import numpy as np, pandas as pd, time, sys, gc
from pathlib import Path
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import roc_auc_score

def macro_auc_np(P, Y):
    C = Y.shape[1]; aucs = []
    for c in range(C):
        yt = Y[:,c]; yp = P[:,c]
        if yt.sum()==0 or yt.sum()==len(yt):
            continue
        try: aucs.append(roc_auc_score(yt, yp))
        except: pass
    return float(np.mean(aucs)) if aucs else np.nan

def l2_normalize_rows(A):
    A = A.astype(np.float32, copy=False)
    n = np.linalg.norm(A, axis=1, keepdims=True) + 1e-12
    return A / n

DATA_DIR = Path('essential_data')
def load_species_list(path: Path):
    df = pd.read_csv(path)
    if {'class_id','code'}.issubset(df.columns):
        return df.sort_values('class_id')['code'].tolist()
    with open(path, 'r') as f:
        f.readline()
        out = []
        for line in f:
            s = line.strip()
            if not s: continue
            parts = s.split(',')
            out.append(parts[1] if len(parts)>1 else s)
    return out
def parse_rec_id2filename(path: Path):
    df = pd.read_csv(path)
    df = df.rename(columns={df.columns[0]:'rec_id', df.columns[1]:'filename'})
    df['rec_id'] = df['rec_id'].astype(int)
    df['station'] = df['filename'].str.extract(r'^(PC\d+)')
    return df[['rec_id','filename','station']]
def parse_labels(path: Path, C: int):
    rec_ids, flags, Y = [], [], []
    with open(path, 'r') as f:
        f.readline()
        for line in f:
            line = line.strip()
            if not line: continue
            parts = [tok.strip() for tok in line.split(',')]
            try: rid = int(parts[0])
            except: continue
            tokens = parts[1:] if len(parts)>1 else []
            is_test = any(tok=='?' for tok in tokens)
            y = np.zeros(C, dtype=int)
            if not is_test and tokens:
                for tok in tokens:
                    if tok in ('','?'): continue
                    try: idx = int(tok)
                    except: continue
                    if 0 <= idx < C: y[idx] = 1
            rec_ids.append(rid); flags.append(is_test); Y.append(y)
    lab_cols = [f'label_{i}' for i in range(C)]
    ydf = pd.DataFrame(np.vstack(Y), columns=lab_cols)
    df = pd.DataFrame({'rec_id': rec_ids, 'is_test': flags})
    return df.join(ydf), lab_cols

species = load_species_list(DATA_DIR/'species_list.txt')
assert len(species)==19, 'species != 19'
rec_map = parse_rec_id2filename(DATA_DIR/'rec_id2filename.txt')
labels_df, _ = parse_labels(DATA_DIR/'rec_labels_test_hidden.txt', len(species))
df_all = rec_map.merge(labels_df, on='rec_id', how='right')
train_df = df_all[~df_all['is_test']].copy()
test_df = df_all[df_all['is_test']].copy()
y_train = train_df[[c for c in train_df.columns if c.startswith('label_')]].copy()
y_train.columns = [f'label_{s}' for s in species]
groups = train_df['station'].values
meta_train = train_df[['rec_id','filename','station']].copy()
meta_test = test_df[['rec_id','filename','station']].copy()
print(f'Ntr={len(train_df)} Nte={len(test_df)} C={y_train.shape[1]} stations={len(pd.unique(groups))}')
sys.stdout.flush()

# Load regenerated embeddings with explicit ids
npz = np.load('panns_cnn14_emb_v2.npz')
Xtr_raw = npz['X_train']; Xte_raw = npz['X_test']
ids_tr = npz['train_ids'].astype(int); ids_te = npz['test_ids'].astype(int)

# Align to meta order
mp_tr = {int(r): i for i, r in enumerate(ids_tr.tolist())}
mp_te = {int(r): i for i, r in enumerate(ids_te.tolist())}
idx_tr = [mp_tr[int(r)] for r in meta_train['rec_id'].values.tolist()]
idx_te = [mp_te[int(r)] for r in meta_test['rec_id'].values.tolist()]
Xtr = l2_normalize_rows(Xtr_raw[idx_tr]); Xte = l2_normalize_rows(Xte_raw[idx_te])
assert Xtr.shape==(258,2048) and Xte.shape==(64,2048), f'Shapes mismatch: {Xtr.shape}, {Xte.shape}'
assert np.allclose(np.linalg.norm(Xtr[:5], axis=1), 1.0, atol=1e-5)
print('Embeddings aligned:', Xtr.shape, Xte.shape); sys.stdout.flush()

def knn_predict_uniform(X_tr, Y_tr, X_va, k=11, metric='cosine'):
    nn = NearestNeighbors(n_neighbors=min(k, len(X_tr)), metric=metric)
    nn.fit(X_tr)
    _, idxs = nn.kneighbors(X_va, return_distance=True)
    C = Y_tr.shape[1]
    P = np.zeros((len(X_va), C), dtype=np.float32)
    for i in range(len(X_va)):
        nbr = idxs[i]
        P[i] = Y_tr[nbr].mean(axis=0)
    return np.clip(P, 0, 1)

logo = LeaveOneGroupOut()
idx = np.arange(len(groups))
Y = y_train.values.astype(np.uint8)
P_oof = np.zeros_like(Y, dtype=np.float32)
t0 = time.time()
for f, (tr, va) in enumerate(logo.split(idx, groups=groups), 1):
    print(f'[LOSO v2] fold {f:02d} tr={len(tr)} va={len(va)} | elapsed={time.time()-t0:.1f}s'); sys.stdout.flush()
    P_oof[va] = knn_predict_uniform(Xtr[tr], Y[tr], Xtr[va], k=11, metric='cosine')
auc = macro_auc_np(P_oof, Y)
print('Pooled macro AUC (v2, k=11, cosine, uniform):', f'{auc:.4f}')

# Full-train for test
P_test = knn_predict_uniform(Xtr, Y, Xte, k=11, metric='cosine')
rows = []
rec_ids = meta_test['rec_id'].values.tolist()
C = Y.shape[1]
for i, rid in enumerate(rec_ids):
    for c in range(C):
        rows.append((rid*100 + c, float(P_test[i, c])))
sub = pd.DataFrame(rows, columns=['Id','Probability']).sort_values('Id').reset_index(drop=True)
sub.to_csv('submission_knn_v2.csv', index=False)
sub.to_csv('submission.csv', index=False)
print('Saved submission_knn_v2.csv and overwrote submission.csv | rows=', len(sub))
print(sub.head(3))
gc.collect();

Ntr=258 Nte=64 C=19 stations=13


Embeddings aligned: (258, 2048) (64, 2048)


[LOSO v2] fold 01 tr=231 va=27 | elapsed=0.0s


[LOSO v2] fold 02 tr=234 va=24 | elapsed=0.0s


[LOSO v2] fold 03 tr=232 va=26 | elapsed=0.0s


[LOSO v2] fold 04 tr=244 va=14 | elapsed=0.0s


[LOSO v2] fold 05 tr=233 va=25 | elapsed=0.0s


[LOSO v2] fold 06 tr=233 va=25 | elapsed=0.0s


[LOSO v2] fold 07 tr=236 va=22 | elapsed=0.0s


[LOSO v2] fold 08 tr=247 va=11 | elapsed=0.0s


[LOSO v2] fold 09 tr=243 va=15 | elapsed=0.0s


[LOSO v2] fold 10 tr=243 va=15 | elapsed=0.0s


[LOSO v2] fold 11 tr=238 va=20 | elapsed=0.0s


[LOSO v2] fold 12 tr=234 va=24 | elapsed=0.1s


[LOSO v2] fold 13 tr=248 va=10 | elapsed=0.1s


Pooled macro AUC (v2, k=11, cosine, uniform): 0.6266
Saved submission_knn_v2.csv and overwrote submission.csv | rows= 1216
    Id  Probability
0  100     0.000000
1  101     0.181818
2  102     0.000000


In [22]:
# LOSO Logistic Regression on regenerated PANNs embeddings (Standardized per fold)
import numpy as np, pandas as pd, time, sys, gc
from pathlib import Path
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

def macro_auc_np(P, Y):
    C = Y.shape[1]; aucs = []
    for c in range(C):
        yt = Y[:,c]; yp = P[:,c]
        if yt.sum()==0 or yt.sum()==len(yt):
            continue
        try: aucs.append(roc_auc_score(yt, yp))
        except: pass
    return float(np.mean(aucs)) if aucs else np.nan

DATA_DIR = Path('essential_data')
def load_species_list(path: Path):
    df = pd.read_csv(path)
    if {'class_id','code'}.issubset(df.columns):
        return df.sort_values('class_id')['code'].tolist()
    with open(path, 'r') as f:
        f.readline()
        out = []
        for line in f:
            s = line.strip()
            if not s: continue
            parts = s.split(',')
            out.append(parts[1] if len(parts)>1 else s)
    return out
def parse_rec_id2filename(path: Path):
    df = pd.read_csv(path)
    df = df.rename(columns={df.columns[0]:'rec_id', df.columns[1]:'filename'})
    df['rec_id'] = df['rec_id'].astype(int)
    df['station'] = df['filename'].str.extract(r'^(PC\d+)')
    return df[['rec_id','filename','station']]
def parse_labels(path: Path, C: int):
    rec_ids, flags, Y = [], [], []
    with open(path, 'r') as f:
        f.readline()
        for line in f:
            line = line.strip()
            if not line: continue
            parts = [tok.strip() for tok in line.split(',')]
            try: rid = int(parts[0])
            except: continue
            tokens = parts[1:] if len(parts)>1 else []
            is_test = any(tok=='?' for tok in tokens)
            y = np.zeros(C, dtype=int)
            if not is_test and tokens:
                for tok in tokens:
                    if tok in ('','?'): continue
                    try: idx = int(tok)
                    except: continue
                    if 0 <= idx < C: y[idx] = 1
            rec_ids.append(rid); flags.append(is_test); Y.append(y)
    lab_cols = [f'label_{i}' for i in range(C)]
    ydf = pd.DataFrame(np.vstack(Y), columns=lab_cols)
    df = pd.DataFrame({'rec_id': rec_ids, 'is_test': flags})
    return df.join(ydf), lab_cols

species = load_species_list(DATA_DIR/'species_list.txt')
assert len(species)==19, 'species != 19'
rec_map = parse_rec_id2filename(DATA_DIR/'rec_id2filename.txt')
labels_df, _ = parse_labels(DATA_DIR/'rec_labels_test_hidden.txt', len(species))
df_all = rec_map.merge(labels_df, on='rec_id', how='right')
train_df = df_all[~df_all['is_test']].copy()
test_df = df_all[df_all['is_test']].copy()
y_train = train_df[[c for c in train_df.columns if c.startswith('label_')]].copy()
y_train.columns = [f'label_{s}' for s in species]
groups = train_df['station'].values
meta_train = train_df[['rec_id','filename','station']].copy()
meta_test = test_df[['rec_id','filename','station']].copy()
print(f'Ntr={len(train_df)} Nte={len(test_df)} C={y_train.shape[1]} stations={len(pd.unique(groups))}')
sys.stdout.flush()

# Load regenerated embeddings with explicit ids and align to meta order
npz = np.load('panns_cnn14_emb_v2.npz')
Xtr_raw = npz['X_train']; Xte_raw = npz['X_test']
ids_tr = npz['train_ids'].astype(int); ids_te = npz['test_ids'].astype(int)
mp_tr = {int(r): i for i, r in enumerate(ids_tr.tolist())}
mp_te = {int(r): i for i, r in enumerate(ids_te.tolist())}
idx_tr = [mp_tr[int(r)] for r in meta_train['rec_id'].values.tolist()]
idx_te = [mp_te[int(r)] for r in meta_test['rec_id'].values.tolist()]
Xtr = Xtr_raw[idx_tr].astype(np.float32); Xte = Xte_raw[idx_te].astype(np.float32)
assert Xtr.shape==(258,2048) and Xte.shape==(64,2048)

# LOSO with per-fold StandardScaler and OvR LogisticRegression
logo = LeaveOneGroupOut()
idx = np.arange(len(groups))
Y = y_train.values.astype(np.uint8)
C = Y.shape[1]
P_oof = np.zeros((len(Y), C), dtype=np.float32)
t0 = time.time()
for f, (tr, va) in enumerate(logo.split(idx, groups=groups), 1):
    print(f'[LR] fold {f:02d} tr={len(tr)} va={len(va)} | elapsed={time.time()-t0:.1f}s'); sys.stdout.flush()
    ss = StandardScaler(with_mean=True, with_std=True)
    Xtr_s = ss.fit_transform(Xtr[tr])
    Xva_s = ss.transform(Xtr[va])
    for c in range(C):
        ytr_c = Y[tr, c]
        if ytr_c.min() == ytr_c.max():
            P_oof[va, c] = Y[:, c].mean()
            continue
        lr = LogisticRegression(C=1.0, penalty='l2', solver='lbfgs', class_weight='balanced', max_iter=2000, n_jobs=-1, random_state=42)
        lr.fit(Xtr_s, ytr_c)
        P_oof[va, c] = lr.predict_proba(Xva_s)[:, 1]
auc = macro_auc_np(P_oof, Y)
print('Pooled macro AUC (LR on PANNs):', f'{auc:.4f}')

# Full train for test predictions
ss_full = StandardScaler(with_mean=True, with_std=True)
Xtr_full_s = ss_full.fit_transform(Xtr)
Xte_full_s = ss_full.transform(Xte)
P_test = np.zeros((len(Xte_full_s), C), dtype=np.float32)
for c in range(C):
    y_c = Y[:, c]
    if y_c.min() == y_c.max():
        P_test[:, c] = Y[:, c].mean()
        continue
    lr = LogisticRegression(C=1.0, penalty='l2', solver='lbfgs', class_weight='balanced', max_iter=2000, n_jobs=-1, random_state=42)
    lr.fit(Xtr_full_s, y_c)
    P_test[:, c] = lr.predict_proba(Xte_full_s)[:, 1]

# Build submission
rows = []
rec_ids = meta_test['rec_id'].values.tolist()
for i, rid in enumerate(rec_ids):
    for c in range(C):
        rows.append((rid*100 + c, float(P_test[i, c])))
sub = pd.DataFrame(rows, columns=['Id','Probability']).sort_values('Id').reset_index(drop=True)
sub.to_csv('submission_lr_panns.csv', index=False)
sub.to_csv('submission.csv', index=False)
print('Saved submission_lr_panns.csv and overwrote submission.csv | rows=', len(sub))
print(sub.head(3))
gc.collect();

Ntr=258 Nte=64 C=19 stations=13


[LR] fold 01 tr=231 va=27 | elapsed=0.0s


  return _ForkingPickler.loads(res)


  return _ForkingPickler.loads(res)


  return _ForkingPickler.loads(res)


  return _ForkingPickler.loads(res)


  return _ForkingPickler.loads(res)


  return _ForkingPickler.loads(res)


  return _ForkingPickler.loads(res)


  return _ForkingPickler.loads(res)


  return _ForkingPickler.loads(res)


  return _ForkingPickler.loads(res)


  return _ForkingPickler.loads(res)


  return _ForkingPickler.loads(res)


  return _ForkingPickler.loads(res)


  return _ForkingPickler.loads(res)


  return _ForkingPickler.loads(res)


  return _ForkingPickler.loads(res)


  return _ForkingPickler.loads(res)


  return _ForkingPickler.loads(res)


[LR] fold 02 tr=234 va=24 | elapsed=54.4s


  return _ForkingPickler.loads(res)


  return _ForkingPickler.loads(res)


  return _ForkingPickler.loads(res)


  return _ForkingPickler.loads(res)


  return _ForkingPickler.loads(res)


  return _ForkingPickler.loads(res)


  return _ForkingPickler.loads(res)


  return _ForkingPickler.loads(res)


  return _ForkingPickler.loads(res)


  return _ForkingPickler.loads(res)


  return _ForkingPickler.loads(res)


  return _ForkingPickler.loads(res)


  return _ForkingPickler.loads(res)


  return _ForkingPickler.loads(res)


  return _ForkingPickler.loads(res)


  return _ForkingPickler.loads(res)


  return _ForkingPickler.loads(res)


  return _ForkingPickler.loads(res)


[LR] fold 03 tr=232 va=26 | elapsed=104.9s


[LR] fold 04 tr=244 va=14 | elapsed=122.9s


[LR] fold 05 tr=233 va=25 | elapsed=143.1s


[LR] fold 06 tr=233 va=25 | elapsed=163.3s


[LR] fold 07 tr=236 va=22 | elapsed=183.4s


[LR] fold 08 tr=247 va=11 | elapsed=203.5s


[LR] fold 09 tr=243 va=15 | elapsed=223.6s


[LR] fold 10 tr=243 va=15 | elapsed=243.8s


[LR] fold 11 tr=238 va=20 | elapsed=263.9s


[LR] fold 12 tr=234 va=24 | elapsed=284.0s


[LR] fold 13 tr=248 va=10 | elapsed=304.1s


Pooled macro AUC (LR on PANNs): 0.6780


Saved submission_lr_panns.csv and overwrote submission.csv | rows= 1216
    Id  Probability
0  100     0.017779
1  101     0.004862
2  102     0.001935
