# Plan to Medal: ICML 2013 Whale Challenge (Redux)

Goal: Produce a high AUC-ROC submission quickly, iterate to medal.

Phases:

1) Setup & Data Discovery
- Unzip train2.zip/test2.zip
- Inspect contents and determine training/label files and test format
- Identify metric (AUC) and submission format (sampleSubmission.csv)

2) Baseline Modeling
- If tabular: LightGBM/XGBoost baseline with KFold CV
- If audio/images: check if features are provided; otherwise use available features (MFCCs/specs) if included; if raw media only, create a simple feature-extractor baseline
- Log CV AUC; generate test predictions

3) Iteration
- Feature engineering (scaling, class weights, aggregates)
- Model tuning (learning rate, depths, regularization)
- Ensembling if multiple feature views exist

4) Submission
- Save submission.csv matching sample format
- Iterate until CV strong; submit when confident

Checkpoints for Expert Review:
- After this plan
- After data loading/discovery
- After first baseline CV
- If training is slow or CV poor

In [1]:
import os, zipfile, time, glob, hashlib
import pandas as pd

start = time.time()
print("[INFO] CWD:", os.getcwd())
print("[INFO] Listing initial files:", os.listdir())

def unzip_if_needed(zip_path, extract_dir):
    if not os.path.exists(zip_path):
        print(f"[WARN] Zip not found: {zip_path}")
        return None
    os.makedirs(extract_dir, exist_ok=True)
    # Heuristic: if directory empty or no marker file, unzip
    need_unzip = True
    if os.listdir(extract_dir):
        need_unzip = False
    if need_unzip:
        print(f"[INFO] Unzipping {zip_path} -> {extract_dir} ...")
        with zipfile.ZipFile(zip_path, 'r') as zf:
            zf.extractall(extract_dir)
        print(f"[INFO] Unzip done. {len(os.listdir(extract_dir))} top-level items.")
    else:
        print(f"[INFO] Already unzipped: {extract_dir} contains {len(os.listdir(extract_dir))} items.")
    return extract_dir

train_dir = unzip_if_needed('train2.zip', 'train2')
test_dir = unzip_if_needed('test2.zip', 'test2')

def list_dir(path, max_items=20):
    if not path or not os.path.exists(path):
        print(f"[WARN] Path not found: {path}")
        return
    items = os.listdir(path)
    print(f"[INFO] Contents of {path} (showing up to {max_items}/{len(items)}):")
    for it in items[:max_items]:
        p = os.path.join(path, it)
        print("  -", it, "[DIR]" if os.path.isdir(p) else "[FILE]")

list_dir(train_dir if train_dir else '')
list_dir(test_dir if test_dir else '')

# Find CSVs
csv_paths = []
for base in ['', train_dir or '', test_dir or '']:
    if base and os.path.exists(base):
        csv_paths.extend(glob.glob(os.path.join(base, '**', '*.csv'), recursive=True))
    else:
        csv_paths.extend(glob.glob('*.csv'))
csv_paths = sorted(list(set(csv_paths)))
print(f"[INFO] Found {len(csv_paths)} CSV files:")
for p in csv_paths[:20]:
    print("  ", p)

# Peek sampleSubmission
if os.path.exists('sampleSubmission.csv'):
    ss = pd.read_csv('sampleSubmission.csv')
    print("[INFO] sampleSubmission.csv shape:", ss.shape)
    print("[INFO] sampleSubmission columns:", ss.columns.tolist())
    print(ss.head(3))
else:
    print("[WARN] sampleSubmission.csv not found")

# Try to detect likely train/test CSVs
train_candidates = [p for p in csv_paths if 'train' in os.path.basename(p).lower()]
test_candidates = [p for p in csv_paths if 'test' in os.path.basename(p).lower()]
label_candidates = [p for p in csv_paths if any(k in os.path.basename(p).lower() for k in ['label', 'y', 'target'])]
print("[INFO] Train CSV candidates:", train_candidates)
print("[INFO] Test CSV candidates:", test_candidates)
print("[INFO] Label CSV candidates:", label_candidates)

def safe_read_head(path, n=5):
    try:
        df = pd.read_csv(path, nrows=n)
        print(f"[INFO] Head of {path}:")
        print(df.head(n))
    except Exception as e:
        print(f"[ERROR] Failed reading {path}: {e}")

for p in train_candidates[:3] + test_candidates[:3] + label_candidates[:3]:
    safe_read_head(p, 5)

print(f"[INFO] Elapsed: {time.time()-start:.2f}s")

[INFO] CWD: /app/agent_run_states/the-icml-2013-whale-challenge-right-whale-redux-spray-20250909-040125
[INFO] Listing initial files: ['test2.zip', 'train2.zip', 'main.ipynb', 'agent_metadata', 'requirements.txt', 'task.txt', 'sampleSubmission.csv', 'docker_run.log', 'description.md']
[INFO] Unzipping train2.zip -> train2 ...


[INFO] Unzip done. 1 top-level items.
[INFO] Unzipping test2.zip -> test2 ...


[INFO] Unzip done. 1 top-level items.
[INFO] Contents of train2 (showing up to 20/1):
  - train2 [DIR]
[INFO] Contents of test2 (showing up to 20/1):
  - test2 [DIR]
[INFO] Found 1 CSV files:
   sampleSubmission.csv
[INFO] sampleSubmission.csv shape: (25149, 2)
[INFO] sampleSubmission columns: ['clip', 'probability']
                                clip  probability
0  20090330_000000_015s2ms_Test0.aif            0
1  20090330_000000_020s0ms_Test1.aif            0
2  20090330_000000_025s8ms_Test2.aif            0
[INFO] Train CSV candidates: []
[INFO] Test CSV candidates: []
[INFO] Label CSV candidates: []
[INFO] Elapsed: 7.46s


In [2]:
import os, glob, re, pandas as pd

train_glob = sorted(glob.glob(os.path.join('train2','train2','*.aif')))
test_glob = sorted(glob.glob(os.path.join('test2','test2','*.aif')))
print(f"[INFO] Train files: {len(train_glob)} | Test files: {len(test_glob)}")
print("[INFO] Example train files:")
for p in train_glob[:5]:
    print("  ", os.path.basename(p))
print("[INFO] Example test files:")
for p in test_glob[:5]:
    print("  ", os.path.basename(p))

# Parse labels and groups from filenames
def parse_label(fname):
    # Expect pattern ..._TRAINX_LABEL.aif, where LABEL in {0,1}
    base = os.path.splitext(os.path.basename(fname))[0]
    parts = base.split('_')
    try:
        lbl = int(parts[-1])
    except:
        lbl = None
    return lbl

def parse_group(fname):
    base = os.path.splitext(os.path.basename(fname))[0]
    parts = base.split('_')
    # Group by date_time (first two parts), safer against leakage
    if len(parts) >= 2:
        return '_'.join(parts[:2])
    return base

train_df = pd.DataFrame({
    'path': train_glob,
    'clip': [os.path.basename(p) for p in train_glob],
    'label': [parse_label(p) for p in train_glob],
    'group': [parse_group(p) for p in train_glob],
})
print(train_df.head())
label_counts = train_df['label'].value_counts(dropna=False)
print("[INFO] Label distribution:\n", label_counts)
print("[INFO] Num groups:", train_df['group'].nunique())

# Ensure sampleSubmission alignment for test ids
ss = pd.read_csv('sampleSubmission.csv')
print('[INFO] sampleSubmission first rows:')
print(ss.head())
test_ids_from_ss = ss['clip'].tolist()
missing = [t for t in test_ids_from_ss if t not in set(os.path.basename(p) for p in test_glob)]
print(f"[INFO] Missing test files referenced by sampleSubmission: {len(missing)} (should be 0)")

In [None]:
import os, time, math, json, numpy as np, pandas as pd
from multiprocessing import cpu_count

# Install deps if missing
def ensure_pkg(pkg):
    try:
        __import__(pkg)
        return True
    except Exception:
        import sys, subprocess
        print(f"[INFO] Installing {pkg} ...", flush=True)
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--quiet', pkg])
        return True

ensure_pkg('librosa')
ensure_pkg('soundfile')

import librosa, soundfile as sf

# Rebuild train_df/test_glob if not present (for idempotency)
import glob
if 'train_df' not in globals():
    train_glob = sorted(glob.glob(os.path.join('train2','train2','*.aif')))
    def parse_label(fname):
        base = os.path.splitext(os.path.basename(fname))[0]
        parts = base.split('_')
        try: return int(parts[-1])
        except: return None
    def parse_group(fname):
        base = os.path.splitext(os.path.basename(fname))[0]
        parts = base.split('_')
        return '_'.join(parts[:2]) if len(parts) >= 2 else base
    train_df = pd.DataFrame({
        'path': train_glob,
        'clip': [os.path.basename(p) for p in train_glob],
        'label': [parse_label(p) for p in train_glob],
        'group': [parse_group(p) for p in train_glob],
    })
test_glob = sorted(glob.glob(os.path.join('test2','test2','*.aif')))

SR = 4000
N_MELS = 64
N_FFT = 1024
HOP = 256
FMIN = 20.0
FMAX = 800.0

def extract_features_one(path):
    try:
        y, sr = librosa.load(path, sr=SR, mono=True, res_type='kaiser_fast')
        if len(y) == 0:
            return None
        # Mel-spectrogram
        S = librosa.feature.melspectrogram(y=y, sr=SR, n_mels=N_MELS, n_fft=N_FFT, hop_length=HOP, fmin=FMIN, fmax=FMAX, power=2.0)
        S_db = librosa.power_to_db(S + 1e-12)
        # Per-mel stats
        mel_mean = S_db.mean(axis=1)  # (N_MELS,)
        mel_std = S_db.std(axis=1)
        # Additional spectral features over time
        sc = librosa.feature.spectral_centroid(y=y, sr=SR, n_fft=N_FFT, hop_length=HOP)  # (1,T)
        sbw = librosa.feature.spectral_bandwidth(y=y, sr=SR, n_fft=N_FFT, hop_length=HOP)
        srf = librosa.feature.spectral_rolloff(y=y, sr=SR, n_fft=N_FFT, hop_length=HOP)
        flat = librosa.feature.spectral_flatness(y=y, n_fft=N_FFT, hop_length=HOP)
        rms = librosa.feature.rms(y=y, frame_length=N_FFT, hop_length=HOP)
        zcr = librosa.feature.zero_crossing_rate(y=y, frame_length=N_FFT, hop_length=HOP)
        def tstats(a):
            a = np.nan_to_num(a, nan=0.0, posinf=0.0, neginf=0.0)
            return [a.mean(), a.std(), np.percentile(a, 25), np.percentile(a, 50), np.percentile(a, 75)]
        feats = {
            **{f'mean_mel_{i}': mel_mean[i] for i in range(N_MELS)},
            **{f'std_mel_{i}': mel_std[i] for i in range(N_MELS)},
            'sc_mean': tstats(sc)[0], 'sc_std': tstats(sc)[1], 'sc_p25': tstats(sc)[2], 'sc_p50': tstats(sc)[3], 'sc_p75': tstats(sc)[4],
            'sbw_mean': tstats(sbw)[0], 'sbw_std': tstats(sbw)[1], 'sbw_p25': tstats(sbw)[2], 'sbw_p50': tstats(sbw)[3], 'sbw_p75': tstats(sbw)[4],
            'srf_mean': tstats(srf)[0], 'srf_std': tstats(srf)[1], 'srf_p25': tstats(srf)[2], 'srf_p50': tstats(srf)[3], 'srf_p75': tstats(srf)[4],
            'flat_mean': tstats(flat)[0], 'flat_std': tstats(flat)[1], 'flat_p25': tstats(flat)[2], 'flat_p50': tstats(flat)[3], 'flat_p75': tstats(flat)[4],
            'rms_mean': tstats(rms)[0], 'rms_std': tstats(rms)[1], 'rms_p25': tstats(rms)[2], 'rms_p50': tstats(rms)[3], 'rms_p75': tstats(rms)[4],
            'zcr_mean': tstats(zcr)[0], 'zcr_std': tstats(zcr)[1], 'zcr_p25': tstats(zcr)[2], 'zcr_p50': tstats(zcr)[3], 'zcr_p75': tstats(zcr)[4],
            'duration_sec': len(y) / SR,
        }
        return feats
    except Exception as e:
        return None

def build_feature_df(paths, ids):
    from joblib import Parallel, delayed
    n_jobs = max(1, min(cpu_count() - 2, 16))
    print(f"[INFO] Extracting features for {len(paths)} files with n_jobs={n_jobs} ...", flush=True)
    t0 = time.time()
    feats_list = Parallel(n_jobs=n_jobs, backend='loky', verbose=5)(delayed(extract_features_one)(p) for p in paths)
    elapsed = time.time() - t0
    print(f"[INFO] Feature extraction done in {elapsed/60:.2f} min", flush=True)
    rows = []
    for clip, fdict in zip(ids, feats_list):
        if fdict is None:
            fdict = {}
        fdict['clip'] = clip
        rows.append(fdict)
    df = pd.DataFrame(rows)
    return df

feat_train_path = 'features_train.parquet'
feat_test_path = 'features_test.parquet'

if os.path.exists(feat_train_path) and os.path.exists(feat_test_path):
    print('[INFO] Using cached feature files')
    feat_train = pd.read_parquet(feat_train_path)
    feat_test = pd.read_parquet(feat_test_path)
else:
    feat_train = build_feature_df(train_df['path'].tolist(), train_df['clip'].tolist())
    feat_test = build_feature_df(test_glob, [os.path.basename(p) for p in test_glob])
    # Align columns
    common_cols = sorted(set(feat_train.columns).intersection(set(feat_test.columns)))
    feat_train = feat_train[common_cols]
    feat_test = feat_test[common_cols]
    feat_train.to_parquet(feat_train_path, index=False)
    feat_test.to_parquet(feat_test_path, index=False)
    print('[INFO] Saved features to parquet')

print('[INFO] Feature shapes:', feat_train.shape, feat_test.shape)
print('[INFO] Preview train features:')
print(feat_train.head(3))
print('[INFO] Missing values per column (top 10):')
na_counts = feat_train.isna().sum().sort_values(ascending=False)
print(na_counts.head(10))

In [3]:
import os, hashlib, time, glob, pandas as pd
from joblib import Parallel, delayed
from multiprocessing import cpu_count

# Build exact-duplicate clusters via MD5 of raw file bytes (fast, no decoding).
def file_md5(path, chunk_size=1024*1024):
    h = hashlib.md5()
    with open(path, 'rb') as f:
        while True:
            b = f.read(chunk_size)
            if not b: break
            h.update(b)
    return h.hexdigest()

# Ensure train_df and test_glob exist
if 'train_df' not in globals():
    train_glob = sorted(glob.glob(os.path.join('train2','train2','*.aif')))
    def parse_label(fname):
        base = os.path.splitext(os.path.basename(fname))[0]
        parts = base.split('_')
        try: return int(parts[-1])
        except: return None
    def parse_group(fname):
        base = os.path.splitext(os.path.basename(fname))[0]
        parts = base.split('_')
        return '_'.join(parts[:2]) if len(parts) >= 2 else base
    train_df = pd.DataFrame({'path': train_glob, 'clip': [os.path.basename(p) for p in train_glob], 'label': [parse_label(p) for p in train_glob], 'group': [parse_group(p) for p in train_glob]})
test_glob = sorted(glob.glob(os.path.join('test2','test2','*.aif')))

n_jobs = max(1, min(cpu_count()-2, 16))
t0 = time.time()
print(f"[INFO] Computing MD5 hashes for {len(train_df)} train files with n_jobs={n_jobs} ...", flush=True)
train_hashes = Parallel(n_jobs=n_jobs, backend='loky', verbose=5)(delayed(file_md5)(p) for p in train_df['path'].tolist())
train_df['dup_cluster'] = train_hashes
dup_counts = train_df.groupby('dup_cluster').size().sort_values(ascending=False)
num_clusters = dup_counts.shape[0]
multi_clusters = (dup_counts > 1).sum()
print(f"[INFO] Train duplicate clusters: total={num_clusters}, multi-file clusters={multi_clusters}")
print("[INFO] Top 5 duplicate cluster sizes:\n", dup_counts.head(5))
print(f"[INFO] Hashing time: {time.time()-t0:.1f}s")

# Build a small balanced subset for pipeline sanity check
pos_df = train_df[train_df['label'] == 1]
neg_df = train_df[train_df['label'] == 0]
n_pos = min(150, len(pos_df))
n_neg = min(350, len(neg_df))
subset_df = pd.concat([pos_df.sample(n=n_pos, random_state=42), neg_df.sample(n=n_neg, random_state=42)], axis=0).sample(frac=1.0, random_state=42).reset_index(drop=True)
print(f"[INFO] Subset built: {subset_df.shape[0]} clips ({n_pos} pos, {n_neg} neg), dup clusters={subset_df['dup_cluster'].nunique()}")
subset_df.head(3)

[INFO] Computing MD5 hashes for 22692 train files with n_jobs=16 ...


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.


[Parallel(n_jobs=16)]: Done  40 tasks      | elapsed:    0.3s
[Parallel(n_jobs=16)]: Done 384 tasks      | elapsed:    0.4s
[Parallel(n_jobs=16)]: Done 6128 tasks      | elapsed:    0.5s


[INFO] Train duplicate clusters: total=22692, multi-file clusters=0
[INFO] Top 5 duplicate cluster sizes:
 dup_cluster
00023fd1300698de031d04a424e6ca4f    1
ab9b6ceadcb0684ebaea7e3c2ed86631    1
ab6805c2ca11dc7a97631a3ddec12053    1
ab64a1d92fd04cbb255aff93d8a67bd3    1
ab61e2273bf0afd7be6d6300e4b44d85    1
dtype: int64
[INFO] Hashing time: 0.6s
[INFO] Subset built: 500 clips (150 pos, 350 neg), dup clusters=500


[Parallel(n_jobs=16)]: Done 20814 tasks      | elapsed:    0.6s
[Parallel(n_jobs=16)]: Done 22692 out of 22692 | elapsed:    0.6s finished


Unnamed: 0,path,clip,label,group,dup_cluster
0,train2/train2/20090328_221500_80427s3ms_TRAIN7...,20090328_221500_80427s3ms_TRAIN7279_0.aif,0,20090328_221500,d883140ae552e275e7dac61a02b5a573
1,train2/train2/20090328_234500_86170s9ms_TRAIN8...,20090328_234500_86170s9ms_TRAIN8366_1.aif,1,20090328_234500,e334c2c1064c31eb01e4a49e2ad2548c
2,train2/train2/20090329_120000_43303s6ms_TRAIN1...,20090329_120000_43303s6ms_TRAIN14589_0.aif,0,20090329_120000,898d96e693645a6f2497d0a74a841709


In [18]:
# Perceptual hash grouping to prevent leakage (content-based groups on subset_df)
import os, time, numpy as np, pandas as pd

def ensure_pkg(pkg):
    try:
        __import__(pkg)
        return True
    except Exception:
        import sys, subprocess
        print(f"[INFO] Installing {pkg} ...", flush=True)
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--quiet', pkg])
        return True

ensure_pkg('audioread'); ensure_pkg('librosa'); ensure_pkg('Pillow'); ensure_pkg('imagehash')
import audioread, librosa
from PIL import Image
import imagehash
from joblib import Parallel, delayed
from multiprocessing import cpu_count

if 'subset_df' not in globals():
    raise RuntimeError('subset_df not found; build it before running perceptual hashing')

PHASH_MELS = 32
PHASH_FMAX = 800.0
PHASH_NFFT = 512
PHASH_HOP = 256
HAMMING_THRESH = 6  # cluster if distance <= 6

def load_audio_bytes_audioread(path):
    with audioread.audio_open(path) as f:
        sr = int(f.samplerate)
        ch = int(f.channels)
        buf = bytearray()
        for block in f:
            buf.extend(block)
    pcm = np.frombuffer(buf, dtype=np.int16)
    if pcm.size == 0:
        return np.zeros(1, dtype=np.float32), sr
    if ch > 1:
        try:
            pcm = pcm.reshape((-1, ch)).mean(axis=1)
        except Exception:
            pcm = pcm.astype(np.float32)
    y = (pcm.astype(np.float32) / 32768.0)
    return y, sr

def phash_one(path, clip):
    try:
        y, sr = load_audio_bytes_audioread(path)
        if y is None or y.size == 0:
            return {'clip': clip, 'phash': '0'*16}
        ny = max(100.0, 0.5 * sr - 1.0)
        fmax_eff = float(min(PHASH_FMAX, ny))
        fmin_eff = 20.0
        S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=PHASH_MELS, n_fft=PHASH_NFFT, hop_length=PHASH_HOP, fmin=fmin_eff, fmax=fmax_eff, power=2.0)
        S_db = librosa.power_to_db(S + 1e-12)
        m, M = np.nanmin(S_db), np.nanmax(S_db)
        if not np.isfinite(m) or not np.isfinite(M) or M - m < 1e-6:
            arr = np.zeros_like(S_db, dtype=np.uint8)
        else:
            arr = ((S_db - m) / (M - m) * 255.0).clip(0, 255).astype(np.uint8)
        img = Image.fromarray(arr)
        img = img.resize((64, 32))  # (time, mel)
        h = imagehash.phash(img, hash_size=8)  # 64-bit
        return {'clip': clip, 'phash': str(h)}
    except Exception:
        return {'clip': clip, 'phash': '0'*16}

paths = subset_df['path'].tolist()
clips = subset_df['clip'].tolist()
n_jobs = max(1, min(cpu_count()-2, 16))
print(f"[INFO] Computing perceptual hashes for subset ({len(paths)} files) with n_jobs={n_jobs} ...", flush=True)
t0 = time.time()
rows = Parallel(n_jobs=n_jobs, backend='threading', verbose=5)(delayed(phash_one)(p, c) for p, c in zip(paths, clips))
print(f"[INFO] phash done in {(time.time()-t0)/60:.2f} min", flush=True)
phash_df = pd.DataFrame(rows)

# Ensure clean merge (avoid suffixing if phash already present from previous runs)
for col in ['phash', 'phash_x', 'phash_y', 'phash_new']:
    if col in subset_df.columns:
        subset_df = subset_df.drop(columns=[col])

subset_df = subset_df.merge(phash_df, on='clip', how='left')
if 'phash' not in subset_df.columns:
    # Handle unexpected suffixing just in case
    if 'phash_new' in subset_df.columns:
        subset_df['phash'] = subset_df['phash_new']
        subset_df = subset_df.drop(columns=['phash_new'])
    elif 'phash_x' in subset_df.columns or 'phash_y' in subset_df.columns:
        subset_df['phash'] = subset_df.get('phash_x', subset_df.get('phash_y'))
        subset_df = subset_df.drop(columns=[c for c in ['phash_x','phash_y'] if c in subset_df.columns])

# Build clusters by Hamming distance <= threshold using union-find
phashes = subset_df['phash'].astype(str).tolist()
n = len(phashes)
hvals = [int(h, 16) if all(c in '0123456789abcdef' for c in h.lower()) else 0 for h in phashes]

parent = list(range(n))
rank = [0]*n
def find(x):
    while parent[x] != x:
        parent[x] = parent[parent[x]]
        x = parent[x]
    return x
def union(x, y):
    rx, ry = find(x), find(y)
    if rx == ry: return
    if rank[rx] < rank[ry]:
        parent[rx] = ry
    elif rank[rx] > rank[ry]:
        parent[ry] = rx
    else:
        parent[ry] = rx
        rank[rx] += 1

# Bucket by short prefix to reduce pair checks
buckets = {}
for i, h in enumerate(phashes):
    key = h[:4]
    buckets.setdefault(key, []).append(i)

checks = 0
for key, idxs in buckets.items():
    m = len(idxs)
    for a in range(m):
        ia = idxs[a]
        va = hvals[ia]
        for b in range(a+1, m):
            ib = idxs[b]
            vb = hvals[ib]
            d = (va ^ vb).bit_count()  # Hamming distance
            checks += 1
            if d <= HAMMING_THRESH:
                union(ia, ib)
print(f"[INFO] Hamming checks within buckets: {checks}")

roots = [find(i) for i in range(n)]
root_map = {}
gid = 0
group_ids = []
for r in roots:
    if r not in root_map:
        root_map[r] = gid
        gid += 1
    group_ids.append(root_map[r])

# Overwrite dup_cluster with content-based cluster id
subset_df['dup_cluster'] = [f'phc_{g}' for g in group_ids]
print('[INFO] Subset content-based unique groups:', subset_df['dup_cluster'].nunique(), 'out of', n)
subset_df.head(3)

In [19]:
# Fallback grouping: use filename date_time prefix groups to avoid leakage while phash clustering is slow
if 'subset_df' not in globals():
    raise RuntimeError('subset_df not available')
subset_df['dup_cluster'] = subset_df['group'].astype(str)
print('[INFO] Fallback grouping applied. Unique groups:', subset_df['dup_cluster'].nunique())
subset_df[['clip','dup_cluster']].head(3)

[INFO] Fallback grouping applied. Unique groups: 170


Unnamed: 0,clip,dup_cluster
0,20090328_221500_80427s3ms_TRAIN7279_0.aif,20090328_221500
1,20090328_234500_86170s9ms_TRAIN8366_1.aif,20090328_234500
2,20090329_120000_43303s6ms_TRAIN14589_0.aif,20090329_120000


In [20]:
import os, time, numpy as np, pandas as pd
from multiprocessing import cpu_count
from joblib import Parallel, delayed
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

# Ensure deps
def ensure_pkg(pkg):
    try:
        __import__(pkg)
        return True
    except Exception:
        import sys, subprocess
        print(f"[INFO] Installing {pkg} ...", flush=True)
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--quiet', pkg])
        return True

ensure_pkg('librosa'); ensure_pkg('soundfile'); ensure_pkg('lightgbm'); ensure_pkg('audioread'); ensure_pkg('resampy')
import librosa, soundfile as sf, lightgbm as lgb, audioread, resampy

# Use subset_df built earlier (500 clips). If missing, fallback to small slice of train_df.
if 'subset_df' not in globals():
    print('[WARN] subset_df not found; creating a small fallback subset of 500 clips')
    pos_df = train_df[train_df['label'] == 1].sample(n=min(150, (train_df['label']==1).sum()), random_state=42)
    neg_df = train_df[train_df['label'] == 0].sample(n=min(350, (train_df['label']==0).sum()), random_state=42)
    subset_df = pd.concat([pos_df, neg_df], axis=0).sample(frac=1.0, random_state=42).reset_index(drop=True)

# Default params (sr-specific values will adapt to actual sr returned by loader)
SR = 4000
N_MELS = 128
N_FFT = 1024
HOP = 256
FMIN = 20.0
FMAX = 1000.0

def tstats(a):
    a = np.nan_to_num(a, nan=0.0, posinf=0.0, neginf=0.0)
    return [a.mean(), a.std(), np.median(a), a.min(), a.max()]

def load_audio_robust(path):
    # Decode using audioread and DO NOT resample (avoid resampy dependency); return native sr
    with audioread.audio_open(path) as f:
        sr = int(f.samplerate)
        ch = int(f.channels)
        buf = bytearray()
        for block in f:
            buf.extend(block)
    if len(buf) == 0:
        raise RuntimeError('Empty decode buffer')
    pcm = np.frombuffer(buf, dtype=np.int16)
    if pcm.size == 0:
        raise RuntimeError('Decoded PCM empty')
    if ch > 1:
        try:
            pcm = pcm.reshape((-1, ch)).mean(axis=1)
        except Exception:
            pcm = pcm.astype(np.float32)
    y = (pcm.astype(np.float32) / 32768.0)
    return y, sr

_ERR_COUNT = 0

def extract_features_one_enh(path, clip_id):
    global _ERR_COUNT
    try:
        y, sr = load_audio_robust(path)
        if y is None or y.size == 0:
            raise RuntimeError('Empty audio after decoding')
        # Ensure frequency bounds within Nyquist
        ny = max(100.0, 0.5 * sr - 1.0)
        fmax_eff = float(min(FMAX, ny))
        fmin_eff = float(min(FMIN, max(10.0, fmax_eff * 0.25)))
        # Mel-spectrogram and log power
        S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=N_MELS, n_fft=N_FFT, hop_length=HOP, fmin=fmin_eff, fmax=fmax_eff, power=2.0)
        S_db = librosa.power_to_db(S + 1e-12)
        # deltas
        d1 = librosa.feature.delta(S_db, order=1)
        d2 = librosa.feature.delta(S_db, order=2)
        # mfccs
        mfcc = librosa.feature.mfcc(S=S, sr=sr, n_mfcc=20)
        # spectral features
        sc = librosa.feature.spectral_centroid(y=y, sr=sr, n_fft=N_FFT, hop_length=HOP)
        sbw = librosa.feature.spectral_bandwidth(y=y, sr=sr, n_fft=N_FFT, hop_length=HOP)
        srf = librosa.feature.spectral_rolloff(y=y, sr=sr, n_fft=N_FFT, hop_length=HOP, roll_percent=0.85)
        flat = librosa.feature.spectral_flatness(y=y, n_fft=N_FFT, hop_length=HOP)
        rms = librosa.feature.rms(y=y, frame_length=N_FFT, hop_length=HOP)
        zcr = librosa.feature.zero_crossing_rate(y=y, frame_length=N_FFT, hop_length=HOP)
        # band energy ratios: low vs mid
        mels_low = S_db[:int(N_MELS*0.25), :]
        mels_mid = S_db[int(N_MELS*0.25):int(N_MELS*0.5), :]
        low_energy = np.mean(mels_low)
        mid_energy = np.mean(mels_mid)
        ratio_low_mid = low_energy / (np.abs(mid_energy) + 1e-6)
        feats = {'clip': clip_id, 'duration_sec': y.size / float(sr), 'low_mid_ratio': ratio_low_mid}
        # mel stats
        mel_mean = S_db.mean(axis=1); mel_std = S_db.std(axis=1); mel_med = np.median(S_db, axis=1)
        for i in range(N_MELS):
            feats[f'mel_mean_{i}'] = mel_mean[i]
            feats[f'mel_std_{i}'] = mel_std[i]
            feats[f'mel_med_{i}'] = mel_med[i]
        # deltas aggregated
        d1m = d1.mean(axis=1); d1s = d1.std(axis=1)
        d2m = d2.mean(axis=1); d2s = d2.std(axis=1)
        for i in range(N_MELS):
            feats[f'd1_mean_{i}'] = d1m[i]
            feats[f'd1_std_{i}'] = d1s[i]
            feats[f'd2_mean_{i}'] = d2m[i]
            feats[f'd2_std_{i}'] = d2s[i]
        # mfcc stats
        for i in range(mfcc.shape[0]):
            mstats = tstats(mfcc[i])
            feats[f'mfcc{i}_mean'] = mstats[0]
            feats[f'mfcc{i}_std'] = mstats[1]
            feats[f'mfcc{i}_med'] = mstats[2]
            feats[f'mfcc{i}_min'] = mstats[3]
            feats[f'mfcc{i}_max'] = mstats[4]
        # spectral stats
        for name, arr in [('sc', sc), ('sbw', sbw), ('srf', srf), ('flat', flat), ('rms', rms), ('zcr', zcr)]:
            st = tstats(arr)
            feats[f'{name}_mean'] = st[0]; feats[f'{name}_std'] = st[1]
            feats[f'{name}_med'] = st[2]; feats[f'{name}_min'] = st[3]; feats[f'{name}_max'] = st[4]
        return feats
    except Exception as e:
        if _ERR_COUNT < 5:
            print(f"[WARN] Feature extraction failed for {clip_id}: {repr(e)}", flush=True)
            _ERR_COUNT += 1
        return {'clip': clip_id}

def build_feature_df(paths, ids, n_jobs=None):
    # Use sequential extraction to ensure decoding works reliably
    print(f"[INFO] Extracting enhanced features for {len(paths)} files sequentially...", flush=True)
    t0 = time.time()
    feats_list = [extract_features_one_enh(p, i) for p, i in zip(paths, ids)]
    print(f"[INFO] Feature extraction done in {(time.time()-t0)/60:.2f} min", flush=True)
    return pd.DataFrame(feats_list)

# Build features for subset (force rebuild to fix earlier empty features)
sub_paths = subset_df['path'].tolist()
sub_ids = subset_df['clip'].tolist()
feat_subset_path = 'features_subset.parquet'
FORCE_REBUILD = True
if os.path.exists(feat_subset_path) and FORCE_REBUILD:
    try:
        os.remove(feat_subset_path)
        print('[INFO] Removed cached subset features to rebuild')
    except Exception:
        pass
if os.path.exists(feat_subset_path) and not FORCE_REBUILD:
    feat_sub = pd.read_parquet(feat_subset_path)
    print('[INFO] Loaded cached subset features:', feat_sub.shape)
else:
    # Increase subset size for more reliable CV
    k = min(500, len(sub_paths))
    feat_sub = build_feature_df(sub_paths[:k], sub_ids[:k])
    # Merge with full subset_df to align; missing rows kept for now
    feat_sub.to_parquet(feat_subset_path, index=False)
    print('[INFO] Saved subset features:', feat_sub.shape)

# Merge labels and groups (only for extracted subset rows)
work_df = subset_df[['clip','label','dup_cluster']].merge(feat_sub, on='clip', how='inner')
y = work_df['label'].astype(int).values
groups = work_df['dup_cluster'].values
feature_cols = [c for c in work_df.columns if c not in ['clip','label','dup_cluster']]
X = work_df[feature_cols].fillna(0.0).astype(np.float32).values
print('[INFO] X shape:', X.shape, 'Num features:', len(feature_cols))
print('[INFO] Unique groups in work_df:', len(pd.unique(groups)))

# Only proceed with CV if we have features
if X.shape[1] > 0 and len(np.unique(y)) > 1:
    try:
        from sklearn.model_selection import StratifiedGroupKFold
        sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
        splits = sgkf.split(X, y, groups)
        print('[INFO] Using StratifiedGroupKFold(5)')
    except Exception:
        from sklearn.model_selection import GroupKFold
        gkf = GroupKFold(n_splits=5)
        splits = gkf.split(X, y, groups)
        print('[WARN] StratifiedGroupKFold not available; using GroupKFold(5)')

    pos = (y==1).sum(); neg = (y==0).sum()
    scale_pos_weight = float(neg) / max(1.0, float(pos))
    print(f"[INFO] Class balance: pos={pos}, neg={neg}, scale_pos_weight={scale_pos_weight:.2f}")

    oof = np.zeros(len(y), dtype=np.float32)
    fold = 0
    t_start = time.time()
    for tr_idx, va_idx in splits:
        fold += 1
        print(f"[INFO] Fold {fold} | train={len(tr_idx)} valid={len(va_idx)}", flush=True)
        X_tr, X_va = X[tr_idx], X[va_idx]
        y_tr, y_va = y[tr_idx], y[va_idx]
        model = lgb.LGBMClassifier(
            objective='binary',
            learning_rate=0.03,
            n_estimators=10000,
            num_leaves=63,
            subsample=0.8,
            colsample_bytree=0.7,
            reg_lambda=1.5,
            random_state=42,
            n_jobs=min(8, cpu_count()-2),
            scale_pos_weight=scale_pos_weight
        )
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_va, y_va)],
            eval_metric='auc',
            callbacks=[lgb.early_stopping(200, verbose=True), lgb.log_evaluation(200)]
        )
        p = model.predict_proba(X_va)[:,1]
        oof[va_idx] = p
        auc = roc_auc_score(y_va, p)
        print(f"[INFO] Fold {fold} AUC: {auc:.5f} | elapsed {(time.time()-t_start):.1f}s", flush=True)

    oof_auc = roc_auc_score(y, oof)
    print(f"[RESULT] Subset OOF AUC: {oof_auc:.5f}")
else:
    print('[WARN] No usable features extracted yet; inspect extraction pipeline.')

[INFO] Removed cached subset features to rebuild
[INFO] Extracting enhanced features for 500 files sequentially...


[WARN] Feature extraction failed for 20090329_080000_29089s8ms_TRAIN12960_0.aif: ParameterError("when mode='interp', width=9 cannot exceed data.shape[axis]=8")


[WARN] Feature extraction failed for 20090328_113000_41875s5ms_TRAIN3840_0.aif: ParameterError("when mode='interp', width=9 cannot exceed data.shape[axis]=8")


[WARN] Feature extraction failed for 20090328_221500_80192s3ms_TRAIN7239_0.aif: ParameterError("when mode='interp', width=9 cannot exceed data.shape[axis]=7")


[INFO] Feature extraction done in 0.11 min


[INFO] Saved subset features: (500, 1029)
[INFO] X shape: (500, 1028) Num features: 1028
[INFO] Unique groups in work_df: 170
[INFO] Using StratifiedGroupKFold(5)
[INFO] Class balance: pos=150, neg=350, scale_pos_weight=2.33
[INFO] Fold 1 | train=388 valid=112


[LightGBM] [Info] Number of positive: 114, number of negative: 274
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.018048 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 132269
[LightGBM] [Info] Number of data points in the train set: 388, number of used features: 1028
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.293814 -> initscore=-0.876930
[LightGBM] [Info] Start training from score -0.876930
Training until validation scores don't improve for 200 rounds








[200]	valid_0's auc: 0.943348	valid_0's binary_logloss: 0.288395






[400]	valid_0's auc: 0.94883	valid_0's binary_logloss: 0.272943






[600]	valid_0's auc: 0.955775	valid_0's binary_logloss: 0.268681




[800]	valid_0's auc: 0.958333	valid_0's binary_logloss: 0.267875




Early stopping, best iteration is:
[721]	valid_0's auc: 0.957602	valid_0's binary_logloss: 0.266924
[INFO] Fold 1 AUC: 0.95760 | elapsed 1.6s


[INFO] Fold 2 | train=400 valid=100


[LightGBM] [Info] Number of positive: 112, number of negative: 288


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.018099 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 136234
[LightGBM] [Info] Number of data points in the train set: 400, number of used features: 1028
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.280000 -> initscore=-0.944462
[LightGBM] [Info] Start training from score -0.944462
Training until validation scores don't improve for 200 rounds












[200]	valid_0's auc: 0.844228	valid_0's binary_logloss: 0.518612






Early stopping, best iteration is:
[109]	valid_0's auc: 0.840832	valid_0's binary_logloss: 0.484952
[INFO] Fold 2 AUC: 0.84083 | elapsed 2.1s


[INFO] Fold 3 | train=393 valid=107


[LightGBM] [Info] Number of positive: 114, number of negative: 279
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014829 seconds.
You can set `force_col_wise=true` to remove the overhead.


[LightGBM] [Info] Total Bins 134288
[LightGBM] [Info] Number of data points in the train set: 393, number of used features: 1028
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.290076 -> initscore=-0.895013
[LightGBM] [Info] Start training from score -0.895013
Training until validation scores don't improve for 200 rounds
















[200]	valid_0's auc: 0.966745	valid_0's binary_logloss: 0.246849
















[400]	valid_0's auc: 0.962833	valid_0's binary_logloss: 0.241362
Early stopping, best iteration is:
[237]	valid_0's auc: 0.968701	valid_0's binary_logloss: 0.240434


[INFO] Fold 3 AUC: 0.96870 | elapsed 2.9s


[INFO] Fold 4 | train=403 valid=97


[LightGBM] [Info] Number of positive: 127, number of negative: 276


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017591 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 137276
[LightGBM] [Info] Number of data points in the train set: 403, number of used features: 1028


[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.315136 -> initscore=-0.776214
[LightGBM] [Info] Start training from score -0.776214
Training until validation scores don't improve for 200 rounds




















[200]	valid_0's auc: 0.891304	valid_0's binary_logloss: 0.334936






















[400]	valid_0's auc: 0.907756	valid_0's binary_logloss: 0.339189


Early stopping, best iteration is:
[243]	valid_0's auc: 0.898942	valid_0's binary_logloss: 0.330158
[INFO] Fold 4 AUC: 0.89894 | elapsed 3.8s


[INFO] Fold 5 | train=416 valid=84


[LightGBM] [Info] Number of positive: 133, number of negative: 283


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.018025 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 141416


[LightGBM] [Info] Number of data points in the train set: 416, number of used features: 1028
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.319712 -> initscore=-0.755098
[LightGBM] [Info] Start training from score -0.755098
Training until validation scores don't improve for 200 rounds


















[200]	valid_0's auc: 0.854258	valid_0's binary_logloss: 0.418095
















Early stopping, best iteration is:
[147]	valid_0's auc: 0.856014	valid_0's binary_logloss: 0.405433


[INFO] Fold 5 AUC: 0.85601 | elapsed 4.4s


[RESULT] Subset OOF AUC: 0.91131


In [12]:
# Debug single-file feature extraction to find why features are empty
import numpy as np, time
dbg_path = subset_df['path'].iloc[0]
dbg_clip = subset_df['clip'].iloc[0]
print('[DEBUG] File:', dbg_clip, 'Path:', dbg_path)
t0 = time.time()
try:
    y, sr = load_audio_robust(dbg_path)
    print('[DEBUG] Loaded audio sr:', sr, 'len:', None if y is None else y.size, 'min/max:', None if y is None else (float(np.nanmin(y)), float(np.nanmax(y))))
    assert y is not None and y.size > 0, 'Empty audio'
    S = librosa.feature.melspectrogram(y=y, sr=SR, n_mels=N_MELS, n_fft=N_FFT, hop_length=HOP, fmin=FMIN, fmax=FMAX, power=2.0)
    print('[DEBUG] Mel shape:', S.shape, 'min/max:', float(np.min(S)), float(np.max(S)))
    S_db = librosa.power_to_db(S + 1e-12)
    print('[DEBUG] Mel dB shape:', S_db.shape, 'min/max:', float(np.min(S_db)), float(np.max(S_db)))
    d1 = librosa.feature.delta(S_db, order=1); d2 = librosa.feature.delta(S_db, order=2)
    print('[DEBUG] Delta shapes:', d1.shape, d2.shape)
    mfcc = librosa.feature.mfcc(S=S, sr=SR, n_mfcc=20)
    print('[DEBUG] MFCC shape:', mfcc.shape, 'min/max:', float(np.min(mfcc)), float(np.max(mfcc)))
    sc = librosa.feature.spectral_centroid(y=y, sr=SR, n_fft=N_FFT, hop_length=HOP)
    print('[DEBUG] spectral_centroid shape:', sc.shape)
    print('[DEBUG] Single file extraction OK in %.2fs' % (time.time()-t0))
except Exception as e:
    import traceback
    print('[ERROR] Debug extraction failed:', repr(e))
    traceback.print_exc()

[DEBUG] File: 20090328_221500_80427s3ms_TRAIN7279_0.aif Path: train2/train2/20090328_221500_80427s3ms_TRAIN7279_0.aif
[DEBUG] Loaded audio sr: None len: None min/max: None
[ERROR] Debug extraction failed: AssertionError('Empty audio')


Traceback (most recent call last):
  File "/tmp/ipykernel_61/1080803614.py", line 10, in <module>
    assert y is not None and y.size > 0, 'Empty audio'
AssertionError: Empty audio


In [8]:
import subprocess, sys, os, numpy as np
print('[INFO] Installing system ffmpeg (for AIFF decoding via audioread/ffmpeg)...', flush=True)
subprocess.check_call(['bash', '-lc', 'apt-get update -y && apt-get install -y ffmpeg >/dev/null 2>&1 || true'])
print('[INFO] ffmpeg install attempted. Verifying...', flush=True)
subprocess.run(['bash', '-lc', 'ffmpeg -version | head -n 1'], check=False)

print('[INFO] Testing audioread decoding on one file...', flush=True)
import audioread
test_path = subset_df['path'].iloc[0]
try:
    with audioread.audio_open(test_path) as f:
        sr = f.samplerate
        ch = f.channels
        buf = bytearray()
        for block in f:
            buf.extend(block)
        pcm = np.frombuffer(buf, dtype=np.int16)
        if ch > 1:
            pcm = pcm.reshape((-1, ch)).mean(axis=1)
        y = (pcm.astype(np.float32) / 32768.0)
        print('[INFO] audioread loaded:', test_path, 'sr=', sr, 'len=', y.size, 'min/max=', float(y.min()) if y.size else None, float(y.max()) if y.size else None)
except Exception as e:
    import traceback
    print('[ERROR] audioread failed:', repr(e))
    traceback.print_exc()

[INFO] Installing system ffmpeg (for AIFF decoding via audioread/ffmpeg)...


Reading package lists...
[INFO] ffmpeg install attempted. Verifying...


[INFO] Testing audioread decoding on one file...


[INFO] audioread loaded: train2/train2/20090328_221500_80427s3ms_TRAIN7279_0.aif sr= 2000 len= 4000 min/max= -0.0546875 0.02978515625


E: List directory /var/lib/apt/lists/partial is missing. - Acquire (30: Read-only file system)
bash: line 1: ffmpeg: command not found


In [9]:
# Sanity-check extraction function and small sequential build
dbg_path = subset_df['path'].iloc[0]
dbg_clip = subset_df['clip'].iloc[0]
print('[CHECK] Running extract_features_one_enh on single file...')
fe = extract_features_one_enh(dbg_path, dbg_clip)
print('[CHECK] keys count:', len(fe), 'keys sample:', list(fe.keys())[:10])

print('[CHECK] Building features for 10 files sequentially...')
mini_paths = subset_df['path'].iloc[:10].tolist()
mini_ids = subset_df['clip'].iloc[:10].tolist()
mini_feats = []
for p,i in zip(mini_paths, mini_ids):
    mini_feats.append(extract_features_one_enh(p, i))
mini_df = pd.DataFrame(mini_feats)
print('[CHECK] mini_df shape:', mini_df.shape)
print(mini_df.head(2).T.head(20))

[CHECK] Running extract_features_one_enh on single file...
[CHECK] keys count: 1 keys sample: ['clip']
[CHECK] Building features for 10 files sequentially...
[CHECK] mini_df shape: (10, 1)
                                              0  \
clip  20090328_221500_80427s3ms_TRAIN7279_0.aif   

                                              1  
clip  20090328_234500_86170s9ms_TRAIN8366_1.aif  


In [None]:
# Build content-based groups (perceptual hash clusters) for FULL train_df
import os, time, numpy as np, pandas as pd

def ensure_pkg(pkg):
    try:
        __import__(pkg)
        return True
    except Exception:
        import sys, subprocess
        print(f"[INFO] Installing {pkg} ...", flush=True)
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--quiet', pkg])
        return True

ensure_pkg('audioread'); ensure_pkg('librosa'); ensure_pkg('Pillow'); ensure_pkg('imagehash')
import audioread, librosa
from PIL import Image
import imagehash
from joblib import Parallel, delayed
from multiprocessing import cpu_count

assert 'train_df' in globals(), 'train_df must be built before running full phash grouping'

PHASH_MELS = 32
PHASH_FMAX = 800.0
PHASH_NFFT = 512
PHASH_HOP = 256
HAMMING_THRESH = 6  # cluster if distance <= 6

def load_audio_bytes_audioread(path):
    with audioread.audio_open(path) as f:
        sr = int(f.samplerate)
        ch = int(f.channels)
        buf = bytearray()
        for block in f:
            buf.extend(block)
    pcm = np.frombuffer(buf, dtype=np.int16)
    if pcm.size == 0:
        return np.zeros(1, dtype=np.float32), sr
    if ch > 1:
        try:
            pcm = pcm.reshape((-1, ch)).mean(axis=1)
        except Exception:
            pcm = pcm.astype(np.float32)
    y = (pcm.astype(np.float32) / 32768.0)
    return y, sr

def phash_one(path, clip):
    try:
        y, sr = load_audio_bytes_audioread(path)
        if y is None or y.size == 0:
            return {'clip': clip, 'phash': '0'*16}
        ny = max(100.0, 0.5 * sr - 1.0)
        fmax_eff = float(min(PHASH_FMAX, ny))
        fmin_eff = 20.0
        S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=PHASH_MELS, n_fft=PHASH_NFFT, hop_length=PHASH_HOP, fmin=fmin_eff, fmax=fmax_eff, power=2.0)
        S_db = librosa.power_to_db(S + 1e-12)
        m, M = np.nanmin(S_db), np.nanmax(S_db)
        if not np.isfinite(m) or not np.isfinite(M) or M - m < 1e-6:
            arr = np.zeros_like(S_db, dtype=np.uint8)
        else:
            arr = ((S_db - m) / (M - m) * 255.0).clip(0, 255).astype(np.uint8)
        img = Image.fromarray(arr)
        img = img.resize((64, 32))
        h = imagehash.phash(img, hash_size=8)
        return {'clip': clip, 'phash': str(h)}
    except Exception:
        return {'clip': clip, 'phash': '0'*16}

# Skip if cached mapping exists
cache_csv = 'train_phash_groups.csv'
if os.path.exists(cache_csv):
    print('[INFO] Loading cached phash groups from', cache_csv)
    phg = pd.read_csv(cache_csv)
    train_df = train_df.merge(phg, on='clip', how='left')
    train_df['dup_cluster'] = train_df['dup_cluster_phc'].fillna(train_df.get('dup_cluster', pd.Series(index=train_df.index))).astype(str)
    print('[INFO] Loaded groups. Unique groups:', train_df['dup_cluster'].nunique())
else:
    paths = train_df['path'].tolist()
    clips = train_df['clip'].tolist()
    n_jobs = max(1, min(cpu_count()-2, 16))
    print(f"[INFO] Computing phash for FULL train ({len(paths)} files) with n_jobs={n_jobs} ...", flush=True)
    t0 = time.time()
    rows = Parallel(n_jobs=n_jobs, backend='threading', verbose=10)(delayed(phash_one)(p, c) for p, c in zip(paths, clips))
    print(f"[INFO] phash done in {(time.time()-t0)/60:.2f} min", flush=True)
    phash_df = pd.DataFrame(rows)
    df = train_df[['clip']].merge(phash_df, on='clip', how='left')
    phashes = df['phash'].astype(str).tolist()
    n = len(phashes)
    hvals = [int(h, 16) if all(c in '0123456789abcdef' for c in h.lower()) else 0 for h in phashes]
    parent = list(range(n)); rank = [0]*n
    def find(x):
        while parent[x] != x:
            parent[x] = parent[parent[x]]
            x = parent[x]
        return x
    def union(x, y):
        rx, ry = find(x), find(y)
        if rx == ry: return
        if rank[rx] < rank[ry]: parent[rx] = ry
        elif rank[rx] > rank[ry]: parent[ry] = rx
        else: parent[ry] = rx; rank[rx] += 1
    # Bucket by 4-hex prefix
    buckets = {}
    for i, h in enumerate(phashes):
        key = h[:4]
        buckets.setdefault(key, []).append(i)
    checks = 0
    for key, idxs in buckets.items():
        m = len(idxs)
        for a in range(m):
            ia = idxs[a]; va = hvals[ia]
            for b in range(a+1, m):
                ib = idxs[b]; vb = hvals[ib]
                d = (va ^ vb).bit_count()
                checks += 1
                if d <= HAMMING_THRESH:
                    union(ia, ib)
    print(f"[INFO] Hamming checks within buckets: {checks}")
    roots = [find(i) for i in range(n)]
    root_map = {}; gid = 0; group_ids = []
    for r in roots:
        if r not in root_map: root_map[r] = gid; gid += 1
        group_ids.append(root_map[r])
    train_df['dup_cluster_phc'] = [f'phc_{g}' for g in group_ids]
    train_df['dup_cluster'] = train_df['dup_cluster_phc']
    print('[INFO] Content-based unique groups:', train_df['dup_cluster'].nunique(), 'out of', len(train_df))
    train_df[['clip','dup_cluster_phc']].to_csv(cache_csv, index=False)
    print('[INFO] Saved phash groups to', cache_csv)

print('[INFO] Grouping ready. Example:')
print(train_df[['clip','dup_cluster']].head())

In [None]:
# Lean full-pipeline: extract features (audioread), grouped CV, train on full, predict test, write submission.csv
import os, time, numpy as np, pandas as pd
from multiprocessing import cpu_count
from joblib import Parallel, delayed
from sklearn.metrics import roc_auc_score

def ensure_pkg(pkg):
    try: __import__(pkg); return True
    except Exception:
        import sys, subprocess
        print(f"[INFO] Installing {pkg} ...", flush=True)
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--quiet', pkg])
        return True

ensure_pkg('librosa'); ensure_pkg('audioread'); ensure_pkg('lightgbm')
import librosa, audioread, lightgbm as lgb

assert 'train_df' in globals(), 'train_df must exist (from Cell 2)'
import glob
test_glob = sorted(glob.glob(os.path.join('test2','test2','*.aif')))
test_df = pd.DataFrame({'path': test_glob, 'clip': [os.path.basename(p) for p in test_glob]})

# Use safer fallback grouping by filename date_time prefix
if 'group' not in train_df.columns:
    def parse_group(fname):
        base = os.path.splitext(os.path.basename(fname))[0]
        parts = base.split('_')
        return '_'.join(parts[:2]) if len(parts)>=2 else base
    train_df['group'] = train_df['path'].apply(parse_group)

# Feature params (lean)
N_MELS = 64
N_FFT = 512
HOP = 256
FMIN = 20.0
FMAX = 800.0

def load_audio_robust(path):
    with audioread.audio_open(path) as f:
        sr = int(f.samplerate); ch = int(f.channels); buf = bytearray()
        for block in f: buf.extend(block)
    pcm = np.frombuffer(buf, dtype=np.int16)
    if pcm.size == 0: return np.zeros(1, dtype=np.float32), sr
    if ch > 1:
        try: pcm = pcm.reshape((-1, ch)).mean(axis=1)
        except Exception: pcm = pcm.astype(np.float32)
    y = (pcm.astype(np.float32) / 32768.0)
    return y, sr

def tstats(a):
    a = np.nan_to_num(a, nan=0.0, posinf=0.0, neginf=0.0)
    return [a.mean(), a.std(), np.min(a), np.max(a)]

def extract_lean(path, clip):
    try:
        y, sr = load_audio_robust(path)
        if y is None or y.size == 0:
            return {'clip': clip}
        ny = max(100.0, 0.5 * sr - 1.0)
        fmax_eff = float(min(FMAX, ny))
        fmin_eff = float(min(FMIN, max(10.0, fmax_eff * 0.2)))
        S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=N_MELS, n_fft=N_FFT, hop_length=HOP, fmin=fmin_eff, fmax=fmax_eff, power=2.0)
        S_db = librosa.power_to_db(S + 1e-12)
        mel_mean = S_db.mean(axis=1).astype(np.float32)
        mel_std = S_db.std(axis=1).astype(np.float32)
        sc = librosa.feature.spectral_centroid(y=y, sr=sr, n_fft=N_FFT, hop_length=HOP)
        sbw = librosa.feature.spectral_bandwidth(y=y, sr=sr, n_fft=N_FFT, hop_length=HOP)
        srf = librosa.feature.spectral_rolloff(y=y, sr=sr, n_fft=N_FFT, hop_length=HOP, roll_percent=0.85)
        flat = librosa.feature.spectral_flatness(y=y, n_fft=N_FFT, hop_length=HOP)
        rms = librosa.feature.rms(y=y, frame_length=N_FFT, hop_length=HOP)
        zcr = librosa.feature.zero_crossing_rate(y=y, frame_length=N_FFT, hop_length=HOP)
        feats = {'clip': clip, 'duration_sec': float(y.size)/float(sr)}
        for i in range(N_MELS):
            feats[f'mel_mean_{i}'] = float(mel_mean[i])
            feats[f'mel_std_{i}'] = float(mel_std[i])
        for name, arr in [('sc', sc), ('sbw', sbw), ('srf', srf), ('flat', flat), ('rms', rms), ('zcr', zcr)]:
            s = tstats(arr)
            feats[f'{name}_mean'] = float(s[0]); feats[f'{name}_std'] = float(s[1])
            feats[f'{name}_min'] = float(s[2]); feats[f'{name}_max'] = float(s[3])
        return feats
    except Exception:
        return {'clip': clip}

def build_feats(paths, clips, out_path):
    n_jobs = max(1, min(cpu_count()-2, 12))
    print(f"[INFO] Extracting lean features for {len(paths)} files with n_jobs={n_jobs} ...", flush=True)
    t0 = time.time()
    rows = Parallel(n_jobs=n_jobs, backend='threading', verbose=10)(delayed(extract_lean)(p, c) for p, c in zip(paths, clips))
    print(f"[INFO] Extraction elapsed: {(time.time()-t0)/60:.2f} min", flush=True)
    df = pd.DataFrame(rows)
    df.to_parquet(out_path, index=False)
    print('[INFO] Saved', out_path, df.shape)
    return df

feat_train_path = 'features_train_lean.parquet'
feat_test_path = 'features_test_lean.parquet'

if os.path.exists(feat_train_path):
    feat_train = pd.read_parquet(feat_train_path)
    print('[INFO] Loaded cached train features', feat_train.shape)
else:
    feat_train = build_feats(train_df['path'].tolist(), train_df['clip'].tolist(), feat_train_path)

if os.path.exists(feat_test_path):
    feat_test = pd.read_parquet(feat_test_path)
    print('[INFO] Loaded cached test features', feat_test.shape)
else:
    feat_test = build_feats(test_df['path'].tolist(), test_df['clip'].tolist(), feat_test_path)

# Align columns between train and test
common_cols = sorted(set(feat_train.columns).intersection(set(feat_test.columns)))
common_cols = [c for c in common_cols if c != 'clip']
feat_train = feat_train[['clip'] + common_cols].copy()
feat_test = feat_test[['clip'] + common_cols].copy()
print('[INFO] Aligned feature shapes:', feat_train.shape, feat_test.shape)

# Merge labels and groups
trainX = train_df[['clip','label','group']].merge(feat_train, on='clip', how='inner')
X = trainX[common_cols].fillna(0.0).astype(np.float32).values
y = trainX['label'].astype(int).values
groups = trainX['group'].values
print('[INFO] X:', X.shape, 'y positives:', int((y==1).sum()), 'groups:', len(np.unique(groups)))

# Grouped CV with LightGBM (robust params)
from sklearn.model_selection import StratifiedGroupKFold
sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
pos = (y==1).sum(); neg = (y==0).sum()
scale_pos_weight = float(neg)/max(1.0, float(pos))
oof = np.zeros(len(y), dtype=np.float32)
fold = 0
t0 = time.time()
for tr_idx, va_idx in sgkf.split(X, y, groups):
    fold += 1
    print(f"[INFO] Fold {fold} | train={len(tr_idx)} valid={len(va_idx)}", flush=True)
    X_tr, X_va = X[tr_idx], X[va_idx]
    y_tr, y_va = y[tr_idx], y[va_idx]
    model = lgb.LGBMClassifier(
        objective='binary',
        boosting_type='gbdt',
        learning_rate=0.03,
        n_estimators=4000,
        num_leaves=31,
        min_child_samples=20,
        feature_fraction=0.6,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=2.0,
        reg_alpha=0.1,
        n_jobs=min(12, cpu_count()-2),
        random_state=42,
        scale_pos_weight=scale_pos_weight
    )
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)],
        eval_metric='auc',
        callbacks=[lgb.early_stopping(200, verbose=True), lgb.log_evaluation(200)]
    )
    p = model.predict_proba(X_va)[:,1]
    oof[va_idx] = p
    auc = roc_auc_score(y_va, p)
    print(f"[INFO] Fold {fold} AUC: {auc:.5f} | elapsed {(time.time()-t0):.1f}s", flush=True)

oof_auc = roc_auc_score(y, oof)
print(f"[RESULT] Full-train grouped OOF AUC (lean feats): {oof_auc:.5f}")

# Train final model on full data
final_model = lgb.LGBMClassifier(
    objective='binary',
    boosting_type='gbdt',
    learning_rate=0.03,
    n_estimators= int(1.2*4000),
    num_leaves=31,
    min_child_samples=20,
    feature_fraction=0.6,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=2.0,
    reg_alpha=0.1,
    n_jobs=min(12, cpu_count()-2),
    random_state=42,
    scale_pos_weight=scale_pos_weight
)
final_model.fit(X, y)

# Predict test and save submission
Xt = feat_test[common_cols].fillna(0.0).astype(np.float32).values
ptest = final_model.predict_proba(Xt)[:,1]
sub = pd.DataFrame({'clip': feat_test['clip'], 'probability': ptest})
sub = sub.sort_values('clip')
sub.to_csv('submission.csv', index=False)
print('[INFO] Wrote submission.csv', sub.shape)

[INFO] Extracting lean features for 22692 files with n_jobs=12 ...


[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    0.2s


[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:    0.3s
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.4s


[Parallel(n_jobs=12)]: Done  37 tasks      | elapsed:    0.6s
[Parallel(n_jobs=12)]: Done  48 tasks      | elapsed:    0.7s


[Parallel(n_jobs=12)]: Done  61 tasks      | elapsed:    0.9s
[Parallel(n_jobs=12)]: Done  74 tasks      | elapsed:    1.0s


[Parallel(n_jobs=12)]: Done  89 tasks      | elapsed:    1.2s
[Parallel(n_jobs=12)]: Done 104 tasks      | elapsed:    1.4s


[Parallel(n_jobs=12)]: Done 121 tasks      | elapsed:    1.7s


[Parallel(n_jobs=12)]: Done 138 tasks      | elapsed:    1.9s


[Parallel(n_jobs=12)]: Done 157 tasks      | elapsed:    2.1s


[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    2.4s


[Parallel(n_jobs=12)]: Done 197 tasks      | elapsed:    2.6s


[Parallel(n_jobs=12)]: Done 218 tasks      | elapsed:    2.9s


[Parallel(n_jobs=12)]: Done 241 tasks      | elapsed:    3.2s


[Parallel(n_jobs=12)]: Done 264 tasks      | elapsed:    3.5s


[Parallel(n_jobs=12)]: Done 289 tasks      | elapsed:    3.8s


[Parallel(n_jobs=12)]: Done 314 tasks      | elapsed:    4.1s


[Parallel(n_jobs=12)]: Done 341 tasks      | elapsed:    4.4s


[Parallel(n_jobs=12)]: Done 368 tasks      | elapsed:    4.8s
