# Plan
- Goal: Win a medal (AUC-ROC).
- Steps:
  1) Inspect provided artifacts: sampleSubmission.csv, train2.zip, test2.zip.
  2) Determine data format (tabular, images, audio, features).
  3) Parse/prepare training/test data.
  4) Baseline model with fast CV (StratifiedKFold, strong classifiers e.g., LightGBM/XGBoost/CatBoost).
  5) Iterate with feature engineering and model ensembling.
  6) Evaluate the final model with CV, then generate submission.csv.

## Immediate next actions
- Preview the zips (list contents) and the sampleSubmission columns/shape, then infer the target and id columns.
- If zips contain precomputed features, load them; else extract and build minimal features quickly.

## Logging
- Print progress and timings in loops.

## Checkpoints
- After data inspection: request expert review for data strategy.
- After baseline CV: request expert review for improvements.

## Assumptions
- Prepared artifacts likely include ready-to-train tabular features or spectrogram features.
- Keep runtime efficient; avoid heavy DL unless necessary.

----

In [1]:
import os, zipfile, pandas as pd, time
from pathlib import Path

# Show where we are and what files are available (sizes in whole KB)
print('CWD:', os.getcwd())
print('Files in CWD:')
for entry in Path('.').iterdir():
    size_kb = entry.stat().st_size//1024
    print('-', entry.name, size_kb, 'KB')

# Load the sample submission and print its shape, first rows and column names
print('\nReading sampleSubmission.csv...')
sub = pd.read_csv('sampleSubmission.csv')
for view in (sub.shape, sub.head(), sub.columns.tolist()):
    print(view)

def list_zip(zpath, max_items=20):
    """Print the entry count and the first ``max_items`` entries of a zip archive.

    Each listed entry shows its index, its name inside the archive and its
    uncompressed size in whole KB. A trailing ``...`` line is printed when the
    archive holds more entries than were listed.
    """
    print(f'\nListing {zpath} ...')
    with zipfile.ZipFile(zpath) as archive:
        entries = archive.infolist()
    total = len(entries)
    print('n_files:', total)
    shown = entries[:max_items]
    for idx, entry in enumerate(shown):
        size_kb = entry.file_size//1024
        print(f'{idx:3d}:', entry.filename, size_kb, 'KB')
    if total > max_items:
        print('...')

# Preview the contents of both competition archives (first 20 entries each)
list_zip('train2.zip')
list_zip('test2.zip')

CWD: /app/agent_run_states/the-icml-2013-whale-challenge-right-whale-redux-spray-20250911-051236
Files in CWD:
- test2.zip 104741 KB
- train2.zip 93044 KB
- main.ipynb 3 KB
- agent_metadata 4 KB
- requirements.txt 1 KB
- task.txt 2 KB
- sampleSubmission.csv 1017 KB
- docker_run.log 13 KB
- description.md 3 KB

Reading sampleSubmission.csv...
(25149, 2)
                                clip  probability
0  20090330_000000_015s2ms_Test0.aif            0
1  20090330_000000_020s0ms_Test1.aif            0
2  20090330_000000_025s8ms_Test2.aif            0
3  20090330_000000_028s0ms_Test3.aif            0
4  20090330_000000_035s4ms_Test4.aif            0
['clip', 'probability']

Listing train2.zip ...
n_files: 22693
  0: train2/ 0 KB
  1: train2/20090328_121500_44292s1ms_TRAIN3930_0.aif 7 KB
  2: train2/20090328_040000_14754s3ms_TRAIN1536_0.aif 7 KB
  3: train2/20090329_190000_68809s7ms_TRAIN20079_0.aif 7 KB
  4: train2/20090328_170000_61317s6ms_TRAIN5011_0.aif 7 KB
  5: train2/20090329_050000

n_files: 25150
  0: test2/ 0 KB
  1: test2/20090330_211500_76911s3ms_Test9103.aif 7 KB
  2: test2/20090331_093000_34786s9ms_Test16890.aif 7 KB
  3: test2/20090331_164500_60302s0ms_Test20173.aif 7 KB
  4: test2/20090331_061500_23065s5ms_Test15089.aif 7 KB
  5: test2/20090330_103000_38310s6ms_Test4068.aif 7 KB
  6: test2/20090331_173000_63548s4ms_Test20737.aif 7 KB
  7: test2/20090331_204500_74792s1ms_Test22617.aif 7 KB
  8: test2/20090330_184500_67999s0ms_Test7115.aif 7 KB
  9: test2/20090330_203000_74285s8ms_Test8581.aif 7 KB
 10: test2/20090331_054500_21339s8ms_Test14792.aif 7 KB
 11: test2/20090330_211500_76623s1ms_Test9047.aif 7 KB
 12: test2/20090331_181500_66381s2ms_Test21309.aif 7 KB
 13: test2/20090330_183000_66957s4ms_Test7017.aif 7 KB
 14: test2/20090330_144500_53767s4ms_Test5595.aif 7 KB
 15: test2/20090331_201500_73585s2ms_Test22419.aif 7 KB
 16: test2/20090330_191500_70183s0ms_Test7551.aif 7 KB
 17: test2/20090331_020000_7418s6ms_Test12846.aif 7 KB
 18: test2/20090330_19150

In [3]:
import os, re, zipfile, shutil, time
from pathlib import Path
import pandas as pd

# Unpack each archive into the working directory unless its folder is already present
train_dir = Path('train2')
test_dir = Path('test2')
for folder in (train_dir, test_dir):
    if folder.exists():
        print(f'{folder.name}/ exists; skipping extraction')
        continue
    t0 = time.time()
    zip_name = f'{folder.name}.zip'
    with zipfile.ZipFile(zip_name) as z:
        print(f'Extracting {zip_name} ...')
        z.extractall('.')
    print(f'Done {folder.name} in {time.time()-t0:.1f}s')

# Both folders must exist before any file listing is attempted
assert train_dir.exists() and test_dir.exists()

# Gather the .aif clips (extension matched case-insensitively) in sorted path order
train_files = sorted(f for f in train_dir.iterdir() if f.suffix.lower()=='.aif')
test_files = sorted(f for f in test_dir.iterdir() if f.suffix.lower()=='.aif')
print('n_train_files:', len(train_files), 'n_test_files:', len(test_files))

# Label is the 0/1 digit right before the .aif extension; groups come from the name prefix
label_re = re.compile(r'_(?P<label>[01])\.aif$', re.IGNORECASE)
def parse_train(p):
    """Split a training clip path into ``(clip, path, label, group_hour, group_date)``.

    ``clip`` is the file name, ``path`` the stringified path, ``label`` the
    integer 0/1 suffix, ``group_hour`` the first 11 characters of the name
    (YYYYMMDD_HH) and ``group_date`` the first 8 (YYYYMMDD).
    Returns ``None`` when the name does not end in ``_0.aif`` / ``_1.aif``.
    """
    fname = p.name
    match = label_re.search(fname)
    if match is None:
        return None
    return (fname, str(p), int(match['label']), fname[:11], fname[:8])

# Parse every training file; names that do not match the label pattern are counted as bad
parsed = [parse_train(f) for f in train_files]
rows = [rec for rec in parsed if rec is not None]
bad = len(parsed) - len(rows)
print('parsed rows:', len(rows), 'bad:', bad)

# One row per clip, followed by a few sanity prints
train_df = pd.DataFrame(rows, columns=['clip','path','label','group_hour','group_date'])
dup = train_df.duplicated('clip').sum()
print('duplicates by clip:', dup)
print('class distribution:\n', train_df['label'].value_counts())
n_hours = train_df['group_hour'].nunique()
n_dates = train_df['group_date'].nunique()
print('unique groups (hour/date):', n_hours, n_dates)

# Build the test frame in sampleSubmission order; clips without a file get a None path
sample = pd.read_csv('sampleSubmission.csv')
test_map = {f.name: str(f) for f in test_files}
clip_names = sample['clip'].tolist()
paths = [test_map.get(clip_name) for clip_name in clip_names]
miss = [clip_name for clip_name, found in zip(clip_names, paths) if found is None]
if miss:
    print('Missing test paths for', len(miss))
else:
    print('All test clips found')
test_df = pd.DataFrame({'clip': sample['clip'], 'path': paths})
print(train_df.shape, test_df.shape)

# Per-hour group sizes and positive counts, to judge a group-wise CV split
grp_counts = train_df.groupby('group_hour')['label'].agg(['count','sum'])
print('Groups (hour) count summary:')
print(grp_counts.describe())
has_positive = grp_counts['sum']>0
pos_per_group = has_positive.sum()
print('Groups with at least one positive:', pos_per_group, 'of', len(grp_counts))

Extracting train2.zip ...


Done train2 in 3.7s
Extracting test2.zip ...


Done test2 in 4.2s


n_train_files: 22692 n_test_files: 25149
parsed rows: 22692 bad: 0
duplicates by clip: 0
class distribution:
 label
0    20426
1     2266
Name: count, dtype: int64
unique groups (hour/date): 48 2
All test clips found
(22692, 5) (25149, 2)
Groups (hour) count summary:
            count         sum
count    48.00000   48.000000
mean    472.75000   47.208333
std     231.27643   46.122390
min     138.00000    0.000000
25%     291.50000   12.750000
50%     444.00000   30.500000
75%     564.25000   78.500000
max    1150.00000  207.000000
Groups with at least one positive: 45 of 48


In [13]:
import os, math, warnings, time, gc, multiprocessing as mp, traceback
# Cap numba/OpenMP/MKL thread counts to 1; set here, before the numpy import below.
# NOTE(review): presumably meant to avoid thread oversubscription across the joblib workers — confirm.
os.environ['NUMBA_NUM_THREADS'] = '1'
os.environ['OMP_NUM_THREADS'] = '1'
os.environ['MKL_NUM_THREADS'] = '1'
import numpy as np
import pandas as pd
from pathlib import Path
from joblib import Parallel, delayed

# Re-import helper (the pip-install fallback itself is the try/except that follows this function)
def _lazy_imports():
    """Import the audio/signal dependencies and bind them as module globals.

    ``extract_one`` calls this when the name ``sf`` is not defined in the
    process it is running in.
    """
    global sf, scipy_signal, sosfiltfilt, butter, librosa, scipy_stats, resample_poly
    import soundfile as sf  # noqa: F811
    from scipy import signal as scipy_signal  # noqa: F811
    from scipy.signal import sosfiltfilt, butter, resample_poly  # noqa: F811
    import librosa  # noqa: F811
    from scipy import stats as scipy_stats  # noqa: F811

# Import the heavy audio/signal dependencies; if any import fails, pip-install
# them into the running interpreter and retry once.
try:
    import soundfile as sf  # type: ignore
    from scipy import signal as scipy_signal
    from scipy.signal import sosfiltfilt, butter, resample_poly
    import librosa
    from scipy import stats as scipy_stats
except Exception as e:
    print('Installing dependencies...', e)
    import sys, subprocess, importlib
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'soundfile', 'librosa', 'scipy', 'joblib'])
    # Packages installed after interpreter start may be invisible to the import
    # system's cached finders until the caches are invalidated.
    importlib.invalidate_caches()
    from scipy import signal as scipy_signal
    from scipy.signal import sosfiltfilt, butter, resample_poly
    import soundfile as sf
    import librosa
    from scipy import stats as scipy_stats

# Feature extraction parameters (per expert guidance)
SR_TARGET = 2000  # sample rate clips are resampled to (default target of ensure_sr)
BP_LO, BP_HI = 40.0, 500.0  # band-pass edges, same units as the sample rate (defaults of bandpass)
N_FFT = 1024  # FFT size passed to the melspectrogram and STFT calls
HOP = 64  # hop length passed to the melspectrogram and STFT calls
N_MELS = 80  # number of mel bands
FMIN, FMAX = 40.0, 450.0  # frequency range of the mel bands

def read_mono(path):
    """Load an audio file as a mono float32 array and return ``(samples, sample_rate)``.

    soundfile is tried first (2-D data is averaged over axis 1); if that raises
    for any reason the file is loaded through librosa at its native rate.
    """
    try:
        samples, rate = sf.read(path, dtype='float32', always_2d=False)
        mono = samples.mean(axis=1) if samples.ndim == 2 else samples
        return mono, int(rate)
    except Exception:
        # Fallback to librosa backend
        samples, rate = librosa.load(path, sr=None, mono=True)
    return samples.astype('float32'), int(rate)

def ensure_sr(y, sr, target=SR_TARGET):
    """Return ``(y, sr)`` resampled to ``target``; the input is passed through when ``sr == target``.

    Resampling uses polyphase filtering with the up/down factors reduced by
    their greatest common divisor, and the result is cast to float32.
    """
    if sr != target:
        common = math.gcd(sr, target)
        factor_up = target//common
        factor_down = sr//common
        y = scipy_signal.resample_poly(y, factor_up, factor_down).astype('float32')
        sr = target
    return y, sr

def bandpass(y, sr, lo=BP_LO, hi=BP_HI, order=4):
    """Zero-phase Butterworth band-pass of ``y`` between ``lo`` and ``hi``.

    Edges are normalised by the Nyquist frequency (upper edge capped at 0.999).
    If the normalised band is empty the input is returned untouched; if the
    filtering call raises, the unfiltered signal is returned as float32.
    """
    nyquist = sr/2.0
    lo_n = lo/nyquist
    hi_n = min(0.999, hi/nyquist)
    if hi_n <= lo_n:
        return y
    sos = butter(order, [lo_n, hi_n], btype='bandpass', output='sos')
    try:
        out = sosfiltfilt(sos, y).astype('float32')
    except Exception:
        out = y.astype('float32')
    return out

def logmel_features(y, sr):
    """Return ``(log_mel, delta1, delta2)`` for signal ``y``.

    The mel power spectrogram is floored at 1e-12 before the natural log and
    cast to float32; the deltas are first- and second-order with
    ``mode='nearest'``.
    """
    mel_power = librosa.feature.melspectrogram(
        y=y, sr=sr, n_fft=N_FFT, hop_length=HOP, n_mels=N_MELS,
        fmin=FMIN, fmax=FMAX, window='hann', power=2.0, center=True,
    )
    log_mel = np.log(np.maximum(mel_power, 1e-12)).astype('float32')
    delta1 = librosa.feature.delta(log_mel, order=1, mode='nearest')
    delta2 = librosa.feature.delta(log_mel, order=2, mode='nearest')
    return log_mel, delta1, delta2

def spectral_summaries(y, sr):
    """Summarise frame-wise RMS, centroid, bandwidth and roll-off of ``y``.

    Returns a dict with keys ``rms_mean``, ``rms_std``, ``centroid_mean``,
    ``bw_mean``, ``roll85_mean`` and ``roll95_mean`` (all floats, 0.0 for an
    empty spectrogram).

    NOTE: ``librosa.feature.rms`` documents ``S`` as a *magnitude* spectrogram,
    so the RMS is computed from ``|STFT|`` rather than from the power
    spectrogram. Feature files cached before this change hold different
    ``rms_*`` values.
    """
    mag = np.abs(librosa.stft(y=y, n_fft=N_FFT, hop_length=HOP, window='hann', center=True))
    if mag.size == 0:
        return { 'rms_mean':0.0, 'rms_std':0.0, 'centroid_mean':0.0, 'bw_mean':0.0, 'roll85_mean':0.0, 'roll95_mean':0.0 }
    # Power spectrogram: kept as the weighting for centroid / bandwidth / roll-off
    S = mag**2
    # Ensure frame_length matches the STFT n_fft used for the spectrogram
    n_fft_used = 2 * (mag.shape[0] - 1)
    rms = librosa.feature.rms(S=mag, frame_length=n_fft_used, hop_length=HOP, center=True).flatten()
    cen = librosa.feature.spectral_centroid(S=S, sr=sr).flatten()
    bw = librosa.feature.spectral_bandwidth(S=S, sr=sr).flatten()
    roll85 = librosa.feature.spectral_rolloff(S=S, sr=sr, roll_percent=0.85).flatten()
    roll95 = librosa.feature.spectral_rolloff(S=S, sr=sr, roll_percent=0.95).flatten()

    def _nan_mean(x):
        return float(np.nanmean(x)) if x.size else 0.0

    def _nan_std(x):
        return float(np.nanstd(x)) if x.size else 0.0

    # Key order is preserved because it determines downstream column order
    return {
        'rms_mean': _nan_mean(rms),
        'rms_std': _nan_std(rms),
        'centroid_mean': _nan_mean(cen),
        'bw_mean': _nan_mean(bw),
        'roll85_mean': _nan_mean(roll85),
        'roll95_mean': _nan_mean(roll95),
    }

def aggregate_stats(mat, prefix):
    """Collapse a (bands, frames) matrix into per-band summary features.

    For every band the dict receives, in this order, ``mean``, ``std``,
    ``p10``, ``p50``, ``p90``, ``max``, ``skew`` and ``kurt`` under keys of the
    form ``{prefix}_bNN_<stat>``; non-finite skew/kurtosis values become 0.0.
    Two trailing keys hold the mean of the band means and the mean of the band
    standard deviations. A non-2-D or empty input yields an empty dict.
    """
    if mat.ndim != 2 or mat.size == 0:
        return {}

    band_mean = mat.mean(axis=1)
    band_std = mat.std(axis=1)
    band_p10 = np.percentile(mat, 10, axis=1)
    band_p50 = np.percentile(mat, 50, axis=1)
    band_p90 = np.percentile(mat, 90, axis=1)
    band_max = mat.max(axis=1)
    # skew/kurtosis can emit warnings for degenerate bands; silence them
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        band_skew = scipy_stats.skew(mat, axis=1, nan_policy='omit')
        band_kurt = scipy_stats.kurtosis(mat, axis=1, fisher=True, nan_policy='omit')

    def _finite_or_zero(value):
        return float(value) if np.isfinite(value) else 0.0

    feats = {}
    for idx in range(mat.shape[0]):
        tag = f'{prefix}_b{idx:02d}'
        feats[f'{tag}_mean'] = float(band_mean[idx])
        feats[f'{tag}_std'] = float(band_std[idx])
        feats[f'{tag}_p10'] = float(band_p10[idx])
        feats[f'{tag}_p50'] = float(band_p50[idx])
        feats[f'{tag}_p90'] = float(band_p90[idx])
        feats[f'{tag}_max'] = float(band_max[idx])
        feats[f'{tag}_skew'] = _finite_or_zero(band_skew[idx])
        feats[f'{tag}_kurt'] = _finite_or_zero(band_kurt[idx])
    # global summaries
    feats[f'{prefix}_global_mean'] = float(band_mean.mean())
    feats[f'{prefix}_global_std'] = float(band_std.mean())
    return feats

def energy_ratio_features(logmel, sr):
    """Mean log-mel level in three frequency bands plus each band's share of their sum.

    Produces ``band_{low,mid,high}_mean`` (0.0 for a band with no mel bins) and
    ``ratio_{low,mid,high}``, each band mean divided by ``1e-6`` plus the sum
    of the three means. ``sr`` is accepted but not used.

    NOTE(review): the band centres come from ``mel_frequencies(n_mels)``, i.e.
    n_mels points spanning FMIN..FMAX — confirm these match the centre
    frequencies of the mel filters that produced ``logmel``.
    NOTE(review): the ratios are formed from log values, not linear energies —
    confirm that is intended.
    """
    mel_cf = librosa.mel_frequencies(n_mels=logmel.shape[0], fmin=FMIN, fmax=FMAX)
    band_edges = (('low', 40, 120), ('mid', 120, 250), ('high', 250, 450))
    # Half-open [lo, hi) selection of mel bins per band
    selectors = {name: (mel_cf >= lo) & (mel_cf < hi) for name, lo, hi in band_edges}
    feats = {}
    for name, sel in selectors.items():
        feats[f'band_{name}_mean'] = float(np.mean(logmel[sel, :])) if sel.sum() != 0 else 0.0
    denom = 1e-6 + feats['band_low_mean'] + feats['band_mid_mean'] + feats['band_high_mean']
    for name in selectors:
        feats[f'ratio_{name}'] = feats[f'band_{name}_mean'] / denom
    return feats

def eventness_features(logmel):
    """Describe how much of the clip is 'active' in the 80-400 range of mel bins.

    Bins whose centre lies in [80, 400] are z-scored per band and averaged per
    frame; frames above mean + 0.5 * std of that curve are flagged. Returns
    ``evt_frac`` (fraction of flagged frames), ``evt_runs`` (number of
    contiguous flagged runs) and ``evt_longest`` (length of the longest run).
    """
    empty = {'evt_frac':0.0, 'evt_runs':0, 'evt_longest':0}
    mel_cf = librosa.mel_frequencies(n_mels=logmel.shape[0], fmin=FMIN, fmax=FMAX)
    band = logmel[(mel_cf >= 80) & (mel_cf <= 400), :]
    if band.size == 0:
        return empty
    zscored = (band - band.mean(axis=1, keepdims=True))/ (1e-6 + band.std(axis=1, keepdims=True))
    curve = zscored.mean(axis=0)
    thr = curve.mean() + 0.5*curve.std()
    flags = (curve > thr).astype(np.int32)
    if flags.size == 0:
        return empty
    # Pad with zeros so every run has one rising (+1) and one falling (-1) edge
    edges = np.diff(np.concatenate(([0], flags, [0])))
    starts = np.flatnonzero(edges == 1)
    stops = np.flatnonzero(edges == -1)
    runs = int(starts.size)
    longest = int((stops - starts).max()) if runs else 0
    return {'evt_frac': float(flags.mean()), 'evt_runs': runs, 'evt_longest': longest}

def extract_one(row):
    """Compute the full feature dict for one clip.

    ``row`` must provide ``'path'`` (audio file) and ``'clip'`` (identifier).
    On success the dict holds all features plus ``clip``, ``n_frames`` and
    ``failed=0``. On any exception it holds only ``clip``, ``n_frames=0``,
    ``failed=1`` and ``err`` (the exception repr), so one bad file does not
    abort the whole run.

    The insertion order of keys is the order in which features are added
    below; keep it stable if cached feature files must stay comparable.
    """
    # Ensure heavy deps are imported in each worker
    try:
        _ = sf  # type: ignore
    except NameError:
        _lazy_imports()
    path = row['path']
    clip = row['clip']
    try:
        # Load -> resample to SR_TARGET -> band-pass -> log-mel with deltas
        y, sr = read_mono(path)
        if y.size == 0:
            raise ValueError('empty')
        y, sr = ensure_sr(y, sr, SR_TARGET)
        y = bandpass(y, sr, BP_LO, BP_HI, order=4)
        logS, d1, d2 = logmel_features(y, sr)
        feats = {}
        # Aggregated mel and deltas
        feats.update(aggregate_stats(logS, 'mel'))
        feats.update(aggregate_stats(d1, 'd1'))
        feats.update(aggregate_stats(d2, 'd2'))
        # Additional low-cost features
        # Back to the (floored) linear mel power for flux / flatness / energy timing
        expS = np.exp(logS).astype('float32')
        # Spectral flux on mel spectrogram (half-wave rectified frame diff)
        if expS.shape[1] >= 2:
            dS = np.diff(expS, axis=1)
            dS_pos = np.maximum(dS, 0.0)
            flux = dS_pos.sum(axis=0).astype('float32')  # length T-1
            feats['flux_mean'] = float(np.nanmean(flux)) if flux.size else 0.0
            feats['flux_std']  = float(np.nanstd(flux)) if flux.size else 0.0
            feats['flux_p90']  = float(np.nanpercentile(flux, 90)) if flux.size else 0.0
            feats['flux_max']  = float(np.nanmax(flux)) if flux.size else 0.0
        else:
            feats['flux_mean']=feats['flux_std']=feats['flux_p90']=feats['flux_max']=0.0
        # Spectral flatness
        # NOTE(review): expS is already a power spectrogram; confirm the default
        # exponent applied by spectral_flatness is the intended one.
        try:
            flat = librosa.feature.spectral_flatness(S=expS).flatten()
            feats['flat_mean'] = float(np.nanmean(flat)) if flat.size else 0.0
            feats['flat_std']  = float(np.nanstd(flat)) if flat.size else 0.0
        except Exception:
            feats['flat_mean']=feats['flat_std']=0.0
        # Centroid slope from STFT power
        # (linear fit of the centroid against the frame index; the slope is the feature)
        # NOTE(review): spectral_summaries below computes an STFT with the same parameters again.
        try:
            P = np.abs(librosa.stft(y=y, n_fft=N_FFT, hop_length=HOP, window='hann', center=True))**2
            cen = librosa.feature.spectral_centroid(S=P, sr=sr).flatten()
            if cen.size > 1:
                x = np.arange(cen.size, dtype=np.float32)
                slope = float(np.polyfit(x, cen, 1)[0])
            else:
                slope = 0.0
            feats['centroid_slope'] = slope
        except Exception:
            feats['centroid_slope'] = 0.0
        # Energy timing percentiles
        # (fraction of the clip elapsed when 10% / 50% / 90% of the cumulative mean energy is reached)
        ene = expS.mean(axis=0) if expS.size else np.zeros(0, dtype=np.float32)
        if ene.size:
            c = np.cumsum(ene)
            tot = c[-1] if c.size else 0.0
            if tot <= 0:
                t10=t50=t90=spread=0.0
            else:
                c = c/(tot + 1e-12)
                nT = len(c)
                t10 = float(np.searchsorted(c, 0.1)/nT)
                t50 = float(np.searchsorted(c, 0.5)/nT)
                t90 = float(np.searchsorted(c, 0.9)/nT)
                spread = t90 - t10
        else:
            t10=t50=t90=spread=0.0
        feats['t10'] = t10; feats['t50'] = t50; feats['t90'] = t90; feats['t_spread'] = spread
        # Spectral summaries + ratios + eventness
        feats.update(spectral_summaries(y, sr))
        feats.update(energy_ratio_features(logS, sr))
        feats.update(eventness_features(logS))
        feats['n_frames'] = int(logS.shape[1])
        feats['clip'] = clip
        feats['failed'] = 0
        return feats
    except Exception as e:
        # Robust fallback with error info
        f = {'clip': clip, 'n_frames': 0, 'failed': 1, 'err': repr(e)}
        return f

def parallel_extract_joblib(df, out_path, n_jobs=16, batch_log=200):
    """Run ``extract_one`` over every row of ``df`` in parallel and cache the result.

    ``df`` needs ``clip`` and ``path`` columns. The feature table is re-ordered
    to match ``df['clip']``, the ``err`` column (if any) is dropped, missing
    values are filled with 0, and the table is written to ``out_path`` as
    parquet and returned. The number of failed clips and up to five sample
    error messages are printed. ``batch_log`` is accepted but not used.
    """
    started = time.time()
    records = df.to_dict('records')
    print(f'Extracting features for {len(records)} clips -> {out_path} with joblib loky, n_jobs={n_jobs}')
    runner = Parallel(n_jobs=n_jobs, backend='loky', verbose=10)
    results = runner(delayed(extract_one)(rec) for rec in records)

    # Collect stats and small sample errors
    failures = [res for res in results if res.get('failed', 0)==1]
    if failures:
        print('Total failures:', len(failures))
        for message in (res.get('err','') for res in failures[:5]):
            if message:
                print('Sample worker error:', message)

    table = pd.DataFrame(results).set_index('clip').reindex(df['clip'].values)
    if 'err' in table.columns:
        table = table.drop(columns=['err'])
    table = table.fillna(0).reset_index()
    table.to_parquet(out_path, index=False)
    print(f'Saved {out_path} with shape {table.shape} in {time.time()-started:.1f}s')
    return table

# Debug small subset first to ensure rich columns
# A cached debug parquet is treated as invalid (and deleted) when it has 5 or
# fewer columns or cannot be read at all.
debug_pq = Path('train_features_debug.parquet')
if debug_pq.exists():
    try:
        _tmpd = pd.read_parquet(debug_pq)
        if _tmpd.shape[1] <= 5:
            print('Existing debug parquet invalid; removing')
            debug_pq.unlink(missing_ok=True)
    except Exception:
        print('Could not read existing debug parquet; removing')
        debug_pq.unlink(missing_ok=True)
# Re-check existence: the file may just have been removed above
if not debug_pq.exists():
    _ = parallel_extract_joblib(train_df[['clip','path']].head(200), debug_pq, n_jobs=8, batch_log=50)
else:
    print('Found', debug_pq)

# Run full extraction if cache missing or invalid - TRAIN ONLY for now
# Same validity rule as above: more than 5 columns means the cache is reused.
train_feat_pq = Path('train_features.parquet')
need_full = True
if train_feat_pq.exists():
    try:
        _tmp = pd.read_parquet(train_feat_pq)
        if _tmp.shape[1] > 5:
            need_full = False
        else:
            print('Existing train_features.parquet has too few columns; regenerating...')
            train_feat_pq.unlink(missing_ok=True)
    except Exception:
        print('Could not read existing train_features.parquet; regenerating...')
        train_feat_pq.unlink(missing_ok=True)
if need_full:
    _ = parallel_extract_joblib(train_df[['clip','path']], train_feat_pq, n_jobs=16, batch_log=200)
else:
    print('Found valid', train_feat_pq)

gc.collect()
print('Feature extraction (train) complete.')

Existing debug parquet invalid; removing
Extracting features for 200 clips -> train_features_debug.parquet with joblib loky, n_jobs=8


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.


[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:    1.6s
[Parallel(n_jobs=8)]: Done   9 tasks      | elapsed:    1.7s
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    1.8s


[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    1.9s
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    2.0s
[Parallel(n_jobs=8)]: Batch computation too fast (0.19683571019612733s.) Setting batch_size=2.
[Parallel(n_jobs=8)]: Done  45 tasks      | elapsed:    2.1s


[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:    2.3s
[Parallel(n_jobs=8)]: Batch computation too fast (0.19902801513671875s.) Setting batch_size=4.
[Parallel(n_jobs=8)]: Done  82 tasks      | elapsed:    2.5s


[Parallel(n_jobs=8)]: Done 128 tasks      | elapsed:    2.8s


[Parallel(n_jobs=8)]: Done 173 tasks      | elapsed:    3.1s
[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    3.2s finished


Saved train_features_debug.parquet with shape (200, 1955) in 3.5s
Extracting features for 22692 clips -> train_features.parquet with joblib loky, n_jobs=16


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.


[Parallel(n_jobs=16)]: Done   9 tasks      | elapsed:    2.1s
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    2.2s


[Parallel(n_jobs=16)]: Done  29 tasks      | elapsed:    2.3s
[Parallel(n_jobs=16)]: Done  40 tasks      | elapsed:    2.4s


[Parallel(n_jobs=16)]: Done  53 tasks      | elapsed:    2.7s
[Parallel(n_jobs=16)]: Done  66 tasks      | elapsed:    2.8s


[Parallel(n_jobs=16)]: Done  81 tasks      | elapsed:    2.9s


[Parallel(n_jobs=16)]: Done  96 tasks      | elapsed:    3.1s
[Parallel(n_jobs=16)]: Done 113 tasks      | elapsed:    3.3s


[Parallel(n_jobs=16)]: Done 130 tasks      | elapsed:    3.5s
[Parallel(n_jobs=16)]: Done 149 tasks      | elapsed:    3.7s


[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    3.8s
[Parallel(n_jobs=16)]: Done 189 tasks      | elapsed:    4.0s


[Parallel(n_jobs=16)]: Done 210 tasks      | elapsed:    4.2s
[Parallel(n_jobs=16)]: Done 233 tasks      | elapsed:    4.4s


[Parallel(n_jobs=16)]: Done 256 tasks      | elapsed:    4.6s


[Parallel(n_jobs=16)]: Done 281 tasks      | elapsed:    4.9s


[Parallel(n_jobs=16)]: Done 306 tasks      | elapsed:    5.2s


[Parallel(n_jobs=16)]: Done 333 tasks      | elapsed:    5.4s


[Parallel(n_jobs=16)]: Done 360 tasks      | elapsed:    5.6s


[Parallel(n_jobs=16)]: Done 389 tasks      | elapsed:    5.9s


[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:    6.2s


[Parallel(n_jobs=16)]: Done 449 tasks      | elapsed:    6.5s


[Parallel(n_jobs=16)]: Done 480 tasks      | elapsed:    6.7s


[Parallel(n_jobs=16)]: Done 513 tasks      | elapsed:    7.0s


[Parallel(n_jobs=16)]: Done 546 tasks      | elapsed:    7.3s


[Parallel(n_jobs=16)]: Done 581 tasks      | elapsed:    7.6s


[Parallel(n_jobs=16)]: Done 616 tasks      | elapsed:    8.0s


[Parallel(n_jobs=16)]: Done 653 tasks      | elapsed:    8.3s


[Parallel(n_jobs=16)]: Done 690 tasks      | elapsed:    8.7s


[Parallel(n_jobs=16)]: Done 729 tasks      | elapsed:    9.1s


[Parallel(n_jobs=16)]: Done 768 tasks      | elapsed:    9.5s


[Parallel(n_jobs=16)]: Done 809 tasks      | elapsed:    9.9s


[Parallel(n_jobs=16)]: Done 850 tasks      | elapsed:   10.2s


[Parallel(n_jobs=16)]: Done 893 tasks      | elapsed:   10.6s


[Parallel(n_jobs=16)]: Done 936 tasks      | elapsed:   11.0s


[Parallel(n_jobs=16)]: Done 981 tasks      | elapsed:   11.4s


[Parallel(n_jobs=16)]: Done 1026 tasks      | elapsed:   11.8s


[Parallel(n_jobs=16)]: Done 1073 tasks      | elapsed:   12.3s


[Parallel(n_jobs=16)]: Done 1120 tasks      | elapsed:   12.8s


[Parallel(n_jobs=16)]: Done 1169 tasks      | elapsed:   13.3s


[Parallel(n_jobs=16)]: Done 1218 tasks      | elapsed:   13.7s


[Parallel(n_jobs=16)]: Done 1269 tasks      | elapsed:   14.2s


[Parallel(n_jobs=16)]: Done 1320 tasks      | elapsed:   14.8s


[Parallel(n_jobs=16)]: Done 1373 tasks      | elapsed:   15.3s


[Parallel(n_jobs=16)]: Done 1426 tasks      | elapsed:   15.8s


[Parallel(n_jobs=16)]: Done 1481 tasks      | elapsed:   16.3s


[Parallel(n_jobs=16)]: Done 1536 tasks      | elapsed:   16.8s


[Parallel(n_jobs=16)]: Done 1593 tasks      | elapsed:   17.3s


[Parallel(n_jobs=16)]: Done 1650 tasks      | elapsed:   18.0s


[Parallel(n_jobs=16)]: Done 1709 tasks      | elapsed:   18.5s


[Parallel(n_jobs=16)]: Done 1768 tasks      | elapsed:   19.1s


[Parallel(n_jobs=16)]: Done 1829 tasks      | elapsed:   19.6s


[Parallel(n_jobs=16)]: Done 1890 tasks      | elapsed:   20.1s


[Parallel(n_jobs=16)]: Done 1953 tasks      | elapsed:   20.7s


[Parallel(n_jobs=16)]: Done 2016 tasks      | elapsed:   21.4s


[Parallel(n_jobs=16)]: Done 2081 tasks      | elapsed:   22.0s


[Parallel(n_jobs=16)]: Done 2146 tasks      | elapsed:   22.6s


[Parallel(n_jobs=16)]: Done 2213 tasks      | elapsed:   23.3s


[Parallel(n_jobs=16)]: Done 2280 tasks      | elapsed:   23.9s


[Parallel(n_jobs=16)]: Done 2349 tasks      | elapsed:   24.7s


[Parallel(n_jobs=16)]: Done 2418 tasks      | elapsed:   25.4s


[Parallel(n_jobs=16)]: Done 2489 tasks      | elapsed:   26.1s


[Parallel(n_jobs=16)]: Done 2560 tasks      | elapsed:   26.7s


[Parallel(n_jobs=16)]: Done 2633 tasks      | elapsed:   27.4s


[Parallel(n_jobs=16)]: Done 2706 tasks      | elapsed:   28.1s


[Parallel(n_jobs=16)]: Done 2781 tasks      | elapsed:   28.9s


[Parallel(n_jobs=16)]: Done 2856 tasks      | elapsed:   29.6s


[Parallel(n_jobs=16)]: Done 2933 tasks      | elapsed:   30.5s


[Parallel(n_jobs=16)]: Done 3010 tasks      | elapsed:   31.2s


[Parallel(n_jobs=16)]: Done 3089 tasks      | elapsed:   31.9s


[Parallel(n_jobs=16)]: Done 3168 tasks      | elapsed:   32.8s


[Parallel(n_jobs=16)]: Done 3249 tasks      | elapsed:   33.6s


[Parallel(n_jobs=16)]: Done 3330 tasks      | elapsed:   34.3s


[Parallel(n_jobs=16)]: Done 3413 tasks      | elapsed:   35.2s


[Parallel(n_jobs=16)]: Done 3496 tasks      | elapsed:   35.9s


[Parallel(n_jobs=16)]: Done 3581 tasks      | elapsed:   36.8s


[Parallel(n_jobs=16)]: Done 3666 tasks      | elapsed:   37.6s


[Parallel(n_jobs=16)]: Done 3753 tasks      | elapsed:   38.5s


[Parallel(n_jobs=16)]: Done 3840 tasks      | elapsed:   39.4s


[Parallel(n_jobs=16)]: Done 3929 tasks      | elapsed:   40.2s


[Parallel(n_jobs=16)]: Done 4018 tasks      | elapsed:   40.9s


[Parallel(n_jobs=16)]: Done 4109 tasks      | elapsed:   41.8s


[Parallel(n_jobs=16)]: Done 4200 tasks      | elapsed:   42.7s


[Parallel(n_jobs=16)]: Done 4293 tasks      | elapsed:   43.6s


[Parallel(n_jobs=16)]: Done 4386 tasks      | elapsed:   44.6s


[Parallel(n_jobs=16)]: Done 4481 tasks      | elapsed:   45.4s


[Parallel(n_jobs=16)]: Done 4576 tasks      | elapsed:   46.3s


[Parallel(n_jobs=16)]: Done 4673 tasks      | elapsed:   47.3s


[Parallel(n_jobs=16)]: Done 4770 tasks      | elapsed:   48.3s


[Parallel(n_jobs=16)]: Done 4869 tasks      | elapsed:   49.2s


[Parallel(n_jobs=16)]: Done 4968 tasks      | elapsed:   50.2s


[Parallel(n_jobs=16)]: Done 5069 tasks      | elapsed:   51.3s


[Parallel(n_jobs=16)]: Done 5170 tasks      | elapsed:   52.4s


[Parallel(n_jobs=16)]: Done 5273 tasks      | elapsed:   53.4s


[Parallel(n_jobs=16)]: Done 5376 tasks      | elapsed:   54.2s


[Parallel(n_jobs=16)]: Done 5481 tasks      | elapsed:   55.1s


[Parallel(n_jobs=16)]: Done 5586 tasks      | elapsed:   56.0s


[Parallel(n_jobs=16)]: Done 5693 tasks      | elapsed:   56.8s


[Parallel(n_jobs=16)]: Done 5800 tasks      | elapsed:   57.7s


[Parallel(n_jobs=16)]: Done 5909 tasks      | elapsed:   58.6s


[Parallel(n_jobs=16)]: Done 6018 tasks      | elapsed:   59.6s


[Parallel(n_jobs=16)]: Done 6129 tasks      | elapsed:  1.0min


[Parallel(n_jobs=16)]: Done 6240 tasks      | elapsed:  1.0min


[Parallel(n_jobs=16)]: Done 6353 tasks      | elapsed:  1.0min


[Parallel(n_jobs=16)]: Done 6466 tasks      | elapsed:  1.1min


[Parallel(n_jobs=16)]: Done 6581 tasks      | elapsed:  1.1min


[Parallel(n_jobs=16)]: Done 6696 tasks      | elapsed:  1.1min


[Parallel(n_jobs=16)]: Done 6813 tasks      | elapsed:  1.1min


[Parallel(n_jobs=16)]: Done 6930 tasks      | elapsed:  1.1min


[Parallel(n_jobs=16)]: Done 7049 tasks      | elapsed:  1.1min


[Parallel(n_jobs=16)]: Done 7168 tasks      | elapsed:  1.2min


[Parallel(n_jobs=16)]: Done 7289 tasks      | elapsed:  1.2min


[Parallel(n_jobs=16)]: Done 7410 tasks      | elapsed:  1.2min


[Parallel(n_jobs=16)]: Done 7533 tasks      | elapsed:  1.2min


[Parallel(n_jobs=16)]: Done 7656 tasks      | elapsed:  1.3min


[Parallel(n_jobs=16)]: Done 7781 tasks      | elapsed:  1.3min


[Parallel(n_jobs=16)]: Done 7906 tasks      | elapsed:  1.3min


[Parallel(n_jobs=16)]: Done 8033 tasks      | elapsed:  1.3min


[Parallel(n_jobs=16)]: Done 8160 tasks      | elapsed:  1.3min


[Parallel(n_jobs=16)]: Done 8289 tasks      | elapsed:  1.3min


[Parallel(n_jobs=16)]: Done 8418 tasks      | elapsed:  1.4min


[Parallel(n_jobs=16)]: Done 8549 tasks      | elapsed:  1.4min


[Parallel(n_jobs=16)]: Done 8680 tasks      | elapsed:  1.4min


[Parallel(n_jobs=16)]: Done 8813 tasks      | elapsed:  1.4min


[Parallel(n_jobs=16)]: Done 8946 tasks      | elapsed:  1.5min


[Parallel(n_jobs=16)]: Done 9081 tasks      | elapsed:  1.5min


[Parallel(n_jobs=16)]: Done 9216 tasks      | elapsed:  1.5min


[Parallel(n_jobs=16)]: Done 9353 tasks      | elapsed:  1.5min


[Parallel(n_jobs=16)]: Done 9490 tasks      | elapsed:  1.5min


[Parallel(n_jobs=16)]: Done 9629 tasks      | elapsed:  1.6min


[Parallel(n_jobs=16)]: Done 9768 tasks      | elapsed:  1.6min


[Parallel(n_jobs=16)]: Done 9909 tasks      | elapsed:  1.6min


[Parallel(n_jobs=16)]: Done 10050 tasks      | elapsed:  1.6min


[Parallel(n_jobs=16)]: Done 10193 tasks      | elapsed:  1.7min


[Parallel(n_jobs=16)]: Done 10336 tasks      | elapsed:  1.7min




[Parallel(n_jobs=16)]: Done 10481 tasks      | elapsed:  1.7min


[Parallel(n_jobs=16)]: Done 10626 tasks      | elapsed:  1.7min


[Parallel(n_jobs=16)]: Done 10773 tasks      | elapsed:  1.8min


[Parallel(n_jobs=16)]: Done 10920 tasks      | elapsed:  1.8min


[Parallel(n_jobs=16)]: Done 11069 tasks      | elapsed:  1.8min


[Parallel(n_jobs=16)]: Done 11218 tasks      | elapsed:  1.8min


[Parallel(n_jobs=16)]: Done 11369 tasks      | elapsed:  1.9min


[Parallel(n_jobs=16)]: Done 11520 tasks      | elapsed:  1.9min


[Parallel(n_jobs=16)]: Done 11673 tasks      | elapsed:  1.9min


[Parallel(n_jobs=16)]: Done 11826 tasks      | elapsed:  1.9min


[Parallel(n_jobs=16)]: Done 11981 tasks      | elapsed:  2.0min


[Parallel(n_jobs=16)]: Done 12136 tasks      | elapsed:  2.0min


[Parallel(n_jobs=16)]: Done 12293 tasks      | elapsed:  2.0min


[Parallel(n_jobs=16)]: Done 12450 tasks      | elapsed:  2.0min


[Parallel(n_jobs=16)]: Done 12609 tasks      | elapsed:  2.1min


[Parallel(n_jobs=16)]: Done 12768 tasks      | elapsed:  2.1min


[Parallel(n_jobs=16)]: Done 12929 tasks      | elapsed:  2.1min


[Parallel(n_jobs=16)]: Done 13090 tasks      | elapsed:  2.1min


[Parallel(n_jobs=16)]: Done 13253 tasks      | elapsed:  2.2min


[Parallel(n_jobs=16)]: Done 13416 tasks      | elapsed:  2.2min


[Parallel(n_jobs=16)]: Done 13581 tasks      | elapsed:  2.2min


[Parallel(n_jobs=16)]: Done 13746 tasks      | elapsed:  2.2min


[Parallel(n_jobs=16)]: Done 13913 tasks      | elapsed:  2.3min


[Parallel(n_jobs=16)]: Done 14080 tasks      | elapsed:  2.3min


[Parallel(n_jobs=16)]: Done 14249 tasks      | elapsed:  2.3min


[Parallel(n_jobs=16)]: Done 14418 tasks      | elapsed:  2.3min


[Parallel(n_jobs=16)]: Done 14589 tasks      | elapsed:  2.4min


[Parallel(n_jobs=16)]: Done 14760 tasks      | elapsed:  2.4min


[Parallel(n_jobs=16)]: Done 14933 tasks      | elapsed:  2.4min


[Parallel(n_jobs=16)]: Done 15106 tasks      | elapsed:  2.4min


[Parallel(n_jobs=16)]: Done 15281 tasks      | elapsed:  2.5min


[Parallel(n_jobs=16)]: Done 15456 tasks      | elapsed:  2.5min


[Parallel(n_jobs=16)]: Done 15633 tasks      | elapsed:  2.5min


[Parallel(n_jobs=16)]: Done 15810 tasks      | elapsed:  2.6min


[Parallel(n_jobs=16)]: Done 15989 tasks      | elapsed:  2.6min


[Parallel(n_jobs=16)]: Done 16168 tasks      | elapsed:  2.6min




[Parallel(n_jobs=16)]: Done 16349 tasks      | elapsed:  2.6min


[Parallel(n_jobs=16)]: Done 16530 tasks      | elapsed:  2.7min


[Parallel(n_jobs=16)]: Done 16713 tasks      | elapsed:  2.7min


[Parallel(n_jobs=16)]: Done 16896 tasks      | elapsed:  2.7min


[Parallel(n_jobs=16)]: Done 17081 tasks      | elapsed:  2.7min


[Parallel(n_jobs=16)]: Done 17266 tasks      | elapsed:  2.8min


[Parallel(n_jobs=16)]: Done 17453 tasks      | elapsed:  2.8min


[Parallel(n_jobs=16)]: Done 17640 tasks      | elapsed:  2.8min


[Parallel(n_jobs=16)]: Done 17829 tasks      | elapsed:  2.9min


[Parallel(n_jobs=16)]: Done 18018 tasks      | elapsed:  2.9min


[Parallel(n_jobs=16)]: Done 18209 tasks      | elapsed:  2.9min


[Parallel(n_jobs=16)]: Done 18400 tasks      | elapsed:  3.0min


[Parallel(n_jobs=16)]: Done 18593 tasks      | elapsed:  3.0min


[Parallel(n_jobs=16)]: Done 18786 tasks      | elapsed:  3.0min


[Parallel(n_jobs=16)]: Done 18981 tasks      | elapsed:  3.1min


[Parallel(n_jobs=16)]: Done 19176 tasks      | elapsed:  3.1min


[Parallel(n_jobs=16)]: Done 19373 tasks      | elapsed:  3.1min


[Parallel(n_jobs=16)]: Done 19570 tasks      | elapsed:  3.2min


[Parallel(n_jobs=16)]: Done 19769 tasks      | elapsed:  3.2min


[Parallel(n_jobs=16)]: Done 19968 tasks      | elapsed:  3.2min


[Parallel(n_jobs=16)]: Done 20169 tasks      | elapsed:  3.2min


[Parallel(n_jobs=16)]: Done 20370 tasks      | elapsed:  3.3min


[Parallel(n_jobs=16)]: Done 20573 tasks      | elapsed:  3.3min


[Parallel(n_jobs=16)]: Done 20776 tasks      | elapsed:  3.3min


[Parallel(n_jobs=16)]: Done 20981 tasks      | elapsed:  3.4min


[Parallel(n_jobs=16)]: Done 21186 tasks      | elapsed:  3.4min


[Parallel(n_jobs=16)]: Done 21393 tasks      | elapsed:  3.4min


[Parallel(n_jobs=16)]: Done 21600 tasks      | elapsed:  3.5min


[Parallel(n_jobs=16)]: Done 21809 tasks      | elapsed:  3.5min


[Parallel(n_jobs=16)]: Done 22018 tasks      | elapsed:  3.5min


[Parallel(n_jobs=16)]: Done 22229 tasks      | elapsed:  3.6min


[Parallel(n_jobs=16)]: Done 22440 tasks      | elapsed:  3.6min


[Parallel(n_jobs=16)]: Done 22653 tasks      | elapsed:  3.6min


[Parallel(n_jobs=16)]: Done 22692 out of 22692 | elapsed:  3.6min finished


Saved train_features.parquet with shape (22692, 1955) in 234.1s


Feature extraction (train) complete.


In [8]:
# Debug cell: look at the cached train feature parquet, then call extract_one
# directly on a single clip to see which keys it returns.
import pandas as pd
import numpy as np
from pathlib import Path

pq = Path('train_features.parquet')
if not pq.exists():
    print('train_features.parquet not found')
else:
    # Show the overall shape, the leading column names and a few rows
    dfp = pd.read_parquet(pq)
    print('parquet shape:', dfp.shape)
    print('columns (first 20):', list(dfp.columns[:20]))
    print(dfp.head(3))

# Feed the first training row to the extractor as a plain dict; a direct call
# lets any exception raised inside extract_one surface in this cell.
row0 = train_df[['clip', 'path']].iloc[0].to_dict()
print('Testing extract_one on:', row0['clip'])
feat0 = extract_one(row0)
n_keys = len(feat0)
print('n_keys:', n_keys)
print('keys sample:', list(feat0.keys())[:20])
print('n_frames:', feat0.get('n_frames'))

# Three or fewer entries means essentially no features came back
if n_keys <= 3:
    print('Warning: feature dict minimal. Investigate mel/logS sizes and exceptions inside extract_one.')

parquet shape: (22692, 2)
columns (first 20): ['clip', 'n_frames']
                                   clip  n_frames
0  20090328_000000_002s3ms_TRAIN0_0.aif         0
1  20090328_000000_010s8ms_TRAIN1_0.aif         0
2  20090328_000000_021s6ms_TRAIN2_0.aif         0
Testing extract_one on: 20090328_000000_002s3ms_TRAIN0_0.aif
n_keys: 2
keys sample: ['clip', 'n_frames']
n_frames: 0


In [9]:
# Deep debug: load one AIFF clip with both readers, then walk the feature
# pipeline (resample -> bandpass -> log-mel -> aggregate) step by step.
import soundfile as sf
import librosa
import numpy as np
from pathlib import Path


def _min_max(sig):
    """Return (min, max) of sig as floats, or (None, None) if sig is empty."""
    if sig.size:
        return float(np.min(sig)), float(np.max(sig))
    return None, None


sample_row = train_df.iloc[0]
p = sample_row['path']
print('Path:', p)

# 1) Header metadata only
try:
    info = sf.info(p)
    print('sf.info:', info)
except Exception as e:
    print('sf.info failed:', e)

# 2) Full read via soundfile; fall back to an empty signal on any failure
try:
    y_sf, sr_sf = sf.read(p, dtype='float32', always_2d=False)
    sf_lo, sf_hi = _min_max(y_sf)
    print('sf.read ok. shape:', y_sf.shape, 'sr:', sr_sf, 'min/max:', sf_lo, sf_hi)
except Exception as e:
    print('sf.read failed:', e)
    y_sf, sr_sf = np.array([], dtype=np.float32), None

# 3) Full read via librosa at the native sample rate; same fallback
try:
    y_lb, sr_lb = librosa.load(p, sr=None, mono=True)
    lb_lo, lb_hi = _min_max(y_lb)
    print('librosa.load ok. len:', len(y_lb), 'sr:', sr_lb, 'min/max:', lb_lo, lb_hi)
except Exception as e:
    print('librosa.load failed:', e)
    y_lb, sr_lb = np.array([], dtype=np.float32), None

# Prefer the soundfile signal when it is non-empty, otherwise use librosa's
sf_usable = y_sf is not None and y_sf.size > 0
if sf_usable:
    y0, sr0, chosen = y_sf, sr_sf, 'soundfile'
else:
    y0, sr0, chosen = y_lb, sr_lb, 'librosa'
print('Chosen source:', chosen, 'len:', (len(y0) if y0 is not None else None), 'sr:', sr0)

if y0 is None or y0.size == 0 or sr0 is None:
    print('No audio loaded to proceed pipeline')
else:
    # Collapse a 2-D (multi-channel) read to mono by averaging over axis 1
    if y0.ndim == 2:
        y0 = y0.mean(axis=1)
    # Resample, bandpass, mel
    y1, _ = ensure_sr(y0, sr0, SR_TARGET)
    print('After resample len:', len(y1))
    y2 = bandpass(y1, SR_TARGET, BP_LO, BP_HI, order=4)
    print('After bandpass len:', len(y2), 'min/max:', float(np.min(y2)), float(np.max(y2)))
    try:
        logS, d1, d2 = logmel_features(y2, SR_TARGET)
        print('logS shape:', logS.shape, 'd1:', d1.shape, 'd2:', d2.shape, 'n_frames:', logS.shape[1])
        feats = aggregate_stats(logS, 'mel')
        print('aggregate mel keys:', len(feats))
    except Exception as e:
        print('mel computation failed:', e)

Path: train2/20090328_000000_002s3ms_TRAIN0_0.aif
sf.info: train2/20090328_000000_002s3ms_TRAIN0_0.aif
samplerate: 2000 Hz
channels: 1
duration: 2.000 s
format: AIFF (Apple/SGI) [AIFF]
subtype: Signed 16 bit PCM [PCM_16]
sf.read ok. shape: (4000,) sr: 2000 min/max: -0.04150390625 -0.00439453125
librosa.load ok. len: 4000 sr: 2000 min/max: -0.04150390625 -0.00439453125
Chosen source: soundfile len: 4000 sr: 2000
After resample len: 4000
After bandpass len: 4000 min/max: -0.009670577943325043 0.007527660112828016
logS shape: (80, 63) d1: (80, 63) d2: (80, 63) n_frames: 63
aggregate mel keys: 642


In [14]:
# Validate the cached train features, build (or reuse) the test feature cache,
# then give both frames identical feature columns in identical order.
import pandas as pd, numpy as np, os, time
from pathlib import Path

t0 = time.time()
train_feat_pq = Path('train_features.parquet')
assert train_feat_pq.exists(), 'train_features.parquet missing; run extraction first.'
train_feat = pd.read_parquet(train_feat_pq)
print('Loaded train_features.parquet:', train_feat.shape)
# Reject a degenerate cache (e.g. one holding only 'clip' and 'n_frames')
assert train_feat.shape[1] > 1000, f'Unexpected low feature count: {train_feat.shape}'
assert any(c.startswith('mel_b00_') for c in train_feat.columns), 'mel_b00_* columns not found'
if 'failed' in train_feat.columns:
    n_failed = int((train_feat['failed']==1).sum())
    print('Train failed rows:', n_failed)
else:
    n_failed = 0

# Extract test features if missing or invalid
test_feat_pq = Path('test_features.parquet')
need_test = True
if test_feat_pq.exists():
    try:
        _t = pd.read_parquet(test_feat_pq)
    except Exception as exc:
        # Unreadable cache: report why, then discard it and regenerate below
        print('Could not read existing test_features; regenerating...', repr(exc))
        test_feat_pq.unlink(missing_ok=True)
    else:
        # A reusable cache must be wide enough, have one row per test clip,
        # and describe exactly the clips listed in test_df.
        cache_ok = (
            _t.shape[1] > 1000
            and len(_t) == len(test_df)
            and 'clip' in _t.columns
            and set(_t['clip']) == set(test_df['clip'])
        )
        if cache_ok:
            need_test = False
            print('Found valid test_features.parquet:', _t.shape)
        else:
            print('Existing test_features invalid; will regenerate...')
            test_feat_pq.unlink(missing_ok=True)

if need_test:
    _ = parallel_extract_joblib(test_df[['clip','path']], test_feat_pq, n_jobs=16, batch_log=200)
test_feat = pd.read_parquet(test_feat_pq)
print('Loaded test_features.parquet:', test_feat.shape)
if 'failed' in test_feat.columns:
    print('Test failed rows:', int((test_feat['failed']==1).sum()))

# Align columns: ensure test has same feature columns as train (excluding clip)
drop_cols = ['failed'] if 'failed' in train_feat.columns else []
train_cols = [c for c in train_feat.columns if c not in drop_cols]
test_cols = [c for c in test_feat.columns if c not in drop_cols]

# 'clip' is the join key and must exist on both sides
assert 'clip' in train_cols and 'clip' in test_cols, 'clip column missing'
feat_cols = [c for c in train_cols if c != 'clip']

# Report what differs between the two column sets before aligning
test_only_cols = {c for c in test_cols if c != 'clip'}
missing_in_test = sorted(set(feat_cols) - test_only_cols)
if missing_in_test:
    print('Adding missing cols to test:', len(missing_in_test))
extra_in_test = sorted(test_only_cols - set(feat_cols))
if extra_in_test:
    print('Dropping extra cols from test:', len(extra_in_test))

# One reindex adds the missing columns (filled with 0.0), drops the extras and
# puts everything in train order. Inserting columns one at a time would
# fragment the frame and trigger pandas' PerformanceWarning on wide data.
aligned_cols = ['clip'] + feat_cols
test_feat = test_feat.reindex(columns=aligned_cols, fill_value=0.0)
train_feat = train_feat[aligned_cols]
print('Aligned shapes:', train_feat.shape, test_feat.shape)

# Save aligned versions for modeling convenience
train_feat.to_parquet('train_features_aligned.parquet', index=False)
test_feat.to_parquet('test_features_aligned.parquet', index=False)
print('Saved aligned parquet files in', f'{time.time()-t0:.1f}s')

Loaded train_features.parquet: (22692, 1955)
Train failed rows: 0
Extracting features for 25149 clips -> test_features.parquet with joblib loky, n_jobs=16


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Batch computation too fast (0.036685943603515625s.) Setting batch_size=2.
[Parallel(n_jobs=16)]: Done   9 tasks      | elapsed:    0.1s


[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done  29 tasks      | elapsed:    0.4s


[Parallel(n_jobs=16)]: Done  48 tasks      | elapsed:    0.6s
[Parallel(n_jobs=16)]: Done  74 tasks      | elapsed:    0.8s


[Parallel(n_jobs=16)]: Done 100 tasks      | elapsed:    0.9s
[Parallel(n_jobs=16)]: Done 130 tasks      | elapsed:    1.1s


[Parallel(n_jobs=16)]: Done 160 tasks      | elapsed:    1.3s
[Parallel(n_jobs=16)]: Done 194 tasks      | elapsed:    1.4s


[Parallel(n_jobs=16)]: Done 228 tasks      | elapsed:    1.6s


[Parallel(n_jobs=16)]: Done 266 tasks      | elapsed:    1.9s


[Parallel(n_jobs=16)]: Done 304 tasks      | elapsed:    2.2s
[Parallel(n_jobs=16)]: Done 346 tasks      | elapsed:    2.4s


[Parallel(n_jobs=16)]: Done 388 tasks      | elapsed:    2.6s
[Parallel(n_jobs=16)]: Done 434 tasks      | elapsed:    2.8s


[Parallel(n_jobs=16)]: Done 480 tasks      | elapsed:    3.0s


[Parallel(n_jobs=16)]: Done 530 tasks      | elapsed:    3.2s


[Parallel(n_jobs=16)]: Done 580 tasks      | elapsed:    3.4s


[Parallel(n_jobs=16)]: Done 634 tasks      | elapsed:    3.7s


[Parallel(n_jobs=16)]: Done 688 tasks      | elapsed:    4.0s


[Parallel(n_jobs=16)]: Done 746 tasks      | elapsed:    4.3s


[Parallel(n_jobs=16)]: Done 804 tasks      | elapsed:    4.7s


[Parallel(n_jobs=16)]: Done 866 tasks      | elapsed:    5.0s


[Parallel(n_jobs=16)]: Done 928 tasks      | elapsed:    5.4s


[Parallel(n_jobs=16)]: Done 994 tasks      | elapsed:    5.8s


[Parallel(n_jobs=16)]: Done 1060 tasks      | elapsed:    6.2s


[Parallel(n_jobs=16)]: Done 1130 tasks      | elapsed:    6.6s


[Parallel(n_jobs=16)]: Done 1200 tasks      | elapsed:    7.0s


[Parallel(n_jobs=16)]: Done 1274 tasks      | elapsed:    7.4s


[Parallel(n_jobs=16)]: Done 1348 tasks      | elapsed:    7.8s


[Parallel(n_jobs=16)]: Done 1426 tasks      | elapsed:    8.3s


[Parallel(n_jobs=16)]: Done 1504 tasks      | elapsed:    8.7s


[Parallel(n_jobs=16)]: Done 1586 tasks      | elapsed:    9.2s


[Parallel(n_jobs=16)]: Done 1668 tasks      | elapsed:    9.7s


[Parallel(n_jobs=16)]: Done 1754 tasks      | elapsed:   10.2s


[Parallel(n_jobs=16)]: Done 1840 tasks      | elapsed:   10.6s


[Parallel(n_jobs=16)]: Done 1930 tasks      | elapsed:   11.1s


[Parallel(n_jobs=16)]: Done 2020 tasks      | elapsed:   11.6s


[Parallel(n_jobs=16)]: Done 2114 tasks      | elapsed:   12.1s


[Parallel(n_jobs=16)]: Done 2208 tasks      | elapsed:   12.6s


[Parallel(n_jobs=16)]: Done 2306 tasks      | elapsed:   13.2s


[Parallel(n_jobs=16)]: Done 2404 tasks      | elapsed:   13.7s


[Parallel(n_jobs=16)]: Done 2506 tasks      | elapsed:   14.2s


[Parallel(n_jobs=16)]: Done 2608 tasks      | elapsed:   14.8s


[Parallel(n_jobs=16)]: Done 2714 tasks      | elapsed:   15.4s


[Parallel(n_jobs=16)]: Done 2820 tasks      | elapsed:   15.8s


[Parallel(n_jobs=16)]: Done 2930 tasks      | elapsed:   16.3s


[Parallel(n_jobs=16)]: Done 3040 tasks      | elapsed:   16.8s


[Parallel(n_jobs=16)]: Done 3154 tasks      | elapsed:   17.2s


[Parallel(n_jobs=16)]: Done 3268 tasks      | elapsed:   17.7s


[Parallel(n_jobs=16)]: Done 3386 tasks      | elapsed:   18.2s


[Parallel(n_jobs=16)]: Done 3504 tasks      | elapsed:   18.7s


[Parallel(n_jobs=16)]: Done 3626 tasks      | elapsed:   19.3s


[Parallel(n_jobs=16)]: Done 3748 tasks      | elapsed:   20.1s


[Parallel(n_jobs=16)]: Done 3874 tasks      | elapsed:   20.8s


[Parallel(n_jobs=16)]: Done 4000 tasks      | elapsed:   21.6s


[Parallel(n_jobs=16)]: Done 4130 tasks      | elapsed:   22.5s


[Parallel(n_jobs=16)]: Done 4260 tasks      | elapsed:   23.2s


[Parallel(n_jobs=16)]: Done 4394 tasks      | elapsed:   23.9s


[Parallel(n_jobs=16)]: Done 4528 tasks      | elapsed:   24.6s


[Parallel(n_jobs=16)]: Done 4666 tasks      | elapsed:   25.4s


[Parallel(n_jobs=16)]: Done 4804 tasks      | elapsed:   26.1s


[Parallel(n_jobs=16)]: Done 4946 tasks      | elapsed:   26.9s


[Parallel(n_jobs=16)]: Done 5088 tasks      | elapsed:   27.7s


[Parallel(n_jobs=16)]: Done 5234 tasks      | elapsed:   28.6s


[Parallel(n_jobs=16)]: Done 5380 tasks      | elapsed:   29.4s


[Parallel(n_jobs=16)]: Done 5530 tasks      | elapsed:   30.3s


[Parallel(n_jobs=16)]: Done 5680 tasks      | elapsed:   31.0s


[Parallel(n_jobs=16)]: Done 5834 tasks      | elapsed:   31.8s


[Parallel(n_jobs=16)]: Done 5988 tasks      | elapsed:   32.7s


[Parallel(n_jobs=16)]: Done 6146 tasks      | elapsed:   33.6s


[Parallel(n_jobs=16)]: Done 6304 tasks      | elapsed:   34.6s


[Parallel(n_jobs=16)]: Done 6466 tasks      | elapsed:   35.7s


[Parallel(n_jobs=16)]: Done 6628 tasks      | elapsed:   36.6s


[Parallel(n_jobs=16)]: Done 6794 tasks      | elapsed:   37.6s


[Parallel(n_jobs=16)]: Done 6960 tasks      | elapsed:   38.5s


[Parallel(n_jobs=16)]: Done 7130 tasks      | elapsed:   39.6s


[Parallel(n_jobs=16)]: Done 7300 tasks      | elapsed:   40.6s


[Parallel(n_jobs=16)]: Done 7474 tasks      | elapsed:   41.7s


[Parallel(n_jobs=16)]: Done 7648 tasks      | elapsed:   42.8s


[Parallel(n_jobs=16)]: Done 7826 tasks      | elapsed:   43.8s


[Parallel(n_jobs=16)]: Done 8004 tasks      | elapsed:   45.0s


[Parallel(n_jobs=16)]: Done 8186 tasks      | elapsed:   46.2s


[Parallel(n_jobs=16)]: Done 8368 tasks      | elapsed:   47.2s


[Parallel(n_jobs=16)]: Done 8554 tasks      | elapsed:   48.3s


[Parallel(n_jobs=16)]: Done 8740 tasks      | elapsed:   49.3s


[Parallel(n_jobs=16)]: Done 8930 tasks      | elapsed:   50.3s




[Parallel(n_jobs=16)]: Done 9120 tasks      | elapsed:   51.4s


[Parallel(n_jobs=16)]: Done 9314 tasks      | elapsed:   52.5s


[Parallel(n_jobs=16)]: Done 9508 tasks      | elapsed:   53.6s


[Parallel(n_jobs=16)]: Done 9706 tasks      | elapsed:   54.8s


[Parallel(n_jobs=16)]: Done 9904 tasks      | elapsed:   55.9s


[Parallel(n_jobs=16)]: Done 10106 tasks      | elapsed:   57.0s


[Parallel(n_jobs=16)]: Done 10308 tasks      | elapsed:   58.2s


[Parallel(n_jobs=16)]: Done 10514 tasks      | elapsed:   59.5s


[Parallel(n_jobs=16)]: Done 10720 tasks      | elapsed:  1.0min


[Parallel(n_jobs=16)]: Done 10930 tasks      | elapsed:  1.0min


[Parallel(n_jobs=16)]: Done 11140 tasks      | elapsed:  1.1min


[Parallel(n_jobs=16)]: Done 11354 tasks      | elapsed:  1.1min


[Parallel(n_jobs=16)]: Done 11568 tasks      | elapsed:  1.1min


[Parallel(n_jobs=16)]: Done 11786 tasks      | elapsed:  1.1min




[Parallel(n_jobs=16)]: Done 12004 tasks      | elapsed:  1.1min


[Parallel(n_jobs=16)]: Done 12226 tasks      | elapsed:  1.2min


[Parallel(n_jobs=16)]: Done 12448 tasks      | elapsed:  1.2min


[Parallel(n_jobs=16)]: Done 12674 tasks      | elapsed:  1.2min


[Parallel(n_jobs=16)]: Done 12900 tasks      | elapsed:  1.2min


[Parallel(n_jobs=16)]: Done 13130 tasks      | elapsed:  1.2min


[Parallel(n_jobs=16)]: Done 13360 tasks      | elapsed:  1.3min


[Parallel(n_jobs=16)]: Done 13594 tasks      | elapsed:  1.3min


[Parallel(n_jobs=16)]: Done 13828 tasks      | elapsed:  1.3min


[Parallel(n_jobs=16)]: Done 14066 tasks      | elapsed:  1.3min


[Parallel(n_jobs=16)]: Done 14304 tasks      | elapsed:  1.4min


[Parallel(n_jobs=16)]: Done 14546 tasks      | elapsed:  1.4min


[Parallel(n_jobs=16)]: Done 14788 tasks      | elapsed:  1.4min


[Parallel(n_jobs=16)]: Done 15034 tasks      | elapsed:  1.4min


[Parallel(n_jobs=16)]: Done 15280 tasks      | elapsed:  1.4min


[Parallel(n_jobs=16)]: Done 15530 tasks      | elapsed:  1.5min


[Parallel(n_jobs=16)]: Done 15780 tasks      | elapsed:  1.5min


[Parallel(n_jobs=16)]: Done 16034 tasks      | elapsed:  1.5min


[Parallel(n_jobs=16)]: Done 16288 tasks      | elapsed:  1.5min


[Parallel(n_jobs=16)]: Done 16546 tasks      | elapsed:  1.6min


[Parallel(n_jobs=16)]: Done 16804 tasks      | elapsed:  1.6min


[Parallel(n_jobs=16)]: Done 17066 tasks      | elapsed:  1.6min


[Parallel(n_jobs=16)]: Done 17328 tasks      | elapsed:  1.6min


[Parallel(n_jobs=16)]: Done 17594 tasks      | elapsed:  1.7min


[Parallel(n_jobs=16)]: Done 17860 tasks      | elapsed:  1.7min


[Parallel(n_jobs=16)]: Done 18130 tasks      | elapsed:  1.7min


[Parallel(n_jobs=16)]: Done 18400 tasks      | elapsed:  1.7min


[Parallel(n_jobs=16)]: Done 18674 tasks      | elapsed:  1.8min


[Parallel(n_jobs=16)]: Done 18948 tasks      | elapsed:  1.8min


[Parallel(n_jobs=16)]: Done 19226 tasks      | elapsed:  1.8min


[Parallel(n_jobs=16)]: Done 19504 tasks      | elapsed:  1.8min


[Parallel(n_jobs=16)]: Done 19786 tasks      | elapsed:  1.9min


[Parallel(n_jobs=16)]: Done 20068 tasks      | elapsed:  1.9min


[Parallel(n_jobs=16)]: Done 20354 tasks      | elapsed:  1.9min


[Parallel(n_jobs=16)]: Done 20640 tasks      | elapsed:  1.9min


[Parallel(n_jobs=16)]: Done 20930 tasks      | elapsed:  2.0min


[Parallel(n_jobs=16)]: Done 21220 tasks      | elapsed:  2.0min


[Parallel(n_jobs=16)]: Done 21514 tasks      | elapsed:  2.0min


[Parallel(n_jobs=16)]: Done 21808 tasks      | elapsed:  2.1min


[Parallel(n_jobs=16)]: Done 22106 tasks      | elapsed:  2.1min


[Parallel(n_jobs=16)]: Done 22404 tasks      | elapsed:  2.1min


[Parallel(n_jobs=16)]: Done 22706 tasks      | elapsed:  2.2min


[Parallel(n_jobs=16)]: Done 23008 tasks      | elapsed:  2.2min


[Parallel(n_jobs=16)]: Done 23314 tasks      | elapsed:  2.2min


[Parallel(n_jobs=16)]: Done 23620 tasks      | elapsed:  2.2min


[Parallel(n_jobs=16)]: Done 23930 tasks      | elapsed:  2.3min


[Parallel(n_jobs=16)]: Done 24240 tasks      | elapsed:  2.3min


[Parallel(n_jobs=16)]: Done 24554 tasks      | elapsed:  2.3min


[Parallel(n_jobs=16)]: Done 24868 tasks      | elapsed:  2.4min


[Parallel(n_jobs=16)]: Done 25118 out of 25149 | elapsed:  2.4min remaining:    0.2s
[Parallel(n_jobs=16)]: Done 25149 out of 25149 | elapsed:  2.4min finished


Saved test_features.parquet with shape (25149, 1955) in 160.8s


Loaded test_features.parquet: (25149, 1955)
Test failed rows: 0


Aligned shapes: (22692, 1954) (25149, 1954)


Saved aligned parquet files in 175.2s


In [None]:
# Modeling: StratifiedGroupKFold LightGBM (sklearn API for stable logging), OOF AUC, test prediction, submission
import pandas as pd, numpy as np, time, os
from pathlib import Path
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.utils import check_random_state

t0 = time.time()
# Load aligned features
train_feat = pd.read_parquet('train_features_aligned.parquet')
test_feat = pd.read_parquet('test_features_aligned.parquet')
# Merge labels and groups (train_df is defined outside this cell).
# validate='many_to_one' makes the merge fail loudly if a clip appears more
# than once in aux, instead of silently duplicating training rows.
aux = train_df[['clip','label','group_hour','group_date']].copy()
df = train_feat.merge(aux, on='clip', how='left', validate='many_to_one')
assert df['label'].notnull().all(), 'Missing labels after merge'
X_cols = [c for c in df.columns if c not in ['clip','label','group_hour','group_date']]
print('n_feature_cols:', len(X_cols))

# Remove constant cols
vt = VarianceThreshold(threshold=0.0)
X = df[X_cols].astype('float32').values
X = vt.fit_transform(X)
kept_mask = vt.get_support()
X_cols_kept = [c for c, k in zip(X_cols, kept_mask) if k]
print('Removed constant cols:', len(X_cols) - len(X_cols_kept))

# Apply the selector fitted on train to test so both matrices share columns
Xt = test_feat[X_cols].astype('float32').values
Xt = vt.transform(Xt)

y = df['label'].values.astype(np.int32)
# CV groups: rows sharing a group_hour value never straddle train/validation
groups = df['group_hour'].values

# LightGBM (sklearn API for better notebook logging)
try:
    import lightgbm as lgb
except ImportError:
    # Only a missing package is worth a pip install; other failures should surface
    import sys, subprocess
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'lightgbm'])
    import lightgbm as lgb
from lightgbm import LGBMClassifier

# Weight positives by the negative/positive ratio to offset class imbalance
pos = (y==1).sum(); neg = (y==0).sum()
scale_pos_weight = float(neg)/float(pos)
print('Class counts pos/neg:', pos, neg, 'scale_pos_weight:', round(scale_pos_weight,3))

clf_params = {
    'objective': 'binary',
    'learning_rate': 0.05,
    'n_estimators': 4000,  # upper bound; early stopping picks the actual count
    'num_leaves': 63,
    'max_depth': -1,
    'min_child_samples': 100,
    'colsample_bytree': 0.7,
    'subsample': 0.8,
    'subsample_freq': 1,
    'reg_alpha': 0.3,
    'reg_lambda': 12.0,
    'scale_pos_weight': scale_pos_weight,
    'n_jobs': 8,
    'random_state': 1337
}

n_splits = 5
skf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=42)
oof = np.zeros(len(df), dtype=np.float32)
test_preds = np.zeros(len(test_feat), dtype=np.float32)
fold_aucs = []

for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y, groups), 1):
    t_fold = time.time()
    y_tr, y_va = y[tr_idx], y[va_idx]
    pos_tr, pos_va = int(y_tr.sum()), int(y_va.sum())
    print(f'Fold {fold}/{n_splits} | train:{len(tr_idx)} (pos {pos_tr}) valid:{len(va_idx)} (pos {pos_va})', flush=True)
    if pos_va < 50:
        print('Warning: low positives in validation fold')
    model = LGBMClassifier(**clf_params)
    # Use callbacks for early stopping/logging to support older sklearn API versions
    callbacks = [
        lgb.early_stopping(stopping_rounds=200, verbose=True),
        lgb.log_evaluation(period=200)
    ]
    model.fit(
        X[tr_idx], y_tr,
        eval_set=[(X[va_idx], y_va)],
        eval_metric='auc',
        callbacks=callbacks
    )
    # Fall back to the full tree budget if no best iteration was recorded
    best_iter = getattr(model, 'best_iteration_', None)
    if best_iter is None:
        best_iter = clf_params['n_estimators']
    p_va = model.predict_proba(X[va_idx], num_iteration=best_iter)[:,1]
    oof[va_idx] = p_va.astype(np.float32)
    auc = roc_auc_score(y_va, p_va)
    fold_aucs.append(auc)
    print(f'  Fold {fold} AUC: {auc:.5f} | best_iter {best_iter} | time {time.time()-t_fold:.1f}s', flush=True)
    # Test prediction is the plain average of the per-fold models
    test_preds += model.predict_proba(Xt, num_iteration=best_iter)[:,1].astype(np.float32) / n_splits

oof_auc = roc_auc_score(y, oof)
print('OOF AUC:', f'{oof_auc:.5f}', 'folds:', [f'{a:.5f}' for a in fold_aucs], 'mean±std:', f'{np.mean(fold_aucs):.5f}±{np.std(fold_aucs):.5f}')

# Sanity check with a single date-based hold-out: validate on the first
# (smallest) group_date value and train on all remaining dates.
dates = df['group_date'].values
date_uni = np.unique(dates)
if len(date_uni) >= 2:
    mask_date = (dates == date_uni[0])
    tr_idx = np.where(~mask_date)[0]; va_idx = np.where(mask_date)[0]
    # roc_auc_score raises when the hold-out has a single class; skip in that
    # case so an optional check cannot abort the cell before the submission
    # file is written.
    if va_idx.size > 0 and tr_idx.size > 0 and np.unique(y[va_idx]).size == 2:
        model_tmp = LGBMClassifier(**clf_params)
        model_tmp.fit(X[tr_idx], y[tr_idx], eval_set=[(X[va_idx], y[va_idx])], eval_metric='auc', callbacks=[lgb.early_stopping(100, verbose=False)])
        best_iter_tmp = getattr(model_tmp, 'best_iteration_', None)
        if best_iter_tmp is None:
            best_iter_tmp = clf_params['n_estimators']
        p_va = model_tmp.predict_proba(X[va_idx], num_iteration=best_iter_tmp)[:,1]
        auc_date = roc_auc_score(y[va_idx], p_va)
        print('Date-based sanity AUC:', f'{auc_date:.5f}', 'date:', date_uni[0])
    else:
        print('Date-based sanity check skipped: hold-out date', date_uni[0], 'is empty or lacks both classes')

# Build submission in the row order of sampleSubmission.csv
sample = pd.read_csv('sampleSubmission.csv')
sub = pd.DataFrame({'clip': test_feat['clip'].values, 'probability': test_preds})
sub = sample[['clip']].merge(sub, on='clip', how='left')
miss = sub['probability'].isna().sum()
if miss:
    # Clips without a prediction get the mean predicted probability
    print('Warning: missing predictions for', miss, 'clips; filling with mean')
    sub['probability'] = sub['probability'].fillna(float(test_preds.mean()))
sub.to_csv('submission.csv', index=False)
print('Saved submission.csv', sub.shape, 'in', f'{time.time()-t0:.1f}s')

n_feature_cols: 1953


Removed constant cols: 0


Class counts pos/neg: 2266 20426 scale_pos_weight: 9.014
Fold 1/5 | train:19139 (pos 1872) valid:3553 (pos 394)
