# Plan: MLSP 2013 Bird Classification (Medal-Oriented)

Goals:
- Build a strong multi-label classifier using provided assets (spectrogram bitmaps, WAVs, folds, species list).
- Optimize for ROC-AUC with reliable CV and efficient training.

Data understanding (to verify):
- essential_data/src_wavs/: raw 10s audio clips.
- supplemental_data/spectrograms/ and filtered_spectrograms/: 1 image per clip.
- CVfolds_2.txt: cross-validation assignments (likely per recording).
- rec_id2filename.txt: map between recording IDs and filenames.
- species_list.txt: ordered list of target classes.
- rec_labels_test_hidden.txt: indicates test set identifiers; true labels hidden.

Initial approach (fast, competitive):
1) Use spectrogram images as inputs; multi-label targets per recording.
2) Parse folds and train/val splits from CVfolds_2.txt; determine train/test split via rec_labels_test_hidden and rec_id2filename.
3) Model A (baseline, quick):
   - Extract image features (e.g., statistics, HOG, color/texture) and train LightGBM/Logistic multi-label one-vs-rest.
   - Purpose: quick sanity check; establish CV and submission format.
4) Model B (primary):
   - Fine-tune a lightweight CNN (e.g., timm EfficientNet-B0/ConvNeXt-Tiny) on spectrogram images.
   - Multi-label BCEWithLogits, stratified folds from CVfolds_2.
   - Augmentations: light (random time/freq masking via image ops, flips limited to time axis).
   - Inference: TTA horizontal shifts/crops if necessary.
5) Ensembling:
   - Average probabilities from Model A + Model B (and possibly filtered vs unfiltered spectrogram variants) for lift.

Validation protocol:
- Use provided CVfolds_2 to avoid leakage, monitor macro ROC-AUC per fold.
- Log times per fold; early stop if overfitting.

Deliverables:
- submission.csv with columns: [file, species_1, ..., species_K] (confirm exact header from sample_submission.csv).

Next steps:
1) Inspect/parse files (species list, folds, id <-> filename, sample submission) to confirm target ordering and split.
2) Implement data loader mapping image paths to labels.
3) Baseline feature model for quick CV.
4) CNN training with careful logging.
5) Blend and generate submission.

In [2]:
# Inspect core files and prepare mappings for baseline (robust parsing)
import pandas as pd, numpy as np, os, json, sys, time, re
from pathlib import Path

DATA_DIR = Path('.')
ess = DATA_DIR / 'essential_data'
supp = DATA_DIR / 'supplemental_data'

def log(msg):
    print(f"[INFO] {msg}")

# Sample submission defines exact column order and species list
sub_path = DATA_DIR / 'sample_submission.csv'
df_sub = pd.read_csv(sub_path)
assert 'Id' in df_sub.columns[0], "First column in sample_submission must be Id"
species = list(df_sub.columns[1:])
n_classes = len(species)
log(f"Sample submission loaded. n_test_rows={len(df_sub)}, n_species={n_classes}. First 5 species: {species[:5]}")

# Load folds + labels (CVfolds_2.txt): rec_id, fold, label_1...label_K
folds_path = ess / 'CVfolds_2.txt'
# Use whitespace separator (space or tab)
df_folds = pd.read_csv(folds_path, header=None, sep=r"\s+", engine='python')
expected_cols = 2 + n_classes
log(f"CVfolds_2.txt shape raw: {df_folds.shape}")
assert df_folds.shape[1] == expected_cols, f"Unexpected columns in CVfolds_2.txt: got {df_folds.shape[1]}, expected {expected_cols}"
df_folds.columns = ['rec_id','fold'] + species
log(f"Loaded CVfolds_2.txt: {df_folds.shape[0]} records, folds: {sorted(df_folds.fold.unique().tolist())}")
log(df_folds.head(3).to_string(index=False))

# Load test IDs (whitespace separated list of rec_ids)
test_ids_path = ess / 'rec_labels_test_hidden.txt'
df_test_ids = pd.read_csv(test_ids_path, header=None, names=['rec_id'], sep=r"\s+", engine='python')
test_ids = set(df_test_ids['rec_id'].tolist())
log(f"Loaded test IDs: {len(test_ids)}")

# Train set = all rec_ids in folds not in test_ids
is_test = df_folds['rec_id'].isin(test_ids)
df_train = df_folds.loc[~is_test].reset_index(drop=True)
df_test_folds = df_folds.loc[is_test].reset_index(drop=True)
log(f"Train records: {df_train.shape[0]}, Test records in folds listing: {df_test_folds.shape[0]}")

# Load rec_id -> filename (whitespace separated)
id2fn_path = ess / 'rec_id2filename.txt'
df_id2fn = pd.read_csv(id2fn_path, header=None, names=['rec_id','filename'], sep=r"\s+", engine='python')
log(f"Loaded id->filename map: {df_id2fn.shape[0]} rows. Sample:\n{df_id2fn.head(3)}")

# Segment features (for baseline) - try tab, then whitespace
seg_feat_path = supp / 'segment_features.txt'
try:
    df_feats = pd.read_csv(seg_feat_path, sep='\t', header=None)
except Exception:
    df_feats = pd.read_csv(seg_feat_path, sep=r"\s+", header=None, engine='python')
log(f"Loaded segment_features: shape {df_feats.shape}")

# Heuristic: first column is rec_id, ensure it matches df_folds
df_feats = df_feats.rename(columns={0:'rec_id'})
if not np.issubdtype(df_feats['rec_id'].dtype, np.number):
    try:
        df_feats['rec_id'] = df_feats['rec_id'].astype(int)
    except Exception as e:
        log(f"Warning: could not cast rec_id to int: {e}")

log(f"Unique rec_ids in features: {df_feats['rec_id'].nunique()}")
log(f"Feature preview:\n{df_feats.head(3)}")

# Align features with train/test IDs
train_ids = set(df_train['rec_id'])
missing_train = len(train_ids - set(df_feats['rec_id']))
missing_test = len(test_ids - set(df_feats['rec_id']))
log(f"Features coverage -> missing train: {missing_train}, missing test: {missing_test}")

# Save quick summaries for reference
summary = {
    'n_species': n_classes,
    'folds': sorted(df_folds.fold.unique().tolist()),
    'n_train': int(df_train.shape[0]),
    'n_test_ids': int(len(test_ids)),
    'features_shape': tuple(df_feats.shape),
    'features_missing_train': int(missing_train),
    'features_missing_test': int(missing_test),
}
print(json.dumps(summary, indent=2))

[INFO] Sample submission loaded. n_test_rows=1216, n_species=1. First 5 species: ['Probability']
[INFO] CVfolds_2.txt shape raw: (323, 1)


AssertionError: Unexpected columns in CVfolds_2.txt: got 1, expected 3

In [3]:
# Peek at raw files to determine correct delimiters and formats
from pathlib import Path

def head_lines(path, n=5):
    print(f"\n===== HEAD of {path} =====")
    with open(path, 'r', errors='ignore') as f:
        for i in range(n):
            line = f.readline()
            if not line: break
            print(line.rstrip('\n'))

ess = Path('essential_data')
supp = Path('supplemental_data')

head_lines('sample_submission.csv', 5)
head_lines(ess / 'species_list.txt', 10)
head_lines(ess / 'CVfolds_2.txt', 10)
head_lines(ess / 'rec_id2filename.txt', 10)
head_lines(ess / 'rec_labels_test_hidden.txt', 10)
head_lines(supp / 'segment_features.txt', 5)


===== HEAD of sample_submission.csv =====
Id,Probability
100,0
101,0
102,0
103,0

===== HEAD of essential_data/species_list.txt =====
class_id,code,species
0,BRCR,Brown Creeper
1,PAWR,Pacific Wren
2,PSFL,Pacific-slope Flycatcher
3,RBNU,Red-breasted Nuthatch
4,DEJU,Dark-eyed Junco
5,OSFL,Olive-sided Flycatcher
6,HETH,Hermit Thrush
7,CBCH,Chestnut-backed Chickadee
8,VATH,Varied Thrush

===== HEAD of essential_data/CVfolds_2.txt =====
rec_id,fold
0,0
1,1
2,0
3,0
4,0
5,0
6,1
7,1
8,1

===== HEAD of essential_data/rec_id2filename.txt =====
rec_id,filename
0,PC1_20090606_050012_0010
1,PC1_20090606_070012_0010
2,PC1_20090705_070000_0010
3,PC1_20100513_043000_0010
4,PC1_20100513_043000_0720
5,PC1_20100606_070000_0010
6,PC1_20100705_070002_0010
7,PC1_20100804_050000_0010
8,PC1_20100804_070000_0010

===== HEAD of essential_data/rec_labels_test_hidden.txt =====
rec_id,[labels]
0,11,12
1,?
2,10
3
4
5
6,?
7,?
8,?

===== HEAD of supplemental_data/segment_features.txt =====
rec_id,[histogram of segme

In [5]:
# Parse labels from rec_labels_test_hidden.txt and inspect mapping to sample submission
import csv
from collections import defaultdict

labels_path = Path('essential_data/rec_labels_test_hidden.txt')
label_rows = []
with open(labels_path, 'r') as f:
    reader = csv.reader(f)
    header = next(reader)  # ['rec_id','[labels]']
    for row in reader:
        if not row:
            continue
        rec_id = int(row[0])
        if len(row) == 1:
            labels = []  # no labels listed
            status = 'known'
        else:
            if row[1].strip() == '?' or row[1].strip() == '[labels]':
                labels = None
                status = 'test'
            else:
                # remaining entries after rec_id are label indices (possibly multiple)
                labels = [int(x) for x in row[1:]]
                status = 'known'
        label_rows.append({'rec_id': rec_id, 'labels': labels, 'status': status})

df_lbl = pd.DataFrame(label_rows)
df_lbl['binary_target'] = df_lbl['labels'].apply(lambda x: np.nan if x is None else (1 if len(x) > 0 else 0))
log(f"Label file parsed: total {len(df_lbl)}, known={df_lbl['status'].eq('known').sum()}, test={df_lbl['status'].eq('test').sum()}")
log(df_lbl.head(10).to_string(index=False))

# Sanity: merge folds with labels
df_folds_csv = pd.read_csv(ess / 'CVfolds_2.txt')
assert list(df_folds_csv.columns) == ['rec_id','fold'], 'Unexpected columns in CVfolds_2.txt'
df_folds_csv['rec_id'] = df_folds_csv['rec_id'].astype(int)
df_all = df_folds_csv.merge(df_lbl[['rec_id','binary_target','status','labels']], on='rec_id', how='left')
log(f"After merge: shape={df_all.shape}; fold counts: {df_all['fold'].value_counts().to_dict()}")
log(df_all.head(10).to_string(index=False))

# Determine train/test ids from labels file (test = status=='test')
train_mask = df_all['status'].eq('known')
test_mask = df_all['status'].eq('test')
df_train_ids = df_all.loc[train_mask, ['rec_id','fold','binary_target','labels']].reset_index(drop=True)
df_test_ids = df_all.loc[test_mask, ['rec_id','fold']].reset_index(drop=True)
log(f"Train IDs: {len(df_train_ids)}, Test IDs: {len(df_test_ids)}")
log(f"Train positives: {int(df_train_ids['binary_target'].sum())}, negatives: {int((df_train_ids['binary_target']==0).sum())}")

# Check correspondence to sample_submission Ids
df_sub = pd.read_csv('sample_submission.csv')
sub_ids = df_sub['Id'].tolist()
log(f"Sample submission Id range: min={min(sub_ids)}, max={max(sub_ids)}, n_unique={len(set(sub_ids))}")
log(f"rec_id range: min={df_all.rec_id.min()}, max={df_all.rec_id.max()}, n_unique={df_all.rec_id.nunique()}")

# Parse segment_features: skip header line, comma-separated. Columns: rec_id, seg_id, feat1..N
seg_feat_path = Path('supplemental_data/segment_features.txt')
df_seg = pd.read_csv(seg_feat_path, header=None, skiprows=1)
assert df_seg.shape[1] >= 3, 'segment_features must have at least 3 columns (rec_id, seg_id, features...)'
df_seg = df_seg.rename(columns={0:'rec_id', 1:'seg_id'})
feature_cols = [c for c in df_seg.columns if c not in ['rec_id','seg_id']]
log(f"Segment features: rows={len(df_seg)}, n_features={len(feature_cols)}")
df_agg = df_seg.groupby('rec_id')[feature_cols].mean().reset_index()
log(f"Aggregated features per rec_id: {df_agg.shape}")

# Coverage check for train/test
miss_train = set(df_train_ids.rec_id) - set(df_agg.rec_id)
miss_test = set(df_test_ids.rec_id) - set(df_agg.rec_id)
log(f"Aggregated features coverage -> missing train: {len(miss_train)}, missing test: {len(miss_test)}")

# Quick probe: does sample_submission Id encode class_id sequence of 19 per rec_id?
first_ids = df_sub['Id'].head(25).tolist()
print("First 25 sample Ids:", first_ids)

[INFO] Label file parsed: total 322, known=258, test=64
[INFO]  rec_id   labels status  binary_target
      0 [11, 12]  known            1.0
      1     None   test            NaN
      2     [10]  known            1.0
      3       []  known            0.0
      4       []  known            0.0
      5       []  known            0.0
      6     None   test            NaN
      7     None   test            NaN
      8     None   test            NaN
      9       []  known            0.0
[INFO] After merge: shape=(322, 5); fold counts: {0: 258, 1: 64}
[INFO]  rec_id  fold  binary_target status   labels
      0     0            1.0  known [11, 12]
      1     1            NaN   test     None
      2     0            1.0  known     [10]
      3     0            0.0  known       []
      4     0            0.0  known       []
      5     0            0.0  known       []
      6     1            NaN   test     None
      7     1            NaN   test     None
      8     1            NaN   

In [7]:
# Baseline: image features + one-vs-rest Logistic Regression; generate submission
from PIL import Image
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
import numpy as np
import pandas as pd
import time
from pathlib import Path

ess = Path('essential_data')
supp = Path('supplemental_data')

# Load id -> filename map
df_id2fn = pd.read_csv(ess / 'rec_id2filename.txt')
id2fn = dict(zip(df_id2fn.rec_id.astype(int), df_id2fn.filename.astype(str)))

def load_img_feature(rec_id, folder='filtered_spectrograms', size=(64,64)):
    fn = id2fn.get(int(rec_id), None)
    if fn is None: return None
    img_path = supp / folder / f"{fn}.bmp"
    if not img_path.exists():
        return None
    try:
        img = Image.open(img_path).convert('L')
    except Exception:
        return None
    # resize preserving aspect ratio in height; then center crop/pad width to target
    Ht = size[0]
    w, h = img.size
    new_w = int(round(w * (Ht / h)))
    img_resized = img.resize((new_w, Ht), Image.BILINEAR)
    target_w = size[1]
    if new_w >= target_w:
        start = (new_w - target_w)//2
        img_crop = img_resized.crop((start, 0, start + target_w, Ht))
    else:
        pad_left = (target_w - new_w)//2
        pad_right = target_w - new_w - pad_left
        canvas = Image.new('L', (target_w, Ht), color=0)
        canvas.paste(img_resized, (pad_left, 0))
        img_crop = canvas
    imgf = np.asarray(img_crop, dtype=np.float32) / 255.0
    mean = imgf.mean(); std = imgf.std(); p1 = np.percentile(imgf, 1); p5 = np.percentile(imgf,5); p95 = np.percentile(imgf,95); p99 = np.percentile(imgf,99)
    small = np.asarray(img_crop.resize((32,32), Image.BILINEAR), dtype=np.float32) / 255.0
    feat = np.concatenate([[mean, std, p1, p5, p95, p99], small.ravel()])
    return feat

# Targets: 19 classes from species_list class_id 0..18
species_df = pd.read_csv(ess / 'species_list.txt')
num_classes = species_df.shape[0]
class_ids = list(range(num_classes))

# training known records from cell 3 (df_train_ids available)
train_rows = df_train_ids.copy()
train_rec_ids = train_rows['rec_id'].tolist()
y_multi = np.zeros((len(train_rows), num_classes), dtype=np.float32)
for i, labs in enumerate(train_rows['labels']):
    for c in labs:
        y_multi[i, c] = 1.0

X_list = []; valid_idx = []
for i, rid in enumerate(train_rec_ids):
    f = load_img_feature(rid, folder='filtered_spectrograms', size=(64,64))
    if f is None:
        f = load_img_feature(rid, folder='spectrograms', size=(64,64))
    if f is None:
        continue
    X_list.append(f); valid_idx.append(i)
X = np.vstack(X_list) if X_list else np.zeros((0, 6+1024), dtype=np.float32)
y = y_multi[valid_idx]
print(f"[INFO] Image features built: X shape {X.shape}, y shape {y.shape}")

if X.shape[0] == 0:
    raise RuntimeError('No image features extracted; cannot train baseline.')

# Train One-vs-Rest Logistic Regression with standardization
pipe = Pipeline([
    ('scaler', StandardScaler(with_mean=True, with_std=True)),
    ('clf', OneVsRestClassifier(LogisticRegression(max_iter=2000, class_weight='balanced', solver='liblinear')))
])

t0 = time.time()
pipe.fit(X, y)
print(f"[INFO] Training completed in {time.time()-t0:.2f}s")

# Prepare test features
test_rec_sorted = sorted(df_test_ids['rec_id'].tolist())
test_feats = {}
for rid in test_rec_sorted:
    f = load_img_feature(rid, folder='filtered_spectrograms', size=(64,64))
    if f is None:
        f = load_img_feature(rid, folder='spectrograms', size=(64,64))
    if f is None:
        f = np.zeros(X.shape[1], dtype=np.float32)
    test_feats[rid] = f
X_test = np.vstack([test_feats[rid] for rid in test_rec_sorted])
probs = pipe.predict_proba(X_test)  # shape: (n_rec_test, num_classes)

# Build Id -> prob mapping using formula: Id = rec_id * 100 + class_id
id2prob = {}
for i, rid in enumerate(test_rec_sorted):
    for c in range(num_classes):
        Id = int(rid) * 100 + c
        id2prob[Id] = float(probs[i, c])

# Fill submission by Id mapping
df_submit = pd.read_csv('sample_submission.csv')
df_submit['Probability'] = df_submit['Id'].map(id2prob).fillna(0.1)
df_submit.to_csv('submission.csv', index=False)
print('[INFO] Saved submission.csv')

[INFO] Image features built: X shape (258, 1030), y shape (258, 19)


[INFO] Training completed in 1.65s
[INFO] Saved submission.csv


In [8]:
# LightGBM on aggregated segment features; predict and ensemble with image-logit baseline
import sys, subprocess, importlib, numpy as np, pandas as pd, time
from pathlib import Path

def ensure_pkg(pkg):
    try:
        importlib.import_module(pkg)
        return True
    except ImportError:
        print(f"[INFO] Installing {pkg}...")
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', pkg, '-q'])
        importlib.invalidate_caches()
        return True

ensure_pkg('lightgbm')
import lightgbm as lgb

ess = Path('essential_data'); supp = Path('supplemental_data')

# Reload label data prepared earlier (cell 3 variables should exist). If not, parse again quickly.
try:
    df_train_ids, df_test_ids
except NameError:
    df_folds_csv = pd.read_csv(ess / 'CVfolds_2.txt')
    lab = pd.read_csv(ess / 'rec_labels_test_hidden.txt')
    # quick parse
    rows = []
    for _, r in lab.iterrows():
        items = str(r.iloc[0]).split(',')
    # For brevity, assume prior cell ran; otherwise skip LGBM.
    raise RuntimeError('Labels not prepared; run cell 3 first.')

# Load aggregated segment features (from cell 3) or rebuild if missing
try:
    df_agg
except NameError:
    seg_feat_path = Path('supplemental_data/segment_features.txt')
    df_seg = pd.read_csv(seg_feat_path, header=None, skiprows=1)
    df_seg = df_seg.rename(columns={0:'rec_id', 1:'seg_id'})
    feature_cols = [c for c in df_seg.columns if c not in ['rec_id','seg_id']]
    df_agg = df_seg.groupby('rec_id')[feature_cols].mean().reset_index()

# Build multi-hot matrix for known training rec_ids present in df_agg
species_df = pd.read_csv(ess / 'species_list.txt')
num_classes = species_df.shape[0]

df_train_merge = df_train_ids[['rec_id','labels']].merge(df_agg, on='rec_id', how='inner')
X_train = df_train_merge.drop(columns=['rec_id','labels']).values.astype(np.float32)
Y = np.zeros((len(df_train_merge), num_classes), dtype=np.float32)
for i, labs in enumerate(df_train_merge['labels']):
    for c in labs:
        Y[i, c] = 1.0
print(f"[INFO] LGBM training data: X {X_train.shape}, Y {Y.shape}")

# Prepare test features for available rec_ids
df_test_merge = df_test_ids[['rec_id']].merge(df_agg, on='rec_id', how='inner')
test_rec_available = df_test_merge['rec_id'].tolist()
X_test = df_test_merge.drop(columns=['rec_id']).values.astype(np.float32)
print(f"[INFO] LGBM test available rec_ids: {len(test_rec_available)} / {len(df_test_ids)}")

if X_train.shape[0] == 0 or X_test.shape[0] == 0:
    print('[WARN] Insufficient data for LGBM; skipping.')
else:
    # Train one model per class quickly with small params
    id2prob_lgb = {}
    for c in range(num_classes):
        y_c = Y[:, c]
        # handle class imbalance via scale_pos_weight
        pos = y_c.sum(); neg = len(y_c) - pos
        spw = float(neg / max(pos, 1.0)) if pos > 0 else 1.0
        train_set = lgb.Dataset(X_train, label=y_c)
        params = {
            'objective': 'binary',
            'metric': 'auc',
            'learning_rate': 0.05,
            'num_leaves': 31,
            'min_data_in_leaf': 10,
            'feature_fraction': 0.9,
            'bagging_fraction': 0.9,
            'bagging_freq': 1,
            'verbose': -1,
            'scale_pos_weight': spw,
        }
        num_boost_round = 200
        bst = lgb.train(params, train_set, num_boost_round=num_boost_round)
        p = bst.predict(X_test)
        for i, rid in enumerate(test_rec_available):
            Id = int(rid) * 100 + c
            id2prob_lgb[Id] = float(p[i])
    # Blend with existing submission
    df_submit = pd.read_csv('submission.csv')
    df_sub_base = pd.read_csv('sample_submission.csv')
    # Map lgb preds by Id
    lgb_series = df_sub_base['Id'].map(id2prob_lgb)
    # Simple average where both present, otherwise keep existing
    blended = df_submit['Probability'].copy()
    mask = lgb_series.notna()
    blended.loc[mask] = 0.5 * blended.loc[mask].values + 0.5 * lgb_series.loc[mask].values
    df_submit['Probability'] = blended
    df_submit.to_csv('submission.csv', index=False)
    print('[INFO] Saved blended submission.csv (image-logit + LGBM)')

[INFO] LGBM training data: X (122, 38), Y (122, 19)
[INFO] LGBM test available rec_ids: 32 / 64




[INFO] Saved blended submission.csv (image-logit + LGBM)


In [9]:
# Cross-Validation framework with MultilabelStratifiedKFold on image features; OOF AUC + CV inference
import numpy as np, pandas as pd, time, sys, subprocess, importlib
from pathlib import Path
from PIL import Image
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_auc_score

def ensure_pkg(pkg):
    try:
        importlib.import_module(pkg)
        return True
    except ImportError:
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', pkg, '-q'])
        importlib.invalidate_caches()
        return True

ensure_pkg('iterative-stratification')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

ess = Path('essential_data'); supp = Path('supplemental_data')

# Load id->filename
df_id2fn = pd.read_csv(ess / 'rec_id2filename.txt')
id2fn = dict(zip(df_id2fn.rec_id.astype(int), df_id2fn.filename.astype(str)))

def load_img_feature(rec_id, folder='filtered_spectrograms', size=(64,64)):
    fn = id2fn.get(int(rec_id), None)
    if fn is None: return None
    img_path = supp / folder / f"{fn}.bmp"
    if not img_path.exists():
        return None
    try:
        img = Image.open(img_path).convert('L')
    except Exception:
        return None
    Ht = size[0]
    w, h = img.size
    new_w = int(round(w * (Ht / h)))
    img_resized = img.resize((new_w, Ht), Image.BILINEAR)
    target_w = size[1]
    if new_w >= target_w:
        start = (new_w - target_w)//2
        img_crop = img_resized.crop((start, 0, start + target_w, Ht))
    else:
        pad_left = (target_w - new_w)//2
        canvas = Image.new('L', (target_w, Ht), color=0)
        canvas.paste(img_resized, (pad_left, 0))
        img_crop = canvas
    imgf = np.asarray(img_crop, dtype=np.float32) / 255.0
    mean = imgf.mean(); std = imgf.std(); p1 = np.percentile(imgf, 1); p5 = np.percentile(imgf,5); p95 = np.percentile(imgf,95); p99 = np.percentile(imgf,99)
    small = np.asarray(img_crop.resize((32,32), Image.BILINEAR), dtype=np.float32) / 255.0
    feat = np.concatenate([[mean, std, p1, p5, p95, p99], small.ravel()])
    return feat

# Prepare labeled dataset
species_df = pd.read_csv(ess / 'species_list.txt')
num_classes = species_df.shape[0]

# df_train_ids (rec_id, labels) and df_test_ids are prepared in cell 3
train_rows = df_train_ids.copy()
train_rec_ids = train_rows['rec_id'].tolist()
Y_full = np.zeros((len(train_rows), num_classes), dtype=np.float32)
for i, labs in enumerate(train_rows['labels']):
    for c in labs:
        Y_full[i, c] = 1.0

X_list = []; keep_idx = []
for i, rid in enumerate(train_rec_ids):
    f = load_img_feature(rid, folder='filtered_spectrograms', size=(64,64))
    if f is None:
        f = load_img_feature(rid, folder='spectrograms', size=(64,64))
    if f is None:
        continue
    X_list.append(f); keep_idx.append(i)
X_full = np.vstack(X_list)
Y = Y_full[keep_idx]
rec_ids_kept = [train_rec_ids[i] for i in keep_idx]
print(f"[CV] Train matrix: X {X_full.shape}, Y {Y.shape}, kept {len(rec_ids_kept)} of {len(train_rows)}")

# CV setup
mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof = np.zeros_like(Y, dtype=np.float32)
fold_aucs = []

test_rec_sorted = sorted(df_test_ids['rec_id'].tolist())
X_test_list = []
for rid in test_rec_sorted:
    f = load_img_feature(rid, folder='filtered_spectrograms', size=(64,64))
    if f is None:
        f = load_img_feature(rid, folder='spectrograms', size=(64,64))
    if f is None:
        # backstop: vector of zeros (rare)
        f = np.zeros(X_full.shape[1], dtype=np.float32)
    X_test_list.append(f)
X_test_all = np.vstack(X_test_list)
test_preds_accum = np.zeros((len(test_rec_sorted), num_classes), dtype=np.float32)

for fold, (trn_idx, val_idx) in enumerate(mskf.split(X_full, Y), 1):
    t0 = time.time()
    X_tr, X_val = X_full[trn_idx], X_full[val_idx]
    y_tr, y_val = Y[trn_idx], Y[val_idx]
    pipe = Pipeline([
        ('scaler', StandardScaler(with_mean=True, with_std=True)),
        ('clf', OneVsRestClassifier(LogisticRegression(max_iter=2000, class_weight='balanced', solver='liblinear')))
    ])
    pipe.fit(X_tr, y_tr)
    val_proba = pipe.predict_proba(X_val)
    oof[val_idx] = val_proba
    # per-class AUC, macro avg over classes with at least one positive and one negative in val
    aucs = []
    for c in range(num_classes):
        yv = y_val[:, c]
        if yv.sum() > 0 and (len(yv) - yv.sum()) > 0:
            try:
                aucs.append(roc_auc_score(yv, val_proba[:, c]))
            except Exception:
                pass
    fold_auc = float(np.mean(aucs)) if len(aucs) else float('nan')
    fold_aucs.append(fold_auc)
    print(f"[CV] Fold {fold}: macro AUC={fold_auc:.4f} using {len(aucs)} classes; time {time.time()-t0:.2f}s")
    # test preds
    test_preds_accum += pipe.predict_proba(X_test_all)

oof_classes = []
class_aucs = []
for c in range(num_classes):
    yc = Y[:, c]
    if yc.sum() > 0 and (len(yc) - yc.sum()) > 0:
        try:
            class_aucs.append(roc_auc_score(yc, oof[:, c]))
            oof_classes.append(c)
        except Exception:
            pass
oof_macro_auc = float(np.mean(class_aucs)) if len(class_aucs) else float('nan')
print(f"[CV] OOF macro AUC over {len(class_aucs)} classes: {oof_macro_auc:.4f}")

# Average test predictions over folds
test_preds = test_preds_accum / 5.0

# Build Id -> prob mapping and write submission
id2prob = {}
for i, rid in enumerate(test_rec_sorted):
    for c in range(num_classes):
        Id = int(rid) * 100 + c
        id2prob[Id] = float(test_preds[i, c])
df_submit_base = pd.read_csv('sample_submission.csv')
missing = df_submit_base['Id'][~df_submit_base['Id'].isin(id2prob.keys())]
if len(missing) > 0:
    print(f"[WARN] Missing {len(missing)} Ids in predictions; filling with small constant 0.05")
df_submit_base['Probability'] = df_submit_base['Id'].map(id2prob).fillna(0.05)
df_submit_base.to_csv('submission.csv', index=False)
print('[CV] Saved CV-based submission.csv')



[CV] Train matrix: X (258, 1030), Y (258, 19), kept 258 of 258


[CV] Fold 1: macro AUC=0.5353 using 18 classes; time 1.38s


[CV] Fold 2: macro AUC=0.6305 using 17 classes; time 1.42s


[CV] Fold 3: macro AUC=0.6466 using 17 classes; time 1.41s


[CV] Fold 4: macro AUC=0.5259 using 19 classes; time 1.35s


[CV] Fold 5: macro AUC=0.6728 using 18 classes; time 1.33s
[CV] OOF macro AUC over 19 classes: 0.6174
[CV] Saved CV-based submission.csv


In [21]:
# Tabular baseline v4: Added energy/geometry features, histogram shape stats, 5-fold + seed bagging, per-class blending
import pandas as pd, numpy as np, time, sys, subprocess, importlib
from pathlib import Path
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

def ensure_pkg(pkg):
    try:
        importlib.import_module(pkg)
        return True
    except ImportError:
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', pkg, '-q'])
        importlib.invalidate_caches()
        return True

ensure_pkg('lightgbm')
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

ess = Path('essential_data'); supp = Path('supplemental_data')

# Robust readers
def read_segment_features(path):
    df = pd.read_csv(path, header=None, skiprows=1)
    df = df.rename(columns={0:'rec_id', 1:'seg_id'})
    return df

def read_segment_rectangles(path):
    # Robust parser: attempt multiple separators and skip bad lines; return empty DF on failure
    variants = [
        dict(header=None, sep=',', engine='python', on_bad_lines='skip'),
        dict(header=None, sep='\t', engine='python', on_bad_lines='skip'),
        dict(header=None, sep=r'\s+', engine='python', on_bad_lines='skip'),
    ]
    df = None
    for kw in variants:
        try:
            tmp = pd.read_csv(path, **kw)
            if tmp.shape[1] >= 6:
                df = tmp
                break
        except Exception:
            continue
    if df is None:
        # final fallback: read lines and split manually by comma
        try:
            rows = []
            with open(path, 'r', errors='ignore') as f:
                for line in f:
                    parts = [p.strip() for p in line.strip().split(',')]
                    if len(parts) >= 6:
                        rows.append(parts[:6])
            if len(rows) > 0:
                df = pd.DataFrame(rows)
            else:
                raise RuntimeError('No parsable rows')
        except Exception:
            # return empty to skip geometry features
            return pd.DataFrame({'rec_id': pd.Series(dtype=int), 'seg_id': pd.Series(dtype=int)})
    # assume columns: rec_id, seg_id, x, y, w, h, ...
    ren = {0:'rec_id', 1:'seg_id', 2:'x', 3:'y', 4:'w', 5:'h'}
    df = df.rename(columns=ren)
    for c in ['rec_id','seg_id','x','y','w','h']:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors='coerce')
    df = df.dropna(subset=['rec_id','seg_id']).copy()
    if 'rec_id' in df.columns:
        df['rec_id'] = df['rec_id'].astype(int)
    return df

def read_histogram_segments(path):
    variants = [
        dict(header=0, sep=','),
        dict(header=None, skiprows=1, sep=','),
        dict(header=0, sep=r'\s+', engine='python'),
        dict(header=None, skiprows=1, sep=r'\s+', engine='python'),
    ]
    df = None
    for kw in variants:
        try:
            tmp = pd.read_csv(path, **kw)
            if tmp.shape[1] >= 3:
                df = tmp
                break
        except Exception:
            continue
    if df is None:
        df = pd.read_csv(path, header=None)
    cols = list(df.columns)
    if len(cols) >= 1: df = df.rename(columns={cols[0]: 'rec_id'})
    if len(cols) >= 2: df = df.rename(columns={cols[1]: 'seg_id'})
    rem = [c for c in df.columns if c not in ['rec_id','seg_id']]
    if any(isinstance(c, int) for c in rem) or any(str(c).isdigit() for c in rem):
        rename_map = {}; idx = 0
        for c in rem:
            rename_map[c] = f'hist_{idx}'; idx += 1
        df = df.rename(columns=rename_map)
    df['rec_id'] = pd.to_numeric(df['rec_id'], errors='coerce')
    df = df.dropna(subset=['rec_id']).copy()
    df['rec_id'] = df['rec_id'].astype(int)
    # ensure numeric bins
    for c in [c for c in df.columns if c not in ['rec_id','seg_id']]:
        df[c] = pd.to_numeric(df[c], errors='coerce')
    return df

# Labels
species_df = pd.read_csv(ess / 'species_list.txt')
num_classes = species_df.shape[0]
train_rows = df_train_ids[['rec_id','labels']].copy()
test_rows = df_test_ids[['rec_id']].copy()
print(f"[TAB] Using labels from rec_labels_test_hidden: train={len(train_rows)}, test={len(test_rows)}, classes={num_classes}")

# Segment features: expanded aggs
seg_feat_path = supp / 'segment_features.txt'
df_seg = read_segment_features(seg_feat_path)
feat_cols = [c for c in df_seg.columns if c not in ['rec_id','seg_id']]
# basic statistics
g_base = df_seg.groupby('rec_id')[feat_cols].agg(['mean','std','min','max','median'])
g_base.columns = [f"f{col}_{stat}" for col, stat in g_base.columns.to_flat_index()]
g_base = g_base.reset_index()
# quantiles
g_q25 = df_seg.groupby('rec_id')[feat_cols].quantile(0.25).reset_index()
g_q75 = df_seg.groupby('rec_id')[feat_cols].quantile(0.75).reset_index()
g_q25.columns = ['rec_id'] + [f"f{c}_q25" for c in feat_cols]
g_q75.columns = ['rec_id'] + [f"f{c}_q75" for c in feat_cols]
# skew and kurtosis computed separately
g_skew = df_seg.groupby('rec_id')[feat_cols].agg('skew').reset_index()
g_skew.columns = ['rec_id'] + [f"f{c}_skew" for c in feat_cols]
g_kurt = df_seg.groupby('rec_id')[feat_cols].agg(pd.Series.kurt).reset_index()
g_kurt.columns = ['rec_id'] + [f"f{c}_kurt" for c in feat_cols]
# count of segments
cnt = df_seg.groupby('rec_id')['seg_id'].count().rename('n_segments').reset_index()

# Segment rectangles: geometry features
rect_path = supp / 'segment_rectangles.txt'
df_rect = read_segment_rectangles(rect_path)
if set(['rec_id','seg_id','w','h']).issubset(df_rect.columns):
    # duration ~ width, bandwidth ~ height; area, aspect
    df_rect = df_rect.copy()
    df_rect['duration'] = df_rect['w'].clip(lower=0)
    df_rect['bandwidth'] = df_rect['h'].clip(lower=0)
    df_rect['area'] = (df_rect['w'].clip(lower=0) * df_rect['h'].clip(lower=0))
    df_rect['aspect_ratio'] = df_rect['h'].clip(lower=0) / np.clip(df_rect['w'].clip(lower=0), 1e-6, None)
    geom_cols = ['duration','bandwidth','area','aspect_ratio']
    g_geom_base = df_rect.groupby('rec_id')[geom_cols].agg(['mean','std','sum','min','max','median']).reset_index()
    g_geom_q25 = df_rect.groupby('rec_id')[geom_cols].quantile(0.25).reset_index()
    g_geom_q75 = df_rect.groupby('rec_id')[geom_cols].quantile(0.75).reset_index()
    # flatten
    g_geom_base.columns = ['rec_id'] + [f"geom_{c}_{stat}" for c, stat in g_geom_base.columns.to_flat_index()[1:]]
    g_geom_q25.columns = ['rec_id'] + [f"geom_{c}_q25" for c in geom_cols]
    g_geom_q75.columns = ['rec_id'] + [f"geom_{c}_q75" for c in geom_cols]
else:
    # empty placeholders
    g_geom_base = pd.DataFrame({'rec_id': []})
    g_geom_q25 = pd.DataFrame({'rec_id': []})
    g_geom_q75 = pd.DataFrame({'rec_id': []})

# merge all seg aggs
df_seg_agg = g_base.merge(g_q25, on='rec_id', how='left').merge(g_q75, on='rec_id', how='left').merge(g_skew, on='rec_id', how='left').merge(g_kurt, on='rec_id', how='left').merge(cnt, on='rec_id', how='left')
if 'rec_id' in g_geom_base.columns:
    df_seg_agg = df_seg_agg.merge(g_geom_base, on='rec_id', how='left').merge(g_geom_q25, on='rec_id', how='left').merge(g_geom_q75, on='rec_id', how='left')
df_seg_agg = df_seg_agg.fillna(0.0)
print(f"[TAB] seg_agg shape: {df_seg_agg.shape}")

# Histogram features
hist_path = supp / 'histogram_of_segments.txt'
df_hist = read_histogram_segments(hist_path)
hist_bins = [c for c in df_hist.columns if c not in ['rec_id','seg_id']]
print(f"[TAB] histogram raw shape: {df_hist.shape}; first cols: {list(df_hist.columns)[:6]}")
if len(hist_bins) == 0:
    raise RuntimeError('No histogram bins parsed')

# A) Raw-sum features per rec_id
raw_sum = df_hist.groupby('rec_id')[hist_bins].sum().reset_index()
raw_sum_total = raw_sum[hist_bins].sum(axis=1).values.reshape(-1,1)
raw_sum_frac = raw_sum.copy()
raw_sum_frac[hist_bins] = (raw_sum[hist_bins].values / np.clip(raw_sum_total, 1e-12, None))
# record-level stats from fractions
frac_vals = raw_sum_frac[hist_bins].values
entropy = -(frac_vals * np.log(np.clip(frac_vals, 1e-12, None))).sum(axis=1)
herfindahl = (frac_vals**2).sum(axis=1)
gini = 1.0 - herfindahl
top1 = np.max(frac_vals, axis=1)
top2 = np.partition(frac_vals, -2, axis=1)[:, -2:].sum(axis=1)
top3 = np.partition(frac_vals, -3, axis=1)[:, -3:].sum(axis=1)
argmax = frac_vals.argmax(axis=1).astype(int)
nonzero_bins = (frac_vals > 0).sum(axis=1)

# per-record skew/kurt over normalized bins
def row_skew(x):
    m = x.mean(); s = x.std()
    if s <= 1e-12: return 0.0
    z = (x - m) / s
    return float((z**3).mean())
def row_kurt(x):
    m = x.mean(); s = x.std()
    if s <= 1e-12: return 0.0
    z = (x - m) / s
    return float((z**4).mean() - 3.0)
row_skews = np.apply_along_axis(row_skew, 1, frac_vals)
row_kurts = np.apply_along_axis(row_kurt, 1, frac_vals)

df_hist_sum = raw_sum.add_prefix('sum_')
df_hist_sum = df_hist_sum.rename(columns={'sum_rec_id':'rec_id'})
df_hist_frac = raw_sum_frac.add_prefix('frac_')
df_hist_frac = df_hist_frac.rename(columns={'frac_rec_id':'rec_id'})
df_hist_rec = df_hist_sum.merge(df_hist_frac, on='rec_id', how='left')
df_hist_rec['hist_entropy'] = entropy
df_hist_rec['hist_gini'] = gini
df_hist_rec['hist_herfindahl'] = herfindahl
df_hist_rec['hist_top1'] = top1
df_hist_rec['hist_top2_sum'] = top2
df_hist_rec['hist_top3_sum'] = top3
df_hist_rec['hist_argmax'] = argmax
df_hist_rec['hist_nonzero_bins'] = nonzero_bins
df_hist_rec['hist_frac_skew'] = row_skews
df_hist_rec['hist_frac_kurt'] = row_kurts
df_hist_rec['raw_sum_total'] = raw_sum_total.ravel()

# per-segment totals and their aggs
seg_totals = df_hist.copy()
seg_totals['seg_total'] = seg_totals[hist_bins].sum(axis=1)
seg_agg = seg_totals.groupby('rec_id')['seg_total']
seg_total_feats = pd.DataFrame({
    'rec_id': seg_agg.size().index,
    'seg_total_mean': seg_agg.mean().values,
    'seg_total_std': seg_agg.std().fillna(0).values,
    'seg_total_min': seg_agg.min().values,
    'seg_total_max': seg_agg.max().values,
    'seg_total_q25': seg_agg.quantile(0.25).values,
    'seg_total_q75': seg_agg.quantile(0.75).values,
    'seg_total_sum': seg_agg.sum().values,
})

# simple peak stats on record-level normalized histogram (count local maxima)
def count_peaks(row):
    x = row.values.astype(float)
    cnt = 0
    for i in range(1, len(x)-1):
        if x[i] > x[i-1] and x[i] > x[i+1]:
            cnt += 1
    return cnt
try:
    df_tmp_frac = df_hist_frac[[c for c in df_hist_frac.columns if c.startswith('frac_hist_')]].copy()
    num_peaks = df_tmp_frac.apply(count_peaks, axis=1).values
    df_hist_rec['hist_num_peaks'] = num_peaks
except Exception:
    df_hist_rec['hist_num_peaks'] = 0

# B) Shape features from per-segment normalized histograms (mean/std/skew/kurt across segments)
H = df_hist[hist_bins].to_numpy(dtype=float)
H = np.nan_to_num(H, nan=0.0, posinf=0.0, neginf=0.0)
H = np.maximum(H, 0.0)
row_sum = np.clip(H.sum(axis=1, keepdims=True), 1e-12, None)
Hn = H / row_sum
df_hist_norm = df_hist[['rec_id']].copy()
df_hist_norm[hist_bins] = Hn
shape_mean = df_hist_norm.groupby('rec_id')[hist_bins].mean().reset_index()
shape_std = df_hist_norm.groupby('rec_id')[hist_bins].std().fillna(0.0).reset_index()
shape_skew = df_hist_norm.groupby('rec_id')[hist_bins].agg('skew').fillna(0.0).reset_index()
shape_kurt = df_hist_norm.groupby('rec_id')[hist_bins].agg(pd.Series.kurt).fillna(0.0).reset_index()
shape_mean.columns = ['rec_id'] + [f'shape_mean_{c}' for c in hist_bins]
shape_std.columns = ['rec_id'] + [f'shape_std_{c}' for c in hist_bins]
shape_skew.columns = ['rec_id'] + [f'shape_skew_{c}' for c in hist_bins]
shape_kurt.columns = ['rec_id'] + [f'shape_kurt_{c}' for c in hist_bins]

# Combine histogram record-level features
df_hist_features = df_hist_rec.merge(seg_total_feats, on='rec_id', how='left').merge(shape_mean, on='rec_id', how='left').merge(shape_std, on='rec_id', how='left').merge(shape_skew, on='rec_id', how='left').merge(shape_kurt, on='rec_id', how='left')
df_hist_features = df_hist_features.fillna(0.0)
print(f"[TAB] hist_features shape: {df_hist_features.shape}")

# Two datasets:
# Model A features: Seg + Hist (inner on rec_id)
df_feat_A = df_seg_agg.merge(df_hist_features, on='rec_id', how='inner')
print(f"[TAB] Model A feature shape: {df_feat_A.shape}")
# Model B features: Hist-only (all with histogram)
df_feat_B = df_hist_features.copy()
print(f"[TAB] Model B feature shape: {df_feat_B.shape}")

# Prepare train/test merges for A and B
train_A = train_rows.merge(df_feat_A, on='rec_id', how='inner')
test_A = test_rows.merge(df_feat_A, on='rec_id', how='inner')
train_B = train_rows.merge(df_feat_B, on='rec_id', how='inner')
test_B = test_rows.merge(df_feat_B, on='rec_id', how='inner')
print(f"[TAB] Train A: {len(train_A)} recs; Test A: {len(test_A)} recs")
print(f"[TAB] Train B: {len(train_B)} recs; Test B: {len(test_B)} recs")

def build_XY(df):
    X = df.drop(columns=['rec_id','labels']).values.astype(np.float32)
    Y = np.zeros((len(df), num_classes), dtype=np.float32)
    for i, labs in enumerate(df['labels']):
        for c in labs: Y[i, c] = 1.0
    return X, Y

X_A, Y_A = build_XY(train_A) if len(train_A) else (np.zeros((0,0),np.float32), np.zeros((0, num_classes), np.float32))
X_B, Y_B = build_XY(train_B)
XA_test = test_A.drop(columns=['rec_id']).values.astype(np.float32) if len(test_A) else np.zeros((0,0),np.float32)
XB_test = test_B.drop(columns=['rec_id']).values.astype(np.float32)
test_rec_A = test_A['rec_id'].tolist() if len(test_A) else []
test_rec_B = test_B['rec_id'].tolist()
print(f"[TAB] X_A {X_A.shape}, X_B {X_B.shape}; test A {len(test_rec_A)}, test B {len(test_rec_B)}")

# Training function with MLSK, rare-class augmentation, robust params; seed bagging
def train_single_seed(X, Y, X_test, n_splits=5, seed=42, label='A'):
    if X.shape[0] == 0:
        return np.zeros((0, num_classes), np.float32), np.zeros((X_test.shape[0], num_classes), np.float32), float('nan'), []
    mskf = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    oof = np.zeros_like(Y, dtype=np.float32)
    test_accum = np.zeros((X_test.shape[0], num_classes), dtype=np.float32)
    fold_class_aucs = []
    for fold, (trn_idx, val_idx) in enumerate(mskf.split(X, Y), 1):
        t0 = time.time()
        X_tr, X_val = X[trn_idx], X[val_idx]
        y_tr, y_val = Y[trn_idx], Y[val_idx]
        val_pred = np.zeros_like(y_val, dtype=np.float32)
        test_fold = np.zeros((X_test.shape[0], num_classes), dtype=np.float32)
        for c in range(num_classes):
            ytr_c = y_tr[:, c]; yval_c = y_val[:, c]
            # rare class augmentation: ensure at least 10 positives via bootstrapping
            pos_idx = np.where(ytr_c == 1)[0]
            neg_idx = np.where(ytr_c == 0)[0]
            X_tr_c = X_tr; ytr_c_aug = ytr_c
            if len(pos_idx) > 0 and len(pos_idx) < 10:
                need = 10 - len(pos_idx)
                dup_idx = np.random.RandomState(seed + fold + c).choice(pos_idx, size=need, replace=True)
                X_tr_c = np.concatenate([X_tr, X_tr[dup_idx]], axis=0)
                ytr_c_aug = np.concatenate([ytr_c, np.ones(need, dtype=ytr_c.dtype)], axis=0)
            pos = float(ytr_c_aug.sum()); neg = float(len(ytr_c_aug) - pos)
            spw = float(neg / max(pos, 1.0)) if pos > 0 else 1.0
            spw = min(100.0, spw)
            dtr = lgb.Dataset(X_tr_c, label=ytr_c_aug)
            dval = lgb.Dataset(X_val, label=yval_c, reference=dtr)
            params = {
                'objective': 'binary', 'metric': 'auc', 'verbose': -1,
                'learning_rate': 0.03, 'num_leaves': 31, 'min_data_in_leaf': 15,
                'feature_fraction': 0.6, 'bagging_fraction': 0.8, 'bagging_freq': 1,
                'lambda_l1': 0.1, 'lambda_l2': 1.0, 'scale_pos_weight': spw,
                'extra_trees': True, 'min_sum_hessian_in_leaf': 0.1, 'min_gain_to_split': 0.01,
            }
            callbacks = [lgb.early_stopping(stopping_rounds=250, verbose=False)]
            bst = lgb.train(params, dtr, num_boost_round=5000, valid_sets=[dval], callbacks=callbacks)
            val_pred[:, c] = bst.predict(X_val, num_iteration=bst.best_iteration)
            if X_test.shape[0] > 0:
                test_fold[:, c] = bst.predict(X_test, num_iteration=bst.best_iteration)
        oof[val_idx] = val_pred
        aucs = []
        for c in range(num_classes):
            yv = y_val[:, c]
            if yv.sum() > 0 and (len(yv) - yv.sum()) > 0:
                try: aucs.append(roc_auc_score(yv, val_pred[:, c]))
                except Exception: pass
        fold_auc = float(np.mean(aucs)) if len(aucs) else float('nan')
        fold_class_aucs.append(aucs)
        test_accum += test_fold
        print(f"[TAB-{label}] Seed {seed} Fold {fold}: macro AUC={fold_auc:.4f}; time {time.time()-t0:.2f}s")
    # OOF macro over classes with pos/neg
    class_aucs = []
    for c in range(num_classes):
        yc = Y[:, c]
        if yc.sum() > 0 and (len(yc) - yc.sum()) > 0:
            try: class_aucs.append(roc_auc_score(yc, oof[:, c]))
            except Exception: pass
    oof_macro = float(np.mean(class_aucs)) if len(class_aucs) else float('nan')
    print(f"[TAB-{label}] Seed {seed} OOF macro AUC over {len(class_aucs)} classes: {oof_macro:.4f}")
    return oof, (test_accum / n_splits), oof_macro, class_aucs

def run_bag(X, Y, X_test, seeds, label):
    oofs = []; tests = []; class_auc_list = []
    for sd in seeds:
        oof, tpred, oof_macro, class_aucs = train_single_seed(X, Y, X_test, n_splits=5, seed=sd, label=label)
        oofs.append(oof); tests.append(tpred); class_auc_list.append(class_aucs)
    oof_avg = np.mean(oofs, axis=0)
    test_avg = np.mean(tests, axis=0)
    # average per-class AUCs across seeds
    class_auc_avg = np.nanmean(np.vstack([np.array(c + [np.nan]*(num_classes - len(c))) for c in class_auc_list]), axis=0)
    return oof_avg, test_avg, class_auc_avg

seeds = [42, 1337, 2025]
oof_A, test_A_pred, class_auc_A = run_bag(X_A, Y_A, XA_test, seeds, label='A') if X_A.shape[0] else (np.zeros((0, num_classes), np.float32), np.zeros((XA_test.shape[0], num_classes), np.float32), np.zeros((num_classes,), dtype=float))
oof_B, test_B_pred, class_auc_B = run_bag(X_B, Y_B, XB_test, seeds, label='B')

# Per-class blending weights from OOF AUCs
wA = np.array(class_auc_A, dtype=float)
wB = np.array(class_auc_B, dtype=float)
wA = np.nan_to_num(wA, nan=0.0); wB = np.nan_to_num(wB, nan=0.0)
den = wA + wB
wA_norm = np.where(den > 0, wA / den, 0.5)
wB_norm = 1.0 - wA_norm
print('[TAB] Per-class blend weight A (first 10):', np.round(wA_norm[:10], 3))

# Build combined OOF for monitoring (use per-class weights where both available)
rid_A = train_A['rec_id'].tolist() if len(train_A) else []
rid_B = train_B['rec_id'].tolist()
rid2idxA = {r:i for i,r in enumerate(rid_A)}
rid2idxB = {r:i for i,r in enumerate(rid_B)}
all_train_rids = train_rows['rec_id'].tolist()
oof_combined = []
for r in all_train_rids:
    if r in rid2idxA and r in rid2idxB:
        ia, ib = rid2idxA[r], rid2idxB[r]
        # apply per-class blend
        oof_combined.append(wA_norm * oof_A[ia] + wB_norm * oof_B[ib])
    elif r in rid2idxA:
        oof_combined.append(oof_A[rid2idxA[r]])
    elif r in rid2idxB:
        oof_combined.append(oof_B[rid2idxB[r]])
    else:
        oof_combined.append(np.full((num_classes,), np.nan, dtype=np.float32))
oof_combined = np.stack(oof_combined, axis=0)
Y_all = np.zeros((len(all_train_rids), num_classes), dtype=np.float32)
for i, labs in enumerate(train_rows['labels']):
    for c in labs: Y_all[i, c] = 1.0
class_aucs = []
for c in range(num_classes):
    y = Y_all[:, c]
    preds = oof_combined[:, c]
    mask = ~np.isnan(preds)
    yv = y[mask]; pv = preds[mask]
    if len(yv) > 0 and yv.sum() > 0 and (len(yv)-yv.sum()) > 0:
        try: class_aucs.append(roc_auc_score(yv, pv))
        except Exception: pass
oof_macro_combined = float(np.mean(class_aucs)) if len(class_aucs) else float('nan')
print(f"[TAB] Combined OOF macro AUC over {len(class_aucs)} classes: {oof_macro_combined:.4f}")

# Build test predictions combining A and B with per-class weights
id2prob = {}
test_recids_all = sorted(test_rows['rec_id'].tolist())
pred_map_A = {r: test_A_pred[i] for i, r in enumerate(test_rec_A)} if len(test_rec_A) else {}
pred_map_B = {r: test_B_pred[i] for i, r in enumerate(test_rec_B)}

# post-process: clip ultra-rare class predictions
pos_counts = Y_all.sum(axis=0)
ultra_rare = set(np.where(pos_counts <= 3)[0].tolist())

for r in test_recids_all:
    if (r in pred_map_A) and (r in pred_map_B):
        p = wA_norm * pred_map_A[r] + wB_norm * pred_map_B[r]
    elif r in pred_map_A:
        p = pred_map_A[r]
    elif r in pred_map_B:
        p = pred_map_B[r]
    else:
        p = np.full((num_classes,), 0.05, dtype=np.float32)
    # clip ultra-rare
    if len(ultra_rare) > 0:
        for c in ultra_rare:
            p[c] = float(np.clip(p[c], 0.15, 0.85))
    for c in range(num_classes):
        Id = int(r) * 100 + c
        id2prob[Id] = float(p[c])

# Write/Blend submission
df_base = pd.read_csv('sample_submission.csv')
df_out = df_base.copy()
df_out['Probability'] = df_out['Id'].map(id2prob).fillna(0.05)
if Path('submission.csv').exists():
    df_prev = pd.read_csv('submission.csv')
    blended = df_prev['Probability'].copy()
    mask = df_out['Probability'].notna()
    blended.loc[mask] = 0.5 * blended.loc[mask].values + 0.5 * df_out.loc[mask, 'Probability'].values
    df_prev['Probability'] = blended
    df_prev.to_csv('submission.csv', index=False)
    print('[TAB] Blended with previous submission and saved submission.csv')
else:
    df_out.to_csv('submission.csv', index=False)
    print('[TAB] Saved submission.csv from tabular pipeline')

[TAB] Using labels from rec_labels_test_hidden: train=258, test=64, classes=19


[TAB] seg_agg shape: (154, 376)
[TAB] histogram raw shape: (322, 101); first cols: ['rec_id', 'seg_id', 'hist_0', 'hist_1', 'hist_2', 'hist_3']


[TAB] hist_features shape: (322, 614)
[TAB] Model A feature shape: (154, 989)
[TAB] Model B feature shape: (322, 614)
[TAB] Train A: 122 recs; Test A: 32 recs
[TAB] Train B: 258 recs; Test B: 64 recs
[TAB] X_A (122, 988), X_B (258, 613); test A 32, test B 64




[TAB-A] Seed 42 Fold 1: macro AUC=0.8200; time 1.93s


[TAB-A] Seed 42 Fold 2: macro AUC=0.8582; time 2.08s




[TAB-A] Seed 42 Fold 3: macro AUC=0.8559; time 2.05s




[TAB-A] Seed 42 Fold 4: macro AUC=0.9232; time 2.10s




[TAB-A] Seed 42 Fold 5: macro AUC=0.8905; time 1.82s
[TAB-A] Seed 42 OOF macro AUC over 19 classes: 0.7866




[TAB-A] Seed 1337 Fold 1: macro AUC=0.8488; time 2.07s




[TAB-A] Seed 1337 Fold 2: macro AUC=0.8572; time 1.90s




[TAB-A] Seed 1337 Fold 3: macro AUC=0.8497; time 1.92s




[TAB-A] Seed 1337 Fold 4: macro AUC=0.7808; time 1.88s




[TAB-A] Seed 1337 Fold 5: macro AUC=0.8282; time 1.82s
[TAB-A] Seed 1337 OOF macro AUC over 19 classes: 0.7314




[TAB-A] Seed 2025 Fold 1: macro AUC=0.8221; time 1.91s




[TAB-A] Seed 2025 Fold 2: macro AUC=0.8321; time 2.00s




[TAB-A] Seed 2025 Fold 3: macro AUC=0.8334; time 1.74s




[TAB-A] Seed 2025 Fold 4: macro AUC=0.8323; time 1.99s




[TAB-A] Seed 2025 Fold 5: macro AUC=0.8415; time 2.42s
[TAB-A] Seed 2025 OOF macro AUC over 19 classes: 0.7595




[TAB-B] Seed 42 Fold 1: macro AUC=0.7395; time 2.06s




[TAB-B] Seed 42 Fold 2: macro AUC=0.8033; time 1.83s




[TAB-B] Seed 42 Fold 3: macro AUC=0.8266; time 1.46s




[TAB-B] Seed 42 Fold 4: macro AUC=0.7558; time 1.78s




[TAB-B] Seed 42 Fold 5: macro AUC=0.7813; time 1.80s
[TAB-B] Seed 42 OOF macro AUC over 19 classes: 0.7013




[TAB-B] Seed 1337 Fold 1: macro AUC=0.7313; time 1.67s




[TAB-B] Seed 1337 Fold 2: macro AUC=0.7895; time 1.76s




[TAB-B] Seed 1337 Fold 3: macro AUC=0.7971; time 1.52s




[TAB-B] Seed 1337 Fold 4: macro AUC=0.7838; time 1.64s




[TAB-B] Seed 1337 Fold 5: macro AUC=0.7821; time 1.40s
[TAB-B] Seed 1337 OOF macro AUC over 19 classes: 0.6984




[TAB-B] Seed 2025 Fold 1: macro AUC=0.7601; time 1.96s




[TAB-B] Seed 2025 Fold 2: macro AUC=0.8055; time 1.82s




[TAB-B] Seed 2025 Fold 3: macro AUC=0.7496; time 2.41s




[TAB-B] Seed 2025 Fold 4: macro AUC=0.7986; time 1.79s




[TAB-B] Seed 2025 Fold 5: macro AUC=0.8082; time 1.81s
[TAB-B] Seed 2025 OOF macro AUC over 19 classes: 0.6870
[TAB] Per-class blend weight A (first 10): [0.514 0.526 0.519 0.518 0.526 0.47  0.541 0.579 0.514 0.49 ]
[TAB] Combined OOF macro AUC over 19 classes: 0.7589
[TAB] Blended with previous submission and saved submission.csv


In [16]:
# Diagnostics: per-class support and OOF AUCs for LGBM pipeline
import numpy as np, pandas as pd
from sklearn.metrics import roc_auc_score

try:
    Y_all, oof, num_classes
except NameError:
    raise RuntimeError("Run Cell 7 first to populate Y_all and oof.")

pos_counts = Y_all.sum(axis=0).astype(int)
neg_counts = (Y_all.shape[0] - pos_counts).astype(int)
aucs = []
for c in range(num_classes):
    y = Y_all[:, c]
    if y.sum() > 0 and (len(y) - y.sum()) > 0:
        try:
            aucs.append(roc_auc_score(y, oof[:, c]))
        except Exception:
            aucs.append(np.nan)
    else:
        aucs.append(np.nan)
df_diag = pd.DataFrame({
    'class_id': np.arange(num_classes),
    'pos': pos_counts,
    'neg': neg_counts,
    'oof_auc': aucs
})
df_diag_sorted = df_diag.sort_values('oof_auc')
print('[DIAG] Per-class OOF AUC (worst 10):')
print(df_diag_sorted.head(10).to_string(index=False))
print('\n[DIAG] Per-class OOF AUC (best 10):')
print(df_diag_sorted.tail(10).to_string(index=False))
valid_aucs = df_diag['oof_auc'][df_diag['oof_auc'].notna()]
print(f"\n[DIAG] Macro OOF over {valid_aucs.shape[0]} classes: {valid_aucs.mean():.4f}")

# Check class id bounds from labels to ensure mapping is correct
try:
    train_labels_series = train_rows['labels']
    all_labs = [lab for labs in train_labels_series for lab in labs]
    if len(all_labs):
        print(f"[DIAG] Labels min={min(all_labs)}, max={max(all_labs)}, unique={len(set(all_labs))}")
except Exception as e:
    print(f"[DIAG] Label range check skipped: {e}")

[DIAG] Per-class OOF AUC (worst 10):
 class_id  pos  neg  oof_auc
       16    2  120 0.160417
       17    2  120 0.195833
        9   17  105 0.545658
       13    4  118 0.572034
        5    5  117 0.596581
        3    2  120 0.620833
       15    6  116 0.681034
        6   13  109 0.715596
       14   14  108 0.729167
        7   14  108 0.764550

[DIAG] Per-class OOF AUC (best 10):
 class_id  pos  neg  oof_auc
        7   14  108 0.764550
       18   12  110 0.803788
        4    8  114 0.805921
       12   10  112 0.807143
       10   38   84 0.809994
        2   18  104 0.812233
        1   25   97 0.828247
        8   23   99 0.859025
        0    7  115 0.867081
       11    8  114 0.938048

[DIAG] Macro OOF over 19 classes: 0.6902
[DIAG] Labels min=0, max=18, unique=19


In [32]:
# CNN baseline v3: EfficientNet-B0 offline pretrained, moderate res, simple cosine, stronger SpecAugment (no EMA)
import sys, subprocess, importlib, os, time, math, random, gc, urllib.request
from pathlib import Path
import numpy as np, pandas as pd

def ensure_pkg(pkg):
    try:
        importlib.import_module(pkg)
        return True
    except ImportError:
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', pkg, '-q'])
        importlib.invalidate_caches()
        return True

ensure_pkg('torch'); ensure_pkg('torchvision'); ensure_pkg('timm'); ensure_pkg('iterative-stratification')
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models as tvm
from sklearn.metrics import roc_auc_score
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from PIL import Image

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
ess = Path('essential_data'); supp = Path('supplemental_data')

species_df = pd.read_csv(ess / 'species_list.txt')
num_classes = species_df.shape[0]
df_id2fn = pd.read_csv(ess / 'rec_id2filename.txt')
id2fn = dict(zip(df_id2fn.rec_id.astype(int), df_id2fn.filename.astype(str)))

# Moderate input preserving more time resolution
IMG_H, IMG_W = 224, 640

def load_spectrogram(rec_id: int, folder='filtered_spectrograms'):
    fn = id2fn.get(int(rec_id), None)
    if fn is None: return None
    p = supp / folder / f"{fn}.bmp"
    if not p.exists():
        return None
    try:
        img = Image.open(p).convert('L')
    except Exception:
        return None
    # aspect-preserving resize on height
    w, h = img.size
    new_w = int(round(w * (IMG_H / h)))
    img_resized = img.resize((new_w, IMG_H), Image.BILINEAR)
    if new_w >= IMG_W:
        start = (new_w - IMG_W)//2
        img_crop = img_resized.crop((start, 0, start + IMG_W, IMG_H))
    else:
        pad_left = (IMG_W - new_w)//2
        canvas = Image.new('L', (IMG_W, IMG_H), color=0)
        canvas.paste(img_resized, (pad_left, 0))
        img_crop = canvas
    img3 = Image.merge('RGB', (img_crop, img_crop, img_crop))
    return img3

class SpecAugment:
    def __init__(self, time_masks=2, time_max=80, freq_masks=2, freq_max=40):
        self.time_masks = time_masks; self.time_max = time_max
        self.freq_masks = freq_masks; self.freq_max = freq_max
    def __call__(self, x):
        C, H, W = x.shape
        for _ in range(self.time_masks):
            w = random.randint(0, self.time_max)
            if w > 0:
                t0 = random.randint(0, max(0, W - w))
                x[:, :, t0:t0+w] = 0.0
        for _ in range(self.freq_masks):
            h = random.randint(0, self.freq_max)
            if h > 0:
                f0 = random.randint(0, max(0, H - h))
                x[:, f0:f0+h, :] = 0.0
        return x

class SpectrogramDataset(Dataset):
    def __init__(self, rec_ids, labels=None, folder='filtered_spectrograms', train=True):
        self.rec_ids = list(rec_ids); self.labels = labels
        self.folder = folder; self.train = train
        self.to_tensor = transforms.ToTensor()
        self.specaug = SpecAugment(time_masks=2, time_max=80, freq_masks=2, freq_max=40)
        self.norm = transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225])
        self.max_shift = int(0.05 * IMG_W)  # ~5% horizontal roll
    def __len__(self):
        return len(self.rec_ids)
    def __getitem__(self, idx):
        rid = int(self.rec_ids[idx])
        img = load_spectrogram(rid, self.folder)
        if img is None:
            img = load_spectrogram(rid, 'spectrograms')
            if img is None:
                img = Image.new('RGB', (IMG_W, IMG_H), color=0)
        x = self.to_tensor(img)
        if self.train:
            if self.max_shift > 0 and random.random() < 0.5:
                shift = random.randint(-self.max_shift, self.max_shift)
                if shift != 0:
                    x = torch.roll(x, shifts=shift, dims=2)
            x = self.specaug(x)
        x = self.norm(x)
        if self.labels is None:
            return x, rid
        y = torch.zeros(num_classes, dtype=torch.float32)
        for c in self.labels[idx]:
            y[c] = 1.0
        return x, y

def build_targets(df_rows):
    rec_ids = df_rows['rec_id'].tolist()
    labels = [labs for labs in df_rows['labels']]
    return rec_ids, labels

def macro_auc(y_true, y_prob):
    aucs = []
    for c in range(y_true.shape[1]):
        yc = y_true[:, c]
        if yc.sum() > 0 and (len(yc) - yc.sum()) > 0:
            try: aucs.append(roc_auc_score(yc, y_prob[:, c]))
            except Exception: pass
    return float(np.mean(aucs)) if len(aucs) else float('nan')

# Offline pretrained weights for EfficientNet-B0
CACHE_DIR = Path('./torch_cache')
os.environ['TORCH_HOME'] = str(CACHE_DIR.resolve())
CHECKPOINTS_DIR = CACHE_DIR / 'hub' / 'checkpoints'
CHECKPOINTS_DIR.mkdir(parents=True, exist_ok=True)
assert os.access(CHECKPOINTS_DIR, os.W_OK), 'Cache directory is not writable.'
EFF_URL = 'https://download.pytorch.org/models/efficientnet_b0_rwightman-3dd342df.pth'
EFF_FILE = CHECKPOINTS_DIR / 'efficientnet_b0_rwightman-3dd342df.pth'

def ensure_file(url, path: Path):
    if not path.exists():
        print(f"[SETUP] Downloading weights to {path}")
        urllib.request.urlretrieve(url, path)
    else:
        print(f"[SETUP] Weights already exist at {path}")

def create_efficientnet_b0_offline(num_classes):
    ensure_file(EFF_URL, EFF_FILE)
    model = tvm.efficientnet_b0(weights=None)
    state_dict = torch.load(EFF_FILE, map_location='cpu')
    state_dict = {k: v for k, v in state_dict.items() if not k.startswith('classifier.')}
    missing, unexpected = model.load_state_dict(state_dict, strict=False)
    in_features = model.classifier[1].in_features
    model.classifier[1] = nn.Linear(in_features, num_classes)
    print('[CNN] EfficientNet-B0 loaded from local cache. Missing:', missing, 'Unexpected:', unexpected)
    return model

def train_cnn_filtered(seed=42, batch_size=32, max_epochs=25, patience=6, lr=3e-4, wd=1e-2):
    torch.manual_seed(seed); np.random.seed(seed); random.seed(seed)
    train_df = df_train_ids[['rec_id','labels']].copy().reset_index(drop=True)
    rec_ids, labels = build_targets(train_df)
    Y = np.zeros((len(labels), num_classes), dtype=np.float32)
    for i, labs in enumerate(labels):
        for c in labs: Y[i, c] = 1.0
    mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    oof = np.zeros_like(Y, dtype=np.float32)
    test_rec_sorted = sorted(df_test_ids['rec_id'].tolist())
    test_ds = SpectrogramDataset(test_rec_sorted, labels=None, folder='filtered_spectrograms', train=False)
    test_logits_accum = np.zeros((len(test_rec_sorted), num_classes), dtype=np.float32)
    tta_shifts = [-0.15, -0.075, 0.0, 0.075, 0.15]

    for fold, (trn_idx, val_idx) in enumerate(mskf.split(np.arange(len(rec_ids)), Y), 1):
        t_fold = time.time()
        trn_ids = [rec_ids[i] for i in trn_idx]
        trn_labels = [labels[i] for i in trn_idx]
        val_ids = [rec_ids[i] for i in val_idx]
        val_labels = [labels[i] for i in val_idx]

        train_ds = SpectrogramDataset(trn_ids, trn_labels, folder='filtered_spectrograms', train=True)
        val_ds = SpectrogramDataset(val_ids, val_labels, folder='filtered_spectrograms', train=False)
        train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=True, drop_last=False)
        val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=True)

        model = create_efficientnet_b0_offline(num_classes).to(device)
        optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=max_epochs)
        scaler = torch.cuda.amp.GradScaler(enabled=(device.type=='cuda'))

        # pos_weight per class
        y_tr = np.zeros((len(trn_labels), num_classes), dtype=np.float32)
        for i, labs in enumerate(trn_labels):
            for c in labs: y_tr[i, c] = 1.0
        pos = y_tr.sum(axis=0); neg = (y_tr.shape[0] - pos)
        pos_weight = np.divide(neg, np.clip(pos, 1.0, None))
        pos_weight = np.clip(pos_weight, 1.0, 15.0)
        pos_weight_t = torch.tensor(pos_weight, dtype=torch.float32, device=device)
        criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight_t)

        best_auc = -1.0; best_state = None; no_improve = 0
        for epoch in range(1, max_epochs+1):
            t0 = time.time(); model.train()
            running = 0.0; n_batches = 0
            for xb, yb in train_loader:
                xb = xb.to(device, non_blocking=True); yb = yb.to(device, non_blocking=True)
                optimizer.zero_grad(set_to_none=True)
                with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):
                    logits = model(xb)
                    loss = criterion(logits, yb)
                scaler.scale(loss).backward()
                scaler.step(optimizer); scaler.update()
                running += loss.item(); n_batches += 1
            scheduler.step()
            # validate
            model.eval()
            val_logits = []; val_targets = []
            with torch.no_grad():
                for xb, yb in val_loader:
                    xb = xb.to(device, non_blocking=True)
                    logits = model(xb)
                    val_logits.append(logits.detach().cpu().float().numpy())
                    val_targets.append(yb.numpy())
            val_logits = np.concatenate(val_logits, axis=0)
            val_targets = np.concatenate(val_targets, axis=0)
            val_probs = 1.0 / (1.0 + np.exp(-val_logits))
            fold_auc = macro_auc(val_targets, val_probs)
            if fold_auc > best_auc + 1e-4:
                best_auc = fold_auc; no_improve = 0
                best_state = {k:v.detach().cpu().clone() for k,v in model.state_dict().items()}
            else:
                no_improve += 1
            print(f"[CNN] Fold {fold} Epoch {epoch}/{max_epochs} loss={running/max(1,n_batches):.4f} valAUC={fold_auc:.4f} best={best_auc:.4f} time={time.time()-t0:.1f}s")
            if no_improve >= patience:
                print(f"[CNN] Early stop on fold {fold} at epoch {epoch}")
                break
        if best_state is not None:
            model.load_state_dict(best_state)
        # OOF
        val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=True)
        all_logits = []
        with torch.no_grad():
            for xb, yb in val_loader:
                xb = xb.to(device)
                logits = model(xb)
                all_logits.append(logits.detach().cpu().float().numpy())
        all_logits = np.concatenate(all_logits, axis=0)
        oof[val_idx] = 1.0 / (1.0 + np.exp(-all_logits))

        # TTA on test
        def tta_preds():
            preds_acc = np.zeros((len(test_ds), num_classes), dtype=np.float32)
            with torch.no_grad():
                for shift in [-0.1, 0.0, 0.1]:
                    for start in range(0, len(test_ds), batch_size):
                        end = min(len(test_ds), start+batch_size)
                        batch = []
                        for i in range(start, end):
                            img = load_spectrogram(test_rec_sorted[i], 'filtered_spectrograms')
                            if img is None:
                                img = load_spectrogram(test_rec_sorted[i], 'spectrograms')
                            if img is None:
                                img = Image.new('RGB', (IMG_W, IMG_H), color=0)
                            x = transforms.ToTensor()(img)
                            pixels = int(shift * IMG_W)
                            if pixels != 0:
                                x = torch.roll(x, shifts=pixels, dims=2)
                            x = transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225])(x)
                            batch.append(x)
                        xb = torch.stack(batch, dim=0).to(device)
                        logits = model(xb).detach().cpu().float().numpy()
                        preds_acc[start:end] += logits
            preds_acc /= 3.0
            return 1.0 / (1.0 + np.exp(-preds_acc))
        tpreds = tta_preds()
        test_logits_accum += tpreds.astype(np.float32)
        print(f"[CNN] Fold {fold} done in {time.time()-t_fold:.1f}s; bestAUC={best_auc:.4f}")
        del model; gc.collect(); torch.cuda.empty_cache()

    test_preds = test_logits_accum / 5.0
    oof_auc = macro_auc(Y, oof)
    print(f"[CNN] Filtered spectrograms 5-fold OOF macro AUC: {oof_auc:.4f}")
    id2prob = {}
    for i, rid in enumerate(test_rec_sorted):
        for c in range(num_classes):
            Id = int(rid) * 100 + c
            id2prob[Id] = float(test_preds[i, c])
    return oof, test_preds, id2prob, oof_auc

t0_all = time.time()
oof_cnn_filt, test_cnn_filt, id2prob_cnn_filt, oof_auc_cnn = train_cnn_filtered(seed=42, batch_size=32, max_epochs=25, patience=6, lr=3e-4, wd=1e-2)
print(f"[CNN] Finished filtered CNN in {time.time()-t0_all:.1f}s; OOF={oof_auc_cnn:.4f}")

df_base = pd.read_csv('sample_submission.csv')
cnn_series = df_base['Id'].map(id2prob_cnn_filt).astype(float)
if Path('submission.csv').exists():
    df_prev = pd.read_csv('submission.csv')
    prev = df_prev['Probability'].astype(float)
    blended = 0.7 * cnn_series.fillna(prev) + 0.3 * prev
    df_prev['Probability'] = blended.fillna(0.05)
    df_prev.to_csv('submission.csv', index=False)
    print('[CNN] Blended CNN(filtered) 0.7 with existing submission and saved submission.csv')
else:
    out = df_base.copy(); out['Probability'] = cnn_series.fillna(0.05)
    out.to_csv('submission.csv', index=False)
    print('[CNN] Saved CNN(filtered)-only submission.csv')



[SETUP] Weights already exist at torch_cache/hub/checkpoints/efficientnet_b0_rwightman-3dd342df.pth
[CNN] EfficientNet-B0 loaded from local cache. Missing: ['classifier.1.weight', 'classifier.1.bias'] Unexpected: []


  scaler = torch.cuda.amp.GradScaler(enabled=(device.type=='cuda'))


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 1 Epoch 1/25 loss=1.0700 valAUC=0.4344 best=0.4344 time=24.5s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 1 Epoch 2/25 loss=1.0228 valAUC=0.4172 best=0.4344 time=2.5s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 1 Epoch 3/25 loss=0.9594 valAUC=0.5379 best=0.5379 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 1 Epoch 4/25 loss=0.8996 valAUC=0.6087 best=0.6087 time=2.5s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 1 Epoch 5/25 loss=0.8532 valAUC=0.6140 best=0.6140 time=2.5s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 1 Epoch 6/25 loss=0.8071 valAUC=0.5770 best=0.6140 time=2.5s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 1 Epoch 7/25 loss=0.7952 valAUC=0.6255 best=0.6255 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 1 Epoch 8/25 loss=0.7418 valAUC=0.7094 best=0.7094 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 1 Epoch 9/25 loss=0.7452 valAUC=0.7271 best=0.7271 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 1 Epoch 10/25 loss=0.6994 valAUC=0.7369 best=0.7369 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 1 Epoch 11/25 loss=0.7146 valAUC=0.7347 best=0.7369 time=2.5s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 1 Epoch 12/25 loss=0.6568 valAUC=0.7422 best=0.7422 time=2.5s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 1 Epoch 13/25 loss=0.6477 valAUC=0.7475 best=0.7475 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 1 Epoch 14/25 loss=0.6057 valAUC=0.7545 best=0.7545 time=2.5s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 1 Epoch 15/25 loss=0.6271 valAUC=0.7462 best=0.7545 time=2.5s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 1 Epoch 16/25 loss=0.6037 valAUC=0.7563 best=0.7563 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 1 Epoch 17/25 loss=0.5944 valAUC=0.7549 best=0.7563 time=2.5s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 1 Epoch 18/25 loss=0.5677 valAUC=0.7502 best=0.7563 time=2.5s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 1 Epoch 19/25 loss=0.5496 valAUC=0.7422 best=0.7563 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 1 Epoch 20/25 loss=0.5498 valAUC=0.7391 best=0.7563 time=2.5s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 1 Epoch 21/25 loss=0.5598 valAUC=0.7315 best=0.7563 time=2.5s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 1 Epoch 22/25 loss=0.5727 valAUC=0.7412 best=0.7563 time=2.5s
[CNN] Early stop on fold 1 at epoch 22


[CNN] Fold 1 done in 80.6s; bestAUC=0.7563


[SETUP] Weights already exist at torch_cache/hub/checkpoints/efficientnet_b0_rwightman-3dd342df.pth
[CNN] EfficientNet-B0 loaded from local cache. Missing: ['classifier.1.weight', 'classifier.1.bias'] Unexpected: []


  scaler = torch.cuda.amp.GradScaler(enabled=(device.type=='cuda'))


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 2 Epoch 1/25 loss=1.0741 valAUC=0.4661 best=0.4661 time=13.2s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 2 Epoch 2/25 loss=1.0245 valAUC=0.5028 best=0.5028 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 2 Epoch 3/25 loss=0.9585 valAUC=0.5699 best=0.5699 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 2 Epoch 4/25 loss=0.8873 valAUC=0.5651 best=0.5699 time=2.5s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 2 Epoch 5/25 loss=0.8961 valAUC=0.6094 best=0.6094 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 2 Epoch 6/25 loss=0.8593 valAUC=0.7133 best=0.7133 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 2 Epoch 7/25 loss=0.8165 valAUC=0.7201 best=0.7201 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 2 Epoch 8/25 loss=0.7840 valAUC=0.7638 best=0.7638 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 2 Epoch 9/25 loss=0.7566 valAUC=0.7297 best=0.7638 time=2.5s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 2 Epoch 10/25 loss=0.7455 valAUC=0.7348 best=0.7638 time=2.5s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 2 Epoch 11/25 loss=0.7379 valAUC=0.7851 best=0.7851 time=2.5s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 2 Epoch 12/25 loss=0.6825 valAUC=0.7652 best=0.7851 time=2.5s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 2 Epoch 13/25 loss=0.6681 valAUC=0.8017 best=0.8017 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 2 Epoch 14/25 loss=0.6666 valAUC=0.7337 best=0.8017 time=2.5s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 2 Epoch 15/25 loss=0.6322 valAUC=0.7278 best=0.8017 time=2.5s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 2 Epoch 16/25 loss=0.6203 valAUC=0.7749 best=0.8017 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 2 Epoch 17/25 loss=0.6180 valAUC=0.7881 best=0.8017 time=2.5s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 2 Epoch 18/25 loss=0.6121 valAUC=0.7771 best=0.8017 time=2.5s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 2 Epoch 19/25 loss=0.6104 valAUC=0.7773 best=0.8017 time=2.5s
[CNN] Early stop on fold 2 at epoch 19


[CNN] Fold 2 done in 61.8s; bestAUC=0.8017


[SETUP] Weights already exist at torch_cache/hub/checkpoints/efficientnet_b0_rwightman-3dd342df.pth
[CNN] EfficientNet-B0 loaded from local cache. Missing: ['classifier.1.weight', 'classifier.1.bias'] Unexpected: []


  scaler = torch.cuda.amp.GradScaler(enabled=(device.type=='cuda'))


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 3 Epoch 1/25 loss=1.0667 valAUC=0.4787 best=0.4787 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 3 Epoch 2/25 loss=1.0258 valAUC=0.4583 best=0.4787 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 3 Epoch 3/25 loss=0.9789 valAUC=0.5585 best=0.5585 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 3 Epoch 4/25 loss=0.8953 valAUC=0.7350 best=0.7350 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 3 Epoch 5/25 loss=0.8407 valAUC=0.7814 best=0.7814 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 3 Epoch 6/25 loss=0.8269 valAUC=0.7813 best=0.7814 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 3 Epoch 7/25 loss=0.7771 valAUC=0.7875 best=0.7875 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 3 Epoch 8/25 loss=0.7720 valAUC=0.7944 best=0.7944 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 3 Epoch 9/25 loss=0.7328 valAUC=0.7960 best=0.7960 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 3 Epoch 10/25 loss=0.7310 valAUC=0.7995 best=0.7995 time=2.7s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 3 Epoch 11/25 loss=0.6667 valAUC=0.7882 best=0.7995 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 3 Epoch 12/25 loss=0.6939 valAUC=0.7287 best=0.7995 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 3 Epoch 13/25 loss=0.6449 valAUC=0.7066 best=0.7995 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 3 Epoch 14/25 loss=0.6035 valAUC=0.6808 best=0.7995 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 3 Epoch 15/25 loss=0.5900 valAUC=0.7170 best=0.7995 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 3 Epoch 16/25 loss=0.5558 valAUC=0.7881 best=0.7995 time=2.6s
[CNN] Early stop on fold 3 at epoch 16


[CNN] Fold 3 done in 44.5s; bestAUC=0.7995


[SETUP] Weights already exist at torch_cache/hub/checkpoints/efficientnet_b0_rwightman-3dd342df.pth
[CNN] EfficientNet-B0 loaded from local cache. Missing: ['classifier.1.weight', 'classifier.1.bias'] Unexpected: []


  scaler = torch.cuda.amp.GradScaler(enabled=(device.type=='cuda'))


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 4 Epoch 1/25 loss=1.0673 valAUC=0.4228 best=0.4228 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 4 Epoch 2/25 loss=1.0055 valAUC=0.3788 best=0.4228 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 4 Epoch 3/25 loss=0.9251 valAUC=0.4995 best=0.4995 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 4 Epoch 4/25 loss=0.9058 valAUC=0.5518 best=0.5518 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 4 Epoch 5/25 loss=0.8286 valAUC=0.6103 best=0.6103 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 4 Epoch 6/25 loss=0.7779 valAUC=0.6900 best=0.6900 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 4 Epoch 7/25 loss=0.7628 valAUC=0.7372 best=0.7372 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 4 Epoch 8/25 loss=0.7485 valAUC=0.7597 best=0.7597 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 4 Epoch 9/25 loss=0.6940 valAUC=0.7821 best=0.7821 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 4 Epoch 10/25 loss=0.6791 valAUC=0.7818 best=0.7821 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 4 Epoch 11/25 loss=0.6390 valAUC=0.7875 best=0.7875 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 4 Epoch 12/25 loss=0.6383 valAUC=0.7777 best=0.7875 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 4 Epoch 13/25 loss=0.6008 valAUC=0.8080 best=0.8080 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 4 Epoch 14/25 loss=0.5626 valAUC=0.8122 best=0.8122 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 4 Epoch 15/25 loss=0.5723 valAUC=0.8188 best=0.8188 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 4 Epoch 16/25 loss=0.5508 valAUC=0.7984 best=0.8188 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 4 Epoch 17/25 loss=0.5593 valAUC=0.8031 best=0.8188 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 4 Epoch 18/25 loss=0.5250 valAUC=0.8043 best=0.8188 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 4 Epoch 19/25 loss=0.5524 valAUC=0.8084 best=0.8188 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 4 Epoch 20/25 loss=0.5207 valAUC=0.8017 best=0.8188 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 4 Epoch 21/25 loss=0.5296 valAUC=0.8030 best=0.8188 time=2.6s
[CNN] Early stop on fold 4 at epoch 21


[CNN] Fold 4 done in 57.2s; bestAUC=0.8188


[SETUP] Weights already exist at torch_cache/hub/checkpoints/efficientnet_b0_rwightman-3dd342df.pth
[CNN] EfficientNet-B0 loaded from local cache. Missing: ['classifier.1.weight', 'classifier.1.bias'] Unexpected: []


  scaler = torch.cuda.amp.GradScaler(enabled=(device.type=='cuda'))


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 5 Epoch 1/25 loss=1.0579 valAUC=0.4799 best=0.4799 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 5 Epoch 2/25 loss=1.0178 valAUC=0.4199 best=0.4799 time=2.5s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 5 Epoch 3/25 loss=0.9145 valAUC=0.5146 best=0.5146 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 5 Epoch 4/25 loss=0.8652 valAUC=0.6460 best=0.6460 time=2.5s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 5 Epoch 5/25 loss=0.8136 valAUC=0.7388 best=0.7388 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 5 Epoch 6/25 loss=0.8127 valAUC=0.7781 best=0.7781 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 5 Epoch 7/25 loss=0.7679 valAUC=0.7974 best=0.7974 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 5 Epoch 8/25 loss=0.7703 valAUC=0.7903 best=0.7974 time=2.5s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 5 Epoch 9/25 loss=0.7079 valAUC=0.7956 best=0.7974 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 5 Epoch 10/25 loss=0.6746 valAUC=0.8032 best=0.8032 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 5 Epoch 11/25 loss=0.6466 valAUC=0.7996 best=0.8032 time=2.5s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 5 Epoch 12/25 loss=0.6446 valAUC=0.8074 best=0.8074 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 5 Epoch 13/25 loss=0.6084 valAUC=0.8139 best=0.8139 time=2.5s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 5 Epoch 14/25 loss=0.5887 valAUC=0.8469 best=0.8469 time=2.5s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 5 Epoch 15/25 loss=0.5867 valAUC=0.8535 best=0.8535 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 5 Epoch 16/25 loss=0.5472 valAUC=0.8466 best=0.8535 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 5 Epoch 17/25 loss=0.5528 valAUC=0.8482 best=0.8535 time=2.6s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 5 Epoch 18/25 loss=0.5166 valAUC=0.8451 best=0.8535 time=2.5s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 5 Epoch 19/25 loss=0.5077 valAUC=0.8449 best=0.8535 time=2.5s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 5 Epoch 20/25 loss=0.5239 valAUC=0.8423 best=0.8535 time=2.5s


  with torch.cuda.amp.autocast(enabled=(device.type=='cuda')):


[CNN] Fold 5 Epoch 21/25 loss=0.5152 valAUC=0.8487 best=0.8535 time=2.6s
[CNN] Early stop on fold 5 at epoch 21


[CNN] Fold 5 done in 56.3s; bestAUC=0.8535


[CNN] Filtered spectrograms 5-fold OOF macro AUC: 0.7791
[CNN] Finished filtered CNN in 302.4s; OOF=0.7791
[CNN] Blended CNN(filtered) 0.7 with existing submission and saved submission.csv


In [23]:
# CNN diagnostics: environment and quick data probe
import importlib, torch, timm, os
from pathlib import Path
from PIL import Image
print('[DIAG-CNN] torch', torch.__version__, 'cuda?', torch.cuda.is_available(), 'device_count', torch.cuda.device_count())
print('[DIAG-CNN] timm version:', importlib.metadata.version('timm') if hasattr(importlib, 'metadata') else 'n/a')
try:
    print('[DIAG-CNN] CUDA device:', torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'cpu')
except Exception as e:
    print('[DIAG-CNN] CUDA device query error:', e)
from pathlib import Path
ess = Path('essential_data'); supp = Path('supplemental_data')
print('[DIAG-CNN] filtered_spectrograms exists?', (supp / 'filtered_spectrograms').exists())
import pandas as pd
df_id2fn = pd.read_csv(ess / 'rec_id2filename.txt')
id2fn = dict(zip(df_id2fn.rec_id.astype(int), df_id2fn.filename.astype(str)))
sample_rec = list(id2fn.keys())[0]
p = supp / 'filtered_spectrograms' / f"{id2fn[sample_rec]}.bmp"
print('[DIAG-CNN] sample image path:', p, 'exists?', p.exists())
try:
    img = Image.open(p).convert('L') if p.exists() else None
    print('[DIAG-CNN] image size:', img.size if img else None)
except Exception as e:
    print('[DIAG-CNN] PIL open error:', e)

[DIAG-CNN] torch 2.8.0+cu128 cuda? True device_count 1
[DIAG-CNN] timm version: 1.0.19
[DIAG-CNN] CUDA device: Tesla T4
[DIAG-CNN] filtered_spectrograms exists? True
[DIAG-CNN] sample image path: supplemental_data/filtered_spectrograms/PC1_20090606_050012_0010.bmp exists? True
[DIAG-CNN] image size: (1246, 256)


In [29]:
# Quick CNN OOF diagnostics and submission sanity
import numpy as np, pandas as pd
from sklearn.metrics import roc_auc_score
try:
    print('[CHECK] CNN OOF macro AUC:', round(float(oof_auc_cnn), 6))
    print('[CHECK] OOF shape:', getattr(oof_cnn_filt, 'shape', None), 'Test preds shape:', getattr(test_cnn_filt, 'shape', None))
    # Optional per-class AUCs
    Y = np.zeros_like(oof_cnn_filt, dtype=np.float32)
    for i, labs in enumerate(df_train_ids['labels']):
        for c in labs: Y[i, c] = 1.0
    aucs = []
    for c in range(Y.shape[1]):
        yc = Y[:, c]
        if yc.sum() > 0 and (len(yc) - yc.sum()) > 0:
            try: aucs.append(roc_auc_score(yc, oof_cnn_filt[:, c]))
            except Exception: pass
    print('[CHECK] CNN OOF per-class macro across', len(aucs), 'classes:', round(float(np.mean(aucs)), 6))
    # Submission sanity
    df_sub = pd.read_csv('submission.csv')
    print('[CHECK] submission.csv rows:', df_sub.shape, 'NaNs:', int(df_sub['Probability'].isna().sum()))
    print(df_sub.head())
except NameError as e:
    print('[CHECK] CNN variables not found:', e)

[CHECK] CNN OOF macro AUC: 0.753191
[CHECK] OOF shape: (258, 19) Test preds shape: (64, 19)
[CHECK] CNN OOF per-class macro across 19 classes: 0.753191
[CHECK] submission.csv rows: (1216, 2) NaNs: 0
    Id  Probability
0  100     0.171752
1  101     0.176060
2  102     0.166035
3  103     0.125629
4  104     0.106490


In [None]:
# CNN Model A: ConvNeXt-Tiny @ 256x768 on filtered spectrograms with warmup+cosine, strong SpecAug, per-fold OOF and TTA
import sys, subprocess, importlib, os, time, math, random, gc, urllib.request
from pathlib import Path
import numpy as np, pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from sklearn.metrics import roc_auc_score
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from PIL import Image

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
ess = Path('essential_data'); supp = Path('supplemental_data')

# Globals from earlier cells
species_df = pd.read_csv(ess / 'species_list.txt')
num_classes = species_df.shape[0]
df_id2fn = pd.read_csv(ess / 'rec_id2filename.txt')
id2fn = dict(zip(df_id2fn.rec_id.astype(int), df_id2fn.filename.astype(str)))

# Input resolution per expert guidance
IMG_H, IMG_W = 256, 768

def macro_auc(y_true, y_prob):
    aucs = []
    for c in range(y_true.shape[1]):
        yc = y_true[:, c]
        if yc.sum() > 0 and (len(yc) - yc.sum()) > 0:
            try: aucs.append(roc_auc_score(yc, y_prob[:, c]))
            except Exception: pass
    return float(np.mean(aucs)) if len(aucs) else float('nan')

def load_resized(rec_id: int, folder='filtered_spectrograms'):
    fn = id2fn.get(int(rec_id), None)
    if fn is None: return None
    p = supp / folder / f"{fn}.bmp"
    if not p.exists():
        return None
    try:
        img = Image.open(p).convert('L')
    except Exception:
        return None
    w, h = img.size
    new_w = int(round(w * (IMG_H / h)))
    img_resized = img.resize((new_w, IMG_H), Image.BILINEAR)
    return img_resized  # grayscale, height fixed, width variable

class SpecAugment:
    def __init__(self, time_masks=3, time_max=120, freq_masks=2, freq_max=60):
        self.time_masks=time_masks; self.time_max=time_max; self.freq_masks=freq_masks; self.freq_max=freq_max
    def __call__(self, x):
        C,H,W = x.shape
        for _ in range(self.time_masks):
            w = random.randint(0, self.time_max)
            if w>0:
                t0 = random.randint(0, max(0, W - w))
                x[:, :, t0:t0+w] = 0.0
        for _ in range(self.freq_masks):
            h = random.randint(0, self.freq_max)
            if h>0:
                f0 = random.randint(0, max(0, H - h))
                x[:, f0:f0+h, :] = 0.0
        return x

class SpectrogramDatasetCnxt(Dataset):
    def __init__(self, rec_ids, labels=None, folder='filtered_spectrograms', train=True):
        self.rec_ids = list(rec_ids); self.labels = labels
        self.folder=folder; self.train=train
        self.to_tensor = transforms.ToTensor()
        self.specaug = SpecAugment()
        self.norm = transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225])
        self.roll_px = int(0.15 * IMG_W) if train else 0
    def __len__(self):
        return len(self.rec_ids)
    def _time_crop(self, img_resized):
        # img_resized: grayscale PIL with H=IMG_H, width >= 1
        w = img_resized.size[0]
        if w >= IMG_W:
            if self.train:
                start = random.randint(0, w - IMG_W)
            else:
                start = (w - IMG_W)//2
            crop = img_resized.crop((start, 0, start+IMG_W, IMG_H))
        else:
            pad_left = (IMG_W - w)//2
            pad_right = IMG_W - w - pad_left
            canvas = Image.new('L', (IMG_W, IMG_H), color=0)
            canvas.paste(img_resized, (pad_left, 0))
            crop = canvas
        img3 = Image.merge('RGB', (crop, crop, crop))
        return img3
    def __getitem__(self, idx):
        rid = int(self.rec_ids[idx])
        img_resized = load_resized(rid, self.folder)
        if img_resized is None:
            alt = load_resized(rid, 'spectrograms')
            img_resized = alt if alt is not None else Image.new('L', (IMG_W, IMG_H), color=0)
        img = self._time_crop(img_resized)
        x = self.to_tensor(img)
        if self.train:
            if self.roll_px>0 and random.random()<0.8:
                shift = random.randint(-self.roll_px, self.roll_px)
                if shift!=0: x = torch.roll(x, shifts=shift, dims=2)
            x = self.specaug(x)
        x = self.norm(x)
        if self.labels is None:
            return x, rid
        y = torch.zeros(num_classes, dtype=torch.float32)
        for c in self.labels[idx]: y[c] = 1.0
        return x, y

# Offline weights for ConvNeXt-Tiny
CACHE_DIR = Path('./torch_cache'); os.environ['TORCH_HOME'] = str(CACHE_DIR.resolve())
CKPT_DIR = CACHE_DIR / 'hub' / 'checkpoints'; CKPT_DIR.mkdir(parents=True, exist_ok=True)
CNXT_URL = 'https://download.pytorch.org/models/convnext_tiny-983f1562.pth'
CNXT_FILE = CKPT_DIR / 'convnext_tiny-983f1562.pth'

def ensure_file(url, path: Path):
    if not path.exists():
        print(f"[SETUP] Downloading weights to {path}")
        urllib.request.urlretrieve(url, path)
    else:
        print(f"[SETUP] Weights already exist at {path}")

def create_convnext_tiny_offline(num_classes):
    ensure_file(CNXT_URL, CNXT_FILE)
    # Build convnext_tiny manually to avoid internet
    from torchvision.models.convnext import convnext_tiny, ConvNeXt_Tiny_Weights
    model = convnext_tiny(weights=None)
    sd = torch.load(CNXT_FILE, map_location='cpu')
    missing, unexpected = model.load_state_dict(sd, strict=False)
    in_features = model.classifier[2].in_features
    model.classifier[2] = nn.Linear(in_features, num_classes)
    print('[CNN-A] ConvNeXt-Tiny loaded. Missing:', missing, 'Unexpected:', unexpected)
    return model

def linear_warmup_cosine(optimizer, warmup_steps, total_steps, min_lr=1e-6):
    def lr_lambda(step):
        if step < warmup_steps:
            return (step + 1) / max(1, warmup_steps)
        progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
        return 0.5 * (1.0 + math.cos(math.pi * progress))
    return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

def train_convnext_filtered(seed=42, batch_size=24, max_epochs=35, patience=9, base_lr=1e-4, wd=1e-2):
    torch.manual_seed(seed); np.random.seed(seed); random.seed(seed)
    train_df = df_train_ids[['rec_id','labels']].copy().reset_index(drop=True)
    rec_ids = train_df['rec_id'].tolist()
    labels = train_df['labels'].tolist()
    Y = np.zeros((len(labels), num_classes), dtype=np.float32)
    for i,labs in enumerate(labels):
        for c in labs: Y[i,c]=1.0
    mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    oof = np.zeros_like(Y, dtype=np.float32)
    test_rec_sorted = sorted(df_test_ids['rec_id'].tolist())
    test_ds = SpectrogramDatasetCnxt(test_rec_sorted, labels=None, folder='filtered_spectrograms', train=False)
    test_logits_accum = np.zeros((len(test_ds), num_classes), dtype=np.float32)

    for fold, (trn_idx, val_idx) in enumerate(mskf.split(np.arange(len(rec_ids)), Y), 1):
        t_fold = time.time()
        trn_ids = [rec_ids[i] for i in trn_idx]; trn_labels = [labels[i] for i in trn_idx]
        val_ids = [rec_ids[i] for i in val_idx]; val_labels = [labels[i] for i in val_idx]
        train_ds = SpectrogramDatasetCnxt(trn_ids, trn_labels, train=True)
        val_ds = SpectrogramDatasetCnxt(val_ids, val_labels, train=False)
        train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=True)
        val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=True)

        model = create_convnext_tiny_offline(num_classes).to(device)
        optimizer = torch.optim.AdamW(model.parameters(), lr=base_lr, weight_decay=wd)
        total_steps = max_epochs * max(1, math.ceil(len(train_loader)))
        warmup_steps = max(1, int(0.05 * total_steps))  # ~1-2 epochs equivalent
        scheduler = linear_warmup_cosine(optimizer, warmup_steps, total_steps)
        scaler = torch.amp.GradScaler('cuda', enabled=(device.type=='cuda'))

        # pos_weight cap at 12
        y_tr = np.zeros((len(trn_labels), num_classes), dtype=np.float32)
        for i,labs in enumerate(trn_labels):
            for c in labs: y_tr[i,c]=1.0
        pos = y_tr.sum(axis=0); neg = (y_tr.shape[0]-pos)
        pos_weight = np.clip(np.divide(neg, np.clip(pos, 1.0, None)), 1.0, 12.0)
        criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(pos_weight, dtype=torch.float32, device=device))

        best_auc = -1.0; best_state = None; no_imp = 0; step=0
        for epoch in range(1, max_epochs+1):
            t0 = time.time(); model.train(); running=0.0; nb=0
            for xb, yb in train_loader:
                xb = xb.to(device, non_blocking=True); yb = yb.to(device, non_blocking=True)
                optimizer.zero_grad(set_to_none=True)
                with torch.amp.autocast('cuda', enabled=(device.type=='cuda')):
                    logits = model(xb)
                    loss = criterion(logits, yb)
                scaler.scale(loss).backward()
                scaler.step(optimizer); scaler.update()
                scheduler.step(); step+=1; running += float(loss.item()); nb+=1
            # validate
            model.eval(); v_logits=[]; v_targets=[]
            with torch.no_grad():
                for xb, yb in val_loader:
                    xb = xb.to(device, non_blocking=True)
                    v = model(xb).detach().cpu().float().numpy()
                    v_logits.append(v); v_targets.append(yb.numpy())
            v_logits = np.concatenate(v_logits, 0); v_targets = np.concatenate(v_targets, 0)
            v_probs = 1.0 / (1.0 + np.exp(-v_logits))
            v_auc = macro_auc(v_targets, v_probs)
            if v_auc > best_auc + 1e-4:
                best_auc = v_auc; no_imp = 0
                best_state = {k:v.detach().cpu().clone() for k,v in model.state_dict().items()}
            else:
                no_imp += 1
            print(f"[CNN-A] Fold {fold} Epoch {epoch}/{max_epochs} loss={running/max(1,nb):.4f} valAUC={v_auc:.4f} best={best_auc:.4f} time={time.time()-t0:.1f}s")
            if no_imp >= patience:
                print(f"[CNN-A] Early stop on fold {fold} at epoch {epoch}")
                break

        if best_state is not None:
            model.load_state_dict(best_state)

        # OOF for this fold
        val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=True)
        outs=[]
        with torch.no_grad():
            for xb, yb in val_loader:
                xb = xb.to(device, non_blocking=True)
                outs.append(model(xb).detach().cpu().float().numpy())
        outs = np.concatenate(outs, 0)
        oof[val_idx] = 1.0/(1.0+np.exp(-outs))

        # TTA logits average with 5 time shifts
        def tta_test_logits():
            preds = np.zeros((len(test_ds), num_classes), dtype=np.float32)
            shifts = [-0.2, -0.1, 0.0, 0.1, 0.2]
            with torch.no_grad():
                for sh in shifts:
                    cur_batch=[]; cur_idx=[]
                    for i in range(len(test_ds)):
                        # manual fetch to apply roll
                        rid = test_ds.rec_ids[i]
                        img_resized = load_resized(rid, 'filtered_spectrograms')
                        if img_resized is None:
                            alt = load_resized(rid, 'spectrograms')
                            img_resized = alt if alt is not None else Image.new('L', (IMG_W, IMG_H), color=0)
                        img = test_ds._time_crop(img_resized)
                        x = transforms.ToTensor()(img)
                        px = int(sh * IMG_W)
                        if px != 0:
                            x = torch.roll(x, shifts=px, dims=2)
                        x = test_ds.norm(x)
                        cur_batch.append(x); cur_idx.append(i)
                        if len(cur_batch) == batch_size or i == len(test_ds)-1:
                            xb = torch.stack(cur_batch,0).to(device)
                            logits = model(xb).detach().cpu().float().numpy()
                            preds[cur_idx[0]:cur_idx[0]+len(cur_batch)] += logits
                            cur_batch=[]; cur_idx=[]
            preds /= len(shifts)
            return 1.0/(1.0+np.exp(-preds))

        tp = tta_test_logits()
        test_logits_accum += tp.astype(np.float32)
        print(f"[CNN-A] Fold {fold} done in {time.time()-t_fold:.1f}s; bestAUC={best_auc:.4f}")
        del model; gc.collect(); torch.cuda.empty_cache()

    test_preds = test_logits_accum / 5.0
    oof_auc = macro_auc(Y, oof)
    print(f"[CNN-A] ConvNeXt-Tiny filtered 5-fold OOF macro AUC: {oof_auc:.4f}")
    id2prob = {}
    for i, rid in enumerate(test_ds.rec_ids):
        for c in range(num_classes):
            Id = int(rid) * 100 + c
            id2prob[Id] = float(test_preds[i, c])
    return oof, test_preds, id2prob, oof_auc

t0 = time.time()
oof_cnxt_filt, test_cnxt_filt, id2prob_cnxt_filt, oof_auc_cnxt = train_convnext_filtered(seed=42, batch_size=24, max_epochs=35, patience=9, base_lr=1e-4, wd=1e-2)
print(f"[CNN-A] Finished ConvNeXt-Tiny filtered in {time.time()-t0:.1f}s; OOF={oof_auc_cnxt:.4f}")

# Blend this model into submission conservatively (logit/prob avg) to keep LB stable; will switch to per-class later
df_base = pd.read_csv('sample_submission.csv')
cnxt_series = df_base['Id'].map(id2prob_cnxt_filt).astype(float)
if Path('submission.csv').exists():
    df_prev = pd.read_csv('submission.csv')
    prev = df_prev['Probability'].astype(float)
    df_prev['Probability'] = (0.7 * cnxt_series.fillna(prev) + 0.3 * prev).fillna(0.05)
    df_prev.to_csv('submission.csv', index=False)
    print('[CNN-A] Blended ConvNeXt-Tiny(filtered) 0.7 with existing submission and saved submission.csv')
else:
    out = df_base.copy(); out['Probability'] = cnxt_series.fillna(0.05); out.to_csv('submission.csv', index=False)
    print('[CNN-A] Saved ConvNeXt-Tiny(filtered)-only submission.csv')

[SETUP] Weights already exist at torch_cache/hub/checkpoints/convnext_tiny-983f1562.pth


[CNN-A] ConvNeXt-Tiny loaded. Missing: [] Unexpected: []
