# MLSP 2013 Bird Classification – Working Notebook

## Plan
- Understand data artifacts present (audio, spectrograms, mappings, folds, species list, sample submission).
- Identify train vs test split and locate labels.
- Establish a fast baseline using image models on provided spectrograms (multi-label, ROC-AUC).
- Use CV folds (CVfolds_2.txt) for validation and early stopping.
- Iterate on feature extraction and modeling (e.g., CNN on BMP spectrograms, LightGBM on segment_features).
- Ensembling: combine CNN logits with tabular segment feature model.
- Generate predictions for all test files and save to submission.csv.
- Track experiments and scores; stop when bronze/silver is achieved.

## Experiment Log
- 00:00 Init: Explore files, find labels, define data pipeline.
- TODO: Baseline model + CV AUC.
- TODO: Improve model/augmentations/ensembling.

## Notes
- Metric: ROC-AUC (macro over species).
- Multi-label problem; outputs are probabilities per species.

In [1]:
import os, glob, time, re, sys, json, textwrap
from pathlib import Path
import pandas as pd
import numpy as np

t0 = time.time()
base = Path.cwd()
print(f"CWD: {base}")

def list_dir(p):
    p = Path(p)
    items = sorted(p.iterdir()) if p.exists() else []
    print(f"\n== {p} ({len(items)} items) ==")
    for q in items[:40]:
        try:
            print(q, f"[{q.stat().st_size//1024} KB]")
        except Exception:
            print(q)
    if len(items) > 40:
        print(f"... (+{len(items)-40} more)")

list_dir('essential_data')
list_dir('supplemental_data')
list_dir('supplemental_data/spectrograms')
list_dir('supplemental_data/filtered_spectrograms')

# Load species list
sp_path = base / 'essential_data' / 'species_list.txt'
species = []
if sp_path.exists():
    with open(sp_path, 'r') as f:
        for line in f:
            s = line.strip()
            if s:
                species.append(s)
print(f"\nSpecies count: {len(species)}")
print("First 10 species:", species[:10])

# Load sample submission to confirm column order/ids
sub_path = base / 'sample_submission.csv'
if sub_path.exists():
    sub_df = pd.read_csv(sub_path)
    print(f"\nsample_submission shape: {sub_df.shape}")
    print("sample_submission columns (first 10):", list(sub_df.columns[:10]))
    print(sub_df.head(3))
else:
    print("sample_submission.csv not found")

# Read rec_id2filename mapping
map_path = base / 'essential_data' / 'rec_id2filename.txt'
rec_map = None
if map_path.exists():
    try:
        rec_map = pd.read_csv(map_path, sep='\s+|,|\t', header=None, engine='python')
        # try to infer columns
        if rec_map.shape[1] >= 2:
            rec_map = rec_map.iloc[:, :2]
            rec_map.columns = ['rec_id', 'filename']
        print(f"\nrec_id2filename loaded: {rec_map.shape}")
        print(rec_map.head())
    except Exception as e:
        print("Failed to parse rec_id2filename.txt:", e)
else:
    print("rec_id2filename.txt not found")

# Read CV folds
cv_path = base / 'essential_data' / 'CVfolds_2.txt'
cv_df = None
if cv_path.exists():
    try:
        cv_df = pd.read_csv(cv_path, sep='\s+|,|\t', header=None, engine='python')
        # typical format: rec_id fold
        if cv_df.shape[1] >= 2:
            cv_df = cv_df.iloc[:, :2]
            cv_df.columns = ['rec_id', 'fold']
        print(f"\nCVfolds loaded: {cv_df.shape}")
        print(cv_df['fold'].value_counts().sort_index())
        print(cv_df.head())
    except Exception as e:
        print("Failed to parse CVfolds_2.txt:", e)
else:
    print("CVfolds_2.txt not found")

# Locate training labels file
cand_label_files = []
for pat in [
    'essential_data/*label*.txt',
    'essential_data/*labels*.txt',
    'essential_data/*train*.txt',
    'essential_data/*_labels.*',
]:
    cand_label_files.extend(glob.glob(pat))
cand_label_files = sorted(set(cand_label_files))
print("\nCandidate label files:")
for p in cand_label_files:
    print(" -", p)

# Exclude known hidden test labels file
label_path = None
for p in cand_label_files:
    if 'test_hidden' in p.lower():
        continue
    label_path = Path(p)
    break

labels_df = None
if label_path and label_path.exists():
    try:
        # Attempt flexible parsing: rec_id then multi-hot labels or species names
        tmp = pd.read_csv(label_path, sep='\s+|,|\t', header=None, engine='python')
        print(f"\nLoaded labels from {label_path} shape={tmp.shape}")
        print(tmp.head())
        labels_df = tmp
    except Exception as e:
        print("Failed to parse labels:", e)
else:
    print("\nNo explicit train labels file found yet (excluding hidden test). We'll need to infer or locate it.")

# Count spectrogram images
spec_paths = sorted(glob.glob('supplemental_data/filtered_spectrograms/*.bmp'))
raw_spec_paths = sorted(glob.glob('supplemental_data/spectrograms/*.bmp'))
print(f"\nFiltered spectrograms: {len(spec_paths)} | Raw spectrograms: {len(raw_spec_paths)}")
print("Example filtered spectrograms:", spec_paths[:5])

print(f"\nElapsed: {time.time()-t0:.2f}s")

CWD: /app/agent_run_states/mlsp-2013-birds-spray-20250908-000706

== essential_data (5 items) ==
essential_data/CVfolds_2.txt [1 KB]
essential_data/rec_id2filename.txt [9 KB]
essential_data/rec_labels_test_hidden.txt [1 KB]
essential_data/species_list.txt [0 KB]
essential_data/src_wavs [20 KB]

== supplemental_data (9 items) ==
supplemental_data/filtered_spectrograms [20 KB]
supplemental_data/histogram_of_segments.txt [284 KB]
supplemental_data/segment_clusters.bmp [15731 KB]
supplemental_data/segment_features.txt [394 KB]
supplemental_data/segment_mosaic.bmp [7116 KB]
supplemental_data/segment_rectangles.txt [24 KB]
supplemental_data/segmentation_examples [4 KB]
supplemental_data/spectrograms [20 KB]
supplemental_data/supervised_segmentation [20 KB]

== supplemental_data/spectrograms (322 items) ==
supplemental_data/spectrograms/PC10_20090513_054500_0020.bmp [313 KB]
supplemental_data/spectrograms/PC10_20090513_054500_0030.bmp [313 KB]
supplemental_data/spectrograms/PC10_20090513_0545

In [2]:
from pprint import pprint
import itertools

def peek_file(path, n=10):
    p = Path(path)
    print(f"\n--- {p} ---")
    if not p.exists():
        print("Missing")
        return
    try:
        with open(p, 'r', errors='ignore') as f:
            for i, line in zip(range(n), f):
                print(line.rstrip())
    except Exception as e:
        print("Error reading:", e)

# Properly parse species_list as CSV
sp_csv = pd.read_csv(sp_path) if sp_path.exists() else None
print('\nSpecies CSV parsed:' if sp_csv is not None else 'Species CSV missing',
      sp_csv.shape if sp_csv is not None else '')
if sp_csv is not None:
    print(sp_csv.head())

# Peek key text feature files
peek_file('supplemental_data/histogram_of_segments.txt', 15)
peek_file('supplemental_data/segment_features.txt', 15)
peek_file('supplemental_data/segment_rectangles.txt', 15)
peek_file('essential_data/rec_labels_test_hidden.txt', 20)
peek_file('essential_data/rec_id2filename.txt', 20)
peek_file('essential_data/CVfolds_2.txt', 20)

# Search repository for any file likely containing train labels
cands = []
for pat in ['**/*.txt', '**/*.csv']:
    cands.extend(glob.glob(pat, recursive=True))
label_like = [p for p in cands if re.search(r'label|train|truth|target', Path(p).name, re.I)]
print("\nLabel-like files found:")
for p in sorted(set(label_like)):
    print(' -', p)

# Try to infer if histogram_of_segments has per-rec rows and can be merged with labels later
try:
    hos = pd.read_csv('supplemental_data/histogram_of_segments.txt', sep='\s+|,|\t', engine='python', header=None)
    print('\nhistogram_of_segments shape:', hos.shape)
    print(hos.head())
except Exception as e:
    print('Failed to load histogram_of_segments:', e)

try:
    segf = pd.read_csv('supplemental_data/segment_features.txt', sep='\s+|,|\t', engine='python', header=None)
    print('segment_features shape:', segf.shape)
    print(segf.head())
except Exception as e:
    print('Failed to load segment_features:', e)

print('\nDone peeking.')


Species CSV parsed: (19, 3)
   class_id  code                   species
0         0  BRCR             Brown Creeper
1         1  PAWR              Pacific Wren
2         2  PSFL  Pacific-slope Flycatcher
3         3  RBNU     Red-breasted Nuthatch
4         4  DEJU           Dark-eyed Junco

--- supplemental_data/histogram_of_segments.txt ---
rec_id,[histogram of segment features]
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.076923,0.000000,0.000000,0.000000,0.000000,0.000000,0.076923,0.000000,0.076923,0.000000,0.076923,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.153846,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.076923,0

In [5]:
# Build master dataframe, parse labels, features, and train a fast OVR logistic baseline
import time
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

start = time.time()

# Parse species list correctly (19 classes)
sp_csv = pd.read_csv('essential_data/species_list.txt')
num_classes = sp_csv.shape[0]
print('Num classes:', num_classes)

# Parse folds and mapping (CSV with header)
cv_df = pd.read_csv('essential_data/CVfolds_2.txt')
rec_map = pd.read_csv('essential_data/rec_id2filename.txt')

# Parse labels: use rec_labels_test_hidden.txt: labels available for train (fold==0), '?' for test (fold==1)
lab_rows = []
with open('essential_data/rec_labels_test_hidden.txt', 'r') as f:
    header = next(f, None)
    for line in f:
        line = line.strip()
        if not line:
            continue
        parts = [p for p in line.split(',')]
        parts = [p for p in parts if p is not None and p != '']
        try:
            rec_id = int(parts[0])
        except Exception:
            continue
        labels = parts[1:] if len(parts) > 1 else []
        lab_rows.append((rec_id, labels))

# Convert to multi-hot vector per rec_id
# - If labels contain '?': unknown, skip (test only)
# - If labels empty: valid negative example (all zeros)
label_mat = {}
for rec_id, labels in lab_rows:
    if any(l.strip() == '?' for l in labels):
        continue  # unknown labels, likely test set
    y = np.zeros(num_classes, dtype=np.int8)
    for l in labels:
        if l.strip()=='' or l.strip()=='?':
            continue
        cid = int(l)
        if 0 <= cid < num_classes:
            y[cid] = 1
    label_mat[rec_id] = y
print('Parsed label rows (incl. all-zero):', len(label_mat))

# Parse histogram_of_segments.txt manually (robust to formatting quirks)
hist_rows = []
with open('supplemental_data/histogram_of_segments.txt', 'r') as f:
    header = next(f, None)  # skip header
    for line in f:
        line = line.strip()
        if not line:
            continue
        parts = [p for p in line.split(',') if p!='']
        try:
            rid = int(parts[0])
        except Exception:
            continue
        vals = []
        for v in parts[1:]:
            try:
                vals.append(float(v))
            except Exception:
                vals.append(0.0)
        hist_rows.append((rid, vals))
max_len = max((len(v) for _, v in hist_rows), default=0)
data = np.zeros((len(hist_rows), max_len), dtype=np.float32)
rec_ids_hist = np.zeros((len(hist_rows),), dtype=int)
for i, (rid, vals) in enumerate(hist_rows):
    rec_ids_hist[i] = rid
    L = min(max_len, len(vals))
    if L > 0:
        data[i, :L] = np.array(vals[:L], dtype=np.float32)
hos = pd.DataFrame(data)
hos.insert(0, 'rec_id', rec_ids_hist)
print('Histogram features shape (manual):', hos.shape)

# Merge master frame
master = cv_df.merge(rec_map, on='rec_id', how='left').merge(hos, on='rec_id', how='left')
print('Master shape:', master.shape)

# Build labels aligned to master
Y = np.full((master.shape[0], num_classes), np.nan, dtype=float)
known_mask = np.zeros(master.shape[0], dtype=bool)
for i, rid in enumerate(master['rec_id'].values):
    if rid in label_mat:
        Y[i, :] = label_mat[rid]
        known_mask[i] = True
print('Known label rows:', int(known_mask.sum()))

# Split indices
feature_cols = [c for c in hos.columns if c != 'rec_id']
is_train_fold = (master['fold'] == 0)
is_test_fold = (master['fold'] == 1)
train_idx = is_train_fold & known_mask
test_idx = is_test_fold
X = master.loc[train_idx, feature_cols].to_numpy(dtype=np.float32)
Y_train = Y[train_idx, :].astype(np.float32)
X_test = master.loc[test_idx, feature_cols].to_numpy(dtype=np.float32)
rec_ids_test = master.loc[test_idx, 'rec_id'].values.astype(int)
print('Train X/Y:', X.shape, Y_train.shape, '| Test X:', X_test.shape, 'Test recs:', len(rec_ids_test))

# OVR Logistic Regression baseline with KFold OOF
kf = KFold(n_splits=5, shuffle=True, random_state=42)
n_train = X.shape[0]
oof = np.zeros((n_train, num_classes), dtype=np.float32)
test_pred = np.zeros((X_test.shape[0], num_classes), dtype=np.float32)
per_class_auc = []

for c in range(num_classes):
    y = Y_train[:, c].astype(int)
    # If all zeros or ones, AUC undefined; skip OOF and predict zeros
    if y.sum() == 0 or y.sum() == y.shape[0]:
        per_class_auc.append(np.nan)
        continue
    cls_oof = np.zeros(n_train, dtype=np.float32)
    cls_test = np.zeros(X_test.shape[0], dtype=np.float32)
    fold_no = 0
    for tr_idx, va_idx in kf.split(X):
        fold_no += 1
        X_tr, X_va = X[tr_idx], X[va_idx]
        y_tr, y_va = y[tr_idx], y[va_idx]
        # Balanced logistic regression for imbalance
        base_lr = LogisticRegression(max_iter=2000, solver='liblinear', class_weight='balanced', n_jobs=-1)
        model = base_lr.fit(X_tr, y_tr)
        preds_va = model.predict_proba(X_va)[:, 1]
        preds_te = model.predict_proba(X_test)[:, 1]
        cls_oof[va_idx] = preds_va.astype(np.float32)
        cls_test += preds_te.astype(np.float32) / kf.get_n_splits()
    try:
        auc_c = roc_auc_score(y, cls_oof)
    except Exception:
        auc_c = np.nan
    per_class_auc.append(auc_c)
    oof[:, c] = cls_oof
    test_pred[:, c] = cls_test
    print(f"Class {c:02d} AUC: {auc_c}")

valid_aucs = [a for a in per_class_auc if not np.isnan(a)]
macro_auc = float(np.mean(valid_aucs)) if valid_aucs else float('nan')
print(f"\nMacro AUC (OOF over train): {macro_auc:.5f}")

# Build submission in the exact order of sample_submission.csv (Id = rec_id*100 + class_id) for test set (fold==1)
sub = pd.read_csv('sample_submission.csv')
id_vals = sub['Id'].values.astype(int)
prob = np.zeros_like(id_vals, dtype=np.float32)
rid_to_row = {rid: i for i, rid in enumerate(rec_ids_test)}
for i, Id in enumerate(id_vals):
    rid = Id // 100
    cid = Id % 100
    if cid >= num_classes:
        p = 0.0
    else:
        row = rid_to_row.get(rid, None)
        p = float(test_pred[row, cid]) if row is not None else 0.0
    prob[i] = np.clip(p, 1e-5, 1-1e-5)
sub['Probability'] = prob
sub.to_csv('submission.csv', index=False)
print('Saved submission.csv with shape:', sub.shape)
print('Elapsed total: %.2fs' % (time.time()-start))

Num classes: 19
Parsed label rows (incl. all-zero): 258
Histogram features shape (manual): (322, 101)
Master shape: (322, 103)
Known label rows: 258
Train X/Y: (258, 100) (258, 19) | Test X: (64, 100) Test recs: 64
Class 00 AUC: 0.9339783722253842
Class 01 AUC: 0.8362577639751553
Class 02 AUC: 0.727813256991852
Class 03 AUC: 0.343503937007874
Class 04 AUC: 0.393128067826863
Class 05 AUC: 0.5407114624505929
Class 06 AUC: 0.5638625853336269
Class 07 AUC: 0.47346399471482054
Class 08 AUC: 0.8497150997150997
Class 09 AUC: 0.6912798874824191
Class 10 AUC: 0.7194121667805878
Class 11 AUC: 0.8449596774193548
Class 12 AUC: 0.839769647696477
Class 13 AUC: 0.8720472440944882
Class 14 AUC: 0.6547004132231405
Class 15 AUC: 0.7493386243386243
Class 16 AUC: 0.126953125
Class 17 AUC: 0.6766732283464567


Class 18 AUC: 0.7384823848238482

Macro AUC (OOF over train): 0.66190
Saved submission.csv with shape: (1216, 2)
Elapsed total: 0.22s




In [33]:
# Strong tabular model: aggregate segment_features + rectangles, GroupKFold by station, OVR LightGBM (robust + periodic saves)
import numpy as np, pandas as pd, time, re, glob, sys, os
from pathlib import Path
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GroupKFold

t0 = time.time()

# 1) Load core metadata
sp_df = pd.read_csv('essential_data/species_list.txt')
num_classes = sp_df.shape[0]
cv_df = pd.read_csv('essential_data/CVfolds_2.txt')
rec_map = pd.read_csv('essential_data/rec_id2filename.txt')

# station group from filename prefix before first '_'
rec_map['station'] = rec_map['filename'].str.split('_').str[0]
meta = cv_df.merge(rec_map, on='rec_id', how='left')

# 2) Parse labels (skip '?' lines; empty -> all-zero negatives)
label_rows = []
with open('essential_data/rec_labels_test_hidden.txt', 'r') as f:
    _ = next(f, None)
    for line in f:
        line = line.strip()
        if not line: continue
        parts = [p for p in line.split(',') if p!='']
        try:
            rid = int(parts[0])
        except:
            continue
        labels = parts[1:] if len(parts)>1 else []
        if any(p.strip()=='?' for p in labels):
            continue
        y = np.zeros(num_classes, dtype=np.int8)
        for p in labels:
            if p.strip()=='' or p.strip()=='?':
                continue
            cid = int(p)
            if 0 <= cid < num_classes: y[cid]=1
        label_rows.append((rid, y))
label_map = {rid:y for rid,y in label_rows}
print('Labelled rec_ids:', len(label_map))

# 3) Parse segment_features.txt (rec_id, seg_idx, <numerics...>)
seg_records = []
with open('supplemental_data/segment_features.txt', 'r') as f:
    _ = next(f, None)  # header
    for line in f:
        s = line.strip()
        if not s: continue
        parts = [p for p in s.split(',') if p!='']
        if len(parts) < 3: continue
        try:
            rid = int(parts[0]); seg = int(parts[1])
        except:
            continue
        vals = []
        for v in parts[2:]:
            try: vals.append(float(v))
            except: vals.append(0.0)
        seg_records.append((rid, seg, vals))
max_len_feat = max((len(v) for _,_,v in seg_records), default=0)
sf_cols = [f'sf_{i}' for i in range(max_len_feat)]
sf_df = pd.DataFrame([([rid,seg]+v+[0.0]*(max_len_feat-len(v))) for rid,seg,v in seg_records],
                     columns=['rec_id','seg_idx']+sf_cols)
print('segment_features parsed:', sf_df.shape)

# 4) Parse segment_rectangles.txt (rec_id, seg_idx, t_start, t_end, f_start, f_end, [trailing comma])
rect_rows = []
with open('supplemental_data/segment_rectangles.txt', 'r') as f:
    _ = next(f, None)  # header (ignore text)
    for line in f:
        s = line.strip().strip(',')
        if not s: continue
        parts = [p for p in s.split(',') if p!='']
        if len(parts) < 6: continue
        try:
            rid = int(parts[0]); seg = int(parts[1])
            t0r = float(parts[2]); t1r = float(parts[3]); f0r = float(parts[4]); f1r = float(parts[5])
        except:
            continue
        rect_rows.append((rid, seg, t0r, t1r, f0r, f1r))
rect_df = pd.DataFrame(rect_rows, columns=['rec_id','seg_idx','t_start','t_end','f_start','f_end'])
if not rect_df.empty:
    rect_df['duration'] = rect_df['t_end'] - rect_df['t_start']
    rect_df['freq_span'] = rect_df['f_end'] - rect_df['f_start']
print('segment_rectangles parsed:', rect_df.shape)

# 5) Merge per-segment features and aggregate per rec_id
seg_full = sf_df.merge(rect_df, on=['rec_id','seg_idx'], how='left') if not rect_df.empty else sf_df.copy()
seg_full['segment_count'] = 1

num_cols = [c for c in seg_full.columns if c not in ['rec_id','seg_idx']]
def q10(x): return x.quantile(0.10)
def q90(x): return x.quantile(0.90)
agg_funcs = ['mean','std','min','max','median','skew']
agg_dict = {c: agg_funcs + [q10, q90] for c in num_cols}

gb = seg_full.groupby('rec_id').agg(agg_dict)
# Flatten columns
gb.columns = ['%s_%s' % (c[0], c[1] if isinstance(c[1], str) else ('q10' if c[1]==q10 else 'q90')) for c in gb.columns.to_flat_index()]
gb = gb.reset_index()

# Add simple counts
gb['segment_count_total'] = seg_full.groupby('rec_id')['segment_count'].sum().values
print('Aggregated per-rec features:', gb.shape)

# 6) Build training/test matrices with GroupKFold on station within fold==0
feat_df = meta.merge(gb, on='rec_id', how='left')
feat_df = feat_df.fillna(0.0)
known_mask = feat_df['rec_id'].isin(label_map.keys())
train_mask = (feat_df['fold']==0) & known_mask
test_mask = (feat_df['fold']==1)
feature_cols = [c for c in feat_df.columns if c not in ['rec_id','fold','filename','station']]
X = feat_df.loc[train_mask, feature_cols].to_numpy(np.float32)
groups = feat_df.loc[train_mask, 'station'].astype(str).values
rec_train = feat_df.loc[train_mask, 'rec_id'].values.astype(int)
X_test = feat_df.loc[test_mask, feature_cols].to_numpy(np.float32)
rec_test = feat_df.loc[test_mask, 'rec_id'].values.astype(int)
Y_train = np.vstack([label_map[int(r)] for r in rec_train]).astype(np.int8)
print('Train X/Y:', X.shape, Y_train.shape, '| Test X:', X_test.shape)
print('Stations in train:', np.unique(groups))

# Save id order immediately for strict alignment
np.save('tab_strong_train_ids.npy', rec_train.astype(int))
np.save('tab_strong_test_ids.npy', rec_test.astype(int))

# 7) OVR LightGBM with GroupKFold (robust + periodic saves)
try:
    import lightgbm as lgb
    from lightgbm import LGBMClassifier
except Exception:
    import subprocess
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'lightgbm'])
    import lightgbm as lgb
    from lightgbm import LGBMClassifier

gkf = GroupKFold(n_splits=5)
n_train = X.shape[0]
oof = np.zeros((n_train, num_classes), dtype=np.float32)
test_pred = np.zeros((X_test.shape[0], num_classes), dtype=np.float32)
per_auc = []

for c in range(num_classes):
    y = Y_train[:, c].astype(int)
    pos_total = int(y.sum())
    print(f"Class {c:02d} start | positives={pos_total} of {y.shape[0]}"); sys.stdout.flush()
    if y.sum()==0 or y.sum()==y.shape[0]:
        per_auc.append(np.nan)
        continue
    cls_oof = np.zeros(n_train, dtype=np.float32)
    cls_test = np.zeros(X_test.shape[0], dtype=np.float32)
    fold_id = 0
    for tr_idx, va_idx in gkf.split(X, y, groups):
        fold_id += 1
        X_tr, X_va = X[tr_idx], X[va_idx]
        y_tr, y_va = y[tr_idx], y[va_idx]
        pos = int(y_tr.sum()); neg = int((1-y_tr).sum())
        if pos == 0 or neg == 0:
            const = float(y.mean())
            cls_oof[va_idx] = const
            cls_test += np.full(X_test.shape[0], const, np.float32) / 5.0
            print(f"Class {c:02d} fold {fold_id}: degenerate train labels -> const {const:.4f}")
            continue
        spw = neg / pos
        params = dict(
            objective='binary',
            learning_rate=0.05,
            num_leaves=31,
            n_estimators=600,
            subsample=0.8,
            colsample_bytree=0.8,
            min_child_samples=5,
            min_data_in_bin=1,
            reg_lambda=1.0,
            random_state=42,
            n_jobs=-1,
            scale_pos_weight=spw
        )
        model = LGBMClassifier(**params)
        try:
            model.fit(
                X_tr, y_tr,
                eval_set=[(X_va, y_va)],
                eval_metric='auc',
                callbacks=[lgb.early_stopping(50, verbose=False)]
            )
            p_va = model.predict_proba(X_va)[:,1].astype(np.float32)
            p_te = model.predict_proba(X_test)[:,1].astype(np.float32)
            cls_oof[va_idx] = p_va
            cls_test += p_te / 5.0
            print(f"Class {c:02d} fold {fold_id}: pos={pos} neg={neg} spw={spw:.2f} best_iter={getattr(model, 'best_iteration_', None)}")
        except Exception as e:
            # Fallback to constant if LightGBM fails
            const = float(y.mean())
            cls_oof[va_idx] = const
            cls_test += np.full(X_test.shape[0], const, np.float32) / 5.0
            print(f"Class {c:02d} fold {fold_id}: LGBM error -> const {const:.4f} | {e}")
        sys.stdout.flush()
    try:
        auc_c = roc_auc_score(y, cls_oof)
    except Exception:
        auc_c = np.nan
    per_auc.append(auc_c)
    oof[:, c] = cls_oof
    test_pred[:, c] = cls_test
    # Periodic save after each class to avoid losing work
    np.save('tab_strong_oof_tmp.npy', oof)
    np.save('tab_strong_test_tmp.npy', test_pred)
    print(f"Class {c:02d} AUC: {auc_c}")
    sys.stdout.flush()

valid_aucs = [a for a in per_auc if not np.isnan(a)]
macro_auc = float(np.mean(valid_aucs)) if valid_aucs else float('nan')
print(f"\nTabular OOF Macro AUC: {macro_auc:.5f}")

# Save OOF/test for selection and ensembling + id orders for strict alignment
np.save('tab_strong_oof.npy', oof)
np.save('tab_strong_test.npy', test_pred)
np.save('tab_strong_train_ids.npy', rec_train.astype(int))
np.save('tab_strong_test_ids.npy', rec_test.astype(int))

# 8) Build submission on test fold (Id = rec_id*100 + class_id) also saved separately
sub = pd.read_csv('sample_submission.csv')
id_vals = sub['Id'].values.astype(int)
rid_to_idx = {rid:i for i, rid in enumerate(rec_test)}
probs = np.zeros_like(id_vals, dtype=np.float32)
for i, Id in enumerate(id_vals):
    rid = Id // 100; cid = Id % 100
    row = rid_to_idx.get(rid, None)
    p = float(test_pred[row, cid]) if (row is not None and cid < num_classes) else 0.0
    probs[i] = np.clip(p, 1e-6, 1-1e-6)
sub['Probability'] = probs
sub.to_csv('submission_tab_strong.csv', index=False)
print('Saved submission_tab_strong.csv. Shape:', sub.shape)
print('Elapsed: %.2fs' % (time.time()-t0))

In [11]:
# CNN setup: install deps (PyTorch cu121, timm, albumentations, opencv)
import sys, subprocess, pkgutil, os

def pip_install(args):
    print('Installing:', ' '.join(args)); sys.stdout.flush()
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q'] + args)

# Install torch/cu121 wheels if not present
need_torch = pkgutil.find_loader('torch') is None
if need_torch:
    pip_install(['--index-url', 'https://download.pytorch.org/whl/cu121',
                 'torch==2.4.0', 'torchvision==0.19.0', 'torchaudio==2.4.0'])
else:
    import torch
    print('torch version:', torch.__version__)

# timm, albumentations, opencv-headless
for pkg, spec in [('timm', 'timm==0.9.16'), ('albumentations', 'albumentations>=1.3.1'),
                  ('cv2', 'opencv-python-headless>=4.8.0.74')]:
    try:
        __import__(pkg)
        print(pkg, 'already installed')
    except Exception:
        pip_install([spec])

print('CNN dependencies ready.')

Installing: --index-url https://download.pytorch.org/whl/cu121 torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0




Installing: timm==0.9.16


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchaudio 2.4.0+cu121 requires torch==2.4.0, but you have torch 2.8.0 which is incompatible.




Installing: albumentations>=1.3.1


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.8.0 requires nvidia-nvjitlink-cu12==12.8.93; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-nvjitlink-cu12 12.9.86 which is incompatible.




cv2 already installed
CNN dependencies ready.


In [13]:
# CNN baseline: EfficientNet-B0 on filtered spectrograms with GroupKFold by station (force CPU to avoid CUDA issues)
import os, glob, time, math, random, gc
os.environ['CUDA_VISIBLE_DEVICES'] = ''  # disable CUDA to prevent kernel crashes from CUDA/cuDNN mismatch
import numpy as np
import pandas as pd
import cv2
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score
import timm

def seed_everything(seed=42):
    random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = False; torch.backends.cudnn.benchmark = False

seed_everything(42)
device = torch.device('cpu')
print('Device:', device)

# Load metadata and labels
sp_df = pd.read_csv('essential_data/species_list.txt'); num_classes = sp_df.shape[0]
cv_df = pd.read_csv('essential_data/CVfolds_2.txt')
rec_map = pd.read_csv('essential_data/rec_id2filename.txt')
rec_map['station'] = rec_map['filename'].str.split('_').str[0]
meta = cv_df.merge(rec_map, on='rec_id', how='left')

# Parse labels (skip '?' lines; empty -> all-zero negatives)
label_rows = []
with open('essential_data/rec_labels_test_hidden.txt', 'r') as f:
    _ = next(f, None)
    for line in f:
        s = line.strip()
        if not s: continue
        parts = [p for p in s.split(',') if p!='']
        try:
            rid = int(parts[0])
        except:
            continue
        labels = parts[1:] if len(parts)>1 else []
        if any(p.strip()=='?' for p in labels):
            continue
        y = np.zeros(num_classes, dtype=np.int8)
        for p in labels:
            if p.strip()=='' or p.strip()=='?': continue
            cid = int(p);
            if 0 <= cid < num_classes: y[cid]=1
        label_rows.append((rid, y))
label_map = {rid:y for rid,y in label_rows}

# Build dataframe for fold==0 (train) and fold==1 (test)
meta['has_label'] = meta['rec_id'].isin(label_map.keys())
train_df = meta[(meta['fold']==0) & (meta['has_label'])].copy().reset_index(drop=True)
test_df = meta[meta['fold']==1].copy().reset_index(drop=True)
print('Train rows:', len(train_df), 'Test rows:', len(test_df))

# Map filenames to image paths in filtered_spectrograms
img_dir = 'supplemental_data/filtered_spectrograms'
name_to_path = {os.path.splitext(os.path.basename(p))[0]: p for p in glob.glob(os.path.join(img_dir, '*.bmp'))}

def get_img_path(name):
    return name_to_path.get(name, None)

train_df['img_path'] = train_df['filename'].apply(get_img_path)
test_df['img_path'] = test_df['filename'].apply(get_img_path)
missing_train = train_df['img_path'].isna().sum(); missing_test = test_df['img_path'].isna().sum()
print('Missing train imgs:', missing_train, '| Missing test imgs:', missing_test)

# Build label matrix for train
Y_train = np.vstack([label_map[int(r)] for r in train_df['rec_id'].values]).astype(np.float32)

class_counts = Y_train.sum(axis=0)
neg_counts = (Y_train.shape[0] - class_counts)
pos_weight = (neg_counts / np.clip(class_counts, 1, None)).astype(np.float32)
pos_weight_t = torch.from_numpy(pos_weight)
print('Positives per class:', class_counts.astype(int))

IMG_SIZE = 224

class SpecAugment:
    def __init__(self, time_mask=40, freq_mask=24, p=0.5):
        self.time_mask = time_mask; self.freq_mask = freq_mask; self.p = p
    def __call__(self, img):
        if random.random() < self.p:
            H, W = img.shape[:2]
            w = random.randint(0, min(self.time_mask, W//4) if W>0 else 0)
            x0 = random.randint(0, max(W - w, 0)) if W>0 else 0
            if w>0: img[:, x0:x0+w] = img.mean()
        if random.random() < self.p:
            H, W = img.shape[:2]
            h = random.randint(0, min(self.freq_mask, H//4) if H>0 else 0)
            y0 = random.randint(0, max(H - h, 0)) if H>0 else 0
            if h>0: img[y0:y0+h, :] = img.mean()
        return img

specaug = SpecAugment(time_mask=56, freq_mask=32, p=0.7)

class BirdDataset(Dataset):
    def __init__(self, df, y=None, train=True):
        self.df = df.reset_index(drop=True)
        self.y = y
        self.train = train
    def __len__(self):
        return len(self.df)
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        path = row['img_path']
        img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            img = np.zeros((IMG_SIZE, IMG_SIZE), dtype=np.uint8)
        img = cv2.resize(img, (IMG_SIZE, IMG_SIZE), interpolation=cv2.INTER_AREA)
        img = img.astype(np.float32) / 255.0
        if self.train:
            shift = random.randint(-16, 16)
            if shift != 0:
                M = np.float32([[1, 0, shift], [0, 1, 0]])
                img = cv2.warpAffine(img, M, (IMG_SIZE, IMG_SIZE), borderMode=cv2.BORDER_REFLECT_101)
            img = specaug(img)
        img3 = np.stack([img, img, img], axis=0)  # C,H,W
        x = torch.from_numpy(img3)
        if self.y is not None:
            target = torch.from_numpy(self.y[idx])
            return x, target
        else:
            return x

def build_model(num_classes):
    model = timm.create_model('efficientnet_b0', pretrained=True, num_classes=num_classes)
    return model

def train_one_fold(train_idx, val_idx, epochs=8, lr=3e-4, batch_size=32):
    tr_df = train_df.iloc[train_idx].reset_index(drop=True)
    va_df = train_df.iloc[val_idx].reset_index(drop=True)
    y_tr = Y_train[train_idx]
    y_va = Y_train[val_idx]
    tr_ds = BirdDataset(tr_df, y_tr, train=True)
    va_ds = BirdDataset(va_df, y_va, train=False)
    tr_ld = DataLoader(tr_ds, batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=False)
    va_ld = DataLoader(va_ds, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=False)
    model = build_model(num_classes).to(device)
    opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight_t)
    best_auc = -1.0; best_state = None; patience = 2; wait = 0
    for ep in range(1, epochs+1):
        model.train(); loss_sum=0.0; n=0; t0 = time.time()
        for xb, yb in tr_ld:
            xb = xb.to(device); yb = yb.to(device)
            opt.zero_grad(set_to_none=True)
            logits = model(xb)
            loss = criterion(logits, yb)
            loss.backward()
            opt.step()
            loss_sum += loss.item()*xb.size(0); n += xb.size(0)
        model.eval(); all_logits=[]; all_targets=[]
        with torch.no_grad():
            for xb, yb in va_ld:
                xb = xb.to(device);
                logits = model(xb)
                all_logits.append(logits.detach().cpu().numpy());
                all_targets.append(yb.numpy())
        val_logits = np.concatenate(all_logits, axis=0)
        val_probs = 1/(1+np.exp(-val_logits))
        val_targets = np.concatenate(all_targets, axis=0)
        aucs=[]
        for c in range(num_classes):
            y = val_targets[:, c]; p = val_probs[:, c]
            if y.sum()==0 or y.sum()==y.shape[0]: continue
            try: aucs.append(roc_auc_score(y, p))
            except: pass
        fold_auc = float(np.mean(aucs)) if aucs else float('nan')
        print(f"Epoch {ep:02d} | train_loss={(loss_sum/max(n,1)):.4f} | val_auc={fold_auc:.4f} | time={time.time()-t0:.1f}s")
        if fold_auc > best_auc:
            best_auc = fold_auc; best_state = {k:v.cpu() for k,v in model.state_dict().items()}; wait=0
        else:
            wait += 1
            if wait >= patience: break
    if best_state is not None:
        model.load_state_dict(best_state)
    va_ld = DataLoader(va_ds, batch_size=64, shuffle=False, num_workers=2, pin_memory=False)
    model.eval(); all_logits=[]
    with torch.no_grad():
        for xb, _ in va_ld:
            xb = xb.to(device);
            logits = model(xb); all_logits.append(logits.detach().cpu().numpy())
    val_logits = np.concatenate(all_logits, axis=0); val_probs = 1/(1+np.exp(-val_logits))
    te_ds = BirdDataset(test_df, None, train=False)
    te_ld = DataLoader(te_ds, batch_size=64, shuffle=False, num_workers=2, pin_memory=False)
    all_tlog=[]
    with torch.no_grad():
        for xb in te_ld:
            xb = xb.to(device);
            logits = model(xb); all_tlog.append(logits.detach().cpu().numpy())
    te_logits = np.concatenate(all_tlog, axis=0); te_probs = 1/(1+np.exp(-te_logits))
    del model; gc.collect()
    return val_probs, te_probs, best_auc

# GroupKFold by station
groups = train_df['station'].astype(str).values
gkf = GroupKFold(n_splits=5)
n_train = len(train_df)
oof = np.zeros((n_train, num_classes), dtype=np.float32)
test_pred = np.zeros((len(test_df), num_classes), dtype=np.float32)
fold_aucs = []

for fold, (tr_idx, va_idx) in enumerate(gkf.split(train_df, Y_train, groups), 1):
    print(f"\n==== CNN Fold {fold} ({len(tr_idx)} train / {len(va_idx)} val) ====")
    val_probs, te_probs, best_auc = train_one_fold(tr_idx, va_idx, epochs=8, lr=3e-4, batch_size=32)
    oof[va_idx] = val_probs.astype(np.float32)
    test_pred += te_probs.astype(np.float32) / 5.0
    fold_aucs.append(best_auc)
    print(f"Fold {fold} best val AUC: {best_auc:.4f}")

# Compute overall OOF AUC
aucs=[]
for c in range(num_classes):
    y = Y_train[:, c]; p = oof[:, c]
    if y.sum()==0 or y.sum()==y.shape[0]: continue
    try: aucs.append(roc_auc_score(y, p))
    except: pass
macro_auc = float(np.mean(aucs)) if aucs else float('nan')
print(f"\nCNN OOF Macro AUC: {macro_auc:.5f} | per-fold: {[round(a,4) for a in fold_aucs]}")

# Save CNN predictions for ensembling
np.save('cnn_oof.npy', oof)
np.save('cnn_test_pred.npy', test_pred)

# Build submission (Id = rec_id*100 + class_id)
sub = pd.read_csv('sample_submission.csv')
id_vals = sub['Id'].values.astype(int)
rid_to_row = {rid:i for i, rid in enumerate(test_df['rec_id'].values.astype(int))}
probs = np.zeros_like(id_vals, dtype=np.float32)
for i, Id in enumerate(id_vals):
    rid = Id // 100; cid = Id % 100
    row = rid_to_row.get(rid, None)
    p = float(test_pred[row, cid]) if (row is not None and cid < num_classes) else 0.0
    probs[i] = np.clip(p, 1e-6, 1-1e-6)
sub['Probability'] = probs
sub.to_csv('submission_cnn.csv', index=False)
print('Saved submission_cnn.csv:', sub.shape)

KernelDied: Kernel died unexpectedly.

In [17]:
# Tabular ensembling: add OVR Logistic on aggregated features and blend with LightGBM using OOF AUC tuning
import numpy as np, pandas as pd, time, sys, glob, re
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score

t0 = time.time()
# 1) Load metadata and aggregated features (repeat logic from cell 4 to be self-contained)
sp_df = pd.read_csv('essential_data/species_list.txt'); num_classes = sp_df.shape[0]
cv_df = pd.read_csv('essential_data/CVfolds_2.txt')
rec_map = pd.read_csv('essential_data/rec_id2filename.txt')
rec_map['station'] = rec_map['filename'].str.split('_').str[0]
meta = cv_df.merge(rec_map, on='rec_id', how='left')

# Labels
label_rows = []
with open('essential_data/rec_labels_test_hidden.txt', 'r') as f:
    _ = next(f, None)
    for line in f:
        s = line.strip()
        if not s: continue
        parts = [p for p in s.split(',') if p!='']
        try: rid = int(parts[0])
        except: continue
        labels = parts[1:] if len(parts)>1 else []
        if any(p.strip()=='?' for p in labels):
            continue
        y = np.zeros(num_classes, dtype=np.int8)
        for p in labels:
            if p.strip()=='' or p.strip()=='?': continue
            cid = int(p);
            if 0 <= cid < num_classes: y[cid]=1
        label_rows.append((rid, y))
label_map = {rid:y for rid,y in label_rows}

# segment_features
seg_records = []
with open('supplemental_data/segment_features.txt', 'r') as f:
    _ = next(f, None)
    for line in f:
        s = line.strip()
        if not s: continue
        parts = [p for p in s.split(',') if p!='']
        if len(parts) < 3: continue
        try: rid = int(parts[0]); seg = int(parts[1])
        except: continue
        vals = []
        for v in parts[2:]:
            try: vals.append(float(v))
            except: vals.append(0.0)
        seg_records.append((rid, seg, vals))
max_len_feat = max((len(v) for _,_,v in seg_records), default=0)
sf_cols = [f'sf_{i}' for i in range(max_len_feat)]
sf_df = pd.DataFrame([([rid,seg]+v+[0.0]*(max_len_feat-len(v))) for rid,seg,v in seg_records],
                     columns=['rec_id','seg_idx']+sf_cols)

# rectangles
rect_rows = []
with open('supplemental_data/segment_rectangles.txt', 'r') as f:
    _ = next(f, None)
    for line in f:
        s = line.strip().strip(',')
        if not s: continue
        parts = [p for p in s.split(',') if p!='']
        if len(parts) < 6: continue
        try:
            rid = int(parts[0]); seg = int(parts[1])
            t0r = float(parts[2]); t1r = float(parts[3]); f0r = float(parts[4]); f1r = float(parts[5])
        except:
            continue
        rect_rows.append((rid, seg, t0r, t1r, f0r, f1r))
rect_df = pd.DataFrame(rect_rows, columns=['rec_id','seg_idx','t_start','t_end','f_start','f_end'])
if not rect_df.empty:
    rect_df['duration'] = rect_df['t_end'] - rect_df['t_start']
    rect_df['freq_span'] = rect_df['f_end'] - rect_df['f_start']

# aggregate
seg_full = sf_df.merge(rect_df, on=['rec_id','seg_idx'], how='left') if not rect_df.empty else sf_df.copy()
seg_full['segment_count'] = 1
num_cols = [c for c in seg_full.columns if c not in ['rec_id','seg_idx']]
agg_funcs = ['mean','std','min','max','median']
def q10(x): return x.quantile(0.10)
def q90(x): return x.quantile(0.90)
agg_dict = {c: agg_funcs + [q10, q90] for c in num_cols}
gb = seg_full.groupby('rec_id').agg(agg_dict)
gb.columns = ['%s_%s' % (c[0], c[1] if isinstance(c[1], str) else ('q10' if c[1]==q10 else 'q90')) for c in gb.columns.to_flat_index()]
gb = gb.reset_index()
gb['segment_count_total'] = seg_full.groupby('rec_id')['segment_count'].sum().values

feat_df = meta.merge(gb, on='rec_id', how='left').fillna(0.0)
known_mask = feat_df['rec_id'].isin(label_map.keys())
train_mask = (feat_df['fold']==0) & known_mask
test_mask = (feat_df['fold']==1)
feature_cols = [c for c in feat_df.columns if c not in ['rec_id','fold','filename','station']]
X = feat_df.loc[train_mask, feature_cols].to_numpy(np.float32)
groups = feat_df.loc[train_mask, 'station'].astype(str).values
rec_train = feat_df.loc[train_mask, 'rec_id'].values.astype(int)
Y_train = np.vstack([label_map[int(r)] for r in rec_train]).astype(np.int8)
X_test = feat_df.loc[test_mask, feature_cols].to_numpy(np.float32)
rec_test = feat_df.loc[test_mask, 'rec_id'].values.astype(int)
print('Shapes | X:', X.shape, 'Y:', Y_train.shape, 'X_test:', X_test.shape)

# 2) Train OVR Logistic (scaled) with GroupKFold and get OOF/test
gkf = GroupKFold(n_splits=5)
n_train = X.shape[0]
oof_lr = np.zeros((n_train, num_classes), dtype=np.float32)
test_lr = np.zeros((X_test.shape[0], num_classes), dtype=np.float32)
per_auc_lr = []

for c in range(num_classes):
    y = Y_train[:, c].astype(int)
    if y.sum()==0 or y.sum()==y.shape[0]:
        per_auc_lr.append(np.nan); continue
    cls_oof = np.zeros(n_train, dtype=np.float32)
    cls_te = np.zeros(X_test.shape[0], dtype=np.float32)
    prev = float(y.mean())
    fold_id = 0
    for tr_idx, va_idx in gkf.split(X, y, groups):
        fold_id += 1
        X_tr, X_va = X[tr_idx], X[va_idx]; y_tr, y_va = y[tr_idx], y[va_idx]
        # Guard: some folds may have only one class in y_tr due to grouping/imbalance
        if len(np.unique(y_tr)) < 2:
            const = prev
            cls_oof[va_idx] = const
            cls_te += np.full(X_test.shape[0], const, dtype=np.float32) / 5.0
            continue
        pipe = Pipeline([('sc', StandardScaler(with_mean=True, with_std=True)),
                         ('lr', LogisticRegression(max_iter=2000, solver='liblinear', class_weight='balanced'))])
        pipe.fit(X_tr, y_tr)
        cls_oof[va_idx] = pipe.predict_proba(X_va)[:,1].astype(np.float32)
        cls_te += pipe.predict_proba(X_test)[:,1].astype(np.float32) / 5.0
    try:
        auc_c = roc_auc_score(y, cls_oof)
    except Exception:
        auc_c = np.nan
    per_auc_lr.append(auc_c)
    oof_lr[:, c] = cls_oof; test_lr[:, c] = cls_te
print('Logistic per-class AUC (nan skipped):', [round(a,4) for a in per_auc_lr if not np.isnan(a)])
valid_lr = [a for a in per_auc_lr if not np.isnan(a)]
macro_lr = float(np.mean(valid_lr)) if valid_lr else float('nan')
print('Logistic OOF Macro AUC:', round(macro_lr,5))

# 3) Load LightGBM OOF/test from re-run in this cell for consistency,
#    or compute quickly with conservative params (to avoid dependency on earlier state).
try:
    import lightgbm as lgb
    from lightgbm import LGBMClassifier
except Exception:
    import subprocess, sys
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'lightgbm'])
    import lightgbm as lgb
    from lightgbm import LGBMClassifier

oof_lgb = np.zeros((n_train, num_classes), dtype=np.float32)
test_lgb = np.zeros((X_test.shape[0], num_classes), dtype=np.float32)
per_auc_lgb = []
for c in range(num_classes):
    y = Y_train[:, c].astype(int)
    if y.sum()==0 or y.sum()==y.shape[0]:
        per_auc_lgb.append(np.nan); continue
    cls_oof = np.zeros(n_train, dtype=np.float32)
    cls_te = np.zeros(X_test.shape[0], dtype=np.float32)
    prev = float(y.mean())
    fold_id = 0
    for tr_idx, va_idx in gkf.split(X, y, groups):
        fold_id += 1
        X_tr, X_va = X[tr_idx], X[va_idx]; y_tr, y_va = y[tr_idx], y[va_idx]
        pos = int(y_tr.sum()); neg = int((1-y_tr).sum())
        if pos == 0 or neg == 0:
            const = prev
            cls_oof[va_idx] = const
            cls_te += np.full(X_test.shape[0], const, dtype=np.float32) / 5.0
            continue
        spw = neg/pos
        model = LGBMClassifier(objective='binary', learning_rate=0.05, num_leaves=31, n_estimators=600,
                               subsample=0.8, colsample_bytree=0.8, min_child_samples=5,
                               reg_lambda=1.0, random_state=42, n_jobs=-1, scale_pos_weight=spw)
        model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], eval_metric='auc',
                  callbacks=[lgb.early_stopping(50, verbose=False)])
        cls_oof[va_idx] = model.predict_proba(X_va)[:,1].astype(np.float32)
        cls_te += model.predict_proba(X_test)[:,1].astype(np.float32) / 5.0
    try: auc_c = roc_auc_score(y, cls_oof)
    except Exception: auc_c = np.nan
    per_auc_lgb.append(auc_c)
    oof_lgb[:, c] = cls_oof; test_lgb[:, c] = cls_te
valid_lgb = [a for a in per_auc_lgb if not np.isnan(a)]
macro_lgb = float(np.mean(valid_lgb)) if valid_lgb else float('nan')
print('LightGBM OOF Macro AUC (recomputed):', round(macro_lgb,5))

# 4) Blend: find global weight w maximizing OOF macro AUC for blend = w*lgb + (1-w)*lr
best_w, best_auc = 0.5, -1.0
for w in np.linspace(0.0, 1.0, 51):
    blend = w*oof_lgb + (1-w)*oof_lr
    aucs=[]
    for c in range(num_classes):
        y = Y_train[:, c]
        p = blend[:, c]
        if y.sum()==0 or y.sum()==y.shape[0]: continue
        try: aucs.append(roc_auc_score(y, p))
        except: pass
    if aucs:
        auc = float(np.mean(aucs))
        if auc > best_auc:
            best_auc = auc; best_w = float(w)
print(f'Blending weight best_w={best_w:.2f} | OOF Macro AUC={best_auc:.5f}')

# Apply blend to test
test_blend = best_w*test_lgb + (1-best_w)*test_lr

# 5) Build submission
sub = pd.read_csv('sample_submission.csv')
id_vals = sub['Id'].values.astype(int)
rid_to_idx = {rid:i for i, rid in enumerate(rec_test)}
probs = np.zeros_like(id_vals, dtype=np.float32)
for i, Id in enumerate(id_vals):
    rid = Id // 100; cid = Id % 100
    row = rid_to_idx.get(rid, None)
    p = float(test_blend[row, cid]) if (row is not None and cid < num_classes) else 0.0
    probs[i] = np.clip(p, 1e-6, 1-1e-6)
sub['Probability'] = probs
sub.to_csv('submission.csv', index=False)
np.save('tab_lr_oof.npy', oof_lr); np.save('tab_lr_test.npy', test_lr)
np.save('tab_lgb_oof.npy', oof_lgb); np.save('tab_lgb_test.npy', test_lgb)
np.save('tab_blend_test.npy', test_blend)
print('Saved submission.csv. Time: %.2fs' % (time.time()-t0))

Shapes | X: (258, 316) Y: (258, 19) X_test: (64, 316)


Logistic per-class AUC (nan skipped): [0.7814, 0.7523, 0.7033, 0.3012, 0.5272, 0.7241, 0.4339, 0.4045, 0.7819, 0.3542, 0.6107, 0.8024, 0.5337, 0.2185, 0.3505, 0.5767, 0.127, 0.3332, 0.8482]
Logistic OOF Macro AUC: 0.535
[LightGBM] [Info] Number of positive: 7, number of negative: 198
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002760 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9641
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.034146 -> initscore=-3.342357
[LightGBM] [Info] Start training from score -3.342357


[LightGBM] [Info] Number of positive: 7, number of negative: 196
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002727 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8896
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.034483 -> initscore=-3.332205
[LightGBM] [Info] Start training from score -3.332205
[LightGBM] [Info] Number of positive: 6, number of negative: 205
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002662 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10085
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.028436 -> initscore=-3.531251
[LightGBM] [Info] Start training from score -3.531251
[LightGBM] [Info] Number 

[LightGBM] [Info] Number of positive: 24, number of negative: 179
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002759 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8896
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.118227 -> initscore=-2.009332
[LightGBM] [Info] Start training from score -2.009332
[LightGBM] [Info] Number of positive: 28, number of negative: 183
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002668 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10085
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.132701 -> initscore=-1.877282
[LightGBM] [Info] Start training from score -1.877282
[LightGBM] [Info] Numbe

[LightGBM] [Info] Number of positive: 24, number of negative: 186
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002990 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10646
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.114286 -> initscore=-2.047693
[LightGBM] [Info] Start training from score -2.047693
[LightGBM] [Info] Number of positive: 16, number of negative: 189
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002685 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9641
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.078049 -> initscore=-2.469158
[LightGBM] [Info] Start training from score -2.469158
[LightGBM] [Info] Numbe

[LightGBM] [Info] Number of positive: 13, number of negative: 190
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002758 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9256
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.064039 -> initscore=-2.682075
[LightGBM] [Info] Start training from score -2.682075
[LightGBM] [Info] Number of positive: 17, number of negative: 193
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002720 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10646
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080952 -> initscore=-2.429477
[LightGBM] [Info] Start training from score -2.429477
[LightGBM] [Info] Numbe

[LightGBM] [Info] Number of positive: 2, number of negative: 201
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002758 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9256
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.009852 -> initscore=-4.610158
[LightGBM] [Info] Start training from score -4.610158
[LightGBM] [Info] Number of positive: 4, number of negative: 206
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002739 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10646
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.019048 -> initscore=-3.941582
[LightGBM] [Info] Start training from score -3.941582
[LightGBM] [Info] Number 

[LightGBM] [Info] Number of positive: 8, number of negative: 195
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002650 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9256
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.039409 -> initscore=-3.193558
[LightGBM] [Info] Start training from score -3.193558
[LightGBM] [Info] Number of positive: 9, number of negative: 201
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002574 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10646
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.042857 -> initscore=-3.106080
[LightGBM] [Info] Start training from score -3.106080
[LightGBM] [Info] Number 

[LightGBM] [Info] Number of positive: 3, number of negative: 208
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002958 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10085
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014218 -> initscore=-4.238926
[LightGBM] [Info] Start training from score -4.238926
[LightGBM] [Info] Number of positive: 4, number of negative: 199
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002779 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9256
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.019704 -> initscore=-3.907010
[LightGBM] [Info] Start training from score -3.907010
[LightGBM] [Info] Number 

[LightGBM] [Info] Number of positive: 14, number of negative: 189
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002786 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8896
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.068966 -> initscore=-2.602690
[LightGBM] [Info] Start training from score -2.602690
[LightGBM] [Info] Number of positive: 14, number of negative: 197
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002756 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10085
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.066351 -> initscore=-2.644146
[LightGBM] [Info] Start training from score -2.644146
[LightGBM] [Info] Numbe

[LightGBM] [Info] Number of positive: 18, number of negative: 187
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002747 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9641
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.087805 -> initscore=-2.340737
[LightGBM] [Info] Start training from score -2.340737
[LightGBM] [Info] Number of positive: 13, number of negative: 190
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002834 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8896
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.064039 -> initscore=-2.682075
[LightGBM] [Info] Start training from score -2.682075
[LightGBM] [Info] Number

[LightGBM] [Info] Number of positive: 16, number of negative: 194
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002741 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10646
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.076190 -> initscore=-2.495269
[LightGBM] [Info] Start training from score -2.495269
[LightGBM] [Info] Number of positive: 20, number of negative: 185
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002647 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9641
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.097561 -> initscore=-2.224624
[LightGBM] [Info] Start training from score -2.224624
[LightGBM] [Info] Numbe

[LightGBM] [Info] Number of positive: 17, number of negative: 186
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002714 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9256
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.083744 -> initscore=-2.392533
[LightGBM] [Info] Start training from score -2.392533
[LightGBM] [Info] Number of positive: 21, number of negative: 189
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002698 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10646
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.100000 -> initscore=-2.197225
[LightGBM] [Info] Start training from score -2.197225
[LightGBM] [Info] Numbe

[LightGBM] [Info] Number of positive: 15, number of negative: 188
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002717 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8896
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.073892 -> initscore=-2.528392
[LightGBM] [Info] Start training from score -2.528392
[LightGBM] [Info] Number of positive: 16, number of negative: 195
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003199 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10085
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.075829 -> initscore=-2.500411
[LightGBM] [Info] Start training from score -2.500411
[LightGBM] [Info] Numbe

[LightGBM] [Info] Number of positive: 16, number of negative: 194
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002772 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10646
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.076190 -> initscore=-2.495269
[LightGBM] [Info] Start training from score -2.495269
[LightGBM] [Info] Number of positive: 34, number of negative: 171
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002715 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9641
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.165854 -> initscore=-1.615303
[LightGBM] [Info] Start training from score -1.615303
[LightGBM] [Info] Numbe

[LightGBM] [Info] Number of positive: 39, number of negative: 164
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003506 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9256
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.192118 -> initscore=-1.436305
[LightGBM] [Info] Start training from score -1.436305
[LightGBM] [Info] Number of positive: 45, number of negative: 165
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002745 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10646
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.214286 -> initscore=-1.299283
[LightGBM] [Info] Start training from score -1.299283
[LightGBM] [Info] Numbe

[LightGBM] [Info] Number of positive: 10, number of negative: 193
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002749 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8896
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049261 -> initscore=-2.960105
[LightGBM] [Info] Start training from score -2.960105
[LightGBM] [Info] Number of positive: 10, number of negative: 201
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002848 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10085
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047393 -> initscore=-3.000720
[LightGBM] [Info] Start training from score -3.000720
[LightGBM] [Info] Numbe

[LightGBM] [Info] Number of positive: 9, number of negative: 194
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002875 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8896
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.044335 -> initscore=-3.070634
[LightGBM] [Info] Start training from score -3.070634
[LightGBM] [Info] Number of positive: 9, number of negative: 202
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003043 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10085
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.042654 -> initscore=-3.111043
[LightGBM] [Info] Start training from score -3.111043
[LightGBM] [Info] Number 

[LightGBM] [Info] Number of positive: 4, number of negative: 201
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003061 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9641
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.019512 -> initscore=-3.917011
[LightGBM] [Info] Start training from score -3.917011
[LightGBM] [Info] Number of positive: 4, number of negative: 207
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002816 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10085
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.018957 -> initscore=-3.946424
[LightGBM] [Info] Start training from score -3.946424
[LightGBM] [Info] Number 

[LightGBM] [Info] Number of positive: 14, number of negative: 191
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002751 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9641
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.068293 -> initscore=-2.613216
[LightGBM] [Info] Start training from score -2.613216
[LightGBM] [Info] Number of positive: 13, number of negative: 190
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002856 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8896
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.064039 -> initscore=-2.682075
[LightGBM] [Info] Start training from score -2.682075
[LightGBM] [Info] Number

[LightGBM] [Info] Number of positive: 13, number of negative: 197
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002846 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10646
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.061905 -> initscore=-2.718254
[LightGBM] [Info] Start training from score -2.718254
[LightGBM] [Info] Number of positive: 4, number of negative: 201
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003130 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9641
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.019512 -> initscore=-3.917011
[LightGBM] [Info] Start training from score -3.917011
[LightGBM] [Info] Number

[LightGBM] [Info] Number of positive: 6, number of negative: 197
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002761 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9256
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.029557 -> initscore=-3.491444
[LightGBM] [Info] Start training from score -3.491444
[LightGBM] [Info] Number of positive: 6, number of negative: 204
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003395 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10646
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.028571 -> initscore=-3.526361
[LightGBM] [Info] Start training from score -3.526361
[LightGBM] [Info] Number 

[LightGBM] [Info] Number of positive: 3, number of negative: 202
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002783 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9641
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014634 -> initscore=-4.209655
[LightGBM] [Info] Start training from score -4.209655
[LightGBM] [Info] Number of positive: 2, number of negative: 201
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002708 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8896
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.009852 -> initscore=-4.610158
[LightGBM] [Info] Start training from score -4.610158
[LightGBM] [Info] Number o

[LightGBM] [Info] Number of positive: 11, number of negative: 194
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002656 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9641
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.053659 -> initscore=-2.869963
[LightGBM] [Info] Start training from score -2.869963
[LightGBM] [Info] Number of positive: 8, number of negative: 195
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002877 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8896
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 315
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.039409 -> initscore=-3.193558
[LightGBM] [Info] Start training from score -3.193558
[LightGBM] [Info] Number 

LightGBM OOF Macro AUC (recomputed): 0.48783


Blending weight best_w=0.42 | OOF Macro AUC=0.55079
Saved submission.csv. Time: 12.88s


In [19]:
# Advanced tabular features + robust LGBM (per-fold SVD on hist, station priors, richer aggregations)
import numpy as np, pandas as pd, time, glob, re, sys
from sklearn.model_selection import GroupKFold
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import roc_auc_score

t0 = time.time()
# Load metadata
sp_df = pd.read_csv('essential_data/species_list.txt'); num_classes = sp_df.shape[0]
cv_df = pd.read_csv('essential_data/CVfolds_2.txt')
rec_map = pd.read_csv('essential_data/rec_id2filename.txt')
rec_map['station'] = rec_map['filename'].str.split('_').str[0]
meta = cv_df.merge(rec_map, on='rec_id', how='left')

# Labels
label_rows = []
with open('essential_data/rec_labels_test_hidden.txt', 'r') as f:
    _ = next(f, None)
    for line in f:
        s = line.strip()
        if not s: continue
        parts = [p for p in s.split(',') if p!='']
        try: rid = int(parts[0])
        except: continue
        labels = parts[1:] if len(parts)>1 else []
        if any(p.strip()=='?' for p in labels):
            continue
        y = np.zeros(num_classes, dtype=np.int8)
        for p in labels:
            if p.strip()=='' or p.strip()=='?': continue
            cid = int(p);
            if 0 <= cid < num_classes: y[cid]=1
        label_rows.append((rid, y))
label_map = {rid:y for rid,y in label_rows}

# Parse segment_features
seg_records = []
with open('supplemental_data/segment_features.txt', 'r') as f:
    _ = next(f, None)
    for line in f:
        s = line.strip()
        if not s: continue
        parts = [p for p in s.split(',') if p!='']
        if len(parts) < 3: continue
        try: rid = int(parts[0]); seg = int(parts[1])
        except: continue
        vals = []
        for v in parts[2:]:
            try: vals.append(float(v))
            except: vals.append(0.0)
        seg_records.append((rid, seg, vals))
max_len_feat = max((len(v) for _,_,v in seg_records), default=0)
sf_cols = [f'sf_{i}' for i in range(max_len_feat)]
sf_df = pd.DataFrame([([rid,seg]+v+[0.0]*(max_len_feat-len(v))) for rid,seg,v in seg_records],
                     columns=['rec_id','seg_idx']+sf_cols) if seg_records else pd.DataFrame(columns=['rec_id','seg_idx']+sf_cols)

# Parse rectangles
rect_rows = []
with open('supplemental_data/segment_rectangles.txt', 'r') as f:
    _ = next(f, None)
    for line in f:
        s = line.strip().strip(',')
        if not s: continue
        parts = [p for p in s.split(',') if p!='']
        if len(parts) < 6: continue
        try:
            rid = int(parts[0]); seg = int(parts[1])
            t0r = float(parts[2]); t1r = float(parts[3]); f0r = float(parts[4]); f1r = float(parts[5])
        except:
            continue
        rect_rows.append((rid, seg, t0r, t1r, f0r, f1r))
rect_df = pd.DataFrame(rect_rows, columns=['rec_id','seg_idx','t_start','t_end','f_start','f_end'])
if not rect_df.empty:
    rect_df['duration'] = rect_df['t_end'] - rect_df['t_start']
    rect_df['freq_span'] = rect_df['f_end'] - rect_df['f_start']
    rect_df['area_tf'] = rect_df['duration'] * rect_df['freq_span']
    rect_df['ratio_f_over_t'] = np.where(rect_df['duration']>0, rect_df['freq_span']/rect_df['duration'], np.nan)

# Merge per-segment
seg_full = sf_df.merge(rect_df, on=['rec_id','seg_idx'], how='outer') if not sf_df.empty else rect_df.copy()
if seg_full is None or seg_full.empty:
    seg_full = pd.DataFrame(columns=['rec_id','seg_idx'])
seg_full['segment_count'] = 1

# Per-record ranges for time/freq coverage
rec_ranges = None
if not rect_df.empty:
    rec_ranges = rect_df.groupby('rec_id').agg(rec_t_min=('t_start','min'), rec_t_max=('t_end','max'),
                                              rec_f_min=('f_start','min'), rec_f_max=('f_end','max')).reset_index()
    rec_ranges['rec_duration'] = rec_ranges['rec_t_max'] - rec_ranges['rec_t_min']
    rec_ranges['rec_freq_span'] = rec_ranges['rec_f_max'] - rec_ranges['rec_f_min']

# Aggregate functions
def q10(x): return x.quantile(0.10)
def q90(x): return x.quantile(0.90)
def q25(x): return x.quantile(0.25)
def q75(x): return x.quantile(0.75)

num_cols = [c for c in seg_full.columns if c not in ['rec_id','seg_idx']]
agg_funcs = ['mean','std','min','max','median','skew']
agg_dict = {c: agg_funcs + [q10, q90, q25, q75] for c in num_cols}
gb = seg_full.groupby('rec_id').agg(agg_dict) if not seg_full.empty else pd.DataFrame()
if not gb.empty:
    gb.columns = ['%s_%s' % (c[0], c[1] if isinstance(c[1], str) else ('q10' if c[1]==q10 else ('q90' if c[1]==q90 else ('q25' if c[1]==q25 else 'q75')))) for c in gb.columns.to_flat_index()]
    gb = gb.reset_index()
    # IQR features
    for base in ['duration','freq_span','area_tf']:
        if f'{base}_q75' in gb.columns and f'{base}_q25' in gb.columns:
            gb[f'{base}_iqr'] = gb[f'{base}_q75'] - gb[f'{base}_q25']
    # Nonzero counts/ratios for first 20 sf features to limit dimension
    nz_list = []
    for name in [c for c in seg_full.columns if c.startswith('sf_')][:20]:
        tmp = seg_full[['rec_id', name]].copy()
        tmp[name] = (tmp[name] != 0).astype(np.int8)
        nz = tmp.groupby('rec_id')[name].agg(['sum','count']).reset_index()
        nz.rename(columns={'sum': f'{name}_nonzero_count', 'count': f'{name}_total_count'}, inplace=True)
        nz[f'{name}_nonzero_ratio'] = np.where(nz[f'{name}_total_count']>0, nz[f'{name}_nonzero_count']/nz[f'{name}_total_count'], 0.0)
        nz_list.append(nz[['rec_id', f'{name}_nonzero_count', f'{name}_nonzero_ratio']])
    if nz_list:
        nz_all = nz_list[0]
        for z in nz_list[1:]:
            nz_all = nz_all.merge(z, on='rec_id', how='outer')
        gb = gb.merge(nz_all, on='rec_id', how='left')
    # Interaction
    if 'duration_mean' in gb.columns and 'freq_span_mean' in gb.columns:
        gb['dur_x_freq_mean'] = gb['duration_mean'] * gb['freq_span_mean']

# Merge to meta; add has_segments and rec_ranges
feat_df = meta.merge(gb, on='rec_id', how='left') if not gb.empty else meta.copy()
feat_df['has_segments'] = (~feat_df.filter(regex='segment_count_').isna()).any(axis=1).astype(np.int8)
if rec_ranges is not None:
    feat_df = feat_df.merge(rec_ranges, on='rec_id', how='left')
    # Coverage & density
    if 'duration_sum' in feat_df.columns and 'rec_duration' in feat_df.columns:
        feat_df['coverage_time_ratio'] = np.where(feat_df['rec_duration']>0, feat_df['duration_sum']/feat_df['rec_duration'], np.nan)
    if 'freq_span_sum' in feat_df.columns and 'rec_freq_span' in feat_df.columns:
        feat_df['coverage_freq_ratio'] = np.where(feat_df['rec_freq_span']>0, feat_df['freq_span_sum']/feat_df['rec_freq_span'], np.nan)
    if 'segment_count_sum' in feat_df.columns and 'rec_duration' in feat_df.columns:
        feat_df['segment_density'] = np.where(feat_df['rec_duration']>0, feat_df['segment_count_sum']/feat_df['rec_duration'], np.nan)

# Read histogram_of_segments and build arrays aligned to rec_id
hist_rows = []
with open('supplemental_data/histogram_of_segments.txt', 'r') as f:
    _ = next(f, None)
    for line in f:
        s = line.strip()
        if not s: continue
        parts = [p for p in s.split(',') if p!='']
        try: rid = int(parts[0])
        except: continue
        vals = []
        for v in parts[1:]:
            try: vals.append(float(v))
            except: vals.append(0.0)
        hist_rows.append((rid, vals))
max_len = max((len(v) for _, v in hist_rows), default=0)
H = np.zeros((len(hist_rows), max_len), dtype=np.float32)
R = np.zeros((len(hist_rows),), dtype=int)
for i,(rid, vals) in enumerate(hist_rows):
    R[i] = rid; L = min(max_len, len(vals));
    if L: H[i,:L] = np.asarray(vals[:L], np.float32)
# L1 normalize rows
row_sums = H.sum(axis=1, keepdims=True); row_sums[row_sums==0] = 1.0; H_norm = H / row_sums
hist_df = pd.DataFrame(H_norm); hist_df.insert(0, 'rec_id', R)

# Build train/test indices and label matrix
known_mask = feat_df['rec_id'].isin(label_map.keys())
train_mask = (feat_df['fold']==0) & known_mask
test_mask = (feat_df['fold']==1)
groups = feat_df.loc[train_mask, 'station'].astype(str).values
rec_train = feat_df.loc[train_mask, 'rec_id'].values.astype(int)
rec_test = feat_df.loc[test_mask, 'rec_id'].values.astype(int)
Y_train = np.vstack([label_map[int(r)] for r in rec_train]).astype(np.int8)

# Select candidate feature columns (exclude id/meta); keep NaNs (no fillna)
exclude = set(['rec_id','fold','filename','station'])
feature_cols = [c for c in feat_df.columns if c not in exclude]
X_base = feat_df.loc[train_mask, feature_cols].copy()
X_test_base = feat_df.loc[test_mask, feature_cols].copy()

print('Base shapes:', X_base.shape, X_test_base.shape, '| labels:', Y_train.shape)

# Modeling per-class with LGBM + per-fold SVD(histo) + station prior
try:
    import lightgbm as lgb
    from lightgbm import LGBMClassifier
except Exception:
    import subprocess
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'lightgbm'])
    import lightgbm as lgb
    from lightgbm import LGBMClassifier

gkf = GroupKFold(n_splits=5)
n_train = X_base.shape[0]
oof = np.zeros((n_train, num_classes), dtype=np.float32)
test_pred = np.zeros((len(rec_test), num_classes), dtype=np.float32)
per_auc = []

# Pre-map hist matrices aligned to rec ids
rid_to_hrow = {int(rid): i for i, rid in enumerate(hist_df['rec_id'].values)}
H_train = np.vstack([hist_df.iloc[rid_to_hrow[int(r)]] .values[1:] for r in rec_train]) if len(rec_train)>0 else np.zeros((0, max_len), np.float32)
H_test = np.vstack([hist_df.iloc[rid_to_hrow[int(r)]] .values[1:] for r in rec_test]) if len(rec_test)>0 else np.zeros((0, max_len), np.float32)

for c in range(num_classes):
    y = Y_train[:, c].astype(int)
    if y.sum()==0 or y.sum()==y.shape[0]:
        per_auc.append(np.nan); continue
    cls_oof = np.zeros(n_train, dtype=np.float32)
    cls_te = np.zeros(len(rec_test), dtype=np.float32)
    fold_id = 0
    for tr_idx, va_idx in gkf.split(X_base, y, groups):
        fold_id += 1
        X_tr, X_va = X_base.iloc[tr_idx].copy(), X_base.iloc[va_idx].copy()
        y_tr, y_va = y[tr_idx], y[va_idx]
        rec_tr, rec_va = rec_train[tr_idx], rec_train[va_idx]
        # Drop zero-variance columns on X_tr
        var = X_tr.var(axis=0, numeric_only=True)
        keep_cols = var.index[var.values > 0.0].tolist()
        X_tr = X_tr[keep_cols]; X_va = X_va[keep_cols]
        # Station prior (within fold) as a single feature
        tr_df_tmp = pd.DataFrame({'station': feat_df.loc[train_mask, 'station'].values[tr_idx], 'y': y_tr})
        st_prior = tr_df_tmp.groupby('station')['y'].mean().to_dict()
        st_tr = np.array([st_prior.get(s, y_tr.mean()) for s in feat_df.loc[train_mask, 'station'].values[tr_idx]], dtype=np.float32)
        st_va = np.array([st_prior.get(s, y_tr.mean()) for s in feat_df.loc[train_mask, 'station'].values[va_idx]], dtype=np.float32)
        X_tr = X_tr.assign(st_prior=st_tr); X_va = X_va.assign(st_prior=st_va)
        # Per-fold TruncatedSVD on histogram
        H_tr, H_va = H_train[tr_idx], H_train[va_idx]
        n_comp = min(24, H_tr.shape[1] if H_tr.ndim==2 else 0) if H_tr.size>0 else 0
        if n_comp >= 2:
            svd = TruncatedSVD(n_components=n_comp, random_state=42)
            svd.fit(H_tr)
            Z_tr = svd.transform(H_tr); Z_va = svd.transform(H_va); Z_te = svd.transform(H_test)
            # Append SVD comps
            for j in range(Z_tr.shape[1]):
                X_tr[f'hsvd_{j}'] = Z_tr[:, j]
                X_va[f'hsvd_{j}'] = Z_va[:, j]
        else:
            Z_te = np.zeros((H_test.shape[0], 0), dtype=np.float32)
        # Build numpy arrays
        X_tr_np = X_tr.to_numpy(dtype=np.float32); X_va_np = X_va.to_numpy(dtype=np.float32)
        pos = int(y_tr.sum()); neg = int((1-y_tr).sum())
        if pos == 0 or neg == 0:
            const = float(y.mean())
            cls_oof[va_idx] = const; cls_te += np.full(len(rec_test), const, np.float32)/5.0
            print(f'Class {c:02d} fold {fold_id}: degenerate labels -> const {const:.4f}')
            continue
        spw = neg/pos
        model = LGBMClassifier(objective='binary', learning_rate=0.03, num_leaves=63, n_estimators=3000,
                               subsample=0.9, colsample_bytree=0.9, min_child_samples=10,
                               reg_lambda=2.0, random_state=42, n_jobs=-1, scale_pos_weight=spw)
        model.fit(X_tr_np, y_tr, eval_set=[(X_va_np, y_va)], eval_metric='auc',
                  callbacks=[lgb.early_stopping(150, verbose=False)])
        p_va = model.predict_proba(X_va_np)[:,1].astype(np.float32)
        cls_oof[va_idx] = p_va
        # For test, must align features to X_va columns: rebuild test with same columns
        X_te_fold = X_test_base[keep_cols].copy()
        # station prior for test using train fold priors
        st_te = np.array([st_prior.get(s, y_tr.mean()) for s in feat_df.loc[test_mask, 'station'].values], dtype=np.float32)
        X_te_fold = X_te_fold.assign(st_prior=st_te)
        if 'Z_te' in locals() and Z_te.shape[1] > 0:
            for j in range(Z_te.shape[1]):
                X_te_fold[f'hsvd_{j}'] = Z_te[:, j]
        p_te = model.predict_proba(X_te_fold.to_numpy(np.float32))[:,1].astype(np.float32)
        cls_te += p_te / 5.0
        print(f"Class {c:02d} fold {fold_id}: pos={pos} neg={neg} spw={spw:.2f} best_iter={getattr(model,'best_iteration_',None)}")
    try: auc_c = roc_auc_score(y, cls_oof)
    except Exception: auc_c = np.nan
    per_auc.append(auc_c); oof[:, c] = cls_oof; test_pred[:, c] = cls_te
    print(f"Class {c:02d} OOF AUC: {auc_c}")

valid = [a for a in per_auc if not np.isnan(a)]
macro_auc = float(np.mean(valid)) if valid else float('nan')
print(f"\nAdvanced Tabular OOF Macro AUC: {macro_auc:.5f}")

# Build submission
sub = pd.read_csv('sample_submission.csv')
id_vals = sub['Id'].values.astype(int)
rid_to_idx = {rid:i for i, rid in enumerate(rec_test)}
probs = np.zeros_like(id_vals, dtype=np.float32)
for i, Id in enumerate(id_vals):
    rid = Id // 100; cid = Id % 100
    row = rid_to_idx.get(rid, None)
    p = float(test_pred[row, cid]) if (row is not None and cid < num_classes) else 0.0
    probs[i] = np.clip(p, 1e-6, 1-1e-6)
sub['Probability'] = probs
sub.to_csv('submission.csv', index=False)
np.save('advtab_oof.npy', oof); np.save('advtab_test.npy', test_pred)
print('Saved submission.csv | Time: %.2fs' % (time.time()-t0))

Base shapes: (258, 521) (64, 521) | labels: (258, 19)
[LightGBM] [Info] Number of positive: 7, number of negative: 198
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.038692 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16183
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.034146 -> initscore=-3.342357
[LightGBM] [Info] Start training from score -3.342357


Class 00 fold 1: pos=7 neg=198 spw=28.29 best_iter=1


[LightGBM] [Info] Number of positive: 7, number of negative: 196
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.041687 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14955
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.034483 -> initscore=-3.332205
[LightGBM] [Info] Start training from score -3.332205
Class 00 fold 2: pos=7 neg=196 spw=28.00 best_iter=1


[LightGBM] [Info] Number of positive: 6, number of negative: 205
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.040356 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16866
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.028436 -> initscore=-3.531251
[LightGBM] [Info] Start training from score -3.531251
Class 00 fold 3: pos=6 neg=205 spw=34.17 best_iter=5


[LightGBM] [Info] Number of positive: 7, number of negative: 196
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.049644 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15571
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.034483 -> initscore=-3.332205
[LightGBM] [Info] Start training from score -3.332205
Class 00 fold 4: pos=7 neg=196 spw=28.00 best_iter=1


[LightGBM] [Info] Number of positive: 1, number of negative: 209
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.042220 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17800
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.004762 -> initscore=-5.342334
[LightGBM] [Info] Start training from score -5.342334
Class 00 fold 5: pos=1 neg=209 spw=209.00 best_iter=1
Class 00 OOF AUC: 0.3264086511098463


[LightGBM] [Info] Number of positive: 15, number of negative: 190
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.043748 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16185
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.073171 -> initscore=-2.538974
[LightGBM] [Info] Start training from score -2.538974
Class 01 fold 1: pos=15 neg=190 spw=12.67 best_iter=16


[LightGBM] [Info] Number of positive: 24, number of negative: 179
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.052649 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14957
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.118227 -> initscore=-2.009332
[LightGBM] [Info] Start training from score -2.009332


Class 01 fold 2: pos=24 neg=179 spw=7.46 best_iter=1
[LightGBM] [Info] Number of positive: 28, number of negative: 183
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.026229 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16871
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 519


[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.132701 -> initscore=-1.877282
[LightGBM] [Info] Start training from score -1.877282


Class 01 fold 3: pos=28 neg=183 spw=6.54 best_iter=1
[LightGBM] [Info] Number of positive: 21, number of negative: 182
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.042788 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15573
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.103448 -> initscore=-2.159484
[LightGBM] [Info] Start training from score -2.159484


Class 01 fold 4: pos=21 neg=182 spw=8.67 best_iter=186


[LightGBM] [Info] Number of positive: 24, number of negative: 186
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.036528 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17805
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.114286 -> initscore=-2.047693
[LightGBM] [Info] Start training from score -2.047693
Class 01 fold 5: pos=24 neg=186 spw=7.75 best_iter=11
Class 01 OOF AUC: 0.7671583850931677


[LightGBM] [Info] Number of positive: 16, number of negative: 189
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.050730 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16186
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.078049 -> initscore=-2.469158
[LightGBM] [Info] Start training from score -2.469158
Class 02 fold 1: pos=16 neg=189 spw=11.81 best_iter=1


[LightGBM] [Info] Number of positive: 16, number of negative: 187
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.049477 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14959
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.078818 -> initscore=-2.458520
[LightGBM] [Info] Start training from score -2.458520
Class 02 fold 2: pos=16 neg=187 spw=11.69 best_iter=1


[LightGBM] [Info] Number of positive: 14, number of negative: 197
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.045721 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16871
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.066351 -> initscore=-2.644146
[LightGBM] [Info] Start training from score -2.644146


Class 02 fold 3: pos=14 neg=197 spw=14.07 best_iter=183
[LightGBM] [Info] Number of positive: 13, number of negative: 190
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.037827 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15573
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.064039 -> initscore=-2.682075
[LightGBM] [Info] Start training from score -2.682075


Class 02 fold 4: pos=13 neg=190 spw=14.62 best_iter=5
[LightGBM] [Info] Number of positive: 17, number of negative: 193


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.039806 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17806
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080952 -> initscore=-2.429477
[LightGBM] [Info] Start training from score -2.429477
Class 02 fold 5: pos=17 neg=193 spw=11.35 best_iter=1
Class 02 OOF AUC: 0.721206782646994


[LightGBM] [Info] Number of positive: 4, number of negative: 201
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.033638 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16182
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.019512 -> initscore=-3.917011
[LightGBM] [Info] Start training from score -3.917011
Class 03 fold 1: pos=4 neg=201 spw=50.25 best_iter=1


[LightGBM] [Info] Number of positive: 2, number of negative: 201
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.042116 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14953
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.009852 -> initscore=-4.610158
[LightGBM] [Info] Start training from score -4.610158


Class 03 fold 2: pos=2 neg=201 spw=100.50 best_iter=1
[LightGBM] [Info] Number of positive: 4, number of negative: 207
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.052482 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16866
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.018957 -> initscore=-3.946424
[LightGBM] [Info] Start training from score -3.946424


Class 03 fold 3: pos=4 neg=207 spw=51.75 best_iter=1
[LightGBM] [Info] Number of positive: 2, number of negative: 201
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.042805 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15569
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.009852 -> initscore=-4.610158
[LightGBM] [Info] Start training from score -4.610158


Class 03 fold 4: pos=2 neg=201 spw=100.50 best_iter=1
[LightGBM] [Info] Number of positive: 4, number of negative: 206
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.039770 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17801
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 519


[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.019048 -> initscore=-3.941582
[LightGBM] [Info] Start training from score -3.941582
Class 03 fold 5: pos=4 neg=206 spw=51.50 best_iter=1
Class 03 OOF AUC: 0.20275590551181102


[LightGBM] [Info] Number of positive: 4, number of negative: 201
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.035610 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16182
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.019512 -> initscore=-3.917011
[LightGBM] [Info] Start training from score -3.917011
Class 04 fold 1: pos=4 neg=201 spw=50.25 best_iter=1


[LightGBM] [Info] Number of positive: 6, number of negative: 197
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.049681 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14955
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.029557 -> initscore=-3.491444
[LightGBM] [Info] Start training from score -3.491444


Class 04 fold 2: pos=6 neg=197 spw=32.83 best_iter=12
[LightGBM] [Info] Number of positive: 9, number of negative: 202
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.041803 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16868
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.042654 -> initscore=-3.111043
[LightGBM] [Info] Start training from score -3.111043


Class 04 fold 3: pos=9 neg=202 spw=22.44 best_iter=1
[LightGBM] [Info] Number of positive: 8, number of negative: 195


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.054785 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15571
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.039409 -> initscore=-3.193558
[LightGBM] [Info] Start training from score -3.193558


Class 04 fold 4: pos=8 neg=195 spw=24.38 best_iter=1
[LightGBM] [Info] Number of positive: 9, number of negative: 201
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.041206 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 17803
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 519


[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.042857 -> initscore=-3.106080
[LightGBM] [Info] Start training from score -3.106080
Class 04 fold 5: pos=9 neg=201 spw=22.33 best_iter=1
Class 04 OOF AUC: 0.604194556001785


[LightGBM] [Info] Number of positive: 5, number of negative: 200
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.045634 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16185
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.024390 -> initscore=-3.688879
[LightGBM] [Info] Start training from score -3.688879
Class 05 fold 1: pos=5 neg=200 spw=40.00 best_iter=1


[LightGBM] [Info] Number of positive: 4, number of negative: 199
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.034683 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14956
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.019704 -> initscore=-3.907010
[LightGBM] [Info] Start training from score -3.907010
Class 05 fold 2: pos=4 neg=199 spw=49.75 best_iter=2


[LightGBM] [Info] Number of positive: 3, number of negative: 208
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.046613 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16867
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014218 -> initscore=-4.238926
[LightGBM] [Info] Start training from score -4.238926
Class 05 fold 3: pos=3 neg=208 spw=69.33 best_iter=10


[LightGBM] [Info] Number of positive: 4, number of negative: 199
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.049758 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15572
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.019704 -> initscore=-3.907010
[LightGBM] [Info] Start training from score -3.907010
Class 05 fold 4: pos=4 neg=199 spw=49.75 best_iter=1


[LightGBM] [Info] Number of positive: 4, number of negative: 206
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.036692 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17803
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.019048 -> initscore=-3.941582
[LightGBM] [Info] Start training from score -3.941582
Class 05 fold 5: pos=4 neg=206 spw=51.50 best_iter=8
Class 05 OOF AUC: 0.6940711462450593


[LightGBM] [Info] Number of positive: 19, number of negative: 186
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.051399 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16187
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.092683 -> initscore=-2.281308
[LightGBM] [Info] Start training from score -2.281308
Class 06 fold 1: pos=19 neg=186 spw=9.79 best_iter=1


[LightGBM] [Info] Number of positive: 14, number of negative: 189
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.038681 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14957
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.068966 -> initscore=-2.602690
[LightGBM] [Info] Start training from score -2.602690


Class 06 fold 2: pos=14 neg=189 spw=13.50 best_iter=30
[LightGBM] [Info] Number of positive: 14, number of negative: 197
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.056876 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16869
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 519


[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.066351 -> initscore=-2.644146
[LightGBM] [Info] Start training from score -2.644146
Class 06 fold 3: pos=14 neg=197 spw=14.07 best_iter=4


[LightGBM] [Info] Number of positive: 15, number of negative: 188
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.043018 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15574
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.073892 -> initscore=-2.528392
[LightGBM] [Info] Start training from score -2.528392
Class 06 fold 4: pos=15 neg=188 spw=12.53 best_iter=1


[LightGBM] [Info] Number of positive: 14, number of negative: 196
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.049761 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17804
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.066667 -> initscore=-2.639057
[LightGBM] [Info] Start training from score -2.639057


Class 06 fold 5: pos=14 neg=196 spw=14.00 best_iter=1
Class 06 OOF AUC: 0.5694780885267563


[LightGBM] [Info] Number of positive: 18, number of negative: 187
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011293 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16189
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.087805 -> initscore=-2.340737
[LightGBM] [Info] Start training from score -2.340737


Class 07 fold 1: pos=18 neg=187 spw=10.39 best_iter=1
[LightGBM] [Info] Number of positive: 13, number of negative: 190
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.051696 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14959
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.064039 -> initscore=-2.682075
[LightGBM] [Info] Start training from score -2.682075


Class 07 fold 2: pos=13 neg=190 spw=14.62 best_iter=1
[LightGBM] [Info] Number of positive: 17, number of negative: 194


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.047724 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16873
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080569 -> initscore=-2.434645
[LightGBM] [Info] Start training from score -2.434645


Class 07 fold 3: pos=17 neg=194 spw=11.41 best_iter=1
[LightGBM] [Info] Number of positive: 12, number of negative: 191
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.035788 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15575
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.059113 -> initscore=-2.767367
[LightGBM] [Info] Start training from score -2.767367


Class 07 fold 4: pos=12 neg=191 spw=15.92 best_iter=1


[LightGBM] [Info] Number of positive: 16, number of negative: 194
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.042670 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17807
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.076190 -> initscore=-2.495269
[LightGBM] [Info] Start training from score -2.495269


Class 07 fold 5: pos=16 neg=194 spw=12.12 best_iter=1
Class 07 OOF AUC: 0.41686853116053735
[LightGBM] [Info] Number of positive: 20, number of negative: 185
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.039711 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16188
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 519


[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.097561 -> initscore=-2.224624
[LightGBM] [Info] Start training from score -2.224624


Class 08 fold 1: pos=20 neg=185 spw=9.25 best_iter=7


[LightGBM] [Info] Number of positive: 19, number of negative: 184
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.044988 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14959
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.093596 -> initscore=-2.270497
[LightGBM] [Info] Start training from score -2.270497


Class 08 fold 2: pos=19 neg=184 spw=9.68 best_iter=191


[LightGBM] [Info] Number of positive: 19, number of negative: 192
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.053760 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16871
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.090047 -> initscore=-2.313056
[LightGBM] [Info] Start training from score -2.313056


Class 08 fold 3: pos=19 neg=192 spw=10.11 best_iter=10
[LightGBM] [Info] Number of positive: 17, number of negative: 186
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.037768 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15575


[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.083744 -> initscore=-2.392533
[LightGBM] [Info] Start training from score -2.392533
Class 08 fold 4: pos=17 neg=186 spw=10.94 best_iter=52


[LightGBM] [Info] Number of positive: 21, number of negative: 189
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.046383 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17806
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.100000 -> initscore=-2.197225
[LightGBM] [Info] Start training from score -2.197225
Class 08 fold 5: pos=21 neg=189 spw=9.00 best_iter=4
Class 08 OOF AUC: 0.894675925925926


[LightGBM] [Info] Number of positive: 21, number of negative: 184
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.034677 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16190
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.102439 -> initscore=-2.170413
[LightGBM] [Info] Start training from score -2.170413


Class 09 fold 1: pos=21 neg=184 spw=8.76 best_iter=1
[LightGBM] [Info] Number of positive: 15, number of negative: 188
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.045826 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14959
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.073892 -> initscore=-2.528392
[LightGBM] [Info] Start training from score -2.528392


Class 09 fold 2: pos=15 neg=188 spw=12.53 best_iter=3
[LightGBM] [Info] Number of positive: 16, number of negative: 195


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.065878 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16872
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.075829 -> initscore=-2.500411
[LightGBM] [Info] Start training from score -2.500411


Class 09 fold 3: pos=16 neg=195 spw=12.19 best_iter=1
[LightGBM] [Info] Number of positive: 16, number of negative: 187
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015727 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 15575
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.078818 -> initscore=-2.458520
[LightGBM] [Info] Start training from score -2.458520




Class 09 fold 4: pos=16 neg=187 spw=11.69 best_iter=62
[LightGBM] [Info] Number of positive: 16, number of negative: 194


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.031728 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17807
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.076190 -> initscore=-2.495269
[LightGBM] [Info] Start training from score -2.495269
Class 09 fold 5: pos=16 neg=194 spw=12.12 best_iter=4
Class 09 OOF AUC: 0.546313039983926


[LightGBM] [Info] Number of positive: 34, number of negative: 171
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.042115 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16189
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.165854 -> initscore=-1.615303
[LightGBM] [Info] Start training from score -1.615303
Class 10 fold 1: pos=34 neg=171 spw=5.03 best_iter=13


[LightGBM] [Info] Number of positive: 38, number of negative: 165
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.039656 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14962
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.187192 -> initscore=-1.468359
[LightGBM] [Info] Start training from score -1.468359


Class 10 fold 2: pos=38 neg=165 spw=4.34 best_iter=10
[LightGBM] [Info] Number of positive: 40, number of negative: 171
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.048734 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16874
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.189573 -> initscore=-1.452784
[LightGBM] [Info] Start training from score -1.452784


Class 10 fold 3: pos=40 neg=171 spw=4.28 best_iter=8


[LightGBM] [Info] Number of positive: 39, number of negative: 164
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.042618 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15577
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.192118 -> initscore=-1.436305
[LightGBM] [Info] Start training from score -1.436305


Class 10 fold 4: pos=39 neg=164 spw=4.21 best_iter=17
[LightGBM] [Info] Number of positive: 45, number of negative: 165
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.033726 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17809
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 519


[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.214286 -> initscore=-1.299283
[LightGBM] [Info] Start training from score -1.299283
Class 10 fold 5: pos=45 neg=165 spw=3.67 best_iter=1
Class 10 OOF AUC: 0.764378478664193


[LightGBM] [Info] Number of positive: 2, number of negative: 203
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.044356 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16182
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.009756 -> initscore=-4.620059
[LightGBM] [Info] Start training from score -4.620059
Class 11 fold 1: pos=2 neg=203 spw=101.50 best_iter=10


[LightGBM] [Info] Number of positive: 10, number of negative: 193
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.041977 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14955
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049261 -> initscore=-2.960105
[LightGBM] [Info] Start training from score -2.960105
Class 11 fold 2: pos=10 neg=193 spw=19.30 best_iter=1


[LightGBM] [Info] Number of positive: 10, number of negative: 201
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.045145 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16867
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047393 -> initscore=-3.000720
[LightGBM] [Info] Start training from score -3.000720
Class 11 fold 3: pos=10 neg=201 spw=20.10 best_iter=1


[LightGBM] [Info] Number of positive: 8, number of negative: 195
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.042442 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15569
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.039409 -> initscore=-3.193558
[LightGBM] [Info] Start training from score -3.193558


Class 11 fold 4: pos=8 neg=195 spw=24.38 best_iter=1
[LightGBM] [Info] Number of positive: 10, number of negative: 200
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.046745 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17802
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047619 -> initscore=-2.995732
[LightGBM] [Info] Start training from score -2.995732


Class 11 fold 5: pos=10 neg=200 spw=20.00 best_iter=1
Class 11 OOF AUC: 0.43266129032258066
[LightGBM] [Info] Number of positive: 10, number of negative: 195


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011815 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16184
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.048780 -> initscore=-2.970414
[LightGBM] [Info] Start training from score -2.970414
Class 12 fold 1: pos=10 neg=195 spw=19.50 best_iter=1


[LightGBM] [Info] Number of positive: 9, number of negative: 194
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.044750 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14956
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.044335 -> initscore=-3.070634
[LightGBM] [Info] Start training from score -3.070634
Class 12 fold 2: pos=9 neg=194 spw=21.56 best_iter=70


[LightGBM] [Info] Number of positive: 9, number of negative: 202
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.039315 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16868
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.042654 -> initscore=-3.111043
[LightGBM] [Info] Start training from score -3.111043
Class 12 fold 3: pos=9 neg=202 spw=22.44 best_iter=15


[LightGBM] [Info] Number of positive: 8, number of negative: 195
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.043702 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15571
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.039409 -> initscore=-3.193558
[LightGBM] [Info] Start training from score -3.193558
Class 12 fold 4: pos=8 neg=195 spw=24.38 best_iter=1


[LightGBM] [Info] Number of positive: 12, number of negative: 198
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.037657 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17804
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.057143 -> initscore=-2.803360
[LightGBM] [Info] Start training from score -2.803360


Class 12 fold 5: pos=12 neg=198 spw=16.50 best_iter=1
Class 12 OOF AUC: 0.6598915989159893
[LightGBM] [Info] Number of positive: 4, number of negative: 201
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.047731 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16181
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.019512 -> initscore=-3.917011
[LightGBM] [Info] Start training from score -3.917011


Class 13 fold 1: pos=4 neg=201 spw=50.25 best_iter=1
Class 13 fold 2: degenerate labels -> const 0.0155
[LightGBM] [Info] Number of positive: 4, number of negative: 207


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.051060 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16865
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.018957 -> initscore=-3.946424
[LightGBM] [Info] Start training from score -3.946424
Class 13 fold 3: pos=4 neg=207 spw=51.75 best_iter=1


[LightGBM] [Info] Number of positive: 4, number of negative: 199
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032612 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 15569
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.019704 -> initscore=-3.907010
[LightGBM] [Info] Start training from score -3.907010


Class 13 fold 4: pos=4 neg=199 spw=49.75 best_iter=1
[LightGBM] [Info] Number of positive: 4, number of negative: 206
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.043788 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17800
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.019048 -> initscore=-3.941582
[LightGBM] [Info] Start training from score -3.941582


Class 13 fold 5: pos=4 neg=206 spw=51.50 best_iter=1
Class 13 OOF AUC: 0.10039370078740156
[LightGBM] [Info] Number of positive: 14, number of negative: 191


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.040824 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16186
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.068293 -> initscore=-2.613216
[LightGBM] [Info] Start training from score -2.613216


Class 14 fold 1: pos=14 neg=191 spw=13.64 best_iter=2
[LightGBM] [Info] Number of positive: 13, number of negative: 190
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.032757 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14958
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.064039 -> initscore=-2.682075
[LightGBM] [Info] Start training from score -2.682075


Class 14 fold 2: pos=13 neg=190 spw=14.62 best_iter=8
[LightGBM] [Info] Number of positive: 9, number of negative: 202


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.051763 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16869
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.042654 -> initscore=-3.111043
[LightGBM] [Info] Start training from score -3.111043
Class 14 fold 3: pos=9 neg=202 spw=22.44 best_iter=6
[LightGBM] [Info] Number of positive: 15, number of negative: 188


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.030728 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 15574
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.073892 -> initscore=-2.528392
[LightGBM] [Info] Start training from score -2.528392
Class 14 fold 4: pos=15 neg=188 spw=12.53 best_iter=4


[LightGBM] [Info] Number of positive: 13, number of negative: 197
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.030766 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17804
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.061905 -> initscore=-2.718254
[LightGBM] [Info] Start training from score -2.718254
Class 14 fold 5: pos=13 neg=197 spw=15.15 best_iter=1
Class 14 OOF AUC: 0.6800103305785123


[LightGBM] [Info] Number of positive: 4, number of negative: 201
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.045760 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16182
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.019512 -> initscore=-3.917011
[LightGBM] [Info] Start training from score -3.917011
Class 15 fold 1: pos=4 neg=201 spw=50.25 best_iter=1


[LightGBM] [Info] Number of positive: 3, number of negative: 200
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.037575 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14955
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014778 -> initscore=-4.199705
[LightGBM] [Info] Start training from score -4.199705
Class 15 fold 2: pos=3 neg=200 spw=66.67 best_iter=1


[LightGBM] [Info] Number of positive: 5, number of negative: 206
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.043478 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16867
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.023697 -> initscore=-3.718438
[LightGBM] [Info] Start training from score -3.718438


Class 15 fold 3: pos=5 neg=206 spw=41.20 best_iter=2
[LightGBM] [Info] Number of positive: 6, number of negative: 197
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.048808 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15572
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.029557 -> initscore=-3.491444
[LightGBM] [Info] Start training from score -3.491444


Class 15 fold 4: pos=6 neg=197 spw=32.83 best_iter=1
[LightGBM] [Info] Number of positive: 6, number of negative: 204


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.058021 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17803
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.028571 -> initscore=-3.526361
[LightGBM] [Info] Start training from score -3.526361
Class 15 fold 5: pos=6 neg=204 spw=34.00 best_iter=1
Class 15 OOF AUC: 0.33730158730158727


[LightGBM] [Info] Number of positive: 2, number of negative: 203
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.042527 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16181
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.009756 -> initscore=-4.620059
[LightGBM] [Info] Start training from score -4.620059
Class 16 fold 1: pos=2 neg=203 spw=101.50 best_iter=1


Class 16 fold 2: degenerate labels -> const 0.0078
[LightGBM] [Info] Number of positive: 2, number of negative: 209
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.053739 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16865
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.009479 -> initscore=-4.649187
[LightGBM] [Info] Start training from score -4.649187


Class 16 fold 3: pos=2 neg=209 spw=104.50 best_iter=1
[LightGBM] [Info] Number of positive: 2, number of negative: 201
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.049152 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15569


[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.009852 -> initscore=-4.610158
[LightGBM] [Info] Start training from score -4.610158
Class 16 fold 4: pos=2 neg=201 spw=100.50 best_iter=1


[LightGBM] [Info] Number of positive: 2, number of negative: 208
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.046699 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17800
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.009524 -> initscore=-4.644391
[LightGBM] [Info] Start training from score -4.644391


Class 16 fold 5: pos=2 neg=208 spw=104.00 best_iter=1
Class 16 OOF AUC: 0.103515625
[LightGBM] [Info] Number of positive: 3, number of negative: 202
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.046787 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16182
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014634 -> initscore=-4.209655
[LightGBM] [Info] Start training from score -4.209655


Class 17 fold 1: pos=3 neg=202 spw=67.33 best_iter=1
[LightGBM] [Info] Number of positive: 2, number of negative: 201
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.043737 seconds.
You can set `force_col_wise=true` to remove the overhead.


[LightGBM] [Info] Total Bins 14954
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.009852 -> initscore=-4.610158
[LightGBM] [Info] Start training from score -4.610158
Class 17 fold 2: pos=2 neg=201 spw=100.50 best_iter=1
[LightGBM] [Info] Number of positive: 3, number of negative: 208


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.048835 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16866
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014218 -> initscore=-4.238926
[LightGBM] [Info] Start training from score -4.238926
Class 17 fold 3: pos=3 neg=208 spw=69.33 best_iter=1


[LightGBM] [Info] Number of positive: 4, number of negative: 199
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.026546 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 15571
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.019704 -> initscore=-3.907010
[LightGBM] [Info] Start training from score -3.907010


Class 17 fold 4: pos=4 neg=199 spw=49.75 best_iter=1
[LightGBM] [Info] Number of positive: 4, number of negative: 206
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.039221 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17802


[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.019048 -> initscore=-3.941582
[LightGBM] [Info] Start training from score -3.941582
Class 17 fold 5: pos=4 neg=206 spw=51.50 best_iter=1
Class 17 OOF AUC: 0.3700787401574803


[LightGBM] [Info] Number of positive: 11, number of negative: 194
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.036692 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16183
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.053659 -> initscore=-2.869963
[LightGBM] [Info] Start training from score -2.869963
Class 18 fold 1: pos=11 neg=194 spw=17.64 best_iter=38


[LightGBM] [Info] Number of positive: 8, number of negative: 195
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.032773 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14955
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.039409 -> initscore=-3.193558
[LightGBM] [Info] Start training from score -3.193558
Class 18 fold 2: pos=8 neg=195 spw=24.38 best_iter=163


[LightGBM] [Info] Number of positive: 11, number of negative: 200
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.035621 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16867
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.052133 -> initscore=-2.900422
[LightGBM] [Info] Start training from score -2.900422
Class 18 fold 3: pos=11 neg=200 spw=18.18 best_iter=3


[LightGBM] [Info] Number of positive: 6, number of negative: 197
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.046827 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15571
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.029557 -> initscore=-3.491444
[LightGBM] [Info] Start training from score -3.491444


Class 18 fold 4: pos=6 neg=197 spw=32.83 best_iter=26
[LightGBM] [Info] Number of positive: 12, number of negative: 198
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.046808 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17803
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 519


[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.057143 -> initscore=-2.803360
[LightGBM] [Info] Start training from score -2.803360
Class 18 fold 5: pos=12 neg=198 spw=16.50 best_iter=1
Class 18 OOF AUC: 0.817920054200542

Advanced Tabular OOF Macro AUC: 0.52680
Saved submission.csv | Time: 39.59s


In [20]:
# Evaluate advanced tabular OOF AUC from saved files
import numpy as np, pandas as pd
from sklearn.metrics import roc_auc_score

# Reload labels aligned to fold==0 train rec_ids
sp_df = pd.read_csv('essential_data/species_list.txt'); num_classes = sp_df.shape[0]
cv_df = pd.read_csv('essential_data/CVfolds_2.txt')
rec_map = pd.read_csv('essential_data/rec_id2filename.txt')
meta = cv_df.merge(rec_map, on='rec_id', how='left')

label_rows = []
with open('essential_data/rec_labels_test_hidden.txt', 'r') as f:
    _ = next(f, None)
    for line in f:
        s = line.strip()
        if not s: continue
        parts = [p for p in s.split(',') if p!='']
        try: rid = int(parts[0])
        except: continue
        labels = parts[1:] if len(parts)>1 else []
        if any(p.strip()=='?' for p in labels):
            continue
        y = np.zeros(num_classes, dtype=np.int8)
        for p in labels:
            if p.strip()=='' or p.strip()=='?': continue
            cid = int(p);
            if 0 <= cid < num_classes: y[cid]=1
        label_rows.append((rid, y))
label_map = {rid:y for rid,y in label_rows}

train_mask = (meta['fold']==0) & (meta['rec_id'].isin(label_map.keys()))
rec_train = meta.loc[train_mask, 'rec_id'].values.astype(int)
Y_train = np.vstack([label_map[int(r)] for r in rec_train]).astype(np.int8)

oof = np.load('advtab_oof.npy')
assert oof.shape == Y_train.shape, f"Shape mismatch: oof {oof.shape} vs Y {Y_train.shape}"

aucs = []
per_class = []
for c in range(Y_train.shape[1]):
    y = Y_train[:, c]; p = oof[:, c]
    if y.sum()==0 or y.sum()==y.shape[0]:
        per_class.append(np.nan); continue
    try:
        a = roc_auc_score(y, p); per_class.append(a); aucs.append(a)
    except Exception:
        per_class.append(np.nan)
macro = float(np.mean(aucs)) if aucs else float('nan')
print('Advanced Tabular OOF Macro AUC:', round(macro,5))
print('Per-class AUC (first 10):', [round(x,4) if x==x else None for x in per_class[:10]])

Advanced Tabular OOF Macro AUC: 0.5268
Per-class AUC (first 10): [0.3264, 0.7672, 0.7212, 0.2028, 0.6042, 0.6941, 0.5695, 0.4169, 0.8947, 0.5463]


In [22]:
# Strong Tabular v2: reproduce Cell 4 features, boost LGBM (more trees, early stopping), 3-seed avg
import numpy as np, pandas as pd, time, sys, glob, re
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score

t0 = time.time()
# Load core metadata
sp_df = pd.read_csv('essential_data/species_list.txt'); num_classes = sp_df.shape[0]
cv_df = pd.read_csv('essential_data/CVfolds_2.txt')
rec_map = pd.read_csv('essential_data/rec_id2filename.txt')
rec_map['station'] = rec_map['filename'].str.split('_').str[0]
meta = cv_df.merge(rec_map, on='rec_id', how='left')

# Labels
label_rows = []
with open('essential_data/rec_labels_test_hidden.txt', 'r') as f:
    _ = next(f, None)
    for line in f:
        s = line.strip()
        if not s: continue
        parts = [p for p in s.split(',') if p!='']
        try: rid = int(parts[0])
        except: continue
        labels = parts[1:] if len(parts)>1 else []
        if any(p.strip()=='?' for p in labels):
            continue
        y = np.zeros(num_classes, dtype=np.int8)
        for p in labels:
            if p.strip()=='' or p.strip()=='?': continue
            cid = int(p);
            if 0 <= cid < num_classes: y[cid]=1
        label_rows.append((rid, y))
label_map = {rid:y for rid,y in label_rows}

# segment_features
seg_records = []
with open('supplemental_data/segment_features.txt', 'r') as f:
    _ = next(f, None)
    for line in f:
        s = line.strip()
        if not s: continue
        parts = [p for p in s.split(',') if p!='']
        if len(parts) < 3: continue
        try: rid = int(parts[0]); seg = int(parts[1])
        except: continue
        vals = []
        for v in parts[2:]:
            try: vals.append(float(v))
            except: vals.append(0.0)
        seg_records.append((rid, seg, vals))
max_len_feat = max((len(v) for _,_,v in seg_records), default=0)
sf_cols = [f'sf_{i}' for i in range(max_len_feat)]
sf_df = pd.DataFrame([([rid,seg]+v+[0.0]*(max_len_feat-len(v))) for rid,seg,v in seg_records],
                     columns=['rec_id','seg_idx']+sf_cols)

# rectangles
rect_rows = []
with open('supplemental_data/segment_rectangles.txt', 'r') as f:
    _ = next(f, None)
    for line in f:
        s = line.strip().strip(',')
        if not s: continue
        parts = [p for p in s.split(',') if p!='']
        if len(parts) < 6: continue
        try:
            rid = int(parts[0]); seg = int(parts[1])
            t0r = float(parts[2]); t1r = float(parts[3]); f0r = float(parts[4]); f1r = float(parts[5])
        except:
            continue
        rect_rows.append((rid, seg, t0r, t1r, f0r, f1r))
rect_df = pd.DataFrame(rect_rows, columns=['rec_id','seg_idx','t_start','t_end','f_start','f_end'])
if not rect_df.empty:
    rect_df['duration'] = rect_df['t_end'] - rect_df['t_start']
    rect_df['freq_span'] = rect_df['f_end'] - rect_df['f_start']

# aggregate per-rec
seg_full = sf_df.merge(rect_df, on=['rec_id','seg_idx'], how='left') if not rect_df.empty else sf_df.copy()
seg_full['segment_count'] = 1
num_cols = [c for c in seg_full.columns if c not in ['rec_id','seg_idx']]
agg_funcs = ['mean','std','min','max','median','skew']
def q10(x): return x.quantile(0.10)
def q90(x): return x.quantile(0.90)
agg_dict = {c: agg_funcs + [q10, q90] for c in num_cols}
gb = seg_full.groupby('rec_id').agg(agg_dict)
gb.columns = ['%s_%s' % (c[0], c[1] if isinstance(c[1], str) else ('q10' if c[1]==q10 else 'q90')) for c in gb.columns.to_flat_index()]
gb = gb.reset_index()
gb['segment_count_total'] = seg_full.groupby('rec_id')['segment_count'].sum().values

feat_df = meta.merge(gb, on='rec_id', how='left')
feat_df = feat_df.fillna(0.0)  # this performed best in prior strong run
known_mask = feat_df['rec_id'].isin(label_map.keys())
train_mask = (feat_df['fold']==0) & known_mask
test_mask = (feat_df['fold']==1)
feature_cols = [c for c in feat_df.columns if c not in ['rec_id','fold','filename','station']]
X = feat_df.loc[train_mask, feature_cols].to_numpy(np.float32)
groups = feat_df.loc[train_mask, 'station'].astype(str).values
rec_train = feat_df.loc[train_mask, 'rec_id'].values.astype(int)
Y_train = np.vstack([label_map[int(r)] for r in rec_train]).astype(np.int8)
X_test = feat_df.loc[test_mask, feature_cols].to_numpy(np.float32)
rec_test = feat_df.loc[test_mask, 'rec_id'].values.astype(int)
print('Train X/Y:', X.shape, Y_train.shape, '| Test X:', X_test.shape)

try:
    import lightgbm as lgb
    from lightgbm import LGBMClassifier
except Exception:
    import subprocess
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'lightgbm'])
    import lightgbm as lgb
    from lightgbm import LGBMClassifier

def run_lgbm_seed(seed):
    gkf = GroupKFold(n_splits=5)
    n_train = X.shape[0]
    oof = np.zeros((n_train, num_classes), dtype=np.float32)
    test_pred = np.zeros((X_test.shape[0], num_classes), dtype=np.float32)
    per_auc = []
    for c in range(num_classes):
        y = Y_train[:, c].astype(int)
        if y.sum()==0 or y.sum()==y.shape[0]:
            per_auc.append(np.nan); continue
        cls_oof = np.zeros(n_train, dtype=np.float32)
        cls_test = np.zeros(X_test.shape[0], dtype=np.float32)
        fold_id = 0
        for tr_idx, va_idx in gkf.split(X, y, groups):
            fold_id += 1
            X_tr, X_va = X[tr_idx], X[va_idx]
            y_tr, y_va = y[tr_idx], y[va_idx]
            pos = int(y_tr.sum()); neg = int((1-y_tr).sum())
            spw = (neg/max(pos,1)) if pos>0 else 1.0
            model = LGBMClassifier(
                objective='binary',
                learning_rate=0.03,
                num_leaves=63,
                n_estimators=2500,
                subsample=0.9,
                colsample_bytree=0.9,
                min_child_samples=10,
                reg_lambda=2.0,
                random_state=seed,
                n_jobs=-1,
                scale_pos_weight=spw
            )
            model.fit(
                X_tr, y_tr,
                eval_set=[(X_va, y_va)],
                eval_metric='auc',
                callbacks=[lgb.early_stopping(150, verbose=False)]
            )
            p_va = model.predict_proba(X_va)[:,1].astype(np.float32)
            p_te = model.predict_proba(X_test)[:,1].astype(np.float32)
            cls_oof[va_idx] = p_va
            cls_test += p_te / 5.0
        try: auc_c = roc_auc_score(y, cls_oof)
        except Exception: auc_c = np.nan
        per_auc.append(auc_c)
        oof[:, c] = cls_oof; test_pred[:, c] = cls_test
    valid = [a for a in per_auc if not np.isnan(a)]
    macro = float(np.mean(valid)) if valid else float('nan')
    return oof, test_pred, macro

seeds = [42, 7, 2025]
oofs = []; tests = []; macros = []
for si, seed in enumerate(seeds, 1):
    print(f'Running LGBM seed {seed} ({si}/{len(seeds)})'); sys.stdout.flush()
    oof_s, test_s, macro_s = run_lgbm_seed(seed)
    oofs.append(oof_s); tests.append(test_s); macros.append(macro_s)
    print(f'Seed {seed} OOF Macro AUC: {macro_s:.5f}')

# Average predictions across seeds
oof_stack = np.stack(oofs, axis=0)
test_stack = np.stack(tests, axis=0)
oof_mean = oof_stack.mean(axis=0)
test_mean = test_stack.mean(axis=0)

# Compute final OOF macro
aucs=[]
for c in range(num_classes):
    y = Y_train[:, c]; p = oof_mean[:, c]
    if y.sum()==0 or y.sum()==y.shape[0]: continue
    try: aucs.append(roc_auc_score(y, p))
    except: pass
macro_final = float(np.mean(aucs)) if aucs else float('nan')
print(f'Final averaged OOF Macro AUC: {macro_final:.5f} | seeds individual: {[round(m,5) for m in macros]}')

# Save and build submission
np.save('lgbv2_oof.npy', oof_mean); np.save('lgbv2_test.npy', test_mean)
sub = pd.read_csv('sample_submission.csv')
id_vals = sub['Id'].values.astype(int)
rid_to_idx = {rid:i for i, rid in enumerate(rec_test)}
probs = np.zeros_like(id_vals, dtype=np.float32)
for i, Id in enumerate(id_vals):
    rid = Id // 100; cid = Id % 100
    row = rid_to_idx.get(rid, None)
    p = float(test_mean[row, cid]) if (row is not None and cid < num_classes) else 0.0
    probs[i] = np.clip(p, 1e-6, 1-1e-6)
sub['Probability'] = probs
sub.to_csv('submission.csv', index=False)
print('Saved submission.csv | Total time: %.2fs' % (time.time()-t0))

Train X/Y: (258, 361) (258, 19) | Test X: (64, 361)
Running LGBM seed 42 (1/3)


[LightGBM] [Info] Number of positive: 7, number of negative: 198
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003695 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10776
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.034146 -> initscore=-3.342357
[LightGBM] [Info] Start training from score -3.342357
[LightGBM] [Info] Number of positive: 7, number of negative: 196
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006097 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9923
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.034483 -> initscore=-3.332205
[LightGBM] [Info] Start training from score -3.332205




[LightGBM] [Info] Number of positive: 6, number of negative: 205
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003521 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11252
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.028436 -> initscore=-3.531251
[LightGBM] [Info] Start training from score -3.531251
[LightGBM] [Info] Number of positive: 7, number of negative: 196
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003409 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10372
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.034483 -> initscore=-3.332205
[LightGBM] [Info] Start training from score -3.332205




[LightGBM] [Info] Number of positive: 1, number of negative: 209
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003501 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11911
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.004762 -> initscore=-5.342334
[LightGBM] [Info] Start training from score -5.342334
[LightGBM] [Info] Number of positive: 15, number of negative: 190
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003449 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10776
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.073171 -> initscore=-2.538974
[LightGBM] [Info] Start training from score -2.538974




[LightGBM] [Info] Number of positive: 24, number of negative: 179
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003491 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9923
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.118227 -> initscore=-2.009332
[LightGBM] [Info] Start training from score -2.009332
[LightGBM] [Info] Number of positive: 28, number of negative: 183
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003453 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11252
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.132701 -> initscore=-1.877282
[LightGBM] [Info] Start training from score -1.877282




[LightGBM] [Info] Number of positive: 21, number of negative: 182
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003387 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10372
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.103448 -> initscore=-2.159484
[LightGBM] [Info] Start training from score -2.159484




[LightGBM] [Info] Number of positive: 24, number of negative: 186
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003499 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11911
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.114286 -> initscore=-2.047693
[LightGBM] [Info] Start training from score -2.047693


[LightGBM] [Info] Number of positive: 16, number of negative: 189
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003387 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10776
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.078049 -> initscore=-2.469158
[LightGBM] [Info] Start training from score -2.469158
[LightGBM] [Info] Number of positive: 16, number of negative: 187
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003372 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9923
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.078818 -> initscore=-2.458520
[LightGBM] [Info] Start training from score -2.458520


[LightGBM] [Info] Number of positive: 14, number of negative: 197
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003402 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11252
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.066351 -> initscore=-2.644146
[LightGBM] [Info] Start training from score -2.644146


[LightGBM] [Info] Number of positive: 13, number of negative: 190
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003525 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10372
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.064039 -> initscore=-2.682075
[LightGBM] [Info] Start training from score -2.682075
[LightGBM] [Info] Number of positive: 17, number of negative: 193
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003502 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11911
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080952 -> initscore=-2.429477
[LightGBM] [Info] Start training from score -2.429477


[LightGBM] [Info] Number of positive: 4, number of negative: 201
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003478 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10776
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.019512 -> initscore=-3.917011
[LightGBM] [Info] Start training from score -3.917011
[LightGBM] [Info] Number of positive: 2, number of negative: 201
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003418 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9923
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.009852 -> initscore=-4.610158
[LightGBM] [Info] Start training from score -4.610158


[LightGBM] [Info] Number of positive: 4, number of negative: 207
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003423 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11252
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.018957 -> initscore=-3.946424
[LightGBM] [Info] Start training from score -3.946424
[LightGBM] [Info] Number of positive: 2, number of negative: 201
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003315 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10372
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.009852 -> initscore=-4.610158
[LightGBM] [Info] Start training from score -4.610158
[LightGBM] [Info] Number

[LightGBM] [Info] Number of positive: 4, number of negative: 201
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003561 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10776
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.019512 -> initscore=-3.917011
[LightGBM] [Info] Start training from score -3.917011
[LightGBM] [Info] Number of positive: 6, number of negative: 197
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003634 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9923
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.029557 -> initscore=-3.491444
[LightGBM] [Info] Start training from score -3.491444


[LightGBM] [Info] Number of positive: 9, number of negative: 202
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003551 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11252
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.042654 -> initscore=-3.111043
[LightGBM] [Info] Start training from score -3.111043
[LightGBM] [Info] Number of positive: 8, number of negative: 195
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003480 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10372
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.039409 -> initscore=-3.193558
[LightGBM] [Info] Start training from score -3.193558


[LightGBM] [Info] Number of positive: 9, number of negative: 201
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003369 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11911
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.042857 -> initscore=-3.106080
[LightGBM] [Info] Start training from score -3.106080
[LightGBM] [Info] Number of positive: 5, number of negative: 200
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003452 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10776
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.024390 -> initscore=-3.688879
[LightGBM] [Info] Start training from score -3.688879


[LightGBM] [Info] Number of positive: 4, number of negative: 199
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003539 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9923
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.019704 -> initscore=-3.907010
[LightGBM] [Info] Start training from score -3.907010
[LightGBM] [Info] Number of positive: 3, number of negative: 208


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004468 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11252
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014218 -> initscore=-4.238926
[LightGBM] [Info] Start training from score -4.238926
[LightGBM] [Info] Number of positive: 4, number of negative: 199
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003260 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10372
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.019704 -> initscore=-3.907010
[LightGBM] [Info] Start training from score -3.907010
[LightGBM] [Info] Number of positive: 4, number of negative: 206
[LightGBM] [Info] Auto-c

[LightGBM] [Info] Number of positive: 19, number of negative: 186
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003396 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10776
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.092683 -> initscore=-2.281308
[LightGBM] [Info] Start training from score -2.281308


[LightGBM] [Info] Number of positive: 14, number of negative: 189
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003637 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9923
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.068966 -> initscore=-2.602690
[LightGBM] [Info] Start training from score -2.602690
[LightGBM] [Info] Number of positive: 14, number of negative: 197
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003465 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11252
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.066351 -> initscore=-2.644146
[LightGBM] [Info] Start training from score -2.644146


[LightGBM] [Info] Number of positive: 15, number of negative: 188
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003439 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10372
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.073892 -> initscore=-2.528392
[LightGBM] [Info] Start training from score -2.528392
[LightGBM] [Info] Number of positive: 14, number of negative: 196
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003350 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11911
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.066667 -> initscore=-2.639057
[LightGBM] [Info] Start training from score -2.639057


[LightGBM] [Info] Number of positive: 18, number of negative: 187
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003451 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10776
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.087805 -> initscore=-2.340737
[LightGBM] [Info] Start training from score -2.340737
[LightGBM] [Info] Number of positive: 13, number of negative: 190
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003450 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9923
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.064039 -> initscore=-2.682075
[LightGBM] [Info] Start training from score -2.682075


[LightGBM] [Info] Number of positive: 17, number of negative: 194
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003476 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11252
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080569 -> initscore=-2.434645
[LightGBM] [Info] Start training from score -2.434645
[LightGBM] [Info] Number of positive: 12, number of negative: 191
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003390 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10372
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.059113 -> initscore=-2.767367
[LightGBM] [Info] Start training from score -2.767367


[LightGBM] [Info] Number of positive: 16, number of negative: 194
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001173 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11911
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.076190 -> initscore=-2.495269
[LightGBM] [Info] Start training from score -2.495269


[LightGBM] [Info] Number of positive: 20, number of negative: 185
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003587 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10776
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.097561 -> initscore=-2.224624
[LightGBM] [Info] Start training from score -2.224624
[LightGBM] [Info] Number of positive: 19, number of negative: 184
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003517 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9923
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.093596 -> initscore=-2.270497
[LightGBM] [Info] Start training from score -2.270497


[LightGBM] [Info] Number of positive: 19, number of negative: 192
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003430 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11252
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.090047 -> initscore=-2.313056
[LightGBM] [Info] Start training from score -2.313056


[LightGBM] [Info] Number of positive: 17, number of negative: 186
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003469 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10372
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.083744 -> initscore=-2.392533
[LightGBM] [Info] Start training from score -2.392533
[LightGBM] [Info] Number of positive: 21, number of negative: 189
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003651 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11911
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.100000 -> initscore=-2.197225
[LightGBM] [Info] Start training from score -2.197225


[LightGBM] [Info] Number of positive: 21, number of negative: 184
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003462 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10776
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.102439 -> initscore=-2.170413
[LightGBM] [Info] Start training from score -2.170413


[LightGBM] [Info] Number of positive: 15, number of negative: 188
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003665 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9923
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.073892 -> initscore=-2.528392
[LightGBM] [Info] Start training from score -2.528392
[LightGBM] [Info] Number of positive: 16, number of negative: 195
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003435 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11252
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.075829 -> initscore=-2.500411
[LightGBM] [Info] Start training from score -2.500411


[LightGBM] [Info] Number of positive: 16, number of negative: 187
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003455 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10372
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.078818 -> initscore=-2.458520
[LightGBM] [Info] Start training from score -2.458520
[LightGBM] [Info] Number of positive: 16, number of negative: 194
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003509 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11911
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.076190 -> initscore=-2.495269
[LightGBM] [Info] Start training from score -2.495269


[LightGBM] [Info] Number of positive: 34, number of negative: 171
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003486 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10776
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.165854 -> initscore=-1.615303
[LightGBM] [Info] Start training from score -1.615303


[LightGBM] [Info] Number of positive: 38, number of negative: 165
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003478 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9923
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.187192 -> initscore=-1.468359
[LightGBM] [Info] Start training from score -1.468359
[LightGBM] [Info] Number of positive: 40, number of negative: 171
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003726 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11252
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.189573 -> initscore=-1.452784
[LightGBM] [Info] Start training from score -1.452784


[LightGBM] [Info] Number of positive: 39, number of negative: 164
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003473 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10372
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.192118 -> initscore=-1.436305
[LightGBM] [Info] Start training from score -1.436305
[LightGBM] [Info] Number of positive: 45, number of negative: 165
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003478 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11911
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.214286 -> initscore=-1.299283
[LightGBM] [Info] Start training from score -1.299283


[LightGBM] [Info] Number of positive: 2, number of negative: 203
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003781 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10776
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.009756 -> initscore=-4.620059
[LightGBM] [Info] Start training from score -4.620059
[LightGBM] [Info] Number of positive: 10, number of negative: 193
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003459 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9923
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049261 -> initscore=-2.960105
[LightGBM] [Info] Start training from score -2.960105


[LightGBM] [Info] Number of positive: 10, number of negative: 201
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003448 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11252
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047393 -> initscore=-3.000720
[LightGBM] [Info] Start training from score -3.000720
[LightGBM] [Info] Number of positive: 8, number of negative: 195
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003371 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10372
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.039409 -> initscore=-3.193558
[LightGBM] [Info] Start training from score -3.193558


[LightGBM] [Info] Number of positive: 10, number of negative: 200
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003441 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11911
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047619 -> initscore=-2.995732
[LightGBM] [Info] Start training from score -2.995732
[LightGBM] [Info] Number of positive: 10, number of negative: 195
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003529 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10776
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.048780 -> initscore=-2.970414
[LightGBM] [Info] Start training from score -2.970414


[LightGBM] [Info] Number of positive: 9, number of negative: 194
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003542 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9923
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.044335 -> initscore=-3.070634
[LightGBM] [Info] Start training from score -3.070634
[LightGBM] [Info] Number of positive: 9, number of negative: 202
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003507 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11252
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.042654 -> initscore=-3.111043
[LightGBM] [Info] Start training from score -3.111043


[LightGBM] [Info] Number of positive: 8, number of negative: 195
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003415 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10372
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.039409 -> initscore=-3.193558
[LightGBM] [Info] Start training from score -3.193558
[LightGBM] [Info] Number of positive: 12, number of negative: 198
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003461 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11911
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.057143 -> initscore=-2.803360
[LightGBM] [Info] Start training from score -2.803360


[LightGBM] [Info] Number of positive: 4, number of negative: 201
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003457 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10776
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 359
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.019512 -> initscore=-3.917011
[LightGBM] [Info] Start training from score -3.917011


ValueError: y contains previously unseen labels: [1]

In [23]:
# Evaluate available OOF preds and build submission from the best model
import os
import numpy as np, pandas as pd
from sklearn.metrics import roc_auc_score

# Load labels aligned to fold==0 train rec_ids
sp_df = pd.read_csv('essential_data/species_list.txt'); num_classes = sp_df.shape[0]
cv_df = pd.read_csv('essential_data/CVfolds_2.txt')
rec_map = pd.read_csv('essential_data/rec_id2filename.txt')
meta = cv_df.merge(rec_map, on='rec_id', how='left')

label_rows = []
with open('essential_data/rec_labels_test_hidden.txt', 'r') as f:
    _ = next(f, None)
    for line in f:
        s = line.strip()
        if not s: continue
        parts = [p for p in s.split(',') if p!='']
        try: rid = int(parts[0])
        except: continue
        labels = parts[1:] if len(parts)>1 else []
        if any(p.strip()=='?' for p in labels):
            continue
        y = np.zeros(num_classes, dtype=np.int8)
        for p in labels:
            if p.strip()=='' or p.strip()=='?': continue
            cid = int(p);
            if 0 <= cid < num_classes: y[cid]=1
        label_rows.append((rid, y))
label_map = {rid:y for rid,y in label_rows}

train_mask = (meta['fold']==0) & (meta['rec_id'].isin(label_map.keys()))
test_mask = (meta['fold']==1)
rec_train = meta.loc[train_mask, 'rec_id'].values.astype(int)
rec_test = meta.loc[test_mask, 'rec_id'].values.astype(int)
Y_train = np.vstack([label_map[int(r)] for r in rec_train]).astype(np.int8)

candidates = [
    ('advtab', 'advtab_oof.npy', 'advtab_test.npy'),
    ('tab_lgb', 'tab_lgb_oof.npy', 'tab_lgb_test.npy'),
    ('lgbv2', 'lgbv2_oof.npy', 'lgbv2_test.npy')
]

results = []
for name, oof_path, te_path in candidates:
    if os.path.exists(oof_path) and os.path.exists(te_path):
        try:
            oof = np.load(oof_path)
            if oof.shape != Y_train.shape:
                print(f"Skip {name}: shape mismatch {oof.shape} vs {Y_train.shape}")
                continue
            aucs=[]
            for c in range(Y_train.shape[1]):
                y = Y_train[:, c]; p = oof[:, c]
                if y.sum()==0 or y.sum()==y.shape[0]: continue
                try: aucs.append(roc_auc_score(y, p))
                except: pass
            macro = float(np.mean(aucs)) if aucs else float('nan')
            results.append((name, macro, oof_path, te_path))
            print(f"{name} OOF Macro AUC: {macro:.5f}")
        except Exception as e:
            print(f"Failed to load {name}: {e}")
    else:
        print(f"Missing files for {name}: {oof_path}, {te_path}")

if not results:
    raise SystemExit('No candidate prediction files found.')

best = max(results, key=lambda x: (x[1] if x[1]==x[1] else -1))
best_name, best_auc, _, best_te_path = best
print(f"Best model: {best_name} with OOF Macro AUC={best_auc:.5f}")

# Build submission from best test preds
test_pred = np.load(best_te_path)
sub = pd.read_csv('sample_submission.csv')
id_vals = sub['Id'].values.astype(int)
rid_to_idx = {rid:i for i, rid in enumerate(rec_test)}
probs = np.zeros_like(id_vals, dtype=np.float32)
for i, Id in enumerate(id_vals):
    rid = Id // 100; cid = Id % 100
    row = rid_to_idx.get(rid, None)
    p = float(test_pred[row, cid]) if (row is not None and cid < num_classes) else 0.0
    probs[i] = np.clip(p, 1e-6, 1-1e-6)
sub['Probability'] = probs
sub.to_csv('submission.csv', index=False)
print('Submission updated from', best_name, '| shape:', sub.shape)

advtab OOF Macro AUC: 0.52680
tab_lgb OOF Macro AUC: 0.48783
Missing files for lgbv2: lgbv2_oof.npy, lgbv2_test.npy
Best model: advtab with OOF Macro AUC=0.52680
Submission updated from advtab | shape: (1216, 2)


In [42]:
# Evaluate all saved OOFs with strict ID alignment and pick best submission
import os, numpy as np, pandas as pd, sys
from sklearn.metrics import roc_auc_score

# Load labels (train fold==0) and build rec_id -> y mapping
sp_df = pd.read_csv('essential_data/species_list.txt'); num_classes = sp_df.shape[0]
cv_df = pd.read_csv('essential_data/CVfolds_2.txt')
rec_map = pd.read_csv('essential_data/rec_id2filename.txt')
meta = cv_df.merge(rec_map, on='rec_id', how='left')

label_rows = []
with open('essential_data/rec_labels_test_hidden.txt', 'r') as f:
    _ = next(f, None)
    for line in f:
        s = line.strip()
        if not s: continue
        parts = [p for p in s.split(',') if p!='']
        try: rid = int(parts[0])
        except: continue
        labels = parts[1:] if len(parts)>1 else []
        if any(p.strip()=='?' for p in labels):
            continue
        y = np.zeros(num_classes, dtype=np.int8)
        for p in labels:
            if p.strip()=='' or p.strip()=='?': continue
            cid = int(p);
            if 0 <= cid < num_classes: y[cid]=1
        label_rows.append((rid, y))
label_map = {rid:y for rid,y in label_rows}

# Train/test ids from meta (fallback only if model-specific ids are missing; prefer model ids)
train_mask_meta = (meta['fold']==0) & (meta['rec_id'].isin(label_map.keys()))
test_mask_meta = (meta['fold']==1)
rec_train_meta = meta.loc[train_mask_meta, 'rec_id'].values.astype(int)
rec_test_meta = meta.loc[test_mask_meta, 'rec_id'].values.astype(int)

def eval_candidate(prefix, oof_path, te_path):
    if not (os.path.exists(oof_path) and os.path.exists(te_path)):
        print(f"Missing files for {prefix}: {oof_path}, {te_path}")
        return None
    try:
        oof = np.load(oof_path)
    except Exception as e:
        print(f"Failed to load {prefix} OOF: {e}")
        return None
    # Load model-specific id orders if present
    tr_ids_path = f"{prefix}_train_ids.npy"
    te_ids_path = f"{prefix}_test_ids.npy"
    use_meta_fallback = False
    if os.path.exists(tr_ids_path):
        rec_train_ids = np.load(tr_ids_path).astype(int)
    else:
        # Fallback: assume meta order (may misalign) -> de-prioritize by returning None if shapes mismatch
        rec_train_ids = rec_train_meta.copy()
        use_meta_fallback = True
    # Build Y aligned strictly to rec_train_ids
    try:
        Y_aligned = np.vstack([label_map[int(r)] for r in rec_train_ids]).astype(np.int8)
    except Exception as e:
        print(f"{prefix}: failed to build Y_aligned from ids ({e})")
        return None
    if oof.shape != Y_aligned.shape:
        print(f"Skip {prefix}: shape mismatch {oof.shape} vs Y {Y_aligned.shape}")
        return None
    aucs=[]
    for c in range(Y_aligned.shape[1]):
        y = Y_aligned[:, c]; p = oof[:, c]
        if y.sum()==0 or y.sum()==y.shape[0]:
            continue
        try:
            aucs.append(roc_auc_score(y, p))
        except Exception:
            pass
    macro = float(np.mean(aucs)) if aucs else float('nan')
    note = '' if not use_meta_fallback else ' (meta-id fallback)'
    print(f"{prefix} OOF Macro AUC: {macro:.5f}{note}")
    return (prefix, macro, oof_path, te_path, te_ids_path)

candidates = [
    ('advtab', 'advtab_oof.npy', 'advtab_test.npy'),
    ('tab_lgb', 'tab_lgb_oof.npy', 'tab_lgb_test.npy'),
    ('lgbv2', 'lgbv2_oof.npy', 'lgbv2_test.npy'),
    ('tab_strong', 'tab_strong_oof.npy', 'tab_strong_test.npy'),
    ('tab_fast', 'tab_fast_oof.npy', 'tab_fast_test.npy'),
    ('hist_lr', 'hist_lr_oof.npy', 'hist_lr_test.npy'),
    ('blend1', 'blend1_oof.npy', 'blend1_test.npy')
]

results = []
for name, oof_path, te_path in candidates:
    r = eval_candidate(name, oof_path, te_path)
    if r is not None:
        results.append(r)

if not results:
    raise SystemExit('No valid candidate prediction files found (id alignment failed).')

best = max(results, key=lambda x: (x[1] if x[1]==x[1] else -1))
best_name, best_auc, _, best_te_path, best_te_ids_path = best
print(f"Best model: {best_name} with OOF Macro AUC={best_auc:.5f}")

# Build submission from best using its own test id order if available
test_pred = np.load(best_te_path)
if os.path.exists(best_te_ids_path):
    rec_test_best = np.load(best_te_ids_path).astype(int)
else:
    rec_test_best = rec_test_meta.copy()

sub = pd.read_csv('sample_submission.csv')
id_vals = sub['Id'].values.astype(int)
rid_to_idx = {rid:i for i, rid in enumerate(rec_test_best)}
probs = np.zeros_like(id_vals, dtype=np.float32)
for i, Id in enumerate(id_vals):
    rid = Id // 100; cid = Id % 100
    row = rid_to_idx.get(rid, None)
    p = float(test_pred[row, cid]) if (row is not None and cid < num_classes) else 0.0
    probs[i] = np.clip(p, 1e-6, 1-1e-6)
sub['Probability'] = probs
sub.to_csv('submission.csv', index=False)
print('submission.csv written from', best_name, '| shape:', sub.shape)

advtab OOF Macro AUC: 0.52680 (meta-id fallback)
tab_lgb OOF Macro AUC: 0.48783 (meta-id fallback)
Missing files for lgbv2: lgbv2_oof.npy, lgbv2_test.npy
tab_strong OOF Macro AUC: 0.48996
tab_fast OOF Macro AUC: 0.58310
hist_lr OOF Macro AUC: 0.63444
blend1 OOF Macro AUC: 0.68892
Best model: blend1 with OOF Macro AUC=0.68892
submission.csv written from blend1 | shape: (1216, 2)


In [34]:
# Fast tabular model (tab_fast): simplified aggregations (mean/std/min/max/median), GroupKFold by station, robust LGBM with ID alignment
import numpy as np, pandas as pd, sys, time, os
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score

t0 = time.time()
sp_df = pd.read_csv('essential_data/species_list.txt'); num_classes = sp_df.shape[0]
cv_df = pd.read_csv('essential_data/CVfolds_2.txt')
rec_map = pd.read_csv('essential_data/rec_id2filename.txt')
rec_map['station'] = rec_map['filename'].str.split('_').str[0]
meta = cv_df.merge(rec_map, on='rec_id', how='left')

# Labels
label_rows = []
with open('essential_data/rec_labels_test_hidden.txt', 'r') as f:
    _ = next(f, None)
    for line in f:
        s = line.strip()
        if not s: continue
        parts = [p for p in s.split(',') if p!='']
        try: rid = int(parts[0])
        except: continue
        labels = parts[1:] if len(parts)>1 else []
        if any(p.strip()=='?' for p in labels):
            continue
        y = np.zeros(num_classes, dtype=np.int8)
        for p in labels:
            p = p.strip()
            if p=='' or p=='?': continue
            cid = int(p);
            if 0 <= cid < num_classes: y[cid]=1
        label_rows.append((rid, y))
label_map = {rid:y for rid,y in label_rows}
print('Labels parsed:', len(label_map))

# Load segment_features
seg_records = []
with open('supplemental_data/segment_features.txt', 'r') as f:
    _ = next(f, None)
    for line in f:
        s = line.strip()
        if not s: continue
        parts = [p for p in s.split(',') if p!='']
        if len(parts) < 3: continue
        try: rid = int(parts[0]); seg = int(parts[1])
        except: continue
        vals = []
        for v in parts[2:]:
            try: vals.append(float(v))
            except: vals.append(0.0)
        seg_records.append((rid, seg, vals))
max_len = max((len(v) for _,_,v in seg_records), default=0)
sf_cols = [f'sf_{i}' for i in range(max_len)]
sf_df = pd.DataFrame([([rid,seg]+v+[0.0]*(max_len-len(v))) for rid,seg,v in seg_records],
                     columns=['rec_id','seg_idx']+sf_cols) if seg_records else pd.DataFrame(columns=['rec_id','seg_idx']+sf_cols)
print('segment_features:', sf_df.shape)

# Load rectangles
rect_rows = []
with open('supplemental_data/segment_rectangles.txt', 'r') as f:
    _ = next(f, None)
    for line in f:
        s = line.strip().strip(',')
        if not s: continue
        parts = [p for p in s.split(',') if p!='']
        if len(parts) < 6: continue
        try:
            rid = int(parts[0]); seg = int(parts[1])
            t0r = float(parts[2]); t1r = float(parts[3]); f0r = float(parts[4]); f1r = float(parts[5])
        except:
            continue
        rect_rows.append((rid, seg, t0r, t1r, f0r, f1r))
rect_df = pd.DataFrame(rect_rows, columns=['rec_id','seg_idx','t_start','t_end','f_start','f_end'])
if not rect_df.empty:
    rect_df['duration'] = rect_df['t_end'] - rect_df['t_start']
    rect_df['freq_span'] = rect_df['f_end'] - rect_df['f_start']
print('segment_rectangles:', rect_df.shape)

# Merge + aggregate (fast aggregations only)
seg_full = sf_df.merge(rect_df, on=['rec_id','seg_idx'], how='left') if not rect_df.empty else sf_df.copy()
seg_full['segment_count'] = 1
num_cols = [c for c in seg_full.columns if c not in ['rec_id','seg_idx']]
agg_funcs = ['mean','std','min','max','median']
agg_dict = {c: agg_funcs for c in num_cols}
gb = seg_full.groupby('rec_id').agg(agg_dict) if not seg_full.empty else pd.DataFrame()
if not gb.empty:
    gb.columns = [f"{a}_{b}" for a,b in gb.columns.to_flat_index()]
    gb = gb.reset_index()
    # extra totals
    gb['segment_count_total'] = seg_full.groupby('rec_id')['segment_count'].sum().values
else:
    gb = pd.DataFrame({'rec_id': meta['rec_id'].unique()})
feat_df = meta.merge(gb, on='rec_id', how='left').fillna(0.0)

# Train/test masks
known_mask = feat_df['rec_id'].isin(label_map.keys())
train_mask = (feat_df['fold']==0) & known_mask
test_mask = (feat_df['fold']==1)
feature_cols = [c for c in feat_df.columns if c not in ['rec_id','fold','filename','station']]
X = feat_df.loc[train_mask, feature_cols].to_numpy(np.float32)
groups = feat_df.loc[train_mask, 'station'].astype(str).values
rec_train = feat_df.loc[train_mask, 'rec_id'].values.astype(int)
Y_train = np.vstack([label_map[int(r)] for r in rec_train]).astype(np.int8)
X_test = feat_df.loc[test_mask, feature_cols].to_numpy(np.float32)
rec_test = feat_df.loc[test_mask, 'rec_id'].values.astype(int)
print('Shapes | X:', X.shape, 'Y:', Y_train.shape, '| X_test:', X_test.shape)

# Save ID orders for alignment
np.save('tab_fast_train_ids.npy', rec_train); np.save('tab_fast_test_ids.npy', rec_test)

try:
    import lightgbm as lgb
    from lightgbm import LGBMClassifier
except Exception:
    import subprocess
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'lightgbm'])
    import lightgbm as lgb
    from lightgbm import LGBMClassifier

gkf = GroupKFold(n_splits=5)
n_train = X.shape[0]
oof = np.zeros((n_train, num_classes), dtype=np.float32)
test_pred = np.zeros((X_test.shape[0], num_classes), dtype=np.float32)
per_auc = []

for c in range(num_classes):
    y = Y_train[:, c].astype(int)
    if y.sum()==0 or y.sum()==y.shape[0]:
        per_auc.append(np.nan); continue
    cls_oof = np.zeros(n_train, dtype=np.float32)
    cls_te = np.zeros(X_test.shape[0], dtype=np.float32)
    fold_id = 0
    for tr_idx, va_idx in gkf.split(X, y, groups):
        fold_id += 1
        X_tr, X_va = X[tr_idx], X[va_idx]
        y_tr, y_va = y[tr_idx], y[va_idx]
        pos = int(y_tr.sum()); neg = int((1-y_tr).sum())
        if pos == 0 or neg == 0 or (y_va.sum()==0 or y_va.sum()==len(y_va)):
            const = float(y.mean())
            cls_oof[va_idx] = const
            cls_te += np.full(X_test.shape[0], const, np.float32)/5.0
            print(f'class {c:02d} fold {fold_id}: degenerate -> const {const:.4f}')
            continue
        spw = neg/pos
        model = LGBMClassifier(objective='binary', learning_rate=0.03, num_leaves=63,
                               n_estimators=1200, subsample=0.9, colsample_bytree=0.9,
                               min_child_samples=10, reg_lambda=1.5, random_state=42,
                               n_jobs=-1, scale_pos_weight=spw)
        model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], eval_metric='auc',
                  callbacks=[lgb.early_stopping(100, verbose=False)])
        p_va = model.predict_proba(X_va)[:,1].astype(np.float32)
        p_te = model.predict_proba(X_test)[:,1].astype(np.float32)
        cls_oof[va_idx] = p_va; cls_te += p_te/5.0
        if fold_id % 2 == 0: print(f'class {c:02d} fold {fold_id}: pos={pos} neg={neg} spw={spw:.2f} best_iter={getattr(model,"best_iteration_",None)}')
    try:
        auc_c = roc_auc_score(y, cls_oof)
    except Exception:
        auc_c = np.nan
    per_auc.append(auc_c)
    oof[:, c] = cls_oof; test_pred[:, c] = cls_te
valid = [a for a in per_auc if not np.isnan(a)]
macro = float(np.mean(valid)) if valid else float('nan')
print(f'OOF Macro AUC (tab_fast): {macro:.5f}')

np.save('tab_fast_oof.npy', oof); np.save('tab_fast_test.npy', test_pred)

# Also write a convenience submission file
sub = pd.read_csv('sample_submission.csv')
id_vals = sub['Id'].values.astype(int)
rid_to_idx = {rid:i for i, rid in enumerate(rec_test)}
probs = np.zeros_like(id_vals, dtype=np.float32)
for i, Id in enumerate(id_vals):
    rid = Id // 100; cid = Id % 100
    row = rid_to_idx.get(rid, None)
    p = float(test_pred[row, cid]) if (row is not None and cid < num_classes) else 0.0
    probs[i] = np.clip(p, 1e-6, 1-1e-6)
sub['Probability'] = probs
sub.to_csv('submission_tab_fast.csv', index=False)
print('Saved submission_tab_fast.csv; elapsed %.1fs' % (time.time()-t0))

Labels parsed: 258
segment_features: (1119, 40)
segment_rectangles: (1119, 8)
Shapes | X: (258, 226) Y: (258, 19) | X_test: (64, 226)
class 00 fold 1: degenerate -> const 0.0271
class 00 fold 2: degenerate -> const 0.0271
[LightGBM] [Info] Number of positive: 6, number of negative: 205
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002020 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7103
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 225
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.028436 -> initscore=-3.531251
[LightGBM] [Info] Start training from score -3.531251
class 00 fold 4: degenerate -> const 0.0271
[LightGBM] [Info] Number of positive: 1, number of negative: 209
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002008 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] To

[LightGBM] [Info] Number of positive: 24, number of negative: 179
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002274 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6274
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 225
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.118227 -> initscore=-2.009332
[LightGBM] [Info] Start training from score -2.009332
class 01 fold 2: pos=24 neg=179 spw=7.46 best_iter=14
class 01 fold 3: degenerate -> const 0.1085
[LightGBM] [Info] Number of positive: 21, number of negative: 182
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002211 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6527
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 225
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.103448 -> 

class 01 fold 4: pos=21 neg=182 spw=8.67 best_iter=27
[LightGBM] [Info] Number of positive: 24, number of negative: 186
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002109 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7501
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 225
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.114286 -> initscore=-2.047693
[LightGBM] [Info] Start training from score -2.047693
[LightGBM] [Info] Number of positive: 16, number of negative: 189
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002324 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6786
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 225
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.078049 -> initscore=-2.469158
[LightGBM] [Info] Start 

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003105 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6274
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 225
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.078818 -> initscore=-2.458520
[LightGBM] [Info] Start training from score -2.458520
class 02 fold 2: pos=16 neg=187 spw=11.69 best_iter=5
[LightGBM] [Info] Number of positive: 14, number of negative: 197
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002117 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7103
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 225
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.066351 -> initscore=-2.644146
[LightGBM] [Info] Start training from score -2.644146
[LightGBM] [Info] Number of positive

class 02 fold 4: pos=13 neg=190 spw=14.62 best_iter=13
[LightGBM] [Info] Number of positive: 17, number of negative: 193
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002061 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7501
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 225
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080952 -> initscore=-2.429477
[LightGBM] [Info] Start training from score -2.429477
class 03 fold 1: degenerate -> const 0.0155
[LightGBM] [Info] Number of positive: 2, number of negative: 201
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002382 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6274
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 225
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.009852 -> 

[LightGBM] [Info] Number of positive: 6, number of negative: 197
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002298 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6274
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 225
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.029557 -> initscore=-3.491444
[LightGBM] [Info] Start training from score -3.491444
class 04 fold 2: pos=6 neg=197 spw=32.83 best_iter=5
class 04 fold 3: degenerate -> const 0.0349
[LightGBM] [Info] Number of positive: 8, number of negative: 195
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002141 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6527
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 225
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.039409 -> ini

class 05 fold 2: pos=4 neg=199 spw=49.75 best_iter=49
[LightGBM] [Info] Number of positive: 3, number of negative: 208
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002079 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7103
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 225
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014218 -> initscore=-4.238926
[LightGBM] [Info] Start training from score -4.238926
[LightGBM] [Info] Number of positive: 4, number of negative: 199
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002006 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6527
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 225
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.019704 -> initscore=-3.907010
[LightGBM] [Info] Start tr

class 06 fold 2: pos=14 neg=189 spw=13.50 best_iter=3
[LightGBM] [Info] Number of positive: 14, number of negative: 197
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001105 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7103
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 225
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.066351 -> initscore=-2.644146
[LightGBM] [Info] Start training from score -2.644146
[LightGBM] [Info] Number of positive: 15, number of negative: 188
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001994 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6527
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 225
[LightGBM] [Info] [binary:BoostFromScore

[LightGBM] [Info] Number of positive: 18, number of negative: 187
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001982 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6786
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 225
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.087805 -> initscore=-2.340737
[LightGBM] [Info] Start training from score -2.340737
[LightGBM] [Info] Number of positive: 13, number of negative: 190
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002165 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6274
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 225
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.064039 -> initscore=-2.682075
[LightGBM] [Info] Start training from score -2.682075
class 07 fold 2: pos=13 

[LightGBM] [Info] Total Bins 7103
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 225
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080569 -> initscore=-2.434645
[LightGBM] [Info] Start training from score -2.434645
[LightGBM] [Info] Number of positive: 12, number of negative: 191
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002149 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6527
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 225
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.059113 -> initscore=-2.767367
[LightGBM] [Info] Start training from score -2.767367
class 07 fold 4: pos=12 neg=191 spw=15.92 best_iter=1
[LightGBM] [Info] Number of positive: 16, number of negative: 194
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002059 seconds.
You can set `force_col_wise=

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005399 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6786
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 225
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.097561 -> initscore=-2.224624
[LightGBM] [Info] Start training from score -2.224624
[LightGBM] [Info] Number of positive: 19, number of negative: 184
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002347 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6274
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 225
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.093596 -> initscore=-2.270497
[LightGBM] [Info] Start training from score -2.270497
class 08 fold 2: pos=19 neg=184 spw=9.68 best_iter=8
[LightGBM] [Info] Number of positive:

[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.090047 -> initscore=-2.313056
[LightGBM] [Info] Start training from score -2.313056
[LightGBM] [Info] Number of positive: 17, number of negative: 186
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002133 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6527
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 225
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.083744 -> initscore=-2.392533
[LightGBM] [Info] Start training from score -2.392533
class 08 fold 4: pos=17 neg=186 spw=10.94 best_iter=6
[LightGBM] [Info] Number of positive: 21, number of negative: 189
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002051 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7501
[LightGBM] [Info] Number of data points in the train set: 210,

class 09 fold 1: degenerate -> const 0.0814
[LightGBM] [Info] Number of positive: 15, number of negative: 188
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002271 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6274
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 225
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.073892 -> initscore=-2.528392
[LightGBM] [Info] Start training from score -2.528392
class 09 fold 2: pos=15 neg=188 spw=12.53 best_iter=4
[LightGBM] [Info] Number of positive: 16, number of negative: 195
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002050 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7103
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 225
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.075829 -> 

class 09 fold 4: pos=16 neg=187 spw=11.69 best_iter=2
[LightGBM] [Info] Number of positive: 16, number of negative: 194
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002042 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7501
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 225
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.076190 -> initscore=-2.495269
[LightGBM] [Info] Start training from score -2.495269
[LightGBM] [Info] Number of positive: 34, number of negative: 171
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002085 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6786
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 225
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.165854 -> initscore=-1.615303
[LightGBM] [Info] Start 

[LightGBM] [Info] Number of positive: 38, number of negative: 165
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002270 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6274
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 225
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.187192 -> initscore=-1.468359
[LightGBM] [Info] Start training from score -1.468359
class 10 fold 2: pos=38 neg=165 spw=4.34 best_iter=7
[LightGBM] [Info] Number of positive: 40, number of negative: 171
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002095 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7103
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 225
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.189573 -> initscore=-1.452784
[LightGBM] [Info] Start t

class 10 fold 4: pos=39 neg=164 spw=4.21 best_iter=49
[LightGBM] [Info] Number of positive: 45, number of negative: 165
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002073 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7501
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 225
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.214286 -> initscore=-1.299283
[LightGBM] [Info] Start training from score -1.299283
[LightGBM] [Info] Number of positive: 2, number of negative: 203
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002104 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6786
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 225
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.009756 -> initscore=-4.620059
[LightGBM] [Info] Start t

class 11 fold 4: pos=8 neg=195 spw=24.38 best_iter=2
class 11 fold 5: degenerate -> const 0.0388
[LightGBM] [Info] Number of positive: 10, number of negative: 195
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002079 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6786
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 225
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.048780 -> initscore=-2.970414
[LightGBM] [Info] Start training from score -2.970414
[LightGBM] [Info] Number of positive: 9, number of negative: 194
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002277 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6274
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 225
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.044335 -> in

[LightGBM] [Info] Number of positive: 8, number of negative: 195
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002143 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6527
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 225
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.039409 -> initscore=-3.193558
[LightGBM] [Info] Start training from score -3.193558
class 12 fold 4: pos=8 neg=195 spw=24.38 best_iter=6
class 12 fold 5: degenerate -> const 0.0465
class 13 fold 1: degenerate -> const 0.0155
class 13 fold 2: degenerate -> const 0.0155
class 13 fold 3: degenerate -> const 0.0155
class 13 fold 4: degenerate -> const 0.0155
class 13 fold 5: degenerate -> const 0.0155
[LightGBM] [Info] Number of positive: 14, number of negative: 191
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002111 seconds.
You can set `force_col_wis

class 14 fold 2: pos=13 neg=190 spw=14.62 best_iter=5
[LightGBM] [Info] Number of positive: 9, number of negative: 202
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002062 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7103
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 225
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.042654 -> initscore=-3.111043
[LightGBM] [Info] Start training from score -3.111043
[LightGBM] [Info] Number of positive: 15, number of negative: 188
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002120 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6527
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 225
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.073892 -> initscore=-2.528392
[LightGBM] [Info] Start t

[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.061905 -> initscore=-2.718254
[LightGBM] [Info] Start training from score -2.718254
[LightGBM] [Info] Number of positive: 4, number of negative: 201
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002049 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6786
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 225
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.019512 -> initscore=-3.917011
[LightGBM] [Info] Start training from score -3.917011
[LightGBM] [Info] Number of positive: 3, number of negative: 200
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002232 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6274
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 225
[LightGBM] [Info] [binary:

class 15 fold 4: degenerate -> const 0.0233
class 15 fold 5: degenerate -> const 0.0233
class 16 fold 1: degenerate -> const 0.0078
class 16 fold 2: degenerate -> const 0.0078
class 16 fold 3: degenerate -> const 0.0078
class 16 fold 4: degenerate -> const 0.0078
class 16 fold 5: degenerate -> const 0.0078
[LightGBM] [Info] Number of positive: 3, number of negative: 202
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002103 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6786
[LightGBM] [Info] Number of data points in the train set: 205, number of used features: 225
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014634 -> initscore=-4.209655
[LightGBM] [Info] Start training from score -4.209655
[LightGBM] [Info] Number of positive: 2, number of negative: 201
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002274 seconds.
You can set `force_col_wise=true` to

[LightGBM] [Info] Number of positive: 8, number of negative: 195
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002237 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6274
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 225
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.039409 -> initscore=-3.193558
[LightGBM] [Info] Start training from score -3.193558
class 18 fold 2: pos=8 neg=195 spw=24.38 best_iter=78
[LightGBM] [Info] Number of positive: 11, number of negative: 200
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002040 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7103
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 225
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.052133 -> initscore=-2.900422
[LightGBM] [Info] Start t

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003100 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6527
[LightGBM] [Info] Number of data points in the train set: 203, number of used features: 225
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.029557 -> initscore=-3.491444
[LightGBM] [Info] Start training from score -3.491444
class 18 fold 4: pos=6 neg=197 spw=32.83 best_iter=45
class 18 fold 5: degenerate -> const 0.0465
OOF Macro AUC (tab_fast): 0.58310
Saved submission_tab_fast.csv; elapsed 4.6s


In [40]:
# Histogram LogisticRegression with GroupKFold by station (hist_lr) + ID alignment + station priors + per-fold SVD
import numpy as np, pandas as pd, sys, time, os
from sklearn.model_selection import GroupKFold
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import TruncatedSVD

t0 = time.time()
sp_df = pd.read_csv('essential_data/species_list.txt'); num_classes = sp_df.shape[0]
cv_df = pd.read_csv('essential_data/CVfolds_2.txt')
rec_map = pd.read_csv('essential_data/rec_id2filename.txt')
rec_map['station'] = rec_map['filename'].str.split('_').str[0]
meta = cv_df.merge(rec_map, on='rec_id', how='left')

# Parse labels (train fold only; '?' skipped; empty -> all zeros)
label_rows = []
with open('essential_data/rec_labels_test_hidden.txt', 'r') as f:
    _ = next(f, None)
    for line in f:
        s = line.strip()
        if not s: continue
        parts = [p for p in s.split(',') if p!='']
        try: rid = int(parts[0])
        except: continue
        labels = parts[1:] if len(parts)>1 else []
        if any(p.strip()=='?' for p in labels):
            continue
        y = np.zeros(num_classes, dtype=np.int8)
        for p in labels:
            p = p.strip()
            if p=='' or p=='?': continue
            cid = int(p)
            if 0 <= cid < num_classes: y[cid]=1
        label_rows.append((rid, y))
label_map = {rid:y for rid,y in label_rows}
print('Labels parsed:', len(label_map))

# Parse histogram_of_segments.txt robustly
hist_rows = []
with open('supplemental_data/histogram_of_segments.txt', 'r') as f:
    _ = next(f, None)
    for line in f:
        s = line.strip()
        if not s: continue
        parts = [p for p in s.split(',') if p!='']
        try: rid = int(parts[0])
        except: continue
        vals = []
        for v in parts[1:]:
            try: vals.append(float(v))
            except: vals.append(0.0)
        hist_rows.append((rid, vals))
max_len = max((len(v) for _, v in hist_rows), default=0)
H = np.zeros((len(hist_rows), max_len), dtype=np.float32)
R = np.zeros((len(hist_rows),), dtype=int)
for i, (rid, vals) in enumerate(hist_rows):
    R[i] = rid; L = min(max_len, len(vals))
    if L: H[i, :L] = np.asarray(vals[:L], np.float32)
hos = pd.DataFrame(H); hos.insert(0, 'rec_id', R)
print('Histogram features:', hos.shape)

# Merge with meta to get folds and stations
feat_df = meta.merge(hos, on='rec_id', how='left').fillna(0.0)
known_mask = feat_df['rec_id'].isin(label_map.keys())
train_mask = (feat_df['fold']==0) & known_mask
test_mask = (feat_df['fold']==1)
feature_cols = [c for c in hos.columns if c != 'rec_id']
X_base = feat_df.loc[train_mask, feature_cols].to_numpy(np.float32)
groups = feat_df.loc[train_mask, 'station'].astype(str).values
stations_train = groups.copy()
rec_train = feat_df.loc[train_mask, 'rec_id'].values.astype(int)
Y_train = np.vstack([label_map[int(r)] for r in rec_train]).astype(np.int8)
X_test_base = feat_df.loc[test_mask, feature_cols].to_numpy(np.float32)
stations_test = feat_df.loc[test_mask, 'station'].astype(str).values
rec_test = feat_df.loc[test_mask, 'rec_id'].values.astype(int)
print('Shapes | X:', X_base.shape, 'Y:', Y_train.shape, '| X_test:', X_test_base.shape)

# Save ID orders for alignment
np.save('hist_lr_train_ids.npy', rec_train); np.save('hist_lr_test_ids.npy', rec_test)

gkf = GroupKFold(n_splits=5)
n_train = X_base.shape[0]
oof = np.zeros((n_train, num_classes), dtype=np.float32)
test_pred = np.zeros((X_test_base.shape[0], num_classes), dtype=np.float32)
per_auc = []

for c in range(num_classes):
    y = Y_train[:, c].astype(int)
    if y.sum()==0 or y.sum()==y.shape[0]:
        per_auc.append(np.nan); continue
    cls_oof = np.zeros(n_train, dtype=np.float32)
    cls_te = np.zeros(X_test_base.shape[0], dtype=np.float32)
    fold_id = 0
    for tr_idx, va_idx in gkf.split(X_base, y, groups):
        fold_id += 1
        X_tr, X_va = X_base[tr_idx], X_base[va_idx]
        y_tr, y_va = y[tr_idx], y[va_idx]
        st_tr = stations_train[tr_idx]; st_va = stations_train[va_idx]
        # Station prior within fold
        st_prior = {}
        for s in np.unique(st_tr):
            m = y_tr[st_tr == s].mean() if (st_tr == s).any() else y_tr.mean()
            st_prior[s] = float(m)
        st_te = np.array([st_prior.get(s, float(y_tr.mean())) for s in stations_test], dtype=np.float32)
        st_tr_feat = np.array([st_prior.get(s, float(y_tr.mean())) for s in st_tr], dtype=np.float32)[:, None]
        st_va_feat = np.array([st_prior.get(s, float(y_tr.mean())) for s in st_va], dtype=np.float32)[:, None]
        # Per-fold SVD on histogram features
        n_comp = min(24, max(2, X_tr.shape[1]-1))
        svd = TruncatedSVD(n_components=n_comp, random_state=42)
        svd.fit(X_tr)
        Z_tr = svd.transform(X_tr)
        Z_va = svd.transform(X_va)
        Z_te = svd.transform(X_test_base)
        # Augment features with station prior
        X_tr_aug = np.concatenate([Z_tr, st_tr_feat], axis=1)
        X_va_aug = np.concatenate([Z_va, st_va_feat], axis=1)
        X_te_aug = np.concatenate([Z_te, st_te[:, None]], axis=1)
        if len(np.unique(y_tr)) < 2 or len(np.unique(y_va)) < 2:
            const = float(y.mean())
            cls_oof[va_idx] = const
            cls_te += np.full(X_test_base.shape[0], const, np.float32)/5.0
            continue
        pipe = Pipeline([('sc', StandardScaler(with_mean=True, with_std=True)),
                         ('lr', LogisticRegression(max_iter=2000, solver='liblinear', class_weight='balanced'))])
        pipe.fit(X_tr_aug, y_tr)
        cls_oof[va_idx] = pipe.predict_proba(X_va_aug)[:,1].astype(np.float32)
        cls_te += pipe.predict_proba(X_te_aug)[:,1].astype(np.float32)/5.0
    try:
        auc_c = roc_auc_score(y, cls_oof)
    except Exception:
        auc_c = np.nan
    per_auc.append(auc_c)
    oof[:, c] = cls_oof; test_pred[:, c] = cls_te

valid = [a for a in per_auc if not np.isnan(a)]
macro = float(np.mean(valid)) if valid else float('nan')
print(f'OOF Macro AUC (hist_lr with station prior + SVD): {macro:.5f}')

np.save('hist_lr_oof.npy', oof); np.save('hist_lr_test.npy', test_pred)

# Convenience submission
sub = pd.read_csv('sample_submission.csv')
id_vals = sub['Id'].values.astype(int)
rid_to_idx = {rid:i for i, rid in enumerate(rec_test)}
probs = np.zeros_like(id_vals, dtype=np.float32)
for i, Id in enumerate(id_vals):
    rid = Id // 100; cid = Id % 100
    row = rid_to_idx.get(rid, None)
    p = float(test_pred[row, cid]) if (row is not None and cid < num_classes) else 0.0
    probs[i] = np.clip(p, 1e-6, 1-1e-6)
sub['Probability'] = probs
sub.to_csv('submission_hist_lr.csv', index=False)
print('Saved submission_hist_lr.csv; elapsed %.1fs' % (time.time()-t0))

Labels parsed: 258
Histogram features: (322, 101)
Shapes | X: (258, 100) Y: (258, 19) | X_test: (64, 100)


OOF Macro AUC (hist_lr with station prior + SVD): 0.63444
Saved submission_hist_lr.csv; elapsed 2.1s


In [41]:
# Blend hist_lr and tab_fast with strict ID alignment (blend1)
import numpy as np, pandas as pd, os
from sklearn.metrics import roc_auc_score

# Metadata and labels
sp_df = pd.read_csv('essential_data/species_list.txt'); num_classes = sp_df.shape[0]
cv_df = pd.read_csv('essential_data/CVfolds_2.txt')
rec_map = pd.read_csv('essential_data/rec_id2filename.txt')
meta = cv_df.merge(rec_map, on='rec_id', how='left')

label_rows = []
with open('essential_data/rec_labels_test_hidden.txt', 'r') as f:
    _ = next(f, None)
    for line in f:
        s = line.strip()
        if not s: continue
        parts = [p for p in s.split(',') if p!='']
        try: rid = int(parts[0])
        except: continue
        labels = parts[1:] if len(parts)>1 else []
        if any(p.strip()=='?' for p in labels):
            continue
        y = np.zeros(num_classes, dtype=np.int8)
        for p in labels:
            p = p.strip()
            if p=='' or p=='?': continue
            cid = int(p);
            if 0 <= cid < num_classes: y[cid]=1
        label_rows.append((rid, y))
label_map = {rid:y for rid,y in label_rows}

# Train/test rec_id orders from meta
train_mask_meta = (meta['fold']==0) & (meta['rec_id'].isin(label_map.keys()))
test_mask_meta = (meta['fold']==1)
rec_train_meta = meta.loc[train_mask_meta, 'rec_id'].values.astype(int)
rec_test_meta = meta.loc[test_mask_meta, 'rec_id'].values.astype(int)
Y_meta = np.vstack([label_map[int(r)] for r in rec_train_meta]).astype(np.int8)

def load_model(prefix):
    oof = np.load(f'{prefix}_oof.npy') if os.path.exists(f'{prefix}_oof.npy') else None
    te = np.load(f'{prefix}_test.npy') if os.path.exists(f'{prefix}_test.npy') else None
    tr_ids = np.load(f'{prefix}_train_ids.npy').astype(int) if os.path.exists(f'{prefix}_train_ids.npy') else None
    te_ids = np.load(f'{prefix}_test_ids.npy').astype(int) if os.path.exists(f'{prefix}_test_ids.npy') else None
    return oof, te, tr_ids, te_ids

h_oof, h_te, h_tr, h_teids = load_model('hist_lr')
f_oof, f_te, f_tr, f_teids = load_model('tab_fast')
assert h_oof is not None and f_oof is not None, 'Required model files missing.'

# Align OOFs to meta train order by rec_id
def align_to_ids(oof, tr_ids, target_ids):
    idx_map = {int(r): i for i, r in enumerate(tr_ids)}
    arr = np.zeros((len(target_ids), oof.shape[1]), dtype=np.float32)
    for i, r in enumerate(target_ids):
        j = idx_map.get(int(r), None)
        if j is not None:
            arr[i] = oof[j]
    return arr

h_oof_aln = align_to_ids(h_oof, h_tr, rec_train_meta)
f_oof_aln = align_to_ids(f_oof, f_tr, rec_train_meta)

# Grid search blend weight on OOF
best_w, best_auc = 0.0, -1.0
for w in np.linspace(0.0, 1.0, 51):
    blend = w*h_oof_aln + (1.0-w)*f_oof_aln
    aucs=[]
    for c in range(num_classes):
        y = Y_meta[:, c]; p = blend[:, c]
        if y.sum()==0 or y.sum()==y.shape[0]: continue
        try: aucs.append(roc_auc_score(y, p))
        except: pass
    if aucs:
        macro = float(np.mean(aucs))
        if macro > best_auc:
            best_auc = macro; best_w = float(w)
print(f'blend1 OOF Macro AUC: {best_auc:.5f} at w={best_w:.2f} (w*hist_lr + (1-w)*tab_fast)')

# Align test preds to meta test order and blend with best_w
def align_test(te, te_ids, target_ids):
    idx_map = {int(r): i for i, r in enumerate(te_ids)} if te_ids is not None else {int(r): i for i, r in enumerate(target_ids)}
    arr = np.zeros((len(target_ids), te.shape[1]), dtype=np.float32)
    for i, r in enumerate(target_ids):
        j = idx_map.get(int(r), None)
        if j is not None:
            arr[i] = te[j]
    return arr

h_te_aln = align_test(h_te, h_teids, rec_test_meta)
f_te_aln = align_test(f_te, f_teids, rec_test_meta)
blend_te = best_w*h_te_aln + (1.0-best_w)*f_te_aln

# Save blended outputs and ids
np.save('blend1_oof.npy', (best_w*h_oof_aln + (1.0-best_w)*f_oof_aln))
np.save('blend1_test.npy', blend_te)
np.save('blend1_train_ids.npy', rec_train_meta)
np.save('blend1_test_ids.npy', rec_test_meta)
print('Saved blend1_* files aligned to meta id order.')

blend1 OOF Macro AUC: 0.68892 at w=0.38 (w*hist_lr + (1-w)*tab_fast)
Saved blend1_* files aligned to meta id order.
