# Plan
- Objective: Achieve medal-level AUC-ROC on TPS May 2022 via strong GBDT baseline + sensible CV + iterative feature engineering.
- Steps:
  1) Load data, basic EDA (shapes, dtypes, target distribution).
  2) Baseline model: LightGBM with StratifiedKFold CV, robust parameters, early stopping, logging.
  3) Generate out-of-fold (OOF) predictions, compute CV AUC, predict test, save submission.csv.
  4) Iterate: try interaction features (pairwise products/sums), categorical-like encodings if applicable, and alternative models (CatBoost/XGBoost), blending.
  5) Hyperparam refinements guided by CV; ensure leak-free processing.

Note: Keep cells modular; log timing per fold; use seed control for reproducibility.

Medal targets:
- Bronze: AUC >= 0.99818; Stretch: >= 0.99822.

Next: Implement baseline pipeline.

In [1]:
# Imports, versions, and utils
import os, sys, gc, time, random, math, json
from datetime import datetime
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

SEED = 42
def seed_everything(seed=SEED):
    random.seed(seed)
    np.random.seed(seed)
seed_everything(SEED)

print('Python', sys.version)
print('Pandas', pd.__version__)
print('Numpy', np.__version__)

def log(msg):
    ts = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(f'[{ts}] {msg}', flush=True)

Python 3.11.0rc1 (main, Aug 12 2022, 10:02:14) [GCC 11.2.0]
Pandas 2.2.2
Numpy 1.26.4


In [2]:
# Load data, basic EDA, dtype optimization, f_27 FE, and 10-fold splits
log('Loading data...')
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
log(f'train shape: {train.shape}, test shape: {test.shape}')

# Basic target info
log('Target distribution:')
log(train['target'].value_counts(normalize=True).to_dict())

# Identify columns
id_col = 'id' if 'id' in train.columns else None
target_col = 'target'
all_cols = train.columns.tolist()
num_cols = [c for c in all_cols if c not in [id_col, target_col, 'f_27'] and train[c].dtype != 'object']

# Cast numerics to float32 for memory
for c in num_cols:
    train[c] = train[c].astype('float32')
    if c in test.columns:
        test[c] = test[c].astype('float32')
log('Numeric columns cast to float32.')

# f_27 feature engineering
assert 'f_27' in train.columns, 'f_27 not found'
all_f27 = pd.concat([train['f_27'], test['f_27']], axis=0).astype(str)
chars = sorted(set(''.join(all_f27.values)))
char2int = {ch:i for i, ch in enumerate(chars)}
log(f'f_27 unique chars: {len(chars)} -> {chars[:20]}...')

def add_f27_features(df):
    s = df['f_27'].astype(str)
    # positional features (10 positions)
    for i in range(10):
        col = f'f_27_pos_{i}'
        df[col] = s.str[i].map(char2int).astype('int8')
    # inverse-position features for A..J (find returns -1..9); store as int8 with offset
    for ch in list('ABCDEFGHIJ'):
        col = f'f_27_char_{ch}_pos'
        df[col] = s.str.find(ch).astype('int8')
    return df

t0 = time.time()
train = add_f27_features(train)
test = add_f27_features(test)
log(f'Added f_27 features in {time.time()-t0:.2f}s')

# Optional cheap row stats
row_num_cols = num_cols.copy()
if len(row_num_cols) > 0:
    train['row_sum'] = train[row_num_cols].sum(axis=1).astype('float32')
    train['row_std'] = train[row_num_cols].std(axis=1).astype('float32')
    test['row_sum'] = test[row_num_cols].sum(axis=1).astype('float32')
    test['row_std'] = test[row_num_cols].std(axis=1).astype('float32')
    log('Added row_sum and row_std')

# Update feature list after FE
fe_cols = [c for c in train.columns if c not in [id_col, target_col, 'f_27']]
log(f'Feature count (after FE): {len(fe_cols)}')

# Check nulls
null_train = train[fe_cols + [target_col]].isnull().sum().sum()
null_test = test[[c for c in fe_cols if c in test.columns]].isnull().sum().sum()
log(f'Nulls -> train: {null_train}, test: {null_test}')

# Save 10-fold stratified splits
log('Creating StratifiedKFold(10) indices...')
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)
folds = np.full(len(train), -1, dtype=np.int16)
for fold, (_, val_idx) in enumerate(skf.split(train[fe_cols], train[target_col])):
    folds[val_idx] = fold
assert (folds >= 0).all()

folds_df = pd.DataFrame({id_col if id_col else 'row_id': train[id_col] if id_col else np.arange(len(train)), 'fold': folds})
folds_path = 'folds_10fold_seed42.csv'
folds_df.to_csv(folds_path, index=False)
log(f'Saved folds to {folds_path}')

# Save memory
gc.collect();
log('Data prep done.')

[2025-09-11 22:14:15] Loading data...


[2025-09-11 22:14:20] train shape: (800000, 33), test shape: (100000, 32)


[2025-09-11 22:14:20] Target distribution:


[2025-09-11 22:14:20] {0: 0.51386, 1: 0.48614}


[2025-09-11 22:14:20] Numeric columns cast to float32.


[2025-09-11 22:14:20] f_27 unique chars: 20 -> ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T']...


[2025-09-11 22:14:27] Added f_27 features in 6.53s


[2025-09-11 22:14:28] Added row_sum and row_std


[2025-09-11 22:14:28] Feature count (after FE): 52


[2025-09-11 22:14:28] Nulls -> train: 0, test: 0


[2025-09-11 22:14:28] Creating StratifiedKFold(10) indices...


[2025-09-11 22:14:29] Saved folds to folds_10fold_seed42.csv


[2025-09-11 22:14:29] Data prep done.


In [None]:
# LightGBM 10-fold CV training, OOF/test preds, submission
t_start = time.time()
log('Setting up LightGBM training...')
try:
    import lightgbm as lgb
except ImportError:
    import sys, subprocess
    log('LightGBM not found. Installing...')
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'lightgbm'])
    import lightgbm as lgb

# Features and target
features = [c for c in train.columns if c not in [id_col, target_col, 'f_27']]
X = train[features]
y = train[target_col].values
X_test = test[features]

# Folds
folds_df = pd.read_csv('folds_10fold_seed42.csv')
folds_arr = folds_df['fold'].values

# Params
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'learning_rate': 0.02,
    'num_leaves': 256,
    'max_depth': -1,
    'min_data_in_leaf': 150,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'lambda_l1': 0.0,
    'lambda_l2': 2.0,
    'max_bin': 255,
    'verbose': -1,
    'n_jobs': -1,
    'seed': SEED
}

n_splits = len(np.unique(folds_arr))
oof = np.zeros(len(train), dtype='float32')
pred_test = np.zeros(len(test), dtype='float32')
feat_imps = []

for fold in range(n_splits):
    fold_t0 = time.time()
    trn_idx = np.where(folds_arr != fold)[0]
    val_idx = np.where(folds_arr == fold)[0]
    log(f'Fold {fold+1}/{n_splits} | trn={len(trn_idx)} val={len(val_idx)}')

    dtrain = lgb.Dataset(X.iloc[trn_idx], label=y[trn_idx])
    dvalid = lgb.Dataset(X.iloc[val_idx], label=y[val_idx])

    model = lgb.train(
        params,
        dtrain,
        num_boost_round=10000,
        valid_sets=[dtrain, dvalid],
        valid_names=['train','valid'],
        callbacks=[lgb.early_stopping(200), lgb.log_evaluation(200)]
    )

    oof[val_idx] = model.predict(X.iloc[val_idx], num_iteration=model.best_iteration)
    fold_auc = roc_auc_score(y[val_idx], oof[val_idx])
    log(f'Fold {fold} AUC: {fold_auc:.6f} | best_iter={model.best_iteration} | elapsed={time.time()-fold_t0:.1f}s')

    pred_test += model.predict(X_test, num_iteration=model.best_iteration) / n_splits

    fi = pd.DataFrame({'feature': features, f'fold_{fold}': model.feature_importance(importance_type='gain')})
    feat_imps.append(fi)
    del dtrain, dvalid, model
    gc.collect()

cv_auc = roc_auc_score(y, oof)
log(f'OOF AUC: {cv_auc:.6f}')

# Save OOF and test preds
pd.DataFrame({id_col: train[id_col], 'oof': oof}).to_csv('oof_lgb_seed42.csv', index=False)
pd.DataFrame({id_col: test[id_col], 'prediction': pred_test}).to_csv('pred_lgb_seed42.csv', index=False)

# Feature importance
fi_merged = feat_imps[0]
for i in range(1, len(feat_imps)):
    fi_merged = fi_merged.merge(feat_imps[i], on='feature', how='outer')
fi_merged.fillna(0, inplace=True)
fi_merged['gain_mean'] = fi_merged[[c for c in fi_merged.columns if c.startswith('fold_')]].mean(axis=1)
fi_merged.sort_values('gain_mean', ascending=False).to_csv('feature_importance_lgb.csv', index=False)
log('Saved OOF, test preds, and feature importance.')

# Submission
sub = pd.read_csv('sample_submission.csv')
sub['target'] = pred_test
sub.to_csv('submission.csv', index=False)
log(f'Submission saved. Total training time: {time.time()-t_start:.1f}s')

In [3]:
# Patch f_27 features (+1 offset), add bigrams/runs/equality/char counts; add extra row stats
log('Patching/adding f_27 features and cheap row stats (queued for after current run)')

# Ensure we have shared char2int and strings
all_f27 = pd.concat([train['f_27'], test['f_27']], axis=0).astype(str)
chars = sorted(set(''.join(all_f27.values)))  # expecting 20 A..T
char2int = {ch:i for i, ch in enumerate(chars)}

def build_bigrams_map(series):
    # Build bigram vocabulary from train+test
    seen = set()
    for s in series.values:
        for i in range(9):
            seen.add(s[i:i+2])
    bigrams = sorted(seen)
    return {bg:i for i, bg in enumerate(bigrams)}

bigrams2int = build_bigrams_map(all_f27)
log(f'Bigram vocab size: {len(bigrams2int)}')

def longest_run_and_transitions(s):
    # s length is 10
    max_run = 1
    cur = 1
    transitions = 0
    for i in range(1, len(s)):
        if s[i] == s[i-1]:
            cur += 1
        else:
            transitions += 1
            if cur > max_run:
                max_run = cur
            cur = 1
    if cur > max_run:
        max_run = cur
    return max_run, transitions

def add_extra_f27_features(df):
    s = df['f_27'].astype(str)
    # Offset existing find features by +1 (range 0..10); and presence flag
    for ch in list('ABCDEFGHIJ'):
        col = f'f_27_char_{ch}_pos'
        if col in df.columns:
            df[col] = (df[col].astype('int16') + 1).astype('int8')
        pres = f'f_27_char_{ch}_present'
        df[pres] = (s.str.find(ch) >= 0).astype('int8')
    # Adjacent equality flags pos0==pos1 ... pos8==pos9
    for i in range(9):
        df[f'f_27_adj_eq_{i}_{i+1}'] = (s.str[i] == s.str[i+1]).astype('int8')
    # 9 bigram columns: s[i:i+2] label-encoded
    for i in range(9):
        col = f'f_27_bg_{i}_{i+1}'
        df[col] = s.str[i:i+2].map(bigrams2int).astype('int16')
    # longest_run_length and transitions_count
    lr, tr = zip(*s.map(longest_run_and_transitions))
    df['f_27_longest_run'] = np.array(lr, dtype='int8')
    df['f_27_transitions'] = np.array(tr, dtype='int8')
    # Per-char counts A..T (20 ints)
    for ch in chars:
        df[f'f_27_cnt_{ch}'] = s.str.count(ch).astype('int8')
    return df

t0 = time.time()
train = add_extra_f27_features(train)
test = add_extra_f27_features(test)
log(f'Added extra f_27 features in {time.time()-t0:.2f}s')

# Cheap row stats additions on numeric block
row_num_cols = [c for c in train.columns if c not in [id_col, target_col, 'f_27'] and c.startswith('f_') and train[c].dtype in [np.float32, np.float64, 'float32', 'float64'] ]
if len(row_num_cols) > 0:
    train['row_min'] = train[row_num_cols].min(axis=1).astype('float32')
    train['row_max'] = train[row_num_cols].max(axis=1).astype('float32')
    train['row_mean'] = train[row_num_cols].mean(axis=1).astype('float32')
    test['row_min'] = test[row_num_cols].min(axis=1).astype('float32')
    test['row_max'] = test[row_num_cols].max(axis=1).astype('float32')
    test['row_mean'] = test[row_num_cols].mean(axis=1).astype('float32')
    train['row_max_minus_mean'] = (train['row_max'] - train['row_mean']).astype('float32')
    train['mean_minus_row_min'] = (train['row_mean'] - train['row_min']).astype('float32')
    test['row_max_minus_mean'] = (test['row_max'] - test['row_mean']).astype('float32')
    test['mean_minus_row_min'] = (test['row_mean'] - test['row_min']).astype('float32')
    log('Added row_min/row_max/row_mean and z-extremes')

# Refresh feature list (exclude id/target/raw f_27)
fe_cols = [c for c in train.columns if c not in [id_col, target_col, 'f_27']]
log(f'Feature count after patch: {len(fe_cols)}')
gc.collect();

[2025-09-11 22:14:35] Patching/adding f_27 features and cheap row stats (queued for after current run)


[2025-09-11 22:14:37] Bigram vocab size: 349


  df[f'f_27_cnt_{ch}'] = s.str.count(ch).astype('int8')


  df[f'f_27_cnt_{ch}'] = s.str.count(ch).astype('int8')


  df[f'f_27_cnt_{ch}'] = s.str.count(ch).astype('int8')


  df[f'f_27_cnt_{ch}'] = s.str.count(ch).astype('int8')


  df[f'f_27_cnt_{ch}'] = s.str.count(ch).astype('int8')


[2025-09-11 22:14:57] Added extra f_27 features in 19.94s


  df[f'f_27_cnt_{ch}'] = s.str.count(ch).astype('int8')
  df[f'f_27_cnt_{ch}'] = s.str.count(ch).astype('int8')
  df[f'f_27_cnt_{ch}'] = s.str.count(ch).astype('int8')
  df[f'f_27_cnt_{ch}'] = s.str.count(ch).astype('int8')


  train['row_min'] = train[row_num_cols].min(axis=1).astype('float32')


  train['row_max'] = train[row_num_cols].max(axis=1).astype('float32')


[2025-09-11 22:14:57] Added row_min/row_max/row_mean and z-extremes


[2025-09-11 22:14:57] Feature count after patch: 107


  train['row_mean'] = train[row_num_cols].mean(axis=1).astype('float32')
  test['row_min'] = test[row_num_cols].min(axis=1).astype('float32')
  test['row_max'] = test[row_num_cols].max(axis=1).astype('float32')
  test['row_mean'] = test[row_num_cols].mean(axis=1).astype('float32')
  train['row_max_minus_mean'] = (train['row_max'] - train['row_mean']).astype('float32')
  train['mean_minus_row_min'] = (train['row_mean'] - train['row_min']).astype('float32')
  test['row_max_minus_mean'] = (test['row_max'] - test['row_mean']).astype('float32')
  test['mean_minus_row_min'] = (test['row_mean'] - test['row_min']).astype('float32')


In [4]:
# Add targeted numeric interactions and quick-win f_27 features (vectorized, cheap)
log('Adding numeric interactions and f_27 quick wins...')

t0 = time.time()

# Helper: get col safely
def _col(name):
    if name not in train.columns or name not in test.columns:
        raise KeyError(f'Missing column: {name}')
    return name

# Base numeric columns expected in TPS May 2022
base_cols = ['f_00','f_01','f_02','f_03','f_05','f_06','f_10','f_12','f_20','f_21','f_22','f_26']
for c in base_cols:
    _ = _col(c)

def build_num_interactions(df):
    eps = 1e-6
    out = {}
    f00 = df['f_00'].astype('float32').values
    f01 = df['f_01'].astype('float32').values
    f02 = df['f_02'].astype('float32').values
    f03 = df['f_03'].astype('float32').values
    f05 = df['f_05'].astype('float32').values
    f06 = df['f_06'].astype('float32').values
    f10 = df['f_10'].astype('float32').values
    f12 = df['f_12'].astype('float32').values
    f20 = df['f_20'].astype('float32').values
    f21 = df['f_21'].astype('float32').values
    f22 = df['f_22'].astype('float32').values
    f26 = df['f_26'].astype('float32').values
    # Ratios
    out['int_ratio_00_01'] = (f00 / (f01 + eps)).astype('float32')
    out['int_ratio_02_03'] = (f02 / (f03 + eps)).astype('float32')
    out['int_ratio_10_12'] = (f10 / (f12 + eps)).astype('float32')
    out['int_ratio_20_21'] = (f20 / (f21 + eps)).astype('float32')
    out['int_ratio_21_02'] = (f21 / (f02 + eps)).astype('float32')
    out['int_ratio_22_02'] = (f22 / (f02 + eps)).astype('float32')
    out['int_ratio_05_06'] = (f05 / (f06 + eps)).astype('float32')
    out['int_ratio_26_02'] = (f26 / (f02 + eps)).astype('float32')
    # Products
    out['int_prod_00_10'] = (f00 * f10).astype('float32')
    out['int_prod_02_20'] = (f02 * f20).astype('float32')
    out['int_prod_01_21'] = (f01 * f21).astype('float32')
    out['int_prod_21_02'] = (f21 * f02).astype('float32')
    out['int_prod_22_05'] = (f22 * f05).astype('float32')
    # Diffs
    out['int_absdiff_00_10'] = np.abs(f00 - f10).astype('float32')
    out['int_absdiff_02_20'] = np.abs(f02 - f20).astype('float32')
    out['int_absdiff_01_21'] = np.abs(f01 - f21).astype('float32')
    out['int_absdiff_21_02'] = np.abs(f21 - f02).astype('float32')
    out['int_diff_22_05'] = (f22 - f05).astype('float32')
    out['int_diff_26_00'] = (f26 - f00).astype('float32')
    # Squares
    out['int_sq_00'] = (f00 * f00).astype('float32')
    out['int_sq_02'] = (f02 * f02).astype('float32')
    out['int_sq_10'] = (f10 * f10).astype('float32')
    out['int_sq_20'] = (f20 * f20).astype('float32')
    return pd.DataFrame(out)

num_int_train = build_num_interactions(train)
num_int_test = build_num_interactions(test)

# f_27 quick wins using positional ints and counts already present
pos_cols = [f'f_27_pos_{i}' for i in range(10)]
for c in pos_cols:
    if c not in train.columns:
        raise KeyError(f'Missing {c} for f_27 quick features')

# Build numpy arrays for positions
P_tr = np.stack([train[c].astype('int16').values for c in pos_cols], axis=1)  # (n,10)
P_te = np.stack([test[c].astype('int16').values for c in pos_cols], axis=1)

# Per-char counts A..T exist as f_27_cnt_{ch}; assemble count matrices if present, else compute from positions
chars = sorted(set(''.join(pd.concat([train['f_27'], test['f_27']]).astype(str).values)))
char2int = {ch:i for i,ch in enumerate(chars)}
vocab_size = len(chars)  # expected 20

def counts_from_positions(P, vocab_size):
    # P: (n,10) with 0..vocab_size-1
    n = P.shape[0]
    cnt = np.zeros((n, vocab_size), dtype=np.int16)
    for k in range(10):
        idx = P[:, k]
        # bincount per row via advanced indexing
        cnt[np.arange(n), idx] += 1
    return cnt

# Try to build counts from existing columns if available
cnt_cols = [c for c in train.columns if c.startswith('f_27_cnt_')]
if len(cnt_cols) == vocab_size:
    CNT_tr = np.stack([train[c].astype('int16').values for c in cnt_cols], axis=1)
    CNT_te = np.stack([test[c].astype('int16').values for c in cnt_cols], axis=1)
else:
    CNT_tr = counts_from_positions(P_tr, vocab_size)
    CNT_te = counts_from_positions(P_te, vocab_size)

def build_f27_quick(P, CNT):
    n = P.shape[0]
    out = {}
    # nunique
    out['f27_nunique'] = (CNT > 0).sum(axis=1).astype('int8')
    # equal pairs per row: sum cnt*(cnt-1)/2
    cnt = CNT.astype('int32')
    out['f27_equal_pairs'] = ((cnt * (cnt - 1)) // 2).sum(axis=1).astype('int32')
    # entropy over 10 positions
    p = cnt / 10.0
    with np.errstate(divide='ignore', invalid='ignore'):
        ent = -(p * np.log(p + 1e-12)).sum(axis=1)
    out['f27_entropy'] = ent.astype('float32')
    # first-last same
    out['f27_first_last_same'] = (P[:, 0] == P[:, 9]).astype('int8')
    # palindrome matches
    pal = np.zeros(n, dtype=np.int8)
    for i in range(5):
        pal += (P[:, i] == P[:, 9 - i]).astype('int8')
    out['f27_pal_matches'] = pal
    # circular shift equal counts for k=1,2,3
    for k in (1, 2, 3):
        eq = (P == np.roll(P, shift=-k, axis=1)).sum(axis=1).astype('int8')
        out[f'f27_circ_eq_k{k}'] = eq
    # num_runs, mean_run_length, transitions_parity using existing transitions if present
    if 'f_27_transitions' in train.columns:
        trn = None  # placeholder
    # compute transitions from positions
    trans = (P[:, 1:] != P[:, :-1]).sum(axis=1).astype('int8')
    out['f27_num_runs'] = (trans + 1).astype('int8')
    out['f27_mean_run_len'] = (10.0 / (trans + 1).clip(min=1)).astype('float32')
    out['f27_transitions_parity'] = (trans % 2).astype('int8')
    # 3-gram hashed (base-23 hash mod 512) for i=0..7
    base = 23
    mod = 512
    tri_feats = {}
    for i in range(8):
        code = (P[:, i] * (base * base) + P[:, i + 1] * base + P[:, i + 2]) % mod
        tri_feats[f'f27_tri_hash_{i}_{i+2}'] = code.astype('int16')
    out.update(tri_feats)
    return pd.DataFrame(out)

f27q_train = build_f27_quick(P_tr, CNT_tr)
f27q_test = build_f27_quick(P_te, CNT_te)

# Concatenate blocks to reduce fragmentation
train = pd.concat([train, num_int_train, f27q_train], axis=1)
test = pd.concat([test, num_int_test, f27q_test], axis=1)

log(f'Added interactions and f27 quick features in {time.time()-t0:.1f}s | new feature count: {train.shape[1]-3}')
gc.collect();

[2025-09-11 22:15:07] Adding numeric interactions and f_27 quick wins...


[2025-09-11 22:15:08] Added interactions and f27 quick features in 1.4s | new feature count: 149


In [5]:
# CatBoost 10-fold CV training, OOF/test preds, for blending
log('Setting up CatBoost training (same folds/features, fast params)...')
try:
    from catboost import CatBoostClassifier, Pool
except ImportError:
    import sys, subprocess
    log('CatBoost not found. Installing...')
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'catboost'])
    from catboost import CatBoostClassifier, Pool

# Reuse features from current dataframe
features = [c for c in train.columns if c not in [id_col, target_col, 'f_27']]
X = train[features]
y = train[target_col].values
X_test = test[features]

# Identify categorical-like features (int-coded f_27-derived columns)
cat_cols = [
    c for c in features
    if (c.startswith('f_27_pos_') or c.startswith('f_27_char_') or c.startswith('f_27_bg_')
        or c.endswith('_present') or c.startswith('f_27_adj_eq_') or c.startswith('f27_tri_hash_'))
]
cat_idx = [X.columns.get_loc(c) for c in cat_cols if c in X.columns]
log(f'CatBoost categorical features count: {len(cat_idx)}')

# Folds
folds_df = pd.read_csv('folds_10fold_seed42.csv')
folds_arr = folds_df['fold'].values
n_splits = len(np.unique(folds_arr))

# Fast strong params (GPU) per expert guidance
params = {
    'loss_function': 'Logloss', 'eval_metric': 'AUC',
    'iterations': 3800, 'learning_rate': 0.032, 'depth': 8,
    'l2_leaf_reg': 6.0, 'bootstrap_type': 'Poisson', 'subsample': 0.82,
    'sampling_frequency': 'PerTree', 'border_count': 128,
    'grow_policy': 'SymmetricTree', 'max_ctr_complexity': 1,
    'od_type': 'Iter', 'od_wait': 120,
    'task_type': 'GPU', 'devices': '0',
    'random_seed': 42, 'verbose': 400,
    # Ensure no GPU-incompatible column sampling (rsm) is used
    'rsm': 1.0
}

oof_cb = np.zeros(len(train), dtype='float32')
pred_test_cb = np.zeros(len(test), dtype='float32')

for fold in range(n_splits):
    fold_t0 = time.time()
    trn_idx = np.where(folds_arr != fold)[0]
    val_idx = np.where(folds_arr == fold)[0]
    log(f'[CB] Fold {fold+1}/{n_splits} | trn={len(trn_idx)} val={len(val_idx)}')

    train_pool = Pool(X.iloc[trn_idx], y[trn_idx], cat_features=cat_idx)
    valid_pool = Pool(X.iloc[val_idx], y[val_idx], cat_features=cat_idx)

    model_cb = CatBoostClassifier(**params)
    model_cb.fit(train_pool, eval_set=valid_pool, use_best_model=True)

    oof_cb[val_idx] = model_cb.predict_proba(valid_pool)[:,1].astype('float32')
    fold_auc = roc_auc_score(y[val_idx], oof_cb[val_idx])
    log(f'[CB] Fold {fold} AUC: {fold_auc:.6f} | best_iter={model_cb.tree_count_} | elapsed={time.time()-fold_t0:.1f}s')

    pred_test_cb += model_cb.predict_proba(Pool(X_test, cat_features=cat_idx))[:,1].astype('float32') / n_splits
    del model_cb, train_pool, valid_pool
    gc.collect()

cv_auc_cb = roc_auc_score(y, oof_cb)
log(f'[CB] OOF AUC: {cv_auc_cb:.6f}')

# Save OOF and test preds
pd.DataFrame({id_col: train[id_col], 'oof_cb': oof_cb}).to_csv('oof_catboost_seed42.csv', index=False)
pd.DataFrame({id_col: test[id_col], 'prediction_cb': pred_test_cb}).to_csv('pred_catboost_seed42.csv', index=False)
log('[CB] Saved OOF and test predictions.')

[2025-09-11 22:15:16] Setting up CatBoost training (same folds/features, fast params)...


[2025-09-11 22:15:16] CatBoost categorical features count: 56


[2025-09-11 22:15:16] [CB] Fold 1/10 | trn=720000 val=80000


CatBoostError: catboost/cuda/cuda_lib/cuda_base.h:281: CUDA error 100: no CUDA-capable device is detected

In [7]:
# LightGBM fast 10-fold CV (numeric-only), with early stopping and faster params
t_start = time.time()
log('Setting up LightGBM (fast params) training...')
import lightgbm as lgb

# Features and target (after latest FE, exclude id/target/raw f_27)
features = [c for c in train.columns if c not in [id_col, target_col, 'f_27']]
X = train[features]
y = train[target_col].values
X_test = test[features]

# Folds
folds_df = pd.read_csv('folds_10fold_seed42.csv')
folds_arr = folds_df['fold'].values
n_splits = len(np.unique(folds_arr))

# Fast params per expert guidance
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'learning_rate': 0.045,
    'num_leaves': 144,
    'max_depth': -1,
    'min_data_in_leaf': 240,
    'feature_fraction': 0.75,
    'bagging_fraction': 0.75,
    'bagging_freq': 1,
    'lambda_l1': 0.0,
    'lambda_l2': 4.0,
    'max_bin': 127,
    'verbose': -1,
    'n_jobs': -1,
    'seed': SEED,
    'feature_fraction_seed': SEED,
    'bagging_seed': SEED,
    'data_random_seed': SEED
}

oof_fast = np.zeros(len(train), dtype='float32')
pred_test_fast = np.zeros(len(test), dtype='float32')
feat_imps = []

for fold in range(n_splits):
    fold_t0 = time.time()
    trn_idx = np.where(folds_arr != fold)[0]
    val_idx = np.where(folds_arr == fold)[0]
    log(f'[LGB fast] Fold {fold+1}/{n_splits} | trn={len(trn_idx)} val={len(val_idx)}')

    dtrain = lgb.Dataset(X.iloc[trn_idx], label=y[trn_idx])
    dvalid = lgb.Dataset(X.iloc[val_idx], label=y[val_idx])

    model = lgb.train(
        params,
        dtrain,
        num_boost_round=4000,
        valid_sets=[dtrain, dvalid],
        valid_names=['train','valid'],
        callbacks=[lgb.early_stopping(120), lgb.log_evaluation(100)]
    )

    oof_fast[val_idx] = model.predict(X.iloc[val_idx], num_iteration=model.best_iteration)
    fold_auc = roc_auc_score(y[val_idx], oof_fast[val_idx])
    log(f'[LGB fast] Fold {fold} AUC: {fold_auc:.6f} | best_iter={model.best_iteration} | elapsed={time.time()-fold_t0:.1f}s')

    pred_test_fast += model.predict(X_test, num_iteration=model.best_iteration) / n_splits

    fi = pd.DataFrame({'feature': features, f'fold_{fold}': model.feature_importance(importance_type='gain')})
    feat_imps.append(fi)
    del dtrain, dvalid, model
    gc.collect()

cv_auc_fast = roc_auc_score(y, oof_fast)
log(f'[LGB fast] OOF AUC: {cv_auc_fast:.6f}')

# Save OOF and test preds
pd.DataFrame({id_col: train[id_col], 'oof_lgb_fast': oof_fast}).to_csv('oof_lgb_fast_seed42.csv', index=False)
pd.DataFrame({id_col: test[id_col], 'prediction_lgb_fast': pred_test_fast}).to_csv('pred_lgb_fast_seed42.csv', index=False)

# Feature importance
fi_merged = feat_imps[0]
for i in range(1, len(feat_imps)):
    fi_merged = fi_merged.merge(feat_imps[i], on='feature', how='outer')
fi_merged.fillna(0, inplace=True)
fi_merged['gain_mean'] = fi_merged[[c for c in fi_merged.columns if c.startswith('fold_')]].mean(axis=1)
fi_merged.sort_values('gain_mean', ascending=False).to_csv('feature_importance_lgb_fast.csv', index=False)
log('[LGB fast] Saved OOF, test preds, and feature importance.')

# Submission
sub = pd.read_csv('sample_submission.csv')
sub['target'] = pred_test_fast
sub.to_csv('submission_lgb_fast.csv', index=False)
log(f'[LGB fast] Submission saved. Total training time: {time.time()-t_start:.1f}s')

[2025-09-11 22:16:24] Setting up LightGBM (fast params) training...


[2025-09-11 22:16:25] [LGB fast] Fold 1/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[100]	train's auc: 0.963129	valid's auc: 0.960292


[200]	train's auc: 0.978775	valid's auc: 0.975702


[300]	train's auc: 0.985191	valid's auc: 0.981911


[400]	train's auc: 0.98866	valid's auc: 0.985066


[500]	train's auc: 0.990778	valid's auc: 0.986757


[600]	train's auc: 0.992253	valid's auc: 0.98776


[700]	train's auc: 0.993357	valid's auc: 0.988353


[800]	train's auc: 0.994299	valid's auc: 0.988859


[900]	train's auc: 0.995007	valid's auc: 0.98911


[1000]	train's auc: 0.995652	valid's auc: 0.989373


[1100]	train's auc: 0.996212	valid's auc: 0.989543


[1200]	train's auc: 0.9967	valid's auc: 0.98969


[1300]	train's auc: 0.997133	valid's auc: 0.98981


[1400]	train's auc: 0.997516	valid's auc: 0.989907


[1500]	train's auc: 0.997859	valid's auc: 0.98998


[1600]	train's auc: 0.998162	valid's auc: 0.990095


[1700]	train's auc: 0.998437	valid's auc: 0.990213


[1800]	train's auc: 0.998667	valid's auc: 0.990284


[1900]	train's auc: 0.998875	valid's auc: 0.990336


[2000]	train's auc: 0.99905	valid's auc: 0.990377


[2100]	train's auc: 0.999202	valid's auc: 0.990441


[2200]	train's auc: 0.999332	valid's auc: 0.990491


[2300]	train's auc: 0.999444	valid's auc: 0.99053


[2400]	train's auc: 0.999539	valid's auc: 0.990579


[2500]	train's auc: 0.99962	valid's auc: 0.99061


[2600]	train's auc: 0.999689	valid's auc: 0.99063


[2700]	train's auc: 0.999748	valid's auc: 0.990643


[2800]	train's auc: 0.999795	valid's auc: 0.990694


[2900]	train's auc: 0.999835	valid's auc: 0.99073


[3000]	train's auc: 0.999869	valid's auc: 0.990756


[3100]	train's auc: 0.999897	valid's auc: 0.990781


[3200]	train's auc: 0.999917	valid's auc: 0.990825


[3300]	train's auc: 0.999936	valid's auc: 0.99083


[3400]	train's auc: 0.99995	valid's auc: 0.990855


[3500]	train's auc: 0.999962	valid's auc: 0.990875


[3600]	train's auc: 0.999971	valid's auc: 0.990887


[3700]	train's auc: 0.999978	valid's auc: 0.990896


[3800]	train's auc: 0.999984	valid's auc: 0.99091


[3900]	train's auc: 0.999988	valid's auc: 0.990928


[4000]	train's auc: 0.999991	valid's auc: 0.990945
Did not meet early stopping. Best iteration is:
[3988]	train's auc: 0.999991	valid's auc: 0.990946


[2025-09-11 22:21:54] [LGB fast] Fold 0 AUC: 0.990946 | best_iter=3988 | elapsed=329.4s


[2025-09-11 22:21:57] [LGB fast] Fold 2/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[100]	train's auc: 0.96225	valid's auc: 0.959773


[200]	train's auc: 0.978622	valid's auc: 0.975583


[300]	train's auc: 0.98525	valid's auc: 0.981932


[400]	train's auc: 0.988718	valid's auc: 0.985092


[500]	train's auc: 0.990859	valid's auc: 0.986842


[600]	train's auc: 0.992316	valid's auc: 0.987828


[700]	train's auc: 0.993383	valid's auc: 0.988363


[800]	train's auc: 0.994274	valid's auc: 0.988781


[900]	train's auc: 0.995059	valid's auc: 0.98919


[1000]	train's auc: 0.995695	valid's auc: 0.989431


[1100]	train's auc: 0.996253	valid's auc: 0.989583


[1200]	train's auc: 0.996737	valid's auc: 0.989698


[1300]	train's auc: 0.997186	valid's auc: 0.98987


[1400]	train's auc: 0.997565	valid's auc: 0.989989


[1500]	train's auc: 0.997909	valid's auc: 0.990061


[1600]	train's auc: 0.998196	valid's auc: 0.990127


[1700]	train's auc: 0.998456	valid's auc: 0.990171


[1800]	train's auc: 0.998683	valid's auc: 0.990239


[1900]	train's auc: 0.998882	valid's auc: 0.990302


[2000]	train's auc: 0.999057	valid's auc: 0.990357


[2100]	train's auc: 0.999206	valid's auc: 0.990414


[2200]	train's auc: 0.999335	valid's auc: 0.990468


[2300]	train's auc: 0.999448	valid's auc: 0.990515


[2400]	train's auc: 0.999544	valid's auc: 0.990538


[2500]	train's auc: 0.999626	valid's auc: 0.990574


[2600]	train's auc: 0.999695	valid's auc: 0.990617


[2700]	train's auc: 0.999755	valid's auc: 0.990659


[2800]	train's auc: 0.999803	valid's auc: 0.990682


[2900]	train's auc: 0.999841	valid's auc: 0.990715


[3000]	train's auc: 0.999873	valid's auc: 0.990746


[3100]	train's auc: 0.999899	valid's auc: 0.990778


[3200]	train's auc: 0.999921	valid's auc: 0.990804


[3300]	train's auc: 0.999938	valid's auc: 0.990828


[3400]	train's auc: 0.999952	valid's auc: 0.990867


[3500]	train's auc: 0.999963	valid's auc: 0.99087


[3600]	train's auc: 0.999973	valid's auc: 0.990873


Early stopping, best iteration is:
[3535]	train's auc: 0.999967	valid's auc: 0.990879


[2025-09-11 22:27:04] [LGB fast] Fold 1 AUC: 0.990879 | best_iter=3535 | elapsed=306.8s


[2025-09-11 22:27:06] [LGB fast] Fold 3/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[100]	train's auc: 0.961317	valid's auc: 0.958583


[200]	train's auc: 0.978655	valid's auc: 0.975542


[300]	train's auc: 0.985395	valid's auc: 0.982038


[400]	train's auc: 0.988786	valid's auc: 0.98512


[500]	train's auc: 0.990899	valid's auc: 0.986881


[600]	train's auc: 0.992315	valid's auc: 0.987885


[700]	train's auc: 0.993409	valid's auc: 0.988478


[800]	train's auc: 0.994279	valid's auc: 0.988868


[900]	train's auc: 0.995037	valid's auc: 0.989245


[1000]	train's auc: 0.995671	valid's auc: 0.989459


[1100]	train's auc: 0.99624	valid's auc: 0.989639


[1200]	train's auc: 0.996741	valid's auc: 0.989816


[1300]	train's auc: 0.997181	valid's auc: 0.989976


[1400]	train's auc: 0.997563	valid's auc: 0.990079


[1500]	train's auc: 0.997899	valid's auc: 0.990141


[1600]	train's auc: 0.998191	valid's auc: 0.990227


[1700]	train's auc: 0.998449	valid's auc: 0.990275


[1800]	train's auc: 0.998676	valid's auc: 0.990338


[1900]	train's auc: 0.998869	valid's auc: 0.990405


[2000]	train's auc: 0.999042	valid's auc: 0.990423


[2100]	train's auc: 0.999191	valid's auc: 0.990486


[2200]	train's auc: 0.999327	valid's auc: 0.990542


[2300]	train's auc: 0.99944	valid's auc: 0.990591


[2400]	train's auc: 0.999537	valid's auc: 0.990621


[2500]	train's auc: 0.999617	valid's auc: 0.990649


[2600]	train's auc: 0.999688	valid's auc: 0.990666


[2700]	train's auc: 0.999748	valid's auc: 0.99071


[2800]	train's auc: 0.999796	valid's auc: 0.990745


[2900]	train's auc: 0.999837	valid's auc: 0.99076


[3000]	train's auc: 0.99987	valid's auc: 0.990782


[3100]	train's auc: 0.999896	valid's auc: 0.99081


[3200]	train's auc: 0.999919	valid's auc: 0.990819


[3300]	train's auc: 0.999937	valid's auc: 0.990833


[3400]	train's auc: 0.999951	valid's auc: 0.99086


[3500]	train's auc: 0.999962	valid's auc: 0.990883


In [None]:
# Blend OOF to optimize weights; produce blended submission
log('Setting up blending between LGB fast and CatBoost...')
import numpy as np, pandas as pd
from sklearn.metrics import roc_auc_score

def load_oof_pred(oof_path, col_name):
    df = pd.read_csv(oof_path)
    if col_name not in df.columns:
        # assume last column is the prediction
        pred_col = [c for c in df.columns if c != id_col][0]
    else:
        pred_col = col_name
    return df[[id_col, pred_col]].rename(columns={pred_col: 'pred'})

def load_test_pred(pred_path, col_name):
    df = pd.read_csv(pred_path)
    if col_name not in df.columns:
        pred_col = [c for c in df.columns if c != id_col][0]
    else:
        pred_col = col_name
    return df[[id_col, pred_col]].rename(columns={pred_col: 'pred'})

oof_lgb_path = 'oof_lgb_fast_seed42.csv'
pred_lgb_path = 'pred_lgb_fast_seed42.csv'
oof_cb_path = 'oof_catboost_seed42.csv'
pred_cb_path = 'pred_catboost_seed42.csv'

# Wait-safe loads (one may not exist yet)
oof_lgb = load_oof_pred(oof_lgb_path, 'oof_lgb_fast') if os.path.exists(oof_lgb_path) else None
pred_lgb = load_test_pred(pred_lgb_path, 'prediction_lgb_fast') if os.path.exists(pred_lgb_path) else None
oof_cb = load_oof_pred(oof_cb_path, 'oof_cb') if os.path.exists(oof_cb_path) else None
pred_cb = load_test_pred(pred_cb_path, 'prediction_cb') if os.path.exists(pred_cb_path) else None

if (oof_lgb is None) or (pred_lgb is None):
    log('LGB predictions not found yet; run LGB first.')
else:
    # Merge with ground truth
    gt = train[[id_col, target_col]].copy()
    oof = gt.merge(oof_lgb, on=id_col, how='left').rename(columns={'pred': 'lgb'})
    have_cb = (oof_cb is not None) and (pred_cb is not None)
    if have_cb:
        oof = oof.merge(oof_cb.rename(columns={'pred': 'cb'}), on=id_col, how='left')
    else:
        oof['cb'] = np.nan

    # Rank transform helper
    def rank_norm(x):
        r = pd.Series(x).rank(method='average').values
        return (r - 1) / (len(r) - 1)

    best_auc = 0.0
    best_w = None
    oof['lgb_r'] = rank_norm(oof['lgb'])
    if have_cb:
        oof['cb_r'] = rank_norm(oof['cb'])

    # Grid search weights (0..1, step 0.05). If CB missing, just evaluate LGB.
    weights = np.linspace(0.0, 1.0, 21) if have_cb else [1.0]
    for w in weights:
        if have_cb:
            blend = w * oof['lgb'] + (1 - w) * oof['cb']
            blend_r = w * oof['lgb_r'] + (1 - w) * oof['cb_r']
        else:
            blend = oof['lgb']
            blend_r = oof['lgb_r']
        auc = roc_auc_score(oof[target_col], blend)
        auc_r = roc_auc_score(oof[target_col], blend_r)
        if auc > best_auc:
            best_auc = auc
            best_w = ('prob', float(w))
            best_mode = 'prob'
            best_is_rank = False
            best_auc_r = auc_r
        if auc_r > best_auc:
            best_auc = auc_r
            best_w = ('rank', float(w))
            best_mode = 'rank'
            best_is_rank = True
            best_auc_r = auc_r

    log(f"Blending search done. Best OOF AUC={best_auc:.6f} | mode={best_mode} | w_lgb={best_w[1] if best_w else 1.0}")

    # Build test blend
    sub = pd.read_csv('sample_submission.csv')
    test_merge = sub[[id_col]].merge(pred_lgb.rename(columns={'pred': 'lgb'}), on=id_col, how='left')
    if have_cb:
        test_merge = test_merge.merge(pred_cb.rename(columns={'pred': 'cb'}), on=id_col, how='left')
    else:
        test_merge['cb'] = np.nan

    if (best_w is None) or (not have_cb):
        # Fallback to LGB only
        sub['target'] = test_merge['lgb'].astype('float32')
        sub.to_csv('submission_blend.csv', index=False)
        sub.to_csv('submission.csv', index=False)
        log('Saved submission_blend.csv and submission.csv using LGB only (CB not available).')
    else:
        w = best_w[1]
        if best_mode == 'prob':
            pred_blend = w * test_merge['lgb'].values + (1 - w) * test_merge['cb'].values
        else:
            # rank-average
            rl = pd.Series(test_merge['lgb']).rank(method='average').values
            rc = pd.Series(test_merge['cb']).rank(method='average').values
            rl = (rl - 1) / (len(rl) - 1)
            rc = (rc - 1) / (len(rc) - 1)
            pred_blend = w * rl + (1 - w) * rc
        sub['target'] = pred_blend.astype('float32')
        sub.to_csv('submission_blend.csv', index=False)
        sub.to_csv('submission.csv', index=False)
        log('Saved submission_blend.csv and submission.csv with optimized blend.')

In [26]:
# Validate submission.csv format; fix if needed
log('Validating submission.csv...')
sub = pd.read_csv('submission.csv') if os.path.exists('submission.csv') else None
issues = []
if sub is None:
    issues.append('submission.csv missing')
else:
    # Basic checks
    if list(sub.columns) != ['id', 'target']:
        issues.append(f'Bad columns: {list(sub.columns)}')
    if len(sub) != len(test):
        issues.append(f'Row count mismatch: {len(sub)} vs test {len(test)}')
    if sub['target'].isna().any():
        issues.append(f'NaNs in target: {int(sub["target"].isna().sum())}')
    # dtype: ensure float
    try:
        sub['target'] = sub['target'].astype('float32')
    except Exception as e:
        issues.append(f'target casting error: {e}')

if issues:
    log('Issues found: ' + '; '.join(issues))
    # Rebuild from LGB fast preds directly
    log('Rebuilding submission.csv from pred_lgb_fast_seed42.csv...')
    pred_lgb = pd.read_csv('pred_lgb_fast_seed42.csv')
    # Align and sanity
    pred_lgb = pred_lgb.rename(columns={pred_lgb.columns[1]: 'prediction_lgb_fast'})
    ss = pd.read_csv('sample_submission.csv')
    merge = ss[['id']].merge(pred_lgb.rename(columns={'prediction_lgb_fast':'target'}), on='id', how='left')
    if merge['target'].isna().any():
        missing = int(merge['target'].isna().sum())
        raise ValueError(f'Missing predictions for {missing} test rows after rebuild.')
    merge['target'] = merge['target'].astype('float32')
    merge.to_csv('submission.csv', index=False)
    log('submission.csv rebuilt from LGB fast preds.')
else:
    log('submission.csv looks valid: ' + f"shape={sub.shape}, dtypes={dict(sub.dtypes)}")

[2025-09-12 02:03:09] Validating submission.csv...


[2025-09-12 02:03:09] submission.csv looks valid: shape=(100000, 2), dtypes={'id': dtype('int64'), 'target': dtype('float32')}


In [8]:
# Force-rebuild submission.csv strictly aligned to sample_submission; ensure finite [0,1] float64
log('Rebuilding submission.csv strictly to avoid format issues...')
ss = pd.read_csv('sample_submission.csv')
pred = pd.read_csv('pred_lgb_fast_seed42.csv')
# Identify prediction column (second column)
pred_col = [c for c in pred.columns if c != 'id']
assert len(pred_col) == 1, f'Unexpected pred columns: {pred.columns.tolist()}'
# Rename to avoid clash with ss.target during merge
pred = pred.rename(columns={pred_col[0]: 'prediction'})
# Left-merge to preserve order and count from sample_submission (use only id to avoid duplicate target columns)
sub = ss[['id']].merge(pred, on='id', how='left')
missing = sub['prediction'].isna().sum()
if missing > 0:
    raise ValueError(f'Missing predictions for {missing} test rows after strict rebuild.')
# Ensure numeric, finite, and in [0,1]
sub['target'] = pd.to_numeric(sub['prediction'], errors='coerce')
if sub['target'].isna().any():
    raise ValueError('NaNs found after numeric conversion.')
sub['target'] = sub['target'].astype('float64').clip(0.0, 1.0)
sub = sub[['id', 'target']]
# Final save
sub.to_csv('submission.csv', index=False)
log('Strict submission.csv rebuilt and saved: ' + f"shape={sub.shape}, dtypes={dict(sub.dtypes)}")

[2025-09-11 22:32:20] Rebuilding submission.csv strictly to avoid format issues...


[2025-09-11 22:32:21] Strict submission.csv rebuilt and saved: shape=(100000, 2), dtypes={'id': dtype('int64'), 'target': dtype('float64')}


In [6]:
# Minimal, high-impact FE block (sorted stats, equality count, parity, f27 majority)
log('Adding minimal high-impact features (sorted stats, equality count, parity, f27 majority)...')
t0 = time.time()

# 1) Row-wise numeric sorting stats
num_float_cols = [c for c in train.columns if (c not in [id_col, target_col, 'f_27']) and (str(train[c].dtype).startswith('float')) and c.startswith('f_')]
if len(num_float_cols) > 0:
    vals_tr = train[num_float_cols].values
    vals_te = test[num_float_cols].values
    srt_tr = np.sort(vals_tr, axis=1)
    srt_te = np.sort(vals_te, axis=1)
    train['f_sorted_0'] = srt_tr[:, 0].astype('float32')
    train['f_sorted_1'] = srt_tr[:, 1].astype('float32')
    train['f_sorted_-1'] = srt_tr[:, -1].astype('float32')
    train['f_sorted_range'] = (srt_tr[:, -1] - srt_tr[:, 0]).astype('float32')
    test['f_sorted_0'] = srt_te[:, 0].astype('float32')
    test['f_sorted_1'] = srt_te[:, 1].astype('float32')
    test['f_sorted_-1'] = srt_te[:, -1].astype('float32')
    test['f_sorted_range'] = (srt_te[:, -1] - srt_te[:, 0]).astype('float32')

# 2) Pairwise equality count over subset of strong numerics
num_list = ['f_00','f_01','f_02','f_03','f_05','f_06','f_10','f_12','f_20','f_21','f_22','f_26']
num_list = [c for c in num_list if c in train.columns and c in test.columns]
if len(num_list) >= 2:
    M_tr = train[num_list].values
    M_te = test[num_list].values
    from itertools import combinations
    def eq_pairs(M):
        n = M.shape[0]
        cnt = np.zeros(n, dtype=np.int16)
        for i, j in combinations(range(M.shape[1]), 2):
            cnt += (np.abs(M[:, i] - M[:, j]) < 1e-6)
        return cnt
    train['num_equal_pairs'] = eq_pairs(M_tr)
    test['num_equal_pairs'] = eq_pairs(M_te)

# 3) Quick parity/mod on a few columns
for c in ['f_00','f_02','f_10','f_20','f_21','f_26']:
    if c in train.columns and c in test.columns:
        train[f'{c}_parity'] = (train[c].astype('int64') & 1).astype('int8')
        test[f'{c}_parity'] = (test[c].astype('int64') & 1).astype('int8')

# 4) f_27 majority count and index (recompute counts from positions)
pos_cols = [f'f_27_pos_{i}' for i in range(10)]
if all(c in train.columns for c in pos_cols):
    P_tr = np.stack([train[c].astype('int16').values for c in pos_cols], axis=1)
    P_te = np.stack([test[c].astype('int16').values for c in pos_cols], axis=1)
    vocab_size = len(sorted(set(''.join(pd.concat([train['f_27'], test['f_27']]).astype(str).values))))
    def counts_from_positions(P, vocab_size):
        n = P.shape[0]
        CNT = np.zeros((n, vocab_size), dtype=np.int16)
        for k in range(P.shape[1]):
            CNT[np.arange(n), P[:, k]] += 1
        return CNT
    CNT_tr = counts_from_positions(P_tr, vocab_size)
    CNT_te = counts_from_positions(P_te, vocab_size)
    train['f27_majority_cnt'] = CNT_tr.max(axis=1).astype('int8')
    train['f27_majority_idx'] = CNT_tr.argmax(axis=1).astype('int8')
    test['f27_majority_cnt'] = CNT_te.max(axis=1).astype('int8')
    test['f27_majority_idx'] = CNT_te.argmax(axis=1).astype('int8')

log(f'Added minimal FE in {time.time()-t0:.2f}s | train cols={train.shape[1]}')
gc.collect();

[2025-09-11 22:16:17] Adding minimal high-impact features (sorted stats, equality count, parity, f27 majority)...


[2025-09-11 22:16:18] Added minimal FE in 1.25s | train cols=165


In [12]:
# XGBoost CPU 10-fold CV training, OOF/test preds, for blending (use xgb.train for broad version compatibility)
log('Setting up XGBoost (CPU, hist) training...')
t0_all = time.time()
try:
    import xgboost as xgb
except ImportError:
    import sys, subprocess
    log('XGBoost not found. Installing...')
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'xgboost'])
    import xgboost as xgb

# Features/target (same 165-feature set; exclude id/target/raw f_27)
features = [c for c in train.columns if c not in [id_col, target_col, 'f_27']]
X = train[features]
y = train[target_col].values
X_test = test[features]

# Folds (locked)
folds_df = pd.read_csv('folds_10fold_seed42.csv')
folds_arr = folds_df['fold'].values
n_splits = len(np.unique(folds_arr))

# Params per expert guidance (CPU) mapped for xgb.train
xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': 'hist',
    'max_bin': 128,
    'eta': 0.04,                 # learning_rate
    'max_depth': 8,
    'min_child_weight': 100,
    'subsample': 0.75,
    'colsample_bytree': 0.75,
    'lambda': 3.0,               # reg_lambda
    'nthread': 36,
    'seed': 42
}
num_boost_round = 3000
early_stopping_rounds = 150

oof_xgb = np.zeros(len(train), dtype='float32')
pred_test_xgb = np.zeros(len(test), dtype='float32')

for fold in range(n_splits):
    fold_t0 = time.time()
    trn_idx = np.where(folds_arr != fold)[0]
    val_idx = np.where(folds_arr == fold)[0]
    log(f'[XGB] Fold {fold+1}/{n_splits} | trn={len(trn_idx)} val={len(val_idx)}')

    dtrain = xgb.DMatrix(X.iloc[trn_idx], label=y[trn_idx])
    dvalid = xgb.DMatrix(X.iloc[val_idx], label=y[val_idx])
    dtest = xgb.DMatrix(X_test)

    bst = xgb.train(
        params=xgb_params,
        dtrain=dtrain,
        num_boost_round=num_boost_round,
        evals=[(dtrain, 'train'), (dvalid, 'valid')],
        early_stopping_rounds=early_stopping_rounds,
        verbose_eval=200
    )

    # Robust best iteration retrieval across versions
    best_iter = None
    attrs = {}
    try:
        attrs = bst.attributes()
    except Exception:
        attrs = {}
    if 'best_iteration' in attrs:
        try:
            best_iter = int(attrs['best_iteration'])
        except Exception:
            best_iter = None
    if (best_iter is None) and hasattr(bst, 'best_iteration') and (bst.best_iteration is not None):
        try:
            best_iter = int(bst.best_iteration)
        except Exception:
            best_iter = None

    # Predict using iteration_range when best_iter is available; otherwise use full model
    if best_iter is not None and best_iter >= 0:
        oof_pred = bst.predict(dvalid, iteration_range=(0, best_iter + 1))
    else:
        oof_pred = bst.predict(dvalid)
    oof_xgb[val_idx] = oof_pred.astype('float32')
    fold_auc = roc_auc_score(y[val_idx], oof_xgb[val_idx])
    log(f'[XGB] Fold {fold} AUC: {fold_auc:.6f} | best_iter={best_iter} | elapsed={time.time()-fold_t0:.1f}s')

    if best_iter is not None and best_iter >= 0:
        pred_te = bst.predict(dtest, iteration_range=(0, best_iter + 1))
    else:
        pred_te = bst.predict(dtest)
    pred_test_xgb += pred_te.astype('float32') / n_splits
    del dtrain, dvalid, dtest, bst
    gc.collect()

cv_auc_xgb = roc_auc_score(y, oof_xgb)
log(f'[XGB] OOF AUC: {cv_auc_xgb:.6f} | total_elapsed={time.time()-t0_all:.1f}s')

# Save artifacts
pd.DataFrame({id_col: train[id_col], 'oof_xgb': oof_xgb}).to_csv('oof_xgb_seed42.csv', index=False)
pd.DataFrame({id_col: test[id_col], 'prediction_xgb': pred_test_xgb}).to_csv('pred_xgb_seed42.csv', index=False)
log('[XGB] Saved OOF and test predictions.')

[2025-09-11 22:38:02] Setting up XGBoost (CPU, hist) training...


[2025-09-11 22:38:02] [XGB] Fold 1/10 | trn=720000 val=80000


[0]	train-auc:0.75379	valid-auc:0.75297


[200]	train-auc:0.96478	valid-auc:0.96191


[400]	train-auc:0.97630	valid-auc:0.97305


[600]	train-auc:0.98113	valid-auc:0.97752


[800]	train-auc:0.98426	valid-auc:0.98039


[1000]	train-auc:0.98658	valid-auc:0.98242


[1200]	train-auc:0.98804	valid-auc:0.98352


[1400]	train-auc:0.98932	valid-auc:0.98452


[1600]	train-auc:0.99029	valid-auc:0.98517


[1800]	train-auc:0.99105	valid-auc:0.98557


[2000]	train-auc:0.99170	valid-auc:0.98591


[2200]	train-auc:0.99231	valid-auc:0.98615


[2400]	train-auc:0.99286	valid-auc:0.98634


[2600]	train-auc:0.99333	valid-auc:0.98651


[2800]	train-auc:0.99379	valid-auc:0.98668


[2999]	train-auc:0.99419	valid-auc:0.98678


[2025-09-11 22:40:45] [XGB] Fold 0 AUC: 0.986778 | best_iter=2991 | elapsed=162.6s


[2025-09-11 22:40:45] [XGB] Fold 2/10 | trn=720000 val=80000


[0]	train-auc:0.74556	valid-auc:0.74468


[200]	train-auc:0.96389	valid-auc:0.96120


[400]	train-auc:0.97650	valid-auc:0.97339


[600]	train-auc:0.98115	valid-auc:0.97769


[800]	train-auc:0.98402	valid-auc:0.98019


[1000]	train-auc:0.98643	valid-auc:0.98225


[1200]	train-auc:0.98798	valid-auc:0.98346


[1400]	train-auc:0.98915	valid-auc:0.98431


[1600]	train-auc:0.99022	valid-auc:0.98503


[1800]	train-auc:0.99104	valid-auc:0.98552


[2000]	train-auc:0.99171	valid-auc:0.98586


[2200]	train-auc:0.99229	valid-auc:0.98612


[2400]	train-auc:0.99284	valid-auc:0.98632


[2600]	train-auc:0.99336	valid-auc:0.98649


[2800]	train-auc:0.99381	valid-auc:0.98661


[2999]	train-auc:0.99422	valid-auc:0.98671


[2025-09-11 22:43:25] [XGB] Fold 1 AUC: 0.986706 | best_iter=2999 | elapsed=159.4s


[2025-09-11 22:43:25] [XGB] Fold 3/10 | trn=720000 val=80000


[0]	train-auc:0.75253	valid-auc:0.75335


[200]	train-auc:0.96439	valid-auc:0.96147


[400]	train-auc:0.97614	valid-auc:0.97279


[600]	train-auc:0.98101	valid-auc:0.97724


[800]	train-auc:0.98415	valid-auc:0.98007


[1000]	train-auc:0.98653	valid-auc:0.98220


[1200]	train-auc:0.98811	valid-auc:0.98349


[1400]	train-auc:0.98932	valid-auc:0.98443


[1600]	train-auc:0.99030	valid-auc:0.98506


[1800]	train-auc:0.99106	valid-auc:0.98548


[2000]	train-auc:0.99177	valid-auc:0.98584


[2200]	train-auc:0.99235	valid-auc:0.98609


[2400]	train-auc:0.99288	valid-auc:0.98626


[2600]	train-auc:0.99335	valid-auc:0.98640


[2800]	train-auc:0.99379	valid-auc:0.98650


[2999]	train-auc:0.99421	valid-auc:0.98662


[2025-09-11 22:46:11] [XGB] Fold 2 AUC: 0.986622 | best_iter=2998 | elapsed=166.0s


[2025-09-11 22:46:12] [XGB] Fold 4/10 | trn=720000 val=80000


[0]	train-auc:0.74547	valid-auc:0.74402


[200]	train-auc:0.96561	valid-auc:0.96213


[400]	train-auc:0.97667	valid-auc:0.97276


[600]	train-auc:0.98144	valid-auc:0.97727


[800]	train-auc:0.98459	valid-auc:0.98022


[1000]	train-auc:0.98669	valid-auc:0.98208


[1200]	train-auc:0.98812	valid-auc:0.98319


[1400]	train-auc:0.98938	valid-auc:0.98418


[1600]	train-auc:0.99032	valid-auc:0.98484


[1800]	train-auc:0.99109	valid-auc:0.98529


[2000]	train-auc:0.99173	valid-auc:0.98557


[2200]	train-auc:0.99231	valid-auc:0.98582


[2400]	train-auc:0.99285	valid-auc:0.98603


[2600]	train-auc:0.99334	valid-auc:0.98623


[2800]	train-auc:0.99379	valid-auc:0.98637


[2999]	train-auc:0.99420	valid-auc:0.98648


[2025-09-11 22:48:58] [XGB] Fold 3 AUC: 0.986480 | best_iter=2998 | elapsed=165.5s


[2025-09-11 22:48:58] [XGB] Fold 5/10 | trn=720000 val=80000


[0]	train-auc:0.75317	valid-auc:0.75505


[200]	train-auc:0.96563	valid-auc:0.96236


[400]	train-auc:0.97685	valid-auc:0.97323


[600]	train-auc:0.98144	valid-auc:0.97751


[800]	train-auc:0.98474	valid-auc:0.98048


[1000]	train-auc:0.98701	valid-auc:0.98254


[1200]	train-auc:0.98837	valid-auc:0.98353


[1400]	train-auc:0.98947	valid-auc:0.98430


[1600]	train-auc:0.99040	valid-auc:0.98492


[1800]	train-auc:0.99115	valid-auc:0.98533


[2000]	train-auc:0.99181	valid-auc:0.98564


[2200]	train-auc:0.99241	valid-auc:0.98590


[2400]	train-auc:0.99293	valid-auc:0.98606


[2600]	train-auc:0.99341	valid-auc:0.98622


[2800]	train-auc:0.99386	valid-auc:0.98637


[2999]	train-auc:0.99428	valid-auc:0.98648


[2025-09-11 22:51:44] [XGB] Fold 4 AUC: 0.986480 | best_iter=2997 | elapsed=165.9s


[2025-09-11 22:51:45] [XGB] Fold 6/10 | trn=720000 val=80000


[0]	train-auc:0.75386	valid-auc:0.74824


[200]	train-auc:0.96618	valid-auc:0.96234


[400]	train-auc:0.97720	valid-auc:0.97352


[600]	train-auc:0.98158	valid-auc:0.97765


[800]	train-auc:0.98474	valid-auc:0.98062


[1000]	train-auc:0.98682	valid-auc:0.98237


[1200]	train-auc:0.98828	valid-auc:0.98351


[1400]	train-auc:0.98933	valid-auc:0.98418


[1600]	train-auc:0.99028	valid-auc:0.98484


[1800]	train-auc:0.99106	valid-auc:0.98527


[2000]	train-auc:0.99176	valid-auc:0.98564


[2200]	train-auc:0.99234	valid-auc:0.98589


[2400]	train-auc:0.99287	valid-auc:0.98609


[2600]	train-auc:0.99335	valid-auc:0.98625


[2800]	train-auc:0.99382	valid-auc:0.98642


[2999]	train-auc:0.99421	valid-auc:0.98654


[2025-09-11 22:54:28] [XGB] Fold 5 AUC: 0.986536 | best_iter=2994 | elapsed=163.1s


[2025-09-11 22:54:29] [XGB] Fold 7/10 | trn=720000 val=80000


[0]	train-auc:0.74570	valid-auc:0.74436


[200]	train-auc:0.96556	valid-auc:0.96380


[400]	train-auc:0.97626	valid-auc:0.97392


[600]	train-auc:0.98109	valid-auc:0.97832


[800]	train-auc:0.98453	valid-auc:0.98139


[1000]	train-auc:0.98669	valid-auc:0.98318


[1200]	train-auc:0.98826	valid-auc:0.98441


[1400]	train-auc:0.98936	valid-auc:0.98509


[1600]	train-auc:0.99029	valid-auc:0.98565


[1800]	train-auc:0.99105	valid-auc:0.98602


[2000]	train-auc:0.99173	valid-auc:0.98637


[2200]	train-auc:0.99231	valid-auc:0.98667


[2400]	train-auc:0.99284	valid-auc:0.98688


[2600]	train-auc:0.99331	valid-auc:0.98703


[2800]	train-auc:0.99375	valid-auc:0.98717


[2999]	train-auc:0.99416	valid-auc:0.98731


[2025-09-11 22:57:10] [XGB] Fold 6 AUC: 0.987308 | best_iter=2999 | elapsed=161.3s


[2025-09-11 22:57:11] [XGB] Fold 8/10 | trn=720000 val=80000


[0]	train-auc:0.75370	valid-auc:0.75066


[200]	train-auc:0.96592	valid-auc:0.96226


[400]	train-auc:0.97708	valid-auc:0.97339


[600]	train-auc:0.98158	valid-auc:0.97764


[800]	train-auc:0.98442	valid-auc:0.98020


[1000]	train-auc:0.98667	valid-auc:0.98220


[1200]	train-auc:0.98824	valid-auc:0.98345


[1400]	train-auc:0.98937	valid-auc:0.98428


[1600]	train-auc:0.99029	valid-auc:0.98492


[1800]	train-auc:0.99109	valid-auc:0.98535


[2000]	train-auc:0.99177	valid-auc:0.98566


[2200]	train-auc:0.99234	valid-auc:0.98587


[2400]	train-auc:0.99288	valid-auc:0.98611


[2600]	train-auc:0.99334	valid-auc:0.98625


[2800]	train-auc:0.99378	valid-auc:0.98637


[2999]	train-auc:0.99419	valid-auc:0.98648


[2025-09-11 22:59:49] [XGB] Fold 7 AUC: 0.986485 | best_iter=2999 | elapsed=158.5s


[2025-09-11 22:59:50] [XGB] Fold 9/10 | trn=720000 val=80000


[0]	train-auc:0.75409	valid-auc:0.75339


[200]	train-auc:0.96440	valid-auc:0.96180


[400]	train-auc:0.97647	valid-auc:0.97364


[600]	train-auc:0.98146	valid-auc:0.97835


[800]	train-auc:0.98428	valid-auc:0.98086


[1000]	train-auc:0.98660	valid-auc:0.98287


[1200]	train-auc:0.98817	valid-auc:0.98415


[1400]	train-auc:0.98925	valid-auc:0.98487


[1600]	train-auc:0.99021	valid-auc:0.98550


[1800]	train-auc:0.99105	valid-auc:0.98602


[2000]	train-auc:0.99170	valid-auc:0.98636


[2200]	train-auc:0.99230	valid-auc:0.98660


[2400]	train-auc:0.99285	valid-auc:0.98682


[2600]	train-auc:0.99334	valid-auc:0.98697


[2800]	train-auc:0.99379	valid-auc:0.98711


[2999]	train-auc:0.99420	valid-auc:0.98721


[2025-09-11 23:02:32] [XGB] Fold 8 AUC: 0.987208 | best_iter=2999 | elapsed=161.5s


[2025-09-11 23:02:32] [XGB] Fold 10/10 | trn=720000 val=80000


[0]	train-auc:0.74576	valid-auc:0.74258


[200]	train-auc:0.96646	valid-auc:0.96396


[400]	train-auc:0.97722	valid-auc:0.97453


[600]	train-auc:0.98167	valid-auc:0.97869


[800]	train-auc:0.98471	valid-auc:0.98145


[1000]	train-auc:0.98670	valid-auc:0.98305


[1200]	train-auc:0.98822	valid-auc:0.98421


[1400]	train-auc:0.98935	valid-auc:0.98499


[1600]	train-auc:0.99030	valid-auc:0.98560


[1800]	train-auc:0.99109	valid-auc:0.98604


[2000]	train-auc:0.99176	valid-auc:0.98635


[2200]	train-auc:0.99234	valid-auc:0.98659


[2400]	train-auc:0.99291	valid-auc:0.98681


[2600]	train-auc:0.99341	valid-auc:0.98699


[2800]	train-auc:0.99385	valid-auc:0.98711


[2999]	train-auc:0.99426	valid-auc:0.98721


[2025-09-11 23:05:19] [XGB] Fold 9 AUC: 0.987210 | best_iter=2998 | elapsed=166.9s


[2025-09-11 23:05:20] [XGB] OOF AUC: 0.986782 | total_elapsed=1638.8s


[2025-09-11 23:05:22] [XGB] Saved OOF and test predictions.


In [13]:
# LightGBM multi-seed trainer (CPU-only) on current 165-feature set and locked folds
log('Preparing LightGBM multi-seed training cell (will run after XGB finishes)...')
import lightgbm as lgb

def train_lgb_fast_seed(seed=42, feature_fraction=0.75, bagging_fraction=0.75, tag=None):
    t0 = time.time()
    feats = [c for c in train.columns if c not in [id_col, target_col, 'f_27']]
    X_ = train[feats]; y_ = train[target_col].values; Xte_ = test[feats]
    folds_df = pd.read_csv('folds_10fold_seed42.csv'); folds_arr = folds_df['fold'].values
    n_splits = len(np.unique(folds_arr))
    params = {
        'objective': 'binary', 'metric': 'auc', 'boosting_type': 'gbdt',
        'learning_rate': 0.045, 'num_leaves': 144, 'max_depth': -1,
        'min_data_in_leaf': 240, 'feature_fraction': feature_fraction, 'bagging_fraction': bagging_fraction, 'bagging_freq': 1,
        'lambda_l1': 0.0, 'lambda_l2': 4.0, 'max_bin': 127, 'verbose': -1, 'n_jobs': -1,
        'seed': seed, 'feature_fraction_seed': seed, 'bagging_seed': seed, 'data_random_seed': seed
    }
    oof_ = np.zeros(len(train), dtype='float32'); pred_te_ = np.zeros(len(test), dtype='float32')
    for fold in range(n_splits):
        fold_t0 = time.time()
        trn_idx = np.where(folds_arr != fold)[0]; val_idx = np.where(folds_arr == fold)[0]
        log(f'[LGB multi] seed={seed} ff={feature_fraction} bf={bagging_fraction} | Fold {fold+1}/{n_splits} | trn={len(trn_idx)} val={len(val_idx)}')
        dtrain = lgb.Dataset(X_.iloc[trn_idx], label=y_[trn_idx])
        dvalid = lgb.Dataset(X_.iloc[val_idx], label=y_[val_idx])
        model = lgb.train(params, dtrain, num_boost_round=4000, valid_sets=[dtrain, dvalid], valid_names=['train','valid'], callbacks=[lgb.early_stopping(120), lgb.log_evaluation(200)])
        oof_[val_idx] = model.predict(X_.iloc[val_idx], num_iteration=model.best_iteration)
        auc = roc_auc_score(y_[val_idx], oof_[val_idx])
        log(f'[LGB multi] seed={seed} fold={fold} AUC: {auc:.6f} | best_iter={model.best_iteration} | elapsed={time.time()-fold_t0:.1f}s')
        pred_te_ += model.predict(Xte_, num_iteration=model.best_iteration) / n_splits
        del dtrain, dvalid, model; gc.collect()
    cv_auc = roc_auc_score(y_, oof_); log(f'[LGB multi] seed={seed} OOF AUC: {cv_auc:.6f} | total_elapsed={time.time()-t0:.1f}s')
    tag = tag if tag is not None else f'seed{seed}'
    pd.DataFrame({id_col: train[id_col], f'oof_lgb_fast_{tag}': oof_}).to_csv(f'oof_lgb_fast_{tag}.csv', index=False)
    pd.DataFrame({id_col: test[id_col], f'prediction_lgb_fast_{tag}': pred_te_}).to_csv(f'pred_lgb_fast_{tag}.csv', index=False)
    log(f'[LGB multi] Saved OOF/test preds for {tag}')

# Usage plan (execute after XGB finishes):
# train_lgb_fast_seed(42, 0.75, 0.75, tag='s42')
# train_lgb_fast_seed(1337, 0.72, 0.72, tag='s1337')
# train_lgb_fast_seed(2025, 0.78, 0.78, tag='s2025')

[2025-09-11 23:05:36] Preparing LightGBM multi-seed training cell (will run after XGB finishes)...


In [14]:
# Run LightGBM fast for seed 42 on current 165-feature set and locked folds
log('Starting LightGBM multi-seed training: seed 42...')
train_lgb_fast_seed(42, 0.75, 0.75, tag='s42')

[2025-09-11 23:06:01] Starting LightGBM multi-seed training: seed 42...


[2025-09-11 23:06:02] [LGB multi] seed=42 ff=0.75 bf=0.75 | Fold 1/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.978775	valid's auc: 0.975702


[400]	train's auc: 0.98866	valid's auc: 0.985066


[600]	train's auc: 0.992253	valid's auc: 0.98776


[800]	train's auc: 0.994299	valid's auc: 0.988859


[1000]	train's auc: 0.995652	valid's auc: 0.989373


[1200]	train's auc: 0.9967	valid's auc: 0.98969


[1400]	train's auc: 0.997516	valid's auc: 0.989907


[1600]	train's auc: 0.998162	valid's auc: 0.990095


[1800]	train's auc: 0.998667	valid's auc: 0.990284


[2000]	train's auc: 0.99905	valid's auc: 0.990377


[2200]	train's auc: 0.999332	valid's auc: 0.990491


[2400]	train's auc: 0.999539	valid's auc: 0.990579


[2600]	train's auc: 0.999689	valid's auc: 0.99063


[2800]	train's auc: 0.999795	valid's auc: 0.990694


[3000]	train's auc: 0.999869	valid's auc: 0.990749


[3200]	train's auc: 0.999916	valid's auc: 0.990818


[3400]	train's auc: 0.99995	valid's auc: 0.990834


[3600]	train's auc: 0.99997	valid's auc: 0.990864


[3800]	train's auc: 0.999983	valid's auc: 0.990886


[4000]	train's auc: 0.999991	valid's auc: 0.990907
Did not meet early stopping. Best iteration is:
[3999]	train's auc: 0.999991	valid's auc: 0.990908


[2025-09-11 23:11:17] [LGB multi] seed=42 fold=0 AUC: 0.990908 | best_iter=3999 | elapsed=315.8s


[2025-09-11 23:11:20] [LGB multi] seed=42 ff=0.75 bf=0.75 | Fold 2/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.978622	valid's auc: 0.975583


[400]	train's auc: 0.988718	valid's auc: 0.985092


[600]	train's auc: 0.992316	valid's auc: 0.987828


[800]	train's auc: 0.994274	valid's auc: 0.988781


[1000]	train's auc: 0.995695	valid's auc: 0.989431


[1200]	train's auc: 0.996737	valid's auc: 0.989698


[1400]	train's auc: 0.997565	valid's auc: 0.989989


[1600]	train's auc: 0.998195	valid's auc: 0.990133


[1800]	train's auc: 0.998686	valid's auc: 0.990303


[2000]	train's auc: 0.999061	valid's auc: 0.990386


[2200]	train's auc: 0.999339	valid's auc: 0.990483


[2400]	train's auc: 0.999549	valid's auc: 0.990555


[2600]	train's auc: 0.999698	valid's auc: 0.990637


[2800]	train's auc: 0.999803	valid's auc: 0.990685


[3000]	train's auc: 0.999874	valid's auc: 0.990729


[3200]	train's auc: 0.999922	valid's auc: 0.990793


[3400]	train's auc: 0.999954	valid's auc: 0.990838


[3600]	train's auc: 0.999973	valid's auc: 0.990859


[3800]	train's auc: 0.999985	valid's auc: 0.990892


[4000]	train's auc: 0.999992	valid's auc: 0.990926
Did not meet early stopping. Best iteration is:
[4000]	train's auc: 0.999992	valid's auc: 0.990926


[2025-09-11 23:16:33] [LGB multi] seed=42 fold=1 AUC: 0.990926 | best_iter=4000 | elapsed=312.9s


[2025-09-11 23:16:36] [LGB multi] seed=42 ff=0.75 bf=0.75 | Fold 3/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.978655	valid's auc: 0.975542


[400]	train's auc: 0.988786	valid's auc: 0.98512


[600]	train's auc: 0.992315	valid's auc: 0.987885


[800]	train's auc: 0.994279	valid's auc: 0.988868


[1000]	train's auc: 0.995671	valid's auc: 0.989459


[1200]	train's auc: 0.996741	valid's auc: 0.989816


[1400]	train's auc: 0.997563	valid's auc: 0.990079


[1600]	train's auc: 0.998191	valid's auc: 0.990227


[1800]	train's auc: 0.998676	valid's auc: 0.990338


[2000]	train's auc: 0.999042	valid's auc: 0.990423


[2200]	train's auc: 0.999325	valid's auc: 0.99051


[2400]	train's auc: 0.999533	valid's auc: 0.990586


[2600]	train's auc: 0.999684	valid's auc: 0.990652


[2800]	train's auc: 0.999796	valid's auc: 0.990691


[3000]	train's auc: 0.999868	valid's auc: 0.990748


[3200]	train's auc: 0.999919	valid's auc: 0.990806


[3400]	train's auc: 0.999951	valid's auc: 0.990859


[3600]	train's auc: 0.999972	valid's auc: 0.990886


[3800]	train's auc: 0.999984	valid's auc: 0.990898


Early stopping, best iteration is:
[3720]	train's auc: 0.99998	valid's auc: 0.990903


[2025-09-11 23:21:35] [LGB multi] seed=42 fold=2 AUC: 0.990903 | best_iter=3720 | elapsed=298.8s


[2025-09-11 23:21:37] [LGB multi] seed=42 ff=0.75 bf=0.75 | Fold 4/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.978624	valid's auc: 0.975211


[400]	train's auc: 0.988731	valid's auc: 0.984888


[600]	train's auc: 0.992295	valid's auc: 0.987713


[800]	train's auc: 0.994299	valid's auc: 0.988808


[1000]	train's auc: 0.99567	valid's auc: 0.989337


[1200]	train's auc: 0.996725	valid's auc: 0.9897


[1400]	train's auc: 0.997529	valid's auc: 0.989898


[1600]	train's auc: 0.998173	valid's auc: 0.990074


[1800]	train's auc: 0.998662	valid's auc: 0.990223


[2000]	train's auc: 0.99904	valid's auc: 0.990336


[2200]	train's auc: 0.999334	valid's auc: 0.990433


[2400]	train's auc: 0.999538	valid's auc: 0.990544


[2600]	train's auc: 0.999687	valid's auc: 0.990627


[2800]	train's auc: 0.999797	valid's auc: 0.990661


[3000]	train's auc: 0.999868	valid's auc: 0.990708


[3200]	train's auc: 0.999916	valid's auc: 0.990772


[3400]	train's auc: 0.999948	valid's auc: 0.990833


[3600]	train's auc: 0.999969	valid's auc: 0.990858


[3800]	train's auc: 0.999982	valid's auc: 0.990886


[4000]	train's auc: 0.99999	valid's auc: 0.990924
Did not meet early stopping. Best iteration is:
[4000]	train's auc: 0.99999	valid's auc: 0.990924


[2025-09-11 23:26:56] [LGB multi] seed=42 fold=3 AUC: 0.990924 | best_iter=4000 | elapsed=318.8s


[2025-09-11 23:26:59] [LGB multi] seed=42 ff=0.75 bf=0.75 | Fold 5/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.978115	valid's auc: 0.97447


[400]	train's auc: 0.988526	valid's auc: 0.98467


[600]	train's auc: 0.992168	valid's auc: 0.987541


[800]	train's auc: 0.994208	valid's auc: 0.988659


[1000]	train's auc: 0.995613	valid's auc: 0.989259


[1200]	train's auc: 0.996698	valid's auc: 0.989669


[1400]	train's auc: 0.997507	valid's auc: 0.989869


[1600]	train's auc: 0.998151	valid's auc: 0.990076


[1800]	train's auc: 0.998655	valid's auc: 0.990213


[2000]	train's auc: 0.999041	valid's auc: 0.990335


[2200]	train's auc: 0.999323	valid's auc: 0.990415


[2400]	train's auc: 0.999536	valid's auc: 0.990479


[2600]	train's auc: 0.999688	valid's auc: 0.990566


[2800]	train's auc: 0.999795	valid's auc: 0.990643


[3000]	train's auc: 0.999869	valid's auc: 0.990692


[3200]	train's auc: 0.999918	valid's auc: 0.99076


[3400]	train's auc: 0.999951	valid's auc: 0.990812


[3600]	train's auc: 0.999971	valid's auc: 0.990844


[3800]	train's auc: 0.999983	valid's auc: 0.990875


[4000]	train's auc: 0.999991	valid's auc: 0.990913
Did not meet early stopping. Best iteration is:
[4000]	train's auc: 0.999991	valid's auc: 0.990913


[2025-09-11 23:32:08] [LGB multi] seed=42 fold=4 AUC: 0.990913 | best_iter=4000 | elapsed=308.8s


[2025-09-11 23:32:11] [LGB multi] seed=42 ff=0.75 bf=0.75 | Fold 6/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.978233	valid's auc: 0.974894


[400]	train's auc: 0.988841	valid's auc: 0.985093


[600]	train's auc: 0.99226	valid's auc: 0.9876


[800]	train's auc: 0.994228	valid's auc: 0.988561


[1000]	train's auc: 0.995651	valid's auc: 0.989204


[1200]	train's auc: 0.996711	valid's auc: 0.989594


[1400]	train's auc: 0.997528	valid's auc: 0.989809


[1600]	train's auc: 0.998156	valid's auc: 0.98996


[1800]	train's auc: 0.99866	valid's auc: 0.99015


[2000]	train's auc: 0.99904	valid's auc: 0.990245


[2200]	train's auc: 0.999322	valid's auc: 0.990366


[2400]	train's auc: 0.999534	valid's auc: 0.990474


[2600]	train's auc: 0.999684	valid's auc: 0.990549


[2800]	train's auc: 0.99979	valid's auc: 0.990609


[3000]	train's auc: 0.999866	valid's auc: 0.99067


[3200]	train's auc: 0.999914	valid's auc: 0.990727


[3400]	train's auc: 0.999947	valid's auc: 0.990769


[3600]	train's auc: 0.999969	valid's auc: 0.990814


[3800]	train's auc: 0.999982	valid's auc: 0.99084


[4000]	train's auc: 0.99999	valid's auc: 0.990878
Did not meet early stopping. Best iteration is:
[3977]	train's auc: 0.999989	valid's auc: 0.990878


[2025-09-11 23:37:25] [LGB multi] seed=42 fold=5 AUC: 0.990878 | best_iter=3977 | elapsed=314.6s


[2025-09-11 23:37:28] [LGB multi] seed=42 ff=0.75 bf=0.75 | Fold 7/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.978408	valid's auc: 0.976143


[400]	train's auc: 0.98858	valid's auc: 0.98557


[600]	train's auc: 0.992346	valid's auc: 0.988504


[800]	train's auc: 0.994272	valid's auc: 0.989386


[1000]	train's auc: 0.995657	valid's auc: 0.989857


[1200]	train's auc: 0.996713	valid's auc: 0.990171


[1400]	train's auc: 0.997537	valid's auc: 0.990416


[1600]	train's auc: 0.998161	valid's auc: 0.99058


[1800]	train's auc: 0.998664	valid's auc: 0.990734


[2000]	train's auc: 0.999033	valid's auc: 0.990848


[2200]	train's auc: 0.999325	valid's auc: 0.99093


[2400]	train's auc: 0.999534	valid's auc: 0.991


[2600]	train's auc: 0.999683	valid's auc: 0.991056


[2800]	train's auc: 0.999793	valid's auc: 0.991127


[3000]	train's auc: 0.999867	valid's auc: 0.991174


[3200]	train's auc: 0.999916	valid's auc: 0.991209


[3400]	train's auc: 0.999949	valid's auc: 0.991264


[3600]	train's auc: 0.99997	valid's auc: 0.991291


[3800]	train's auc: 0.999983	valid's auc: 0.991325


[4000]	train's auc: 0.99999	valid's auc: 0.991342
Did not meet early stopping. Best iteration is:
[3979]	train's auc: 0.99999	valid's auc: 0.991342


[2025-09-11 23:42:38] [LGB multi] seed=42 fold=6 AUC: 0.991342 | best_iter=3979 | elapsed=309.3s


[2025-09-11 23:42:41] [LGB multi] seed=42 ff=0.75 bf=0.75 | Fold 8/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.978395	valid's auc: 0.974883


[400]	train's auc: 0.988648	valid's auc: 0.984772


[600]	train's auc: 0.992244	valid's auc: 0.987665


[800]	train's auc: 0.994263	valid's auc: 0.988789


[1000]	train's auc: 0.995625	valid's auc: 0.98927


[1200]	train's auc: 0.996693	valid's auc: 0.989622


[1400]	train's auc: 0.997512	valid's auc: 0.989893


[1600]	train's auc: 0.998162	valid's auc: 0.990124


[1800]	train's auc: 0.998653	valid's auc: 0.990233


[2000]	train's auc: 0.999032	valid's auc: 0.990335


[2200]	train's auc: 0.999322	valid's auc: 0.990426


[2400]	train's auc: 0.999535	valid's auc: 0.990526


[2600]	train's auc: 0.999687	valid's auc: 0.990607


[2800]	train's auc: 0.999793	valid's auc: 0.990674


[3000]	train's auc: 0.99987	valid's auc: 0.990736


[3200]	train's auc: 0.999918	valid's auc: 0.990792


[3400]	train's auc: 0.999951	valid's auc: 0.990824


[3600]	train's auc: 0.999971	valid's auc: 0.990868


[3800]	train's auc: 0.999983	valid's auc: 0.990916


[4000]	train's auc: 0.999991	valid's auc: 0.99094
Did not meet early stopping. Best iteration is:
[4000]	train's auc: 0.999991	valid's auc: 0.99094


[2025-09-11 23:47:58] [LGB multi] seed=42 fold=7 AUC: 0.990940 | best_iter=4000 | elapsed=316.9s


[2025-09-11 23:48:00] [LGB multi] seed=42 ff=0.75 bf=0.75 | Fold 9/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.978407	valid's auc: 0.97558


[400]	train's auc: 0.988609	valid's auc: 0.98532


[600]	train's auc: 0.992207	valid's auc: 0.988053


[800]	train's auc: 0.994174	valid's auc: 0.989036


[1000]	train's auc: 0.995597	valid's auc: 0.989658


[1200]	train's auc: 0.996659	valid's auc: 0.99002


[1400]	train's auc: 0.997488	valid's auc: 0.990215


[1600]	train's auc: 0.998139	valid's auc: 0.990426


[1800]	train's auc: 0.998633	valid's auc: 0.990616


[2000]	train's auc: 0.999027	valid's auc: 0.99074


[2200]	train's auc: 0.999313	valid's auc: 0.990849


[2400]	train's auc: 0.999528	valid's auc: 0.990921


[2600]	train's auc: 0.999683	valid's auc: 0.990968


[2800]	train's auc: 0.999791	valid's auc: 0.991044


[3000]	train's auc: 0.999865	valid's auc: 0.991085


[3200]	train's auc: 0.999916	valid's auc: 0.991146


[3400]	train's auc: 0.999949	valid's auc: 0.991215


[3600]	train's auc: 0.99997	valid's auc: 0.99125


[3800]	train's auc: 0.999984	valid's auc: 0.991286


[4000]	train's auc: 0.999991	valid's auc: 0.991314
Did not meet early stopping. Best iteration is:
[3976]	train's auc: 0.999991	valid's auc: 0.991318


[2025-09-11 23:53:12] [LGB multi] seed=42 fold=8 AUC: 0.991318 | best_iter=3976 | elapsed=311.9s


[2025-09-11 23:53:15] [LGB multi] seed=42 ff=0.75 bf=0.75 | Fold 10/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.978276	valid's auc: 0.97555


[400]	train's auc: 0.988778	valid's auc: 0.985715


[600]	train's auc: 0.992386	valid's auc: 0.988445


[800]	train's auc: 0.994296	valid's auc: 0.989328


[1000]	train's auc: 0.995681	valid's auc: 0.989872


[1200]	train's auc: 0.996727	valid's auc: 0.99019


[1400]	train's auc: 0.997541	valid's auc: 0.990323


[1600]	train's auc: 0.998176	valid's auc: 0.990467


[1800]	train's auc: 0.998666	valid's auc: 0.990602


[2000]	train's auc: 0.999036	valid's auc: 0.990706


[2200]	train's auc: 0.999324	valid's auc: 0.990802


[2400]	train's auc: 0.999536	valid's auc: 0.990857


[2600]	train's auc: 0.999686	valid's auc: 0.990915


[2800]	train's auc: 0.999791	valid's auc: 0.990951


[3000]	train's auc: 0.999865	valid's auc: 0.991017


[3200]	train's auc: 0.999915	valid's auc: 0.991081


[3400]	train's auc: 0.99995	valid's auc: 0.991106


[3600]	train's auc: 0.999971	valid's auc: 0.991145


[3800]	train's auc: 0.999984	valid's auc: 0.991156


Early stopping, best iteration is:
[3732]	train's auc: 0.99998	valid's auc: 0.991168


[2025-09-11 23:58:19] [LGB multi] seed=42 fold=9 AUC: 0.991168 | best_iter=3732 | elapsed=303.4s


[2025-09-11 23:58:22] [LGB multi] seed=42 OOF AUC: 0.991021 | total_elapsed=3140.3s


[2025-09-11 23:58:23] [LGB multi] Saved OOF/test preds for s42


In [16]:
# Blending: LGB multi-seeds (s42,s1337,s2025) + XGB (seed42); rank/prob grid on OOF; apply to test
log('Preparing blend (LGB seeds + XGB) with OOF weight tuning...')
import glob

gt = train[['id', 'target']].copy()

def safe_load_oof(path, default_col):
    if not os.path.exists(path):
        return None
    df = pd.read_csv(path)
    cols = [c for c in df.columns if c != 'id']
    col = default_col if default_col in df.columns else (cols[0] if cols else None)
    return df[['id', col]].rename(columns={col: 'pred'}) if col else None

def safe_load_pred(path, default_col):
    if not os.path.exists(path):
        return None
    df = pd.read_csv(path)
    cols = [c for c in df.columns if c != 'id']
    col = default_col if default_col in df.columns else (cols[0] if cols else None)
    return df[['id', col]].rename(columns={col: 'pred'}) if col else None

# LGB seeds (new 165-feature runs)
oof_lgb_paths = [
    ('oof_lgb_fast_s42.csv', 'oof_lgb_fast_s42'),
    ('oof_lgb_fast_s1337.csv', 'oof_lgb_fast_s1337'),
    ('oof_lgb_fast_s2025.csv', 'oof_lgb_fast_s2025'),
]
pred_lgb_paths = [
    ('pred_lgb_fast_s42.csv', 'prediction_lgb_fast_s42'),
    ('pred_lgb_fast_s1337.csv', 'prediction_lgb_fast_s1337'),
    ('pred_lgb_fast_s2025.csv', 'prediction_lgb_fast_s2025'),
]

oof_lgb_list = []
for p, col in oof_lgb_paths:
    df = safe_load_oof(p, col)
    if df is not None:
        oof_lgb_list.append(df.rename(columns={'pred': f'pred_{p}'}))

pred_lgb_list = []
for p, col in pred_lgb_paths:
    df = safe_load_pred(p, col)
    if df is not None:
        pred_lgb_list.append(df.rename(columns={'pred': f'pred_{p}'}))

if len(oof_lgb_list) == 0:
    log('No LGB seed OOF files found. Run LGB seeds first.');
else:
    oof_lgb_merged = oof_lgb_list[0]
    for df in oof_lgb_list[1:]:
        oof_lgb_merged = oof_lgb_merged.merge(df, on='id', how='inner')
    # Build LGB ensemble OOF by simple average
    lgb_cols = [c for c in oof_lgb_merged.columns if c != 'id']
    oof_lgb_merged['lgb_ens'] = oof_lgb_merged[lgb_cols].mean(axis=1)
    oof = gt.merge(oof_lgb_merged[['id', 'lgb_ens']], on='id', how='left')

    # XGB (seed42)
    oof_xgb = safe_load_oof('oof_xgb_seed42.csv', 'oof_xgb')
    have_xgb = oof_xgb is not None
    if have_xgb:
        oof = oof.merge(oof_xgb.rename(columns={'pred': 'xgb'}), on='id', how='left')
    else:
        oof['xgb'] = np.nan

    # Rank normalization helper
    def rank_norm(x):
        r = pd.Series(x).rank(method='average').values
        return (r - 1) / (len(r) - 1)

    oof['lgb_r'] = rank_norm(oof['lgb_ens'])
    if have_xgb:
        oof['xgb_r'] = rank_norm(oof['xgb'])

    best_auc = 0.0; best_mode = 'prob'; best_w = 1.0
    weights = np.arange(0.50, 0.95, 0.05) if have_xgb else [1.0]
    for w in weights:
        if have_xgb:
            blend_prob = w * oof['lgb_ens'] + (1 - w) * oof['xgb']
            blend_rank = w * oof['lgb_r'] + (1 - w) * oof['xgb_r']
        else:
            blend_prob = oof['lgb_ens']
            blend_rank = oof['lgb_r']
        auc_prob = roc_auc_score(oof['target'], blend_prob)
        if auc_prob > best_auc:
            best_auc = auc_prob; best_mode = 'prob'; best_w = float(w)
        auc_rank = roc_auc_score(oof['target'], blend_rank)
        if auc_rank > best_auc:
            best_auc = auc_rank; best_mode = 'rank'; best_w = float(w)

    log(f"Blend search best OOF AUC={best_auc:.6f} | mode={best_mode} | w_lgb={best_w:.2f}")

    # Build test ensemble
    if len(pred_lgb_list) == 0:
        log('No LGB seed test preds found. Cannot build submission.');
    else:
        pred_lgb_merged = pred_lgb_list[0]
        for df in pred_lgb_list[1:]:
            pred_lgb_merged = pred_lgb_merged.merge(df, on='id', how='inner')
        lgb_pred_cols = [c for c in pred_lgb_merged.columns if c != 'id']
        pred_lgb_merged['lgb_ens'] = pred_lgb_merged[lgb_pred_cols].mean(axis=1)

        pred_xgb = safe_load_pred('pred_xgb_seed42.csv', 'prediction_xgb') if have_xgb else None
        have_pred_xgb = pred_xgb is not None
        sub = pd.read_csv('sample_submission.csv')
        sub = sub[['id']].merge(pred_lgb_merged[['id','lgb_ens']], on='id', how='left')
        if best_mode == 'prob' and have_pred_xgb:
            sub = sub.merge(pred_xgb.rename(columns={'pred': 'xgb'}), on='id', how='left')
            sub['target'] = (best_w * sub['lgb_ens'] + (1 - best_w) * sub['xgb']).astype('float32')
        elif best_mode == 'rank' and have_pred_xgb:
            sub = sub.merge(pred_xgb.rename(columns={'pred': 'xgb'}), on='id', how='left')
            rl = pd.Series(sub['lgb_ens']).rank(method='average').values
            rx = pd.Series(sub['xgb']).rank(method='average').values
            rl = (rl - 1) / (len(rl) - 1); rx = (rx - 1) / (len(rx) - 1)
            sub['target'] = (best_w * rl + (1 - best_w) * rx).astype('float32')
        else:
            sub['target'] = sub['lgb_ens'].astype('float32')

        sub[['id','target']].to_csv('submission.csv', index=False)
        log('Saved blended submission.csv (or LGB-only if XGB missing).')

[2025-09-12 01:40:30] Preparing blend (LGB seeds + XGB) with OOF weight tuning...


[2025-09-12 01:40:36] Blend search best OOF AUC=0.991159 | mode=prob | w_lgb=0.90


[2025-09-12 01:40:36] Saved blended submission.csv (or LGB-only if XGB missing).


In [15]:
# Run LightGBM fast for seeds 1337 and 2025 sequentially on current 165-feature set
log('Starting LightGBM multi-seed training: seeds 1337 and 2025...')
train_lgb_fast_seed(1337, 0.72, 0.72, tag='s1337')
train_lgb_fast_seed(2025, 0.78, 0.78, tag='s2025')

[2025-09-11 23:58:35] Starting LightGBM multi-seed training: seeds 1337 and 2025...


[2025-09-11 23:58:36] [LGB multi] seed=1337 ff=0.72 bf=0.72 | Fold 1/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.978403	valid's auc: 0.975425


[400]	train's auc: 0.988473	valid's auc: 0.984865


[600]	train's auc: 0.992095	valid's auc: 0.987586


[800]	train's auc: 0.994129	valid's auc: 0.988623


[1000]	train's auc: 0.99557	valid's auc: 0.989274


[1200]	train's auc: 0.996655	valid's auc: 0.989636


[1400]	train's auc: 0.997501	valid's auc: 0.98989


[1600]	train's auc: 0.998159	valid's auc: 0.990098


[1800]	train's auc: 0.998655	valid's auc: 0.990174


[2000]	train's auc: 0.999031	valid's auc: 0.990273


[2200]	train's auc: 0.999322	valid's auc: 0.99038


[2400]	train's auc: 0.999533	valid's auc: 0.990467


[2600]	train's auc: 0.999686	valid's auc: 0.990512


[2800]	train's auc: 0.999795	valid's auc: 0.990602


[3000]	train's auc: 0.999869	valid's auc: 0.990658


[3200]	train's auc: 0.999919	valid's auc: 0.990687


[3400]	train's auc: 0.999952	valid's auc: 0.990732


[3600]	train's auc: 0.999973	valid's auc: 0.990738


[3800]	train's auc: 0.999985	valid's auc: 0.9908


[4000]	train's auc: 0.999992	valid's auc: 0.990823
Did not meet early stopping. Best iteration is:
[3989]	train's auc: 0.999991	valid's auc: 0.990826


[2025-09-12 00:03:56] [LGB multi] seed=1337 fold=0 AUC: 0.990826 | best_iter=3989 | elapsed=320.4s


[2025-09-12 00:03:59] [LGB multi] seed=1337 ff=0.72 bf=0.72 | Fold 2/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.977791	valid's auc: 0.97484


[400]	train's auc: 0.988395	valid's auc: 0.984786


[600]	train's auc: 0.992047	valid's auc: 0.987493


[800]	train's auc: 0.994136	valid's auc: 0.988685


[1000]	train's auc: 0.995555	valid's auc: 0.989225


[1200]	train's auc: 0.996636	valid's auc: 0.989537


[1400]	train's auc: 0.997489	valid's auc: 0.989803


[1600]	train's auc: 0.998128	valid's auc: 0.989947


[1800]	train's auc: 0.998637	valid's auc: 0.990052


[2000]	train's auc: 0.999018	valid's auc: 0.990169


[2200]	train's auc: 0.999311	valid's auc: 0.990249


[2400]	train's auc: 0.999528	valid's auc: 0.990299


[2600]	train's auc: 0.999683	valid's auc: 0.990358


[2800]	train's auc: 0.999793	valid's auc: 0.990413


Early stopping, best iteration is:
[2803]	train's auc: 0.999794	valid's auc: 0.990415


[2025-09-12 00:07:51] [LGB multi] seed=1337 fold=1 AUC: 0.990415 | best_iter=2803 | elapsed=232.5s


[2025-09-12 00:07:53] [LGB multi] seed=1337 ff=0.72 bf=0.72 | Fold 3/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.978201	valid's auc: 0.975057


[400]	train's auc: 0.988353	valid's auc: 0.984457


[600]	train's auc: 0.992186	valid's auc: 0.987582


[800]	train's auc: 0.994156	valid's auc: 0.988608


[1000]	train's auc: 0.995567	valid's auc: 0.989156


[1200]	train's auc: 0.996668	valid's auc: 0.989529


[1400]	train's auc: 0.997504	valid's auc: 0.989796


[1600]	train's auc: 0.998149	valid's auc: 0.989927


[1800]	train's auc: 0.998652	valid's auc: 0.990084


[2000]	train's auc: 0.999034	valid's auc: 0.990194


[2200]	train's auc: 0.999327	valid's auc: 0.99026


[2400]	train's auc: 0.999538	valid's auc: 0.990326


[2600]	train's auc: 0.999687	valid's auc: 0.990395


[2800]	train's auc: 0.999794	valid's auc: 0.990462


[3000]	train's auc: 0.99987	valid's auc: 0.990516


[3200]	train's auc: 0.999919	valid's auc: 0.990558


[3400]	train's auc: 0.999952	valid's auc: 0.990588


[3600]	train's auc: 0.999972	valid's auc: 0.990598


[3800]	train's auc: 0.999984	valid's auc: 0.990614


Early stopping, best iteration is:
[3713]	train's auc: 0.999979	valid's auc: 0.990626


[2025-09-12 00:12:53] [LGB multi] seed=1337 fold=2 AUC: 0.990626 | best_iter=3713 | elapsed=299.3s


[2025-09-12 00:12:55] [LGB multi] seed=1337 ff=0.72 bf=0.72 | Fold 4/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.977869	valid's auc: 0.974453


[400]	train's auc: 0.988321	valid's auc: 0.984458


[600]	train's auc: 0.992111	valid's auc: 0.987538


[800]	train's auc: 0.994162	valid's auc: 0.988634


[1000]	train's auc: 0.995593	valid's auc: 0.989216


[1200]	train's auc: 0.996651	valid's auc: 0.989539


[1400]	train's auc: 0.997501	valid's auc: 0.989827


[1600]	train's auc: 0.998139	valid's auc: 0.990034


[1800]	train's auc: 0.99865	valid's auc: 0.990205


[2000]	train's auc: 0.999027	valid's auc: 0.990278


[2200]	train's auc: 0.999312	valid's auc: 0.990369


[2400]	train's auc: 0.999525	valid's auc: 0.990443


[2600]	train's auc: 0.999683	valid's auc: 0.990482


[2800]	train's auc: 0.999787	valid's auc: 0.990527


[3000]	train's auc: 0.999862	valid's auc: 0.990593


[3200]	train's auc: 0.999914	valid's auc: 0.990647


[3400]	train's auc: 0.999948	valid's auc: 0.990694


[3600]	train's auc: 0.999969	valid's auc: 0.990732


[3800]	train's auc: 0.999982	valid's auc: 0.990797


[4000]	train's auc: 0.99999	valid's auc: 0.990825
Did not meet early stopping. Best iteration is:
[3991]	train's auc: 0.99999	valid's auc: 0.990827


[2025-09-12 00:18:08] [LGB multi] seed=1337 fold=3 AUC: 0.990827 | best_iter=3991 | elapsed=313.0s


[2025-09-12 00:18:11] [LGB multi] seed=1337 ff=0.72 bf=0.72 | Fold 5/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.978172	valid's auc: 0.974541


[400]	train's auc: 0.988382	valid's auc: 0.984493


[600]	train's auc: 0.992157	valid's auc: 0.987503


[800]	train's auc: 0.994136	valid's auc: 0.988496


[1000]	train's auc: 0.995569	valid's auc: 0.989083


[1200]	train's auc: 0.996648	valid's auc: 0.989416


[1400]	train's auc: 0.997472	valid's auc: 0.989652


[1600]	train's auc: 0.998133	valid's auc: 0.989829


[1800]	train's auc: 0.998637	valid's auc: 0.989989


[2000]	train's auc: 0.999021	valid's auc: 0.990136


[2200]	train's auc: 0.999313	valid's auc: 0.990259


[2400]	train's auc: 0.99953	valid's auc: 0.990334


[2600]	train's auc: 0.999686	valid's auc: 0.990411


[2800]	train's auc: 0.999795	valid's auc: 0.990482


[3000]	train's auc: 0.99987	valid's auc: 0.990525


[3200]	train's auc: 0.999921	valid's auc: 0.990564


[3400]	train's auc: 0.999952	valid's auc: 0.990618


[3600]	train's auc: 0.999972	valid's auc: 0.990655


[3800]	train's auc: 0.999985	valid's auc: 0.990681


[4000]	train's auc: 0.999992	valid's auc: 0.990721
Did not meet early stopping. Best iteration is:
[3996]	train's auc: 0.999992	valid's auc: 0.990722


[2025-09-12 00:23:29] [LGB multi] seed=1337 fold=4 AUC: 0.990722 | best_iter=3996 | elapsed=318.0s


[2025-09-12 00:23:32] [LGB multi] seed=1337 ff=0.72 bf=0.72 | Fold 6/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.978337	valid's auc: 0.974845


[400]	train's auc: 0.988392	valid's auc: 0.984492


[600]	train's auc: 0.992212	valid's auc: 0.987498


[800]	train's auc: 0.994192	valid's auc: 0.98852


[1000]	train's auc: 0.995594	valid's auc: 0.989052


[1200]	train's auc: 0.996658	valid's auc: 0.989355


[1400]	train's auc: 0.99749	valid's auc: 0.989612


[1600]	train's auc: 0.998137	valid's auc: 0.989798


[1800]	train's auc: 0.998637	valid's auc: 0.989937


[2000]	train's auc: 0.999026	valid's auc: 0.990023


[2200]	train's auc: 0.999321	valid's auc: 0.99012


[2400]	train's auc: 0.999531	valid's auc: 0.990187


[2600]	train's auc: 0.999686	valid's auc: 0.990225


[2800]	train's auc: 0.999795	valid's auc: 0.990336


[3000]	train's auc: 0.99987	valid's auc: 0.990379


[3200]	train's auc: 0.999919	valid's auc: 0.990433


[3400]	train's auc: 0.99995	valid's auc: 0.990489


[3600]	train's auc: 0.999972	valid's auc: 0.990524


[3800]	train's auc: 0.999983	valid's auc: 0.990565


[4000]	train's auc: 0.999991	valid's auc: 0.990594
Did not meet early stopping. Best iteration is:
[3988]	train's auc: 0.99999	valid's auc: 0.990598


[2025-09-12 00:28:48] [LGB multi] seed=1337 fold=5 AUC: 0.990598 | best_iter=3988 | elapsed=315.6s


[2025-09-12 00:28:51] [LGB multi] seed=1337 ff=0.72 bf=0.72 | Fold 7/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.978417	valid's auc: 0.976079


[400]	train's auc: 0.988416	valid's auc: 0.985259


[600]	train's auc: 0.992141	valid's auc: 0.988023


[800]	train's auc: 0.994169	valid's auc: 0.989092


[1000]	train's auc: 0.995578	valid's auc: 0.989631


[1200]	train's auc: 0.996649	valid's auc: 0.98995


[1400]	train's auc: 0.997489	valid's auc: 0.990228


[1600]	train's auc: 0.998136	valid's auc: 0.990378


[1800]	train's auc: 0.998635	valid's auc: 0.990511


[2000]	train's auc: 0.999027	valid's auc: 0.990657


[2200]	train's auc: 0.999314	valid's auc: 0.990763


[2400]	train's auc: 0.999524	valid's auc: 0.990859


[2600]	train's auc: 0.999679	valid's auc: 0.990904


[2800]	train's auc: 0.99979	valid's auc: 0.990928


[3000]	train's auc: 0.999865	valid's auc: 0.99098


[3200]	train's auc: 0.999915	valid's auc: 0.991017


[3400]	train's auc: 0.999947	valid's auc: 0.991073


[3600]	train's auc: 0.999968	valid's auc: 0.991129


[3800]	train's auc: 0.999982	valid's auc: 0.991141


[4000]	train's auc: 0.99999	valid's auc: 0.991156
Did not meet early stopping. Best iteration is:
[3992]	train's auc: 0.99999	valid's auc: 0.991158


[2025-09-12 00:34:03] [LGB multi] seed=1337 fold=6 AUC: 0.991158 | best_iter=3992 | elapsed=311.8s


[2025-09-12 00:34:06] [LGB multi] seed=1337 ff=0.72 bf=0.72 | Fold 8/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.978265	valid's auc: 0.974577


[400]	train's auc: 0.988304	valid's auc: 0.98423


[600]	train's auc: 0.992107	valid's auc: 0.987346


[800]	train's auc: 0.9941	valid's auc: 0.98837


[1000]	train's auc: 0.99555	valid's auc: 0.98902


[1200]	train's auc: 0.996632	valid's auc: 0.989339


[1400]	train's auc: 0.997466	valid's auc: 0.989547


[1600]	train's auc: 0.99813	valid's auc: 0.989759


[1800]	train's auc: 0.998638	valid's auc: 0.989931


[2000]	train's auc: 0.999021	valid's auc: 0.990056


[2200]	train's auc: 0.999313	valid's auc: 0.990146


[2400]	train's auc: 0.999527	valid's auc: 0.990231


[2600]	train's auc: 0.999683	valid's auc: 0.990286


[2800]	train's auc: 0.999793	valid's auc: 0.990364


[3000]	train's auc: 0.999869	valid's auc: 0.990404


[3200]	train's auc: 0.999917	valid's auc: 0.990437


[3400]	train's auc: 0.99995	valid's auc: 0.990457


[3600]	train's auc: 0.999971	valid's auc: 0.990499


[3800]	train's auc: 0.999984	valid's auc: 0.990535


[4000]	train's auc: 0.999991	valid's auc: 0.990571
Did not meet early stopping. Best iteration is:
[3976]	train's auc: 0.999991	valid's auc: 0.990573


[2025-09-12 00:39:28] [LGB multi] seed=1337 fold=7 AUC: 0.990573 | best_iter=3976 | elapsed=322.0s


[2025-09-12 00:39:31] [LGB multi] seed=1337 ff=0.72 bf=0.72 | Fold 9/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.978265	valid's auc: 0.975596


[400]	train's auc: 0.988347	valid's auc: 0.985124


[600]	train's auc: 0.992158	valid's auc: 0.988107


[800]	train's auc: 0.994167	valid's auc: 0.989164


[1000]	train's auc: 0.995569	valid's auc: 0.989709


[1200]	train's auc: 0.996641	valid's auc: 0.989989


[1400]	train's auc: 0.997475	valid's auc: 0.990242


[1600]	train's auc: 0.998131	valid's auc: 0.990445


[1800]	train's auc: 0.998639	valid's auc: 0.990555


[2000]	train's auc: 0.999023	valid's auc: 0.990648


[2200]	train's auc: 0.999313	valid's auc: 0.990771


[2400]	train's auc: 0.999525	valid's auc: 0.990869


[2600]	train's auc: 0.999679	valid's auc: 0.990933


[2800]	train's auc: 0.999793	valid's auc: 0.990955


[3000]	train's auc: 0.999866	valid's auc: 0.991016


[3200]	train's auc: 0.999916	valid's auc: 0.991055


[3400]	train's auc: 0.99995	valid's auc: 0.991106


[3600]	train's auc: 0.99997	valid's auc: 0.991163


[3800]	train's auc: 0.999983	valid's auc: 0.991173


[4000]	train's auc: 0.999991	valid's auc: 0.991207
Did not meet early stopping. Best iteration is:
[4000]	train's auc: 0.999991	valid's auc: 0.991207


[2025-09-12 00:44:30] [LGB multi] seed=1337 fold=8 AUC: 0.991207 | best_iter=4000 | elapsed=299.9s


[2025-09-12 00:44:33] [LGB multi] seed=1337 ff=0.72 bf=0.72 | Fold 10/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.978097	valid's auc: 0.975538


[400]	train's auc: 0.988337	valid's auc: 0.985431


[600]	train's auc: 0.992032	valid's auc: 0.98816


[800]	train's auc: 0.99407	valid's auc: 0.98919


[1000]	train's auc: 0.995504	valid's auc: 0.989734


[1200]	train's auc: 0.996609	valid's auc: 0.990101


[1400]	train's auc: 0.997437	valid's auc: 0.99027


[1600]	train's auc: 0.998105	valid's auc: 0.990476


[1800]	train's auc: 0.998614	valid's auc: 0.990544


[2000]	train's auc: 0.999007	valid's auc: 0.990663


[2200]	train's auc: 0.999297	valid's auc: 0.990798


[2400]	train's auc: 0.999516	valid's auc: 0.990871


[2600]	train's auc: 0.999675	valid's auc: 0.990937


[2800]	train's auc: 0.999786	valid's auc: 0.990991


[3000]	train's auc: 0.999866	valid's auc: 0.991029


[3200]	train's auc: 0.999918	valid's auc: 0.991072


[3400]	train's auc: 0.999952	valid's auc: 0.991106


[3600]	train's auc: 0.999972	valid's auc: 0.991149


[3800]	train's auc: 0.999985	valid's auc: 0.991197


[4000]	train's auc: 0.999992	valid's auc: 0.991237
Did not meet early stopping. Best iteration is:
[3997]	train's auc: 0.999992	valid's auc: 0.991238


[2025-09-12 00:49:52] [LGB multi] seed=1337 fold=9 AUC: 0.991238 | best_iter=3997 | elapsed=318.5s


[2025-09-12 00:49:55] [LGB multi] seed=1337 OOF AUC: 0.990812 | total_elapsed=3079.8s


[2025-09-12 00:49:56] [LGB multi] Saved OOF/test preds for s1337


[2025-09-12 00:49:56] [LGB multi] seed=2025 ff=0.78 bf=0.78 | Fold 1/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.9788	valid's auc: 0.975769


[400]	train's auc: 0.988695	valid's auc: 0.985027


[600]	train's auc: 0.992401	valid's auc: 0.987925


[800]	train's auc: 0.994338	valid's auc: 0.988889


[1000]	train's auc: 0.995711	valid's auc: 0.989448


[1200]	train's auc: 0.996753	valid's auc: 0.989815


[1400]	train's auc: 0.997566	valid's auc: 0.990062


[1600]	train's auc: 0.998186	valid's auc: 0.99028


[1800]	train's auc: 0.998677	valid's auc: 0.990441


[2000]	train's auc: 0.999047	valid's auc: 0.990582


[2200]	train's auc: 0.999333	valid's auc: 0.990686


[2400]	train's auc: 0.999535	valid's auc: 0.990791


[2600]	train's auc: 0.999685	valid's auc: 0.990852


[2800]	train's auc: 0.99979	valid's auc: 0.990909


[3000]	train's auc: 0.999867	valid's auc: 0.990938


[3200]	train's auc: 0.999917	valid's auc: 0.990984


[3400]	train's auc: 0.99995	valid's auc: 0.991025


[3600]	train's auc: 0.99997	valid's auc: 0.991077


[3800]	train's auc: 0.999983	valid's auc: 0.991109


[4000]	train's auc: 0.999991	valid's auc: 0.991145
Did not meet early stopping. Best iteration is:
[3998]	train's auc: 0.999991	valid's auc: 0.991146


[2025-09-12 00:55:14] [LGB multi] seed=2025 fold=0 AUC: 0.991146 | best_iter=3998 | elapsed=317.7s


[2025-09-12 00:55:17] [LGB multi] seed=2025 ff=0.78 bf=0.78 | Fold 2/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.978905	valid's auc: 0.976149


[400]	train's auc: 0.988661	valid's auc: 0.985163


[600]	train's auc: 0.992277	valid's auc: 0.987867


[800]	train's auc: 0.994251	valid's auc: 0.988845


[1000]	train's auc: 0.995669	valid's auc: 0.989481


[1200]	train's auc: 0.99671	valid's auc: 0.989834


[1400]	train's auc: 0.997508	valid's auc: 0.990073


[1600]	train's auc: 0.998151	valid's auc: 0.990261


[1800]	train's auc: 0.998647	valid's auc: 0.990472


[2000]	train's auc: 0.999026	valid's auc: 0.990553


[2200]	train's auc: 0.99931	valid's auc: 0.990628


[2400]	train's auc: 0.999524	valid's auc: 0.990734


[2600]	train's auc: 0.99968	valid's auc: 0.990807


[2800]	train's auc: 0.999787	valid's auc: 0.990841


[3000]	train's auc: 0.999861	valid's auc: 0.990886


[3200]	train's auc: 0.999912	valid's auc: 0.990935


[3400]	train's auc: 0.999948	valid's auc: 0.990972


[3600]	train's auc: 0.999969	valid's auc: 0.991005


[3800]	train's auc: 0.999982	valid's auc: 0.991016


[4000]	train's auc: 0.99999	valid's auc: 0.991041
Did not meet early stopping. Best iteration is:
[3933]	train's auc: 0.999988	valid's auc: 0.991046


[2025-09-12 01:00:31] [LGB multi] seed=2025 fold=1 AUC: 0.991046 | best_iter=3933 | elapsed=314.5s


[2025-09-12 01:00:34] [LGB multi] seed=2025 ff=0.78 bf=0.78 | Fold 3/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.978532	valid's auc: 0.97514


[400]	train's auc: 0.988473	valid's auc: 0.984586


[600]	train's auc: 0.992282	valid's auc: 0.987702


[800]	train's auc: 0.994314	valid's auc: 0.988873


[1000]	train's auc: 0.995704	valid's auc: 0.989402


[1200]	train's auc: 0.996755	valid's auc: 0.989787


[1400]	train's auc: 0.997554	valid's auc: 0.990086


[1600]	train's auc: 0.998184	valid's auc: 0.990285


[1800]	train's auc: 0.998673	valid's auc: 0.990413


[2000]	train's auc: 0.999045	valid's auc: 0.990523


[2200]	train's auc: 0.999331	valid's auc: 0.990595


[2400]	train's auc: 0.999539	valid's auc: 0.990689


[2600]	train's auc: 0.999688	valid's auc: 0.990771


[2800]	train's auc: 0.999797	valid's auc: 0.990784


[3000]	train's auc: 0.999867	valid's auc: 0.99085


[3200]	train's auc: 0.999916	valid's auc: 0.9909


[3400]	train's auc: 0.999949	valid's auc: 0.990931


[3600]	train's auc: 0.99997	valid's auc: 0.990964


[3800]	train's auc: 0.999983	valid's auc: 0.990982


[4000]	train's auc: 0.999991	valid's auc: 0.991009
Did not meet early stopping. Best iteration is:
[4000]	train's auc: 0.999991	valid's auc: 0.991009


[2025-09-12 01:05:46] [LGB multi] seed=2025 fold=2 AUC: 0.991009 | best_iter=4000 | elapsed=312.0s


[2025-09-12 01:05:49] [LGB multi] seed=2025 ff=0.78 bf=0.78 | Fold 4/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.979013	valid's auc: 0.975598


[400]	train's auc: 0.988605	valid's auc: 0.98477


[600]	train's auc: 0.992307	valid's auc: 0.987779


[800]	train's auc: 0.99429	valid's auc: 0.988845


[1000]	train's auc: 0.995667	valid's auc: 0.989328


[1200]	train's auc: 0.996721	valid's auc: 0.989752


[1400]	train's auc: 0.997517	valid's auc: 0.989941


[1600]	train's auc: 0.998162	valid's auc: 0.990151


[1800]	train's auc: 0.998656	valid's auc: 0.990297


[2000]	train's auc: 0.999032	valid's auc: 0.990438


[2200]	train's auc: 0.999315	valid's auc: 0.990511


[2400]	train's auc: 0.999524	valid's auc: 0.9906


[2600]	train's auc: 0.999677	valid's auc: 0.990685


[2800]	train's auc: 0.999788	valid's auc: 0.990757


[3000]	train's auc: 0.999862	valid's auc: 0.990829


[3200]	train's auc: 0.999913	valid's auc: 0.990896


[3400]	train's auc: 0.999946	valid's auc: 0.990955


[3600]	train's auc: 0.999967	valid's auc: 0.990994


[3800]	train's auc: 0.999981	valid's auc: 0.991023


[4000]	train's auc: 0.999989	valid's auc: 0.991052
Did not meet early stopping. Best iteration is:
[3929]	train's auc: 0.999987	valid's auc: 0.991058


[2025-09-12 01:11:05] [LGB multi] seed=2025 fold=3 AUC: 0.991058 | best_iter=3929 | elapsed=316.4s


[2025-09-12 01:11:08] [LGB multi] seed=2025 ff=0.78 bf=0.78 | Fold 5/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.979029	valid's auc: 0.975443


[400]	train's auc: 0.988745	valid's auc: 0.984803


[600]	train's auc: 0.992402	valid's auc: 0.987796


[800]	train's auc: 0.994353	valid's auc: 0.988872


[1000]	train's auc: 0.995722	valid's auc: 0.98943


[1200]	train's auc: 0.996754	valid's auc: 0.989792


[1400]	train's auc: 0.99756	valid's auc: 0.990045


[1600]	train's auc: 0.998178	valid's auc: 0.990254


[1800]	train's auc: 0.998666	valid's auc: 0.990452


[2000]	train's auc: 0.999044	valid's auc: 0.990577


[2200]	train's auc: 0.99933	valid's auc: 0.990637


[2400]	train's auc: 0.999538	valid's auc: 0.990707


[2600]	train's auc: 0.999688	valid's auc: 0.990775


[2800]	train's auc: 0.999796	valid's auc: 0.990848


[3000]	train's auc: 0.999871	valid's auc: 0.990897


[3200]	train's auc: 0.999921	valid's auc: 0.990969


[3400]	train's auc: 0.999952	valid's auc: 0.991011


Early stopping, best iteration is:
[3369]	train's auc: 0.999947	valid's auc: 0.991017


[2025-09-12 01:15:53] [LGB multi] seed=2025 fold=4 AUC: 0.991017 | best_iter=3369 | elapsed=285.0s


[2025-09-12 01:15:56] [LGB multi] seed=2025 ff=0.78 bf=0.78 | Fold 6/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.979014	valid's auc: 0.975619


[400]	train's auc: 0.988837	valid's auc: 0.984966


[600]	train's auc: 0.992425	valid's auc: 0.987768


[800]	train's auc: 0.994406	valid's auc: 0.988832


[1000]	train's auc: 0.995759	valid's auc: 0.989417


[1200]	train's auc: 0.996781	valid's auc: 0.989764


[1400]	train's auc: 0.997581	valid's auc: 0.990018


[1600]	train's auc: 0.998201	valid's auc: 0.990192


[1800]	train's auc: 0.998681	valid's auc: 0.990363


[2000]	train's auc: 0.999056	valid's auc: 0.99044


[2200]	train's auc: 0.999339	valid's auc: 0.99055


[2400]	train's auc: 0.999547	valid's auc: 0.990624


[2600]	train's auc: 0.999694	valid's auc: 0.990661


[2800]	train's auc: 0.999797	valid's auc: 0.990725


[3000]	train's auc: 0.999868	valid's auc: 0.990782


[3200]	train's auc: 0.999919	valid's auc: 0.990843


[3400]	train's auc: 0.99995	valid's auc: 0.990906


[3600]	train's auc: 0.99997	valid's auc: 0.990934


[3800]	train's auc: 0.999983	valid's auc: 0.990962


[4000]	train's auc: 0.999991	valid's auc: 0.990996
Did not meet early stopping. Best iteration is:
[3999]	train's auc: 0.99999	valid's auc: 0.990997


[2025-09-12 01:21:09] [LGB multi] seed=2025 fold=5 AUC: 0.990997 | best_iter=3999 | elapsed=312.9s


[2025-09-12 01:21:11] [LGB multi] seed=2025 ff=0.78 bf=0.78 | Fold 7/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.978825	valid's auc: 0.976291


[400]	train's auc: 0.988773	valid's auc: 0.985576


[600]	train's auc: 0.992348	valid's auc: 0.988315


[800]	train's auc: 0.994311	valid's auc: 0.989334


[1000]	train's auc: 0.995707	valid's auc: 0.989863


[1200]	train's auc: 0.99675	valid's auc: 0.990224


[1400]	train's auc: 0.997556	valid's auc: 0.990462


[1600]	train's auc: 0.9982	valid's auc: 0.990607


[1800]	train's auc: 0.998693	valid's auc: 0.990757


[2000]	train's auc: 0.999055	valid's auc: 0.990895


[2200]	train's auc: 0.999329	valid's auc: 0.990996


[2400]	train's auc: 0.999536	valid's auc: 0.991058


[2600]	train's auc: 0.999684	valid's auc: 0.991139


[2800]	train's auc: 0.99979	valid's auc: 0.99121


[3000]	train's auc: 0.999865	valid's auc: 0.99126


[3200]	train's auc: 0.999914	valid's auc: 0.991286


[3400]	train's auc: 0.999947	valid's auc: 0.991329


[3600]	train's auc: 0.999968	valid's auc: 0.991367


[3800]	train's auc: 0.999981	valid's auc: 0.991418


[4000]	train's auc: 0.999989	valid's auc: 0.991461
Did not meet early stopping. Best iteration is:
[4000]	train's auc: 0.999989	valid's auc: 0.991461


[2025-09-12 01:26:37] [LGB multi] seed=2025 fold=6 AUC: 0.991461 | best_iter=4000 | elapsed=325.1s


[2025-09-12 01:26:39] [LGB multi] seed=2025 ff=0.78 bf=0.78 | Fold 8/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.979082	valid's auc: 0.975534


[400]	train's auc: 0.988905	valid's auc: 0.984938


[600]	train's auc: 0.992427	valid's auc: 0.987714


[800]	train's auc: 0.994388	valid's auc: 0.988766


[1000]	train's auc: 0.995747	valid's auc: 0.989322


[1200]	train's auc: 0.996796	valid's auc: 0.98969


[1400]	train's auc: 0.997612	valid's auc: 0.99001


[1600]	train's auc: 0.998234	valid's auc: 0.990172


[1800]	train's auc: 0.998707	valid's auc: 0.990346


[2000]	train's auc: 0.999071	valid's auc: 0.990478


[2200]	train's auc: 0.99935	valid's auc: 0.990574


[2400]	train's auc: 0.999553	valid's auc: 0.990645


[2600]	train's auc: 0.999696	valid's auc: 0.990779


[2800]	train's auc: 0.999802	valid's auc: 0.990817


Early stopping, best iteration is:
[2684]	train's auc: 0.999746	valid's auc: 0.99082


[2025-09-12 01:30:20] [LGB multi] seed=2025 fold=7 AUC: 0.990820 | best_iter=2684 | elapsed=220.7s


[2025-09-12 01:30:22] [LGB multi] seed=2025 ff=0.78 bf=0.78 | Fold 9/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.978804	valid's auc: 0.976181


[400]	train's auc: 0.98875	valid's auc: 0.985585


[600]	train's auc: 0.992364	valid's auc: 0.988337


[800]	train's auc: 0.994306	valid's auc: 0.989325


[1000]	train's auc: 0.995654	valid's auc: 0.989817


[1200]	train's auc: 0.996695	valid's auc: 0.990158


[1400]	train's auc: 0.997518	valid's auc: 0.990495


[1600]	train's auc: 0.99815	valid's auc: 0.990735


[1800]	train's auc: 0.998651	valid's auc: 0.990915


[2000]	train's auc: 0.99902	valid's auc: 0.991022


[2200]	train's auc: 0.999303	valid's auc: 0.991084


[2400]	train's auc: 0.999515	valid's auc: 0.991165


[2600]	train's auc: 0.999667	valid's auc: 0.99122


[2800]	train's auc: 0.999777	valid's auc: 0.991296


[3000]	train's auc: 0.999856	valid's auc: 0.991319


[3200]	train's auc: 0.999908	valid's auc: 0.991388


[3400]	train's auc: 0.999944	valid's auc: 0.99143


[3600]	train's auc: 0.999966	valid's auc: 0.991454


[3800]	train's auc: 0.999981	valid's auc: 0.991497


[4000]	train's auc: 0.999989	valid's auc: 0.991517
Did not meet early stopping. Best iteration is:
[3970]	train's auc: 0.999989	valid's auc: 0.991521


[2025-09-12 01:35:37] [LGB multi] seed=2025 fold=8 AUC: 0.991521 | best_iter=3970 | elapsed=314.4s


[2025-09-12 01:35:39] [LGB multi] seed=2025 ff=0.78 bf=0.78 | Fold 10/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.978858	valid's auc: 0.976129


[400]	train's auc: 0.988604	valid's auc: 0.985372


[600]	train's auc: 0.992288	valid's auc: 0.988255


[800]	train's auc: 0.994203	valid's auc: 0.989122


[1000]	train's auc: 0.99564	valid's auc: 0.989778


[1200]	train's auc: 0.996687	valid's auc: 0.9901


[1400]	train's auc: 0.997508	valid's auc: 0.990341


[1600]	train's auc: 0.998146	valid's auc: 0.990539


[1800]	train's auc: 0.998638	valid's auc: 0.990704


[2000]	train's auc: 0.999016	valid's auc: 0.990805


[2200]	train's auc: 0.999307	valid's auc: 0.990893


[2400]	train's auc: 0.999522	valid's auc: 0.990986


[2600]	train's auc: 0.999672	valid's auc: 0.991048


[2800]	train's auc: 0.999782	valid's auc: 0.991108


[3000]	train's auc: 0.999858	valid's auc: 0.991159


[3200]	train's auc: 0.999913	valid's auc: 0.991201


[3400]	train's auc: 0.999948	valid's auc: 0.991228


Early stopping, best iteration is:
[3317]	train's auc: 0.999935	valid's auc: 0.991232


[2025-09-12 01:40:08] [LGB multi] seed=2025 fold=9 AUC: 0.991232 | best_iter=3317 | elapsed=268.2s


[2025-09-12 01:40:10] [LGB multi] seed=2025 OOF AUC: 0.991116 | total_elapsed=3014.1s


[2025-09-12 01:40:11] [LGB multi] Saved OOF/test preds for s2025


In [17]:
# Archive additional submissions and blend metadata
log('Creating additional submission variants and archiving blend metadata...')
import json

def rank_norm(x):
    r = pd.Series(x).rank(method='average').values
    return (r - 1) / (len(r) - 1)

# Load OOF for recomputing weights/metrics
gt = train[['id','target']].copy()
oof_lgb_paths = [
    ('oof_lgb_fast_s42.csv', 'oof_lgb_fast_s42'),
    ('oof_lgb_fast_s1337.csv', 'oof_lgb_fast_s1337'),
    ('oof_lgb_fast_s2025.csv', 'oof_lgb_fast_s2025'),
]
oof_list = []
for p, col in oof_lgb_paths:
    if os.path.exists(p):
        df = pd.read_csv(p)
        pred_col = col if col in df.columns else [c for c in df.columns if c != 'id'][0]
        oof_list.append(df[['id', pred_col]].rename(columns={pred_col: os.path.splitext(os.path.basename(p))[0]}))

if len(oof_list) == 0:
    log('Missing LGB OOF files; cannot create variants.')
else:
    oof_merged = oof_list[0]
    for df in oof_list[1:]:
        oof_merged = oof_merged.merge(df, on='id', how='inner')
    lgb_cols = [c for c in oof_merged.columns if c != 'id']
    oof_merged['lgb_ens'] = oof_merged[lgb_cols].mean(axis=1).astype('float32')
    oof = gt.merge(oof_merged[['id','lgb_ens']], on='id', how='left')
    # XGB OOF
    have_xgb = os.path.exists('oof_xgb_seed42.csv')
    if have_xgb:
        oof_xgb = pd.read_csv('oof_xgb_seed42.csv')
        xcol = 'oof_xgb' if 'oof_xgb' in oof_xgb.columns else [c for c in oof_xgb.columns if c != 'id'][0]
        oof = oof.merge(oof_xgb[['id', xcol]].rename(columns={xcol: 'xgb'}), on='id', how='left')
    else:
        oof['xgb'] = np.nan
    # Rank cols
    oof['lgb_r'] = rank_norm(oof['lgb_ens'])
    if have_xgb:
        oof['xgb_r'] = rank_norm(oof['xgb'])

    # Grid search per guidance: [0.60..0.85] step 0.05 then refine ±0.04 step 0.02
    best = {'auc': 0.0, 'mode': 'prob', 'w_lgb': 1.0}
    if have_xgb:
        coarse = np.arange(0.60, 0.85 + 1e-9, 0.05)
        cand = set(np.clip(np.concatenate([coarse,
                                           np.unique(np.round((coarse[:,None] + np.array([-0.04,-0.02,0.02,0.04]))
                                                              .ravel(), 2))]), 0.0, 1.0))
        weights = sorted(list(cand))
    else:
        weights = [1.0]
    for w in weights:
        if have_xgb:
            prob = w * oof['lgb_ens'] + (1 - w) * oof['xgb']
            rank = w * oof['lgb_r'] + (1 - w) * oof['xgb_r']
        else:
            prob = oof['lgb_ens']
            rank = oof['lgb_r']
        auc_p = roc_auc_score(oof['target'], prob)
        if auc_p > best['auc']:
            best.update({'auc': float(auc_p), 'mode': 'prob', 'w_lgb': float(w)})
        auc_r = roc_auc_score(oof['target'], rank)
        if auc_r > best['auc']:
            best.update({'auc': float(auc_r), 'mode': 'rank', 'w_lgb': float(w)})

    # Build test preds:
    # LGB 3-seed average
    pred_paths = [
        ('pred_lgb_fast_s42.csv', 'prediction_lgb_fast_s42'),
        ('pred_lgb_fast_s1337.csv', 'prediction_lgb_fast_s1337'),
        ('pred_lgb_fast_s2025.csv', 'prediction_lgb_fast_s2025'),
    ]
    preds = []
    for p, col in pred_paths:
        if os.path.exists(p):
            df = pd.read_csv(p)
            pcol = col if col in df.columns else [c for c in df.columns if c != 'id'][0]
            preds.append(df[['id', pcol]].rename(columns={pcol: os.path.splitext(os.path.basename(p))[0]}))
    if len(preds) < 3:
        log('Missing some LGB test predictions; skipping extra submissions.')
    else:
        pm = preds[0]
        for df in preds[1:]:
            pm = pm.merge(df, on='id', how='inner')
        lgb_pred_cols = [c for c in pm.columns if c != 'id']
        pm['lgb_ens'] = pm[lgb_pred_cols].mean(axis=1).astype('float32')

        # Save pure 3-seed LGB average submission
        ss = pd.read_csv('sample_submission.csv')[['id']].merge(pm[['id','lgb_ens']], on='id', how='left')
        ss = ss.rename(columns={'lgb_ens': 'target'})
        ss.to_csv('submission_lgb3.csv', index=False)
        log('Saved submission_lgb3.csv (pure LGB 3-seed average).')

        # Fixed rank blend 0.72/0.28 with XGB
        if have_xgb and os.path.exists('pred_xgb_seed42.csv'):
            pred_xgb = pd.read_csv('pred_xgb_seed42.csv')
            xcol = 'prediction_xgb' if 'prediction_xgb' in pred_xgb.columns else [c for c in pred_xgb.columns if c != 'id'][0]
            sub_fx = pd.read_csv('sample_submission.csv')[['id']].merge(pm[['id','lgb_ens']], on='id', how='left')
            sub_fx = sub_fx.merge(pred_xgb[['id', xcol]].rename(columns={xcol: 'xgb'}), on='id', how='left')
            rl = pd.Series(sub_fx['lgb_ens']).rank(method='average').values
            rx = pd.Series(sub_fx['xgb']).rank(method='average').values
            rl = (rl - 1) / (len(rl) - 1); rx = (rx - 1) / (len(rx) - 1)
            sub_fx['target'] = (0.72 * rl + 0.28 * rx).astype('float32')
            sub_fx[['id','target']].to_csv('submission_rank_072_028.csv', index=False)
            log('Saved submission_rank_072_028.csv (fixed rank 0.72/0.28).')
        else:
            log('XGB preds missing; skipped fixed rank 0.72/0.28 file.')

        # Archive current best submission.csv as submission_best.csv if present
        if os.path.exists('submission.csv'):
            sub_best = pd.read_csv('submission.csv')
            sub_best.to_csv('submission_best.csv', index=False)
            log('Archived current submission.csv as submission_best.csv')

    # Save blend metadata
    meta = {
        'best_oof_auc': best['auc'],
        'best_mode': best['mode'],
        'best_w_lgb': best['w_lgb'],
        'notes': 'Weights searched on OOF with LGB 3-seed ensemble vs XGB; additional variants saved.'
    }
    with open('blend_meta.json', 'w') as f:
        json.dump(meta, f, indent=2)
    log(f"Saved blend_meta.json: {meta}")

[2025-09-12 01:41:58] Creating additional submission variants and archiving blend metadata...


[2025-09-12 01:42:16] Saved submission_lgb3.csv (pure LGB 3-seed average).


[2025-09-12 01:42:16] Saved submission_rank_072_028.csv (fixed rank 0.72/0.28).


[2025-09-12 01:42:16] Archived current submission.csv as submission_best.csv


[2025-09-12 01:42:16] Saved blend_meta.json: {'best_oof_auc': 0.991134746878686, 'best_mode': 'prob', 'best_w_lgb': 0.89, 'notes': 'Weights searched on OOF with LGB 3-seed ensemble vs XGB; additional variants saved.'}


In [19]:
# Strictly rebuild current blended submission to ensure format compliance
log('Strictly rebuilding submission.csv from current blended predictions...')
assert os.path.exists('submission.csv'), 'submission.csv not found to rebuild'
ss = pd.read_csv('sample_submission.csv')[['id']]
sub_cur = pd.read_csv('submission.csv')
# Detect prediction column
pred_col = None
if list(sub_cur.columns) == ['id', 'target']:
    pred_col = 'target'
else:
    # find non-id column
    other = [c for c in sub_cur.columns if c != 'id']
    pred_col = other[0] if other else None
assert pred_col is not None, 'No prediction column found in submission.csv'
# Merge to enforce ID order and exact length
rebuilt = ss.merge(sub_cur[['id', pred_col]].rename(columns={pred_col: 'target'}), on='id', how='left')
missing = int(rebuilt['target'].isna().sum())
if missing > 0:
    raise ValueError(f'Missing predictions for {missing} test rows after strict rebuild')
# Enforce numeric type and [0,1] bounds
rebuilt['target'] = pd.to_numeric(rebuilt['target'], errors='coerce')
if rebuilt['target'].isna().any():
    raise ValueError('NaNs present after numeric coercion in target')
rebuilt['target'] = rebuilt['target'].astype('float64').clip(0.0, 1.0)
rebuilt = rebuilt[['id', 'target']]
rebuilt.to_csv('submission.csv', index=False)
log('submission.csv strictly rebuilt and saved (float64, clipped [0,1], aligned to sample_submission).')

[2025-09-12 01:43:11] Strictly rebuilding submission.csv from current blended predictions...


[2025-09-12 01:43:11] submission.csv strictly rebuilt and saved (float64, clipped [0,1], aligned to sample_submission).


In [21]:
# Pivot pipeline: f_27 identity map and compact numeric-only features (no f_27-derived) for unseen subset
log('Pivot: building f_27 identity map for seen rows and compact numeric-only features for unseen model...')
t0 = time.time()

# 1) f_27 -> target map from train; identify seen/unseen in test
assert 'f_27' in train.columns and 'target' in train.columns, 'Missing f_27 or target in train'
f27_target_map = train.groupby('f_27')['target'].first().to_dict()
test_seen_mask = test['f_27'].isin(f27_target_map)
n_seen = int(test_seen_mask.sum())
n_unseen = int((~test_seen_mask).sum())
log(f'Test seen f_27 rows: {n_seen} | unseen: {n_unseen}')
preds_seen = test.loc[test_seen_mask, 'f_27'].map(f27_target_map).astype('float32')

# 2) Define compact numeric-only feature set
base_num_cols = [c for c in train.columns if c.startswith('f_') and c != 'f_27' and (str(train[c].dtype).startswith('float') or str(train[c].dtype).startswith('int'))]
# Keep only original numeric features f_00..f_26 (not any previously engineered columns)
base_num_cols = [c for c in base_num_cols if len(c) == 4 and c[2:].isdigit()]
base_num_cols = sorted(base_num_cols)
log(f'Base numeric cols count: {len(base_num_cols)}')

# Build compact extra features: sorted row stats, equality count over selected numerics, parity bits
sel_eq_cols = [c for c in ['f_00','f_01','f_02','f_03','f_05','f_06','f_10','f_12','f_20','f_21','f_22','f_26'] if c in base_num_cols]
parity_cols = [c for c in ['f_00','f_02','f_10','f_20','f_21','f_26'] if c in base_num_cols]

def build_compact_block(df):
    vals = df[base_num_cols].astype('float32').values
    srt = np.sort(vals, axis=1)
    out = pd.DataFrame(index=df.index)
    out['comp_sorted_0'] = srt[:, 0].astype('float32')
    out['comp_sorted_1'] = srt[:, 1].astype('float32')
    out['comp_sorted_-1'] = srt[:, -1].astype('float32')
    out['comp_sorted_range'] = (srt[:, -1] - srt[:, 0]).astype('float32')
    # equality count over selected numerics
    if len(sel_eq_cols) >= 2:
        M = df[sel_eq_cols].astype('float32').values
        from itertools import combinations
        cnt = np.zeros(len(df), dtype=np.int16)
        for i, j in combinations(range(M.shape[1]), 2):
            cnt += (np.abs(M[:, i] - M[:, j]) < 1e-6)
        out['comp_num_equal_pairs'] = cnt
    else:
        out['comp_num_equal_pairs'] = 0
    # parity bits
    for c in parity_cols:
        out[f'comp_{c}_parity'] = (df[c].astype('int64') & 1).astype('int8')
    return out

comp_train = build_compact_block(train)
comp_test = build_compact_block(test)

# Track compact feature names explicitly
compact_feature_cols = list(comp_train.columns)
log(f'Compact feature count: {len(compact_feature_cols)}')

# 3) Persist helper artifacts in memory (globals) and basic masks for later cells
test_unseen_mask = ~test_seen_mask
test_unseen_f27 = set(test.loc[test_unseen_mask, 'f_27'].unique())
train_mask_unseen_vocab = train['f_27'].isin(test_unseen_f27).values  # for OOF weighting on unseen-like rows
log(f'Train rows in unseen test vocab: {int(train_mask_unseen_vocab.sum())} / {len(train)}')

# 4) Sanity: no NaNs, and ensure we do not include any f_27-derived feature
assert all(['f_27' not in c for c in compact_feature_cols]), 'Compact features contaminated with f_27-derived cols'
nan_tr = int(comp_train.isna().sum().sum()); nan_te = int(comp_test.isna().sum().sum())
assert nan_tr == 0 and nan_te == 0, f'NaNs in compact features: train {nan_tr}, test {nan_te}'

log(f'Pivot prep done in {time.time()-t0:.1f}s. Ready to train unseen-subset models.')

[2025-09-12 01:47:08] Pivot: building f_27 identity map for seen rows and compact numeric-only features for unseen model...


[2025-09-12 01:47:10] Test seen f_27 rows: 28435 | unseen: 71565


[2025-09-12 01:47:10] Base numeric cols count: 30


[2025-09-12 01:47:11] Compact feature count: 11


[2025-09-12 01:47:11] Train rows in unseen test vocab: 0 / 800000


[2025-09-12 01:47:11] Pivot prep done in 3.3s. Ready to train unseen-subset models.


In [22]:
# Train compact-feature LightGBM models (5 seeds) for unseen subset
log('Training compact-feature LightGBM models for unseen subset...')
import lightgbm as lgb

assert 'comp_sorted_0' in comp_train.columns, 'Compact features not built; run pivot prep cell first.'
Xc = comp_train[compact_feature_cols].copy()
yc = train['target'].values
Xc_test = comp_test[compact_feature_cols].copy()

# Reuse locked folds
folds_df = pd.read_csv('folds_10fold_seed42.csv')
folds_arr = folds_df['fold'].values
n_splits = len(np.unique(folds_arr))

def train_lgb_compact(seed=42, ff=0.75, bf=0.75, tag='c42'):
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'learning_rate': 0.045,
        'num_leaves': 160,
        'max_depth': -1,
        'min_data_in_leaf': 240,
        'feature_fraction': ff,
        'bagging_fraction': bf,
        'bagging_freq': 1,
        'lambda_l1': 0.0,
        'lambda_l2': 4.0,
        'max_bin': 127,
        'verbose': -1,
        'n_jobs': -1,
        'seed': seed,
        'feature_fraction_seed': seed,
        'bagging_seed': seed,
        'data_random_seed': seed
    }
    oof = np.zeros(len(Xc), dtype='float32')
    ptest = np.zeros(len(Xc_test), dtype='float32')
    t0 = time.time()
    for fold in range(n_splits):
        trn_idx = np.where(folds_arr != fold)[0]
        val_idx = np.where(folds_arr == fold)[0]
        log(f'[LGB compact] seed={seed} ff={ff} bf={bf} | Fold {fold+1}/{n_splits} | trn={len(trn_idx)} val={len(val_idx)}')
        dtr = lgb.Dataset(Xc.iloc[trn_idx], label=yc[trn_idx])
        dvl = lgb.Dataset(Xc.iloc[val_idx], label=yc[val_idx])
        model = lgb.train(
            params,
            dtr,
            num_boost_round=4000,
            valid_sets=[dtr, dvl],
            valid_names=['train','valid'],
            callbacks=[lgb.early_stopping(120), lgb.log_evaluation(200)]
        )
        oof[val_idx] = model.predict(Xc.iloc[val_idx], num_iteration=model.best_iteration)
        auc = roc_auc_score(yc[val_idx], oof[val_idx])
        log(f'[LGB compact] seed={seed} fold={fold} AUC: {auc:.6f} | best_iter={model.best_iteration}')
        ptest += model.predict(Xc_test, num_iteration=model.best_iteration) / n_splits
        del dtr, dvl, model; gc.collect()
    cv_auc = roc_auc_score(yc, oof);
    log(f'[LGB compact] seed={seed} OOF AUC: {cv_auc:.6f} | elapsed={time.time()-t0:.1f}s')
    pd.DataFrame({'id': train['id'], f'oof_lgb_comp_{tag}': oof}).to_csv(f'oof_lgb_comp_{tag}.csv', index=False)
    pd.DataFrame({'id': test['id'], f'prediction_lgb_comp_{tag}': ptest}).to_csv(f'pred_lgb_comp_{tag}.csv', index=False)
    log(f'[LGB compact] Saved OOF/test preds for {tag}')

# Run 5 seeds sequentially
seed_cfgs = [
    (42, 0.75, 0.75, 'c42'),
    (1337, 0.72, 0.72, 'c1337'),
    (2025, 0.78, 0.78, 'c2025'),
    (101, 0.70, 0.80, 'c101'),
    (999, 0.80, 0.70, 'c999'),
]
for s, ff, bf, tag in seed_cfgs:
    train_lgb_compact(s, ff, bf, tag)
log('All compact LGB seeds finished.')

[2025-09-12 01:48:00] Training compact-feature LightGBM models for unseen subset...


[2025-09-12 01:48:00] [LGB compact] seed=42 ff=0.75 bf=0.75 | Fold 1/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.609486	valid's auc: 0.577636


Early stopping, best iteration is:
[131]	train's auc: 0.601722	valid's auc: 0.578104
[2025-09-12 01:48:13] [LGB compact] seed=42 fold=0 AUC: 0.578104 | best_iter=131


[2025-09-12 01:48:13] [LGB compact] seed=42 ff=0.75 bf=0.75 | Fold 2/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


Early stopping, best iteration is:
[79]	train's auc: 0.595171	valid's auc: 0.580088
[2025-09-12 01:48:24] [LGB compact] seed=42 fold=1 AUC: 0.580088 | best_iter=79


[2025-09-12 01:48:24] [LGB compact] seed=42 ff=0.75 bf=0.75 | Fold 3/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.609564	valid's auc: 0.581314


Early stopping, best iteration is:
[99]	train's auc: 0.598061	valid's auc: 0.582287
[2025-09-12 01:48:36] [LGB compact] seed=42 fold=2 AUC: 0.582287 | best_iter=99


[2025-09-12 01:48:36] [LGB compact] seed=42 ff=0.75 bf=0.75 | Fold 4/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.609251	valid's auc: 0.581727


Early stopping, best iteration is:
[173]	train's auc: 0.606583	valid's auc: 0.582113
[2025-09-12 01:48:52] [LGB compact] seed=42 fold=3 AUC: 0.582113 | best_iter=173


[2025-09-12 01:48:52] [LGB compact] seed=42 ff=0.75 bf=0.75 | Fold 5/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.609298	valid's auc: 0.579087


Early stopping, best iteration is:
[186]	train's auc: 0.607921	valid's auc: 0.579287
[2025-09-12 01:49:08] [LGB compact] seed=42 fold=4 AUC: 0.579287 | best_iter=186


[2025-09-12 01:49:08] [LGB compact] seed=42 ff=0.75 bf=0.75 | Fold 6/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.60948	valid's auc: 0.580912


Early stopping, best iteration is:
[102]	train's auc: 0.598032	valid's auc: 0.581854
[2025-09-12 01:49:20] [LGB compact] seed=42 fold=5 AUC: 0.581854 | best_iter=102


[2025-09-12 01:49:20] [LGB compact] seed=42 ff=0.75 bf=0.75 | Fold 7/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.609217	valid's auc: 0.580377


Early stopping, best iteration is:
[109]	train's auc: 0.598965	valid's auc: 0.580787
[2025-09-12 01:49:32] [LGB compact] seed=42 fold=6 AUC: 0.580787 | best_iter=109


[2025-09-12 01:49:33] [LGB compact] seed=42 ff=0.75 bf=0.75 | Fold 8/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.609442	valid's auc: 0.579774


Early stopping, best iteration is:
[110]	train's auc: 0.599325	valid's auc: 0.580325
[2025-09-12 01:49:45] [LGB compact] seed=42 fold=7 AUC: 0.580325 | best_iter=110


[2025-09-12 01:49:45] [LGB compact] seed=42 ff=0.75 bf=0.75 | Fold 9/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.60956	valid's auc: 0.578827


Early stopping, best iteration is:
[129]	train's auc: 0.601731	valid's auc: 0.579521
[2025-09-12 01:49:58] [LGB compact] seed=42 fold=8 AUC: 0.579521 | best_iter=129


[2025-09-12 01:49:58] [LGB compact] seed=42 ff=0.75 bf=0.75 | Fold 10/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.609141	valid's auc: 0.578915


Early stopping, best iteration is:
[108]	train's auc: 0.599177	valid's auc: 0.579307
[2025-09-12 01:50:11] [LGB compact] seed=42 fold=9 AUC: 0.579307 | best_iter=108


[2025-09-12 01:50:11] [LGB compact] seed=42 OOF AUC: 0.580331 | elapsed=131.3s


[2025-09-12 01:50:12] [LGB compact] Saved OOF/test preds for c42


[2025-09-12 01:50:12] [LGB compact] seed=1337 ff=0.72 bf=0.72 | Fold 1/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.609531	valid's auc: 0.577488


Early stopping, best iteration is:
[108]	train's auc: 0.599024	valid's auc: 0.577968
[2025-09-12 01:50:24] [LGB compact] seed=1337 fold=0 AUC: 0.577968 | best_iter=108


[2025-09-12 01:50:25] [LGB compact] seed=1337 ff=0.72 bf=0.72 | Fold 2/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


Early stopping, best iteration is:
[48]	train's auc: 0.590484	valid's auc: 0.579822
[2025-09-12 01:50:34] [LGB compact] seed=1337 fold=1 AUC: 0.579822 | best_iter=48


[2025-09-12 01:50:34] [LGB compact] seed=1337 ff=0.72 bf=0.72 | Fold 3/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


Early stopping, best iteration is:
[57]	train's auc: 0.591646	valid's auc: 0.581417
[2025-09-12 01:50:43] [LGB compact] seed=1337 fold=2 AUC: 0.581417 | best_iter=57


[2025-09-12 01:50:43] [LGB compact] seed=1337 ff=0.72 bf=0.72 | Fold 4/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.609093	valid's auc: 0.581667


Early stopping, best iteration is:
[115]	train's auc: 0.599812	valid's auc: 0.582333
[2025-09-12 01:50:57] [LGB compact] seed=1337 fold=3 AUC: 0.582333 | best_iter=115


[2025-09-12 01:50:57] [LGB compact] seed=1337 ff=0.72 bf=0.72 | Fold 5/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.609495	valid's auc: 0.578302


Early stopping, best iteration is:
[123]	train's auc: 0.600822	valid's auc: 0.579041
[2025-09-12 01:51:10] [LGB compact] seed=1337 fold=4 AUC: 0.579041 | best_iter=123


[2025-09-12 01:51:10] [LGB compact] seed=1337 ff=0.72 bf=0.72 | Fold 6/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.60951	valid's auc: 0.580565


Early stopping, best iteration is:
[108]	train's auc: 0.598829	valid's auc: 0.581532
[2025-09-12 01:51:22] [LGB compact] seed=1337 fold=5 AUC: 0.581532 | best_iter=108


[2025-09-12 01:51:22] [LGB compact] seed=1337 ff=0.72 bf=0.72 | Fold 7/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.609035	valid's auc: 0.580065


Early stopping, best iteration is:
[97]	train's auc: 0.597228	valid's auc: 0.580405
[2025-09-12 01:51:34] [LGB compact] seed=1337 fold=6 AUC: 0.580405 | best_iter=97


[2025-09-12 01:51:34] [LGB compact] seed=1337 ff=0.72 bf=0.72 | Fold 8/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.609611	valid's auc: 0.579206


Early stopping, best iteration is:
[98]	train's auc: 0.597499	valid's auc: 0.579673
[2025-09-12 01:51:46] [LGB compact] seed=1337 fold=7 AUC: 0.579673 | best_iter=98


[2025-09-12 01:51:46] [LGB compact] seed=1337 ff=0.72 bf=0.72 | Fold 9/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.609457	valid's auc: 0.57924


Early stopping, best iteration is:
[99]	train's auc: 0.597688	valid's auc: 0.579549
[2025-09-12 01:51:58] [LGB compact] seed=1337 fold=8 AUC: 0.579549 | best_iter=99


[2025-09-12 01:51:58] [LGB compact] seed=1337 ff=0.72 bf=0.72 | Fold 10/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.609188	valid's auc: 0.579591


Early stopping, best iteration is:
[179]	train's auc: 0.606919	valid's auc: 0.579758
[2025-09-12 01:52:14] [LGB compact] seed=1337 fold=9 AUC: 0.579758 | best_iter=179


[2025-09-12 01:52:15] [LGB compact] seed=1337 OOF AUC: 0.580040 | elapsed=122.6s


[2025-09-12 01:52:16] [LGB compact] Saved OOF/test preds for c1337


[2025-09-12 01:52:16] [LGB compact] seed=2025 ff=0.78 bf=0.78 | Fold 1/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.61044	valid's auc: 0.577239


Early stopping, best iteration is:
[137]	train's auc: 0.603156	valid's auc: 0.577941
[2025-09-12 01:52:30] [LGB compact] seed=2025 fold=0 AUC: 0.577941 | best_iter=137


[2025-09-12 01:52:30] [LGB compact] seed=2025 ff=0.78 bf=0.78 | Fold 2/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.610443	valid's auc: 0.579654


Early stopping, best iteration is:
[93]	train's auc: 0.597599	valid's auc: 0.580516
[2025-09-12 01:52:41] [LGB compact] seed=2025 fold=1 AUC: 0.580516 | best_iter=93


[2025-09-12 01:52:41] [LGB compact] seed=2025 ff=0.78 bf=0.78 | Fold 3/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.610194	valid's auc: 0.579736


Early stopping, best iteration is:
[85]	train's auc: 0.596016	valid's auc: 0.580942
[2025-09-12 01:52:52] [LGB compact] seed=2025 fold=2 AUC: 0.580942 | best_iter=85


[2025-09-12 01:52:52] [LGB compact] seed=2025 ff=0.78 bf=0.78 | Fold 4/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.609778	valid's auc: 0.582079


Early stopping, best iteration is:
[182]	train's auc: 0.607864	valid's auc: 0.582398
[2025-09-12 01:53:08] [LGB compact] seed=2025 fold=3 AUC: 0.582398 | best_iter=182


[2025-09-12 01:53:09] [LGB compact] seed=2025 ff=0.78 bf=0.78 | Fold 5/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.610278	valid's auc: 0.578143


Early stopping, best iteration is:
[100]	train's auc: 0.598596	valid's auc: 0.578853
[2025-09-12 01:53:20] [LGB compact] seed=2025 fold=4 AUC: 0.578853 | best_iter=100


[2025-09-12 01:53:20] [LGB compact] seed=2025 ff=0.78 bf=0.78 | Fold 6/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.61052	valid's auc: 0.580501


Early stopping, best iteration is:
[89]	train's auc: 0.596594	valid's auc: 0.581692
[2025-09-12 01:53:31] [LGB compact] seed=2025 fold=5 AUC: 0.581692 | best_iter=89


[2025-09-12 01:53:31] [LGB compact] seed=2025 ff=0.78 bf=0.78 | Fold 7/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.60997	valid's auc: 0.580484


Early stopping, best iteration is:
[211]	train's auc: 0.611143	valid's auc: 0.580615
[2025-09-12 01:53:49] [LGB compact] seed=2025 fold=6 AUC: 0.580615 | best_iter=211


[2025-09-12 01:53:49] [LGB compact] seed=2025 ff=0.78 bf=0.78 | Fold 8/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.610114	valid's auc: 0.579787


Early stopping, best iteration is:
[126]	train's auc: 0.601515	valid's auc: 0.580163
[2025-09-12 01:54:02] [LGB compact] seed=2025 fold=7 AUC: 0.580163 | best_iter=126


[2025-09-12 01:54:03] [LGB compact] seed=2025 ff=0.78 bf=0.78 | Fold 9/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.610386	valid's auc: 0.578716


Early stopping, best iteration is:
[133]	train's auc: 0.602686	valid's auc: 0.579415
[2025-09-12 01:54:16] [LGB compact] seed=2025 fold=8 AUC: 0.579415 | best_iter=133


[2025-09-12 01:54:16] [LGB compact] seed=2025 ff=0.78 bf=0.78 | Fold 10/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.610518	valid's auc: 0.579309


Early stopping, best iteration is:
[121]	train's auc: 0.601182	valid's auc: 0.579793
[2025-09-12 01:54:29] [LGB compact] seed=2025 fold=9 AUC: 0.579793 | best_iter=121


[2025-09-12 01:54:29] [LGB compact] seed=2025 OOF AUC: 0.580202 | elapsed=133.3s


[2025-09-12 01:54:30] [LGB compact] Saved OOF/test preds for c2025


[2025-09-12 01:54:30] [LGB compact] seed=101 ff=0.7 bf=0.8 | Fold 1/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.609588	valid's auc: 0.577428


Early stopping, best iteration is:
[98]	train's auc: 0.598151	valid's auc: 0.578245
[2025-09-12 01:54:45] [LGB compact] seed=101 fold=0 AUC: 0.578245 | best_iter=98


[2025-09-12 01:54:45] [LGB compact] seed=101 ff=0.7 bf=0.8 | Fold 2/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


Early stopping, best iteration is:
[55]	train's auc: 0.591747	valid's auc: 0.581136
[2025-09-12 01:54:55] [LGB compact] seed=101 fold=1 AUC: 0.581136 | best_iter=55


[2025-09-12 01:54:55] [LGB compact] seed=101 ff=0.7 bf=0.8 | Fold 3/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


Early stopping, best iteration is:
[58]	train's auc: 0.591889	valid's auc: 0.582109
[2025-09-12 01:55:05] [LGB compact] seed=101 fold=2 AUC: 0.582109 | best_iter=58


[2025-09-12 01:55:05] [LGB compact] seed=101 ff=0.7 bf=0.8 | Fold 4/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.609362	valid's auc: 0.581418


Early stopping, best iteration is:
[127]	train's auc: 0.601365	valid's auc: 0.581903
[2025-09-12 01:55:18] [LGB compact] seed=101 fold=3 AUC: 0.581903 | best_iter=127


[2025-09-12 01:55:18] [LGB compact] seed=101 ff=0.7 bf=0.8 | Fold 5/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.609329	valid's auc: 0.578644


Early stopping, best iteration is:
[139]	train's auc: 0.602587	valid's auc: 0.579162
[2025-09-12 01:55:32] [LGB compact] seed=101 fold=4 AUC: 0.579162 | best_iter=139


[2025-09-12 01:55:32] [LGB compact] seed=101 ff=0.7 bf=0.8 | Fold 6/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


Early stopping, best iteration is:
[78]	train's auc: 0.594973	valid's auc: 0.581856
[2025-09-12 01:55:43] [LGB compact] seed=101 fold=5 AUC: 0.581856 | best_iter=78


[2025-09-12 01:55:43] [LGB compact] seed=101 ff=0.7 bf=0.8 | Fold 7/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.609444	valid's auc: 0.580256


Early stopping, best iteration is:
[116]	train's auc: 0.600255	valid's auc: 0.580501
[2025-09-12 01:55:55] [LGB compact] seed=101 fold=6 AUC: 0.580501 | best_iter=116


[2025-09-12 01:55:55] [LGB compact] seed=101 ff=0.7 bf=0.8 | Fold 8/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.609275	valid's auc: 0.580162


Early stopping, best iteration is:
[101]	train's auc: 0.598257	valid's auc: 0.580575
[2025-09-12 01:56:07] [LGB compact] seed=101 fold=7 AUC: 0.580575 | best_iter=101


[2025-09-12 01:56:08] [LGB compact] seed=101 ff=0.7 bf=0.8 | Fold 9/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.609405	valid's auc: 0.579277


Early stopping, best iteration is:
[104]	train's auc: 0.598685	valid's auc: 0.579895
[2025-09-12 01:56:20] [LGB compact] seed=101 fold=8 AUC: 0.579895 | best_iter=104


[2025-09-12 01:56:20] [LGB compact] seed=101 ff=0.7 bf=0.8 | Fold 10/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.60935	valid's auc: 0.57881


Early stopping, best iteration is:
[124]	train's auc: 0.601065	valid's auc: 0.579251
[2025-09-12 01:56:33] [LGB compact] seed=101 fold=9 AUC: 0.579251 | best_iter=124


[2025-09-12 01:56:33] [LGB compact] seed=101 OOF AUC: 0.580364 | elapsed=122.8s


[2025-09-12 01:56:34] [LGB compact] Saved OOF/test preds for c101


[2025-09-12 01:56:34] [LGB compact] seed=999 ff=0.8 bf=0.7 | Fold 1/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.61018	valid's auc: 0.577272


Early stopping, best iteration is:
[133]	train's auc: 0.602579	valid's auc: 0.577679
[2025-09-12 01:56:48] [LGB compact] seed=999 fold=0 AUC: 0.577679 | best_iter=133


[2025-09-12 01:56:48] [LGB compact] seed=999 ff=0.8 bf=0.7 | Fold 2/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.610242	valid's auc: 0.57919
Early stopping, best iteration is:
[82]	train's auc: 0.595785	valid's auc: 0.580925
[2025-09-12 01:56:58] [LGB compact] seed=999 fold=1 AUC: 0.580925 | best_iter=82


[2025-09-12 01:56:59] [LGB compact] seed=999 ff=0.8 bf=0.7 | Fold 3/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


Early stopping, best iteration is:
[65]	train's auc: 0.592963	valid's auc: 0.581879
[2025-09-12 01:57:09] [LGB compact] seed=999 fold=2 AUC: 0.581879 | best_iter=65


[2025-09-12 01:57:09] [LGB compact] seed=999 ff=0.8 bf=0.7 | Fold 4/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.609596	valid's auc: 0.582257


Early stopping, best iteration is:
[101]	train's auc: 0.597932	valid's auc: 0.582404
[2025-09-12 01:57:21] [LGB compact] seed=999 fold=3 AUC: 0.582404 | best_iter=101


[2025-09-12 01:57:21] [LGB compact] seed=999 ff=0.8 bf=0.7 | Fold 5/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.609869	valid's auc: 0.578344


Early stopping, best iteration is:
[97]	train's auc: 0.597797	valid's auc: 0.579266
[2025-09-12 01:57:33] [LGB compact] seed=999 fold=4 AUC: 0.579266 | best_iter=97


[2025-09-12 01:57:33] [LGB compact] seed=999 ff=0.8 bf=0.7 | Fold 6/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


Early stopping, best iteration is:
[77]	train's auc: 0.5947	valid's auc: 0.581514
[2025-09-12 01:57:44] [LGB compact] seed=999 fold=5 AUC: 0.581514 | best_iter=77


[2025-09-12 01:57:44] [LGB compact] seed=999 ff=0.8 bf=0.7 | Fold 7/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.609966	valid's auc: 0.579524


Early stopping, best iteration is:
[90]	train's auc: 0.596715	valid's auc: 0.580043
[2025-09-12 01:57:55] [LGB compact] seed=999 fold=6 AUC: 0.580043 | best_iter=90


[2025-09-12 01:57:55] [LGB compact] seed=999 ff=0.8 bf=0.7 | Fold 8/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.610052	valid's auc: 0.579106


Early stopping, best iteration is:
[100]	train's auc: 0.598141	valid's auc: 0.57977
[2025-09-12 01:58:07] [LGB compact] seed=999 fold=7 AUC: 0.579770 | best_iter=100


[2025-09-12 01:58:07] [LGB compact] seed=999 ff=0.8 bf=0.7 | Fold 9/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.609697	valid's auc: 0.578595


Early stopping, best iteration is:
[102]	train's auc: 0.598263	valid's auc: 0.579386
[2025-09-12 01:58:19] [LGB compact] seed=999 fold=8 AUC: 0.579386 | best_iter=102


[2025-09-12 01:58:19] [LGB compact] seed=999 ff=0.8 bf=0.7 | Fold 10/10 | trn=720000 val=80000


Training until validation scores don't improve for 120 rounds


[200]	train's auc: 0.609776	valid's auc: 0.579659


Early stopping, best iteration is:
[164]	train's auc: 0.605749	valid's auc: 0.579906
[2025-09-12 01:58:34] [LGB compact] seed=999 fold=9 AUC: 0.579906 | best_iter=164


[2025-09-12 01:58:34] [LGB compact] seed=999 OOF AUC: 0.580228 | elapsed=120.3s


[2025-09-12 01:58:35] [LGB compact] Saved OOF/test preds for c999


[2025-09-12 01:58:35] All compact LGB seeds finished.


In [23]:
# Compact-feature XGBoost (CPU, hist) training for unseen subset diversity
log('Training compact-feature XGBoost (CPU, hist) for unseen subset...')
t0_all = time.time()
try:
    import xgboost as xgb
except ImportError:
    import sys, subprocess
    log('XGBoost not found. Installing...')
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'xgboost'])
    import xgboost as xgb

assert 'comp_sorted_0' in comp_train.columns, 'Compact features not built; run pivot prep first.'
Xc = comp_train[compact_feature_cols].copy()
yc = train['target'].values
Xc_test = comp_test[compact_feature_cols].copy()

# Folds (locked)
folds_df = pd.read_csv('folds_10fold_seed42.csv')
folds_arr = folds_df['fold'].values
n_splits = len(np.unique(folds_arr))

# Params (compact set, slightly deeper for diversity)
xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': 'hist',
    'max_bin': 128,
    'eta': 0.045,
    'max_depth': 8,
    'min_child_weight': 80,
    'subsample': 0.78,
    'colsample_bytree': 0.9,
    'lambda': 3.0,
    'nthread': 36,
    'seed': 42
}
num_boost_round = 3000
early_stopping_rounds = 150

oof_xgb_c = np.zeros(len(Xc), dtype='float32')
pred_test_xgb_c = np.zeros(len(Xc_test), dtype='float32')

for fold in range(n_splits):
    fold_t0 = time.time()
    trn_idx = np.where(folds_arr != fold)[0]
    val_idx = np.where(folds_arr == fold)[0]
    log(f'[XGB compact] Fold {fold+1}/{n_splits} | trn={len(trn_idx)} val={len(val_idx)}')
    dtrain = xgb.DMatrix(Xc.iloc[trn_idx], label=yc[trn_idx])
    dvalid = xgb.DMatrix(Xc.iloc[val_idx], label=yc[val_idx])
    dtest = xgb.DMatrix(Xc_test)
    bst = xgb.train(
        params=xgb_params,
        dtrain=dtrain,
        num_boost_round=num_boost_round,
        evals=[(dtrain, 'train'), (dvalid, 'valid')],
        early_stopping_rounds=early_stopping_rounds,
        verbose_eval=200
    )
    best_iter = None
    try:
        attrs = bst.attributes()
        if 'best_iteration' in attrs:
            best_iter = int(attrs['best_iteration'])
    except Exception:
        best_iter = getattr(bst, 'best_iteration', None)
    if best_iter is not None and best_iter >= 0:
        oof_pred = bst.predict(dvalid, iteration_range=(0, best_iter + 1))
    else:
        oof_pred = bst.predict(dvalid)
    oof_xgb_c[val_idx] = oof_pred.astype('float32')
    fold_auc = roc_auc_score(yc[val_idx], oof_xgb_c[val_idx])
    log(f'[XGB compact] Fold {fold} AUC: {fold_auc:.6f} | best_iter={best_iter} | elapsed={time.time()-fold_t0:.1f}s')
    if best_iter is not None and best_iter >= 0:
        pred_te = bst.predict(dtest, iteration_range=(0, best_iter + 1))
    else:
        pred_te = bst.predict(dtest)
    pred_test_xgb_c += pred_te.astype('float32') / n_splits
    del dtrain, dvalid, dtest, bst
    gc.collect()

cv_auc_xgb_c = roc_auc_score(yc, oof_xgb_c)
log(f'[XGB compact] OOF AUC: {cv_auc_xgb_c:.6f} | total_elapsed={time.time()-t0_all:.1f}s')
pd.DataFrame({'id': train['id'], 'oof_xgb_comp': oof_xgb_c}).to_csv('oof_xgb_comp.csv', index=False)
pd.DataFrame({'id': test['id'], 'prediction_xgb_comp': pred_test_xgb_c}).to_csv('pred_xgb_comp.csv', index=False)
log('[XGB compact] Saved OOF and test predictions.')

[2025-09-12 01:58:44] Training compact-feature XGBoost (CPU, hist) for unseen subset...


[2025-09-12 01:58:45] [XGB compact] Fold 1/10 | trn=720000 val=80000


[0]	train-auc:0.57450	valid-auc:0.56864


[200]	train-auc:0.59887	valid-auc:0.57803


[400]	train-auc:0.60925	valid-auc:0.57773


[440]	train-auc:0.61109	valid-auc:0.57731


[2025-09-12 01:59:00] [XGB compact] Fold 0 AUC: 0.578115 | best_iter=291 | elapsed=15.2s


[2025-09-12 01:59:00] [XGB compact] Fold 2/10 | trn=720000 val=80000


[0]	train-auc:0.57421	valid-auc:0.57229


[200]	train-auc:0.59908	valid-auc:0.58011


[250]	train-auc:0.60207	valid-auc:0.57973


[2025-09-12 01:59:09] [XGB compact] Fold 1 AUC: 0.580441 | best_iter=100 | elapsed=9.3s


[2025-09-12 01:59:09] [XGB compact] Fold 3/10 | trn=720000 val=80000


[0]	train-auc:0.57401	valid-auc:0.57286


[200]	train-auc:0.59909	valid-auc:0.58029


[230]	train-auc:0.60101	valid-auc:0.58015


[2025-09-12 01:59:18] [XGB compact] Fold 2 AUC: 0.581230 | best_iter=80 | elapsed=8.1s


[2025-09-12 01:59:18] [XGB compact] Fold 4/10 | trn=720000 val=80000


[0]	train-auc:0.57435	valid-auc:0.57162


[200]	train-auc:0.59916	valid-auc:0.58260


[318]	train-auc:0.60571	valid-auc:0.58207


[2025-09-12 01:59:29] [XGB compact] Fold 3 AUC: 0.582696 | best_iter=169 | elapsed=11.2s


[2025-09-12 01:59:29] [XGB compact] Fold 5/10 | trn=720000 val=80000


[0]	train-auc:0.57447	valid-auc:0.56817


[200]	train-auc:0.59899	valid-auc:0.57822


[284]	train-auc:0.60411	valid-auc:0.57767


[2025-09-12 01:59:39] [XGB compact] Fold 4 AUC: 0.578766 | best_iter=134 | elapsed=9.9s


[2025-09-12 01:59:39] [XGB compact] Fold 6/10 | trn=720000 val=80000


[0]	train-auc:0.57426	valid-auc:0.57299


[200]	train-auc:0.59926	valid-auc:0.58177


[253]	train-auc:0.60252	valid-auc:0.58109


[2025-09-12 01:59:48] [XGB compact] Fold 5 AUC: 0.582276 | best_iter=104 | elapsed=9.0s


[2025-09-12 01:59:48] [XGB compact] Fold 7/10 | trn=720000 val=80000


[0]	train-auc:0.57444	valid-auc:0.57147


[200]	train-auc:0.59895	valid-auc:0.58031


[274]	train-auc:0.60332	valid-auc:0.57981


[2025-09-12 01:59:58] [XGB compact] Fold 6 AUC: 0.580638 | best_iter=125 | elapsed=9.6s


[2025-09-12 01:59:58] [XGB compact] Fold 8/10 | trn=720000 val=80000


[0]	train-auc:0.57410	valid-auc:0.57185


[200]	train-auc:0.59901	valid-auc:0.58013


[303]	train-auc:0.60445	valid-auc:0.57978


[2025-09-12 02:00:09] [XGB compact] Fold 7 AUC: 0.580358 | best_iter=154 | elapsed=11.0s


[2025-09-12 02:00:09] [XGB compact] Fold 9/10 | trn=720000 val=80000


[0]	train-auc:0.57481	valid-auc:0.56998


[200]	train-auc:0.59946	valid-auc:0.57896


[288]	train-auc:0.60457	valid-auc:0.57842


[2025-09-12 02:00:19] [XGB compact] Fold 8 AUC: 0.579211 | best_iter=138 | elapsed=10.0s


[2025-09-12 02:00:19] [XGB compact] Fold 10/10 | trn=720000 val=80000


[0]	train-auc:0.57467	valid-auc:0.56825


[200]	train-auc:0.59888	valid-auc:0.57909


[400]	train-auc:0.60881	valid-auc:0.57882


[444]	train-auc:0.61080	valid-auc:0.57854


[2025-09-12 02:00:34] [XGB compact] Fold 9 AUC: 0.579440 | best_iter=294 | elapsed=15.2s


[2025-09-12 02:00:35] [XGB compact] OOF AUC: 0.580264 | total_elapsed=110.3s


[2025-09-12 02:00:36] [XGB compact] Saved OOF and test predictions.


In [24]:
# Assemble final submission using f_27 identity for seen and compact-model ensemble for unseen
log('Assembling final submission: f_27 identity for seen; compact-model ensemble for unseen...')
import json

# 1) Recompute f_27 -> target map and seen/unseen masks
f27_target_map = train.groupby('f_27')['target'].first().to_dict()
test_seen_mask = test['f_27'].isin(f27_target_map).values
test_unseen_mask = ~test_seen_mask
log(f'Seen test rows: {int(test_seen_mask.sum())} | Unseen: {int(test_unseen_mask.sum())}')
preds_seen = test.loc[test_seen_mask, 'f_27'].map(f27_target_map).astype('float32').values

# 2) Load compact LGB OOF/test preds and form ensemble
def load_comp_oof(path):
    if not os.path.exists(path):
        return None
    df = pd.read_csv(path)
    cols = [c for c in df.columns if c != 'id']
    if len(cols) != 1:
        return None
    return df.rename(columns={cols[0]: 'pred'})

def load_comp_pred(path):
    if not os.path.exists(path):
        return None
    df = pd.read_csv(path)
    cols = [c for c in df.columns if c != 'id']
    if len(cols) != 1:
        return None
    return df.rename(columns={cols[0]: 'pred'})

lgb_tags = ['c42','c1337','c2025','c101','c999']
oofs = []; preds = []
for tag in lgb_tags:
    po = load_comp_oof(f'oof_lgb_comp_{tag}.csv')
    pt = load_comp_pred(f'pred_lgb_comp_{tag}.csv')
    if (po is not None) and (pt is not None):
        oofs.append(po.rename(columns={'pred': f'oof_{tag}'}))
        preds.append(pt.rename(columns={'pred': f'pred_{tag}'}))

if len(oofs) == 0 or len(preds) == 0:
    raise RuntimeError('Missing compact LGB artifacts; train compact models first.')

oof_lgb = oofs[0]
for df in oofs[1:]:
    oof_lgb = oof_lgb.merge(df, on='id', how='inner')
pred_lgb = preds[0]
for df in preds[1:]:
    pred_lgb = pred_lgb.merge(df, on='id', how='inner')
lgb_oof_cols = [c for c in oof_lgb.columns if c != 'id']
lgb_pred_cols = [c for c in pred_lgb.columns if c != 'id']
oof_lgb['lgb_ens'] = oof_lgb[lgb_oof_cols].mean(axis=1).astype('float32')
pred_lgb['lgb_ens'] = pred_lgb[lgb_pred_cols].mean(axis=1).astype('float32')

# 3) Load compact XGB OOF/test if available
oof_xgb = load_comp_oof('oof_xgb_comp.csv')
pred_xgb = load_comp_pred('pred_xgb_comp.csv')
have_xgb = (oof_xgb is not None) and (pred_xgb is not None)

# 4) Tune LGB/XGB weight on OOF (global fallback if no unseen-like train rows)
gt = train[['id','target']].copy()
oof = gt.merge(oof_lgb[['id','lgb_ens']], on='id', how='left')
if have_xgb:
    oof = oof.merge(oof_xgb.rename(columns={'pred':'xgb'}), on='id', how='left')
else:
    oof['xgb'] = np.nan

def rank_norm(x):
    r = pd.Series(x).rank(method='average').values
    return (r - 1) / (len(r) - 1)

oof['lgb_r'] = rank_norm(oof['lgb_ens'])
if have_xgb:
    oof['xgb_r'] = rank_norm(oof['xgb'])

best = {'auc': 0.0, 'mode': 'prob', 'w_lgb': 1.0}
if have_xgb:
    coarse = np.arange(0.70, 0.85 + 1e-9, 0.05)
    refine = np.unique(np.round((coarse[:,None] + np.array([-0.04,-0.02,0.02,0.04])).ravel(), 2))
    weights = sorted(list(set(np.clip(np.concatenate([coarse, refine]), 0.0, 1.0))))
else:
    weights = [1.0]
for w in weights:
    if have_xgb:
        prob = w * oof['lgb_ens'] + (1 - w) * oof['xgb']
        rank = w * oof['lgb_r'] + (1 - w) * oof['xgb_r']
    else:
        prob = oof['lgb_ens']
        rank = oof['lgb_r']
    auc_p = roc_auc_score(oof['target'], prob)
    if auc_p > best['auc']:
        best.update({'auc': float(auc_p), 'mode': 'prob', 'w_lgb': float(w)})
    auc_r = roc_auc_score(oof['target'], rank)
    if auc_r > best['auc']:
        best.update({'auc': float(auc_r), 'mode': 'rank', 'w_lgb': float(w)})
log(f"[Unseen blend] Best OOF AUC={best['auc']:.6f} | mode={best['mode']} | w_lgb={best['w_lgb']}")

# 5) Build unseen test predictions
sub = pd.read_csv('sample_submission.csv')[['id']]
sub = sub.merge(pred_lgb[['id','lgb_ens']], on='id', how='left')
if have_xgb:
    sub = sub.merge(pred_xgb.rename(columns={'pred':'xgb'}), on='id', how='left')
if best['mode'] == 'prob' or not have_xgb:
    unseen_pred = sub['lgb_ens'].values if not have_xgb else (best['w_lgb'] * sub['lgb_ens'].values + (1 - best['w_lgb']) * sub['xgb'].values)
else:
    rl = pd.Series(sub['lgb_ens']).rank(method='average').values
    rx = pd.Series(sub['xgb']).rank(method='average').values
    rl = (rl - 1) / (len(rl) - 1); rx = (rx - 1) / (len(rx) - 1)
    unseen_pred = (best['w_lgb'] * rl + (1 - best['w_lgb']) * rx)
unseen_pred = unseen_pred.astype('float32')

# 6) Assemble final submission: seen via map, unseen via blended preds
final = pd.read_csv('sample_submission.csv')[['id']].copy()
final['target'] = np.zeros(len(final), dtype='float32')
final.loc[test_seen_mask, 'target'] = preds_seen
final.loc[test_unseen_mask, 'target'] = unseen_pred[test_unseen_mask]

# 7) Sanity checks and save
if final['target'].isna().any():
    raise ValueError('NaNs found in final target after assembly')
final['target'] = final['target'].astype('float64').clip(0.0, 1.0)
final.to_csv('submission.csv', index=False)
log('Saved submission.csv (seen from f_27 map, unseen from blended compact ensemble).')

# Extra variants: pure LGB unseen and fixed 0.75/0.25 prob blend for unseen
final_lgb = final.copy()
final_lgb.loc[test_unseen_mask, 'target'] = sub.loc[test_unseen_mask, 'lgb_ens'].astype('float32')
final_lgb['target'] = final_lgb['target'].astype('float64').clip(0.0,1.0)
final_lgb.to_csv('submission_unseen_lgb.csv', index=False)
log('Saved submission_unseen_lgb.csv (unseen from LGB-avg only).')
if have_xgb:
    final_fx = final.copy()
    fx = (0.75 * sub['lgb_ens'].values + 0.25 * (sub['xgb'].values if have_xgb else 0.0)).astype('float32')
    final_fx.loc[test_unseen_mask, 'target'] = fx[test_unseen_mask]
    final_fx['target'] = final_fx['target'].astype('float64').clip(0.0,1.0)
    final_fx.to_csv('submission_unseen_prob_075_025.csv', index=False)
    log('Saved submission_unseen_prob_075_025.csv (unseen prob 0.75/0.25).')

# Save meta
with open('blend_unseen_meta.json','w') as f:
    json.dump(best, f, indent=2)
log(f"Saved blend_unseen_meta.json: {best}")

[2025-09-12 02:00:51] Assembling final submission: f_27 identity for seen; compact-model ensemble for unseen...


[2025-09-12 02:00:53] Seen test rows: 28435 | Unseen: 71565


[2025-09-12 02:01:07] [Unseen blend] Best OOF AUC=0.581052 | mode=prob | w_lgb=0.66


[2025-09-12 02:01:07] Saved submission.csv (seen from f_27 map, unseen from blended compact ensemble).


[2025-09-12 02:01:07] Saved submission_unseen_lgb.csv (unseen from LGB-avg only).


[2025-09-12 02:01:07] Saved submission_unseen_prob_075_025.csv (unseen prob 0.75/0.25).


[2025-09-12 02:01:07] Saved blend_unseen_meta.json: {'auc': 0.581052351856858, 'mode': 'prob', 'w_lgb': 0.66}


In [25]:
# Hybrid assembly: f_27 identity for seen; best LGB(3-seed)+XGB blend for unseen (full 165-feature models)
log('Building hybrid submission: identity map for seen, strong full-feature blend for unseen...')

# 1) f_27 -> target identity map and masks
f27_target_map = train.groupby('f_27')['target'].first().to_dict()
t_seen_mask = test['f_27'].isin(f27_target_map).values
t_unseen_mask = ~t_seen_mask
log(f'Hybrid masks -> seen: {int(t_seen_mask.sum())} | unseen: {int(t_unseen_mask.sum())}')
preds_seen = test.loc[t_seen_mask, 'f_27'].map(f27_target_map).astype('float32').values

# 2) Load OOF/test for 3-seed LGB (full features) and XGB, create tuned blend
def _load_oof(path):
    if not os.path.exists(path): return None
    df = pd.read_csv(path); cols = [c for c in df.columns if c != 'id']
    return df[['id', cols[0]]].rename(columns={cols[0]: 'pred'})
def _load_pred(path):
    if not os.path.exists(path): return None
    df = pd.read_csv(path); cols = [c for c in df.columns if c != 'id']
    return df[['id', cols[0]]].rename(columns={cols[0]: 'pred'})

# LGB seeds OOF
oofs = []; preds = [];
for p_o, p_t in [
    ('oof_lgb_fast_s42.csv','pred_lgb_fast_s42.csv'),
    ('oof_lgb_fast_s1337.csv','pred_lgb_fast_s1337.csv'),
    ('oof_lgb_fast_s2025.csv','pred_lgb_fast_s2025.csv'),
]:
    oo = _load_oof(p_o); pt = _load_pred(p_t)
    if (oo is not None) and (pt is not None):
        oofs.append(oo); preds.append(pt)
assert len(oofs) >= 1 and len(preds) >= 1, 'Missing full LGB seed artifacts'
oof_lgb = oofs[0]
for df in oofs[1:]: oof_lgb = oof_lgb.merge(df, on='id', how='inner', suffixes=('','_x'))
pred_lgb = preds[0]
for df in preds[1:]: pred_lgb = pred_lgb.merge(df, on='id', how='inner', suffixes=('','_x'))
lgb_oof_cols = [c for c in oof_lgb.columns if c != 'id']
lgb_pred_cols = [c for c in pred_lgb.columns if c != 'id']
oof_lgb['lgb_ens'] = oof_lgb[lgb_oof_cols].mean(axis=1).astype('float32')
pred_lgb['lgb_ens'] = pred_lgb[lgb_pred_cols].mean(axis=1).astype('float32')

# XGB OOF/pred
oof_xgb = _load_oof('oof_xgb_seed42.csv')
pred_xgb = _load_pred('pred_xgb_seed42.csv')
have_xgb = (oof_xgb is not None) and (pred_xgb is not None)

# 3) Tune blend weight on OOF (rank vs prob), using full-train OOF since unseen-like train rows are none
gt = train[['id','target']].copy()
oof = gt.merge(oof_lgb[['id','lgb_ens']], on='id', how='left')
if have_xgb: oof = oof.merge(oof_xgb.rename(columns={'pred':'xgb'}), on='id', how='left')
else: oof['xgb'] = np.nan
def rank_norm(x):
    r = pd.Series(x).rank(method='average').values
    return (r - 1) / (len(r) - 1)
oof['lgb_r'] = rank_norm(oof['lgb_ens'])
if have_xgb: oof['xgb_r'] = rank_norm(oof['xgb'])
best = {'auc':0.0, 'mode':'prob', 'w':1.0}
weights = np.arange(0.60, 0.86, 0.05) if have_xgb else [1.0]
for w in weights:
    if have_xgb:
        prob = w*oof['lgb_ens'] + (1-w)*oof['xgb']
        rank = w*oof['lgb_r'] + (1-w)*oof['xgb_r']
    else:
        prob = oof['lgb_ens']; rank = oof['lgb_r']
    ap = roc_auc_score(oof['target'], prob)
    if ap > best['auc']: best.update({'auc':float(ap), 'mode':'prob', 'w':float(w)})
    ar = roc_auc_score(oof['target'], rank)
    if ar > best['auc']: best.update({'auc':float(ar), 'mode':'rank', 'w':float(w)})
log(f"[Hybrid blend] Best OOF AUC={best['auc']:.6f} | mode={best['mode']} | w_lgb={best['w']}")

# 4) Build unseen predictions from best blend
sub_te = pd.read_csv('sample_submission.csv')[['id']].merge(pred_lgb[['id','lgb_ens']], on='id', how='left')
if have_xgb: sub_te = sub_te.merge(pred_xgb.rename(columns={'pred':'xgb'}), on='id', how='left')
if best['mode']=='prob' or not have_xgb:
    unseen_pred_full = sub_te['lgb_ens'].values if not have_xgb else (best['w']*sub_te['lgb_ens'].values + (1-best['w'])*sub_te['xgb'].values)
else:
    rl = rank_norm(sub_te['lgb_ens']); rx = rank_norm(sub_te['xgb'])
    unseen_pred_full = (best['w']*rl + (1-best['w'])*rx)
unseen_pred_full = unseen_pred_full.astype('float32')

# 5) Assemble final: seen from identity map; unseen from full-feature blend
final = pd.read_csv('sample_submission.csv')[['id']].copy()
final['target'] = np.zeros(len(final), dtype='float32')
final.loc[t_seen_mask, 'target'] = preds_seen
final.loc[t_unseen_mask, 'target'] = unseen_pred_full[t_unseen_mask]
final['target'] = final['target'].astype('float64').clip(0.0,1.0)
final.to_csv('submission.csv', index=False)
log('Hybrid submission.csv saved (seen: identity map; unseen: full-feature LGB+XGB blend).')

[2025-09-12 02:02:40] Building hybrid submission: identity map for seen, strong full-feature blend for unseen...


[2025-09-12 02:02:42] Hybrid masks -> seen: 28435 | unseen: 71565


[2025-09-12 02:02:47] [Hybrid blend] Best OOF AUC=0.991016 | mode=prob | w_lgb=0.8500000000000002


[2025-09-12 02:02:47] Hybrid submission.csv saved (seen: identity map; unseen: full-feature LGB+XGB blend).


In [27]:
# Leakage-safe Target Encodings (TE) for f_27 components + target-free frequencies (for unseen model)
log('Building OOF target encodings for f_27 positional chars and bigrams, plus frequency encodings...')
t0 = time.time()

assert 'f_27' in train.columns and 'target' in train.columns, 'Missing f_27/target'
folds_df = pd.read_csv('folds_10fold_seed42.csv')
folds_arr = folds_df['fold'].values.astype('int16')
n_splits = int(np.unique(folds_arr).size)
prior = float(train['target'].mean())

# Precompute tokens
s_tr = train['f_27'].astype(str).values
s_te = test['f_27'].astype(str).values

# Positional char integer tokens (already in columns f_27_pos_i but recompute arrays to be safe)
pos_char_tr = np.stack([train[f'f_27_pos_{i}'].astype('int16').values for i in range(10)], axis=1)
pos_char_te = np.stack([test[f'f_27_pos_{i}'].astype('int16').values for i in range(10)], axis=1)

# Positional bigram tokens (string form for groupby; train/test)
def make_bigrams(arr):
    # arr: array of strings length 10
    n = arr.shape[0]
    out = [None]*9
    for i in range(9):
        out[i] = np.fromiter((row[i:i+2] for row in arr), count=n, dtype=object)
    return out
bg_tr = make_bigrams(s_tr)
bg_te = make_bigrams(s_te)

# Optional: f27_nunique per row
def f27_nunique(arr):
    return np.fromiter((len(set(list(x))) for x in arr), count=len(arr), dtype=np.int16)
f27_nuniq_tr = f27_nunique(s_tr)
f27_nuniq_te = f27_nunique(s_te)

# Helper to compute OOF TE given key arrays (keys) and smoothing m
def oof_te_by_key(keys, y, m, prior):
    # keys: array-like of hashable tokens for all rows in train
    keys = pd.Series(keys)
    oof_vals = np.zeros(len(keys), dtype=np.float32)
    oof_logcnt = np.zeros(len(keys), dtype=np.float32)
    for fold in range(n_splits):
        trn_idx = np.where(folds_arr != fold)[0]
        val_idx = np.where(folds_arr == fold)[0]
        k_tr = keys.iloc[trn_idx]
        y_tr = y[trn_idx]
        grp = pd.DataFrame({'k': k_tr.values, 'y': y_tr}).groupby('k')
        cnt = grp['y'].size()
        sumy = grp['y'].sum()
        te_map = ((sumy + m * prior) / (cnt + m)).astype('float32')
        map_cnt = cnt.astype('int32')
        # map for val
        kval = keys.iloc[val_idx]
        te = kval.map(te_map).astype('float32')
        lc = kval.map(map_cnt).fillna(0).astype('int32')
        te = te.fillna(prior).values
        oof_vals[val_idx] = te
        oof_logcnt[val_idx] = np.log1p(lc.values).astype('float32')
    # Full fit for test transform
    grp_full = pd.DataFrame({'k': keys.values, 'y': y}).groupby('k')
    cnt_full = grp_full['y'].size()
    sumy_full = grp_full['y'].sum()
    te_map_full = ((sumy_full + m * prior) / (cnt_full + m)).astype('float32')
    cnt_full = cnt_full.astype('int32')
    return oof_vals, oof_logcnt, te_map_full, cnt_full

# Build TE for positional chars (10) with m=30
y_tr = train['target'].values.astype('float32')
te_pos_char_tr = np.zeros((len(train), 10), dtype=np.float32)
te_pos_char_logcnt_tr = np.zeros((len(train), 10), dtype=np.float32)
te_pos_char_te = np.zeros((len(test), 10), dtype=np.float32)
te_pos_char_logcnt_te = np.zeros((len(test), 10), dtype=np.float32)
for i in range(10):
    keys_tr = pd.Series([f'{i}|{int(t)}' for t in pos_char_tr[:, i]])
    oof_mean, oof_logcnt, te_map_full, cnt_full = oof_te_by_key(keys_tr, y_tr, m=30.0, prior=prior)
    te_pos_char_tr[:, i] = oof_mean
    te_pos_char_logcnt_tr[:, i] = oof_logcnt
    keys_te = pd.Series([f'{i}|{int(t)}' for t in pos_char_te[:, i]])
    te_vals = keys_te.map(te_map_full).astype('float32').fillna(prior).values
    lc = keys_te.map(cnt_full).fillna(0).astype('int32').values
    te_pos_char_te[:, i] = te_vals
    te_pos_char_logcnt_te[:, i] = np.log1p(lc).astype('float32')

# Build TE for positional bigrams (9) with m=100
te_pos_bg_tr = np.zeros((len(train), 9), dtype=np.float32)
te_pos_bg_logcnt_tr = np.zeros((len(train), 9), dtype=np.float32)
te_pos_bg_te = np.zeros((len(test), 9), dtype=np.float32)
te_pos_bg_logcnt_te = np.zeros((len(test), 9), dtype=np.float32)
for i in range(9):
    keys_tr = pd.Series([f'{i}|{bg}' for bg in bg_tr[i]])
    oof_mean, oof_logcnt, te_map_full, cnt_full = oof_te_by_key(keys_tr, y_tr, m=100.0, prior=prior)
    te_pos_bg_tr[:, i] = oof_mean
    te_pos_bg_logcnt_tr[:, i] = oof_logcnt
    keys_te = pd.Series([f'{i}|{bg}' for bg in bg_te[i]])
    te_vals = keys_te.map(te_map_full).astype('float32').fillna(prior).values
    lc = keys_te.map(cnt_full).fillna(0).astype('int32').values
    te_pos_bg_te[:, i] = te_vals
    te_pos_bg_logcnt_te[:, i] = np.log1p(lc).astype('float32')

# Optional TE for f27_nunique with m=50
keys_nu_tr = pd.Series(f27_nuniq_tr.astype('int16'))
oof_nu_mean, oof_nu_logcnt, nu_map_full, nu_cnt_full = oof_te_by_key(keys_nu_tr, y_tr, m=50.0, prior=prior)
nu_te_tr = oof_nu_mean
nu_logcnt_tr = oof_nu_logcnt
keys_nu_te = pd.Series(f27_nuniq_te.astype('int16'))
nu_te_te = keys_nu_te.map(nu_map_full).astype('float32').fillna(prior).values
nu_logcnt_te = np.log1p(keys_nu_te.map(nu_cnt_full).fillna(0).astype('int32').values).astype('float32')

# Target-free frequencies pooled on train+test
N_all = float(len(train) + len(test))
# pos-char freqs
freq_pos_char_tr = np.zeros((len(train), 10), dtype=np.float32)
freq_pos_char_te = np.zeros((len(test), 10), dtype=np.float32)
for i in range(10):
    keys_all = pd.Series(np.concatenate([np.array([f'{i}|{int(t)}' for t in pos_char_tr[:, i]]), np.array([f'{i}|{int(t)}' for t in pos_char_te[:, i]])]))
    vc = keys_all.value_counts()
    ktr = pd.Series([f'{i}|{int(t)}' for t in pos_char_tr[:, i]])
    kte = pd.Series([f'{i}|{int(t)}' for t in pos_char_te[:, i]])
    freq_pos_char_tr[:, i] = (ktr.map(vc).fillna(0).values.astype('float32') / N_all)
    freq_pos_char_te[:, i] = (kte.map(vc).fillna(0).values.astype('float32') / N_all)

# pos-bigram freqs
freq_pos_bg_tr = np.zeros((len(train), 9), dtype=np.float32)
freq_pos_bg_te = np.zeros((len(test), 9), dtype=np.float32)
for i in range(9):
    keys_all = pd.Series(np.concatenate([np.array([f'{i}|{bg}' for bg in bg_tr[i]]), np.array([f'{i}|{bg}' for bg in bg_te[i]])]))
    vc = keys_all.value_counts()
    ktr = pd.Series([f'{i}|{bg}' for bg in bg_tr[i]])
    kte = pd.Series([f'{i}|{bg}' for bg in bg_te[i]])
    freq_pos_bg_tr[:, i] = (ktr.map(vc).fillna(0).values.astype('float32') / N_all)
    freq_pos_bg_te[:, i] = (kte.map(vc).fillna(0).values.astype('float32') / N_all)

# full-string freq
all_full = pd.Series(np.concatenate([s_tr, s_te]))
vc_full = all_full.value_counts()
freq_full_tr = pd.Series(s_tr).map(vc_full).fillna(0).values.astype('float32') / N_all
freq_full_te = pd.Series(s_te).map(vc_full).fillna(0).values.astype('float32') / N_all

# Assemble TE/freq dataframes
te_cols_tr = {}
te_cols_te = {}
for i in range(10):
    te_cols_tr[f'TE_pos_char_{i}_mean'] = te_pos_char_tr[:, i]
    te_cols_te[f'TE_pos_char_{i}_mean'] = te_pos_char_te[:, i]
for i in range(9):
    te_cols_tr[f'TE_pos_bigram_{i}_mean'] = te_pos_bg_tr[:, i]
    te_cols_te[f'TE_pos_bigram_{i}_mean'] = te_pos_bg_te[:, i]
for i in range(10):
    te_cols_tr[f'TE_pos_char_{i}_logcnt'] = te_pos_char_logcnt_tr[:, i]
    te_cols_te[f'TE_pos_char_{i}_logcnt'] = te_pos_char_logcnt_te[:, i]
for i in range(9):
    te_cols_tr[f'TE_pos_bigram_{i}_logcnt'] = te_pos_bg_logcnt_tr[:, i]
    te_cols_te[f'TE_pos_bigram_{i}_logcnt'] = te_pos_bg_logcnt_te[:, i]
te_cols_tr['TE_f27_nunique_mean'] = nu_te_tr
te_cols_te['TE_f27_nunique_mean'] = nu_te_te
te_cols_tr['TE_f27_nunique_logcnt'] = nu_logcnt_tr
te_cols_te['TE_f27_nunique_logcnt'] = nu_logcnt_te
for i in range(10):
    te_cols_tr[f'FREQ_pos_char_{i}'] = freq_pos_char_tr[:, i]
    te_cols_te[f'FREQ_pos_char_{i}'] = freq_pos_char_te[:, i]
for i in range(9):
    te_cols_tr[f'FREQ_pos_bigram_{i}'] = freq_pos_bg_tr[:, i]
    te_cols_te[f'FREQ_pos_bigram_{i}'] = freq_pos_bg_te[:, i]
te_cols_tr['FREQ_full_string'] = freq_full_tr
te_cols_te['FREQ_full_string'] = freq_full_te

te_train = pd.DataFrame(te_cols_tr, index=train.index)
te_test = pd.DataFrame(te_cols_te, index=test.index)

# Build augmented compact feature matrices for unseen model
X_unseen_tr = pd.concat([comp_train.reset_index(drop=True), te_train.reset_index(drop=True)], axis=1)
X_unseen_te = pd.concat([comp_test.reset_index(drop=True), te_test.reset_index(drop=True)], axis=1)
unseen_feature_cols = X_unseen_tr.columns.tolist()

# Sanity checks
assert X_unseen_tr.shape[1] == len(unseen_feature_cols)
assert not np.isnan(X_unseen_tr.values).any(), 'NaNs in X_unseen_tr'
assert not np.isnan(X_unseen_te.values).any(), 'NaNs in X_unseen_te'
log(f'Built TE/freq block. Unseen feature count: {len(unseen_feature_cols)} | time {time.time()-t0:.1f}s')

[2025-09-12 02:07:21] Building OOF target encodings for f_27 positional chars and bigrams, plus frequency encodings...


[2025-09-12 02:08:07] Built TE/freq block. Unseen feature count: 71 | time 46.4s


In [29]:
# Train TE-enhanced unseen models: 6x LightGBM + 1x XGBoost on X_unseen_tr/X_unseen_te
log('Training TE-enhanced unseen models (6 LGB seeds + 1 XGB) ...')
import lightgbm as lgb

assert 'X_unseen_tr' in globals() and 'X_unseen_te' in globals(), 'Run TE feature cell first (X_unseen_tr/X_unseen_te missing)'
X_tr = X_unseen_tr.copy()
X_te = X_unseen_te.copy()
y_tr = train['target'].values.astype('float32')

# Folds (locked)
folds_df = pd.read_csv('folds_10fold_seed42.csv')
folds_arr = folds_df['fold'].values
n_splits = len(np.unique(folds_arr))

def train_lgb_unseen(seed=42, lr=0.04, num_leaves=224, min_data_in_leaf=240, ff=0.78, bf=0.78, l2=4.0, tag='u42'):
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'learning_rate': lr,
        'num_leaves': num_leaves,
        'max_depth': -1,
        'min_data_in_leaf': min_data_in_leaf,
        'feature_fraction': ff,
        'bagging_fraction': bf,
        'bagging_freq': 1,
        'lambda_l1': 0.0,
        'lambda_l2': l2,
        'max_bin': 255,
        'verbose': -1,
        'n_jobs': -1,
        'seed': seed,
        'feature_fraction_seed': seed,
        'bagging_seed': seed,
        'data_random_seed': seed
    }
    oof = np.zeros(len(X_tr), dtype='float32')
    ptest = np.zeros(len(X_te), dtype='float32')
    t0 = time.time()
    for fold in range(n_splits):
        trn_idx = np.where(folds_arr != fold)[0]
        val_idx = np.where(folds_arr == fold)[0]
        log(f'[LGB unseen] seed={seed} ff={ff} bf={bf} | Fold {fold+1}/{n_splits} | trn={len(trn_idx)} val={len(val_idx)}')
        dtr = lgb.Dataset(X_tr.iloc[trn_idx], label=y_tr[trn_idx])
        dvl = lgb.Dataset(X_tr.iloc[val_idx], label=y_tr[val_idx])
        model = lgb.train(params, dtr, num_boost_round=5000, valid_sets=[dtr, dvl], valid_names=['train','valid'], callbacks=[lgb.early_stopping(200), lgb.log_evaluation(200)])
        oof[val_idx] = model.predict(X_tr.iloc[val_idx], num_iteration=model.best_iteration)
        auc = roc_auc_score(y_tr[val_idx], oof[val_idx])
        log(f'[LGB unseen] seed={seed} fold={fold} AUC: {auc:.6f} | best_iter={model.best_iteration}')
        ptest += model.predict(X_te, num_iteration=model.best_iteration) / n_splits
        del dtr, dvl, model; gc.collect()
    cv_auc = roc_auc_score(y_tr, oof)
    log(f'[LGB unseen] seed={seed} OOF AUC: {cv_auc:.6f} | elapsed={time.time()-t0:.1f}s')
    pd.DataFrame({'id': train['id'], f'oof_lgb_unseen_{tag}': oof}).to_csv(f'oof_lgb_unseen_{tag}.csv', index=False)
    pd.DataFrame({'id': test['id'], f'prediction_lgb_unseen_{tag}': ptest}).to_csv(f'pred_lgb_unseen_{tag}.csv', index=False)
    log(f'[LGB unseen] Saved OOF/test preds for {tag}')

# Run 6 LGB seeds with diverse params
lgb_cfgs = [
    # seed, lr, leaves, min_leaf, ff, bf, l2, tag
    (42,   0.040, 224, 240, 0.78, 0.78, 4.0, 'u42'),
    (1337, 0.045, 256, 240, 0.72, 0.72, 5.0, 'u1337'),
    (2025, 0.038, 192, 260, 0.83, 0.75, 3.0, 'u2025'),
    (101,  0.042, 256, 300, 0.70, 0.83, 6.0, 'u101'),
    (999,  0.035, 192, 220, 0.80, 0.70, 4.0, 'u999'),
    (7,    0.040, 240, 260, 0.75, 0.80, 5.0, 'u7'),
]
for s, lr, nl, mdl, ff, bf, l2, tag in lgb_cfgs:
    train_lgb_unseen(seed=s, lr=lr, num_leaves=nl, min_data_in_leaf=mdl, ff=ff, bf=bf, l2=l2, tag=tag)
log('All TE-enhanced LGB unseen seeds finished.')

# XGBoost on TE-enhanced unseen features
log('Training XGBoost unseen (TE-enhanced features)...')
try:
    import xgboost as xgb
except ImportError:
    import sys, subprocess
    log('XGBoost not found. Installing...')
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'xgboost'])
    import xgboost as xgb

dtest = xgb.DMatrix(X_te)
oof_xgb_u = np.zeros(len(X_tr), dtype='float32')
pred_xgb_u = np.zeros(len(X_te), dtype='float32')
xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': 'hist',
    'max_bin': 256,
    'eta': 0.045,
    'max_depth': 8,
    'min_child_weight': 80,
    'subsample': 0.80,
    'colsample_bytree': 0.80,
    'lambda': 3.0,
    'nthread': 36,
    'seed': 42
}
num_boost_round = 3500
early_stopping_rounds = 150
for fold in range(n_splits):
    trn_idx = np.where(folds_arr != fold)[0]
    val_idx = np.where(folds_arr == fold)[0]
    log(f'[XGB unseen] Fold {fold+1}/{n_splits} | trn={len(trn_idx)} val={len(val_idx)}')
    dtr = xgb.DMatrix(X_tr.iloc[trn_idx], label=y_tr[trn_idx])
    dvl = xgb.DMatrix(X_tr.iloc[val_idx], label=y_tr[val_idx])
    bst = xgb.train(params=xgb_params, dtrain=dtr, num_boost_round=num_boost_round, evals=[(dtr,'train'),(dvl,'valid')], early_stopping_rounds=early_stopping_rounds, verbose_eval=200)
    best_iter = None
    try:
        attrs = bst.attributes()
        if 'best_iteration' in attrs:
            best_iter = int(attrs['best_iteration'])
    except Exception:
        best_iter = getattr(bst, 'best_iteration', None)
    if best_iter is not None and best_iter >= 0:
        oof_pred = bst.predict(dvl, iteration_range=(0, best_iter + 1))
        pred_te = bst.predict(dtest, iteration_range=(0, best_iter + 1))
    else:
        oof_pred = bst.predict(dvl)
        pred_te = bst.predict(dtest)
    oof_xgb_u[val_idx] = oof_pred.astype('float32')
    pred_xgb_u += (pred_te.astype('float32') / n_splits)
    del dtr, dvl, bst; gc.collect()
auc_xgb_u = roc_auc_score(y_tr, oof_xgb_u)
log(f'[XGB unseen] OOF AUC: {auc_xgb_u:.6f}')
pd.DataFrame({'id': train['id'], 'oof_xgb_unseen': oof_xgb_u}).to_csv('oof_xgb_unseen.csv', index=False)
pd.DataFrame({'id': test['id'], 'prediction_xgb_unseen': pred_xgb_u}).to_csv('pred_xgb_unseen.csv', index=False)
log('[XGB unseen] Saved OOF and test predictions.')

[2025-09-12 02:10:28] Training TE-enhanced unseen models (6 LGB seeds + 1 XGB) ...


[2025-09-12 02:10:28] [LGB unseen] seed=42 ff=0.78 bf=0.78 | Fold 1/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.833937	valid's auc: 0.817831


[400]	train's auc: 0.848099	valid's auc: 0.819141


[600]	train's auc: 0.860071	valid's auc: 0.819201


[800]	train's auc: 0.871088	valid's auc: 0.819073


Early stopping, best iteration is:
[636]	train's auc: 0.862084	valid's auc: 0.819255


[2025-09-12 02:11:37] [LGB unseen] seed=42 fold=0 AUC: 0.819255 | best_iter=636


[2025-09-12 02:11:38] [LGB unseen] seed=42 ff=0.78 bf=0.78 | Fold 2/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.834039	valid's auc: 0.81781


[400]	train's auc: 0.848064	valid's auc: 0.818609


[600]	train's auc: 0.860134	valid's auc: 0.818601


Early stopping, best iteration is:
[522]	train's auc: 0.855583	valid's auc: 0.818675


[2025-09-12 02:12:37] [LGB unseen] seed=42 fold=1 AUC: 0.818675 | best_iter=522


[2025-09-12 02:12:38] [LGB unseen] seed=42 ff=0.78 bf=0.78 | Fold 3/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.833582	valid's auc: 0.820903


[400]	train's auc: 0.847628	valid's auc: 0.822091


[600]	train's auc: 0.859454	valid's auc: 0.821927


Early stopping, best iteration is:
[423]	train's auc: 0.849048	valid's auc: 0.822132


[2025-09-12 02:13:30] [LGB unseen] seed=42 fold=2 AUC: 0.822132 | best_iter=423


[2025-09-12 02:13:31] [LGB unseen] seed=42 ff=0.78 bf=0.78 | Fold 4/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.833972	valid's auc: 0.818086


[400]	train's auc: 0.848121	valid's auc: 0.81876


Early stopping, best iteration is:
[374]	train's auc: 0.846487	valid's auc: 0.818854


[2025-09-12 02:14:20] [LGB unseen] seed=42 fold=3 AUC: 0.818854 | best_iter=374


[2025-09-12 02:14:21] [LGB unseen] seed=42 ff=0.78 bf=0.78 | Fold 5/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.833776	valid's auc: 0.820789


[400]	train's auc: 0.847934	valid's auc: 0.821613


[600]	train's auc: 0.859773	valid's auc: 0.821345


Early stopping, best iteration is:
[427]	train's auc: 0.849608	valid's auc: 0.821617


[2025-09-12 02:15:18] [LGB unseen] seed=42 fold=4 AUC: 0.821617 | best_iter=427


[2025-09-12 02:15:18] [LGB unseen] seed=42 ff=0.78 bf=0.78 | Fold 6/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.834074	valid's auc: 0.816329


[400]	train's auc: 0.848019	valid's auc: 0.817393


[600]	train's auc: 0.860128	valid's auc: 0.817359


Early stopping, best iteration is:
[470]	train's auc: 0.852436	valid's auc: 0.817476


[2025-09-12 02:16:15] [LGB unseen] seed=42 fold=5 AUC: 0.817476 | best_iter=470


[2025-09-12 02:16:15] [LGB unseen] seed=42 ff=0.78 bf=0.78 | Fold 7/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.833941	valid's auc: 0.818487


[400]	train's auc: 0.848062	valid's auc: 0.819378


[600]	train's auc: 0.860086	valid's auc: 0.819257


Early stopping, best iteration is:
[416]	train's auc: 0.84906	valid's auc: 0.819422


[2025-09-12 02:17:07] [LGB unseen] seed=42 fold=6 AUC: 0.819422 | best_iter=416


[2025-09-12 02:17:07] [LGB unseen] seed=42 ff=0.78 bf=0.78 | Fold 8/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.833747	valid's auc: 0.817906


[400]	train's auc: 0.847895	valid's auc: 0.818858


[600]	train's auc: 0.860029	valid's auc: 0.818823


Early stopping, best iteration is:
[473]	train's auc: 0.852464	valid's auc: 0.819001


[2025-09-12 02:18:04] [LGB unseen] seed=42 fold=7 AUC: 0.819001 | best_iter=473


[2025-09-12 02:18:05] [LGB unseen] seed=42 ff=0.78 bf=0.78 | Fold 9/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.833877	valid's auc: 0.818354


[400]	train's auc: 0.847995	valid's auc: 0.819339


[600]	train's auc: 0.860057	valid's auc: 0.819231


Early stopping, best iteration is:
[505]	train's auc: 0.854487	valid's auc: 0.819412


[2025-09-12 02:19:04] [LGB unseen] seed=42 fold=8 AUC: 0.819412 | best_iter=505


[2025-09-12 02:19:04] [LGB unseen] seed=42 ff=0.78 bf=0.78 | Fold 10/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.83369	valid's auc: 0.819926


[400]	train's auc: 0.847801	valid's auc: 0.821017


Early stopping, best iteration is:
[354]	train's auc: 0.844927	valid's auc: 0.821066


[2025-09-12 02:19:51] [LGB unseen] seed=42 fold=9 AUC: 0.821066 | best_iter=354


[2025-09-12 02:19:52] [LGB unseen] seed=42 OOF AUC: 0.819671 | elapsed=564.0s


[2025-09-12 02:19:53] [LGB unseen] Saved OOF/test preds for u42


[2025-09-12 02:19:53] [LGB unseen] seed=1337 ff=0.72 bf=0.72 | Fold 1/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.837943	valid's auc: 0.818113


[400]	train's auc: 0.854583	valid's auc: 0.818745


[600]	train's auc: 0.868883	valid's auc: 0.818734


Early stopping, best iteration is:
[444]	train's auc: 0.85783	valid's auc: 0.818785


[2025-09-12 02:20:51] [LGB unseen] seed=1337 fold=0 AUC: 0.818785 | best_iter=444


[2025-09-12 02:20:51] [LGB unseen] seed=1337 ff=0.72 bf=0.72 | Fold 2/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.838103	valid's auc: 0.818307


[400]	train's auc: 0.854554	valid's auc: 0.818796


Early stopping, best iteration is:
[396]	train's auc: 0.854261	valid's auc: 0.818808


[2025-09-12 02:21:45] [LGB unseen] seed=1337 fold=1 AUC: 0.818808 | best_iter=396


[2025-09-12 02:21:46] [LGB unseen] seed=1337 ff=0.72 bf=0.72 | Fold 3/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.837586	valid's auc: 0.821122


[400]	train's auc: 0.854324	valid's auc: 0.821551


Early stopping, best iteration is:
[290]	train's auc: 0.84563	valid's auc: 0.821763


[2025-09-12 02:22:30] [LGB unseen] seed=1337 fold=2 AUC: 0.821763 | best_iter=290


[2025-09-12 02:22:31] [LGB unseen] seed=1337 ff=0.72 bf=0.72 | Fold 4/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.837956	valid's auc: 0.818205


[400]	train's auc: 0.854508	valid's auc: 0.818671


Early stopping, best iteration is:
[366]	train's auc: 0.85196	valid's auc: 0.818713


[2025-09-12 02:23:21] [LGB unseen] seed=1337 fold=3 AUC: 0.818713 | best_iter=366


[2025-09-12 02:23:21] [LGB unseen] seed=1337 ff=0.72 bf=0.72 | Fold 5/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.837681	valid's auc: 0.820793


[400]	train's auc: 0.854343	valid's auc: 0.820997


Early stopping, best iteration is:
[250]	train's auc: 0.842225	valid's auc: 0.821088


[2025-09-12 02:24:03] [LGB unseen] seed=1337 fold=4 AUC: 0.821088 | best_iter=250


[2025-09-12 02:24:03] [LGB unseen] seed=1337 ff=0.72 bf=0.72 | Fold 6/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.838128	valid's auc: 0.816824


[400]	train's auc: 0.854706	valid's auc: 0.817204


Early stopping, best iteration is:
[317]	train's auc: 0.84825	valid's auc: 0.817253


[2025-09-12 02:24:53] [LGB unseen] seed=1337 fold=5 AUC: 0.817253 | best_iter=317


[2025-09-12 02:24:54] [LGB unseen] seed=1337 ff=0.72 bf=0.72 | Fold 7/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.837967	valid's auc: 0.818953


[400]	train's auc: 0.854592	valid's auc: 0.819357


Early stopping, best iteration is:
[373]	train's auc: 0.852496	valid's auc: 0.819496


[2025-09-12 02:25:44] [LGB unseen] seed=1337 fold=6 AUC: 0.819496 | best_iter=373


[2025-09-12 02:25:45] [LGB unseen] seed=1337 ff=0.72 bf=0.72 | Fold 8/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.837905	valid's auc: 0.818009


[400]	train's auc: 0.854588	valid's auc: 0.818342


Early stopping, best iteration is:
[288]	train's auc: 0.84564	valid's auc: 0.818481


[2025-09-12 02:26:29] [LGB unseen] seed=1337 fold=7 AUC: 0.818481 | best_iter=288


[2025-09-12 02:26:29] [LGB unseen] seed=1337 ff=0.72 bf=0.72 | Fold 9/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.837912	valid's auc: 0.818556


[400]	train's auc: 0.854544	valid's auc: 0.819212


Early stopping, best iteration is:
[392]	train's auc: 0.853945	valid's auc: 0.819263


[2025-09-12 02:27:23] [LGB unseen] seed=1337 fold=8 AUC: 0.819263 | best_iter=392


[2025-09-12 02:27:24] [LGB unseen] seed=1337 ff=0.72 bf=0.72 | Fold 10/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.837794	valid's auc: 0.820197


[400]	train's auc: 0.854457	valid's auc: 0.820303


Early stopping, best iteration is:
[346]	train's auc: 0.850263	valid's auc: 0.820547


[2025-09-12 02:28:13] [LGB unseen] seed=1337 fold=9 AUC: 0.820547 | best_iter=346


[2025-09-12 02:28:14] [LGB unseen] seed=1337 OOF AUC: 0.819391 | elapsed=500.9s


[2025-09-12 02:28:15] [LGB unseen] Saved OOF/test preds for u1337


[2025-09-12 02:28:15] [LGB unseen] seed=2025 ff=0.83 bf=0.75 | Fold 1/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.830933	valid's auc: 0.817653


[400]	train's auc: 0.843401	valid's auc: 0.819153


[600]	train's auc: 0.853753	valid's auc: 0.819267


Early stopping, best iteration is:
[525]	train's auc: 0.850023	valid's auc: 0.819338


[2025-09-12 02:29:14] [LGB unseen] seed=2025 fold=0 AUC: 0.819338 | best_iter=525


[2025-09-12 02:29:14] [LGB unseen] seed=2025 ff=0.83 bf=0.75 | Fold 2/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.831034	valid's auc: 0.817579


[400]	train's auc: 0.843387	valid's auc: 0.818753


[600]	train's auc: 0.853764	valid's auc: 0.818754


Early stopping, best iteration is:
[552]	train's auc: 0.85136	valid's auc: 0.818787


[2025-09-12 02:30:14] [LGB unseen] seed=2025 fold=1 AUC: 0.818787 | best_iter=552


[2025-09-12 02:30:14] [LGB unseen] seed=2025 ff=0.83 bf=0.75 | Fold 3/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.830691	valid's auc: 0.820852


[400]	train's auc: 0.843148	valid's auc: 0.822277


[600]	train's auc: 0.853614	valid's auc: 0.822235


Early stopping, best iteration is:
[436]	train's auc: 0.84512	valid's auc: 0.822294


[2025-09-12 02:31:07] [LGB unseen] seed=2025 fold=2 AUC: 0.822294 | best_iter=436


[2025-09-12 02:31:08] [LGB unseen] seed=2025 ff=0.83 bf=0.75 | Fold 4/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.831055	valid's auc: 0.817578


[400]	train's auc: 0.843432	valid's auc: 0.818778


[600]	train's auc: 0.853818	valid's auc: 0.818922


Early stopping, best iteration is:
[554]	train's auc: 0.851548	valid's auc: 0.818929


[2025-09-12 02:32:07] [LGB unseen] seed=2025 fold=3 AUC: 0.818929 | best_iter=554


[2025-09-12 02:32:07] [LGB unseen] seed=2025 ff=0.83 bf=0.75 | Fold 5/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.830774	valid's auc: 0.820466


[400]	train's auc: 0.843144	valid's auc: 0.821358


[600]	train's auc: 0.853633	valid's auc: 0.821375


Early stopping, best iteration is:
[551]	train's auc: 0.851147	valid's auc: 0.821422


[2025-09-12 02:33:06] [LGB unseen] seed=2025 fold=4 AUC: 0.821422 | best_iter=551


[2025-09-12 02:33:06] [LGB unseen] seed=2025 ff=0.83 bf=0.75 | Fold 6/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.831159	valid's auc: 0.816016


[400]	train's auc: 0.843549	valid's auc: 0.817237


[600]	train's auc: 0.853917	valid's auc: 0.817407


[800]	train's auc: 0.863453	valid's auc: 0.817168


Early stopping, best iteration is:
[639]	train's auc: 0.855815	valid's auc: 0.817429


[2025-09-12 02:34:11] [LGB unseen] seed=2025 fold=5 AUC: 0.817429 | best_iter=639


[2025-09-12 02:34:12] [LGB unseen] seed=2025 ff=0.83 bf=0.75 | Fold 7/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.830988	valid's auc: 0.818487


[400]	train's auc: 0.843466	valid's auc: 0.819605


[600]	train's auc: 0.853868	valid's auc: 0.819526


Early stopping, best iteration is:
[438]	train's auc: 0.845604	valid's auc: 0.81968


[2025-09-12 02:35:10] [LGB unseen] seed=2025 fold=6 AUC: 0.819680 | best_iter=438


[2025-09-12 02:35:10] [LGB unseen] seed=2025 ff=0.83 bf=0.75 | Fold 8/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.830928	valid's auc: 0.817483


[400]	train's auc: 0.843359	valid's auc: 0.818657


[600]	train's auc: 0.853726	valid's auc: 0.818609


Early stopping, best iteration is:
[473]	train's auc: 0.847232	valid's auc: 0.81878


[2025-09-12 02:36:04] [LGB unseen] seed=2025 fold=7 AUC: 0.818780 | best_iter=473


[2025-09-12 02:36:05] [LGB unseen] seed=2025 ff=0.83 bf=0.75 | Fold 9/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.830992	valid's auc: 0.818017


[400]	train's auc: 0.843495	valid's auc: 0.81935


[600]	train's auc: 0.853935	valid's auc: 0.819305


Early stopping, best iteration is:
[471]	train's auc: 0.847353	valid's auc: 0.819447


[2025-09-12 02:36:59] [LGB unseen] seed=2025 fold=8 AUC: 0.819447 | best_iter=471


[2025-09-12 02:36:59] [LGB unseen] seed=2025 ff=0.83 bf=0.75 | Fold 10/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.830757	valid's auc: 0.819186


[400]	train's auc: 0.843224	valid's auc: 0.820602


[600]	train's auc: 0.85366	valid's auc: 0.820743


Early stopping, best iteration is:
[508]	train's auc: 0.848913	valid's auc: 0.820853


[2025-09-12 02:37:57] [LGB unseen] seed=2025 fold=9 AUC: 0.820853 | best_iter=508


[2025-09-12 02:37:58] [LGB unseen] seed=2025 OOF AUC: 0.819674 | elapsed=582.5s


[2025-09-12 02:37:59] [LGB unseen] Saved OOF/test preds for u2025


[2025-09-12 02:37:59] [LGB unseen] seed=101 ff=0.7 bf=0.83 | Fold 1/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.836456	valid's auc: 0.818319


[400]	train's auc: 0.851818	valid's auc: 0.819125


Early stopping, best iteration is:
[354]	train's auc: 0.848504	valid's auc: 0.8192


[2025-09-12 02:38:51] [LGB unseen] seed=101 fold=0 AUC: 0.819200 | best_iter=354


[2025-09-12 02:38:51] [LGB unseen] seed=101 ff=0.7 bf=0.83 | Fold 2/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.836591	valid's auc: 0.818246


[400]	train's auc: 0.851886	valid's auc: 0.818982


Early stopping, best iteration is:
[335]	train's auc: 0.847228	valid's auc: 0.81902


[2025-09-12 02:39:41] [LGB unseen] seed=101 fold=1 AUC: 0.819020 | best_iter=335


[2025-09-12 02:39:41] [LGB unseen] seed=101 ff=0.7 bf=0.83 | Fold 3/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.836362	valid's auc: 0.821425


[400]	train's auc: 0.851841	valid's auc: 0.822109


Early stopping, best iteration is:
[391]	train's auc: 0.851205	valid's auc: 0.822117


[2025-09-12 02:40:35] [LGB unseen] seed=101 fold=2 AUC: 0.822117 | best_iter=391


[2025-09-12 02:40:36] [LGB unseen] seed=101 ff=0.7 bf=0.83 | Fold 4/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.836774	valid's auc: 0.818391


[400]	train's auc: 0.852122	valid's auc: 0.819082


Early stopping, best iteration is:
[364]	train's auc: 0.849535	valid's auc: 0.819138


[2025-09-12 02:41:21] [LGB unseen] seed=101 fold=3 AUC: 0.819138 | best_iter=364


[2025-09-12 02:41:22] [LGB unseen] seed=101 ff=0.7 bf=0.83 | Fold 5/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.836398	valid's auc: 0.820969


[400]	train's auc: 0.851953	valid's auc: 0.821365


Early stopping, best iteration is:
[352]	train's auc: 0.848482	valid's auc: 0.821515


[2025-09-12 02:42:12] [LGB unseen] seed=101 fold=4 AUC: 0.821515 | best_iter=352


[2025-09-12 02:42:13] [LGB unseen] seed=101 ff=0.7 bf=0.83 | Fold 6/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.836881	valid's auc: 0.81669


[400]	train's auc: 0.852264	valid's auc: 0.817305


Early stopping, best iteration is:
[366]	train's auc: 0.849902	valid's auc: 0.817366


[2025-09-12 02:43:05] [LGB unseen] seed=101 fold=5 AUC: 0.817366 | best_iter=366


[2025-09-12 02:43:05] [LGB unseen] seed=101 ff=0.7 bf=0.83 | Fold 7/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.836512	valid's auc: 0.818864


[400]	train's auc: 0.851987	valid's auc: 0.819395


[600]	train's auc: 0.865292	valid's auc: 0.819216


Early stopping, best iteration is:
[420]	train's auc: 0.853383	valid's auc: 0.819481


[2025-09-12 02:44:01] [LGB unseen] seed=101 fold=6 AUC: 0.819481 | best_iter=420


[2025-09-12 02:44:01] [LGB unseen] seed=101 ff=0.7 bf=0.83 | Fold 8/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.836735	valid's auc: 0.818417


[400]	train's auc: 0.852124	valid's auc: 0.81885


Early stopping, best iteration is:
[337]	train's auc: 0.847672	valid's auc: 0.818987


[2025-09-12 02:44:54] [LGB unseen] seed=101 fold=7 AUC: 0.818987 | best_iter=337


[2025-09-12 02:44:55] [LGB unseen] seed=101 ff=0.7 bf=0.83 | Fold 9/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.836445	valid's auc: 0.818492


[400]	train's auc: 0.851961	valid's auc: 0.819271


[600]	train's auc: 0.865232	valid's auc: 0.819023


Early stopping, best iteration is:
[406]	train's auc: 0.852392	valid's auc: 0.819295


[2025-09-12 02:45:49] [LGB unseen] seed=101 fold=8 AUC: 0.819295 | best_iter=406


[2025-09-12 02:45:49] [LGB unseen] seed=101 ff=0.7 bf=0.83 | Fold 10/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.836339	valid's auc: 0.820276


[400]	train's auc: 0.851817	valid's auc: 0.82109


[600]	train's auc: 0.865072	valid's auc: 0.820833


Early stopping, best iteration is:
[409]	train's auc: 0.852451	valid's auc: 0.8211


[2025-09-12 02:46:44] [LGB unseen] seed=101 fold=9 AUC: 0.821100 | best_iter=409


[2025-09-12 02:46:45] [LGB unseen] seed=101 OOF AUC: 0.819707 | elapsed=526.2s


[2025-09-12 02:46:46] [LGB unseen] Saved OOF/test preds for u101


[2025-09-12 02:46:46] [LGB unseen] seed=999 ff=0.8 bf=0.7 | Fold 1/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.829499	valid's auc: 0.81656


[400]	train's auc: 0.841445	valid's auc: 0.818589


[600]	train's auc: 0.851196	valid's auc: 0.818902


Early stopping, best iteration is:
[594]	train's auc: 0.850914	valid's auc: 0.81892


[2025-09-12 02:47:48] [LGB unseen] seed=999 fold=0 AUC: 0.818920 | best_iter=594


[2025-09-12 02:47:49] [LGB unseen] seed=999 ff=0.8 bf=0.7 | Fold 2/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.829679	valid's auc: 0.817092


[400]	train's auc: 0.841717	valid's auc: 0.818655


[600]	train's auc: 0.851558	valid's auc: 0.818801


[800]	train's auc: 0.860456	valid's auc: 0.818726


Early stopping, best iteration is:
[718]	train's auc: 0.856873	valid's auc: 0.818865


[2025-09-12 02:48:59] [LGB unseen] seed=999 fold=1 AUC: 0.818865 | best_iter=718


[2025-09-12 02:49:00] [LGB unseen] seed=999 ff=0.8 bf=0.7 | Fold 3/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.829293	valid's auc: 0.820074


[400]	train's auc: 0.841369	valid's auc: 0.822057


[600]	train's auc: 0.8511	valid's auc: 0.822164


Early stopping, best iteration is:
[530]	train's auc: 0.847831	valid's auc: 0.82221


[2025-09-12 02:49:58] [LGB unseen] seed=999 fold=2 AUC: 0.822210 | best_iter=530


[2025-09-12 02:49:59] [LGB unseen] seed=999 ff=0.8 bf=0.7 | Fold 4/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.829708	valid's auc: 0.817399


[400]	train's auc: 0.84158	valid's auc: 0.818829


[600]	train's auc: 0.85142	valid's auc: 0.818934


Early stopping, best iteration is:
[497]	train's auc: 0.846453	valid's auc: 0.81896


[2025-09-12 02:50:54] [LGB unseen] seed=999 fold=3 AUC: 0.818960 | best_iter=497


[2025-09-12 02:50:55] [LGB unseen] seed=999 ff=0.8 bf=0.7 | Fold 5/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.8293	valid's auc: 0.81994


[400]	train's auc: 0.841397	valid's auc: 0.821313


[600]	train's auc: 0.851206	valid's auc: 0.821327


Early stopping, best iteration is:
[476]	train's auc: 0.845275	valid's auc: 0.821408


[2025-09-12 02:51:49] [LGB unseen] seed=999 fold=4 AUC: 0.821408 | best_iter=476


[2025-09-12 02:51:49] [LGB unseen] seed=999 ff=0.8 bf=0.7 | Fold 6/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.829793	valid's auc: 0.815575


[400]	train's auc: 0.841667	valid's auc: 0.817185


[600]	train's auc: 0.851412	valid's auc: 0.817463


[800]	train's auc: 0.86035	valid's auc: 0.817428


Early stopping, best iteration is:
[610]	train's auc: 0.8519	valid's auc: 0.817494


[2025-09-12 02:52:52] [LGB unseen] seed=999 fold=5 AUC: 0.817494 | best_iter=610


[2025-09-12 02:52:53] [LGB unseen] seed=999 ff=0.8 bf=0.7 | Fold 7/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.829544	valid's auc: 0.817608


[400]	train's auc: 0.841577	valid's auc: 0.819217


[600]	train's auc: 0.851361	valid's auc: 0.819266


[800]	train's auc: 0.860298	valid's auc: 0.819183


Early stopping, best iteration is:
[637]	train's auc: 0.853049	valid's auc: 0.819321


[2025-09-12 02:53:59] [LGB unseen] seed=999 fold=6 AUC: 0.819321 | best_iter=637


[2025-09-12 02:54:00] [LGB unseen] seed=999 ff=0.8 bf=0.7 | Fold 8/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.82953	valid's auc: 0.817134


[400]	train's auc: 0.84159	valid's auc: 0.818662


[600]	train's auc: 0.851343	valid's auc: 0.818495


Early stopping, best iteration is:
[409]	train's auc: 0.842079	valid's auc: 0.818686


[2025-09-12 02:54:53] [LGB unseen] seed=999 fold=7 AUC: 0.818686 | best_iter=409


[2025-09-12 02:54:53] [LGB unseen] seed=999 ff=0.8 bf=0.7 | Fold 9/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.829643	valid's auc: 0.817632


[400]	train's auc: 0.841551	valid's auc: 0.818919


[600]	train's auc: 0.851368	valid's auc: 0.819057


Early stopping, best iteration is:
[520]	train's auc: 0.847535	valid's auc: 0.819172


[2025-09-12 02:55:51] [LGB unseen] seed=999 fold=8 AUC: 0.819172 | best_iter=520


[2025-09-12 02:55:52] [LGB unseen] seed=999 ff=0.8 bf=0.7 | Fold 10/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.829445	valid's auc: 0.819265


[400]	train's auc: 0.841296	valid's auc: 0.820982


[600]	train's auc: 0.851095	valid's auc: 0.82096


Early stopping, best iteration is:
[423]	train's auc: 0.842489	valid's auc: 0.821059


[2025-09-12 02:56:42] [LGB unseen] seed=999 fold=9 AUC: 0.821059 | best_iter=423


[2025-09-12 02:56:42] [LGB unseen] seed=999 OOF AUC: 0.819582 | elapsed=596.3s


[2025-09-12 02:56:43] [LGB unseen] Saved OOF/test preds for u999


[2025-09-12 02:56:43] [LGB unseen] seed=7 ff=0.75 bf=0.8 | Fold 1/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.834693	valid's auc: 0.817434


[400]	train's auc: 0.849272	valid's auc: 0.818672


[600]	train's auc: 0.86172	valid's auc: 0.818632


Early stopping, best iteration is:
[449]	train's auc: 0.852445	valid's auc: 0.818756


[2025-09-12 02:57:40] [LGB unseen] seed=7 fold=0 AUC: 0.818756 | best_iter=449


[2025-09-12 02:57:40] [LGB unseen] seed=7 ff=0.75 bf=0.8 | Fold 2/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.834947	valid's auc: 0.817887


[400]	train's auc: 0.849442	valid's auc: 0.818787


[600]	train's auc: 0.861681	valid's auc: 0.818603


Early stopping, best iteration is:
[443]	train's auc: 0.852165	valid's auc: 0.818871


[2025-09-12 02:58:35] [LGB unseen] seed=7 fold=1 AUC: 0.818871 | best_iter=443


[2025-09-12 02:58:36] [LGB unseen] seed=7 ff=0.75 bf=0.8 | Fold 3/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.834612	valid's auc: 0.821084


[400]	train's auc: 0.849287	valid's auc: 0.822104


[600]	train's auc: 0.861759	valid's auc: 0.822018


Early stopping, best iteration is:
[523]	train's auc: 0.857114	valid's auc: 0.822207


[2025-09-12 02:59:38] [LGB unseen] seed=7 fold=2 AUC: 0.822207 | best_iter=523


[2025-09-12 02:59:38] [LGB unseen] seed=7 ff=0.75 bf=0.8 | Fold 4/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.8349	valid's auc: 0.818259


[400]	train's auc: 0.849605	valid's auc: 0.819077


[600]	train's auc: 0.862083	valid's auc: 0.819009


Early stopping, best iteration is:
[447]	train's auc: 0.852592	valid's auc: 0.819156


[2025-09-12 03:00:34] [LGB unseen] seed=7 fold=3 AUC: 0.819156 | best_iter=447


[2025-09-12 03:00:34] [LGB unseen] seed=7 ff=0.75 bf=0.8 | Fold 5/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.834557	valid's auc: 0.820652


[400]	train's auc: 0.849382	valid's auc: 0.821347


Early stopping, best iteration is:
[310]	train's auc: 0.843275	valid's auc: 0.821419


[2025-09-12 03:01:21] [LGB unseen] seed=7 fold=4 AUC: 0.821419 | best_iter=310


[2025-09-12 03:01:21] [LGB unseen] seed=7 ff=0.75 bf=0.8 | Fold 6/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.835093	valid's auc: 0.816544


[400]	train's auc: 0.849683	valid's auc: 0.817378


[600]	train's auc: 0.862073	valid's auc: 0.817209


Early stopping, best iteration is:
[409]	train's auc: 0.850256	valid's auc: 0.817409


[2025-09-12 03:02:14] [LGB unseen] seed=7 fold=5 AUC: 0.817409 | best_iter=409


[2025-09-12 03:02:15] [LGB unseen] seed=7 ff=0.75 bf=0.8 | Fold 7/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.83482	valid's auc: 0.818826


[400]	train's auc: 0.849282	valid's auc: 0.819621


[600]	train's auc: 0.861694	valid's auc: 0.819575


Early stopping, best iteration is:
[524]	train's auc: 0.857182	valid's auc: 0.819649


[2025-09-12 03:03:17] [LGB unseen] seed=7 fold=6 AUC: 0.819649 | best_iter=524


[2025-09-12 03:03:18] [LGB unseen] seed=7 ff=0.75 bf=0.8 | Fold 8/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.834941	valid's auc: 0.818188


[400]	train's auc: 0.84939	valid's auc: 0.818935


[600]	train's auc: 0.861643	valid's auc: 0.818862


Early stopping, best iteration is:
[480]	train's auc: 0.854425	valid's auc: 0.819019


[2025-09-12 03:04:16] [LGB unseen] seed=7 fold=7 AUC: 0.819019 | best_iter=480


[2025-09-12 03:04:16] [LGB unseen] seed=7 ff=0.75 bf=0.8 | Fold 9/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.834728	valid's auc: 0.818491


[400]	train's auc: 0.849466	valid's auc: 0.819316


[600]	train's auc: 0.861893	valid's auc: 0.819349


Early stopping, best iteration is:
[489]	train's auc: 0.855123	valid's auc: 0.819422


[2025-09-12 03:05:21] [LGB unseen] seed=7 fold=8 AUC: 0.819422 | best_iter=489


[2025-09-12 03:05:22] [LGB unseen] seed=7 ff=0.75 bf=0.8 | Fold 10/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.834622	valid's auc: 0.819571


[400]	train's auc: 0.849199	valid's auc: 0.820403


Early stopping, best iteration is:
[361]	train's auc: 0.84663	valid's auc: 0.820483


[2025-09-12 03:06:12] [LGB unseen] seed=7 fold=9 AUC: 0.820483 | best_iter=361


[2025-09-12 03:06:13] [LGB unseen] seed=7 OOF AUC: 0.819607 | elapsed=569.1s


[2025-09-12 03:06:14] [LGB unseen] Saved OOF/test preds for u7


[2025-09-12 03:06:14] All TE-enhanced LGB unseen seeds finished.


[2025-09-12 03:06:14] Training XGBoost unseen (TE-enhanced features)...


[2025-09-12 03:06:14] [XGB unseen] Fold 1/10 | trn=720000 val=80000


[0]	train-auc:0.74132	valid-auc:0.74062


[200]	train-auc:0.82427	valid-auc:0.81315


[400]	train-auc:0.83180	valid-auc:0.81654


[600]	train-auc:0.83688	valid-auc:0.81738


[800]	train-auc:0.84141	valid-auc:0.81761


[957]	train-auc:0.84449	valid-auc:0.81762


[2025-09-12 03:06:56] [XGB unseen] Fold 2/10 | trn=720000 val=80000


[0]	train-auc:0.74174	valid-auc:0.74034


[200]	train-auc:0.82428	valid-auc:0.81485


[400]	train-auc:0.83165	valid-auc:0.81757


[600]	train-auc:0.83690	valid-auc:0.81813


[800]	train-auc:0.84126	valid-auc:0.81825


[1000]	train-auc:0.84531	valid-auc:0.81826


[1020]	train-auc:0.84569	valid-auc:0.81825


[2025-09-12 03:07:42] [XGB unseen] Fold 3/10 | trn=720000 val=80000


[0]	train-auc:0.74145	valid-auc:0.73491


[200]	train-auc:0.82358	valid-auc:0.81126


[400]	train-auc:0.83116	valid-auc:0.81538


[600]	train-auc:0.83616	valid-auc:0.81623


[800]	train-auc:0.84056	valid-auc:0.81658


[1000]	train-auc:0.84469	valid-auc:0.81663


[1195]	train-auc:0.84844	valid-auc:0.81656


[2025-09-12 03:08:34] [XGB unseen] Fold 4/10 | trn=720000 val=80000


[0]	train-auc:0.74181	valid-auc:0.73954


[200]	train-auc:0.82412	valid-auc:0.81432


[400]	train-auc:0.83139	valid-auc:0.81699


[600]	train-auc:0.83643	valid-auc:0.81749


[800]	train-auc:0.84095	valid-auc:0.81757


[1000]	train-auc:0.84515	valid-auc:0.81751


[1092]	train-auc:0.84685	valid-auc:0.81743


[2025-09-12 03:09:22] [XGB unseen] Fold 5/10 | trn=720000 val=80000


[0]	train-auc:0.74161	valid-auc:0.73918


[200]	train-auc:0.82405	valid-auc:0.81726


[400]	train-auc:0.83162	valid-auc:0.81958


[600]	train-auc:0.83654	valid-auc:0.81989


[784]	train-auc:0.84042	valid-auc:0.81992


[2025-09-12 03:09:57] [XGB unseen] Fold 6/10 | trn=720000 val=80000


[0]	train-auc:0.74185	valid-auc:0.73742


[200]	train-auc:0.82415	valid-auc:0.81297


[400]	train-auc:0.83170	valid-auc:0.81583


[600]	train-auc:0.83699	valid-auc:0.81641


[800]	train-auc:0.84151	valid-auc:0.81653


[944]	train-auc:0.84437	valid-auc:0.81646


[2025-09-12 03:10:39] [XGB unseen] Fold 7/10 | trn=720000 val=80000


[0]	train-auc:0.74088	valid-auc:0.73815


[200]	train-auc:0.82401	valid-auc:0.81542


[400]	train-auc:0.83174	valid-auc:0.81837


[600]	train-auc:0.83689	valid-auc:0.81887


[800]	train-auc:0.84121	valid-auc:0.81888


[2025-09-12 03:11:14] [XGB unseen] Fold 8/10 | trn=720000 val=80000


[0]	train-auc:0.74167	valid-auc:0.72995


[200]	train-auc:0.82380	valid-auc:0.79051


[400]	train-auc:0.83175	valid-auc:0.79289


[505]	train-auc:0.83445	valid-auc:0.79268


[2025-09-12 03:11:38] [XGB unseen] Fold 9/10 | trn=720000 val=80000


[0]	train-auc:0.74164	valid-auc:0.72770


[200]	train-auc:0.82424	valid-auc:0.79866


[400]	train-auc:0.83174	valid-auc:0.80207


[600]	train-auc:0.83669	valid-auc:0.80284


[800]	train-auc:0.84115	valid-auc:0.80309


[1000]	train-auc:0.84532	valid-auc:0.80313


[1052]	train-auc:0.84635	valid-auc:0.80318


[2025-09-12 03:12:24] [XGB unseen] Fold 10/10 | trn=720000 val=80000


[0]	train-auc:0.74164	valid-auc:0.73621


[200]	train-auc:0.82364	valid-auc:0.80522


[400]	train-auc:0.83135	valid-auc:0.80862


[600]	train-auc:0.83651	valid-auc:0.80943


[800]	train-auc:0.84098	valid-auc:0.80967


[908]	train-auc:0.84321	valid-auc:0.80964


[2025-09-12 03:13:04] [XGB unseen] OOF AUC: 0.812507


[2025-09-12 03:13:05] [XGB unseen] Saved OOF and test predictions.


In [30]:
# Cell 26: Blend TE-enhanced unseen models and assemble final submission
log('Blending TE-enhanced unseen models and assembling final submission...')
import pandas as pd, numpy as np, os, json
from sklearn.metrics import roc_auc_score

# 1) Load 6x LGB unseen OOF/test preds (TE-enhanced)
def load_oof(path):
    df = pd.read_csv(path)
    cols = [c for c in df.columns if c != 'id']
    return df[['id', cols[0]]].rename(columns={cols[0]: 'pred'})
def load_pred(path):
    df = pd.read_csv(path)
    cols = [c for c in df.columns if c != 'id']
    return df[['id', cols[0]]].rename(columns={cols[0]: 'pred'})

lgb_tags = ['u42','u1337','u2025','u101','u999','u7']
oofs = []; preds = []
for tag in lgb_tags:
    po = load_oof(f'oof_lgb_unseen_{tag}.csv')
    pt = load_pred(f'pred_lgb_unseen_{tag}.csv')
    oofs.append(po.rename(columns={'pred': f'oof_{tag}'}))
    preds.append(pt.rename(columns={'pred': f'pred_{tag}'}))

# Merge and average
oof_lgb = oofs[0]
for df in oofs[1:]:
    oof_lgb = oof_lgb.merge(df, on='id', how='inner')
pred_lgb = preds[0]
for df in preds[1:]:
    pred_lgb = pred_lgb.merge(df, on='id', how='inner')
lgb_oof_cols = [c for c in oof_lgb.columns if c != 'id']
lgb_pred_cols = [c for c in pred_lgb.columns if c != 'id']
oof_lgb['lgb_ens'] = oof_lgb[lgb_oof_cols].mean(axis=1).astype('float32')
pred_lgb['lgb_ens'] = pred_lgb[lgb_pred_cols].mean(axis=1).astype('float32')

# 2) Load XGB unseen OOF/test preds (TE-enhanced)
oof_xgb = load_oof('oof_xgb_unseen.csv')
pred_xgb = load_pred('pred_xgb_unseen.csv')

# 3) Tune blend on OOF (global, since no unseen-like rows exist in train)
gt = train[['id','target']].copy()
oof = gt.merge(oof_lgb[['id','lgb_ens']], on='id', how='left').merge(oof_xgb.rename(columns={'pred':'xgb'}), on='id', how='left')

def rank_norm(x):
    r = pd.Series(x).rank(method='average').values
    return (r - 1) / max(len(r) - 1, 1)

oof['lgb_r'] = rank_norm(oof['lgb_ens'])
oof['xgb_r'] = rank_norm(oof['xgb'])

best = {'auc': 0.0, 'mode': 'prob', 'w_lgb': 1.0}
weights = np.round(np.linspace(0.60, 0.95, 8), 2)  # 0.60..0.95
for w in weights:
    prob = w * oof['lgb_ens'] + (1 - w) * oof['xgb']
    auc_p = roc_auc_score(oof['target'], prob)
    if auc_p > best['auc']:
        best.update({'auc': float(auc_p), 'mode': 'prob', 'w_lgb': float(w)})
    rank = w * oof['lgb_r'] + (1 - w) * oof['xgb_r']
    auc_r = roc_auc_score(oof['target'], rank)
    if auc_r > best['auc']:
        best.update({'auc': float(auc_r), 'mode': 'rank', 'w_lgb': float(w)})
log(f"[TE-unseen blend] Best OOF AUC={best['auc']:.6f} | mode={best['mode']} | w_lgb={best['w_lgb']}")

# 4) Build blended unseen test predictions
sub_te = pd.read_csv('sample_submission.csv')[['id']].merge(pred_lgb[['id','lgb_ens']], on='id', how='left').merge(pred_xgb.rename(columns={'pred':'xgb'}), on='id', how='left')
if best['mode'] == 'prob':
    unseen_pred = best['w_lgb'] * sub_te['lgb_ens'].values + (1 - best['w_lgb']) * sub_te['xgb'].values
else:
    rl = rank_norm(sub_te['lgb_ens']); rx = rank_norm(sub_te['xgb'])
    unseen_pred = best['w_lgb'] * rl + (1 - best['w_lgb']) * rx
unseen_pred = unseen_pred.astype('float32')

# 5) f_27 identity map for seen rows
f27_target_map = train.groupby('f_27')['target'].first().to_dict()
test_seen_mask = test['f_27'].isin(f27_target_map).values
test_unseen_mask = ~test_seen_mask
preds_seen = test.loc[test_seen_mask, 'f_27'].map(f27_target_map).astype('float32').values

# 6) Final assembly
final = pd.read_csv('sample_submission.csv')[['id']].copy()
final['target'] = np.zeros(len(final), dtype='float32')
final.loc[test_seen_mask, 'target'] = preds_seen
final.loc[test_unseen_mask, 'target'] = unseen_pred[test_unseen_mask]
if final['target'].isna().any():
    raise ValueError('NaNs in final target after assembly')
final['target'] = final['target'].astype('float64').clip(0.0, 1.0)
final.to_csv('submission.csv', index=False)
final.to_csv('submission_te_unseen.csv', index=False)
log('Saved submission.csv using f_27 identity for seen and TE-enhanced blend for unseen.')

# Save meta
with open('blend_unseen_meta_te.json', 'w') as f:
    json.dump(best, f, indent=2)
log(f"Saved blend_unseen_meta_te.json: {best}")

[2025-09-12 03:13:44] Blending TE-enhanced unseen models and assembling final submission...


[2025-09-12 03:13:50] [TE-unseen blend] Best OOF AUC=0.820429 | mode=rank | w_lgb=0.95


[2025-09-12 03:13:53] Saved submission.csv using f_27 identity for seen and TE-enhanced blend for unseen.


[2025-09-12 03:13:53] Saved blend_unseen_meta_te.json: {'auc': 0.8204291485137813, 'mode': 'rank', 'w_lgb': 0.95}


In [31]:
# Cell 27: Guardrails and assertions for f_27 identity map and submission integrity
log('Running guardrails: f_27 purity checks and submission integrity...')
import pandas as pd, numpy as np, json, os

# 1) f_27 -> target purity checks
grp = train.groupby('f_27')['target']
nuniq = grp.nunique()
max_nuniq = int(nuniq.max())
conflicts = int((nuniq > 1).sum())
log(f'f_27 groups: total={len(nuniq)}, max target nunique={max_nuniq}, conflicts={conflicts}')
if max_nuniq > 1 or conflicts > 0:
    # Resolve by majority (mean >= 0.5 -> 1 else 0)
    means = grp.mean()
    f27_map = (means >= 0.5).astype('int8').to_dict()
    log(f'Resolved {conflicts} conflicting f_27 groups by majority vote.')
else:
    # Pure mapping: any aggregation works; use .first() for speed
    f27_map = grp.first().astype('int8').to_dict()

# 2) Seen/unseen masks and coverage
test_seen_mask = test['f_27'].isin(f27_map).values
seen_cnt = int(test_seen_mask.sum())
unseen_cnt = int((~test_seen_mask).sum())
log(f'Seen coverage check -> seen={seen_cnt}, unseen={unseen_cnt}')
preds_seen = test.loc[test_seen_mask, 'f_27'].map(f27_map)
assert preds_seen.notna().all(), 'NaNs found in seen predictions from identity map'

# 3) Submission integrity
assert os.path.exists('submission.csv'), 'submission.csv missing'
sub = pd.read_csv('submission.csv')
assert list(sub.columns) == ['id','target'], f'Bad submission columns: {list(sub.columns)}'
assert len(sub) == len(test), f'Submission length {len(sub)} != test length {len(test)}'
assert not sub['target'].isna().any(), 'NaNs in submission target'
assert np.isfinite(sub['target']).all(), 'Non-finite values in submission target'
assert sub['target'].between(0.0, 1.0).all(), 'Submission target out of [0,1] bounds'

# 4) If desired, strictly align and resave (idempotent) to be safe
ss = pd.read_csv('sample_submission.csv')[['id']]
sub_aligned = ss.merge(sub, on='id', how='left')
missing = int(sub_aligned['target'].isna().sum())
assert missing == 0, f'Missing predictions for {missing} rows after alignment'
sub_aligned['target'] = sub_aligned['target'].astype('float64').clip(0.0, 1.0)
sub_aligned.to_csv('submission.csv', index=False)
log('Guardrails passed. submission.csv verified and re-saved strictly aligned.')

[2025-09-12 03:16:56] Running guardrails: f_27 purity checks and submission integrity...


[2025-09-12 03:16:57] f_27 groups: total=670919, max target nunique=2, conflicts=41748


[2025-09-12 03:16:57] Resolved 41748 conflicting f_27 groups by majority vote.


[2025-09-12 03:16:58] Seen coverage check -> seen=28435, unseen=71565


[2025-09-12 03:16:58] Guardrails passed. submission.csv verified and re-saved strictly aligned.


In [32]:
# Cell 28: Refined blend + optional meta-stacker for TE-unseen, assemble submission
log('Refining TE-unseen blend (fine grid + optional stacker) and assembling submission...')
import pandas as pd, numpy as np, json, os
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.model_selection import StratifiedKFold

def load_oof(path):
    df = pd.read_csv(path); cols = [c for c in df.columns if c != 'id'];
    return df[['id', cols[0]]].rename(columns={cols[0]: 'pred'})
def load_pred(path):
    df = pd.read_csv(path); cols = [c for c in df.columns if c != 'id'];
    return df[['id', cols[0]]].rename(columns={cols[0]: 'pred'})

# Load unseen OOF/test blocks
lgb_tags = ['u42','u1337','u2025','u101','u999','u7']
oofs = []; preds = []
for tag in lgb_tags:
    oofs.append(load_oof(f'oof_lgb_unseen_{tag}.csv').rename(columns={'pred': f'oof_{tag}'}))
    preds.append(load_pred(f'pred_lgb_unseen_{tag}.csv').rename(columns={'pred': f'pred_{tag}'}))
oof_lgb = oofs[0]
for df in oofs[1:]: oof_lgb = oof_lgb.merge(df, on='id', how='inner')
pred_lgb = preds[0]
for df in preds[1:]: pred_lgb = pred_lgb.merge(df, on='id', how='inner')
lgb_oof_cols = [c for c in oof_lgb.columns if c != 'id']
lgb_pred_cols = [c for c in pred_lgb.columns if c != 'id']
oof_lgb['lgb_ens'] = oof_lgb[lgb_oof_cols].mean(axis=1).astype('float32')
pred_lgb['lgb_ens'] = pred_lgb[lgb_pred_cols].mean(axis=1).astype('float32')

oof_xgb = load_oof('oof_xgb_unseen.csv').rename(columns={'pred':'xgb'})
pred_xgb = load_pred('pred_xgb_unseen.csv').rename(columns={'pred':'xgb'})

gt = train[['id','target']].copy()
oof = gt.merge(oof_lgb[['id','lgb_ens']], on='id', how='left').merge(oof_xgb, on='id', how='left')

def rank_norm(x):
    r = pd.Series(x).rank(method='average').values
    return (r - 1) / max(len(r) - 1, 1)

oof['lgb_r'] = rank_norm(oof['lgb_ens']); oof['xgb_r'] = rank_norm(oof['xgb'])

# Fine grid search in [0.90..0.98] step 0.01 for prob and rank
best = {'auc': 0.0, 'mode': 'prob', 'w_lgb': 1.0}
weights = np.round(np.arange(0.90, 0.981, 0.01), 2)
for w in weights:
    prob = w * oof['lgb_ens'] + (1 - w) * oof['xgb']
    auc_p = roc_auc_score(oof['target'], prob)
    if auc_p > best['auc']:
        best.update({'auc': float(auc_p), 'mode': 'prob', 'w_lgb': float(w)})
    rank = w * oof['lgb_r'] + (1 - w) * oof['xgb_r']
    auc_r = roc_auc_score(oof['target'], rank)
    if auc_r > best['auc']:
        best.update({'auc': float(auc_r), 'mode': 'rank', 'w_lgb': float(w)})
log(f"[Refine] Best OOF AUC={best['auc']:.6f} | mode={best['mode']} | w_lgb={best['w_lgb']}")

# Optional meta-stacker (logistic on [lgb_ens, xgb]); 10-fold out-of-fold to avoid bias
X_meta = oof[['lgb_ens','xgb']].values.astype('float32')
y_meta = oof['target'].values.astype('int8')
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
oof_stack = np.zeros(len(oof), dtype='float32')
test_stack = np.zeros(len(pred_lgb), dtype='float32')
models = []
for fold, (trn_idx, val_idx) in enumerate(skf.split(X_meta, y_meta), 1):
    Xtr, ytr = X_meta[trn_idx], y_meta[trn_idx]
    Xvl = X_meta[val_idx]
    clf = LogisticRegression(max_iter=1000, solver='lbfgs')
    clf.fit(Xtr, ytr)
    oof_stack[val_idx] = clf.predict_proba(Xvl)[:,1].astype('float32')
    models.append(clf)
auc_stack = roc_auc_score(y_meta, oof_stack)
log(f'[Stacker] Logistic meta OOF AUC={auc_stack:.6f}')
if auc_stack > best['auc'] + 0.0005:
    # Build test stack prediction via average of fold models
    X_test_meta = pd.read_csv('sample_submission.csv')[['id']].merge(pred_lgb[['id','lgb_ens']], on='id', how='left').merge(pred_xgb, on='id', how='left')
    Xt = X_test_meta[['lgb_ens','xgb']].values.astype('float32')
    for m in models:
        test_stack += (m.predict_proba(Xt)[:,1].astype('float32') / len(models))
    chosen = {'mode': 'stack', 'auc': float(auc_stack)}
else:
    chosen = best.copy()

# Build unseen test predictions according to chosen strategy
sub_te = pd.read_csv('sample_submission.csv')[['id']].merge(pred_lgb[['id','lgb_ens']], on='id', how='left').merge(pred_xgb, on='id', how='left')
if chosen.get('mode') == 'stack':
    unseen_pred = test_stack.astype('float32')
else:
    if chosen['mode'] == 'prob':
        unseen_pred = chosen['w_lgb'] * sub_te['lgb_ens'].values + (1 - chosen['w_lgb']) * sub_te['xgb'].values
    else:
        rl = rank_norm(sub_te['lgb_ens']); rx = rank_norm(sub_te['xgb'])
        unseen_pred = chosen['w_lgb'] * rl + (1 - chosen['w_lgb']) * rx
    unseen_pred = unseen_pred.astype('float32')

# Seen mapping with majority resolution guardrail
grp = train.groupby('f_27')['target']
means = grp.mean()
f27_map = (means >= 0.5).astype('int8').to_dict()
test_seen_mask = test['f_27'].isin(f27_map).values
test_unseen_mask = ~test_seen_mask
preds_seen = test.loc[test_seen_mask, 'f_27'].map(f27_map).astype('float32').values

# Final assembly
final = pd.read_csv('sample_submission.csv')[['id']].copy()
final['target'] = np.zeros(len(final), dtype='float32')
final.loc[test_seen_mask, 'target'] = preds_seen
final.loc[test_unseen_mask, 'target'] = unseen_pred[test_unseen_mask]
if final['target'].isna().any():
    raise ValueError('NaNs in final target after assembly')
final['target'] = final['target'].astype('float64').clip(0.0, 1.0)
final.to_csv('submission.csv', index=False)
final.to_csv('submission_te_unseen_refined.csv', index=False)
meta = {'chosen': chosen, 'grid_best': best}
with open('blend_unseen_meta_te_refined.json','w') as f: json.dump(meta, f, indent=2)
log(f"Saved submission.csv and metadata. Chosen={chosen}")

[2025-09-12 03:18:08] Refining TE-unseen blend (fine grid + optional stacker) and assembling submission...


[2025-09-12 03:18:15] [Refine] Best OOF AUC=0.820443 | mode=rank | w_lgb=0.98


[2025-09-12 03:18:26] [Stacker] Logistic meta OOF AUC=0.820417


[2025-09-12 03:18:28] Saved submission.csv and metadata. Chosen={'auc': 0.8204432246955218, 'mode': 'rank', 'w_lgb': 0.98}


In [33]:
# Cell 29: Pseudo-unseen validation (hold out 20% unique f_27 strings), TE-fit on subset, evaluate unseen AUC
log('Starting pseudo-unseen validation: masking 20% unique f_27, fitting TE on remaining, evaluating held-out...')
import numpy as np, pandas as pd, time, gc, os
from sklearn.metrics import roc_auc_score

rng = np.random.default_rng(42)
s_tr = train['f_27'].astype(str).values
y_tr = train['target'].values.astype('float32')

# 1) Build pseudo-unseen split by f_27 identity
uniq = pd.unique(s_tr)
n_mask = max(1, int(0.20 * len(uniq)))
mask_ids = set(rng.choice(uniq, size=n_mask, replace=False))
holdout_mask = np.array([x in mask_ids for x in s_tr], dtype=bool)
fit_mask = ~holdout_mask
log(f'Pseudo-unseen split: fit_n={int(fit_mask.sum())} | holdout_n={int(holdout_mask.sum())} | uniq_total={len(uniq)} | uniq_masked={len(mask_ids)}')

# 2) Prepare positional tokens
pos_char_tr = np.stack([train[f'f_27_pos_{i}'].astype('int16').values for i in range(10)], axis=1)

def make_bigrams(arr):
    n = arr.shape[0]
    out = [None]*9
    for i in range(9):
        out[i] = np.fromiter((row[i:i+2] for row in arr), count=n, dtype=object)
    return out
bg_tr = make_bigrams(s_tr)

def f27_nunique(arr):
    return np.fromiter((len(set(list(x))) for x in arr), count=len(arr), dtype=np.int16)
f27_nuniq_tr = f27_nunique(s_tr)

# 3) Fit TE maps on fit subset only, apply to fit/holdout
prior = float(train.loc[fit_mask, 'target'].mean())

def fit_te_map(keys_fit, y_fit, m, prior):
    df = pd.DataFrame({'k': keys_fit, 'y': y_fit})
    grp = df.groupby('k')
    cnt = grp['y'].size()
    sumy = grp['y'].sum()
    te = ((sumy + m * prior) / (cnt + m)).astype('float32')
    return te, cnt.astype('int32')

# Positional chars (m=30)
te_pos_char_fit = np.zeros((fit_mask.sum(), 10), dtype=np.float32)
te_pos_char_hold = np.zeros((holdout_mask.sum(), 10), dtype=np.float32)
te_pos_char_logcnt_fit = np.zeros_like(te_pos_char_fit)
te_pos_char_logcnt_hold = np.zeros_like(te_pos_char_hold)
for i in range(10):
    keys_fit = pd.Series([f'{i}|{int(t)}' for t in pos_char_tr[fit_mask, i]])
    te_map, cnt_map = fit_te_map(keys_fit.values, y_tr[fit_mask], m=30.0, prior=prior)
    # transform
    k_fit = pd.Series([f'{i}|{int(t)}' for t in pos_char_tr[fit_mask, i]])
    k_hld = pd.Series([f'{i}|{int(t)}' for t in pos_char_tr[holdout_mask, i]])
    te_pos_char_fit[:, i] = k_fit.map(te_map).astype('float32').fillna(prior).values
    te_pos_char_hold[:, i] = k_hld.map(te_map).astype('float32').fillna(prior).values
    te_pos_char_logcnt_fit[:, i] = np.log1p(k_fit.map(cnt_map).fillna(0).astype('int32').values).astype('float32')
    te_pos_char_logcnt_hold[:, i] = np.log1p(k_hld.map(cnt_map).fillna(0).astype('int32').values).astype('float32')

# Positional bigrams (m=100)
te_pos_bg_fit = np.zeros((fit_mask.sum(), 9), dtype=np.float32)
te_pos_bg_hold = np.zeros((holdout_mask.sum(), 9), dtype=np.float32)
te_pos_bg_logcnt_fit = np.zeros_like(te_pos_bg_fit)
te_pos_bg_logcnt_hold = np.zeros_like(te_pos_bg_hold)
for i in range(9):
    keys_fit = pd.Series([f'{i}|{bg}' for bg in bg_tr[i][fit_mask]])
    te_map, cnt_map = fit_te_map(keys_fit.values, y_tr[fit_mask], m=100.0, prior=prior)
    k_fit = pd.Series([f'{i}|{bg}' for bg in bg_tr[i][fit_mask]])
    k_hld = pd.Series([f'{i}|{bg}' for bg in bg_tr[i][holdout_mask]])
    te_pos_bg_fit[:, i] = k_fit.map(te_map).astype('float32').fillna(prior).values
    te_pos_bg_hold[:, i] = k_hld.map(te_map).astype('float32').fillna(prior).values
    te_pos_bg_logcnt_fit[:, i] = np.log1p(k_fit.map(cnt_map).fillna(0).astype('int32').values).astype('float32')
    te_pos_bg_logcnt_hold[:, i] = np.log1p(k_hld.map(cnt_map).fillna(0).astype('int32').values).astype('float32')

# f27_nunique (m=50)
keys_nu_fit = pd.Series(f27_nuniq_tr[fit_mask].astype('int16'))
te_map_nu, cnt_map_nu = fit_te_map(keys_nu_fit.values, y_tr[fit_mask], m=50.0, prior=prior)
k_fit_nu = pd.Series(f27_nuniq_tr[fit_mask].astype('int16'))
k_hld_nu = pd.Series(f27_nuniq_tr[holdout_mask].astype('int16'))
nu_te_fit = k_fit_nu.map(te_map_nu).astype('float32').fillna(prior).values
nu_te_hold = k_hld_nu.map(te_map_nu).astype('float32').fillna(prior).values
nu_log_fit = np.log1p(k_fit_nu.map(cnt_map_nu).fillna(0).astype('int32').values).astype('float32')
nu_log_hold = np.log1p(k_hld_nu.map(cnt_map_nu).fillna(0).astype('int32').values).astype('float32')

# 4) Target-free frequencies (train+test pooled) - reuse existing approach
N_all = float(len(train) + len(test))
def freq_map(series_all):
    vc = pd.Series(series_all).value_counts()
    return vc
freq_pos_char_fit = np.zeros((fit_mask.sum(), 10), dtype=np.float32)
freq_pos_char_hold = np.zeros((holdout_mask.sum(), 10), dtype=np.float32)
for i in range(10):
    all_keys = np.array([f'{i}|{int(t)}' for t in np.concatenate([train[f'f_27_pos_{i}'].astype('int16').values, test[f'f_27_pos_{i}'].astype('int16').values])])
    vc = pd.Series(all_keys).value_counts()
    kf = pd.Series([f'{i}|{int(t)}' for t in pos_char_tr[fit_mask, i]])
    kh = pd.Series([f'{i}|{int(t)}' for t in pos_char_tr[holdout_mask, i]])
    freq_pos_char_fit[:, i] = (kf.map(vc).fillna(0).values.astype('float32') / N_all)
    freq_pos_char_hold[:, i] = (kh.map(vc).fillna(0).values.astype('float32') / N_all)

freq_pos_bg_fit = np.zeros((fit_mask.sum(), 9), dtype=np.float32)
freq_pos_bg_hold = np.zeros((holdout_mask.sum(), 9), dtype=np.float32)
for i in range(9):
    all_keys = np.array([f'{i}|{bg}' for bg in np.concatenate([make_bigrams(train['f_27'].astype(str).values)[i], make_bigrams(test['f_27'].astype(str).values)[i]])])
    vc = pd.Series(all_keys).value_counts()
    kf = pd.Series([f'{i}|{bg}' for bg in bg_tr[i][fit_mask]])
    kh = pd.Series([f'{i}|{bg}' for bg in bg_tr[i][holdout_mask]])
    freq_pos_bg_fit[:, i] = (kf.map(vc).fillna(0).values.astype('float32') / N_all)
    freq_pos_bg_hold[:, i] = (kh.map(vc).fillna(0).values.astype('float32') / N_all)

all_full = pd.Series(np.concatenate([train['f_27'].astype(str).values, test['f_27'].astype(str).values]))
vc_full = all_full.value_counts()
freq_full_fit = pd.Series(s_tr[fit_mask]).map(vc_full).fillna(0).values.astype('float32') / N_all
freq_full_hold = pd.Series(s_tr[holdout_mask]).map(vc_full).fillna(0).values.astype('float32') / N_all

# 5) Assemble feature matrices for fit/holdout using current TE block + compact block
comp_fit = pd.DataFrame(comp_train.loc[fit_mask].reset_index(drop=True))
comp_hold = pd.DataFrame(comp_train.loc[holdout_mask].reset_index(drop=True))
def assemble_block(te_pc, te_pc_lc, te_bg, te_bg_lc, nu_te, nu_log, f_pc, f_bg, f_full, idx_len):
    cols = {}
    for i in range(10): cols[f'TE_pos_char_{i}_mean'] = te_pc[:, i]
    for i in range(9): cols[f'TE_pos_bigram_{i}_mean'] = te_bg[:, i]
    for i in range(10): cols[f'TE_pos_char_{i}_logcnt'] = te_pc_lc[:, i]
    for i in range(9): cols[f'TE_pos_bigram_{i}_logcnt'] = te_bg_lc[:, i]
    cols['TE_f27_nunique_mean'] = nu_te
    cols['TE_f27_nunique_logcnt'] = nu_log
    for i in range(10): cols[f'FREQ_pos_char_{i}'] = f_pc[:, i]
    for i in range(9): cols[f'FREQ_pos_bigram_{i}'] = f_bg[:, i]
    cols['FREQ_full_string'] = f_full
    return pd.DataFrame(cols, index=np.arange(idx_len))

te_fit_df = assemble_block(te_pos_char_fit, te_pos_char_logcnt_fit, te_pos_bg_fit, te_pos_bg_logcnt_fit, nu_te_fit, nu_log_fit, freq_pos_char_fit, freq_pos_bg_fit, freq_full_fit, fit_mask.sum())
te_hold_df = assemble_block(te_pos_char_hold, te_pos_char_logcnt_hold, te_pos_bg_hold, te_pos_bg_logcnt_hold, nu_te_hold, nu_log_hold, freq_pos_char_hold, freq_pos_bg_hold, freq_full_hold, holdout_mask.sum())

X_fit = pd.concat([comp_fit.reset_index(drop=True), te_fit_df.reset_index(drop=True)], axis=1)
X_hold = pd.concat([comp_hold.reset_index(drop=True), te_hold_df.reset_index(drop=True)], axis=1)
y_fit = y_tr[fit_mask]
y_hold = y_tr[holdout_mask]
log(f'Feature blocks ready: X_fit={X_fit.shape}, X_hold={X_hold.shape}')

# 6) Train a quick LightGBM on X_fit and evaluate on X_hold (pseudo-unseen AUC)
import lightgbm as lgb, time
dtr = lgb.Dataset(X_fit, label=y_fit)
params = {
    'objective': 'binary', 'metric': 'auc', 'boosting_type': 'gbdt',
    'learning_rate': 0.045, 'num_leaves': 224, 'max_depth': -1, 'min_data_in_leaf': 240,
    'feature_fraction': 0.78, 'bagging_fraction': 0.78, 'bagging_freq': 1, 'lambda_l1': 0.0, 'lambda_l2': 4.0,
    'max_bin': 255, 'verbose': -1, 'n_jobs': -1, 'seed': 42
}
t0 = time.time()
model = lgb.train(params, dtr, num_boost_round=2000)
pred_hold = model.predict(X_hold)
auc_hold = roc_auc_score(y_hold, pred_hold)
log(f'[Pseudo-unseen] Holdout AUC={auc_hold:.6f} | train_time={time.time()-t0:.1f}s | fit_n={len(y_fit)} holdout_n={len(y_hold)}')

# Save for reference
pd.DataFrame({'id': train.loc[holdout_mask, 'id'].values, 'oof_pseudo_unseen': pred_hold}).to_csv('oof_pseudo_unseen_holdout.csv', index=False)
with open('pseudo_unseen_meta.json','w') as f:
    json.dump({'holdout_auc': float(auc_hold), 'fit_n': int(len(y_fit)), 'holdout_n': int(len(y_hold))}, f)
gc.collect();

[2025-09-12 03:20:34] Starting pseudo-unseen validation: masking 20% unique f_27, fitting TE on remaining, evaluating held-out...


[2025-09-12 03:20:35] Pseudo-unseen split: fit_n=639929 | holdout_n=160071 | uniq_total=670919 | uniq_masked=134183


[2025-09-12 03:21:18] Feature blocks ready: X_fit=(639929, 71), X_hold=(160071, 71)


[2025-09-12 03:22:28] [Pseudo-unseen] Holdout AUC=0.813502 | train_time=70.1s | fit_n=639929 holdout_n=160071


In [34]:
# Cell 30: Augment TE features for unseen model (trigrams, signatures, non-adj pairs) + target-free patterns; rebuild X_unseen_tr/te
log('Building augmented TE features for unseen model (trigrams, signatures, non-adj pairs) and target-free patterns...')
import numpy as np, pandas as pd, time, gc

t0 = time.time()
assert 'f_27' in train.columns and 'target' in train.columns, 'Missing f_27/target'
folds_df = pd.read_csv('folds_10fold_seed42.csv')
folds_arr = folds_df['fold'].values.astype('int16')
n_splits = int(np.unique(folds_arr).size)
prior = float(train['target'].mean())

# Strings
s_tr = train['f_27'].astype(str).values
s_te = test['f_27'].astype(str).values

# Positional char int tokens (10)
pos_char_tr = np.stack([train[f'f_27_pos_{i}'].astype('int16').values for i in range(10)], axis=1)
pos_char_te = np.stack([test[f'f_27_pos_{i}'].astype('int16').values for i in range(10)], axis=1)

# Bigrams arrays (string), for reuse if needed
def make_bigrams(arr):
    n = arr.shape[0]
    out = [None]*9
    for i in range(9):
        out[i] = np.fromiter((row[i:i+2] for row in arr), count=n, dtype=object)
    return out
bg_tr = make_bigrams(s_tr)
bg_te = make_bigrams(s_te)

# Trigrams arrays (string), i=0..7
def make_trigrams(arr):
    n = arr.shape[0]
    out = [None]*8
    for i in range(8):
        out[i] = np.fromiter((row[i:i+3] for row in arr), count=n, dtype=object)
    return out
tri_tr = make_trigrams(s_tr)
tri_te = make_trigrams(s_te)

# Helper signatures
def run_length_signature(s):
    # returns tuple of consecutive equal run lengths, e.g., 'AAABB' -> (3,2)
    res = []
    cur = 1
    for i in range(1, len(s)):
        if s[i] == s[i-1]: cur += 1
        else:
            res.append(cur); cur = 1
    res.append(cur)
    return tuple(res)

def count_hist_signature(s):
    # multiset of per-char counts sorted, e.g., 'AAABBC' -> (3,2,1)
    from collections import Counter
    cnt = Counter(s)
    return tuple(sorted(cnt.values(), reverse=True))

def majority_char_with_count(s):
    from collections import Counter
    cnt = Counter(s)
    ch, c = max(cnt.items(), key=lambda kv: kv[1])
    return f'{ch}|{c}'

# Non-adjacent symmetric pairs around center
pairs = [(0,9),(1,8),(2,7),(3,6),(4,5)]

# Helper: OOF TE by key with smoothing m; returns oof_mean, oof_logcnt, test_mean, test_logcnt
def oof_te_by_key(keys_tr, y, keys_te, m, prior):
    keys_tr = pd.Series(keys_tr)
    oof_vals = np.zeros(len(keys_tr), dtype=np.float32)
    oof_logcnt = np.zeros(len(keys_tr), dtype=np.float32)
    for fold in range(n_splits):
        trn_idx = np.where(folds_arr != fold)[0]
        val_idx = np.where(folds_arr == fold)[0]
        k_tr = keys_tr.iloc[trn_idx]
        y_tr = y[trn_idx]
        grp = pd.DataFrame({'k': k_tr.values, 'y': y_tr}).groupby('k')
        cnt = grp['y'].size()
        sumy = grp['y'].sum()
        te_map = ((sumy + m * prior) / (cnt + m)).astype('float32')
        lc_map = cnt.astype('int32')
        kval = keys_tr.iloc[val_idx]
        te = kval.map(te_map).astype('float32')
        lc = kval.map(lc_map).fillna(0).astype('int32')
        te = te.fillna(prior).values
        oof_vals[val_idx] = te
        oof_logcnt[val_idx] = np.log1p(lc.values).astype('float32')
    # Full fit for test
    grp_full = pd.DataFrame({'k': keys_tr.values, 'y': y}).groupby('k')
    cnt_full = grp_full['y'].size()
    sumy_full = grp_full['y'].sum()
    te_map_full = ((sumy_full + m * prior) / (cnt_full + m)).astype('float32')
    lc_full = cnt_full.astype('int32')
    keys_te = pd.Series(keys_te)
    te_te = keys_te.map(te_map_full).astype('float32').fillna(prior).values
    lc_te = np.log1p(keys_te.map(lc_full).fillna(0).astype('int32').values).astype('float32')
    return oof_vals, oof_logcnt, te_te, lc_te

y_tr = train['target'].values.astype('float32')

# 1) Recompute existing TEs with adjusted smoothing (pos-char m~25, bigram m~70)
te_pos_char_tr_mean = np.zeros((len(train), 10), dtype=np.float32)
te_pos_char_tr_log = np.zeros((len(train), 10), dtype=np.float32)
te_pos_char_te_mean = np.zeros((len(test), 10), dtype=np.float32)
te_pos_char_te_log = np.zeros((len(test), 10), dtype=np.float32)
for i in range(10):
    k_tr = np.array([f'{i}|{int(t)}' for t in pos_char_tr[:, i]], dtype=object)
    k_te = np.array([f'{i}|{int(t)}' for t in pos_char_te[:, i]], dtype=object)
    o_m, o_l, t_m, t_l = oof_te_by_key(k_tr, y_tr, k_te, m=25.0, prior=prior)
    te_pos_char_tr_mean[:, i] = o_m; te_pos_char_tr_log[:, i] = o_l
    te_pos_char_te_mean[:, i] = t_m; te_pos_char_te_log[:, i] = t_l

te_pos_bg_tr_mean = np.zeros((len(train), 9), dtype=np.float32)
te_pos_bg_tr_log = np.zeros((len(train), 9), dtype=np.float32)
te_pos_bg_te_mean = np.zeros((len(test), 9), dtype=np.float32)
te_pos_bg_te_log = np.zeros((len(test), 9), dtype=np.float32)
for i in range(9):
    k_tr = np.array([f'{i}|{bg}' for bg in bg_tr[i]], dtype=object)
    k_te = np.array([f'{i}|{bg}' for bg in bg_te[i]], dtype=object)
    o_m, o_l, t_m, t_l = oof_te_by_key(k_tr, y_tr, k_te, m=70.0, prior=prior)
    te_pos_bg_tr_mean[:, i] = o_m; te_pos_bg_tr_log[:, i] = o_l
    te_pos_bg_te_mean[:, i] = t_m; te_pos_bg_te_log[:, i] = t_l

# f27_nunique TE (m=50 unchanged)
def f27_nunique(arr):
    return np.fromiter((len(set(list(x))) for x in arr), count=len(arr), dtype=np.int16)
f27_nu_tr = f27_nunique(s_tr)
f27_nu_te = f27_nunique(s_te)
k_tr_nu = f27_nu_tr.astype('int16')
k_te_nu = f27_nu_te.astype('int16')
o_m, o_l, t_m, t_l = oof_te_by_key(k_tr_nu, y_tr, k_te_nu, m=50.0, prior=prior)
nu_te_tr_mean = o_m; nu_te_tr_log = o_l
nu_te_te_mean = t_m; nu_te_te_log = t_l

# 2) Positional trigrams TE (critical): i=0..7, m=180–200
te_tri_tr_mean = np.zeros((len(train), 8), dtype=np.float32)
te_tri_tr_log = np.zeros((len(train), 8), dtype=np.float32)
te_tri_te_mean = np.zeros((len(test), 8), dtype=np.float32)
te_tri_te_log = np.zeros((len(test), 8), dtype=np.float32)
for i in range(8):
    ktr = np.array([f'{i}|{tri}' for tri in tri_tr[i]], dtype=object)
    kte = np.array([f'{i}|{tri}' for tri in tri_te[i]], dtype=object)
    o_m, o_l, t_m, t_l = oof_te_by_key(ktr, y_tr, kte, m=180.0, prior=prior)
    te_tri_tr_mean[:, i] = o_m; te_tri_tr_log[:, i] = o_l
    te_tri_te_mean[:, i] = t_m; te_tri_te_log[:, i] = t_l

# 3) Count-hist signature TE (sorted multiset of char counts), m ~ 110
k_tr_hist = np.fromiter((str(count_hist_signature(s)) for s in s_tr), count=len(s_tr), dtype=object)
k_te_hist = np.fromiter((str(count_hist_signature(s)) for s in s_te), count=len(s_te), dtype=object)
hist_tr_mean, hist_tr_log, hist_te_mean, hist_te_log = oof_te_by_key(k_tr_hist, y_tr, k_te_hist, m=110.0, prior=prior)

# 4) Run-length signature TE, m ~ 70
k_tr_run = np.fromiter((str(run_length_signature(s)) for s in s_tr), count=len(s_tr), dtype=object)
k_te_run = np.fromiter((str(run_length_signature(s)) for s in s_te), count=len(s_te), dtype=object)
run_tr_mean, run_tr_log, run_te_mean, run_te_log = oof_te_by_key(k_tr_run, y_tr, k_te_run, m=70.0, prior=prior)

# 5) Selected non-adjacent symmetric pairs TE, m ~ 100
pair_keys_tr = {}
pair_keys_te = {}
for (i,j) in pairs:
    pair_keys_tr[(i,j)] = np.fromiter((f'{i}_{j}|{row[i]}{row[j]}' for row in s_tr), count=len(s_tr), dtype=object)
    pair_keys_te[(i,j)] = np.fromiter((f'{i}_{j}|{row[i]}{row[j]}' for row in s_te), count=len(s_te), dtype=object)
pair_tr_mean = {}; pair_tr_log = {}; pair_te_mean = {}; pair_te_log = {}
for (i,j), ktr in pair_keys_tr.items():
    kte = pair_keys_te[(i,j)]
    o_m, o_l, t_m, t_l = oof_te_by_key(ktr, y_tr, kte, m=100.0, prior=prior)
    pair_tr_mean[(i,j)] = o_m; pair_tr_log[(i,j)] = o_l
    pair_te_mean[(i,j)] = t_m; pair_te_log[(i,j)] = t_l

# 6) Optional: Majority-char with count TE, m ~ 60
k_tr_maj = np.fromiter((majority_char_with_count(s) for s in s_tr), count=len(s_tr), dtype=object)
k_te_maj = np.fromiter((majority_char_with_count(s) for s in s_te), count=len(s_te), dtype=object)
maj_tr_mean, maj_tr_log, maj_te_mean, maj_te_log = oof_te_by_key(k_tr_maj, y_tr, k_te_maj, m=60.0, prior=prior)

# Target-free frequencies (keep from previous block; recompute minimal essentials)
N_all = float(len(train) + len(test))
freq_full_tr = pd.Series(s_tr).map(pd.Series(np.concatenate([s_tr, s_te])).value_counts()).fillna(0).values.astype('float32') / N_all
freq_full_te = pd.Series(s_te).map(pd.Series(np.concatenate([s_tr, s_te])).value_counts()).fillna(0).values.astype('float32') / N_all

# Target-free pattern features
def position_of_first_repeat(s):
    for i in range(1, len(s)):
        if s[i] == s[i-1]: return i
    return 10
def alternating_score(s):
    diffs = sum(1 for i in range(1, len(s)) if s[i] != s[i-1])
    return diffs / 9.0
def same_diff_counts(s):
    same = sum(1 for i in range(1, len(s)) if s[i] == s[i-1])
    diff = 9 - same
    return same, diff

pos_first_rep_tr = np.fromiter((position_of_first_repeat(x) for x in s_tr), count=len(s_tr), dtype=np.int16)
pos_first_rep_te = np.fromiter((position_of_first_repeat(x) for x in s_te), count=len(s_te), dtype=np.int16)
alt_score_tr = np.fromiter((alternating_score(x) for x in s_tr), count=len(s_tr), dtype=np.float32)
alt_score_te = np.fromiter((alternating_score(x) for x in s_te), count=len(s_te), dtype=np.float32)
same_tr = np.zeros(len(s_tr), dtype=np.int8); diff_tr = np.zeros(len(s_tr), dtype=np.int8)
same_te = np.zeros(len(s_te), dtype=np.int8); diff_te = np.zeros(len(s_te), dtype=np.int8)
for idx, x in enumerate(s_tr):
    a,b = same_diff_counts(x); same_tr[idx] = a; diff_tr[idx] = b
for idx, x in enumerate(s_te):
    a,b = same_diff_counts(x); same_te[idx] = a; diff_te[idx] = b
num_runs_tr = (diff_tr + 1).astype('int8')
num_runs_te = (diff_te + 1).astype('int8')

# Include existing robust f_27 summaries if present
add_exist_cols = {}
for col in ['f_27_longest_run','f_27_transitions','f27_entropy','f27_first_last_same','f27_pal_matches','f27_majority_cnt','f27_majority_idx']:
    if col in train.columns and col in test.columns:
        add_exist_cols[col+'_tr'] = train[col].values
        add_exist_cols[col+'_te'] = test[col].values

# Assemble TE/freq/derived dataframes
te_cols_tr = {}
te_cols_te = {}
# Existing adjusted TEs
for i in range(10):
    te_cols_tr[f'TE_pos_char_{i}_mean'] = te_pos_char_tr_mean[:, i]
    te_cols_te[f'TE_pos_char_{i}_mean'] = te_pos_char_te_mean[:, i]
    te_cols_tr[f'TE_pos_char_{i}_logcnt'] = te_pos_char_tr_log[:, i]
    te_cols_te[f'TE_pos_char_{i}_logcnt'] = te_pos_char_te_log[:, i]
for i in range(9):
    te_cols_tr[f'TE_pos_bigram_{i}_mean'] = te_pos_bg_tr_mean[:, i]
    te_cols_te[f'TE_pos_bigram_{i}_mean'] = te_pos_bg_te_mean[:, i]
    te_cols_tr[f'TE_pos_bigram_{i}_logcnt'] = te_pos_bg_tr_log[:, i]
    te_cols_te[f'TE_pos_bigram_{i}_logcnt'] = te_pos_bg_te_log[:, i]
te_cols_tr['TE_f27_nunique_mean'] = nu_te_tr_mean
te_cols_te['TE_f27_nunique_mean'] = nu_te_te_mean
te_cols_tr['TE_f27_nunique_logcnt'] = nu_te_tr_log
te_cols_te['TE_f27_nunique_logcnt'] = nu_te_te_log
# New: trigrams
for i in range(8):
    te_cols_tr[f'TE_pos_trigram_{i}_mean'] = te_tri_tr_mean[:, i]
    te_cols_te[f'TE_pos_trigram_{i}_mean'] = te_tri_te_mean[:, i]
    te_cols_tr[f'TE_pos_trigram_{i}_logcnt'] = te_tri_tr_log[:, i]
    te_cols_te[f'TE_pos_trigram_{i}_logcnt'] = te_tri_te_log[:, i]
# New: count-hist and run-length signatures
te_cols_tr['TE_counthist_mean'] = hist_tr_mean
te_cols_tr['TE_counthist_logcnt'] = hist_tr_log
te_cols_te['TE_counthist_mean'] = hist_te_mean
te_cols_te['TE_counthist_logcnt'] = hist_te_log
te_cols_tr['TE_runlen_mean'] = run_tr_mean
te_cols_tr['TE_runlen_logcnt'] = run_tr_log
te_cols_te['TE_runlen_mean'] = run_te_mean
te_cols_te['TE_runlen_logcnt'] = run_te_log
# New: non-adjacent pairs
for (i,j) in pairs:
    key = f'TE_pair_{i}_{j}'
    te_cols_tr[f'{key}_mean'] = pair_tr_mean[(i,j)]
    te_cols_tr[f'{key}_logcnt'] = pair_tr_log[(i,j)]
    te_cols_te[f'{key}_mean'] = pair_te_mean[(i,j)]
    te_cols_te[f'{key}_logcnt'] = pair_te_log[(i,j)]
# Optional: majority-char TE
te_cols_tr['TE_majority_charcnt_mean'] = maj_tr_mean
te_cols_tr['TE_majority_charcnt_logcnt'] = maj_tr_log
te_cols_te['TE_majority_charcnt_mean'] = maj_te_mean
te_cols_te['TE_majority_charcnt_logcnt'] = maj_te_log
# Target-free core freq
te_cols_tr['FREQ_full_string'] = freq_full_tr
te_cols_te['FREQ_full_string'] = freq_full_te
# Target-free derived
te_cols_tr['pos_first_repeat'] = pos_first_rep_tr.astype('int16')
te_cols_te['pos_first_repeat'] = pos_first_rep_te.astype('int16')
te_cols_tr['alt_score'] = alt_score_tr.astype('float32')
te_cols_te['alt_score'] = alt_score_te.astype('float32')
te_cols_tr['same_adj'] = same_tr.astype('int8')
te_cols_te['same_adj'] = same_te.astype('int8')
te_cols_tr['diff_adj'] = diff_tr.astype('int8')
te_cols_te['diff_adj'] = diff_te.astype('int8')
te_cols_tr['num_runs'] = num_runs_tr.astype('int8')
te_cols_te['num_runs'] = num_runs_te.astype('int8')
# Attach existing robust summaries if present
for col in ['f_27_longest_run','f_27_transitions','f27_entropy','f27_first_last_same','f27_pal_matches','f27_majority_cnt','f27_majority_idx']:
    if col in train.columns and col in test.columns:
        te_cols_tr[col] = train[col].values
        te_cols_te[col] = test[col].values

te_train_aug = pd.DataFrame(te_cols_tr, index=train.index)
te_test_aug = pd.DataFrame(te_cols_te, index=test.index)

# Concatenate with compact features block for unseen
assert 'comp_train' in globals() and 'comp_test' in globals(), 'Run pivot prep to build compact features first'
X_unseen_tr = pd.concat([comp_train.reset_index(drop=True), te_train_aug.reset_index(drop=True)], axis=1)
X_unseen_te = pd.concat([comp_test.reset_index(drop=True), te_test_aug.reset_index(drop=True)], axis=1)
unseen_feature_cols = X_unseen_tr.columns.tolist()

# Sanity checks
assert X_unseen_tr.shape[0] == len(train) and X_unseen_te.shape[0] == len(test)
assert not np.isnan(X_unseen_tr.values).any(), 'NaNs in X_unseen_tr'
assert not np.isnan(X_unseen_te.values).any(), 'NaNs in X_unseen_te'
log(f'Augmented unseen features built: {X_unseen_tr.shape[1]} cols | time {time.time()-t0:.1f}s')
gc.collect();

[2025-09-12 03:27:58] Building augmented TE features for unseen model (trigrams, signatures, non-adj pairs) and target-free patterns...


[2025-09-12 03:29:02] Augmented unseen features built: 96 cols | time 64.6s


In [35]:
# Cell 31: Retrain unseen models on augmented features (4 LGB seeds + 1 XGB) with tightened params
log('Training augmented TE unseen models: 4x LGB + 1x XGB ...')
import numpy as np, pandas as pd, time, gc
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

assert 'X_unseen_tr' in globals() and 'X_unseen_te' in globals(), 'Run Cell 30 to build augmented features first'
X_tr = X_unseen_tr.copy()
X_te = X_unseen_te.copy()
y_tr = train['target'].values.astype('float32')

# Folds (locked, same as before)
folds_df = pd.read_csv('folds_10fold_seed42.csv')
folds_arr = folds_df['fold'].values
n_splits = len(np.unique(folds_arr))

def train_lgb_unseen_aug(seed=42, lr=0.040, num_leaves=256, min_data_in_leaf=300, ff=0.74, bf=0.80, l2=6.0, tag='au_s42'):
    params = {
        'objective': 'binary', 'metric': 'auc', 'boosting_type': 'gbdt',
        'learning_rate': lr, 'num_leaves': num_leaves, 'max_depth': -1,
        'min_data_in_leaf': min_data_in_leaf, 'feature_fraction': ff, 'bagging_fraction': bf, 'bagging_freq': 1,
        'lambda_l1': 0.0, 'lambda_l2': l2, 'max_bin': 255, 'verbose': -1, 'n_jobs': -1,
        'seed': seed, 'feature_fraction_seed': seed, 'bagging_seed': seed, 'data_random_seed': seed
    }
    oof = np.zeros(len(X_tr), dtype='float32')
    ptest = np.zeros(len(X_te), dtype='float32')
    t0 = time.time()
    for fold in range(n_splits):
        trn_idx = np.where(folds_arr != fold)[0]
        val_idx = np.where(folds_arr == fold)[0]
        log(f'[LGB unseen AUG] seed={seed} ff={ff} bf={bf} | Fold {fold+1}/{n_splits} | trn={len(trn_idx)} val={len(val_idx)}')
        dtr = lgb.Dataset(X_tr.iloc[trn_idx], label=y_tr[trn_idx])
        dvl = lgb.Dataset(X_tr.iloc[val_idx], label=y_tr[val_idx])
        model = lgb.train(params, dtr, num_boost_round=6000, valid_sets=[dtr, dvl], valid_names=['train','valid'], callbacks=[lgb.early_stopping(200), lgb.log_evaluation(200)])
        oof[val_idx] = model.predict(X_tr.iloc[val_idx], num_iteration=model.best_iteration)
        auc = roc_auc_score(y_tr[val_idx], oof[val_idx])
        log(f'[LGB unseen AUG] seed={seed} fold={fold} AUC: {auc:.6f} | best_iter={model.best_iteration}')
        ptest += model.predict(X_te, num_iteration=model.best_iteration) / n_splits
        del dtr, dvl, model; gc.collect()
    cv_auc = roc_auc_score(y_tr, oof)
    log(f'[LGB unseen AUG] seed={seed} OOF AUC: {cv_auc:.6f} | elapsed={time.time()-t0:.1f}s')
    pd.DataFrame({'id': train['id'], f'oof_lgb_unseen_{tag}': oof}).to_csv(f'oof_lgb_unseen_{tag}.csv', index=False)
    pd.DataFrame({'id': test['id'], f'prediction_lgb_unseen_{tag}': ptest}).to_csv(f'pred_lgb_unseen_{tag}.csv', index=False)
    log(f'[LGB unseen AUG] Saved OOF/test preds for {tag}')

# 4 LGB seeds with slight diversity per expert guidance
lgb_cfgs = [
    (42,   0.040, 256, 300, 0.74, 0.82, 6.0, 'au_s42'),
    (1337, 0.042, 288, 320, 0.70, 0.85, 7.0, 'au_s1337'),
    (2025, 0.038, 224, 300, 0.78, 0.78, 5.0, 'au_s2025'),
    (7,    0.040, 320, 340, 0.72, 0.80, 8.0, 'au_s7'),
]
for s, lr, nl, mdl, ff, bf, l2, tag in lgb_cfgs:
    train_lgb_unseen_aug(seed=s, lr=lr, num_leaves=nl, min_data_in_leaf=mdl, ff=ff, bf=bf, l2=l2, tag=tag)
log('All augmented LGB unseen seeds finished.')

# XGBoost on augmented features
log('Training XGBoost unseen (augmented features) ...')
try:
    import xgboost as xgb
except ImportError:
    import sys, subprocess
    log('XGBoost not found. Installing...')
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'xgboost'])
    import xgboost as xgb

dtest = xgb.DMatrix(X_te)
oof_xgb_u = np.zeros(len(X_tr), dtype='float32')
pred_xgb_u = np.zeros(len(X_te), dtype='float32')
xgb_params = {
    'objective': 'binary:logistic', 'eval_metric': 'auc', 'tree_method': 'hist', 'max_bin': 256,
    'eta': 0.040, 'max_depth': 9, 'min_child_weight': 120, 'subsample': 0.80, 'colsample_bytree': 0.75, 'lambda': 4.0,
    'nthread': 36, 'seed': 42
}
num_boost_round = 3800
early_stopping_rounds = 180
for fold in range(n_splits):
    trn_idx = np.where(folds_arr != fold)[0]
    val_idx = np.where(folds_arr == fold)[0]
    log(f'[XGB unseen AUG] Fold {fold+1}/{n_splits} | trn={len(trn_idx)} val={len(val_idx)}')
    dtr = xgb.DMatrix(X_tr.iloc[trn_idx], label=y_tr[trn_idx])
    dvl = xgb.DMatrix(X_tr.iloc[val_idx], label=y_tr[val_idx])
    bst = xgb.train(params=xgb_params, dtrain=dtr, num_boost_round=num_boost_round, evals=[(dtr,'train'),(dvl,'valid')], early_stopping_rounds=early_stopping_rounds, verbose_eval=200)
    best_iter = None
    try:
        attrs = bst.attributes()
        if 'best_iteration' in attrs:
            best_iter = int(attrs['best_iteration'])
    except Exception:
        best_iter = getattr(bst, 'best_iteration', None)
    if best_iter is not None and best_iter >= 0:
        oof_pred = bst.predict(dvl, iteration_range=(0, best_iter + 1))
        pred_te = bst.predict(dtest, iteration_range=(0, best_iter + 1))
    else:
        oof_pred = bst.predict(dvl)
        pred_te = bst.predict(dtest)
    oof_xgb_u[val_idx] = oof_pred.astype('float32')
    pred_xgb_u += (pred_te.astype('float32') / n_splits)
    del dtr, dvl, bst; gc.collect()
auc_xgb_u = roc_auc_score(y_tr, oof_xgb_u)
log(f'[XGB unseen AUG] OOF AUC: {auc_xgb_u:.6f}')
pd.DataFrame({'id': train['id'], 'oof_xgb_unseen_aug': oof_xgb_u}).to_csv('oof_xgb_unseen_aug.csv', index=False)
pd.DataFrame({'id': test['id'], 'prediction_xgb_unseen_aug': pred_xgb_u}).to_csv('pred_xgb_unseen_aug.csv', index=False)
log('[XGB unseen AUG] Saved OOF and test predictions.')

[2025-09-12 03:30:31] Training augmented TE unseen models: 4x LGB + 1x XGB ...


[2025-09-12 03:30:31] [LGB unseen AUG] seed=42 ff=0.74 bf=0.82 | Fold 1/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.837319	valid's auc: 0.81735


[400]	train's auc: 0.854422	valid's auc: 0.818343


Early stopping, best iteration is:
[357]	train's auc: 0.851167	valid's auc: 0.818411


[2025-09-12 03:31:21] [LGB unseen AUG] seed=42 fold=0 AUC: 0.818411 | best_iter=357


[2025-09-12 03:31:21] [LGB unseen AUG] seed=42 ff=0.74 bf=0.82 | Fold 2/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.837241	valid's auc: 0.817103


[400]	train's auc: 0.854416	valid's auc: 0.818302


[600]	train's auc: 0.869064	valid's auc: 0.818146


Early stopping, best iteration is:
[443]	train's auc: 0.857711	valid's auc: 0.818319


[2025-09-12 03:32:19] [LGB unseen AUG] seed=42 fold=1 AUC: 0.818319 | best_iter=443


[2025-09-12 03:32:19] [LGB unseen AUG] seed=42 ff=0.74 bf=0.82 | Fold 3/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.836805	valid's auc: 0.820437


[400]	train's auc: 0.853859	valid's auc: 0.821622


[600]	train's auc: 0.868458	valid's auc: 0.821402


Early stopping, best iteration is:
[434]	train's auc: 0.856476	valid's auc: 0.821644


[2025-09-12 03:33:15] [LGB unseen AUG] seed=42 fold=2 AUC: 0.821644 | best_iter=434


[2025-09-12 03:33:15] [LGB unseen AUG] seed=42 ff=0.74 bf=0.82 | Fold 4/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.837246	valid's auc: 0.817179


[400]	train's auc: 0.854464	valid's auc: 0.818352


Early stopping, best iteration is:
[375]	train's auc: 0.852559	valid's auc: 0.81842


[2025-09-12 03:34:06] [LGB unseen AUG] seed=42 fold=3 AUC: 0.818420 | best_iter=375


[2025-09-12 03:34:07] [LGB unseen AUG] seed=42 ff=0.74 bf=0.82 | Fold 5/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.837087	valid's auc: 0.820318


[400]	train's auc: 0.854287	valid's auc: 0.821057


Early stopping, best iteration is:
[351]	train's auc: 0.850367	valid's auc: 0.821093


[2025-09-12 03:35:00] [LGB unseen AUG] seed=42 fold=4 AUC: 0.821093 | best_iter=351


[2025-09-12 03:35:01] [LGB unseen AUG] seed=42 ff=0.74 bf=0.82 | Fold 6/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.837337	valid's auc: 0.81569


[400]	train's auc: 0.854447	valid's auc: 0.816745


[600]	train's auc: 0.868865	valid's auc: 0.816643


Early stopping, best iteration is:
[498]	train's auc: 0.861734	valid's auc: 0.816788


[2025-09-12 03:36:02] [LGB unseen AUG] seed=42 fold=5 AUC: 0.816788 | best_iter=498


[2025-09-12 03:36:03] [LGB unseen AUG] seed=42 ff=0.74 bf=0.82 | Fold 7/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.837166	valid's auc: 0.817983


[400]	train's auc: 0.854445	valid's auc: 0.818901


[600]	train's auc: 0.869055	valid's auc: 0.818831


Early stopping, best iteration is:
[501]	train's auc: 0.86201	valid's auc: 0.819034


[2025-09-12 03:37:03] [LGB unseen AUG] seed=42 fold=6 AUC: 0.819034 | best_iter=501


[2025-09-12 03:37:04] [LGB unseen AUG] seed=42 ff=0.74 bf=0.82 | Fold 8/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.83717	valid's auc: 0.817476


[400]	train's auc: 0.854315	valid's auc: 0.818273


Early stopping, best iteration is:
[395]	train's auc: 0.853935	valid's auc: 0.818315


[2025-09-12 03:37:55] [LGB unseen AUG] seed=42 fold=7 AUC: 0.818315 | best_iter=395


[2025-09-12 03:37:56] [LGB unseen AUG] seed=42 ff=0.74 bf=0.82 | Fold 9/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.837261	valid's auc: 0.818156


[400]	train's auc: 0.854456	valid's auc: 0.819097


Early stopping, best iteration is:
[350]	train's auc: 0.850611	valid's auc: 0.819145


[2025-09-12 03:38:44] [LGB unseen AUG] seed=42 fold=8 AUC: 0.819145 | best_iter=350


[2025-09-12 03:38:45] [LGB unseen AUG] seed=42 ff=0.74 bf=0.82 | Fold 10/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.837072	valid's auc: 0.819659


[400]	train's auc: 0.854175	valid's auc: 0.820534


[600]	train's auc: 0.868746	valid's auc: 0.820551


Early stopping, best iteration is:
[518]	train's auc: 0.862965	valid's auc: 0.820684


[2025-09-12 03:39:47] [LGB unseen AUG] seed=42 fold=9 AUC: 0.820684 | best_iter=518


[2025-09-12 03:39:48] [LGB unseen AUG] seed=42 OOF AUC: 0.819172 | elapsed=557.2s


[2025-09-12 03:39:49] [LGB unseen AUG] Saved OOF/test preds for au_s42


[2025-09-12 03:39:49] [LGB unseen AUG] seed=1337 ff=0.7 bf=0.85 | Fold 1/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.840384	valid's auc: 0.817758


[400]	train's auc: 0.859396	valid's auc: 0.818602


Early stopping, best iteration is:
[365]	train's auc: 0.856376	valid's auc: 0.818684


[2025-09-12 03:40:41] [LGB unseen AUG] seed=1337 fold=0 AUC: 0.818684 | best_iter=365


[2025-09-12 03:40:42] [LGB unseen AUG] seed=1337 ff=0.7 bf=0.85 | Fold 2/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.84044	valid's auc: 0.817565


[400]	train's auc: 0.859402	valid's auc: 0.818332


[600]	train's auc: 0.87564	valid's auc: 0.818228


Early stopping, best iteration is:
[430]	train's auc: 0.861943	valid's auc: 0.818408


[2025-09-12 03:41:39] [LGB unseen AUG] seed=1337 fold=1 AUC: 0.818408 | best_iter=430


[2025-09-12 03:41:40] [LGB unseen AUG] seed=1337 ff=0.7 bf=0.85 | Fold 3/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.84013	valid's auc: 0.820396


[400]	train's auc: 0.859151	valid's auc: 0.821418


Early stopping, best iteration is:
[398]	train's auc: 0.858962	valid's auc: 0.821431


[2025-09-12 03:42:33] [LGB unseen AUG] seed=1337 fold=2 AUC: 0.821431 | best_iter=398


[2025-09-12 03:42:33] [LGB unseen AUG] seed=1337 ff=0.7 bf=0.85 | Fold 4/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.840448	valid's auc: 0.817508


[400]	train's auc: 0.859338	valid's auc: 0.818155


Early stopping, best iteration is:
[302]	train's auc: 0.850793	valid's auc: 0.818247


[2025-09-12 03:43:19] [LGB unseen AUG] seed=1337 fold=3 AUC: 0.818247 | best_iter=302


[2025-09-12 03:43:20] [LGB unseen AUG] seed=1337 ff=0.7 bf=0.85 | Fold 5/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.840215	valid's auc: 0.820282


[400]	train's auc: 0.859234	valid's auc: 0.820576


Early stopping, best iteration is:
[347]	train's auc: 0.854624	valid's auc: 0.82068


[2025-09-12 03:44:26] [LGB unseen AUG] seed=1337 fold=4 AUC: 0.820680 | best_iter=347


[2025-09-12 03:44:27] [LGB unseen AUG] seed=1337 ff=0.7 bf=0.85 | Fold 6/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.84067	valid's auc: 0.816101


[400]	train's auc: 0.859839	valid's auc: 0.817045


Early stopping, best iteration is:
[397]	train's auc: 0.859547	valid's auc: 0.817057


[2025-09-12 03:45:42] [LGB unseen AUG] seed=1337 fold=5 AUC: 0.817057 | best_iter=397


[2025-09-12 03:45:42] [LGB unseen AUG] seed=1337 ff=0.7 bf=0.85 | Fold 7/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.840341	valid's auc: 0.818119


[400]	train's auc: 0.85941	valid's auc: 0.818768


[600]	train's auc: 0.87542	valid's auc: 0.818685


Early stopping, best iteration is:
[445]	train's auc: 0.863231	valid's auc: 0.818844


[2025-09-12 03:47:00] [LGB unseen AUG] seed=1337 fold=6 AUC: 0.818844 | best_iter=445


[2025-09-12 03:47:01] [LGB unseen AUG] seed=1337 ff=0.7 bf=0.85 | Fold 8/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.8405	valid's auc: 0.817939


[400]	train's auc: 0.859458	valid's auc: 0.818666


[600]	train's auc: 0.875379	valid's auc: 0.818166
Early stopping, best iteration is:
[402]	train's auc: 0.859619	valid's auc: 0.81867


[2025-09-12 03:48:12] [LGB unseen AUG] seed=1337 fold=7 AUC: 0.818670 | best_iter=402


[2025-09-12 03:48:12] [LGB unseen AUG] seed=1337 ff=0.7 bf=0.85 | Fold 9/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.840404	valid's auc: 0.81827


[400]	train's auc: 0.859466	valid's auc: 0.818753


[600]	train's auc: 0.875638	valid's auc: 0.818597


Early stopping, best iteration is:
[456]	train's auc: 0.864183	valid's auc: 0.818908


[2025-09-12 03:49:31] [LGB unseen AUG] seed=1337 fold=8 AUC: 0.818908 | best_iter=456


[2025-09-12 03:49:31] [LGB unseen AUG] seed=1337 ff=0.7 bf=0.85 | Fold 10/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.84014	valid's auc: 0.820116


[400]	train's auc: 0.859113	valid's auc: 0.820836


Early stopping, best iteration is:
[393]	train's auc: 0.858505	valid's auc: 0.820877


[2025-09-12 03:50:41] [LGB unseen AUG] seed=1337 fold=9 AUC: 0.820877 | best_iter=393


[2025-09-12 03:50:42] [LGB unseen AUG] seed=1337 OOF AUC: 0.819164 | elapsed=652.6s


[2025-09-12 03:50:43] [LGB unseen AUG] Saved OOF/test preds for au_s1337


[2025-09-12 03:50:43] [LGB unseen AUG] seed=2025 ff=0.78 bf=0.78 | Fold 1/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.834007	valid's auc: 0.817175


[400]	train's auc: 0.849264	valid's auc: 0.818421


[600]	train's auc: 0.862212	valid's auc: 0.818513


[800]	train's auc: 0.8739	valid's auc: 0.818273


Early stopping, best iteration is:
[661]	train's auc: 0.865891	valid's auc: 0.818611


[2025-09-12 03:51:54] [LGB unseen AUG] seed=2025 fold=0 AUC: 0.818611 | best_iter=661


[2025-09-12 03:51:55] [LGB unseen AUG] seed=2025 ff=0.78 bf=0.78 | Fold 2/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.834051	valid's auc: 0.816791


[400]	train's auc: 0.849345	valid's auc: 0.818092


[600]	train's auc: 0.86223	valid's auc: 0.818055


Early stopping, best iteration is:
[439]	train's auc: 0.851978	valid's auc: 0.81811


[2025-09-12 03:52:49] [LGB unseen AUG] seed=2025 fold=1 AUC: 0.818110 | best_iter=439


[2025-09-12 03:52:50] [LGB unseen AUG] seed=2025 ff=0.78 bf=0.78 | Fold 3/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.8337	valid's auc: 0.820004


[400]	train's auc: 0.848955	valid's auc: 0.821448


[600]	train's auc: 0.861848	valid's auc: 0.821427


Early stopping, best iteration is:
[531]	train's auc: 0.857546	valid's auc: 0.821491


[2025-09-12 03:53:51] [LGB unseen AUG] seed=2025 fold=2 AUC: 0.821491 | best_iter=531


[2025-09-12 03:53:52] [LGB unseen AUG] seed=2025 ff=0.78 bf=0.78 | Fold 4/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.834121	valid's auc: 0.817073


[400]	train's auc: 0.849386	valid's auc: 0.818347


[600]	train's auc: 0.862217	valid's auc: 0.818369


Early stopping, best iteration is:
[550]	train's auc: 0.8591	valid's auc: 0.818408


[2025-09-12 03:54:58] [LGB unseen AUG] seed=2025 fold=3 AUC: 0.818408 | best_iter=550


[2025-09-12 03:54:59] [LGB unseen AUG] seed=2025 ff=0.78 bf=0.78 | Fold 5/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.833841	valid's auc: 0.819964


[400]	train's auc: 0.849183	valid's auc: 0.820795


Early stopping, best iteration is:
[383]	train's auc: 0.848033	valid's auc: 0.820832


[2025-09-12 03:55:49] [LGB unseen AUG] seed=2025 fold=4 AUC: 0.820832 | best_iter=383


[2025-09-12 03:55:50] [LGB unseen AUG] seed=2025 ff=0.78 bf=0.78 | Fold 6/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.834289	valid's auc: 0.815564


[400]	train's auc: 0.849425	valid's auc: 0.816786


[600]	train's auc: 0.862224	valid's auc: 0.816853


Early stopping, best iteration is:
[519]	train's auc: 0.857283	valid's auc: 0.816915


[2025-09-12 03:56:50] [LGB unseen AUG] seed=2025 fold=5 AUC: 0.816915 | best_iter=519


[2025-09-12 03:56:51] [LGB unseen AUG] seed=2025 ff=0.78 bf=0.78 | Fold 7/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.834088	valid's auc: 0.817846


[400]	train's auc: 0.849377	valid's auc: 0.818866


[600]	train's auc: 0.862245	valid's auc: 0.818836


Early stopping, best iteration is:
[535]	train's auc: 0.858221	valid's auc: 0.81894


[2025-09-12 03:57:52] [LGB unseen AUG] seed=2025 fold=6 AUC: 0.818940 | best_iter=535


[2025-09-12 03:57:52] [LGB unseen AUG] seed=2025 ff=0.78 bf=0.78 | Fold 8/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.834066	valid's auc: 0.817175


[400]	train's auc: 0.849438	valid's auc: 0.818437


[600]	train's auc: 0.862407	valid's auc: 0.818348


Early stopping, best iteration is:
[470]	train's auc: 0.854182	valid's auc: 0.818481


[2025-09-12 03:58:49] [LGB unseen AUG] seed=2025 fold=7 AUC: 0.818481 | best_iter=470


[2025-09-12 03:58:50] [LGB unseen AUG] seed=2025 ff=0.78 bf=0.78 | Fold 9/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.834074	valid's auc: 0.817918


[400]	train's auc: 0.849352	valid's auc: 0.819033


[600]	train's auc: 0.862239	valid's auc: 0.818912


Early stopping, best iteration is:
[516]	train's auc: 0.857006	valid's auc: 0.819047


[2025-09-12 03:59:50] [LGB unseen AUG] seed=2025 fold=8 AUC: 0.819047 | best_iter=516


[2025-09-12 03:59:51] [LGB unseen AUG] seed=2025 ff=0.78 bf=0.78 | Fold 10/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.833758	valid's auc: 0.819372


[400]	train's auc: 0.849047	valid's auc: 0.820628


[600]	train's auc: 0.861948	valid's auc: 0.820621


Early stopping, best iteration is:
[545]	train's auc: 0.858485	valid's auc: 0.820743


[2025-09-12 04:00:53] [LGB unseen AUG] seed=2025 fold=9 AUC: 0.820743 | best_iter=545


[2025-09-12 04:00:54] [LGB unseen AUG] seed=2025 OOF AUC: 0.819140 | elapsed=611.3s


[2025-09-12 04:00:55] [LGB unseen AUG] Saved OOF/test preds for au_s2025


[2025-09-12 04:00:55] [LGB unseen AUG] seed=7 ff=0.72 bf=0.8 | Fold 1/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.841277	valid's auc: 0.817313


[400]	train's auc: 0.861257	valid's auc: 0.818226


[600]	train's auc: 0.878019	valid's auc: 0.818034


Early stopping, best iteration is:
[425]	train's auc: 0.863508	valid's auc: 0.818271


[2025-09-12 04:01:54] [LGB unseen AUG] seed=7 fold=0 AUC: 0.818271 | best_iter=425


[2025-09-12 04:01:55] [LGB unseen AUG] seed=7 ff=0.72 bf=0.8 | Fold 2/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.84137	valid's auc: 0.81726


[400]	train's auc: 0.861247	valid's auc: 0.818035


[600]	train's auc: 0.878119	valid's auc: 0.817958


Early stopping, best iteration is:
[443]	train's auc: 0.865076	valid's auc: 0.8181


[2025-09-12 04:02:55] [LGB unseen AUG] seed=7 fold=1 AUC: 0.818100 | best_iter=443


[2025-09-12 04:02:56] [LGB unseen AUG] seed=7 ff=0.72 bf=0.8 | Fold 3/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.841038	valid's auc: 0.820539


[400]	train's auc: 0.86106	valid's auc: 0.821344


Early stopping, best iteration is:
[344]	train's auc: 0.855918	valid's auc: 0.821398


[2025-09-12 04:03:48] [LGB unseen AUG] seed=7 fold=2 AUC: 0.821398 | best_iter=344


[2025-09-12 04:03:48] [LGB unseen AUG] seed=7 ff=0.72 bf=0.8 | Fold 4/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.841308	valid's auc: 0.817353


[400]	train's auc: 0.861472	valid's auc: 0.81809


Early stopping, best iteration is:
[363]	train's auc: 0.85811	valid's auc: 0.818153


[2025-09-12 04:04:46] [LGB unseen AUG] seed=7 fold=3 AUC: 0.818153 | best_iter=363


[2025-09-12 04:04:46] [LGB unseen AUG] seed=7 ff=0.72 bf=0.8 | Fold 5/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.841227	valid's auc: 0.820338


[400]	train's auc: 0.86132	valid's auc: 0.820793


Early stopping, best iteration is:
[336]	train's auc: 0.855386	valid's auc: 0.820967


[2025-09-12 04:05:38] [LGB unseen AUG] seed=7 fold=4 AUC: 0.820967 | best_iter=336


[2025-09-12 04:05:38] [LGB unseen AUG] seed=7 ff=0.72 bf=0.8 | Fold 6/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.841457	valid's auc: 0.815945


[400]	train's auc: 0.861492	valid's auc: 0.816816


Early stopping, best iteration is:
[385]	train's auc: 0.860122	valid's auc: 0.816863


[2025-09-12 04:06:34] [LGB unseen AUG] seed=7 fold=5 AUC: 0.816863 | best_iter=385


[2025-09-12 04:06:34] [LGB unseen AUG] seed=7 ff=0.72 bf=0.8 | Fold 7/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.841351	valid's auc: 0.818228


[400]	train's auc: 0.861272	valid's auc: 0.818814


Early stopping, best iteration is:
[375]	train's auc: 0.859025	valid's auc: 0.818834


[2025-09-12 04:07:29] [LGB unseen AUG] seed=7 fold=6 AUC: 0.818834 | best_iter=375


[2025-09-12 04:07:29] [LGB unseen AUG] seed=7 ff=0.72 bf=0.8 | Fold 8/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.841271	valid's auc: 0.817291


[400]	train's auc: 0.861293	valid's auc: 0.818019


Early stopping, best iteration is:
[386]	train's auc: 0.860067	valid's auc: 0.818071


[2025-09-12 04:08:25] [LGB unseen AUG] seed=7 fold=7 AUC: 0.818071 | best_iter=386


[2025-09-12 04:08:25] [LGB unseen AUG] seed=7 ff=0.72 bf=0.8 | Fold 9/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.841246	valid's auc: 0.818275


[400]	train's auc: 0.861342	valid's auc: 0.818872


Early stopping, best iteration is:
[384]	train's auc: 0.859929	valid's auc: 0.818916


[2025-09-12 04:09:20] [LGB unseen AUG] seed=7 fold=8 AUC: 0.818916 | best_iter=384


[2025-09-12 04:09:21] [LGB unseen AUG] seed=7 ff=0.72 bf=0.8 | Fold 10/10 | trn=720000 val=80000


Training until validation scores don't improve for 200 rounds


[200]	train's auc: 0.841068	valid's auc: 0.819739


[400]	train's auc: 0.86101	valid's auc: 0.820388


Early stopping, best iteration is:
[300]	train's auc: 0.851692	valid's auc: 0.820501


[2025-09-12 04:10:08] [LGB unseen AUG] seed=7 fold=9 AUC: 0.820501 | best_iter=300


[2025-09-12 04:10:09] [LGB unseen AUG] seed=7 OOF AUC: 0.818992 | elapsed=553.7s


[2025-09-12 04:10:10] [LGB unseen AUG] Saved OOF/test preds for au_s7


[2025-09-12 04:10:10] All augmented LGB unseen seeds finished.


[2025-09-12 04:10:10] Training XGBoost unseen (augmented features) ...


[2025-09-12 04:10:10] [XGB unseen AUG] Fold 1/10 | trn=720000 val=80000


[0]	train-auc:0.76450	valid-auc:0.76206


[200]	train-auc:0.82576	valid-auc:0.81391


[400]	train-auc:0.83435	valid-auc:0.81738


[600]	train-auc:0.83980	valid-auc:0.81807


[800]	train-auc:0.84492	valid-auc:0.81830


[1000]	train-auc:0.84921	valid-auc:0.81832


[1151]	train-auc:0.85267	valid-auc:0.81823


[2025-09-12 04:11:11] [XGB unseen AUG] Fold 2/10 | trn=720000 val=80000


[0]	train-auc:0.76759	valid-auc:0.75955


[200]	train-auc:0.82564	valid-auc:0.81363


[400]	train-auc:0.83421	valid-auc:0.81676


[600]	train-auc:0.84005	valid-auc:0.81752


[800]	train-auc:0.84493	valid-auc:0.81774


[1000]	train-auc:0.84980	valid-auc:0.81776


[1008]	train-auc:0.84994	valid-auc:0.81776


[2025-09-12 04:12:05] [XGB unseen AUG] Fold 3/10 | trn=720000 val=80000


[0]	train-auc:0.76830	valid-auc:0.76514


[200]	train-auc:0.82540	valid-auc:0.81581


[400]	train-auc:0.83392	valid-auc:0.81947


[600]	train-auc:0.83964	valid-auc:0.82029


[800]	train-auc:0.84479	valid-auc:0.82053


[1000]	train-auc:0.84915	valid-auc:0.82064


[1183]	train-auc:0.85314	valid-auc:0.82052


[2025-09-12 04:13:07] [XGB unseen AUG] Fold 4/10 | trn=720000 val=80000


[0]	train-auc:0.76559	valid-auc:0.75985


[200]	train-auc:0.82574	valid-auc:0.81442


[400]	train-auc:0.83393	valid-auc:0.81732


[600]	train-auc:0.83972	valid-auc:0.81787


[800]	train-auc:0.84466	valid-auc:0.81805


[1000]	train-auc:0.84951	valid-auc:0.81815


[1200]	train-auc:0.85389	valid-auc:0.81808


[1213]	train-auc:0.85418	valid-auc:0.81805


[2025-09-12 04:14:10] [XGB unseen AUG] Fold 5/10 | trn=720000 val=80000


[0]	train-auc:0.76457	valid-auc:0.76114


[200]	train-auc:0.82506	valid-auc:0.81708


[400]	train-auc:0.83359	valid-auc:0.81996


[600]	train-auc:0.83918	valid-auc:0.82036


[800]	train-auc:0.84431	valid-auc:0.82044


[964]	train-auc:0.84817	valid-auc:0.82034


[2025-09-12 04:15:04] [XGB unseen AUG] Fold 6/10 | trn=720000 val=80000


[0]	train-auc:0.76552	valid-auc:0.76027


[200]	train-auc:0.82552	valid-auc:0.81163


[400]	train-auc:0.83398	valid-auc:0.81462


[600]	train-auc:0.83990	valid-auc:0.81530


[800]	train-auc:0.84515	valid-auc:0.81552


[1000]	train-auc:0.84973	valid-auc:0.81546


[1021]	train-auc:0.85021	valid-auc:0.81548


[2025-09-12 04:15:59] [XGB unseen AUG] Fold 7/10 | trn=720000 val=80000


[0]	train-auc:0.76578	valid-auc:0.76285


[200]	train-auc:0.82539	valid-auc:0.81471


[400]	train-auc:0.83414	valid-auc:0.81776


[600]	train-auc:0.83975	valid-auc:0.81828


[800]	train-auc:0.84470	valid-auc:0.81845


[1000]	train-auc:0.84960	valid-auc:0.81844


[1054]	train-auc:0.85082	valid-auc:0.81841


[2025-09-12 04:16:53] [XGB unseen AUG] Fold 8/10 | trn=720000 val=80000


[0]	train-auc:0.76564	valid-auc:0.74451


[200]	train-auc:0.82571	valid-auc:0.79740


[400]	train-auc:0.83406	valid-auc:0.80007


[600]	train-auc:0.83974	valid-auc:0.80012


[800]	train-auc:0.84503	valid-auc:0.80052


[1000]	train-auc:0.84961	valid-auc:0.80051


[1100]	train-auc:0.85184	valid-auc:0.80066


[2025-09-12 04:17:51] [XGB unseen AUG] Fold 9/10 | trn=720000 val=80000


[0]	train-auc:0.76563	valid-auc:0.74074


[200]	train-auc:0.82534	valid-auc:0.80293


[400]	train-auc:0.83387	valid-auc:0.80540


[600]	train-auc:0.83973	valid-auc:0.80633


[800]	train-auc:0.84494	valid-auc:0.80664


[1000]	train-auc:0.84941	valid-auc:0.80686


[1200]	train-auc:0.85383	valid-auc:0.80693


[1374]	train-auc:0.85752	valid-auc:0.80686


[2025-09-12 04:19:03] [XGB unseen AUG] Fold 10/10 | trn=720000 val=80000


[0]	train-auc:0.76128	valid-auc:0.75556


[200]	train-auc:0.82533	valid-auc:0.81154


[400]	train-auc:0.83375	valid-auc:0.81434


[600]	train-auc:0.83927	valid-auc:0.81517


[800]	train-auc:0.84435	valid-auc:0.81565


[1000]	train-auc:0.84910	valid-auc:0.81562


[1090]	train-auc:0.85113	valid-auc:0.81556


[2025-09-12 04:20:00] [XGB unseen AUG] OOF AUC: 0.814888


[2025-09-12 04:20:01] [XGB unseen AUG] Saved OOF and test predictions.


In [36]:
# Cell 32: Blend augmented unseen models (4 LGB au_* + XGB aug) and build alternative submission
log('Blending augmented unseen models (au_*) and creating alternative submission...')
import pandas as pd, numpy as np, json, os
from sklearn.metrics import roc_auc_score

def load_oof(path):
    df = pd.read_csv(path); cols = [c for c in df.columns if c != 'id']
    return df[['id', cols[0]]].rename(columns={cols[0]: 'pred'})
def load_pred(path):
    df = pd.read_csv(path); cols = [c for c in df.columns if c != 'id']
    return df[['id', cols[0]]].rename(columns={cols[0]: 'pred'})

# Load augmented LGB OOF/test
au_tags = ['au_s42','au_s1337','au_s2025','au_s7']
oofs = []; preds = []
for tag in au_tags:
    oofs.append(load_oof(f'oof_lgb_unseen_{tag}.csv').rename(columns={'pred': f'oof_{tag}'}))
    preds.append(load_pred(f'pred_lgb_unseen_{tag}.csv').rename(columns={'pred': f'pred_{tag}'}))
oof_lgb = oofs[0]
for df in oofs[1:]: oof_lgb = oof_lgb.merge(df, on='id', how='inner')
pred_lgb = preds[0]
for df in preds[1:]: pred_lgb = pred_lgb.merge(df, on='id', how='inner')
lgb_oof_cols = [c for c in oof_lgb.columns if c != 'id']
lgb_pred_cols = [c for c in pred_lgb.columns if c != 'id']
oof_lgb['lgb_ens'] = oof_lgb[lgb_oof_cols].mean(axis=1).astype('float32')
pred_lgb['lgb_ens'] = pred_lgb[lgb_pred_cols].mean(axis=1).astype('float32')

# Load augmented XGB OOF/test
oof_xgb = load_oof('oof_xgb_unseen_aug.csv').rename(columns={'pred':'xgb'})
pred_xgb = load_pred('pred_xgb_unseen_aug.csv').rename(columns={'pred':'xgb'})

# OOF blend search (prob vs rank) with weights in [0.85..0.96]
gt = train[['id','target']].copy()
oof = gt.merge(oof_lgb[['id','lgb_ens']], on='id', how='left').merge(oof_xgb, on='id', how='left')
def rank_norm(x):
    r = pd.Series(x).rank(method='average').values
    return (r - 1) / max(len(r) - 1, 1)
oof['lgb_r'] = rank_norm(oof['lgb_ens']); oof['xgb_r'] = rank_norm(oof['xgb'])
best = {'auc': 0.0, 'mode': 'prob', 'w_lgb': 1.0}
for w in np.round(np.arange(0.85, 0.961, 0.01), 2):
    prob = w * oof['lgb_ens'] + (1 - w) * oof['xgb']
    ap = roc_auc_score(oof['target'], prob)
    if ap > best['auc']: best.update({'auc': float(ap), 'mode': 'prob', 'w_lgb': float(w)})
    rank = w * oof['lgb_r'] + (1 - w) * oof['xgb_r']
    ar = roc_auc_score(oof['target'], rank)
    if ar > best['auc']: best.update({'auc': float(ar), 'mode': 'rank', 'w_lgb': float(w)})
log(f"[Aug unseen blend] Best OOF AUC={best['auc']:.6f} | mode={best['mode']} | w_lgb={best['w_lgb']}")

# Build unseen test predictions according to best
sub_te = pd.read_csv('sample_submission.csv')[['id']].merge(pred_lgb[['id','lgb_ens']], on='id', how='left').merge(pred_xgb, on='id', how='left')
if best['mode'] == 'prob':
    unseen_pred = best['w_lgb'] * sub_te['lgb_ens'].values + (1 - best['w_lgb']) * sub_te['xgb'].values
else:
    rl = rank_norm(sub_te['lgb_ens']); rx = rank_norm(sub_te['xgb'])
    unseen_pred = best['w_lgb'] * rl + (1 - best['w_lgb']) * rx
unseen_pred = unseen_pred.astype('float32')

# Seen mapping via majority resolution (guardrail)
grp = train.groupby('f_27')['target']; means = grp.mean(); f27_map = (means >= 0.5).astype('int8').to_dict()
test_seen_mask = test['f_27'].isin(f27_map).values
test_unseen_mask = ~test_seen_mask
preds_seen = test.loc[test_seen_mask, 'f_27'].map(f27_map).astype('float32').values

# Assemble alternative submission (do not overwrite submission.csv)
final_alt = pd.read_csv('sample_submission.csv')[['id']].copy()
final_alt['target'] = np.zeros(len(final_alt), dtype='float32')
final_alt.loc[test_seen_mask, 'target'] = preds_seen
final_alt.loc[test_unseen_mask, 'target'] = unseen_pred[test_unseen_mask]
final_alt['target'] = final_alt['target'].astype('float64').clip(0.0, 1.0)
final_alt.to_csv('submission_unseen_aug.csv', index=False)
with open('blend_unseen_meta_aug.json','w') as f: json.dump(best, f, indent=2)
log('Saved submission_unseen_aug.csv with augmented unseen blend and saved metadata.')

[2025-09-12 04:20:58] Blending augmented unseen models (au_*) and creating alternative submission...


[2025-09-12 04:21:06] [Aug unseen blend] Best OOF AUC=0.819963 | mode=rank | w_lgb=0.96


[2025-09-12 04:21:08] Saved submission_unseen_aug.csv with augmented unseen blend and saved metadata.


In [37]:
# Cell 33: Optional rarity rank post-process on current TE-unseen blend
log('Applying rarity rank post-process on current TE-unseen blend (alpha in {0.95,0.97,0.99})...')
import pandas as pd, numpy as np, json

# Load current TE-unseen blended test preds (from Cell 28's refined best) or rebuild quickly
pred_lgb = pd.read_csv('pred_lgb_unseen_u42.csv');
for tag in ['u1337','u2025','u101','u999','u7']:
    df = pd.read_csv(f'pred_lgb_unseen_{tag}.csv')
    col = [c for c in df.columns if c != 'id'][0]
    pred_lgb = pred_lgb.merge(df[['id', col]].rename(columns={col: f'pred_{tag}'}), on='id', how='inner')
lgb_cols = [c for c in pred_lgb.columns if c != 'id']
pred_lgb['lgb_ens'] = pred_lgb[lgb_cols].mean(axis=1).astype('float32')
pred_xgb = pd.read_csv('pred_xgb_unseen.csv')
xcol = [c for c in pred_xgb.columns if c != 'id'][0]
pred_xgb = pred_xgb.rename(columns={xcol: 'xgb'})
sub_blend = pd.read_csv('sample_submission.csv')[['id']].merge(pred_lgb[['id','lgb_ens']], on='id', how='left').merge(pred_xgb, on='id', how='left')

# Use refined best weight/mode from meta if available; else fallback to rank w=0.96
w_meta = {'mode': 'rank', 'w_lgb': 0.96}
try:
    with open('blend_unseen_meta_te_refined.json', 'r') as f:
        meta = json.load(f)
        if 'chosen' in meta:
            w_meta = meta['chosen']
        elif 'mode' in meta and 'w_lgb' in meta:
            w_meta = {'mode': meta['mode'], 'w_lgb': meta['w_lgb']}
except Exception:
    pass

def rank_norm(x):
    r = pd.Series(x).rank(method='average').values
    return (r - 1) / max(len(r) - 1, 1)

if w_meta.get('mode','rank') == 'prob':
    base_pred = (w_meta['w_lgb'] * sub_blend['lgb_ens'].values + (1 - w_meta['w_lgb']) * sub_blend['xgb'].values).astype('float32')
else:
    rl = rank_norm(sub_blend['lgb_ens']); rx = rank_norm(sub_blend['xgb'])
    base_pred = (w_meta['w_lgb'] * rl + (1 - w_meta['w_lgb']) * rx).astype('float32')

# Build 1 - FREQ_full_string rarity score (train+test pooled)
s_tr = train['f_27'].astype(str).values
s_te = test['f_27'].astype(str).values
all_full = pd.Series(np.concatenate([s_tr, s_te]))
vc_full = all_full.value_counts()
N_all = float(len(train) + len(test))
freq_full_te = pd.Series(s_te).map(vc_full).fillna(0).values.astype('float32') / N_all
rarity = 1.0 - freq_full_te
rarity_r = rank_norm(rarity)

# Seen/unseen masks via majority f_27 identity map
means = train.groupby('f_27')['target'].mean()
f27_map = (means >= 0.5).astype('int8').to_dict()
test_seen_mask = test['f_27'].isin(f27_map).values
test_unseen_mask = ~test_seen_mask
preds_seen = test.loc[test_seen_mask, 'f_27'].map(f27_map).astype('float32').values

alphas = [0.95, 0.97, 0.99]
for a in alphas:
    post_unseen = (a * rank_norm(base_pred) + (1 - a) * rarity_r).astype('float32')
    final = pd.read_csv('sample_submission.csv')[['id']].copy()
    final['target'] = np.zeros(len(final), dtype='float32')
    final.loc[test_seen_mask, 'target'] = preds_seen
    final.loc[test_unseen_mask, 'target'] = post_unseen[test_unseen_mask]
    final['target'] = final['target'].astype('float64').clip(0.0, 1.0)
    out_path = f'submission_unseen_rarity_a{int(round(a*100))}.csv'
    final.to_csv(out_path, index=False)
    log(f'Saved {out_path} (rarity rank post-process, alpha={a}).')

[2025-09-12 04:22:25] Applying rarity rank post-process on current TE-unseen blend (alpha in {0.95,0.97,0.99})...


[2025-09-12 04:22:29] Saved submission_unseen_rarity_a95.csv (rarity rank post-process, alpha=0.95).


[2025-09-12 04:22:29] Saved submission_unseen_rarity_a97.csv (rarity rank post-process, alpha=0.97).


[2025-09-12 04:22:29] Saved submission_unseen_rarity_a99.csv (rarity rank post-process, alpha=0.99).
