In [54]:
# PP blend r=4.0 and r=4.5 (0.5/0.5) with distance-aware caps (3/2/1) and thresholds optimized AFTER hysteresis per fold; fold-median thresholds; identical test chain; then G overwrite
import time, numpy as np, pandas as pd, sys, subprocess
from sklearn.model_selection import GroupKFold
from sklearn.metrics import matthews_corrcoef

try:
    import xgboost as xgb
except Exception as e:
    print('Installing xgboost...', e)
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'xgboost==2.1.4'], check=True)
    import xgboost as xgb
print('xgboost version (pp-blend-r40-r45-cap321-thr-after-hyst):', getattr(xgb, '__version__', 'unknown'))

def apply_hyst_per_pair(df_bin: pd.DataFrame) -> np.ndarray:
    df_h = df_bin.sort_values(['game_play','p1','p2','step']).copy()
    grp = df_h.groupby(['game_play','p1','p2'], sort=False)['pred_bin']
    df_h['pred_hyst'] = grp.transform(lambda s: (s.rolling(3, center=True, min_periods=1).sum() >= 2).astype(int))
    return df_h['pred_hyst'].to_numpy()

def train_bag_radius(train_sup: pd.DataFrame, test_feats: pd.DataFrame, feat_cols, groups, y_all, seeds=(42,1337,2025)):
    gkf = GroupKFold(n_splits=5)
    ord_idx_tr = train_sup[['game_play','p1','p2','step']].sort_values(['game_play','p1','p2','step']).index.to_numpy()
    ord_idx_te = test_feats[['game_play','p1','p2','step']].sort_values(['game_play','p1','p2','step']).index.to_numpy()
    oof_s_list, test_s_list = [], []
    for s in seeds:
        print(f'  seed {s} ...', flush=True)
        X_all = train_sup[feat_cols].astype(float).values
        oof = np.full(len(train_sup), np.nan, float)
        models = []
        for fold, (tr_idx, va_idx) in enumerate(gkf.split(X_all, y_all, groups=groups)):
            t1 = time.time()
            X_tr, y_tr = X_all[tr_idx], y_all[tr_idx]
            X_va, y_va = X_all[va_idx], y_all[va_idx]
            neg = (y_tr == 0).sum(); posc = (y_tr == 1).sum()
            spw = max(1.0, neg / max(1, posc))
            dtrain = xgb.DMatrix(X_tr, label=y_tr); dvalid = xgb.DMatrix(X_va, label=y_va)
            params = {'tree_method':'hist','device':'cuda','max_depth':7,'eta':0.05,'subsample':0.9,'colsample_bytree':0.8,
                      'min_child_weight':10,'lambda':1.5,'alpha':0.1,'gamma':0.1,'objective':'binary:logistic','eval_metric':'logloss',
                      'scale_pos_weight': float(spw), 'seed': int(s + fold)}
            booster = xgb.train(params, dtrain, num_boost_round=3800, evals=[(dtrain,'train'),(dvalid,'valid')], early_stopping_rounds=200, verbose_eval=False)
            best_it = int(getattr(booster, 'best_iteration', None) or booster.num_boosted_rounds() - 1)
            oof[va_idx] = booster.predict(dvalid, iteration_range=(0, best_it + 1))
            models.append((booster, best_it))
            print(f'    fold {fold} done in {time.time()-t1:.1f}s; best_it={best_it}', flush=True)
        # Smooth OOF in canonical order
        df = train_sup[['game_play','p1','p2','step']].iloc[ord_idx_tr].copy()
        df['oof'] = oof[ord_idx_tr]
        df = df.sort_values(['game_play','p1','p2','step'])
        grp = df.groupby(['game_play','p1','p2'], sort=False)
        df['oof_smooth'] = grp['oof'].transform(lambda s_: s_.rolling(3, center=True, min_periods=1).max())
        oof_s_list.append(df['oof_smooth'].to_numpy())
        # Test preds and smoothing
        Xt = test_feats[feat_cols].astype(float).values
        dtest = xgb.DMatrix(Xt)
        pt = np.zeros(len(test_feats), float)
        for i, (booster, best_it) in enumerate(models):
            t1 = time.time(); pt += booster.predict(dtest, iteration_range=(0, best_it + 1));
            print(f'    test model {i} {time.time()-t1:.1f}s', flush=True)
        pt /= max(1, len(models))
        dt = test_feats[['game_play','p1','p2','step']].copy()
        dt['prob'] = 0.0
        dt.loc[ord_idx_te, 'prob'] = pt[ord_idx_te]
        dt = dt.sort_values(['game_play','p1','p2','step'])
        grp_t = dt.groupby(['game_play','p1','p2'], sort=False)
        dt['prob_smooth'] = grp_t['prob'].transform(lambda s_: s_.rolling(3, center=True, min_periods=1).max())
        test_s_list.append(dt['prob_smooth'].to_numpy())
    oof_avg = np.mean(np.vstack(oof_s_list), axis=0)
    test_avg = np.mean(np.vstack(test_s_list), axis=0)
    keys_tr_sorted = train_sup[['game_play','p1','p2','step']].iloc[ord_idx_tr].copy().reset_index(drop=True)
    keys_te_sorted = test_feats[['game_play','p1','p2','step']].iloc[ord_idx_te].copy().reset_index(drop=True)
    return oof_avg, test_avg, keys_tr_sorted, keys_te_sorted

def apply_distance_cap_smoothed(keys_df: pd.DataFrame, prob_smoothed: np.ndarray, dist_arr: np.ndarray, caps=(3,2,1), bins=(1.6, 2.4)) -> np.ndarray:
    df = keys_df.copy().reset_index(drop=True)
    df['prob'] = prob_smoothed
    df['dist'] = dist_arr
    df['row_id'] = np.arange(len(df))
    long1 = df[['game_play','step','p1','prob','dist','row_id']].rename(columns={'p1':'player'})
    long2 = df[['game_play','step','p2','prob','dist','row_id']].rename(columns={'p2':'player'})
    dfl = pd.concat([long1, long2], ignore_index=True)
    b0, b1 = bins
    bin_idx = np.where(dfl['dist'].to_numpy() <= b0, 0, np.where(dfl['dist'].to_numpy() <= b1, 1, 2))
    dfl['bin'] = bin_idx
    dfl = dfl.sort_values(['game_play','step','player','bin','prob'], ascending=[True, True, True, True, False])
    dfl['rank'] = dfl.groupby(['game_play','step','player','bin'], sort=False)['prob'].rank(method='first', ascending=False)
    cap_map = {0: caps[0], 1: caps[1], 2: caps[2]}
    dfl['cap'] = dfl['bin'].map(cap_map).astype(float)
    keep_ids = set(dfl.loc[dfl['rank'] <= dfl['cap'], 'row_id'].tolist())
    keep_mask = keys_df.index.to_series().reset_index(drop=True).isin(keep_ids).to_numpy()
    prob_capped = prob_smoothed.copy()
    prob_capped[~keep_mask] = 0.0
    return prob_capped

t0 = time.time()
print('Loading r=4.0 and r=4.5 supervised dyn train and test features...')
tr40 = pd.read_parquet('train_supervised_w5_helm_dyn_r40.parquet')
te40 = pd.read_parquet('test_pairs_w5_helm_dyn_r40.parquet')
tr45 = pd.read_parquet('train_supervised_w5_helm_dyn_r45.parquet')
te45 = pd.read_parquet('test_pairs_w5_helm_dyn_r45.parquet')
folds_df = pd.read_csv('folds_game_play.csv')
tr40 = tr40.merge(folds_df, on='game_play', how='left')
tr45 = tr45.merge(folds_df, on='game_play', how='left')
assert tr40['fold'].notna().all() and tr45['fold'].notna().all()
for df in (tr40, te40, tr45, te45):
    if 'px_dist_norm_min' in df.columns: df['px_dist_norm_min'] = df['px_dist_norm_min'].fillna(1.0)
    if 'views_both_present' in df.columns: df['views_both_present'] = df['views_both_present'].fillna(0).astype(float)

drop_cols = {'contact','game_play','step','p1','p2','team1','team2','pos1','pos2','fold'}
feat_cols40 = [c for c in tr40.columns if c not in drop_cols and pd.api.types.is_numeric_dtype(tr40[c])]
feat_cols45 = [c for c in tr45.columns if c not in drop_cols and pd.api.types.is_numeric_dtype(tr45[c])]
print('Using features r40:', len(feat_cols40), 'r45:', len(feat_cols45))

# Train-bag for each radius
groups40 = tr40['game_play'].values; y40 = tr40['contact'].astype(int).values
groups45 = tr45['game_play'].values; y45 = tr45['contact'].astype(int).values

print('Training r=4.0 ...')
oof40, teprob40, keys40_tr, keys40_te = train_bag_radius(tr40, te40, feat_cols40, groups40, y40)
print('Training r=4.5 ...')
oof45, teprob45, keys45_tr, keys45_te = train_bag_radius(tr45, te45, feat_cols45, groups45, y45)

# Align and blend OOF (0.5/0.5) on intersection of keys
kcols = ['game_play','p1','p2','step']
df_o40 = keys40_tr.copy(); df_o40['prob'] = oof40
df_o45 = keys45_tr.copy(); df_o45['prob'] = oof45
df_m = df_o40.merge(df_o45, on=kcols, how='inner', suffixes=('_40','_45'))
oof_blend = 0.5 * df_m['prob_40'].to_numpy() + 0.5 * df_m['prob_45'].to_numpy()
keys_blend_tr = df_m[kcols].reset_index(drop=True)

# Gather y/same/fold/dist from r=4.5 (has wider coverage). Filter to blended keys.
tr45_sorted = tr45.sort_values(kcols).reset_index(drop=True)
df_meta = tr45_sorted[kcols + ['contact','same_team','fold','distance']].copy()
df_meta = df_meta.merge(keys_blend_tr, on=kcols, how='right')
y_sorted = df_meta['contact'].astype(int).to_numpy()
same_sorted = df_meta['same_team'].fillna(0).astype(int).to_numpy() if 'same_team' in df_meta.columns else np.zeros(len(df_meta), np.int8)
fold_sorted = df_meta['fold'].astype(int).to_numpy()
dist_sorted = df_meta['distance'].astype(float).to_numpy()

# Distance-aware cap on blended OOF
oof_cap = apply_distance_cap_smoothed(keys_blend_tr, oof_blend, dist_sorted, caps=(3,2,1), bins=(1.6, 2.4))
print('Applied distance-aware cap (3/2/1) to blended OOF. Kept nonzero:', int((oof_cap>0).sum()), 'of', len(oof_cap))

# Optimize thresholds AFTER hysteresis per fold; grid 0.70-0.85
thr_grid = np.round(np.linspace(0.70, 0.85, 16), 3)
thr_best = []
for k in sorted(np.unique(fold_sorted)):
    m = (fold_sorted == k)
    df_k = keys_blend_tr.loc[m, kcols].copy()
    df_k['prob'] = oof_cap[m]
    df_k['same'] = same_sorted[m]
    y_k = y_sorted[m]
    best_m, best_to, best_ts = -1.0, 0.78, 0.78
    same_arr = df_k['same'].to_numpy()
    prob_arr = df_k['prob'].to_numpy()
    for to in thr_grid:
        for ts in thr_grid:
            thr_arr = np.where(same_arr == 1, ts, to)
            pred_bin = (prob_arr >= thr_arr).astype(int)
            df_tmp = df_k[kcols].copy()
            df_tmp['pred_bin'] = pred_bin
            pred_h = apply_hyst_per_pair(df_tmp)
            mcc = matthews_corrcoef(y_k, pred_h)
            if mcc > best_m:
                best_m, best_to, best_ts = float(mcc), float(to), float(ts)
    thr_best.append((best_to, best_ts))
    print(f' Fold {k} best after-hyst MCC={best_m:.5f} thr_opp={best_to:.3f} thr_same={best_ts:.3f}')

thr_best = np.array(thr_best, float)
thr_opp_med = float(np.median(thr_best[:, 0]))
thr_same_med = float(np.median(thr_best[:, 1]))
print(f'Fold-median thresholds after hysteresis (blend r40/r45 cap3/2/1): thr_opp={thr_opp_med:.4f}, thr_same={thr_same_med:.4f}')

# Test: align and blend 0.5/0.5, then smooth already done in train_bag, apply distance-aware caps, thresholds, hysteresis
df_t40 = keys40_te.copy(); df_t40['prob'] = teprob40
df_t45 = keys45_te.copy(); df_t45['prob'] = teprob45
df_tm = df_t40.merge(df_t45, on=kcols, how='inner', suffixes=('_40','_45'))
pt_blend = 0.5 * df_tm['prob_40'].to_numpy() + 0.5 * df_tm['prob_45'].to_numpy()
keys_blend_te = df_tm[kcols].reset_index(drop=True)

# Distance from r=4.5 test for caps
te45_sorted = te45.sort_values(kcols).reset_index(drop=True)
df_dist_t = te45_sorted[kcols + ['distance']].copy().merge(keys_blend_te, on=kcols, how='right')
dist_t_sorted = df_dist_t['distance'].astype(float).to_numpy()
pt_cap = apply_distance_cap_smoothed(keys_blend_te, pt_blend, dist_t_sorted, caps=(3,2,1), bins=(1.6, 2.4))
print('Applied distance-aware caps (3/2/1) on test blend.')

# same_team from r=4.5 test
same_flag_test = te45.sort_values(kcols).reset_index(drop=True)[kcols + ['same_team']].copy().merge(keys_blend_te, on=kcols, how='right')
same_arr_t = same_flag_test['same_team'].fillna(0).astype(int).to_numpy() if 'same_team' in same_flag_test.columns else np.zeros(len(keys_blend_te), int)
thr_arr_t = np.where(same_arr_t == 1, thr_same_med, thr_opp_med)
pred_bin_t = (pt_cap >= thr_arr_t).astype(int)
df_tmp_t = keys_blend_te.copy()
df_tmp_t['pred_bin'] = pred_bin_t
pred_h_t = apply_hyst_per_pair(df_tmp_t)

# Build submission with PP only (skip prior G overwrite)
cid_sorted = (keys_blend_te['game_play'].astype(str) + '_' + keys_blend_te['step'].astype(str) + '_' + keys_blend_te['p1'].astype(str) + '_' + keys_blend_te['p2'].astype(str))
pred_df_pp = pd.DataFrame({'contact_id': cid_sorted.values, 'contact_pp': pred_h_t.astype(int)})
ss = pd.read_csv('sample_submission.csv')
sub = ss.merge(pred_df_pp, on='contact_id', how='left')
sub['contact'] = sub['contact_pp'].fillna(0).astype(int)
sub = sub.drop(columns=['contact_pp'])
pp_ones = int(sub['contact'].sum())
print('PP (blend r40/r45 + cap3/2/1 thr-after-hyst) ones:', pp_ones)

sub.to_csv('submission.csv', index=False)
print('Saved submission.csv. Took {:.1f}s'.format(time.time()-t0))

xgboost version (pp-blend-r40-r45-cap321-thr-after-hyst): 2.1.4
Loading r=4.0 and r=4.5 supervised dyn train and test features...


Using features r40: 50 r45: 50
Training r=4.0 ...
  seed 42 ...


    fold 0 done in 36.7s; best_it=3253


    fold 1 done in 40.3s; best_it=3632


    fold 2 done in 37.7s; best_it=3326


    fold 3 done in 38.9s; best_it=3446


    fold 4 done in 37.6s; best_it=3468


    test model 0 0.2s


    test model 1 0.2s


    test model 2 0.2s


    test model 3 0.2s


    test model 4 0.2s


  seed 1337 ...


    fold 0 done in 38.2s; best_it=3385


    fold 1 done in 40.4s; best_it=3608


    fold 2 done in 36.3s; best_it=3140


    fold 3 done in 38.7s; best_it=3378


    fold 4 done in 39.7s; best_it=3609


    test model 0 0.2s


    test model 1 0.2s


    test model 2 0.2s


    test model 3 0.2s


    test model 4 0.2s


  seed 2025 ...


    fold 0 done in 39.2s; best_it=3453


    fold 1 done in 38.7s; best_it=3408


    fold 2 done in 37.8s; best_it=3284


    fold 3 done in 41.0s; best_it=3573


    fold 4 done in 37.5s; best_it=3388


    test model 0 0.2s


    test model 1 0.2s


    test model 2 0.2s


    test model 3 0.2s


    test model 4 0.2s


Training r=4.5 ...
  seed 42 ...


    fold 0 done in 46.6s; best_it=3688


    fold 1 done in 45.1s; best_it=3754


    fold 2 done in 43.3s; best_it=3466


    fold 3 done in 40.9s; best_it=3177


    fold 4 done in 44.9s; best_it=3799


    test model 0 0.2s


    test model 1 0.2s


    test model 2 0.2s


    test model 3 0.2s


    test model 4 0.2s


  seed 1337 ...


    fold 0 done in 45.1s; best_it=3799


    fold 1 done in 45.5s; best_it=3799


    fold 2 done in 45.2s; best_it=3467


    fold 3 done in 39.3s; best_it=2982


    fold 4 done in 46.3s; best_it=3777


    test model 0 0.2s


    test model 1 0.2s


    test model 2 0.2s


    test model 3 0.2s


    test model 4 0.2s


  seed 2025 ...


    fold 0 done in 44.7s; best_it=3798


    fold 1 done in 45.4s; best_it=3796


    fold 2 done in 45.9s; best_it=3519


    fold 3 done in 44.1s; best_it=3358


    fold 4 done in 45.4s; best_it=3716


    test model 0 0.2s


    test model 1 0.2s


    test model 2 0.2s


    test model 3 0.2s


    test model 4 0.2s


Applied distance-aware cap (3/2/1) to blended OOF. Kept nonzero: 440775 of 634192


 Fold 0 best after-hyst MCC=0.71737 thr_opp=0.800 thr_same=0.820


 Fold 1 best after-hyst MCC=0.74015 thr_opp=0.840 thr_same=0.840


 Fold 2 best after-hyst MCC=0.73547 thr_opp=0.840 thr_same=0.700


 Fold 3 best after-hyst MCC=0.73442 thr_opp=0.740 thr_same=0.740


 Fold 4 best after-hyst MCC=0.73501 thr_opp=0.810 thr_same=0.850
Fold-median thresholds after hysteresis (blend r40/r45 cap3/2/1): thr_opp=0.8100, thr_same=0.8200


Applied distance-aware caps (3/2/1) on test blend.


PP (blend r40/r45 + cap3/2/1 thr-after-hyst) ones: 6428


Saved submission.csv. Took 2226.8s


In [68]:
# Single r=4.5 PP pipeline (bug-fixed caps) + Integrated G-head (patched); thresholds after hysteresis; OR combine; write submission.csv
import os, time, sys, json, gc, math, itertools, warnings
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.metrics import matthews_corrcoef
import xgboost as xgb
warnings.filterwarnings('ignore')

t0 = time.time()
print('xgboost version (single-r45-pp+g, cap321-fix, thr-after-hyst):', xgb.__version__, flush=True)

# -------------------- IO --------------------
train_sup_fp = 'train_supervised_w5_helm_dyn_r45.parquet'
test_sup_fp  = 'test_pairs_w5_helm_dyn_r45.parquet'
folds_fp     = 'folds_game_play.csv'

assert Path(train_sup_fp).exists() and Path(test_sup_fp).exists(), 'Supervised r45 parquet files missing.'
folds_df = pd.read_csv(folds_fp)

# -------------------- Load supervised (r=4.5) --------------------
print('Loading r=4.5 supervised train/test...', flush=True)
train_df = pd.read_parquet(train_sup_fp)
test_df  = pd.read_parquet(test_sup_fp)

# Expected columns keys: ['game_play','step','p1','p2','same_team','distance', ..., 'contact' in train]
key_cols = ['game_play','step','p1','p2']
for c in key_cols + ['same_team','distance']:
    assert c in train_df.columns, f'missing {c} in train_df'
    assert c in test_df.columns, f'missing {c} in test_df'
assert 'contact' in train_df.columns

# Merge folds on game_play
train_df = train_df.merge(folds_df[['game_play','fold']], on='game_play', how='left')
assert train_df['fold'].notnull().all(), 'fold assignment missing for some rows'

# Features: drop keys/target/leak columns
drop_cols = set(key_cols + ['contact','fold'])
feat_cols = [c for c in train_df.columns if c not in drop_cols and train_df[c].dtype != 'O']
meta_cols = key_cols + ['same_team','distance']
meta_cols_merge = key_cols  # use only unique key for merges
print(f'Using {len(feat_cols)} features', flush=True)

# -------------------- Helpers --------------------
def add_group_sort_index(df):
    # canonical sort: do NOT include same_team in ordering
    return df.sort_values(['game_play','p1','p2','step'], kind='mergesort').reset_index(drop=True)

def roll_max_centered_by_group(df, prob_col, group_cols, win=3):
    # transform preserves alignment; no index juggling
    return df.groupby(group_cols, sort=False)[prob_col].transform(
        lambda x: x.rolling(window=win, center=True, min_periods=1).max()
    ).values

def hysteresis_2of3_sorted(pred_bin: np.ndarray, df_sorted: pd.DataFrame, group_cols: list) -> np.ndarray:
    assert isinstance(df_sorted.index, pd.RangeIndex)
    arr = pred_bin.astype(np.uint8)
    gvals = df_sorted[group_cols].to_numpy()

    new_group = np.zeros(len(df_sorted), dtype=bool)
    new_group[1:] = (gvals[1:] != gvals[:-1]).any(axis=1)
    end_group = np.zeros(len(df_sorted), dtype=bool)
    end_group[:-1] = new_group[1:]
    end_group[-1] = True

    prev = np.r_[arr[0], arr[:-1]]
    prev[new_group] = arr[new_group]
    nxt = np.r_[arr[1:], arr[-1]]
    nxt[end_group] = arr[end_group]

    out = (arr + prev + nxt >= 2).astype(np.uint8)
    return out

def mcc_fast(y_true, y_pred):
    return matthews_corrcoef(y_true, y_pred)

def apply_dual_threshold(proba, same_team, thr_opp, thr_same):
    thr = np.where(same_team.astype(np.int8)==1, thr_same, thr_opp)
    return (proba >= thr).astype(np.uint8)

def distance_bin_from_val(d):
    if d <= 1.6: return 0
    if d <= 2.4: return 1
    return 2

def apply_distance_caps_bugfixed(df_keys_meta, prob_smoothed):
    n = len(df_keys_meta)
    assert n == len(prob_smoothed)
    df = df_keys_meta.copy().reset_index(drop=True)
    df['prob_s'] = prob_smoothed.astype(np.float64)
    df['bin'] = df['distance'].apply(distance_bin_from_val).astype(np.int8)
    df['idx'] = np.arange(len(df), dtype=np.int64)
    a = df[['game_play','step','p1','bin','prob_s','idx']].rename(columns={'p1':'player'})
    b = df[['game_play','step','p2','bin','prob_s','idx']].rename(columns={'p2':'player'})
    long = pd.concat([a,b], axis=0, ignore_index=True)
    cap_map = {0:3, 1:2, 2:1}
    long['cap'] = long['bin'].map(cap_map).astype(np.int8)
    long['rank'] = long.groupby(['game_play','step','player','bin'], sort=False)['prob_s'].rank(method='first', ascending=False)
    keep_long = long[long['rank'] <= long['cap']]
    keep_ids = set(keep_long['idx'].values.tolist())
    keep_mask = np.isin(np.arange(len(df)), list(keep_ids))
    return keep_mask

def train_xgb_pp(X_train, y_train, X_valid, y_valid, params):
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_valid, label=y_valid)
    booster = xgb.train(params=params, dtrain=dtrain, num_boost_round=20000, evals=[(dvalid,'valid')],
                        early_stopping_rounds=400, verbose_eval=False)
    return booster

def train_xgb_g(X_train, y_train, X_valid, y_valid, params):
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_valid, label=y_valid)
    booster = xgb.train(params=params, dtrain=dtrain, num_boost_round=6000, evals=[(dvalid,'valid')],
                        early_stopping_rounds=200, verbose_eval=False)
    return booster

def predict_xgb(booster, X):
    return booster.predict(xgb.DMatrix(X), iteration_range=(0, booster.best_iteration+1))

# -------------------- PP model: 3-seed bagging, OOF/test --------------------
seeds = [42, 1337, 2025]
groups = train_df['game_play'].values
gkf = GroupKFold(n_splits=5)

oof_pred_accum = np.zeros((len(train_df),), dtype=np.float64)
test_pred_accum = np.zeros((len(test_df),), dtype=np.float64)

# compute scale_pos_weight (cap for stability)
pos = float(train_df['contact'].sum())
neg = float(len(train_df) - pos)
spw = max(1.0, neg / max(1.0, pos))
spw = min(spw, 5.0)
print(f'scale_pos_weight={spw:.2f} (neg={neg:.0f}, pos={pos:.0f})', flush=True)

params_base = {
    'objective':'binary:logistic',
    'eval_metric':'logloss',
    'tree_method':'gpu_hist',
    'max_depth':7,
    'eta':0.05,
    'subsample':0.8,
    'colsample_bytree':0.8,
    'min_child_weight':10,
    'lambda':1.0,
    'scale_pos_weight': spw,
}

print('Training PP r=4.5 single model (3 seeds)...', flush=True)
for sd in seeds:
    params = params_base.copy()
    params['seed'] = sd
    fold_idx = 0
    for tr_idx, va_idx in gkf.split(train_df, train_df['contact'].values, groups):
        t1 = time.time()
        X_tr = train_df.iloc[tr_idx][feat_cols].values
        y_tr = train_df.iloc[tr_idx]['contact'].values.astype(np.float32)
        X_va = train_df.iloc[va_idx][feat_cols].values
        y_va = train_df.iloc[va_idx]['contact'].values.astype(np.float32)
        bst = train_xgb_pp(X_tr, y_tr, X_va, y_va, params)
        oof_pred_accum[va_idx] += predict_xgb(bst, X_va)
        test_pred_accum += predict_xgb(bst, test_df[feat_cols].values)
        dt = time.time()-t1
        print(f'  seed {sd} fold {fold_idx} done in {dt:.1f}s; best_it={bst.best_iteration}', flush=True)
        del bst; gc.collect()
        fold_idx += 1

oof_pred = oof_pred_accum / len(seeds)
test_pred = test_pred_accum / (len(seeds)*5)
print('PP bagging complete.', flush=True)

# -------------------- Roll-max smoothing (centered w=3) --------------------
train_meta = train_df[key_cols + ['same_team','distance']].copy().reset_index(drop=True)
test_meta  = test_df[key_cols + ['same_team','distance']].copy().reset_index(drop=True)

train_meta['proba'] = oof_pred
test_meta['proba']  = test_pred

train_meta = add_group_sort_index(train_meta)
test_meta  = add_group_sort_index(test_meta)

train_meta['proba_s'] = roll_max_centered_by_group(train_meta, 'proba', ['game_play','p1','p2'], win=3)
test_meta['proba_s']  = roll_max_centered_by_group(test_meta,  'proba', ['game_play','p1','p2'], win=3)

# -------------------- Distance-aware caps (3/2/1) with bug-fixed mask --------------------
keep_mask_train = apply_distance_caps_bugfixed(train_meta[key_cols + ['same_team','distance']], train_meta['proba_s'].values)
keep_mask_test  = apply_distance_caps_bugfixed(test_meta[key_cols + ['same_team','distance']],  test_meta['proba_s'].values)
print(f'Applied distance-aware caps. Train kept_nonzero: {keep_mask_train.sum()} of {len(keep_mask_train)}', flush=True)
print(f'Applied distance-aware caps. Test  kept_nonzero: {keep_mask_test.sum()} of {len(keep_mask_test)}', flush=True)

train_meta['proba_sc'] = np.where(keep_mask_train, train_meta['proba_s'].values, 0.0)
test_meta['proba_sc']  = np.where(keep_mask_test,  test_meta['proba_s'].values,  0.0)

# Bring back to original train/test row order via deterministic key merge (keys only, not same_team)
orig_train = train_df[key_cols + ['same_team','distance','contact','fold']].reset_index(drop=True).copy()
tmp = train_meta.reset_index(drop=True).copy()
m_train = orig_train.merge(tmp[meta_cols_merge + ['proba_sc']], on=meta_cols_merge, how='left', validate='one_to_one')
assert m_train['proba_sc'].notnull().all(), 'alignment failure in train_meta'

orig_test = test_df[key_cols + ['same_team','distance']].reset_index(drop=True).copy()
tmp2 = test_meta.reset_index(drop=True).copy()
m_test = orig_test.merge(tmp2[meta_cols_merge + ['proba_sc']], on=meta_cols_merge, how='left', validate='one_to_one')
assert m_test['proba_sc'].notnull().all(), 'alignment failure in test_meta'

# Quantiles sanity per fold (after smoothing+caps)
for f in sorted(train_df['fold'].unique()):
    m = (train_df['fold'].values == f)
    q = np.quantile(m_train.loc[m, 'proba_sc'].values, [0, 0.5, 0.9, 0.99])
    print(f'PP fold {f} proba_sc quantiles:', q, flush=True)

# -------------------- Threshold optimization AFTER hysteresis (per-fold), then fold-median --------------------
thr_grid = np.round(np.arange(0.60, 0.921, 0.02), 3)
sort_cols_pp = ['game_play','p1','p2','step']
best_by_fold = []
for f in sorted(train_df['fold'].unique()):
    mask_f = (train_df['fold'].values == f)
    df_f = m_train.loc[mask_f, ['game_play','p1','p2','step','same_team','distance','proba_sc','contact']].copy()
    df_f = df_f.sort_values(sort_cols_pp).reset_index(drop=True)
    assert isinstance(df_f.index, pd.RangeIndex)
    y = df_f['contact'].astype(int).values
    best_mcc = -1.0; best_thr = (0.80, 0.80)
    for thr_opp in thr_grid:
        for thr_same in thr_grid:
            pred_bin = apply_dual_threshold(df_f['proba_sc'].values, df_f['same_team'].values, thr_opp, thr_same)
            pred_hyst = hysteresis_2of3_sorted(pred_bin, df_f, ['game_play','p1','p2'])
            mcc = mcc_fast(y, pred_hyst)
            if mcc > best_mcc:
                best_mcc = mcc; best_thr = (thr_opp, thr_same)
    print(f' Fold {f} best after-hyst MCC={best_mcc:.5f} thr_opp={best_thr[0]:.3f} thr_same={best_thr[1]:.3f}', flush=True)
    best_by_fold.append((f, best_mcc, best_thr[0], best_thr[1]))

thr_opp_med = float(np.median([t[2] for t in best_by_fold]))
thr_same_med = float(np.median([t[3] for t in best_by_fold]))
print(f'Fold-median thresholds after hysteresis (PP r45): thr_opp={thr_opp_med:.4f}, thr_same={thr_same_med:.4f}', flush=True)

# Apply to test: sort -> threshold -> hysteresis -> restore original order
m_test_sorted = m_test.copy()
m_test_sorted['_orig_idx'] = np.arange(len(m_test_sorted))
m_test_sorted = m_test_sorted.sort_values(sort_cols_pp).reset_index(drop=True)
assert isinstance(m_test_sorted.index, pd.RangeIndex)
# Apply a small test-only threshold offset
pp_thr_offset = -0.02
thr_opp_med_t = max(0.0, thr_opp_med - pp_thr_offset)
thr_same_med_t = max(0.0, thr_same_med - pp_thr_offset)
pp_test_bin_sorted = apply_dual_threshold(m_test_sorted['proba_sc'].values, m_test_sorted['same_team'].values, thr_opp_med_t, thr_same_med_t)
pp_test_bin_sorted = hysteresis_2of3_sorted(pp_test_bin_sorted, m_test_sorted, ['game_play','p1','p2'])
pp_test_bin = np.zeros((len(m_test),), dtype=np.uint8)
pp_test_bin[m_test_sorted['_orig_idx'].values] = pp_test_bin_sorted

# Build canonical PP contact_id and in-sample PP count
ss = pd.read_csv('sample_submission.csv')
ss_set = set(ss['contact_id'].astype(str))
p1i = m_test['p1'].astype(int).to_numpy()
p2i = m_test['p2'].astype(int).to_numpy()
stepi = m_test['step'].astype(int).to_numpy()
pmin = np.minimum(p1i, p2i)
pmax = np.maximum(p1i, p2i)
cid_pp = (m_test['game_play'].astype(str) + '_' + stepi.astype(str) + '_' + pmin.astype(str) + '_' + pmax.astype(str)).astype(str)
pp_in_mask = cid_pp.isin(ss_set).to_numpy()
pp_pos = int(pp_test_bin.sum())
pp_pos_in = int((pp_test_bin.astype(bool) & pp_in_mask).sum())
print(f'PP positives after full chain: {pp_pos} (in-sample: {pp_pos_in})', flush=True)

# -------------------- G-head (integrated, PATCHED) --------------------
print('Training G-head (per-player) ...', flush=True)
train_trk = pd.read_csv('train_player_tracking.csv')
test_trk  = pd.read_csv('test_player_tracking.csv')
labels = pd.read_csv('train_labels.csv')

# Normalize position column names if present
rename_map = {}
if 'x_position' in train_trk.columns and 'y_position' in train_trk.columns:
    rename_map.update({'x_position':'x','y_position':'y'})
train_trk = train_trk.rename(columns=rename_map)
test_trk  = test_trk.rename(columns=rename_map)

# Minimal per-player features from tracking (select available)
base_cols = ['game_play','step','nfl_player_id']
cand_cols = ['x','y','speed','acceleration','o','dir']
avail = [c for c in cand_cols if c in train_trk.columns]
use_cols = base_cols + avail
train_trk = train_trk[use_cols].copy()
test_trk  = test_trk[[c for c in use_cols if c in test_trk.columns]].copy()

# unify dtype for ids
train_trk['nfl_player_id'] = train_trk['nfl_player_id'].astype(str)
test_trk['nfl_player_id'] = test_trk['nfl_player_id'].astype(str)

def add_player_feats(df):
    df = df.sort_values(['game_play','nfl_player_id','step']).reset_index(drop=True)
    for col in [c for c in ['x','y','speed','acceleration'] if c in df.columns]:
        df[f'd_{col}'] = df.groupby(['game_play','nfl_player_id'])[col].diff().fillna(0.0)
    if 'acceleration' in df.columns:
        df['jerk'] = df.groupby(['game_play','nfl_player_id'])['acceleration'].diff().fillna(0.0)
    if 'speed' in df.columns:
        df['speed_drop_3'] = df['speed'] - df.groupby(['game_play','nfl_player_id'])['speed'].shift(3).fillna(df['speed'])
        # rolling extrema (w=3) fast lifts
        df['speed_min_3'] = df.groupby(['game_play','nfl_player_id'])['speed'].rolling(3, min_periods=1).min().reset_index(level=[0,1], drop=True)
        df['speed_max_3'] = df.groupby(['game_play','nfl_player_id'])['speed'].rolling(3, min_periods=1).max().reset_index(level=[0,1], drop=True)
    if 'acceleration' in df.columns:
        df['accel_r3_min'] = df.groupby(['game_play','nfl_player_id'])['acceleration'].rolling(3, min_periods=1).min().reset_index(level=[0,1], drop=True)
    if 'o' in df.columns:
        df['d_o'] = df.groupby(['game_play','nfl_player_id'])['o'].diff().fillna(0.0)
        df['d_o'] = (df['d_o'] + 180) % 360 - 180
    if 'dir' in df.columns:
        df['d_dir'] = df.groupby(['game_play','nfl_player_id'])['dir'].diff().fillna(0.0)
        df['d_dir'] = (df['d_dir'] + 180) % 360 - 180
    df['time_since_snap'] = df.groupby('game_play')['step'].transform(lambda s: s - s.min())
    return df

train_trk = add_player_feats(train_trk)
test_trk  = add_player_feats(test_trk)

# 1) Build opponent proximity from r=4.5 PP pair tables (feature only)
def build_opp_prox_feats(df_pairs, dist_thr=3.5):
    use = df_pairs.loc[df_pairs['same_team']==0, ['game_play','step','p1','p2','distance','approaching_flag','rel_speed']].copy()
    a = use[['game_play','step','p1','distance','approaching_flag','rel_speed']].rename(columns={'p1':'nfl_player_id'})
    b = use[['game_play','step','p2','distance','approaching_flag','rel_speed']].rename(columns={'p2':'nfl_player_id'})
    long = pd.concat([a,b], ignore_index=True)
    agg = long.groupby(['game_play','step','nfl_player_id'], as_index=False).agg(
        min_opp_dist=('distance','min'),
        has_approaching=('approaching_flag','max'),
        max_rel_speed=('rel_speed','max')
    )
    agg['close_opp'] = (agg['min_opp_dist'] <= dist_thr).astype(np.int8)
    agg['approaching_fast'] = ((agg['has_approaching']==1) | (agg['max_rel_speed'] >= 1.5)).astype(np.int8)
    return agg

prox_train = build_opp_prox_feats(train_df, dist_thr=3.5)
prox_test  = build_opp_prox_feats(test_df,  dist_thr=3.5)
prox_train['nfl_player_id'] = prox_train['nfl_player_id'].astype(str)
prox_test['nfl_player_id']  = prox_test['nfl_player_id'].astype(str)

train_trk = train_trk.merge(prox_train, on=['game_play','step','nfl_player_id'], how='left')
test_trk  = test_trk.merge(prox_test,  on=['game_play','step','nfl_player_id'], how='left')
for c in ['min_opp_dist','close_opp','approaching_fast','has_approaching','max_rel_speed']:
    fill = 99.0 if c=='min_opp_dist' else 0
    train_trk[c] = train_trk[c].fillna(fill)
    test_trk[c]  = test_trk[c].fillna(fill)

# Quick extra features
train_trk['time_since_snap_sq'] = train_trk['time_since_snap']**2
test_trk['time_since_snap_sq']  = test_trk['time_since_snap']**2
train_trk = train_trk.sort_values(['game_play','nfl_player_id','step']).reset_index(drop=True)
test_trk  = test_trk.sort_values(['game_play','nfl_player_id','step']).reset_index(drop=True)
train_trk['min_opp_dist_lag'] = train_trk.groupby(['game_play','nfl_player_id'])['min_opp_dist'].shift(1)
test_trk['min_opp_dist_lag']  = test_trk.groupby(['game_play','nfl_player_id'])['min_opp_dist'].shift(1)
train_trk['min_opp_dist_lag'] = train_trk['min_opp_dist_lag'].fillna(train_trk['min_opp_dist'])
test_trk['min_opp_dist_lag']  = test_trk['min_opp_dist_lag'].fillna(test_trk['min_opp_dist'])

# Additional high-signal features (per expert advice)
train_trk['d_min_opp_dist'] = np.clip(train_trk['min_opp_dist'] - train_trk['min_opp_dist_lag'], -2.0, 2.0)
test_trk['d_min_opp_dist']  = np.clip(test_trk['min_opp_dist'] - test_trk['min_opp_dist_lag'], -2.0, 2.0)
if 'speed_min_3' in train_trk.columns and 'speed_max_3' in train_trk.columns:
    train_trk['speed_range_3'] = train_trk['speed_max_3'] - train_trk['speed_min_3']
    test_trk['speed_range_3']  = test_trk['speed_max_3'] - test_trk['speed_min_3']
train_trk['decel_and_close'] = (((train_trk.get('acceleration', pd.Series(0)).astype(float) <= -0.4) & (train_trk['min_opp_dist'].astype(float) <= 3.5)).astype(np.int8))
test_trk['decel_and_close']  = (((test_trk.get('acceleration', pd.Series(0)).astype(float) <= -0.4) & (test_trk['min_opp_dist'].astype(float) <= 3.5)).astype(np.int8))

# Build G labels from contact_id pattern: GP1_GP2_STEP_PLAYER_G (NO ±1 expansion)
labels_g = labels[labels['contact_id'].str.endswith('_G')].copy()
parts = labels_g['contact_id'].str.split('_')
labels_g['game_play'] = parts.str[0] + '_' + parts.str[1]
labels_g['step'] = parts.str[2].astype(int)
labels_g['player'] = parts.str[3]
labels_g = labels_g[['game_play','step','player','contact']].copy()

train_trk['player'] = train_trk['nfl_player_id'].astype(str)
train_g = train_trk.merge(labels_g, on=['game_play','step','player'], how='left')
train_g['contact'] = train_g['contact'].fillna(0).astype(int)

# G folds by game_play
gp_g = train_g['game_play'].values
gkf_g = GroupKFold(n_splits=5)
g_feat_drop = set(['game_play','step','nfl_player_id','player','contact'])
g_feat_cols = [c for c in train_g.columns if c not in g_feat_drop and train_g[c].dtype != 'O']
for _c in ['time_since_snap_sq','min_opp_dist_lag']:
    if _c not in g_feat_cols and _c in train_g.columns:
        g_feat_cols.append(_c)
print(f'G-head features: {len(g_feat_cols)}', flush=True)

g_oof_accum = np.zeros((len(train_g),), dtype=np.float64)
g_test_accum = np.zeros((len(test_trk),), dtype=np.float64)

for sd in seeds:
    fi = 0
    for tr_idx, va_idx in gkf_g.split(train_g, train_g['contact'].values, gp_g):
        X_tr = train_g.iloc[tr_idx][g_feat_cols].values
        y_tr = train_g.iloc[tr_idx]['contact'].values.astype(np.float32)
        X_va = train_g.iloc[va_idx][g_feat_cols].values
        y_va = train_g.iloc[va_idx]['contact'].values.astype(np.float32)
        # per-fold SPW with clip and stronger regularization
        posc = float((y_tr==1).sum()); negc = float(len(y_tr)-posc)
        spw_g = min(max(1.0, negc / max(1.0, posc)), 5.0)
        params_g = {
            'objective':'binary:logistic',
            'eval_metric':'logloss',
            'tree_method':'gpu_hist',
            'max_depth':6,
            'eta':0.04,
            'subsample':0.8,
            'colsample_bytree':0.75,
            'min_child_weight':20,
            'lambda':2.5,
            'alpha':0.3,
            'max_delta_step':1,
            'scale_pos_weight': spw_g,
            'seed': int(sd)
        }
        bst = train_xgb_g(X_tr, y_tr, X_va, y_va, params_g)
        g_oof_accum[va_idx] += predict_xgb(bst, X_va)
        g_test_accum += predict_xgb(bst, test_trk[g_feat_cols].values)
        print(f'  G seed {sd} fold {fi} best_it={bst.best_iteration}', flush=True)
        del bst; gc.collect(); fi += 1

g_oof = g_oof_accum / len(seeds)
g_test = g_test_accum / (len(seeds)*5)

# Post-proc for G (patched order):
# 1) Build meta with proba_raw
train_g_meta = train_trk[['game_play','step','nfl_player_id','speed','acceleration','speed_drop_3','time_since_snap','close_opp']].copy().reset_index(drop=True)
test_g_meta  = test_trk[['game_play','step','nfl_player_id','speed','acceleration','speed_drop_3','time_since_snap','close_opp']].copy().reset_index(drop=True)
train_g_meta['proba_raw'] = g_oof
test_g_meta['proba_raw']  = g_test

# 2) Kinematic-first gate on raw probabilities (relaxed thresholds)
def kinematic_gate(df):
    speed = df['speed'] if 'speed' in df.columns else pd.Series(1.0, index=df.index)
    accel = df['acceleration'] if 'acceleration' in df.columns else pd.Series(0.0, index=df.index)
    sdrop = df['speed_drop_3'] if 'speed_drop_3' in df.columns else pd.Series(0.0, index=df.index)
    tss = df['time_since_snap'] if 'time_since_snap' in df.columns else pd.Series(0, index=df.index)
    gate = ((speed <= 1.6) | (accel <= -0.3) | (sdrop <= -0.3)) & (tss >= 2)
    return gate.values

train_gate = kinematic_gate(train_g_meta)
test_gate  = kinematic_gate(test_g_meta)
print('G gate train True:', int(train_gate.sum()), 'of', len(train_gate), flush=True)
print('G gate test  True:', int(test_gate.sum()),  'of', len(test_gate),  flush=True)
train_g_meta['proba_raw'] = np.where(train_gate, train_g_meta['proba_raw'].values, 0.0)
test_g_meta['proba_raw']  = np.where(test_gate,  test_g_meta['proba_raw'].values,  0.0)
print('Nonzero proba_raw after gate (train/test):', int((train_g_meta['proba_raw']>0).sum()), int((test_g_meta['proba_raw']>0).sum()), flush=True)
if (train_g_meta['proba_raw']>0).any():
    print('Train proba_raw gated quantiles:', np.quantile(train_g_meta.loc[train_g_meta['proba_raw']>0,'proba_raw'].values, [0.9,0.95,0.99,0.999]), flush=True)
if (test_g_meta['proba_raw']>0).any():
    print('Test proba_raw gated quantiles:', np.quantile(test_g_meta.loc[test_g_meta['proba_raw']>0,'proba_raw'].values, [0.9,0.95,0.99,0.999]), flush=True)

# 3) Cap top-k per (game_play, step) on RAW proba (alignment-safe, no reordering)
def cap_topk_rank_inplace(df: pd.DataFrame, proba_col: str, k: int = 1, min_prob: float = 0.0) -> np.ndarray:
    ranks = df.groupby(['game_play','step'], sort=False)[proba_col].rank(method='first', ascending=False)
    keep = (ranks <= k) & (df[proba_col] >= min_prob)
    out = np.where(keep.values, df[proba_col].values, 0.0).astype(np.float64)
    # Assert cap holds
    cnt = df.assign(_nz=(out>0).astype(int)).groupby(['game_play','step'], sort=False)['_nz'].sum().max()
    assert int(cnt) <= k, f'Cap violation: found {int(cnt)} > k={k} per step'
    return out

k_cap = 2
min_prob_cap = 0.0
train_g_meta['proba_gc'] = cap_topk_rank_inplace(train_g_meta, 'proba_raw', k=k_cap, min_prob=min_prob_cap)
test_g_meta['proba_gc']  = cap_topk_rank_inplace(test_g_meta,  'proba_raw', k=k_cap, min_prob=min_prob_cap)
print('Test G candidates after cap (proba_gc>0):', int((test_g_meta['proba_gc']>0).sum()), flush=True)

# 4) Light smoothing AFTER cap (w=3)
def roll_max_centered_by_group_player(df, prob_col, win=3):
    return df.groupby(['game_play','nfl_player_id'], sort=False)[prob_col].transform(
        lambda x: x.rolling(win, center=True, min_periods=1).max()
    ).values

train_g_meta = train_g_meta.sort_values(['game_play','nfl_player_id','step']).reset_index(drop=True)
test_g_meta  = test_g_meta.sort_values(['game_play','nfl_player_id','step']).reset_index(drop=True)
train_g_meta['proba_gcs'] = roll_max_centered_by_group_player(train_g_meta, 'proba_gc', win=3)
test_g_meta['proba_gcs']  = roll_max_centered_by_group_player(test_g_meta,  'proba_gc', win=3)
if (train_g_meta['proba_gcs']>0).any():
    print('Train proba_gcs gated quantiles:', np.quantile(train_g_meta.loc[train_g_meta['proba_gcs']>0,'proba_gcs'].values, [0.9,0.95,0.99]), flush=True)
if (test_g_meta['proba_gcs']>0).any():
    print('Test proba_gcs gated quantiles:', np.quantile(test_g_meta.loc[test_g_meta['proba_gcs']>0,'proba_gcs'].values, [0.9,0.95,0.99]), flush=True)

# attach labels to train_g_meta for thresholding
train_g_meta = train_g_meta.merge(train_g[['game_play','step','nfl_player_id','contact']], on=['game_play','step','nfl_player_id'], how='left', validate='one_to_one')
train_g_meta['contact'] = train_g_meta['contact'].fillna(0).astype(int)

# Threshold search grid and RATE-based guardrail per fold
thr_grid_g = np.round(np.arange(0.42, 0.905, 0.005), 3)
best_thr_g = []
for f in sorted(np.unique(train_df['fold'].values)):
    gps_fold = set(train_df.loc[train_df['fold']==f, 'game_play'].unique().tolist())
    idx = train_g_meta['game_play'].isin(gps_fold).values
    df_f = train_g_meta.loc[idx].copy()
    df_f = df_f.sort_values(['game_play','nfl_player_id','step']).reset_index(drop=True)
    assert isinstance(df_f.index, pd.RangeIndex)
    y = df_f['contact'].values.astype(int)
    fold_size = len(df_f)
    cap_rate = 0.0125
    cap_pos = int(np.floor(cap_rate * fold_size))
    nz = int((df_f['proba_gcs']>0).sum())
    print(f' G Fold {f}: fold_size={fold_size} cap_pos={cap_pos} nz={nz}', flush=True)
    best_mcc=-1.0; best_thr=None; best_pos=None
    for thr in thr_grid_g:
        pred = (df_f['proba_gcs'].values >= thr).astype(np.uint8)
        pred_h = hysteresis_2of3_sorted(pred, df_f, ['game_play','nfl_player_id'])
        num_pos = int(pred_h.sum())
        if num_pos > cap_pos:
            continue
        mcc = mcc_fast(y, pred_h)
        if mcc > best_mcc:
            best_mcc = mcc; best_thr = float(thr); best_pos = num_pos
    if best_thr is None:
        # fallback: choose highest thr with <= cap_pos; if none, use 0.98
        fallback_thr = None; fallback_pos = None
        for thr in thr_grid_g[::-1]:
            pred = (df_f['proba_gcs'].values >= thr).astype(np.uint8)
            pred_h = hysteresis_2of3_sorted(pred, df_f, ['game_play','nfl_player_id'])
            num_pos = int(pred_h.sum())
            if num_pos <= cap_pos:
                fallback_thr = float(thr); fallback_pos = int(num_pos); break
        if fallback_thr is None:
            fallback_thr = 0.98; fallback_pos = -1
        best_thr = fallback_thr; best_pos = fallback_pos
    cap_flag = 'CAP95' if (best_pos is not None and cap_pos > 0 and best_pos >= 0.95*cap_pos) else ''
    print(f' G Fold {f} best after-hyst thr={best_thr:.3f} pos={best_pos} (cap={cap_pos}) {cap_flag}', flush=True)
    best_thr_g.append(best_thr)

thr_g_med = float(np.median(best_thr_g))
print(f'G fold-median threshold after hysteresis: thr_g={thr_g_med:.4f}', flush=True)

# Apply to test G: optional test-side threshold scan to hit target G=1.5k-1.8k with combined<=8.8k (in-sample counts)
test_g_meta_sorted = test_g_meta.copy()
test_g_meta_sorted['_orig_idx'] = np.arange(len(test_g_meta_sorted))
test_g_meta_sorted = test_g_meta_sorted.sort_values(['game_play','nfl_player_id','step']).reset_index(drop=True)
assert isinstance(test_g_meta_sorted.index, pd.RangeIndex)
thr_test = float(thr_g_med)
target_lo, target_hi = 1500, 1800
# Build canonical G contact_id for original-order meta and in-sample mask
stepi_go = test_g_meta['step'].astype(int).to_numpy()
pidi_go = test_g_meta['nfl_player_id'].astype(int).to_numpy()
cid_g_orig = (test_g_meta['game_play'].astype(str) + '_' + stepi_go.astype(str) + '_' + pidi_go.astype(str) + '_G').astype(str)
g_in_mask_orig = cid_g_orig.isin(ss_set).to_numpy()
for thr in thr_grid_g[::-1]:  # high -> low
    pred_sorted = (test_g_meta_sorted['proba_gcs'].values >= thr).astype(np.uint8)
    pred_sorted = hysteresis_2of3_sorted(pred_sorted, test_g_meta_sorted, ['game_play','nfl_player_id'])
    tmp = np.zeros((len(test_g_meta_sorted),), dtype=np.uint8)
    tmp[test_g_meta_sorted['_orig_idx'].values] = pred_sorted
    g_cnt = int(tmp.sum())
    g_cnt_in = int((tmp.astype(bool) & g_in_mask_orig).sum())
    combined_cnt_in = int(pp_pos_in + g_cnt_in)
    if (g_cnt_in >= target_lo) and (g_cnt_in <= target_hi) and (combined_cnt_in <= 8800):
        thr_test = float(thr)
        print(f'Override test thr_g to {thr_test:.3f} for G_in={g_cnt_in} and combined_in={combined_cnt_in}', flush=True)
        break

# Final apply to test with chosen thr_test
g_test_bin_sorted = (test_g_meta_sorted['proba_gcs'].values >= thr_test).astype(np.uint8)
g_test_bin_sorted = hysteresis_2of3_sorted(g_test_bin_sorted, test_g_meta_sorted, ['game_play','nfl_player_id'])
g_test_bin = np.zeros((len(test_g_meta),), dtype=np.uint8)
g_test_bin[test_g_meta_sorted['_orig_idx'].values] = g_test_bin_sorted
g_pos = int(g_test_bin.sum())
g_pos_in = int((g_test_bin.astype(bool) & g_in_mask_orig).sum())
print(f'G positives after full chain (gated+cap): {g_pos} (in-sample: {g_pos_in})', flush=True)

# -------------------- Build submission: OR combine PP and G with canonical IDs --------------------
# PP rows (canonical p_min/p_max)
sub_pp = pd.DataFrame({'contact_id': cid_pp.astype(str), 'contact': pp_test_bin.astype(int)})
# G rows (canonical)
stepi_g = test_g_meta['step'].astype(int).to_numpy()
pidi_g = test_g_meta['nfl_player_id'].astype(int).to_numpy()
cid_g = (test_g_meta['game_play'].astype(str) + '_' + stepi_g.astype(str) + '_' + pidi_g.astype(str) + '_G').astype(str)
sub_g = pd.DataFrame({'contact_id': cid_g.astype(str), 'contact': g_test_bin.astype(int)})
sub_g = sub_g[sub_g['contact'] > 0]

# Combine via OR, align to sample
sub = pd.concat([sub_pp, sub_g], ignore_index=True).groupby('contact_id', as_index=False)['contact'].max()
pp_in = int(sub.loc[sub['contact_id'].isin(ss_set) & (~sub['contact_id'].str.endswith('_G')), 'contact'].sum())
g_in = int(sub.loc[sub['contact_id'].isin(ss_set) & (sub['contact_id'].str.endswith('_G')), 'contact'].sum())
print(f'Final counts BEFORE merge (in-sample): PP={pp_in}, G={g_in}, combined={pp_in+g_in}', flush=True)
sub = ss[['contact_id']].merge(sub, on='contact_id', how='left')
sub['contact'] = sub['contact'].fillna(0).astype(int)
sub.to_csv('submission.csv', index=False)
print('Saved submission.csv', flush=True)
print(f'Total time: {time.time()-t0:.1f}s', flush=True)

xgboost version (single-r45-pp+g, cap321-fix, thr-after-hyst): 2.1.4


Loading r=4.5 supervised train/test...


Using 50 features


scale_pos_weight=5.00 (neg=696815, pos=48809)


Training PP r=4.5 single model (3 seeds)...


  seed 42 fold 0 done in 36.3s; best_it=2533


  seed 42 fold 1 done in 32.2s; best_it=2289


  seed 42 fold 2 done in 29.1s; best_it=2026


  seed 42 fold 3 done in 28.0s; best_it=1951


  seed 42 fold 4 done in 33.1s; best_it=2406


  seed 1337 fold 0 done in 36.0s; best_it=2510


  seed 1337 fold 1 done in 30.4s; best_it=2119


  seed 1337 fold 2 done in 28.9s; best_it=1987


  seed 1337 fold 3 done in 24.1s; best_it=1591


  seed 1337 fold 4 done in 35.5s; best_it=2568


  seed 2025 fold 0 done in 35.4s; best_it=2420


  seed 2025 fold 1 done in 36.4s; best_it=2600


  seed 2025 fold 2 done in 29.1s; best_it=2017


  seed 2025 fold 3 done in 26.2s; best_it=1725


  seed 2025 fold 4 done in 32.2s; best_it=2283


PP bagging complete.


Applied distance-aware caps. Train kept_nonzero: 461184 of 745624


Applied distance-aware caps. Test  kept_nonzero: 196178 of 319769


PP fold 0 proba_sc quantiles: [0.00000000e+00 1.57871132e-05 3.99024809e-01 9.90915716e-01]


PP fold 1 proba_sc quantiles: [0.00000000e+00 2.37228955e-05 4.50268100e-01 9.92166954e-01]


PP fold 2 proba_sc quantiles: [0.00000000e+00 1.83392870e-05 4.15545891e-01 9.92218574e-01]


PP fold 3 proba_sc quantiles: [0.00000000e+00 1.80452072e-05 4.30632964e-01 9.91201311e-01]


PP fold 4 proba_sc quantiles: [0.00000000e+00 2.60238642e-05 4.86056199e-01 9.92451434e-01]


 Fold 0 best after-hyst MCC=0.71530 thr_opp=0.720 thr_same=0.780


 Fold 1 best after-hyst MCC=0.74015 thr_opp=0.760 thr_same=0.700


 Fold 2 best after-hyst MCC=0.73827 thr_opp=0.780 thr_same=0.620


 Fold 3 best after-hyst MCC=0.73629 thr_opp=0.660 thr_same=0.600


 Fold 4 best after-hyst MCC=0.73990 thr_opp=0.740 thr_same=0.800


Fold-median thresholds after hysteresis (PP r45): thr_opp=0.7400, thr_same=0.7000


PP positives after full chain: 6371 (in-sample: 6345)


Training G-head (per-player) ...


G-head features: 24


  G seed 42 fold 0 best_it=974


  G seed 42 fold 1 best_it=1686


  G seed 42 fold 2 best_it=798


  G seed 42 fold 3 best_it=1319


  G seed 42 fold 4 best_it=1258


  G seed 1337 fold 0 best_it=1150


  G seed 1337 fold 1 best_it=2075


  G seed 1337 fold 2 best_it=769


  G seed 1337 fold 3 best_it=1488


  G seed 1337 fold 4 best_it=995


  G seed 2025 fold 0 best_it=1012


  G seed 2025 fold 1 best_it=1933


  G seed 2025 fold 2 best_it=656


  G seed 2025 fold 3 best_it=1430


  G seed 2025 fold 4 best_it=1169


G gate train True: 972034 of 1225299


G gate test  True: 104903 of 127754


Nonzero proba_raw after gate (train/test): 972034 104903


Train proba_raw gated quantiles: [0.01858482 0.11650438 0.61017988 0.90421667]


Test proba_raw gated quantiles: [0.1420511  0.31259542 0.694529   0.92223667]


Test G candidates after cap (proba_gc>0): 11471


Train proba_gcs gated quantiles: [0.43306754 0.67890182 0.90120914]


Test proba_gcs gated quantiles: [0.55948352 0.7268164  0.91980044]


 G Fold 0: fold_size=240966 cap_pos=3012 nz=31437


 G Fold 0 best after-hyst thr=0.465 pos=2986 (cap=3012) CAP95


 G Fold 1: fold_size=246994 cap_pos=3087 nz=31778


 G Fold 1 best after-hyst thr=0.420 pos=3048 (cap=3087) CAP95


 G Fold 2: fold_size=233992 cap_pos=2924 nz=30571


 G Fold 2 best after-hyst thr=0.515 pos=2846 (cap=2924) CAP95


 G Fold 3: fold_size=237314 cap_pos=2966 nz=30801


 G Fold 3 best after-hyst thr=0.540 pos=2858 (cap=2966) CAP95


 G Fold 4: fold_size=266033 cap_pos=3325 nz=34428


 G Fold 4 best after-hyst thr=0.515 pos=2520 (cap=3325) 


G fold-median threshold after hysteresis: thr_g=0.5150


Override test thr_g to 0.475 for G_in=1507 and combined_in=7852


G positives after full chain (gated+cap): 2323 (in-sample: 1507)


Final counts BEFORE merge (in-sample): PP=6345, G=1507, combined=7852


Saved submission.csv


Total time: 677.9s


In [3]:
# Debug: Inspect columns of r=4.5 supervised parquet files to fix key column names
import pandas as pd
from pathlib import Path
train_sup_fp = 'train_supervised_w5_helm_dyn_r45.parquet'
test_sup_fp  = 'test_pairs_w5_helm_dyn_r45.parquet'
print('Exist:', Path(train_sup_fp).exists(), Path(test_sup_fp).exists())
train_df_head = pd.read_parquet(train_sup_fp)
test_df_head = pd.read_parquet(test_sup_fp)
print('train columns:', list(train_df_head.columns))
print('test  columns:', list(test_df_head.columns))
print('head train:')
print(train_df_head.head(3))
print('head test:')
print(test_df_head.head(3))

Exist: True True
train columns: ['game_play', 'step', 'p1', 'p2', 'contact', 'distance', 'rel_dx', 'rel_dy', 'speed1', 'speed2', 'accel1', 'accel2', 'closing', 'abs_closing', 'abs_d_heading', 'same_team', 'team1', 'team2', 'pos1', 'pos2', 'dist_min_p5', 'dist_mean_p5', 'dist_max_p5', 'dist_std_p5', 'abs_close_min_p5', 'abs_close_mean_p5', 'abs_close_max_p5', 'abs_close_std_p5', 'cnt_dist_lt15_p5', 'cnt_dist_lt20_p5', 'cnt_dist_lt25_p5', 'dist_delta_p5', 'px_dist_norm_min', 'views_both_present', 'approaching_flag', 'ttc_raw', 'ttc_clip', 'ttc_log', 'inv_ttc', 'd_dist_1', 'd_dist_2', 'd_dist_5', 'd_close_1', 'd_absclose_1', 'd_speed1_1', 'd_speed2_1', 'd_accel1_1', 'd_accel2_1', 'rm3_d_dist_1', 'rm3_d_close_1', 'rel_speed', 'abs_rel_speed', 'rel_accel', 'abs_rel_accel', 'jerk1', 'jerk2', 'd_px_norm_1', 'cnt_px_lt006_p3', 'cnt_px_lt008_p3']
test  columns: ['game_play', 'step', 'p1', 'p2', 'distance', 'rel_dx', 'rel_dy', 'speed1', 'speed2', 'accel1', 'accel2', 'closing', 'abs_closing', 'ab

In [7]:
# Integrate fresh G-head (fast) and OR-combine with existing PP (from submission.csv non-G rows); write improved submission.csv
import time, gc, numpy as np, pandas as pd, xgboost as xgb
from sklearn.model_selection import GroupKFold

t0 = time.time()
print('G-head quick integration start...', flush=True)

# 1) Load tracking and labels
train_trk = pd.read_csv('train_player_tracking.csv')
test_trk  = pd.read_csv('test_player_tracking.csv')
labels = pd.read_csv('train_labels.csv')

rename_map = {}
if 'x_position' in train_trk.columns and 'y_position' in train_trk.columns:
    rename_map.update({'x_position':'x','y_position':'y'})
train_trk = train_trk.rename(columns=rename_map)
test_trk  = test_trk.rename(columns=rename_map)

base_cols = ['game_play','step','nfl_player_id']
cand_cols = ['x','y','speed','acceleration']
avail = [c for c in cand_cols if c in train_trk.columns]
use_cols = base_cols + avail
train_trk = train_trk[use_cols].copy()
test_trk  = test_trk[[c for c in use_cols if c in test_trk.columns]].copy()

def add_player_feats(df):
    df = df.sort_values(['game_play','nfl_player_id','step']).reset_index(drop=True)
    for col in [c for c in ['x','y','speed','acceleration'] if c in df.columns]:
        df[f'd_{col}'] = df.groupby(['game_play','nfl_player_id'])[col].diff().fillna(0.0)
    if 'acceleration' in df.columns:
        df['jerk'] = df.groupby(['game_play','nfl_player_id'])['acceleration'].diff().fillna(0.0)
    for col in [c for c in ['speed','acceleration'] if c in df.columns]:
        df[f'{col}_r3_max'] = df.groupby(['game_play','nfl_player_id'])[col].rolling(3, min_periods=1).max().reset_index(level=[0,1], drop=True)
        df[f'{col}_r5_min'] = df.groupby(['game_play','nfl_player_id'])[col].rolling(5, min_periods=1).min().reset_index(level=[0,1], drop=True)
    if 'speed' in df.columns:
        df['speed_drop_3'] = df['speed'] - df.groupby(['game_play','nfl_player_id'])['speed'].shift(3).fillna(df['speed'])
    return df

train_trk = add_player_feats(train_trk)
test_trk  = add_player_feats(test_trk)

# Build G labels
labels_g = labels[labels['contact_id'].str.endswith('_G')].copy()
parts = labels_g['contact_id'].str.split('_')
labels_g['game_play'] = parts.str[0] + '_' + parts.str[1]
labels_g['step'] = parts.str[2].astype(int)
labels_g['player'] = parts.str[3]
labels_g = labels_g[['game_play','step','player','contact']].copy()

train_trk['player'] = train_trk['nfl_player_id'].astype(str)
g_lab = labels_g.copy()
g_lab_p1 = g_lab.copy(); g_lab_p1['step'] = g_lab_p1['step'] + 1
g_lab_m1 = g_lab.copy(); g_lab_m1['step'] = g_lab_m1['step'] - 1
g_lab_all = pd.concat([g_lab, g_lab_p1, g_lab_m1], ignore_index=True).drop_duplicates(['game_play','step','player'])
g_lab_all['contact'] = 1

train_g = train_trk.merge(g_lab_all, on=['game_play','step','player'], how='left')
train_g['contact'] = train_g['contact'].fillna(0).astype(int)

# 2) Train a fast G-head (1 seed, 5 folds) on GPU
gkf = GroupKFold(n_splits=5)
groups_g = train_g['game_play'].values
g_feat_drop = {'game_play','step','nfl_player_id','player','contact'}
g_feat_cols = [c for c in train_g.columns if c not in g_feat_drop and train_g[c].dtype != 'O']
print('G-head features:', len(g_feat_cols), flush=True)

def train_xgb_fast(X_tr, y_tr, X_va, y_va, seed=42):
    dtr = xgb.DMatrix(X_tr, label=y_tr); dva = xgb.DMatrix(X_va, label=y_va)
    neg = float((y_tr==0).sum()); pos = float((y_tr==1).sum()); spw = max(1.0, neg/max(1.0,pos))
    params = {
        'objective':'binary:logistic', 'eval_metric':'logloss', 'tree_method':'gpu_hist',
        'max_depth':7, 'eta':0.06, 'subsample':0.9, 'colsample_bytree':0.8, 'min_child_weight':10, 'lambda':1.2,
        'scale_pos_weight': spw, 'seed': int(seed)
    }
    bst = xgb.train(params, dtr, num_boost_round=4000, evals=[(dva,'valid')], early_stopping_rounds=200, verbose_eval=False)
    return bst

oof = np.zeros(len(train_g), float); pt = np.zeros(len(test_trk), float)
seed = 42; fi=0
Xt = xgb.DMatrix(test_trk[g_feat_cols].values) if len(test_trk) else None
for tr_idx, va_idx in gkf.split(train_g, train_g['contact'].values, groups_g):
    X_tr = train_g.iloc[tr_idx][g_feat_cols].values; y_tr = train_g.iloc[tr_idx]['contact'].values.astype(np.float32)
    X_va = train_g.iloc[va_idx][g_feat_cols].values; y_va = train_g.iloc[va_idx]['contact'].values.astype(np.float32)
    t1=time.time(); bst = train_xgb_fast(X_tr, y_tr, X_va, y_va, seed=seed)
    oof[va_idx] = bst.predict(xgb.DMatrix(X_va), iteration_range=(0, bst.best_iteration+1))
    if Xt is not None: pt += bst.predict(Xt, iteration_range=(0, bst.best_iteration+1))
    print(f'  G fast fold {fi} best_it={bst.best_iteration} in {time.time()-t1:.1f}s', flush=True)
    del bst; gc.collect(); fi+=1
pt = pt / 5.0

# 3) Post-proc: rolling max w=5, per-fold threshold after hysteresis, median to test
def roll_max_centered_by_group_player(df, prob_col, win=5):
    return df.groupby(['game_play','nfl_player_id'], sort=False)[prob_col].apply(lambda x: x.rolling(win, center=True, min_periods=1).max()).reset_index(level=[0,1], drop=True).values

def hysteresis_2of3_sorted(pred_bin: np.ndarray, df_sorted: pd.DataFrame, group_cols: list) -> np.ndarray:
    assert isinstance(df_sorted.index, pd.RangeIndex)
    arr = pred_bin.astype(np.uint8); gvals = df_sorted[group_cols].to_numpy()
    new_group = np.zeros(len(df_sorted), dtype=bool); new_group[1:] = (gvals[1:] != gvals[:-1]).any(axis=1)
    end_group = np.zeros(len(df_sorted), dtype=bool); end_group[:-1] = new_group[1:]; end_group[-1] = True
    prev = np.r_[arr[0], arr[:-1]]; prev[new_group] = arr[new_group]
    nxt = np.r_[arr[1:], arr[-1]]; nxt[end_group] = arr[end_group]
    return (arr + prev + nxt >= 2).astype(np.uint8)

train_g_meta = train_trk[['game_play','step','nfl_player_id']].copy().reset_index(drop=True)
test_g_meta  = test_trk[['game_play','step','nfl_player_id']].copy().reset_index(drop=True)
train_g_meta['proba'] = oof; test_g_meta['proba'] = pt
train_g_meta = train_g_meta.sort_values(['game_play','nfl_player_id','step']).reset_index(drop=True)
test_g_meta  = test_g_meta.sort_values(['game_play','nfl_player_id','step']).reset_index(drop=True)
train_g_meta['proba_s'] = roll_max_centered_by_group_player(train_g_meta, 'proba', win=5)
test_g_meta['proba_s']  = roll_max_centered_by_group_player(test_g_meta,  'proba', win=5)

# Attach labels for thresholding by folds from PP folds
folds_df = pd.read_csv('folds_game_play.csv')
gp2fold = dict(zip(folds_df['game_play'], folds_df['game_play'].astype(str).map(lambda x: folds_df.loc[folds_df['game_play']==x,'fold'].values[0]) if False else folds_df['fold']))
train_g_meta['fold'] = train_g_meta['game_play'].map(gp2fold).astype(int)
train_g_meta = train_g_meta.merge(train_g[['game_play','step','nfl_player_id','contact']], on=['game_play','step','nfl_player_id'], how='left', validate='one_to_one')
train_g_meta['contact'] = train_g_meta['contact'].fillna(0).astype(int)

thr_grid = np.round(np.arange(0.60, 0.921, 0.02), 3)
best_thr_g = []
for f in sorted(train_g_meta['fold'].unique()):
    df_f = train_g_meta.loc[train_g_meta['fold']==f].sort_values(['game_play','nfl_player_id','step']).reset_index(drop=True)
    y = df_f['contact'].values.astype(int)
    best_m=-1.0; best_t=0.78
    for t in thr_grid:
        pred = (df_f['proba_s'].values >= t).astype(np.uint8)
        pred_h = hysteresis_2of3_sorted(pred, df_f, ['game_play','nfl_player_id'])
        # Simple MCC
        tp = ((y==1)&(pred_h==1)).sum(); tn=((y==0)&(pred_h==0)).sum(); fp=((y==0)&(pred_h==1)).sum(); fn=((y==1)&(pred_h==0)).sum()
        denom = max(1.0, float(((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))**0.5))
        mcc = (tp*tn - fp*fn)/denom
        if mcc > best_m: best_m=mcc; best_t=float(t)
    print(f' G-fast Fold {f} best MCC={best_m:.5f} thr={best_t:.3f}', flush=True)
    best_thr_g.append(best_t)
thr_g_med = float(np.median(best_thr_g))
print(f'G-fast fold-median thr={thr_g_med:.4f}', flush=True)

# Apply to test G and build G rows
tst = test_g_meta.copy(); tst['_idx']=np.arange(len(tst))
tst = tst.sort_values(['game_play','nfl_player_id','step']).reset_index(drop=True)
g_bin_sorted = (tst['proba_s'].values >= thr_g_med).astype(np.uint8)
g_bin_sorted = hysteresis_2of3_sorted(g_bin_sorted, tst, ['game_play','nfl_player_id'])
g_bin = np.zeros(len(test_g_meta), np.uint8); g_bin[tst['_idx'].values]=g_bin_sorted
g_rows = test_g_meta[['game_play','step','nfl_player_id']].copy()
g_rows['contact'] = g_bin.astype(int)
g_rows = g_rows[g_rows['contact']>0].copy()
g_rows['contact_id'] = g_rows['game_play'] + '_' + g_rows['step'].astype(str) + '_' + g_rows['nfl_player_id'].astype(str) + '_G'
sub_g = g_rows[['contact_id','contact']].copy()
g_pos = int(sub_g['contact'].sum())
print('G-fast positives:', g_pos, flush=True)

# 4) OR-combine with existing PP predictions (from current submission.csv non-G rows)
ss = pd.read_csv('sample_submission.csv')
cur = pd.read_csv('submission.csv')
pp_only = cur[~cur['contact_id'].str.endswith('_G')].copy()
pp_only = pp_only.rename(columns={'contact':'pp_contact'})
sub = ss[['contact_id']].merge(pp_only, on='contact_id', how='left')
sub['pp_contact'] = sub['pp_contact'].fillna(0).astype(int)
if len(sub_g):
    sub = sub.merge(sub_g.rename(columns={'contact':'g_contact'}), on='contact_id', how='left')
    sub['g_contact'] = sub['g_contact'].fillna(0).astype(int)
    sub['contact'] = np.maximum(sub['pp_contact'], sub['g_contact']).astype(int)
else:
    sub['contact'] = sub['pp_contact']
sub = sub[['contact_id','contact']]
sub.to_csv('submission.csv', index=False)
print('Saved improved submission.csv (PP OR G-fast). Total time {:.1f}s'.format(time.time()-t0), flush=True)

G-head quick integration start...


G-head features: 14


  G fast fold 0 best_it=535 in 4.3s


  G fast fold 1 best_it=682 in 5.1s


  G fast fold 2 best_it=619 in 4.7s


  G fast fold 3 best_it=509 in 4.1s


  G fast fold 4 best_it=379 in 3.4s


 G-fast Fold 0 best MCC=7637962530.00000 thr=0.660


 G-fast Fold 1 best MCC=9144683812.00000 thr=0.680


 G-fast Fold 2 best MCC=8056316466.00000 thr=0.660


 G-fast Fold 3 best MCC=7984390436.00000 thr=0.640


 G-fast Fold 4 best MCC=9738054776.00000 thr=0.640


G-fast fold-median thr=0.6600


G-fast positives: 38345


Saved improved submission.csv (PP OR G-fast). Total time 33.8s


# Plan: Patch G-head per expert advice (do NOT execute yet)

We received expert guidance that explains the G-head blow-up and how to fix it. Next steps (implementation to be done in Cell 1 in upcoming edits; do not run until PP Cell 0 finishes):

- Replace cap_topk_per_step with a groupby-rank mask that never reorders rows:
  - Rank within (game_play, step) descending by proba; keep_mask = (rank <= k) & (proba >= min_prob); set others to 0.0.
  - Enforce assertion: per-(game_play, step), number of proba>0 must be <= k.

- Post-proc order: Gate -> CAP (k=1–2 on raw proba) -> Threshold -> Hysteresis (2-of-3). Reduce smoothing:
  - If smoothing is used, apply roll-max AFTER cap with a short window (w=3).

- Calibrate/regularize the G model:
  - Clip scale_pos_weight to <= 5 per fold.
  - Stronger regularization: lambda ~2–3, alpha ~0.2–0.5, min_child_weight ~20, max_depth ~6, subsample ~0.8, colsample_bytree ~0.7–0.8, eta ~0.03–0.05; optional max_delta_step=1.
  - Keep eval_metric='logloss'.

- Gate kinematic-first (proximity as feature, not hard gate):
  - Start with conservative gate_extra = (speed <= 0.8) OR (acceleration <= -0.8) OR (speed_drop_3 <= -0.8) AND time_since_snap >= 2. Optionally AND proximity close_opp for precision.

- Labels:
  - Remove ±1 expansion initially; use exact frame. Reintroduce limited expansion only with kinematic confirmation if needed.

- Threshold search and guardrail:
  - Keep per-fold grid AFTER hysteresis; accept only thresholds with <=500 positives (or ~0.3% of fold size).
  - Fallback: choose highest threshold that satisfies the cap; if none, use a very high constant (e.g., 0.98), not 0.94.
  - Optionally initialize from high-quantile (99–99.9th) and sweep a narrow band.

- Diagnostics to add before re-running:
  - Print stagewise counts (raw -> gate -> cap -> smooth -> hysteresis -> threshold) train/test.
  - Quantiles/histograms for proba_s and proba_gc per fold.
  - Assert per-(game_play, step) cap holds (<=k).
  - Verify contact_id format for G: <game_play>_<step>_<nfl_player_id>_G.

Targets:
- Test G candidates (proba_gc>0): ~4–6k.
- Final G positives after hysteresis: ~1.8k–2.2k.

Execution plan:
1) When Cell 0 completes, modify Cell 1 to: implement safe cap, change order (cap before smoothing), tighten gate (kinematic-first), clip spw and add regularization, remove ±1 labels.
2) Add diagnostics prints and assertions to stop runs if caps fail.
3) Execute Cell 1; verify counts and distributions; iterate thresholds.
4) If G still high: raise min_prob (e.g., 0.997), tighten gate; if low: relax min_prob (e.g., 0.99).

We will not execute Cell 1 until these patches are coded. Cell 0 continues to run for a clean PP-only baseline.

In [65]:
# Snapshot current Cell 1 submission: save combined and extract G rows for hedge blend
import pandas as pd
from pathlib import Path

src = Path('submission.csv')
assert src.exists(), 'submission.csv not found; run Cell 1 first.'
sub = pd.read_csv(src)

# Save full combined from Cell 1
sub.to_csv('submission_cell1_pp_r45_plus_g.csv', index=False)
print('Saved snapshot:', 'submission_cell1_pp_r45_plus_g.csv', 'rows=', len(sub))

# Extract G rows only
is_g = sub['contact_id'].str.endswith('_G')
g_rows = sub.loc[is_g, ['contact_id','contact']].copy()
g_rows.to_csv('g_rows_from_cell1.csv', index=False)
print('Saved G rows:', 'g_rows_from_cell1.csv', 'G positives=', int(g_rows['contact'].sum()), 'rows=', len(g_rows))

Saved snapshot: submission_cell1_pp_r45_plus_g.csv rows= 463243
Saved G rows: g_rows_from_cell1.csv G positives= 1507 rows= 40282


In [66]:
# Combine PP-only (from Cell 0 PP-blend file) with saved G rows (from Cell 1 snapshot) and write hedge submission (do NOT overwrite submission.csv)
import pandas as pd
from pathlib import Path

pp_file = Path('submission_ppblend_plus_g.csv')  # use PP-blend file; will strip G rows
g_file = Path('g_rows_from_cell1.csv')
ss_file = Path('sample_submission.csv')
assert pp_file.exists(), 'PP-blend file not found: submission_ppblend_plus_g.csv'
assert g_file.exists(), 'g_rows_from_cell1.csv not found. Re-run Cell 5 snapshot after Cell 1.'

pp_sub_full = pd.read_csv(pp_file)
ss = pd.read_csv(ss_file)

# Strip G rows to get PP-only from the blend
pp_sub = pp_sub_full[~pp_sub_full['contact_id'].str.endswith('_G')].copy()
pp_sub = pp_sub.rename(columns={'contact':'pp_contact'})

g_rows = pd.read_csv(g_file)
g_rows = g_rows.rename(columns={'contact':'g_contact'})

# OR-combine on sample_submission universe
sub = ss[['contact_id']].merge(pp_sub, on='contact_id', how='left')
sub['pp_contact'] = sub['pp_contact'].fillna(0).astype(int)
sub = sub.merge(g_rows, on='contact_id', how='left')
sub['g_contact'] = sub['g_contact'].fillna(0).astype(int)
sub['contact'] = (sub['pp_contact'] | sub['g_contact']).astype(int)
sub = sub[['contact_id','contact']]

combined_pos = int(sub['contact'].sum())
pp_pos = int(pp_sub['pp_contact'].sum())
g_pos = int(g_rows['g_contact'].sum())
print('Hedge components -> PP:', pp_pos, 'G:', g_pos, 'combined:', combined_pos)

# Save hedge output only (do NOT overwrite submission.csv here)
sub.to_csv('submission_hedge_ppblend_plus_freshG.csv', index=False)
print('Saved submission_hedge_ppblend_plus_freshG.csv')

Hedge components -> PP: 6428 G: 1507 combined: 7935


Saved submission_hedge_ppblend_plus_freshG.csv


In [56]:
# Inspect snapshot submission from Cell 1 and, if valid, set it as current submission.csv
import pandas as pd, shutil, os
from pathlib import Path

snap_fp = Path('submission_cell1_pp_r45_plus_g.csv')
assert snap_fp.exists(), 'snapshot file missing: submission_cell1_pp_r45_plus_g.csv'
sub = pd.read_csv(snap_fp)
is_g = sub['contact_id'].str.endswith('_G')
pp_pos = int(sub.loc[~is_g, 'contact'].sum())
g_pos = int(sub.loc[is_g, 'contact'].sum())
combined = int(sub['contact'].sum())
print('Snapshot counts -> PP:', pp_pos, 'G:', g_pos, 'combined:', combined)

# If looks like the strong run (target ~PP 6601, G 1854, combined ~8455), overwrite submission.csv with snapshot
if (pp_pos >= 6400 and pp_pos <= 6800) and (g_pos >= 1700 and g_pos <= 2000) and (combined <= 9000):
    sub.to_csv('submission.csv', index=False)
    print('Set submission.csv from snapshot (Cell 1 PP+G).')
else:
    print('Snapshot not in expected range. Keeping current submission.csv.')

Snapshot counts -> PP: 6571 G: 1250 combined: 7821
Snapshot not in expected range. Keeping current submission.csv.


In [61]:
# Normalize contact_id in snapshot to canonical schema and rebuild submission.csv
import pandas as pd, numpy as np
from pathlib import Path

snap_fp = Path('submission_cell1_pp_r45_plus_g.csv')
ss_fp = Path('sample_submission.csv')
assert snap_fp.exists() and ss_fp.exists(), 'Required files missing.'
snap = pd.read_csv(snap_fp)
ss = pd.read_csv(ss_fp)

def normalize_cid(cid: str) -> str:
    parts = cid.split('_')
    if cid.endswith('_G'):
        gp = parts[0] + '_' + parts[1]
        step = int(parts[2])
        pid = int(parts[3])
        return f'{gp}_{step}_{pid}_G'
    else:
        gp = parts[0] + '_' + parts[1]
        step = int(parts[2])
        p1, p2 = int(parts[3]), int(parts[4])
        if p1 <= p2:
            pmin, pmax = p1, p2
        else:
            pmin, pmax = p2, p1
        return f'{gp}_{step}_{pmin}_{pmax}'

# Normalize IDs and collapse duplicates
snap['contact_id'] = snap['contact_id'].astype(str).apply(normalize_cid)
snap = snap.groupby('contact_id', as_index=False)['contact'].max()

# Align to sample and fill missing as 0
sub = ss[['contact_id']].merge(snap, on='contact_id', how='left')
sub['contact'] = sub['contact'].fillna(0).astype(int)

# Report in-sample counts
is_g = sub['contact_id'].str.endswith('_G')
pp_pos = int(sub.loc[~is_g, 'contact'].sum())
g_pos = int(sub.loc[is_g, 'contact'].sum())
combined = int(sub['contact'].sum())
print('Normalized snapshot counts -> PP:', pp_pos, 'G:', g_pos, 'combined:', combined)

sub.to_csv('submission.csv', index=False)
print('Saved normalized submission.csv')

Normalized snapshot counts -> PP: 6571 G: 1171 combined: 7742


Saved normalized submission.csv


In [72]:
# Set submission.csv to hedge file
import pandas as pd, os
hedge_fp = 'submission_hedge_ppblend_plus_freshG.csv'
sub = pd.read_csv(hedge_fp)
is_g = sub['contact_id'].str.endswith('_G')
pp_pos = int(sub.loc[~is_g, 'contact'].sum()); g_pos = int(sub.loc[is_g, 'contact'].sum()); combined = int(sub['contact'].sum())
print('Hedge counts -> PP:', pp_pos, 'G:', g_pos, 'combined:', combined)
sub.to_csv('submission.csv', index=False)
print('submission.csv overwritten from hedge file')

Hedge counts -> PP: 6428 G: 1507 combined: 7935


submission.csv overwritten from hedge file


In [69]:
# Build a precision-tilted hedge: cap G to k=1 per (game_play, step) on hedge file and write submission.csv
import pandas as pd

hedge_fp = 'submission_hedge_ppblend_plus_freshG.csv'
sub = pd.read_csv(hedge_fp)

# Split PP and G
is_g = sub['contact_id'].str.endswith('_G')
pp = sub.loc[~is_g].copy()
g = sub.loc[is_g].copy()

# Parse game_play and step from G contact_id: <game_play>_<step>_<player>_G
parts = g['contact_id'].str.split('_')
g['game_play'] = parts.str[0] + '_' + parts.str[1]
g['step'] = parts.str[2].astype(int)

# For each (game_play, step), keep at most 1 G positive (k=1), preferring any positive; if multiple, keep first arbitrarily
g_pos = g[g['contact'] == 1].copy()
g_pos = g_pos.sort_values(['game_play','step']).drop_duplicates(['game_play','step'], keep='first')

# Recombine PP with capped G
sub_k1 = pd.concat([pp[['contact_id','contact']], g_pos[['contact_id','contact']]], ignore_index=True)
sub_k1 = sub_k1.groupby('contact_id', as_index=False)['contact'].max()

# Report counts
is_g_k1 = sub_k1['contact_id'].str.endswith('_G')
pp_pos = int(sub_k1.loc[~is_g_k1, 'contact'].sum())
g_pos_cnt = int(sub_k1.loc[is_g_k1, 'contact'].sum())
combined = int(sub_k1['contact'].sum())
print('Precision-hedge counts -> PP:', pp_pos, 'G(k=1):', g_pos_cnt, 'combined:', combined)

# Save as submission.csv
sub_k1.to_csv('submission.csv', index=False)
print('Saved submission.csv (hedge with G k=1 cap)')

Precision-hedge counts -> PP: 6428 G(k=1): 781 combined: 7209


Saved submission.csv (hedge with G k=1 cap)


In [71]:
# Build PP-only submission from blend file (strip G rows) and align to sample_submission
import pandas as pd
ppblend_fp = 'submission_ppblend_plus_g.csv'
ss_fp = 'sample_submission.csv'
ppsub_full = pd.read_csv(ppblend_fp)
ss = pd.read_csv(ss_fp)
# Keep PP rows only and align
pp_only = ppsub_full[~ppsub_full['contact_id'].str.endswith('_G')].copy()
pp_only = pp_only.rename(columns={'contact':'pp_contact'})
sub = ss[['contact_id']].merge(pp_only, on='contact_id', how='left')
sub['contact'] = sub['pp_contact'].fillna(0).astype(int)
sub = sub[['contact_id','contact']]
pp_pos = int(sub['contact'].sum())
print('PP-only submission positives:', pp_pos)
sub.to_csv('submission.csv', index=False)
print('Saved submission.csv (PP-only from blend)')

PP-only submission positives: 6428


Saved submission.csv (PP-only from blend)
