In [1]:
# Quantile LGBM v3: 5-seed bag for q50 delta (OOF + Test); reuse v2 bands for sigma later
import numpy as np, pandas as pd, time, gc
from pathlib import Path
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
import lightgbm as lgb
from lightgbm import LGBMRegressor

# Seed NumPy's global RNG and widen the DataFrame display before any data is touched.
np.random.seed(42)
pd.set_option('display.max_columns', 200)

# Wall-clock start; the final summary line reports the time elapsed since this point.
t0 = time.time()

# Input tables are read from the current working directory.
train, test, ss = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Helpers
def prepare_baseline_table(df):
    """Collapse visit-level rows into one baseline row per patient.

    Rows are ordered by ``Patient`` and ``Weeks`` and reduced with
    ``groupby('Patient').first()``, so each patient is represented by the
    values of their earliest week.
    NOTE(review): ``first()`` is evaluated per column and presumably skips
    nulls, so a missing value in the earliest row could be filled from a
    later visit -- confirm this is acceptable.

    Returns a frame with columns ``Patient``, ``Base_Week``, ``Base_FVC``,
    ``Percent_at_base``, ``Age``, ``Sex`` and ``SmokingStatus``.
    """
    keep = ['Patient', 'Weeks', 'FVC', 'Percent', 'Age', 'Sex', 'SmokingStatus']
    renames = {'Weeks': 'Base_Week', 'FVC': 'Base_FVC', 'Percent': 'Percent_at_base'}
    ordered = df.sort_values(['Patient', 'Weeks'])
    earliest = ordered.groupby('Patient', as_index=False).first()
    return earliest[keep].rename(columns=renames)

def one_hot_fit(df, cols):
    """Learn the category vocabulary for each column in ``cols``.

    Returns a dict mapping each column name to the sorted list of its
    distinct non-null values, rendered as strings.
    """
    vocab = {}
    for name in cols:
        observed = df[name].dropna().astype(str).unique().tolist()
        observed.sort()
        vocab[name] = observed
    return vocab

def one_hot_transform(df, cats):
    """Append int8 indicator columns for a previously fitted vocabulary.

    ``cats`` maps a column name to its list of known values. For every
    (column, value) pair a new column ``'<column>__<value>'`` is added to a
    copy of ``df``; the input frame itself is not modified. Values that are
    absent from the vocabulary simply produce all-zero indicators.
    """
    encoded = df.copy()
    for column, levels in cats.items():
        # Compare on the string form, matching how the vocabulary was built.
        as_text = df[column].astype(str)
        for level in levels:
            encoded[f'{column}__{level}'] = as_text.eq(level).astype(np.int8)
    return encoded

def ecdf_rank_fit(x):
    """Return the reference sample for an empirical CDF: ``x`` as a sorted float array.

    The input is copied, so the caller's data is left untouched.
    """
    reference = np.array(x, dtype=float)
    reference.sort()
    return reference

def ecdf_rank_transform(x, xs):
    """Evaluate the empirical CDF defined by the sorted sample ``xs`` at ``x``.

    Each output is the fraction of reference values that are less than or
    equal to the query. An empty reference sample yields zeros rather than a
    division by zero.
    """
    queries = np.asarray(x, dtype=float)
    n_ref = len(xs)
    counts = np.searchsorted(xs, queries, side='right')
    return counts / (n_ref if n_ref > 0 else 1)

def build_slope_features(base_df, ecdf_basefvc=None, ecdf_percent=None, cats=None, fit=False):
    """Build patient-level features for the slope model from a baseline table.

    Parameters
    ----------
    base_df : frame with ``Base_FVC``, ``Percent_at_base``, ``Age``, ``Sex``
        and ``SmokingStatus`` columns.
    ecdf_basefvc, ecdf_percent : sorted reference samples for the ECDF rank
        features; ignored and re-learned from ``base_df`` when ``fit`` is true.
    cats : one-hot vocabulary for ``Sex`` / ``SmokingStatus``; likewise
        re-learned when ``fit`` is true.
    fit : learn the ECDF references and vocabulary from ``base_df``.

    Returns
    -------
    (features, feature_columns, ecdf_basefvc, ecdf_percent, cats)
    """
    feats = base_df.copy()
    fvc = feats['Base_FVC'].astype(float)
    pct = feats['Percent_at_base'].astype(float)
    # Age is floored at 1 so the ratio features cannot divide by zero.
    age_floor = np.maximum(feats['Age'].astype(float), 1.0)

    feats['log_Base_FVC'] = np.log1p(np.maximum(fvc, 1.0))
    feats['BaseFVC_over_Age'] = fvc / age_floor
    feats['PercentBase_over_Age'] = pct / age_floor

    if fit:
        ecdf_basefvc = ecdf_rank_fit(feats['Base_FVC'].values)
        ecdf_percent = ecdf_rank_fit(feats['Percent_at_base'].values)
        cats = one_hot_fit(feats, ['Sex', 'SmokingStatus'])

    feats['BaseFVC_ecdf'] = ecdf_rank_transform(feats['Base_FVC'].values, ecdf_basefvc)
    feats['Percent_ecdf'] = ecdf_rank_transform(feats['Percent_at_base'].values, ecdf_percent)
    feats = one_hot_transform(feats, cats)

    numeric = [
        'Age', 'Base_FVC', 'log_Base_FVC', 'Percent_at_base',
        'BaseFVC_over_Age', 'PercentBase_over_Age', 'BaseFVC_ecdf', 'Percent_ecdf',
    ]
    indicators = [name for name in feats.columns if name.startswith(('Sex__', 'SmokingStatus__'))]
    return feats, numeric + indicators, ecdf_basefvc, ecdf_percent, cats

def compute_patient_slopes(df, patient_col='Patient', week_col='Weeks', target_col='FVC'):
    """Fit an ordinary least-squares slope of target vs. week for each patient.

    Patients with fewer than two rows are omitted from the result. When all
    of a patient's weeks are identical (zero variance), the slope is 0.0.

    Returns a dict mapping patient id to slope.
    """
    per_patient = {}
    for patient_id, visits in df.groupby(patient_col):
        if len(visits) < 2:
            continue
        weeks = np.asarray(visits[week_col].values, dtype=float)
        target = np.asarray(visits[target_col].values, dtype=float)
        week_dev = weeks - weeks.mean()
        target_dev = target - target.mean()
        sxx = (week_dev ** 2).sum()
        if sxx > 0:
            per_patient[patient_id] = (week_dev * target_dev).sum() / sxx
        else:
            per_patient[patient_id] = 0.0
    return per_patient

def build_q_features(grid_df, base_df, ecdf_bf=None, ecdf_pc=None, cats=None, fit=False):
    """Build row-level features for the quantile model on a (Patient, Weeks) grid.

    ``grid_df`` is left-joined to the per-patient baseline table ``base_df``;
    rows whose week precedes the patient's baseline week (or whose patient has
    no baseline) are dropped. Extra columns in ``grid_df`` are carried through.

    Parameters
    ----------
    grid_df : frame with at least ``Patient`` and ``Weeks``.
    base_df : baseline table with ``Patient``, ``Base_Week``, ``Base_FVC``,
        ``Percent_at_base``, ``Age``, ``Sex`` and ``SmokingStatus``.
    ecdf_bf, ecdf_pc, cats : fitted ECDF references and one-hot vocabulary;
        re-learned from the joined rows when ``fit`` is true.
    fit : learn the transforms from this data instead of using the arguments.

    Returns
    -------
    (features, feature_columns, ecdf_bf, ecdf_pc, cats). ``'s_hat'`` is part
    of ``feature_columns``; if the joined frame does not already carry it, it
    is created as 0.0 here.
    """
    baseline_cols = ['Patient', 'Base_Week', 'Base_FVC', 'Percent_at_base', 'Age', 'Sex', 'SmokingStatus']
    d = grid_df.merge(base_df[baseline_cols], on='Patient', how='left')

    # Weeks since baseline; only rows at or after the baseline are kept.
    d['dist'] = (d['Weeks'] - d['Base_Week']).astype(float)
    d = d.loc[d['dist'] >= 0].copy()
    dist = d['dist']

    # Time-distance transforms: magnitude, log, caps, piecewise segments, powers.
    d['abs_dist'] = dist.abs()
    d['log1p_abs_dist'] = np.log1p(d['abs_dist'])
    d['dist_cap'] = dist.clip(0, 30)
    d['dist_short'] = dist.clip(0, 5)
    d['dist_mid'] = (dist - 5).clip(lower=0, upper=10)
    d['dist_long'] = (dist - 15).clip(lower=0)
    d['dist2'] = dist ** 2
    d['dist3'] = dist ** 3

    # Baseline covariates as floats; Percent is clipped to [30, 120].
    d['Base_FVC'] = d['Base_FVC'].astype(float)
    d['Percent_at_base'] = d['Percent_at_base'].astype(float).clip(30, 120)
    d['Age'] = d['Age'].astype(float)
    d['log_Base_FVC'] = np.log1p(np.maximum(d['Base_FVC'], 1.0))

    # Interactions between baseline covariates and time distance.
    d['Age_x_Percent'] = d['Age'] * d['Percent_at_base']
    d['BaseFVC_x_dist'] = d['Base_FVC'] * dist
    d['dist_x_Age'] = dist * d['Age']
    d['dist_x_Percent'] = dist * d['Percent_at_base']
    for segment in ('short', 'mid', 'long'):
        d[f'BaseFVC_x_d{segment}'] = d['Base_FVC'] * d[f'dist_{segment}']

    if fit:
        ecdf_bf = ecdf_rank_fit(d['Base_FVC'].values)
        ecdf_pc = ecdf_rank_fit(d['Percent_at_base'].values)
        cats = one_hot_fit(d, ['Sex', 'SmokingStatus'])
    d['BaseFVC_ecdf'] = ecdf_rank_transform(d['Base_FVC'].values, ecdf_bf)
    d['Percent_ecdf'] = ecdf_rank_transform(d['Percent_at_base'].values, ecdf_pc)
    d = one_hot_transform(d, cats)

    # Decile bucket of the baseline-FVC rank, plus one indicator per bucket.
    d['BFV_decile'] = np.floor(d['BaseFVC_ecdf'] * 10).clip(0, 9).astype(int)
    for bucket in range(10):
        d[f'BFV_decile__{bucket}'] = d['BFV_decile'].eq(bucket).astype(np.int8)

    static_cols = ['Age', 'Base_FVC', 'log_Base_FVC', 'Percent_at_base', 'BaseFVC_ecdf', 'Percent_ecdf']
    time_cols = ['dist', 'abs_dist', 'log1p_abs_dist', 'dist_cap', 'dist_short', 'dist_mid', 'dist_long', 'dist2', 'dist3']
    cross_cols = ['Age_x_Percent', 'BaseFVC_x_dist', 'dist_x_Age', 'dist_x_Percent',
                  'BaseFVC_x_dshort', 'BaseFVC_x_dmid', 'BaseFVC_x_dlong', 's_hat']
    indicator_cols = [name for name in d.columns
                      if name.startswith(('Sex__', 'SmokingStatus__', 'BFV_decile__'))]
    feat_cols = static_cols + time_cols + cross_cols + indicator_cols

    # Any listed feature not present yet is created as a zero column.
    for missing in [name for name in feat_cols if name not in d.columns]:
        d[missing] = 0.0
    return d, feat_cols, ecdf_bf, ecdf_pc, cats

def fit_s_hat_fold(trn_df, base_trn):
    """Fit the per-fold slope estimator and return a predictor closure.

    Per-patient OLS slopes computed from ``trn_df`` serve as labels; patients
    without a slope label are given 0.0. Baseline features are standardised
    and used to fit a Ridge model and a distance-weighted 9-NN model.

    Returns
    -------
    get_s_hat_map : callable taking a baseline table and returning a dict
        ``{Patient: s_hat}``, where ``s_hat`` is the 0.8/0.2 Ridge/KNN blend
        clipped to the 5th-95th percentile range of the training labels.

    NOTE(review): with ``weights='distance'`` a patient that was part of the
    fit presumably gets its own label back from the KNN term, so s_hat for
    training patients may partly leak their true slope while validation/test
    patients get a genuine prediction -- confirm whether this is intended.
    """
    slope_by_patient = compute_patient_slopes(trn_df)
    labels = pd.DataFrame({
        'Patient': list(slope_by_patient),
        's_label': list(slope_by_patient.values()),
    })
    labelled = base_trn.merge(labels, on='Patient', how='left')
    feats, cols, ref_fvc, ref_pct, levels = build_slope_features(labelled, fit=True)

    design = feats[cols].values.astype(float)
    scaler = StandardScaler(with_mean=True, with_std=True).fit(design)
    design_std = scaler.transform(design)
    target = feats['s_label'].fillna(0.0).values.astype(float)

    ridge_model = Ridge(alpha=1.0, random_state=42).fit(design_std, target)
    knn_model = KNeighborsRegressor(n_neighbors=9, weights='distance').fit(design_std, target)
    lower, upper = np.percentile(target, [5, 95])

    def get_s_hat_map(base_df_patients):
        # Reuse the transforms learned on the training fold; nothing is refit here.
        pred_feats = build_slope_features(base_df_patients, ref_fvc, ref_pct, levels, fit=False)[0]
        z = scaler.transform(pred_feats[cols].values.astype(float))
        blended = 0.8 * ridge_model.predict(z) + 0.2 * knn_model.predict(z)
        return dict(zip(pred_feats['Patient'].values, np.clip(blended, lower, upper)))

    return get_s_hat_map

# Config per expert
seeds = [1337, 2027, 3037, 4242, 5151]
# LightGBM only performs row bagging when `subsample_freq` (bagging_freq) is non-zero;
# with the default of 0 the `subsample=0.75` setting is silently ignored. It is enabled
# here so that each seed in the bag really sees a different 75% row sample.
params = dict(objective='quantile', metric='quantile',
              n_estimators=2600, learning_rate=0.032,
              num_leaves=31, max_depth=6, min_data_in_leaf=24,
              subsample=0.75, subsample_freq=1, colsample_bytree=0.75,
              reg_alpha=0.1, reg_lambda=0.2, n_jobs=-1, verbose=-1)

# OOF frame to collect q50 deltas (one row per training observation; NaN until predicted)
oof_df = train[['Patient','Weeks','FVC']].copy()
oof_df['q50_delta_oof'] = np.nan

# Static TEST grid and index map
grid_te = ss.copy()
# 'Patient_Week' is split on its LAST underscore into the patient id and the integer week.
parts = grid_te['Patient_Week'].str.rsplit('_', n=1, expand=True)
grid_te['Patient'] = parts[0]; grid_te['Weeks'] = parts[1].astype(int)
# TEST rows are used as baselines, renamed to the same schema as the training baseline table.
test_base = test[['Patient','Weeks','FVC','Percent','Age','Sex','SmokingStatus']].rename(
    columns={'Weeks':'Base_Week','FVC':'Base_FVC','Percent':'Percent_at_base'})
# Maps every (Patient, Weeks) key back to its row position in the sample submission.
grid_te_idx = grid_te[['Patient','Weeks']].copy()
grid_te_idx['ss_idx'] = np.arange(grid_te_idx.shape[0], dtype=int)

# Accumulators for TEST q50 delta (aligned to the rows of `ss`)
test_pred_sum = np.zeros(ss.shape[0], dtype=float)

# Patient-grouped folds: all rows of a patient fall into the same fold.
gkf = GroupKFold(n_splits=5)
groups = train['Patient'].values

# Fold-invariant TEST baselines, built once. The baseline table is de-duplicated per patient
# so that neither the s_hat lookup nor the TEST feature merge can fan out rows.
base_cols = ['Patient','Base_Week','Base_FVC','Percent_at_base','Age','Sex','SmokingStatus']
test_base_unique = test_base.drop_duplicates('Patient')
base_test = grid_te[['Patient']].drop_duplicates().merge(test_base_unique, on='Patient', how='left')

# Number of folds that actually contributed TEST predictions. The TEST average divides by this
# count instead of a hard-coded 5, so a skipped fold no longer shrinks the predictions.
folds_done = 0

for fold, (trn_idx, val_idx) in enumerate(gkf.split(train, groups=groups), 1):
    tf = time.time()
    trn_df = train.iloc[trn_idx].copy(); val_df = train.iloc[val_idx].copy()
    base_trn = prepare_baseline_table(trn_df)
    base_val = prepare_baseline_table(val_df)

    # s_hat maps: the slope model is fitted on the TRAIN fold only, then applied to all three sets
    get_s_hat_map = fit_s_hat_fold(trn_df, base_trn)
    s_map_trn = get_s_hat_map(base_trn)
    s_map_val = get_s_hat_map(base_val)
    s_map_test = get_s_hat_map(base_test[base_cols])

    # Features (transforms fitted on the TRAIN fold). FVC is carried through build_q_features
    # alongside the keys, so labels stay row-aligned with features without a second join on
    # (Patient, Weeks) -- such a join multiplies rows when a patient has duplicate week entries.
    # build_q_features itself drops rows that precede the patient's baseline week.
    trn_feat, feat_cols, ecdf_bf, ecdf_pc, cats = build_q_features(trn_df[['Patient','Weeks','FVC']].copy(), base_trn, fit=True)
    trn_feat['s_hat'] = trn_feat['Patient'].map(s_map_trn).astype(float).fillna(0.0)
    val_feat, _, _, _, _ = build_q_features(val_df[['Patient','Weeks','FVC']].copy(), base_val, ecdf_bf, ecdf_pc, cats, fit=False)
    val_feat['s_hat'] = val_feat['Patient'].map(s_map_val).astype(float).fillna(0.0)

    # Targets are deltas relative to the patient's baseline FVC
    y_tr_delta = (trn_feat['FVC'].astype(float).values - trn_feat['Base_FVC'].astype(float).values)
    y_va_delta = (val_feat['FVC'].astype(float).values - val_feat['Base_FVC'].astype(float).values)
    X_tr = trn_feat[feat_cols].values.astype(float)
    X_va = val_feat[feat_cols].values.astype(float)

    if X_tr.shape[0] == 0 or X_va.shape[0] == 0:
        print(f'[v3 Fold {fold}] skipped (X_tr={X_tr.shape[0]}, X_va={X_va.shape[0]})', flush=True)
        del trn_df, val_df, trn_feat, val_feat, X_tr, X_va
        gc.collect()
        continue

    # Build TEST features under TRAIN-fold transforms; align to ss via index map.
    # Grid rows before a patient's baseline week are dropped by build_q_features and therefore
    # keep a delta of 0 in the output.
    te_feat, _, _, _, _ = build_q_features(grid_te[['Patient','Weeks']].copy(), test_base_unique, ecdf_bf, ecdf_pc, cats, fit=False)
    te_feat['s_hat'] = te_feat['Patient'].map(s_map_test).astype(float).fillna(0.0)
    X_te = te_feat[feat_cols].values.astype(float)
    te_keys = te_feat[['Patient','Weeks']].copy().merge(grid_te_idx, on=['Patient','Weeks'], how='left')
    te_idx = te_keys['ss_idx'].values.astype(int)

    # Seed bagging for q50 (alpha=0.5); lr alternates +0.002 / -0.002 by seed index
    val_pred_sum = np.zeros(X_va.shape[0], dtype=float)
    test_pred_sum_fold = np.zeros(ss.shape[0], dtype=float)

    for si, sd in enumerate(seeds):
        lr = params['learning_rate'] + (0.002 if (si % 2 == 0) else -0.002)
        mdl = LGBMRegressor(**{**params, 'alpha': 0.5, 'learning_rate': lr}, random_state=sd)
        # NOTE(review): early stopping monitors the same validation fold that the OOF
        # predictions are taken from, so OOF scores are likely optimistic.
        mdl.fit(X_tr, y_tr_delta,
                eval_set=[(X_va, y_va_delta)],
                eval_metric='quantile',
                callbacks=[lgb.early_stopping(200, verbose=False)])
        val_pred_sum += mdl.predict(X_va, num_iteration=mdl.best_iteration_)
        pred_te = mdl.predict(X_te, num_iteration=mdl.best_iteration_)
        # Scatter into full-sized array using te_idx
        test_pred_sum_fold[te_idx] += pred_te
        del mdl
    # Average over seeds
    val_pred_avg = val_pred_sum / max(len(seeds), 1)
    test_pred_avg_fold = test_pred_sum_fold / max(len(seeds), 1)

    # Write OOF deltas back by keys. Rows sharing a (Patient, Weeks) key have identical
    # features and hence identical predictions, so the block is reduced to one row per key;
    # this keeps the left merge from multiplying rows of oof_df.
    keys = val_feat[['Patient','Weeks']].reset_index(drop=True)
    block = pd.DataFrame({'Patient': keys['Patient'].astype(str), 'Weeks': keys['Weeks'].astype(int), 'q50_delta_oof': val_pred_avg})
    block = block.drop_duplicates(subset=['Patient','Weeks'])
    oof_df = oof_df.merge(block, on=['Patient','Weeks'], how='left', suffixes=('','_new'))
    oof_df['q50_delta_oof'] = oof_df['q50_delta_oof'].fillna(oof_df['q50_delta_oof_new'])
    oof_df.drop(columns=['q50_delta_oof_new'], inplace=True)

    # Accumulate TEST deltas; normalised by the number of completed folds after the loop
    test_pred_sum += test_pred_avg_fold
    folds_done += 1

    print(f'[v3 Fold {fold}] trn={X_tr.shape[0]} val={X_va.shape[0]} elapsed={time.time()-tf:.2f}s', flush=True)
    del trn_df, val_df, trn_feat, val_feat, X_tr, X_va, X_te, te_feat, te_idx, te_keys
    gc.collect()

# Turn the accumulated sum into the average over the folds that actually ran.
if folds_done > 0:
    test_pred_sum /= folds_done

# Persist OOF v3 (q50 only): rows that received a prediction, joined with each patient's
# baseline week and baseline FVC for downstream reconstruction.
train_base = prepare_baseline_table(train)
oof_rows = oof_df.dropna(subset=['q50_delta_oof'])
oof_save = oof_rows.merge(train_base[['Patient','Base_Week','Base_FVC']], on='Patient', how='left')
oof_save.to_csv('oof_quantile_lgbm_v3.csv', index=False)

# Persist TEST q50 deltas in sample-submission row order; reusing v2 q20/q80 later for sigma is allowed
test_out = pd.DataFrame({'Patient_Week': ss['Patient_Week'], 'q50_d': test_pred_sum.astype(float)})
test_out.to_csv('pred_quantile_deltas_v3.csv', index=False)

print(f'Saved oof_quantile_lgbm_v3.csv and pred_quantile_deltas_v3.csv. Total elapsed {time.time()-t0:.1f}s')

[v3 Fold 1] trn=1124 val=284 elapsed=1.42s


[v3 Fold 2] trn=1127 val=281 elapsed=0.86s


[v3 Fold 3] trn=1129 val=279 elapsed=0.92s


[v3 Fold 4] trn=1129 val=279 elapsed=1.21s


[v3 Fold 5] trn=1123 val=285 elapsed=1.02s


Saved oof_quantile_lgbm_v3.csv and pred_quantile_deltas_v3.csv. Total elapsed 5.7s
