In [1]:
# Lean production: build robust submissions S1, S2, S3 from cached OOF/test .npy
import numpy as np, pandas as pd, json, time
from sklearn.metrics import roc_auc_score

id_col = 'request_id'; target_col = 'requester_received_pizza'

def to_logit(p, eps=1e-6):
    p = np.clip(p.astype(np.float64), eps, 1.0 - eps)
    return np.log(p / (1.0 - p))

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

print('Loading train for y and test ids...')
train = pd.read_json('train.json')
test = pd.read_json('test.json')
y = train[target_col].astype(int).values
test_ids = test[id_col].values

print('Loading cached OOF/test predictions...')
o_lr = np.load('oof_lr_pivot.npy')
o_d1 = np.load('oof_xgb_dense.npy')
o_d2 = np.load('oof_xgb_dense_v2.npy')
o_meta = np.load('oof_xgb_meta.npy')
t_lr = np.load('test_lr_pivot.npy')
t_d1 = np.load('test_xgb_dense.npy')
t_d2 = np.load('test_xgb_dense_v2.npy')
t_meta = np.load('test_xgb_meta.npy')

# Optional 5th model: alternate sparse LR view
try:
    o_lr_alt = np.load('oof_lr_alt.npy')
    t_lr_alt = np.load('test_lr_alt.npy')
except Exception:
    o_lr_alt = None; t_lr_alt = None

# Quick OOF diagnostics
def auc_prob(arr):
    return roc_auc_score(y, arr)
def auc_logit_from_probs(*probs, weights=None):
    zs = [to_logit(p) for p in probs]
    if weights is None:
        w = np.ones(len(zs), dtype=np.float64) / len(zs)
    else:
        w = np.array(weights, dtype=np.float64)
    z = np.zeros_like(zs[0], dtype=np.float64)
    for wi, zi in zip(w, zs):
        z += wi * zi
    return roc_auc_score(y, z)

print('Single-model OOF AUCs:')
print({'LR': auc_prob(o_lr), 'Dense1': auc_prob(o_d1), 'Dense2': auc_prob(o_d2), 'Meta': auc_prob(o_meta), 'LR_alt': (auc_prob(o_lr_alt) if o_lr_alt is not None else None)})

# S1: Global 4-way logit reference blend (fixed best weights from main notebook refine)
w_ref = (0.3381, 0.266786, 0.163514, 0.2316)  # (LR, Dense1, Dense2, Meta)
z_ref_oof = w_ref[0]*to_logit(o_lr) + w_ref[1]*to_logit(o_d1) + w_ref[2]*to_logit(o_d2) + w_ref[3]*to_logit(o_meta)
auc_s1 = roc_auc_score(y, z_ref_oof)
print(f'S1 OOF AUC(z): {auc_s1:.5f}')
z_ref_te = w_ref[0]*to_logit(t_lr) + w_ref[1]*to_logit(t_d1) + w_ref[2]*to_logit(t_d2) + w_ref[3]*to_logit(t_meta)
p_s1 = sigmoid(z_ref_te).astype(np.float32)
pd.DataFrame({id_col: test_ids, target_col: p_s1}).to_csv('submission_s1_ref4_logit.csv', index=False)

# S2: Equal-weight probability average over 4 models + shrinkage variant toward mean
p_eq = (t_lr + t_d1 + t_d2 + t_meta) / 4.0
pd.DataFrame({id_col: test_ids, target_col: p_eq.astype(np.float32)}).to_csv('submission_s2_equal_prob.csv', index=False)
# Shrinkage: 0.7*S1_probs + 0.3*mean(models)
p_eq_shrink = (0.7*p_s1 + 0.3*p_eq).astype(np.float32)
pd.DataFrame({id_col: test_ids, target_col: p_eq_shrink}).to_csv('submission_s2_shrink_prob.csv', index=False)

# S3: 5-way logit blend with tiny LR_alt weight (3-6%), others scaled from S1
best_auc_s3 = -1.0; best_cfg_s3 = None; p_s3_best = None
if (o_lr_alt is not None) and (t_lr_alt is not None):
    z_lr, z_d1, z_d2, z_m = to_logit(o_lr), to_logit(o_d1), to_logit(o_d2), to_logit(o_meta)
    tz_lr, tz_d1, tz_d2, tz_m = to_logit(t_lr), to_logit(t_d1), to_logit(t_d2), to_logit(t_meta)
    z_alt, tz_alt = to_logit(o_lr_alt), to_logit(t_lr_alt)
    for w_alt in [0.03, 0.04, 0.05, 0.06]:
        rem = 1.0 - w_alt
        w1 = w_ref[0] * rem
        w2 = w_ref[1] * rem
        w3 = w_ref[2] * rem
        w4 = w_ref[3] * rem
        z_oof = w1*z_lr + w2*z_d1 + w3*z_d2 + w4*z_m + w_alt*z_alt
        auc_here = roc_auc_score(y, z_oof)
        if auc_here > best_auc_s3:
            best_auc_s3 = auc_here
            best_cfg_s3 = (w1, w2, w3, w4, w_alt)
            z_te = w1*tz_lr + w2*tz_d1 + w3*tz_d2 + w4*tz_m + w_alt*tz_alt
            p_s3_best = sigmoid(z_te).astype(np.float32)
    if best_cfg_s3 is not None:
        print(f'S3 5-way best (logit) OOF AUC(z): {best_auc_s3:.5f} | weights(LR,D1,D2,Meta,LR_alt)={best_cfg_s3}')
        pd.DataFrame({id_col: test_ids, target_col: p_s3_best}).to_csv('submission_s3_5way_logit.csv', index=False)
    else:
        print('S3 skipped: LR_alt arrays not found or no config evaluated.')
else:
    print('S3 skipped: Missing LR_alt cached predictions.')

# Default submission.csv: use S1 (ref 4-way logit) as primary per expert advice
pd.DataFrame({id_col: test_ids, target_col: p_s1}).to_csv('submission.csv', index=False)
print('Wrote default submission.csv as S1 (ref 4-way logit). Also wrote S2 equal/shrink and S3 5-way (if available).')

Loading train for y and test ids...
Loading cached OOF/test predictions...
Single-model OOF AUCs:
{'LR': 0.6786023038450223, 'Dense1': 0.6756124134764913, 'Dense2': 0.6749017972318943, 'Meta': 0.666192706969406, 'LR_alt': 0.674147858613878}
S1 OOF AUC(z): 0.69242
S3 5-way best (logit) OOF AUC(z): 0.69237 | weights(LR,D1,D2,Meta,LR_alt)=(0.327957, 0.25878242, 0.15860858, 0.224652, 0.03)
Wrote default submission.csv as S1 (ref 4-way logit). Also wrote S2 equal/shrink and S3 5-way (if available).


In [2]:
# Overwrite submission.csv with S2 equal-weight probability hedge
import pandas as pd
s2 = pd.read_csv('submission_s2_equal_prob.csv')
s2.to_csv('submission.csv', index=False)
print('submission.csv overwritten with S2 equal-weight probability blend. Head:')
print(s2.head())

submission.csv overwritten with S2 equal-weight probability blend. Head:
  request_id  requester_received_pizza
0  t3_1aw5zf                  0.340917
1   t3_roiuw                  0.225581
2   t3_mjnbq                  0.215592
3   t3_t8wd1                  0.208369
4  t3_1m4zxu                  0.214607


In [3]:
# Overwrite submission.csv with S3 5-way logit (tiny LR_alt weight) hedge
import pandas as pd
s3 = pd.read_csv('submission_s3_5way_logit.csv')
s3.to_csv('submission.csv', index=False)
print('submission.csv overwritten with S3 5-way logit (tiny LR_alt) blend. Head:')
print(s3.head())

submission.csv overwritten with S3 5-way logit (tiny LR_alt) blend. Head:
  request_id  requester_received_pizza
0  t3_1aw5zf                  0.330550
1   t3_roiuw                  0.215626
2   t3_mjnbq                  0.211781
3   t3_t8wd1                  0.210184
4  t3_1m4zxu                  0.217150


In [4]:
# Overwrite submission.csv with S2 shrinkage probability hedge
import pandas as pd
s2s = pd.read_csv('submission_s2_shrink_prob.csv')
s2s.to_csv('submission.csv', index=False)
print('submission.csv overwritten with S2 shrinkage probability blend. Head:')
print(s2s.head())

submission.csv overwritten with S2 shrinkage probability blend. Head:
  request_id  requester_received_pizza
0  t3_1aw5zf                  0.334539
1   t3_roiuw                  0.219630
2   t3_mjnbq                  0.214051
3   t3_t8wd1                  0.208835
4  t3_1m4zxu                  0.214938


In [6]:
# Build and write blend-of-blends (logit): mix Ref4, PerFold4(no-zstd), Ref3
import numpy as np, pandas as pd, time
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

id_col = 'request_id'; target_col = 'requester_received_pizza'
train = pd.read_json('train.json')
test = pd.read_json('test.json')
y = train[target_col].astype(int).values
ids = test[id_col].values

def to_logit(p, eps=1e-6):
    p = np.clip(p.astype(np.float64), eps, 1.0 - eps)
    return np.log(p / (1.0 - p))

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

# Load base OOF/test probs
o_lr = np.load('oof_lr_pivot.npy'); t_lr = np.load('test_lr_pivot.npy')
o_d1 = np.load('oof_xgb_dense.npy'); t_d1 = np.load('test_xgb_dense.npy')
o_d2 = np.load('oof_xgb_dense_v2.npy'); t_d2 = np.load('test_xgb_dense_v2.npy')
o_meta = np.load('oof_xgb_meta.npy'); t_meta = np.load('test_xgb_meta.npy')

# Convert to logits
z1, z2, z3, z4 = to_logit(o_lr), to_logit(o_d1), to_logit(o_d2), to_logit(o_meta)
tz1, tz2, tz3, tz4 = to_logit(t_lr), to_logit(t_d1), to_logit(t_d2), to_logit(t_meta)

# Ref4: fixed weights
w_ref4 = (0.3381, 0.266786, 0.163514, 0.2316)  # (LR, D1, D2, Meta)
z_ref4 = w_ref4[0]*z1 + w_ref4[1]*z2 + w_ref4[2]*z3 + w_ref4[3]*z4
tz_ref4 = w_ref4[0]*tz1 + w_ref4[1]*tz2 + w_ref4[2]*tz3 + w_ref4[3]*tz4
print(f'Ref4 OOF AUC(z): {roc_auc_score(y, z_ref4):.5f}')

# Ref3: best 3-way logit (LR, D1, Meta) via coarse grid
best_auc3, best_w3 = -1.0, None
grid = np.arange(0.20, 0.50+1e-12, 0.01)  # search reasonable simplex
t0 = time.time(); tried = 0
for w_lr in grid:
    for w_d1 in grid:
        w_meta = 1.0 - w_lr - w_d1
        if w_meta < 0 or w_meta > 1: continue
        z = w_lr*z1 + w_d1*z2 + w_meta*z4
        auc = roc_auc_score(y, z); tried += 1
        if auc > best_auc3:
            best_auc3, best_w3 = auc, (float(w_lr), float(w_d1), float(w_meta))
print(f'Ref3 OOF AUC(z): {best_auc3:.5f} | best_w3={best_w3} | tried={tried}')
tz_ref3 = best_w3[0]*tz1 + best_w3[1]*tz2 + best_w3[2]*tz4
z_ref3 = best_w3[0]*z1 + best_w3[1]*z2 + best_w3[2]*z4

# PerFold4 (no z-std): for each CV fold, pick best weights on train_idx (coarse grid), apply to val_idx; test logits averaged over fold weights
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
z_pf4 = np.zeros_like(y, dtype=np.float64)
tz_pf4_parts = []
grid_w = np.arange(0.28, 0.40+1e-12, 0.004)  # narrow window around ref4
grid_wd = np.arange(0.38, 0.48+1e-12, 0.004) # total dense weight
alpha_grid = np.arange(0.20, 0.50+1e-12, 0.05) # split D1/D2
for fold, (tr_idx, va_idx) in enumerate(skf.split(train, y), 1):
    best_auc_f, best_w_f = -1.0, None
    y_tr = y[tr_idx]
    for w_lr in grid_w:
        for wd in grid_wd:
            w_meta = 1.0 - w_lr - wd
            if w_meta < 0 or w_meta > 1: continue
            for a in alpha_grid:
                w_d2 = wd * a; w_d1 = wd - w_d2
                z_tr = w_lr*z1[tr_idx] + w_d1*z2[tr_idx] + w_d2*z3[tr_idx] + w_meta*z4[tr_idx]
                auc = roc_auc_score(y_tr, z_tr)
                if auc > best_auc_f:
                    best_auc_f, best_w_f = auc, (float(w_lr), float(w_d1), float(w_d2), float(w_meta))
    w_lr, w_d1, w_d2, w_meta = best_w_f
    z_pf4[va_idx] = w_lr*z1[va_idx] + w_d1*z2[va_idx] + w_d2*z3[va_idx] + w_meta*z4[va_idx]
    tz_pf4_parts.append(w_lr*tz1 + w_d1*tz2 + w_d2*tz3 + w_meta*tz4)
    print(f'PerFold4 Fold {fold} best_w={best_w_f}')
auc_pf4 = roc_auc_score(y, z_pf4)
tz_pf4 = np.mean(tz_pf4_parts, axis=0)
print(f'PerFold4 OOF AUC(z): {auc_pf4:.5f}')

# Blend-of-blends: mix z_ref4, z_pf4, z_ref3 with a narrow grid around (~0.366, 0.432, 0.202)
wr_c, wp_c, w3_c = 0.366, 0.432, 0.202
step = 0.01
wr_grid = np.arange(max(0.2, wr_c-0.06), min(0.6, wr_c+0.06)+1e-12, step)
wp_grid = np.arange(max(0.2, wp_c-0.06), min(0.6, wp_c+0.06)+1e-12, step)
best_auc_mix, best_w_mix = -1.0, None
t1 = time.time(); tried = 0
for wr in wr_grid:
    for wp in wp_grid:
        w3 = 1.0 - wr - wp
        if w3 < 0 or w3 > 1: continue
        z_mix = wr*z_ref4 + wp*z_pf4 + w3*z_ref3
        auc = roc_auc_score(y, z_mix); tried += 1
        if auc > best_auc_mix:
            best_auc_mix, best_w_mix = auc, (float(wr), float(wp), float(w3))
print(f'Blend-of-blends tried {tried} combos | best_w={best_w_mix} OOF AUC(z): {best_auc_mix:.5f} | {time.time()-t1:.1f}s')

# Build test predictions for best weights
wr, wp, w3 = best_w_mix
tz_mix = wr*tz_ref4 + wp*tz_pf4 + w3*tz_ref3
pt = sigmoid(tz_mix).astype(np.float32)
sub = pd.DataFrame({id_col: ids, target_col: pt})
sub.to_csv('submission_blend_of_blends_logit.csv', index=False)
sub.to_csv('submission.csv', index=False)
print('Saved submission.csv (blend-of-blends logit). Head:')
print(sub.head())

Ref4 OOF AUC(z): 0.69242


Ref3 OOF AUC(z): 0.69201 | best_w3=(0.35000000000000014, 0.4200000000000002, 0.2299999999999997) | tried=960


PerFold4 Fold 1 best_w=(0.3440000000000001, 0.28600000000000003, 0.154, 0.21599999999999986)


PerFold4 Fold 2 best_w=(0.3400000000000001, 0.31920000000000004, 0.1368, 0.20399999999999985)


PerFold4 Fold 3 best_w=(0.3400000000000001, 0.23800000000000007, 0.23800000000000002, 0.18399999999999983)


PerFold4 Fold 4 best_w=(0.30000000000000004, 0.23000000000000007, 0.23, 0.23999999999999988)


PerFold4 Fold 5 best_w=(0.32400000000000007, 0.19000000000000003, 0.18999999999999997, 0.29599999999999993)
PerFold4 OOF AUC(z): 0.69131


Blend-of-blends tried 169 combos | best_w=(0.35600000000000004, 0.372, 0.2719999999999999) OOF AUC(z): 0.69198 | 0.3s
Saved submission.csv (blend-of-blends logit). Head:
  request_id  requester_received_pizza
0  t3_1aw5zf                  0.331289
1   t3_roiuw                  0.216208
2   t3_mjnbq                  0.213753
3   t3_t8wd1                  0.208131
4  t3_1m4zxu                  0.214751


In [7]:
# S4: Hedge variant - mix LR and LR_noSub in logit space for LR component, keep ref weights for others
import numpy as np, pandas as pd
from sklearn.metrics import roc_auc_score

def to_logit(p, eps=1e-6):
    p = np.clip(p.astype(np.float64), eps, 1.0 - eps)
    return np.log(p / (1.0 - p))

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

id_col = 'request_id'; target_col = 'requester_received_pizza'
train = pd.read_json('train.json')
test = pd.read_json('test.json')
y = train[target_col].astype(int).values

# Load base preds
o_lr = np.load('oof_lr_pivot.npy'); t_lr = np.load('test_lr_pivot.npy')
o_lr_ns = np.load('oof_lr_nosub.npy'); t_lr_ns = np.load('test_lr_nosub.npy')
o_d1 = np.load('oof_xgb_dense.npy'); t_d1 = np.load('test_xgb_dense.npy')
o_d2 = np.load('oof_xgb_dense_v2.npy'); t_d2 = np.load('test_xgb_dense_v2.npy')
o_meta = np.load('oof_xgb_meta.npy'); t_meta = np.load('test_xgb_meta.npy')

# Convert to logits
z_lr, z_lr_ns = to_logit(o_lr), to_logit(o_lr_ns)
tz_lr, tz_lr_ns = to_logit(t_lr), to_logit(t_lr_ns)
z_d1, z_d2, z_meta = to_logit(o_d1), to_logit(o_d2), to_logit(o_meta)
tz_d1, tz_d2, tz_meta = to_logit(t_d1), to_logit(t_d2), to_logit(t_meta)

# Reference weights for 4-way
w_ref = (0.3381, 0.266786, 0.163514, 0.2316)  # (LR, D1, D2, Meta)

# Small grid for mixing LR with LR_noSub to hedge subreddit drift
best_auc, best_g = -1.0, None
for g in [0.2, 0.3, 0.4]:
    z_lr_mix = (1.0 - g)*z_lr + g*z_lr_ns
    z_oof = w_ref[0]*z_lr_mix + w_ref[1]*z_d1 + w_ref[2]*z_d2 + w_ref[3]*z_meta
    auc = roc_auc_score(y, z_oof)
    print(f'g={g:.2f} | OOF AUC(z): {auc:.5f}')
    if auc > best_auc:
        best_auc, best_g = auc, g
print(f'Chosen g={best_g:.2f} with OOF AUC(z)={best_auc:.5f}')

# Build test with chosen g
tz_lr_mix = (1.0 - best_g)*tz_lr + best_g*tz_lr_ns
zt = w_ref[0]*tz_lr_mix + w_ref[1]*tz_d1 + w_ref[2]*tz_d2 + w_ref[3]*tz_meta
pt = sigmoid(zt).astype(np.float32)
sub = pd.DataFrame({id_col: test[id_col].values, target_col: pt})
sub.to_csv('submission_s4_lr_mix_logit.csv', index=False)
sub.to_csv('submission.csv', index=False)
print('submission.csv overwritten with S4 LR/LR_noSub mix hedge. Head:')
print(sub.head())

g=0.20 | OOF AUC(z): 0.69246
g=0.30 | OOF AUC(z): 0.69246
g=0.40 | OOF AUC(z): 0.69252
Chosen g=0.40 with OOF AUC(z)=0.69252
submission.csv overwritten with S4 LR/LR_noSub mix hedge. Head:
  request_id  requester_received_pizza
0  t3_1aw5zf                  0.333798
1   t3_roiuw                  0.216595
2   t3_mjnbq                  0.208447
3   t3_t8wd1                  0.211159
4  t3_1m4zxu                  0.216228


In [8]:
# S5: 2-way global logit blend: LR + Dense v1, sweep LR weight in {0.55, 0.60, 0.65, 0.70}
import numpy as np, pandas as pd
from sklearn.metrics import roc_auc_score

def to_logit(p, eps=1e-6):
    p = np.clip(p.astype(np.float64), eps, 1.0 - eps)
    return np.log(p / (1.0 - p))

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

id_col = 'request_id'; target_col = 'requester_received_pizza'
train = pd.read_json('train.json')
test = pd.read_json('test.json')
y = train[target_col].astype(int).values

o_lr = np.load('oof_lr_pivot.npy'); t_lr = np.load('test_lr_pivot.npy')
o_d1 = np.load('oof_xgb_dense.npy'); t_d1 = np.load('test_xgb_dense.npy')

z_lr, z_d1 = to_logit(o_lr), to_logit(o_d1)
tz_lr, tz_d1 = to_logit(t_lr), to_logit(t_d1)

best_auc, best_w = -1.0, None
for w_lr in [0.55, 0.60, 0.65, 0.70]:
    w_d1 = 1.0 - w_lr
    z = w_lr*z_lr + w_d1*z_d1
    auc = roc_auc_score(y, z)
    print(f'w_lr={w_lr:.2f}, w_d1={w_d1:.2f} | OOF AUC(z): {auc:.5f}')
    if auc > best_auc:
        best_auc, best_w = auc, (w_lr, w_d1)
print(f'Chosen weights (LR, Dense1)={best_w} with OOF AUC(z)={best_auc:.5f}')

zt = best_w[0]*tz_lr + best_w[1]*tz_d1
pt = sigmoid(zt).astype(np.float32)
sub = pd.DataFrame({id_col: test[id_col].values, target_col: pt})
sub.to_csv('submission_s5_2way_logit.csv', index=False)
sub.to_csv('submission.csv', index=False)
print('submission.csv overwritten with S5 2-way global logit (LR + Dense1). Head:')
print(sub.head())

w_lr=0.55, w_d1=0.45 | OOF AUC(z): 0.68983
w_lr=0.60, w_d1=0.40 | OOF AUC(z): 0.68932
w_lr=0.65, w_d1=0.35 | OOF AUC(z): 0.68852
w_lr=0.70, w_d1=0.30 | OOF AUC(z): 0.68747
Chosen weights (LR, Dense1)=(0.55, 0.44999999999999996) with OOF AUC(z)=0.68983
submission.csv overwritten with S5 2-way global logit (LR + Dense1). Head:
  request_id  requester_received_pizza
0  t3_1aw5zf                  0.313329
1   t3_roiuw                  0.197200
2   t3_mjnbq                  0.208460
3   t3_t8wd1                  0.204065
4  t3_1m4zxu                  0.209776


In [9]:
# S4-highg and S6: cached-only hedges
import numpy as np, pandas as pd
from sklearn.metrics import roc_auc_score

id_col = 'request_id'; target_col = 'requester_received_pizza'
train = pd.read_json('train.json')
test = pd.read_json('test.json')
y = train[target_col].astype(int).values

def to_logit(p, eps=1e-6):
    p = np.clip(p.astype(np.float64), eps, 1.0 - eps)
    return np.log(p / (1.0 - p))

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

# Load base preds
o_lr = np.load('oof_lr_pivot.npy'); t_lr = np.load('test_lr_pivot.npy')
o_lr_ns = np.load('oof_lr_nosub.npy'); t_lr_ns = np.load('test_lr_nosub.npy')
o_d1 = np.load('oof_xgb_dense.npy'); t_d1 = np.load('test_xgb_dense.npy')
o_d2 = np.load('oof_xgb_dense_v2.npy'); t_d2 = np.load('test_xgb_dense_v2.npy')
o_meta = np.load('oof_xgb_meta.npy'); t_meta = np.load('test_xgb_meta.npy')

z_lr, z_lr_ns = to_logit(o_lr), to_logit(o_lr_ns)
tz_lr, tz_lr_ns = to_logit(t_lr), to_logit(t_lr_ns)
z_d1, z_d2, z_meta = to_logit(o_d1), to_logit(o_d2), to_logit(o_meta)
tz_d1, tz_d2, tz_meta = to_logit(t_d1), to_logit(t_d2), to_logit(t_meta)

# Reference 4-way weights
w_ref = (0.3381, 0.266786, 0.163514, 0.2316)  # (LR, D1, D2, Meta)

# S4-highg: 4-way global logit with LR_mix using high g values (0.60, 0.70)
for g in [0.60, 0.70]:
    z_lr_mix = (1.0 - g)*z_lr + g*z_lr_ns
    z_oof = w_ref[0]*z_lr_mix + w_ref[1]*z_d1 + w_ref[2]*z_d2 + w_ref[3]*z_meta
    auc = roc_auc_score(y, z_oof)
    print(f'S4 g={g:.2f} | OOF AUC(z): {auc:.5f}')
    tz_lr_mix = (1.0 - g)*tz_lr + g*tz_lr_ns
    zt = w_ref[0]*tz_lr_mix + w_ref[1]*tz_d1 + w_ref[2]*tz_d2 + w_ref[3]*tz_meta
    pt = sigmoid(zt).astype(np.float32)
    sub = pd.DataFrame({id_col: test[id_col].values, target_col: pt})
    sub.to_csv(f'submission_s4_lr_mix_g{int(g*100)}.csv', index=False)

# S6: Equal-weight probability average across 5 models (add LR_alt), with mild clip [0.02, 0.98]
try:
    t_lr_alt = np.load('test_lr_alt.npy')
    o_lr_alt = np.load('oof_lr_alt.npy')
    # Log simple OOF for reference (prob space AUC)
    try:
        auc_alt = roc_auc_score(y, o_lr_alt)
        print(f'LR_alt OOF(prob) AUC: {auc_alt:.5f}')
    except Exception:
        pass
    p5_te = (t_lr + t_d1 + t_d2 + t_meta + t_lr_alt) / 5.0
    p5_te = np.clip(p5_te, 0.02, 0.98).astype(np.float32)
    pd.DataFrame({id_col: test[id_col].values, target_col: p5_te}).to_csv('submission_s6_equal5_clip.csv', index=False)
    print('Wrote S6: submission_s6_equal5_clip.csv')
except Exception as e:
    print('S6 skipped (missing LR_alt cache):', e)

# Default for this cell: set submission.csv to S4 g=0.70 per expert hedge
s4g70 = pd.read_csv('submission_s4_lr_mix_g70.csv')
s4g70.to_csv('submission.csv', index=False)
print('submission.csv overwritten with S4 g=0.70 (LR_mix high-g) variant. Head:')
print(s4g70.head())

S4 g=0.60 | OOF AUC(z): 0.69233
S4 g=0.70 | OOF AUC(z): 0.69217
LR_alt OOF(prob) AUC: 0.67415
Wrote S6: submission_s6_equal5_clip.csv
submission.csv overwritten with S4 g=0.70 (LR_mix high-g) variant. Head:
  request_id  requester_received_pizza
0  t3_1aw5zf                  0.335296
1   t3_roiuw                  0.216232
2   t3_mjnbq                  0.204796
3   t3_t8wd1                  0.212763
4  t3_1m4zxu                  0.217092


In [10]:
# Overwrite submission.csv with S6: equal-weight 5-model probability blend with clipping
import pandas as pd
s6 = pd.read_csv('submission_s6_equal5_clip.csv')
s6.to_csv('submission.csv', index=False)
print('submission.csv overwritten with S6 equal5 clipped probability blend. Head:')
print(s6.head())

submission.csv overwritten with S6 equal5 clipped probability blend. Head:
  request_id  requester_received_pizza
0  t3_1aw5zf                  0.330999
1   t3_roiuw                  0.214951
2   t3_mjnbq                  0.205373
3   t3_t8wd1                  0.216660
4  t3_1m4zxu                  0.230018


In [12]:
# Time-aware CV retrain for LR (with and without subreddit TF-IDF); cache OOF/test for robust blending
import numpy as np, pandas as pd, time, sys, gc
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from scipy.sparse import hstack

id_col = 'request_id'; target_col = 'requester_received_pizza'
train = pd.read_json('train.json')
test = pd.read_json('test.json')
y = train[target_col].astype(int).values

def build_combined_text(df: pd.DataFrame) -> pd.Series:
    title = df.get('request_title', pd.Series(['']*len(df))).fillna('').astype(str)
    body = df.get('request_text_edit_aware', df.get('request_text', pd.Series(['']*len(df)))).fillna('').astype(str)
    return (title + ' \n ' + body).astype(str)

def build_subreddit_text(df: pd.DataFrame) -> pd.Series:
    if 'requester_subreddits_at_request' not in df.columns:
        return pd.Series(['']*len(df))
    sr = df['requester_subreddits_at_request']
    return sr.apply(lambda x: ' '.join([str(s).lower() for s in x]) if isinstance(x, (list, tuple)) else '')

txt_tr = build_combined_text(train); txt_te = build_combined_text(test)
subs_tr = build_subreddit_text(train); subs_te = build_subreddit_text(test)

# Build time-ordered forward-chaining folds (5 blocks)
assert 'unix_timestamp_of_request' in train.columns, 'Missing timestamp for time-aware CV'
order = np.argsort(train['unix_timestamp_of_request'].values)
n = len(train); k = 5
blocks = np.array_split(order, k)
folds = []
for i in range(1, k):
    va_idx = np.array(blocks[i])
    tr_idx = np.concatenate(blocks[:i])
    folds.append((tr_idx, va_idx))
print(f'Time-CV: {len(folds)} folds (forward chaining) built.')

word_params = dict(analyzer='word', ngram_range=(1,2), lowercase=True, min_df=3, max_features=50000, sublinear_tf=True, smooth_idf=True, norm='l2')
char_params = dict(analyzer='char_wb', ngram_range=(3,5), lowercase=True, min_df=3, max_features=50000, sublinear_tf=True, smooth_idf=True, norm='l2')
subs_params = dict(analyzer='word', ngram_range=(1,2), lowercase=True, min_df=3, max_features=30000, sublinear_tf=True, smooth_idf=True, norm='l2')

def run_lr_time(with_subs: bool, tag: str):
    t0 = time.time()
    oof = np.zeros(n, dtype=np.float32)
    mask = np.zeros(n, dtype=bool)
    test_fold_preds = []
    for fi, (tr_idx, va_idx) in enumerate(folds, 1):
        fstart = time.time()
        tr_text = txt_tr.iloc[tr_idx]; va_text = txt_tr.iloc[va_idx]
        tfidf_w = TfidfVectorizer(**word_params)
        Xw_tr = tfidf_w.fit_transform(tr_text); Xw_va = tfidf_w.transform(va_text); Xw_te = tfidf_w.transform(txt_te)
        tfidf_c = TfidfVectorizer(**char_params)
        Xc_tr = tfidf_c.fit_transform(tr_text); Xc_va = tfidf_c.transform(va_text); Xc_te = tfidf_c.transform(txt_te)
        if with_subs:
            tfidf_s = TfidfVectorizer(**subs_params)
            Xs_tr = tfidf_s.fit_transform(subs_tr.iloc[tr_idx]); Xs_va = tfidf_s.transform(subs_tr.iloc[va_idx]); Xs_te = tfidf_s.transform(subs_te)
            X_tr = hstack([Xw_tr, Xc_tr, Xs_tr], format='csr')
            X_va = hstack([Xw_va, Xc_va, Xs_va], format='csr')
            X_te = hstack([Xw_te, Xc_te, Xs_te], format='csr')
        else:
            X_tr = hstack([Xw_tr, Xc_tr], format='csr')
            X_va = hstack([Xw_va, Xc_va], format='csr')
            X_te = hstack([Xw_te, Xc_te], format='csr')
        clf = LogisticRegression(solver='saga', penalty='l2', C=0.5, max_iter=4000, n_jobs=-1, class_weight=None, random_state=42, verbose=0)
        clf.fit(X_tr, y[tr_idx])
        va_pred = clf.predict_proba(X_va)[:,1].astype(np.float32)
        te_pred = clf.predict_proba(X_te)[:,1].astype(np.float32)
        oof[va_idx] = va_pred
        mask[va_idx] = True
        test_fold_preds.append(te_pred)
        auc = roc_auc_score(y[va_idx], va_pred)
        print(f'[{tag}] Fold {fi}/{len(folds)} AUC: {auc:.5f} | elapsed {time.time()-fstart:.1f}s')
        del tfidf_w, tfidf_c, Xw_tr, Xw_va, Xw_te, Xc_tr, Xc_va, Xc_te, X_tr, X_va, X_te
        if with_subs:
            del tfidf_s, Xs_tr, Xs_va, Xs_te
        gc.collect()
    validated = int(mask.sum())
    auc_oof = roc_auc_score(y[mask], oof[mask]); print(f'[{tag}] OOF AUC (on validated {validated}/{n}): {auc_oof:.5f} | total {time.time()-t0:.1f}s')
    test_pred = np.mean(test_fold_preds, axis=0).astype(np.float32)
    np.save(f'oof_lr_time_{tag}.npy', oof.astype(np.float32))
    np.save(f'test_lr_time_{tag}.npy', test_pred)
    return auc_oof

auc_with = run_lr_time(True, 'withsub')
auc_nosub = run_lr_time(False, 'nosub')
print({'time_lr_withsub': auc_with, 'time_lr_nosub': auc_nosub})

Time-CV: 4 folds (forward chaining) built.


[withsub] Fold 1/4 AUC: 0.67061 | elapsed 3.3s


[withsub] Fold 2/4 AUC: 0.59622 | elapsed 5.1s


[withsub] Fold 3/4 AUC: 0.60989 | elapsed 8.3s


[withsub] Fold 4/4 AUC: 0.58196 | elapsed 10.7s
[withsub] OOF AUC (on validated 2302/2878): 0.61641 | total 27.7s


[nosub] Fold 1/4 AUC: 0.67949 | elapsed 3.9s


[nosub] Fold 2/4 AUC: 0.56102 | elapsed 6.1s


[nosub] Fold 3/4 AUC: 0.58402 | elapsed 8.9s


[nosub] Fold 4/4 AUC: 0.61858 | elapsed 10.8s
[nosub] OOF AUC (on validated 2302/2878): 0.60963 | total 30.1s
{'time_lr_withsub': 0.6164125729343122, 'time_lr_nosub': 0.6096292964384343}


In [13]:
# Overwrite submission.csv with S4 g=0.60 (LR_mix) per expert hedge
import pandas as pd
s4g60 = pd.read_csv('submission_s4_lr_mix_g60.csv')
s4g60.to_csv('submission.csv', index=False)
print('submission.csv overwritten with S4 g=0.60 (LR_mix) variant. Head:')
print(s4g60.head())

submission.csv overwritten with S4 g=0.60 (LR_mix) variant. Head:
  request_id  requester_received_pizza
0  t3_1aw5zf                  0.334796
1   t3_roiuw                  0.216353
2   t3_mjnbq                  0.206008
3   t3_t8wd1                  0.212227
4  t3_1m4zxu                  0.216804


In [14]:
# S7: 4-way global logit with weights shrunk toward equal (15-20%)
import numpy as np, pandas as pd
from sklearn.metrics import roc_auc_score

def to_logit(p, eps=1e-6):
    p = np.clip(p.astype(np.float64), eps, 1.0 - eps)
    return np.log(p / (1.0 - p))

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

id_col = 'request_id'; target_col = 'requester_received_pizza'
train = pd.read_json('train.json')
test = pd.read_json('test.json')
y = train[target_col].astype(int).values

# Load base preds
o_lr = np.load('oof_lr_pivot.npy'); t_lr = np.load('test_lr_pivot.npy')
o_d1 = np.load('oof_xgb_dense.npy'); t_d1 = np.load('test_xgb_dense.npy')
o_d2 = np.load('oof_xgb_dense_v2.npy'); t_d2 = np.load('test_xgb_dense_v2.npy')
o_meta = np.load('oof_xgb_meta.npy'); t_meta = np.load('test_xgb_meta.npy')

z1, z2, z3, z4 = to_logit(o_lr), to_logit(o_d1), to_logit(o_d2), to_logit(o_meta)
tz1, tz2, tz3, tz4 = to_logit(t_lr), to_logit(t_d1), to_logit(t_d2), to_logit(t_meta)

# Reference weights and equal weights
w_ref = np.array([0.3381, 0.266786, 0.163514, 0.2316], dtype=np.float64)
w_eq = np.array([0.25, 0.25, 0.25, 0.25], dtype=np.float64)

best_auc, best_alpha, best_w = -1.0, None, None
for alpha in [0.15, 0.20]:
    w = (1.0 - alpha) * w_ref + alpha * w_eq
    w = w / w.sum()
    z_oof = w[0]*z1 + w[1]*z2 + w[2]*z3 + w[3]*z4
    auc = roc_auc_score(y, z_oof)
    print(f'alpha={alpha:.2f} | shrunk weights={tuple(w)} | OOF AUC(z): {auc:.5f}')
    if auc > best_auc:
        best_auc, best_alpha, best_w = auc, alpha, w
print(f'Chosen alpha={best_alpha:.2f} | weights={tuple(best_w)} | OOF AUC(z): {best_auc:.5f}')

zt = best_w[0]*tz1 + best_w[1]*tz2 + best_w[2]*tz3 + best_w[3]*tz4
pt = sigmoid(zt).astype(np.float32)
sub = pd.DataFrame({id_col: test[id_col].values, target_col: pt})
sub.to_csv('submission_s7_shrunk_logit.csv', index=False)
sub.to_csv('submission.csv', index=False)
print('submission.csv overwritten with S7 shrunk-weight 4-way logit. Head:')
print(sub.head())

alpha=0.15 | shrunk weights=(0.324885, 0.2642681, 0.1764869, 0.23436) | OOF AUC(z): 0.69237
alpha=0.20 | shrunk weights=(0.32048, 0.2634288, 0.1808112, 0.23528) | OOF AUC(z): 0.69237
Chosen alpha=0.15 | weights=(0.324885, 0.2642681, 0.1764869, 0.23436) | OOF AUC(z): 0.69237
submission.csv overwritten with S7 shrunk-weight 4-way logit. Head:
  request_id  requester_received_pizza
0  t3_1aw5zf                  0.333016
1   t3_roiuw                  0.218114
2   t3_mjnbq                  0.213677
3   t3_t8wd1                  0.208816
4  t3_1m4zxu                  0.213635


In [15]:
# S8: Time-CV tuned 4-way global logit with LR_mix (use LR time-CV OOF/test; others from cache); coarse shrink toward equal
import numpy as np, pandas as pd, time
from sklearn.metrics import roc_auc_score

id_col = 'request_id'; target_col = 'requester_received_pizza'
train = pd.read_json('train.json')
test = pd.read_json('test.json')
y = train[target_col].astype(int).values

def to_logit(p, eps=1e-6):
    p = np.clip(p.astype(np.float64), eps, 1.0 - eps)
    return np.log(p / (1.0 - p))

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

# Build time-CV mask (same as Cell 9 forward-chaining on unix_timestamp)
order = np.argsort(train['unix_timestamp_of_request'].values)
n = len(train); k = 5
blocks = np.array_split(order, k)
mask = np.zeros(n, dtype=bool)
for i in range(1, k):
    va_idx = np.array(blocks[i]); mask[va_idx] = True
print(f'Time-CV validated count: {mask.sum()}/{n}')

# Load OOF/test predictions
# LR (time-CV) with/without subreddit
o_lr_time_w = np.load('oof_lr_time_withsub.npy')
o_lr_time_ns = np.load('oof_lr_time_nosub.npy')
t_lr_time_w = np.load('test_lr_time_withsub.npy')
t_lr_time_ns = np.load('test_lr_time_nosub.npy')
# Dense/meta from cached (stratified OOF), use as-is for weight tuning on time mask
o_d1 = np.load('oof_xgb_dense.npy'); t_d1 = np.load('test_xgb_dense.npy')
o_d2 = np.load('oof_xgb_dense_v2.npy'); t_d2 = np.load('test_xgb_dense_v2.npy')
o_meta = np.load('oof_xgb_meta.npy'); t_meta = np.load('test_xgb_meta.npy')

# Convert to logits
z_lr_w, z_lr_ns = to_logit(o_lr_time_w), to_logit(o_lr_time_ns)
tz_lr_w, tz_lr_ns = to_logit(t_lr_time_w), to_logit(t_lr_time_ns)
z_d1, z_d2, z_meta = to_logit(o_d1), to_logit(o_d2), to_logit(o_meta)
tz_d1, tz_d2, tz_meta = to_logit(t_d1), to_logit(t_d2), to_logit(t_meta)

# Reference 4-way weights
w_ref = np.array([0.3381, 0.266786, 0.163514, 0.2316], dtype=np.float64)  # (LR, D1, D2, Meta)
w_eq = np.array([0.25, 0.25, 0.25, 0.25], dtype=np.float64)

best_auc, best_cfg = -1.0, None
for g in [0.50, 0.60, 0.65, 0.70]:
    z_lr_mix = (1.0 - g)*z_lr_w + g*z_lr_ns
    tz_lr_mix = (1.0 - g)*tz_lr_w + g*tz_lr_ns
    for alpha in [0.0, 0.15]:
        w = (1.0 - alpha)*w_ref + alpha*w_eq
        w = w / w.sum()
        # OOF blend (score only on time mask)
        z_oof = w[0]*z_lr_mix + w[1]*z_d1 + w[2]*z_d2 + w[3]*z_meta
        auc = roc_auc_score(y[mask], z_oof[mask])
        print(f'g={g:.2f}, alpha={alpha:.2f} | OOF(z,time-mask) AUC: {auc:.5f} | weights={tuple(w)}')
        if auc > best_auc:
            best_auc = auc; best_cfg = (g, alpha, w, tz_lr_mix)

g_best, alpha_best, w_best, tz_lr_mix_best = best_cfg
print(f'Chosen config: g={g_best:.2f}, alpha={alpha_best:.2f}, weights={tuple(w_best)} | time-mask OOF AUC(z)={best_auc:.5f}')

# Build test prediction with chosen config
zt = w_best[0]*tz_lr_mix_best + w_best[1]*tz_d1 + w_best[2]*tz_d2 + w_best[3]*tz_meta
pt = sigmoid(zt).astype(np.float32)
sub = pd.DataFrame({id_col: test[id_col].values, target_col: pt})
sub.to_csv('submission_s8_timecv_lr_mix.csv', index=False)
sub.to_csv('submission.csv', index=False)
print('submission.csv overwritten with S8 time-CV tuned 4-way (LR_mix) logit. Head:')
print(sub.head())

Time-CV validated count: 2302/2878
g=0.50, alpha=0.00 | OOF(z,time-mask) AUC: 0.68662 | weights=(0.3381, 0.266786, 0.163514, 0.2316)
g=0.50, alpha=0.15 | OOF(z,time-mask) AUC: 0.68700 | weights=(0.324885, 0.2642681, 0.1764869, 0.23436)
g=0.60, alpha=0.00 | OOF(z,time-mask) AUC: 0.68656 | weights=(0.3381, 0.266786, 0.163514, 0.2316)
g=0.60, alpha=0.15 | OOF(z,time-mask) AUC: 0.68698 | weights=(0.324885, 0.2642681, 0.1764869, 0.23436)
g=0.65, alpha=0.00 | OOF(z,time-mask) AUC: 0.68654 | weights=(0.3381, 0.266786, 0.163514, 0.2316)
g=0.65, alpha=0.15 | OOF(z,time-mask) AUC: 0.68693 | weights=(0.324885, 0.2642681, 0.1764869, 0.23436)
g=0.70, alpha=0.00 | OOF(z,time-mask) AUC: 0.68639 | weights=(0.3381, 0.266786, 0.163514, 0.2316)
g=0.70, alpha=0.15 | OOF(z,time-mask) AUC: 0.68683 | weights=(0.324885, 0.2642681, 0.1764869, 0.23436)
Chosen config: g=0.50, alpha=0.15, weights=(0.324885, 0.2642681, 0.1764869, 0.23436) | time-mask OOF AUC(z)=0.68700
submission.csv overwritten with S8 time-CV tu

In [20]:
# S9: Time-aware CV Dense XGB (SVD on TF-IDF word/char/subs + enriched meta incl. hour_sin/cos), 3-seed bag; 6-block CV; per-fold scale_pos_weight
import numpy as np, pandas as pd, time, re, gc, sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import xgboost as xgb

id_col = 'request_id'; target_col = 'requester_received_pizza'
train = pd.read_json('train.json')
test = pd.read_json('test.json')
y = train[target_col].astype(int).values

def get_title(df: pd.DataFrame) -> pd.Series:
    return df.get('request_title', pd.Series(['']*len(df))).fillna('').astype(str)

def get_body(df: pd.DataFrame) -> pd.Series:
    return df.get('request_text_edit_aware', df.get('request_text', pd.Series(['']*len(df)))).fillna('').astype(str)

def combine_text(df: pd.DataFrame) -> pd.Series:
    return (get_title(df) + ' \n ' + get_body(df)).astype(str)

def clean_text_series(s: pd.Series) -> pd.Series:
    s = s.str.lower()
    s = s.str.replace(r'https?://\S+', ' url ', regex=True)
    s = s.str.replace(r'\d+', ' number ', regex=True)
    s = s.str.replace(r'\s+', ' ', regex=True)
    return s

def build_subreddit_text(df: pd.DataFrame) -> pd.Series:
    if 'requester_subreddits_at_request' not in df.columns:
        return pd.Series(['']*len(df))
    sr = df['requester_subreddits_at_request']
    return sr.apply(lambda x: ' '.join([str(s).lower() for s in x]) if isinstance(x, (list, tuple)) else '')

def build_meta(df: pd.DataFrame) -> pd.DataFrame:
    title = get_title(df)
    body = get_body(df)
    out = pd.DataFrame(index=df.index)
    # Text meta
    out['title_len'] = title.str.len().astype(np.float32)
    out['body_len'] = body.str.len().astype(np.float32)
    out['title_body_ratio'] = (out['title_len'] / (1.0 + out['body_len'])).astype(np.float32)
    out['has_url'] = body.str.contains(r'https?://', regex=True).astype(np.float32)
    out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
    # Time meta (from unix timestamp if available) + hour sin/cos
    if 'unix_timestamp_of_request' in df.columns:
        dt = pd.to_datetime(df['unix_timestamp_of_request'], unit='s', utc=True, errors='coerce')
    else:
        dt = pd.to_datetime(0, unit='s', utc=True) + pd.to_timedelta(np.zeros(len(df)), unit='s')
    hour = dt.dt.hour.fillna(0).astype(np.float32)
    out['hour'] = hour
    out['dayofweek'] = dt.dt.dayofweek.fillna(0).astype(np.float32)
    out['is_weekend'] = out['dayofweek'].isin([5,6]).astype(np.float32)
    out['hour_sin'] = np.sin(2*np.pi*hour/24.0).astype(np.float32)
    out['hour_cos'] = np.cos(2*np.pi*hour/24.0).astype(np.float32)
    # Account/karma proxies (if present)
    for c in [
        'requester_upvotes_minus_downvotes_at_request',
        'requester_upvotes_plus_downvotes_at_request',
        'requester_number_of_comments_at_request',
        'requester_number_of_posts_at_request'
    ]:
        if c in df.columns:
            out[c] = pd.to_numeric(df[c], errors='coerce').astype(np.float32)
        else:
            out[c] = 0.0
    # log1p heavy-tailed fields
    for c in ['title_len','body_len','title_body_ratio',
              'requester_upvotes_minus_downvotes_at_request',
              'requester_upvotes_plus_downvotes_at_request',
              'requester_number_of_comments_at_request',
              'requester_number_of_posts_at_request']:
        if c in out.columns:
            out[c] = np.log1p(out[c].clip(lower=0)).astype(np.float32)
    out = out.replace([np.inf,-np.inf], 0).fillna(0).astype(np.float32)
    return out

# Build time-ordered forward-chaining folds (6 blocks -> 5 folds) and validated mask
order = np.argsort(train['unix_timestamp_of_request'].values)
n = len(train); k = 6
blocks = np.array_split(order, k)
folds = []
mask = np.zeros(n, dtype=bool)
for i in range(1, k):
    va_idx = np.array(blocks[i]); tr_idx = np.concatenate(blocks[:i])
    folds.append((tr_idx, va_idx)); mask[va_idx] = True
print(f'Time-CV Dense: {len(folds)} folds; validated {mask.sum()}/{n}')

# Precompute cleaned text and subs for train/test
raw_tr_text = combine_text(train); raw_te_text = combine_text(test)
clean_tr_text = clean_text_series(raw_tr_text); clean_te_text = clean_text_series(raw_te_text)
subs_tr = build_subreddit_text(train); subs_te = build_subreddit_text(test)
meta_te_base = build_meta(test).astype(np.float32).values

# Vectorizer params
word_params = dict(analyzer='word', ngram_range=(1,2), lowercase=True, min_df=3, max_features=60000, sublinear_tf=True, smooth_idf=True, norm='l2')
char_params = dict(analyzer='char_wb', ngram_range=(3,5), lowercase=True, min_df=3, max_features=60000, sublinear_tf=True, smooth_idf=True, norm='l2')
subs_params = dict(analyzer='word', ngram_range=(1,2), lowercase=True, min_df=3, max_features=20000, sublinear_tf=True, smooth_idf=True, norm='l2')

# SVD dims: word=150, char=150, subs=50
svd_w_n, svd_c_n, svd_s_n = 150, 150, 50

# XGB params for drift robustness (eta tweaked); scale_pos_weight set per fold
base_params = dict(
    objective='binary:logistic',
    eval_metric='auc',
    max_depth=3,
    eta=0.035,
    subsample=0.8,
    colsample_bytree=0.6,
    min_child_weight=8,
    reg_alpha=1.0,
    reg_lambda=4.0,
    gamma=0.1,
    device='cuda',
    tree_method='hist'
)
num_boost_round = 6000
early_stopping_rounds = 300
seeds = [42, 1337, 2025]

oof_sum = np.zeros(n, dtype=np.float64)
oof_cnt = np.zeros(n, dtype=np.float64)
test_seed_preds = []

for si, seed in enumerate(seeds, 1):
    print(f'=== Seed {seed} ({si}/{len(seeds)}) ===')
    params = dict(base_params); params['seed'] = seed
    oof_seed = np.zeros(n, dtype=np.float32)
    test_folds = []
    for fi, (tr_idx, va_idx) in enumerate(folds, 1):
        fold_t0 = time.time()
        # Build per-fold meta
        meta_tr = build_meta(train.iloc[tr_idx]).astype(np.float32).values
        meta_va = build_meta(train.iloc[va_idx]).astype(np.float32).values

        # TF-IDF fit on train, transform val/test
        t_tfidf = time.time()
        tfidf_w = TfidfVectorizer(**word_params)
        Xw_tr = tfidf_w.fit_transform(clean_tr_text.iloc[tr_idx]); Xw_va = tfidf_w.transform(clean_tr_text.iloc[va_idx]); Xw_te = tfidf_w.transform(clean_te_text)
        tfidf_c = TfidfVectorizer(**char_params)
        Xc_tr = tfidf_c.fit_transform(clean_tr_text.iloc[tr_idx]); Xc_va = tfidf_c.transform(clean_tr_text.iloc[va_idx]); Xc_te = tfidf_c.transform(clean_te_text)
        tfidf_s = TfidfVectorizer(**subs_params)
        Xs_tr = tfidf_s.fit_transform(subs_tr.iloc[tr_idx]); Xs_va = tfidf_s.transform(subs_tr.iloc[va_idx]); Xs_te = tfidf_s.transform(subs_te)
        print(f'[Seed {seed} Fold {fi}] TF-IDF done in {time.time()-t_tfidf:.1f}s | shapes W:{Xw_tr.shape} C:{Xc_tr.shape} S:{Xs_tr.shape}')

        # SVD per view with seed-aligned random_state
        t_svd = time.time()
        svd_w = TruncatedSVD(n_components=svd_w_n, random_state=seed)
        Zw_tr = svd_w.fit_transform(Xw_tr); Zw_va = svd_w.transform(Xw_va); Zw_te = svd_w.transform(Xw_te)
        svd_c = TruncatedSVD(n_components=svd_c_n, random_state=seed)
        Zc_tr = svd_c.fit_transform(Xc_tr); Zc_va = svd_c.transform(Xc_va); Zc_te = svd_c.transform(Xc_te)
        svd_s = TruncatedSVD(n_components=svd_s_n, random_state=seed)
        Zs_tr = svd_s.fit_transform(Xs_tr); Zs_va = svd_s.transform(Xs_va); Zs_te = svd_s.transform(Xs_te)
        ev_w = float(getattr(svd_w, 'explained_variance_ratio_', np.array([])).sum()) if hasattr(svd_w, 'explained_variance_ratio_') else np.nan
        ev_c = float(getattr(svd_c, 'explained_variance_ratio_', np.array([])).sum()) if hasattr(svd_c, 'explained_variance_ratio_') else np.nan
        ev_s = float(getattr(svd_s, 'explained_variance_ratio_', np.array([])).sum()) if hasattr(svd_s, 'explained_variance_ratio_') else np.nan
        print(f'[Seed {seed} Fold {fi}] SVD done in {time.time()-t_svd:.1f}s | EV sums W:{ev_w:.3f} C:{ev_c:.3f} S:{ev_s:.3f}')

        # Stack and scale
        t_stack = time.time()
        Xtr_dense = np.hstack([Zw_tr, Zc_tr, Zs_tr, meta_tr]).astype(np.float32)
        Xva_dense = np.hstack([Zw_va, Zc_va, Zs_va, meta_va]).astype(np.float32)
        Xte_dense = np.hstack([Zw_te, Zc_te, Zs_te, meta_te_base]).astype(np.float32)
        scaler = StandardScaler(with_mean=True, with_std=True)
        Xtr = scaler.fit_transform(Xtr_dense); Xva = scaler.transform(Xva_dense); Xte = scaler.transform(Xte_dense)
        print(f'[Seed {seed} Fold {fi}] Stack/scale done in {time.time()-t_stack:.1f}s | shapes tr:{Xtr.shape} va:{Xva.shape} te:{Xte.shape}')

        # Train XGBoost with per-fold class balance
        pos = float((y[tr_idx] == 1).sum()); neg = float((y[tr_idx] == 0).sum())
        spw = (neg / max(pos, 1.0)) if pos > 0 else 1.0
        params['scale_pos_weight'] = spw
        t_train = time.time()
        dtrain = xgb.DMatrix(Xtr, label=y[tr_idx])
        dvalid = xgb.DMatrix(Xva, label=y[va_idx])
        dtest  = xgb.DMatrix(Xte)
        booster = xgb.train(params, dtrain, num_boost_round=num_boost_round, evals=[(dvalid, 'valid')], early_stopping_rounds=early_stopping_rounds, verbose_eval=False)
        va_pred = booster.predict(dvalid, iteration_range=(0, booster.best_iteration+1)).astype(np.float32)
        te_pred = booster.predict(dtest, iteration_range=(0, booster.best_iteration+1)).astype(np.float32)
        oof_seed[va_idx] = va_pred
        test_folds.append(te_pred)
        auc = roc_auc_score(y[va_idx], va_pred)
        print(f'[Seed {seed} Fold {fi}] Train {time.time()-t_train:.1f}s | best_iter={booster.best_iteration} | spw={spw:.2f} | AUC: {auc:.5f} | total {time.time()-fold_t0:.1f}s')

        # Cleanup
        del (tfidf_w, tfidf_c, tfidf_s, Xw_tr, Xw_va, Xw_te, Xc_tr, Xc_va, Xc_te, Xs_tr, Xs_va, Xs_te,
             svd_w, svd_c, svd_s, Zw_tr, Zw_va, Zw_te, Zc_tr, Zc_va, Zc_te, Zs_tr, Zs_va, Zs_te,
             Xtr_dense, Xva_dense, Xte_dense, Xtr, Xva, Xte, dtrain, dvalid, dtest, booster)
        gc.collect()

    # Aggregate per-seed
    seed_auc = roc_auc_score(y[mask], oof_seed[mask])
    print(f'[Seed {seed}] OOF AUC (validated only): {seed_auc:.5f}')
    oof_sum[mask] += oof_seed[mask]
    oof_cnt[mask] += 1.0
    test_seed_preds.append(np.mean(test_folds, axis=0).astype(np.float64))
    del oof_seed, test_folds
    gc.collect()

# Final averaged OOF/test
oof_avg = np.zeros(n, dtype=np.float32)
oof_avg[mask] = (oof_sum[mask] / np.maximum(oof_cnt[mask], 1.0)).astype(np.float32)
test_avg = np.mean(test_seed_preds, axis=0).astype(np.float32)

auc_oof = roc_auc_score(y[mask], oof_avg[mask])
print(f'Dense Time-CV OOF AUC (validated only, 3-seed avg): {auc_oof:.5f}')
np.save('oof_xgb_dense_time.npy', oof_avg.astype(np.float32))
np.save('test_xgb_dense_time.npy', test_avg)
print('Saved oof_xgb_dense_time.npy and test_xgb_dense_time.npy')

Time-CV Dense: 5 folds; validated 2398/2878
=== Seed 42 (1/3) ===


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Seed 42 Fold 1] TF-IDF done in 1.7s | shapes W:(480, 3990) C:(480, 13463) S:(480, 511)


[Seed 42 Fold 1] SVD done in 1.7s | EV sums W:0.520 C:0.569 S:0.604
[Seed 42 Fold 1] Stack/scale done in 0.0s | shapes tr:(480, 364) va:(480, 364) te:(1162, 364)


[Seed 42 Fold 1] Train 0.9s | best_iter=41 | spw=1.94 | AUC: 0.69377 | total 4.3s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Seed 42 Fold 2] TF-IDF done in 1.9s | shapes W:(960, 7415) C:(960, 19786) S:(960, 1035)


[Seed 42 Fold 2] SVD done in 3.5s | EV sums W:0.338 C:0.415 S:0.484
[Seed 42 Fold 2] Stack/scale done in 0.0s | shapes tr:(960, 364) va:(480, 364) te:(1162, 364)


[Seed 42 Fold 2] Train 0.8s | best_iter=4 | spw=2.33 | AUC: 0.68735 | total 6.2s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Seed 42 Fold 3] TF-IDF done in 2.2s | shapes W:(1440, 10152) C:(1440, 24292) S:(1440, 1621)


[Seed 42 Fold 3] SVD done in 4.3s | EV sums W:0.273 C:0.357 S:0.412
[Seed 42 Fold 3] Stack/scale done in 0.0s | shapes tr:(1440, 364) va:(480, 364) te:(1162, 364)


[Seed 42 Fold 3] Train 0.9s | best_iter=23 | spw=2.49 | AUC: 0.63294 | total 7.5s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Seed 42 Fold 4] TF-IDF done in 2.5s | shapes W:(1920, 12497) C:(1920, 27408) S:(1920, 2246)


[Seed 42 Fold 4] SVD done in 5.2s | EV sums W:0.238 C:0.329 S:0.357
[Seed 42 Fold 4] Stack/scale done in 0.0s | shapes tr:(1920, 364) va:(479, 364) te:(1162, 364)


[Seed 42 Fold 4] Train 1.3s | best_iter=203 | spw=2.79 | AUC: 0.64135 | total 9.1s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Seed 42 Fold 5] TF-IDF done in 2.9s | shapes W:(2399, 14639) C:(2399, 30209) S:(2399, 3258)


[Seed 42 Fold 5] SVD done in 5.9s | EV sums W:0.216 C:0.311 S:0.302
[Seed 42 Fold 5] Stack/scale done in 0.0s | shapes tr:(2399, 364) va:(479, 364) te:(1162, 364)


[Seed 42 Fold 5] Train 1.6s | best_iter=354 | spw=2.83 | AUC: 0.61672 | total 10.5s
[Seed 42] OOF AUC (validated only): 0.63948


=== Seed 1337 (2/3) ===


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Seed 1337 Fold 1] TF-IDF done in 1.5s | shapes W:(480, 3990) C:(480, 13463) S:(480, 511)


[Seed 1337 Fold 1] SVD done in 1.8s | EV sums W:0.520 C:0.569 S:0.604
[Seed 1337 Fold 1] Stack/scale done in 0.0s | shapes tr:(480, 364) va:(480, 364) te:(1162, 364)


[Seed 1337 Fold 1] Train 0.9s | best_iter=64 | spw=1.94 | AUC: 0.66188 | total 4.2s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Seed 1337 Fold 2] TF-IDF done in 2.0s | shapes W:(960, 7415) C:(960, 19786) S:(960, 1035)


[Seed 1337 Fold 2] SVD done in 3.5s | EV sums W:0.338 C:0.415 S:0.485
[Seed 1337 Fold 2] Stack/scale done in 0.0s | shapes tr:(960, 364) va:(480, 364) te:(1162, 364)


[Seed 1337 Fold 2] Train 0.8s | best_iter=2 | spw=2.33 | AUC: 0.67431 | total 6.4s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Seed 1337 Fold 3] TF-IDF done in 2.3s | shapes W:(1440, 10152) C:(1440, 24292) S:(1440, 1621)


[Seed 1337 Fold 3] SVD done in 4.2s | EV sums W:0.273 C:0.357 S:0.412
[Seed 1337 Fold 3] Stack/scale done in 0.1s | shapes tr:(1440, 364) va:(480, 364) te:(1162, 364)


[Seed 1337 Fold 3] Train 0.8s | best_iter=1 | spw=2.49 | AUC: 0.65287 | total 7.4s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Seed 1337 Fold 4] TF-IDF done in 2.6s | shapes W:(1920, 12497) C:(1920, 27408) S:(1920, 2246)


[Seed 1337 Fold 4] SVD done in 5.1s | EV sums W:0.238 C:0.329 S:0.357
[Seed 1337 Fold 4] Stack/scale done in 0.0s | shapes tr:(1920, 364) va:(479, 364) te:(1162, 364)


[Seed 1337 Fold 4] Train 1.0s | best_iter=85 | spw=2.79 | AUC: 0.61859 | total 8.8s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Seed 1337 Fold 5] TF-IDF done in 2.9s | shapes W:(2399, 14639) C:(2399, 30209) S:(2399, 3258)


[Seed 1337 Fold 5] SVD done in 5.8s | EV sums W:0.216 C:0.311 S:0.302
[Seed 1337 Fold 5] Stack/scale done in 0.0s | shapes tr:(2399, 364) va:(479, 364) te:(1162, 364)


[Seed 1337 Fold 5] Train 2.3s | best_iter=669 | spw=2.83 | AUC: 0.62785 | total 11.1s
[Seed 1337] OOF AUC (validated only): 0.62327


=== Seed 2025 (3/3) ===


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Seed 2025 Fold 1] TF-IDF done in 1.6s | shapes W:(480, 3990) C:(480, 13463) S:(480, 511)


[Seed 2025 Fold 1] SVD done in 1.8s | EV sums W:0.520 C:0.569 S:0.603
[Seed 2025 Fold 1] Stack/scale done in 0.0s | shapes tr:(480, 364) va:(480, 364) te:(1162, 364)


[Seed 2025 Fold 1] Train 0.9s | best_iter=29 | spw=1.94 | AUC: 0.69895 | total 4.4s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Seed 2025 Fold 2] TF-IDF done in 1.9s | shapes W:(960, 7415) C:(960, 19786) S:(960, 1035)


[Seed 2025 Fold 2] SVD done in 3.4s | EV sums W:0.338 C:0.415 S:0.484
[Seed 2025 Fold 2] Stack/scale done in 0.0s | shapes tr:(960, 364) va:(480, 364) te:(1162, 364)


[Seed 2025 Fold 2] Train 0.8s | best_iter=7 | spw=2.33 | AUC: 0.66537 | total 6.2s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Seed 2025 Fold 3] TF-IDF done in 2.2s | shapes W:(1440, 10152) C:(1440, 24292) S:(1440, 1621)


[Seed 2025 Fold 3] SVD done in 4.4s | EV sums W:0.273 C:0.357 S:0.412
[Seed 2025 Fold 3] Stack/scale done in 0.0s | shapes tr:(1440, 364) va:(480, 364) te:(1162, 364)


[Seed 2025 Fold 3] Train 0.9s | best_iter=4 | spw=2.49 | AUC: 0.62353 | total 7.5s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Seed 2025 Fold 4] TF-IDF done in 2.5s | shapes W:(1920, 12497) C:(1920, 27408) S:(1920, 2246)


[Seed 2025 Fold 4] SVD done in 5.1s | EV sums W:0.238 C:0.329 S:0.357
[Seed 2025 Fold 4] Stack/scale done in 0.0s | shapes tr:(1920, 364) va:(479, 364) te:(1162, 364)


[Seed 2025 Fold 4] Train 1.0s | best_iter=70 | spw=2.79 | AUC: 0.63544 | total 8.7s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Seed 2025 Fold 5] TF-IDF done in 2.9s | shapes W:(2399, 14639) C:(2399, 30209) S:(2399, 3258)


[Seed 2025 Fold 5] SVD done in 6.0s | EV sums W:0.216 C:0.311 S:0.302
[Seed 2025 Fold 5] Stack/scale done in 0.0s | shapes tr:(2399, 364) va:(479, 364) te:(1162, 364)


[Seed 2025 Fold 5] Train 1.3s | best_iter=210 | spw=2.83 | AUC: 0.62317 | total 10.3s
[Seed 2025] OOF AUC (validated only): 0.63970


Dense Time-CV OOF AUC (validated only, 3-seed avg): 0.64533
Saved oof_xgb_dense_time.npy and test_xgb_dense_time.npy


In [18]:
# S10: Pure time-CV logit blends: LR_time(with/no sub) + Dense_time; tune on time-mask OOF; write submissions
import numpy as np, pandas as pd, time
from sklearn.metrics import roc_auc_score

id_col = 'request_id'; target_col = 'requester_received_pizza'
train = pd.read_json('train.json')
test = pd.read_json('test.json')
y = train[target_col].astype(int).values
ids = test[id_col].values

def to_logit(p, eps=1e-6):
    p = np.clip(p.astype(np.float64), eps, 1.0 - eps)
    return np.log(p / (1.0 - p))

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

# Time-mask (same 5 blocks -> 4 folds forward chaining)
order = np.argsort(train['unix_timestamp_of_request'].values)
n = len(train); k = 5
blocks = np.array_split(order, k)
mask = np.zeros(n, dtype=bool)
for i in range(1, k):
    mask[np.array(blocks[i])] = True
print(f'Time-CV validated count: {mask.sum()}/{n}')

# Load time-CV base predictions
o_lr_w = np.load('oof_lr_time_withsub.npy'); t_lr_w = np.load('test_lr_time_withsub.npy')
o_lr_ns = np.load('oof_lr_time_nosub.npy'); t_lr_ns = np.load('test_lr_time_nosub.npy')
o_den = np.load('oof_xgb_dense_time.npy'); t_den = np.load('test_xgb_dense_time.npy')

# Convert to logits
z_lr_w, z_lr_ns, z_den = to_logit(o_lr_w), to_logit(o_lr_ns), to_logit(o_den)
tz_lr_w, tz_lr_ns, tz_den = to_logit(t_lr_w), to_logit(t_lr_ns), to_logit(t_den)

# Grid: mix LR_withsub and LR_nosub in logit space with g; then 2-way mix LR_mix and Dense_time with weight w_lr
g_grid = [0.50, 0.60, 0.65, 0.70]
wlr_grid = np.arange(0.55, 0.71, 0.03)  # LR weight in final 2-way logit blend
best_auc, best_cfg = -1.0, None
for g in g_grid:
    z_lr_mix = (1.0 - g)*z_lr_w + g*z_lr_ns
    for w_lr in wlr_grid:
        z_oof = w_lr*z_lr_mix + (1.0 - w_lr)*z_den
        auc = roc_auc_score(y[mask], z_oof[mask])
        if auc > best_auc:
            best_auc = auc; best_cfg = (g, w_lr)
print(f'Best time-CV logit blend: g={best_cfg[0]:.2f}, w_lr={best_cfg[1]:.2f} | OOF(z,time-mask) AUC={best_auc:.5f}')

# Build primary test submission with best cfg
g_best, wlr_best = best_cfg
tz_lr_mix_best = (1.0 - g_best)*tz_lr_w + g_best*tz_lr_ns
zt_best = wlr_best*tz_lr_mix_best + (1.0 - wlr_best)*tz_den
pt_best = sigmoid(zt_best).astype(np.float32)
pd.DataFrame({id_col: ids, target_col: pt_best}).to_csv('submission_time_blend_best.csv', index=False)

# Shrunk-weight variant: shrink 15% toward equal two-way weights (0.5/0.5) to hedge
alpha = 0.15
wlr_shrunk = (1.0 - alpha)*wlr_best + alpha*0.5
zt_shrunk = wlr_shrunk*tz_lr_mix_best + (1.0 - wlr_shrunk)*tz_den
pt_shrunk = sigmoid(zt_shrunk).astype(np.float32)
pd.DataFrame({id_col: ids, target_col: pt_shrunk}).to_csv('submission_time_blend_shrunk.csv', index=False)

# Equal-prob hedge among the three time-CV bases (prob space) + mild clipping
p_eq = np.clip((t_lr_w + t_lr_ns + t_den) / 3.0, 0.01, 0.99).astype(np.float32)
pd.DataFrame({id_col: ids, target_col: p_eq}).to_csv('submission_time_equal_prob.csv', index=False)

# Set default submission.csv to primary time-CV blend
pd.DataFrame({id_col: ids, target_col: pt_best}).to_csv('submission.csv', index=False)
print('Wrote submission_time_blend_best.csv (primary), shrunk and equal-prob hedges; submission.csv set to primary.')

Time-CV validated count: 2302/2878
Best time-CV logit blend: g=0.50, w_lr=0.55 | OOF(z,time-mask) AUC=0.63329
Wrote submission_time_blend_best.csv (primary), shrunk and equal-prob hedges; submission.csv set to primary.


In [19]:
# S11: 6-block forward-chaining LR_time with enriched meta (incl. hour_sin/cos); withsub and nosub; cache OOF/test
import numpy as np, pandas as pd, gc, time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from scipy.sparse import hstack, csr_matrix

id_col = 'request_id'; target_col = 'requester_received_pizza'
train = pd.read_json('train.json')
test = pd.read_json('test.json')
y = train[target_col].astype(int).values

def get_title(df):
    return df.get('request_title', pd.Series(['']*len(df))).fillna('').astype(str)

def get_body(df):
    return df.get('request_text_edit_aware', df.get('request_text', pd.Series(['']*len(df)))).fillna('').astype(str)

def build_text(df):
    return (get_title(df) + ' \n ' + get_body(df)).astype(str)

def build_subs(df):
    if 'requester_subreddits_at_request' not in df.columns:
        return pd.Series(['']*len(df))
    sr = df['requester_subreddits_at_request']
    return sr.apply(lambda x: ' '.join([str(s).lower() for s in x]) if isinstance(x, (list, tuple)) else '')

def build_meta(df):
    title = get_title(df); body = get_body(df)
    out = pd.DataFrame(index=df.index)
    out['title_len'] = title.str.len().astype(np.float32)
    out['body_len'] = body.str.len().astype(np.float32)
    out['title_body_ratio'] = (out['title_len'] / (1.0 + out['body_len'])).astype(np.float32)
    out['has_url'] = body.str.contains(r'https?://', regex=True).astype(np.float32)
    out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
    if 'unix_timestamp_of_request' in df.columns:
        dt = pd.to_datetime(df['unix_timestamp_of_request'], unit='s', utc=True, errors='coerce')
    else:
        dt = pd.to_datetime(0, unit='s', utc=True) + pd.to_timedelta(np.zeros(len(df)), unit='s')
    hour = dt.dt.hour.fillna(0).astype(np.float32)
    out['hour'] = hour
    out['dayofweek'] = dt.dt.dayofweek.fillna(0).astype(np.float32)
    out['is_weekend'] = out['dayofweek'].isin([5,6]).astype(np.float32)
    # hour sin/cos
    out['hour_sin'] = np.sin(2*np.pi*hour/24.0).astype(np.float32)
    out['hour_cos'] = np.cos(2*np.pi*hour/24.0).astype(np.float32)
    for c in [
        'requester_upvotes_minus_downvotes_at_request',
        'requester_upvotes_plus_downvotes_at_request',
        'requester_number_of_comments_at_request',
        'requester_number_of_posts_at_request'
    ]:
        if c in df.columns:
            out[c] = pd.to_numeric(df[c], errors='coerce').astype(np.float32)
        else:
            out[c] = 0.0
    for c in ['title_len','body_len','title_body_ratio',
              'requester_upvotes_minus_downvotes_at_request',
              'requester_upvotes_plus_downvotes_at_request',
              'requester_number_of_comments_at_request',
              'requester_number_of_posts_at_request']:
        if c in out.columns:
            out[c] = np.log1p(out[c].clip(lower=0)).astype(np.float32)
    out = out.replace([np.inf, -np.inf], 0).fillna(0).astype(np.float32)
    return out

# 6-block forward-chaining folds (→ 5 folds); score only validated indices
order = np.argsort(train['unix_timestamp_of_request'].values)
n = len(train); k = 6
blocks = np.array_split(order, k)
folds = []; mask = np.zeros(n, dtype=bool)
for i in range(1, k):
    va_idx = np.array(blocks[i]); tr_idx = np.concatenate(blocks[:i])
    folds.append((tr_idx, va_idx)); mask[va_idx] = True
print(f'Time-CV LR (with meta): {len(folds)} folds; validated {mask.sum()}/{n}')

txt_tr = build_text(train); txt_te = build_text(test)
subs_tr = build_subs(train); subs_te = build_subs(test)
meta_te = build_meta(test).astype(np.float32).values

word_params = dict(analyzer='word', ngram_range=(1,2), lowercase=True, min_df=3, max_features=60000, sublinear_tf=True, smooth_idf=True, norm='l2')
char_params = dict(analyzer='char_wb', ngram_range=(3,5), lowercase=True, min_df=3, max_features=60000, sublinear_tf=True, smooth_idf=True, norm='l2')
subs_params = dict(analyzer='word', ngram_range=(1,2), lowercase=True, min_df=3, max_features=20000, sublinear_tf=True, smooth_idf=True, norm='l2')

def run_lr_time_meta(with_subs: bool, tag: str):
    t0 = time.time()
    oof = np.zeros(n, dtype=np.float32)
    test_fold_preds = []
    for fi, (tr_idx, va_idx) in enumerate(folds, 1):
        f0 = time.time()
        # Text TF-IDF per fold
        tfidf_w = TfidfVectorizer(**word_params)
        Xw_tr = tfidf_w.fit_transform(txt_tr.iloc[tr_idx]); Xw_va = tfidf_w.transform(txt_tr.iloc[va_idx]); Xw_te = tfidf_w.transform(txt_te)
        tfidf_c = TfidfVectorizer(**char_params)
        Xc_tr = tfidf_c.fit_transform(txt_tr.iloc[tr_idx]); Xc_va = tfidf_c.transform(txt_tr.iloc[va_idx]); Xc_te = tfidf_c.transform(txt_te)
        if with_subs:
            tfidf_s = TfidfVectorizer(**subs_params)
            Xs_tr = tfidf_s.fit_transform(subs_tr.iloc[tr_idx]); Xs_va = tfidf_s.transform(subs_tr.iloc[va_idx]); Xs_te = tfidf_s.transform(subs_te)
        # Meta per fold + scale
        meta_tr = build_meta(train.iloc[tr_idx]).astype(np.float32).values
        meta_va = build_meta(train.iloc[va_idx]).astype(np.float32).values
        scaler = StandardScaler(with_mean=True, with_std=True)
        meta_tr_s = scaler.fit_transform(meta_tr).astype(np.float32)
        meta_va_s = scaler.transform(meta_va).astype(np.float32)
        meta_te_s = scaler.transform(meta_te).astype(np.float32)
        # Stack text + meta
        if with_subs:
            X_tr = hstack([Xw_tr, Xc_tr, Xs_tr, csr_matrix(meta_tr_s)], format='csr')
            X_va = hstack([Xw_va, Xc_va, Xs_va, csr_matrix(meta_va_s)], format='csr')
            X_te = hstack([Xw_te, Xc_te, Xs_te, csr_matrix(meta_te_s)], format='csr')
        else:
            X_tr = hstack([Xw_tr, Xc_tr, csr_matrix(meta_tr_s)], format='csr')
            X_va = hstack([Xw_va, Xc_va, csr_matrix(meta_va_s)], format='csr')
            X_te = hstack([Xw_te, Xc_te, csr_matrix(meta_te_s)], format='csr')
        # Train LR
        clf = LogisticRegression(solver='saga', penalty='l2', C=0.8, max_iter=4000, n_jobs=-1, random_state=42)
        clf.fit(X_tr, y[tr_idx])
        va_pred = clf.predict_proba(X_va)[:,1].astype(np.float32)
        te_pred = clf.predict_proba(X_te)[:,1].astype(np.float32)
        oof[va_idx] = va_pred
        test_fold_preds.append(te_pred)
        auc = roc_auc_score(y[va_idx], va_pred)
        print(f'[{tag}] Fold {fi}/{len(folds)} AUC: {auc:.5f} | elapsed {time.time()-f0:.1f}s')
        # cleanup
        del tfidf_w, tfidf_c, Xw_tr, Xw_va, Xw_te, Xc_tr, Xc_va, Xc_te, scaler, meta_tr, meta_va, meta_tr_s, meta_va_s, meta_te_s, X_tr, X_va, X_te, clf
        if with_subs:
            del tfidf_s, Xs_tr, Xs_va, Xs_te
        gc.collect()
    auc_oof = roc_auc_score(y[mask], oof[mask])
    print(f'[{tag}] OOF AUC (validated only): {auc_oof:.5f} | total {time.time()-t0:.1f}s')
    test_pred = np.mean(test_fold_preds, axis=0).astype(np.float32)
    np.save(f'oof_lr_time_{tag}_meta.npy', oof.astype(np.float32))
    np.save(f'test_lr_time_{tag}_meta.npy', test_pred)
    return auc_oof

auc_with_meta = run_lr_time_meta(True, 'withsub')
auc_nosub_meta = run_lr_time_meta(False, 'nosub')
print({'time_lr_withsub_meta': auc_with_meta, 'time_lr_nosub_meta': auc_nosub_meta})

Time-CV LR (with meta): 5 folds; validated 2398/2878


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[withsub] Fold 1/5 AUC: 0.72358 | elapsed 4.5s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[withsub] Fold 2/5 AUC: 0.65111 | elapsed 12.9s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[withsub] Fold 3/5 AUC: 0.63332 | elapsed 25.7s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[withsub] Fold 4/5 AUC: 0.60852 | elapsed 31.8s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[withsub] Fold 5/5 AUC: 0.61538 | elapsed 42.1s
[withsub] OOF AUC (validated only): 0.64640 | total 117.7s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[nosub] Fold 1/5 AUC: 0.73879 | elapsed 6.8s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[nosub] Fold 2/5 AUC: 0.66851 | elapsed 18.4s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[nosub] Fold 3/5 AUC: 0.63060 | elapsed 25.1s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[nosub] Fold 4/5 AUC: 0.62357 | elapsed 37.9s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[nosub] Fold 5/5 AUC: 0.64427 | elapsed 44.9s
[nosub] OOF AUC (validated only): 0.66058 | total 133.7s
{'time_lr_withsub_meta': 0.6464044860018527, 'time_lr_nosub_meta': 0.660576334259739}


In [21]:
# S12: Dense XGB v2 (no-subreddit view) under 6-block time-CV with enriched meta (incl. hour_sin/cos), per-fold scale_pos_weight; 3-seed bag
import numpy as np, pandas as pd, time, gc
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import xgboost as xgb

id_col = 'request_id'; target_col = 'requester_received_pizza'
train = pd.read_json('train.json')
test = pd.read_json('test.json')
y = train[target_col].astype(int).values

def get_title(df):
    return df.get('request_title', pd.Series(['']*len(df))).fillna('').astype(str)
def get_body(df):
    return df.get('request_text_edit_aware', df.get('request_text', pd.Series(['']*len(df)))).fillna('').astype(str)
def combine_text(df):
    return (get_title(df) + ' \n ' + get_body(df)).astype(str)
def clean_text_series(s):
    s = s.str.lower()
    s = s.str.replace(r'https?://\S+', ' url ', regex=True)
    s = s.str.replace(r'\d+', ' number ', regex=True)
    s = s.str.replace(r'\s+', ' ', regex=True)
    return s
def build_meta(df):
    title = get_title(df); body = get_body(df)
    out = pd.DataFrame(index=df.index)
    out['title_len'] = title.str.len().astype(np.float32)
    out['body_len'] = body.str.len().astype(np.float32)
    out['title_body_ratio'] = (out['title_len'] / (1.0 + out['body_len'])).astype(np.float32)
    out['has_url'] = body.str.contains(r'https?://', regex=True).astype(np.float32)
    out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
    if 'unix_timestamp_of_request' in df.columns:
        dt = pd.to_datetime(df['unix_timestamp_of_request'], unit='s', utc=True, errors='coerce')
    else:
        dt = pd.to_datetime(0, unit='s', utc=True) + pd.to_timedelta(np.zeros(len(df)), unit='s')
    hour = dt.dt.hour.fillna(0).astype(np.float32)
    out['hour'] = hour
    out['dayofweek'] = dt.dt.dayofweek.fillna(0).astype(np.float32)
    out['is_weekend'] = out['dayofweek'].isin([5,6]).astype(np.float32)
    out['hour_sin'] = np.sin(2*np.pi*hour/24.0).astype(np.float32)
    out['hour_cos'] = np.cos(2*np.pi*hour/24.0).astype(np.float32)
    for c in [
        'requester_upvotes_minus_downvotes_at_request',
        'requester_upvotes_plus_downvotes_at_request',
        'requester_number_of_comments_at_request',
        'requester_number_of_posts_at_request'
    ]:
        if c in df.columns:
            out[c] = pd.to_numeric(df[c], errors='coerce').astype(np.float32)
        else:
            out[c] = 0.0
    for c in ['title_len','body_len','title_body_ratio',
              'requester_upvotes_minus_downvotes_at_request',
              'requester_upvotes_plus_downvotes_at_request',
              'requester_number_of_comments_at_request',
              'requester_number_of_posts_at_request']:
        if c in out.columns:
            out[c] = np.log1p(out[c].clip(lower=0)).astype(np.float32)
    out = out.replace([np.inf,-np.inf], 0).fillna(0).astype(np.float32)
    return out

# 6-block forward chaining (5 folds) and mask
order = np.argsort(train['unix_timestamp_of_request'].values)
n = len(train); k = 6
blocks = np.array_split(order, k)
folds = []; mask = np.zeros(n, dtype=bool)
for i in range(1, k):
    va_idx = np.array(blocks[i]); tr_idx = np.concatenate(blocks[:i])
    folds.append((tr_idx, va_idx)); mask[va_idx] = True
print(f'Dense v2 Time-CV: {len(folds)} folds; validated {mask.sum()}/{n}')

# Precompute text/meta
raw_tr_text = combine_text(train); raw_te_text = combine_text(test)
clean_tr_text = clean_text_series(raw_tr_text); clean_te_text = clean_text_series(raw_te_text)
meta_te_base = build_meta(test).astype(np.float32).values

# Vectorizers (no subreddit view here)
word_params = dict(analyzer='word', ngram_range=(1,2), lowercase=True, min_df=3, max_features=80000, sublinear_tf=True, smooth_idf=True, norm='l2')
char_params = dict(analyzer='char_wb', ngram_range=(3,5), lowercase=True, min_df=3, max_features=60000, sublinear_tf=True, smooth_idf=True, norm='l2')

# SVD dims for v2: word=250, char=120
svd_w_n, svd_c_n = 250, 120

# XGB params for v2 (stronger reg, no subs view); per-fold scale_pos_weight
base_params = dict(
    objective='binary:logistic',
    eval_metric='auc',
    max_depth=3,
    eta=0.03,
    min_child_weight=10,
    subsample=0.75,
    colsample_bytree=0.7,
    reg_alpha=1.5,
    reg_lambda=5.0,
    gamma=0.2,
    device='cuda',
    tree_method='hist'
)
num_boost_round = 6000
early_stopping_rounds = 300
seeds = [42, 1337, 2025]

oof_sum = np.zeros(n, dtype=np.float64)
oof_cnt = np.zeros(n, dtype=np.float64)
test_seed_preds = []

for si, seed in enumerate(seeds, 1):
    print(f'=== Dense v2 Seed {seed} ({si}/{len(seeds)}) ===')
    params = dict(base_params); params['seed'] = seed
    oof_seed = np.zeros(n, dtype=np.float32)
    test_folds = []
    for fi, (tr_idx, va_idx) in enumerate(folds, 1):
        t0 = time.time()
        meta_tr = build_meta(train.iloc[tr_idx]).astype(np.float32).values
        meta_va = build_meta(train.iloc[va_idx]).astype(np.float32).values

        # TF-IDF
        t_tfidf = time.time()
        tfidf_w = TfidfVectorizer(**word_params)
        Xw_tr = tfidf_w.fit_transform(clean_tr_text.iloc[tr_idx]); Xw_va = tfidf_w.transform(clean_tr_text.iloc[va_idx]); Xw_te = tfidf_w.transform(clean_te_text)
        tfidf_c = TfidfVectorizer(**char_params)
        Xc_tr = tfidf_c.fit_transform(clean_tr_text.iloc[tr_idx]); Xc_va = tfidf_c.transform(clean_tr_text.iloc[va_idx]); Xc_te = tfidf_c.transform(clean_te_text)
        print(f'[v2 Seed {seed} Fold {fi}] TF-IDF {time.time()-t_tfidf:.1f}s | W:{Xw_tr.shape} C:{Xc_tr.shape}')

        # SVD
        t_svd = time.time()
        svd_w = TruncatedSVD(n_components=svd_w_n, random_state=seed)
        Zw_tr = svd_w.fit_transform(Xw_tr); Zw_va = svd_w.transform(Xw_va); Zw_te = svd_w.transform(Xw_te)
        svd_c = TruncatedSVD(n_components=svd_c_n, random_state=seed)
        Zc_tr = svd_c.fit_transform(Xc_tr); Zc_va = svd_c.transform(Xc_va); Zc_te = svd_c.transform(Xc_te)
        ev_w = float(getattr(svd_w, 'explained_variance_ratio_', np.array([])).sum()) if hasattr(svd_w, 'explained_variance_ratio_') else np.nan
        ev_c = float(getattr(svd_c, 'explained_variance_ratio_', np.array([])).sum()) if hasattr(svd_c, 'explained_variance_ratio_') else np.nan
        print(f'[v2 Seed {seed} Fold {fi}] SVD {time.time()-t_svd:.1f}s | EV W:{ev_w:.3f} C:{ev_c:.3f}')

        # Stack + scale
        t_stack = time.time()
        Xtr_dense = np.hstack([Zw_tr, Zc_tr, meta_tr]).astype(np.float32)
        Xva_dense = np.hstack([Zw_va, Zc_va, meta_va]).astype(np.float32)
        Xte_dense = np.hstack([Zw_te, Zc_te, meta_te_base]).astype(np.float32)
        scaler = StandardScaler(with_mean=True, with_std=True)
        Xtr = scaler.fit_transform(Xtr_dense); Xva = scaler.transform(Xva_dense); Xte = scaler.transform(Xte_dense)
        print(f'[v2 Seed {seed} Fold {fi}] Stack/scale {time.time()-t_stack:.1f}s | tr:{Xtr.shape} va:{Xva.shape} te:{Xte.shape}')

        # Train XGB with per-fold class balance
        pos = float((y[tr_idx] == 1).sum()); neg = float((y[tr_idx] == 0).sum())
        params['scale_pos_weight'] = (neg / max(pos, 1.0)) if pos > 0 else 1.0
        t_train = time.time()
        dtrain = xgb.DMatrix(Xtr, label=y[tr_idx]); dvalid = xgb.DMatrix(Xva, label=y[va_idx]); dtest = xgb.DMatrix(Xte)
        booster = xgb.train(params, dtrain, num_boost_round=num_boost_round, evals=[(dvalid, 'valid')], early_stopping_rounds=early_stopping_rounds, verbose_eval=False)
        va_pred = booster.predict(dvalid, iteration_range=(0, booster.best_iteration+1)).astype(np.float32)
        te_pred = booster.predict(dtest, iteration_range=(0, booster.best_iteration+1)).astype(np.float32)
        oof_seed[va_idx] = va_pred; test_folds.append(te_pred)
        auc = roc_auc_score(y[va_idx], va_pred)
        print(f'[v2 Seed {seed} Fold {fi}] Train {time.time()-t_train:.1f}s | best_iter={booster.best_iteration} | AUC: {auc:.5f} | total {time.time()-t0:.1f}s')

        del (tfidf_w, tfidf_c, Xw_tr, Xw_va, Xw_te, Xc_tr, Xc_va, Xc_te, svd_w, svd_c, Zw_tr, Zw_va, Zw_te, Zc_tr, Zc_va, Zc_te,
             Xtr_dense, Xva_dense, Xte_dense, Xtr, Xva, Xte, dtrain, dvalid, dtest, booster)
        gc.collect()

    seed_auc = roc_auc_score(y[mask], oof_seed[mask])
    print(f'[Dense v2 Seed {seed}] OOF AUC (validated only): {seed_auc:.5f}')
    oof_sum[mask] += oof_seed[mask]; oof_cnt[mask] += 1.0
    test_seed_preds.append(np.mean(test_folds, axis=0).astype(np.float64))
    del oof_seed, test_folds; gc.collect()

oof_avg = np.zeros(n, dtype=np.float32)
oof_avg[mask] = (oof_sum[mask] / np.maximum(oof_cnt[mask], 1.0)).astype(np.float32)
test_avg = np.mean(test_seed_preds, axis=0).astype(np.float32)
auc_oof = roc_auc_score(y[mask], oof_avg[mask])
print(f'Dense v2 Time-CV OOF AUC (validated only, 3-seed avg): {auc_oof:.5f}')
np.save('oof_xgb_dense_time_v2.npy', oof_avg.astype(np.float32))
np.save('test_xgb_dense_time_v2.npy', test_avg)
print('Saved oof_xgb_dense_time_v2.npy and test_xgb_dense_time_v2.npy')

Dense v2 Time-CV: 5 folds; validated 2398/2878
=== Dense v2 Seed 42 (1/3) ===


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[v2 Seed 42 Fold 1] TF-IDF 1.5s | W:(480, 3990) C:(480, 13463)


[v2 Seed 42 Fold 1] SVD 1.6s | EV W:0.723 C:0.493
[v2 Seed 42 Fold 1] Stack/scale 0.0s | tr:(480, 384) va:(480, 384) te:(1162, 384)


[v2 Seed 42 Fold 1] Train 0.8s | best_iter=4 | AUC: 0.69459 | total 3.9s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[v2 Seed 42 Fold 2] TF-IDF 1.8s | W:(960, 7415) C:(960, 19786)


[v2 Seed 42 Fold 2] SVD 2.7s | EV W:0.482 C:0.359
[v2 Seed 42 Fold 2] Stack/scale 0.0s | tr:(960, 384) va:(480, 384) te:(1162, 384)


[v2 Seed 42 Fold 2] Train 0.8s | best_iter=4 | AUC: 0.67715 | total 5.3s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[v2 Seed 42 Fold 3] TF-IDF 2.1s | W:(1440, 10152) C:(1440, 24292)


[v2 Seed 42 Fold 3] SVD 4.0s | EV W:0.388 C:0.309
[v2 Seed 42 Fold 3] Stack/scale 0.0s | tr:(1440, 384) va:(480, 384) te:(1162, 384)


[v2 Seed 42 Fold 3] Train 0.8s | best_iter=10 | AUC: 0.61439 | total 7.0s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[v2 Seed 42 Fold 4] TF-IDF 2.3s | W:(1920, 12497) C:(1920, 27408)


[v2 Seed 42 Fold 4] SVD 5.3s | EV W:0.337 C:0.285
[v2 Seed 42 Fold 4] Stack/scale 0.0s | tr:(1920, 384) va:(479, 384) te:(1162, 384)


[v2 Seed 42 Fold 4] Train 0.9s | best_iter=43 | AUC: 0.61612 | total 8.6s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[v2 Seed 42 Fold 5] TF-IDF 2.6s | W:(2399, 14639) C:(2399, 30209)


[v2 Seed 42 Fold 5] SVD 6.1s | EV W:0.305 C:0.270
[v2 Seed 42 Fold 5] Stack/scale 0.0s | tr:(2399, 384) va:(479, 384) te:(1162, 384)


[v2 Seed 42 Fold 5] Train 2.7s | best_iter=816 | AUC: 0.60797 | total 11.5s
[Dense v2 Seed 42] OOF AUC (validated only): 0.62501


=== Dense v2 Seed 1337 (2/3) ===


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[v2 Seed 1337 Fold 1] TF-IDF 1.5s | W:(480, 3990) C:(480, 13463)


[v2 Seed 1337 Fold 1] SVD 1.5s | EV W:0.723 C:0.493
[v2 Seed 1337 Fold 1] Stack/scale 0.0s | tr:(480, 384) va:(480, 384) te:(1162, 384)


[v2 Seed 1337 Fold 1] Train 1.0s | best_iter=89 | AUC: 0.67355 | total 4.0s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[v2 Seed 1337 Fold 2] TF-IDF 1.8s | W:(960, 7415) C:(960, 19786)


[v2 Seed 1337 Fold 2] SVD 2.7s | EV W:0.482 C:0.359
[v2 Seed 1337 Fold 2] Stack/scale 0.0s | tr:(960, 384) va:(480, 384) te:(1162, 384)


[v2 Seed 1337 Fold 2] Train 0.9s | best_iter=43 | AUC: 0.67412 | total 5.4s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[v2 Seed 1337 Fold 3] TF-IDF 2.2s | W:(1440, 10152) C:(1440, 24292)


[v2 Seed 1337 Fold 3] SVD 4.0s | EV W:0.388 C:0.309
[v2 Seed 1337 Fold 3] Stack/scale 0.0s | tr:(1440, 384) va:(480, 384) te:(1162, 384)


[v2 Seed 1337 Fold 3] Train 0.8s | best_iter=17 | AUC: 0.61999 | total 7.1s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[v2 Seed 1337 Fold 4] TF-IDF 2.4s | W:(1920, 12497) C:(1920, 27408)


[v2 Seed 1337 Fold 4] SVD 5.4s | EV W:0.337 C:0.285
[v2 Seed 1337 Fold 4] Stack/scale 0.0s | tr:(1920, 384) va:(479, 384) te:(1162, 384)


[v2 Seed 1337 Fold 4] Train 1.6s | best_iter=360 | AUC: 0.62720 | total 9.4s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[v2 Seed 1337 Fold 5] TF-IDF 2.7s | W:(2399, 14639) C:(2399, 30209)


[v2 Seed 1337 Fold 5] SVD 6.2s | EV W:0.305 C:0.270
[v2 Seed 1337 Fold 5] Stack/scale 0.0s | tr:(2399, 384) va:(479, 384) te:(1162, 384)


[v2 Seed 1337 Fold 5] Train 0.9s | best_iter=5 | AUC: 0.61138 | total 9.9s
[Dense v2 Seed 1337] OOF AUC (validated only): 0.62268


=== Dense v2 Seed 2025 (3/3) ===


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[v2 Seed 2025 Fold 1] TF-IDF 1.5s | W:(480, 3990) C:(480, 13463)


[v2 Seed 2025 Fold 1] SVD 1.6s | EV W:0.723 C:0.493
[v2 Seed 2025 Fold 1] Stack/scale 0.0s | tr:(480, 384) va:(480, 384) te:(1162, 384)


[v2 Seed 2025 Fold 1] Train 1.0s | best_iter=96 | AUC: 0.67896 | total 4.1s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[v2 Seed 2025 Fold 2] TF-IDF 1.8s | W:(960, 7415) C:(960, 19786)


[v2 Seed 2025 Fold 2] SVD 2.9s | EV W:0.482 C:0.359
[v2 Seed 2025 Fold 2] Stack/scale 0.0s | tr:(960, 384) va:(480, 384) te:(1162, 384)


[v2 Seed 2025 Fold 2] Train 0.8s | best_iter=16 | AUC: 0.67580 | total 5.6s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[v2 Seed 2025 Fold 3] TF-IDF 2.3s | W:(1440, 10152) C:(1440, 24292)


[v2 Seed 2025 Fold 3] SVD 4.0s | EV W:0.388 C:0.309
[v2 Seed 2025 Fold 3] Stack/scale 0.0s | tr:(1440, 384) va:(480, 384) te:(1162, 384)


[v2 Seed 2025 Fold 3] Train 0.8s | best_iter=10 | AUC: 0.61992 | total 7.2s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[v2 Seed 2025 Fold 4] TF-IDF 2.5s | W:(1920, 12497) C:(1920, 27408)


[v2 Seed 2025 Fold 4] SVD 5.5s | EV W:0.337 C:0.285
[v2 Seed 2025 Fold 4] Stack/scale 0.0s | tr:(1920, 384) va:(479, 384) te:(1162, 384)


[v2 Seed 2025 Fold 4] Train 0.9s | best_iter=24 | AUC: 0.62906 | total 8.9s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[v2 Seed 2025 Fold 5] TF-IDF 2.7s | W:(2399, 14639) C:(2399, 30209)


[v2 Seed 2025 Fold 5] SVD 6.2s | EV W:0.305 C:0.270
[v2 Seed 2025 Fold 5] Stack/scale 0.0s | tr:(2399, 384) va:(479, 384) te:(1162, 384)


[v2 Seed 2025 Fold 5] Train 1.7s | best_iter=406 | AUC: 0.61535 | total 10.7s
[Dense v2 Seed 2025] OOF AUC (validated only): 0.63284


Dense v2 Time-CV OOF AUC (validated only, 3-seed avg): 0.64086
Saved oof_xgb_dense_time_v2.npy and test_xgb_dense_time_v2.npy


In [22]:
# S13: Meta-only XGB (time-aware, 6-block CV, per-fold scale_pos_weight), 3-seed bag; cache OOF/test
import numpy as np, pandas as pd, time, gc
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import xgboost as xgb

id_col = 'request_id'; target_col = 'requester_received_pizza'
train = pd.read_json('train.json')
test = pd.read_json('test.json')
y = train[target_col].astype(int).values

def get_title(df):
    return df.get('request_title', pd.Series(['']*len(df))).fillna('').astype(str)
def get_body(df):
    return df.get('request_text_edit_aware', df.get('request_text', pd.Series(['']*len(df)))).fillna('').astype(str)

def build_meta(df):
    title = get_title(df); body = get_body(df)
    out = pd.DataFrame(index=df.index)
    out['title_len'] = title.str.len().astype(np.float32)
    out['body_len'] = body.str.len().astype(np.float32)
    out['title_body_ratio'] = (out['title_len'] / (1.0 + out['body_len'])).astype(np.float32)
    out['has_url'] = body.str.contains(r'https?://', regex=True).astype(np.float32)
    out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
    if 'unix_timestamp_of_request' in df.columns:
        dt = pd.to_datetime(df['unix_timestamp_of_request'], unit='s', utc=True, errors='coerce')
    else:
        dt = pd.to_datetime(0, unit='s', utc=True) + pd.to_timedelta(np.zeros(len(df)), unit='s')
    hour = dt.dt.hour.fillna(0).astype(np.float32)
    out['hour'] = hour
    out['dayofweek'] = dt.dt.dayofweek.fillna(0).astype(np.float32)
    out['is_weekend'] = out['dayofweek'].isin([5,6]).astype(np.float32)
    out['hour_sin'] = np.sin(2*np.pi*hour/24.0).astype(np.float32)
    out['hour_cos'] = np.cos(2*np.pi*hour/24.0).astype(np.float32)
    for c in [
        'requester_upvotes_minus_downvotes_at_request',
        'requester_upvotes_plus_downvotes_at_request',
        'requester_number_of_comments_at_request',
        'requester_number_of_posts_at_request'
    ]:
        if c in df.columns:
            out[c] = pd.to_numeric(df[c], errors='coerce').astype(np.float32)
        else:
            out[c] = 0.0
    for c in ['title_len','body_len','title_body_ratio',
              'requester_upvotes_minus_downvotes_at_request',
              'requester_upvotes_plus_downvotes_at_request',
              'requester_number_of_comments_at_request',
              'requester_number_of_posts_at_request']:
        if c in out.columns:
            out[c] = np.log1p(out[c].clip(lower=0)).astype(np.float32)
    out = out.replace([np.inf,-np.inf], 0).fillna(0).astype(np.float32)
    return out

# 6-block forward-chaining folds and mask
order = np.argsort(train['unix_timestamp_of_request'].values)
n = len(train); k = 6
blocks = np.array_split(order, k)
folds = []; mask = np.zeros(n, dtype=bool)
for i in range(1, k):
    va_idx = np.array(blocks[i]); tr_idx = np.concatenate(blocks[:i])
    folds.append((tr_idx, va_idx)); mask[va_idx] = True
print(f'Meta-only Time-CV: {len(folds)} folds; validated {mask.sum()}/{n}')

# Precompute meta for test
meta_te_full = build_meta(test).astype(np.float32).values

# XGB params for meta-only model
base_params = dict(
    objective='binary:logistic',
    eval_metric='auc',
    max_depth=3,
    eta=0.05,
    min_child_weight=8,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.5,
    reg_lambda=3.0,
    gamma=0.0,
    device='cuda',
    tree_method='hist'
)
num_boost_round = 4000
early_stopping_rounds = 200
seeds = [42, 1337, 2025]

oof_sum = np.zeros(n, dtype=np.float64)
oof_cnt = np.zeros(n, dtype=np.float64)
test_seed_preds = []

for si, seed in enumerate(seeds, 1):
    print(f'=== Meta Seed {seed} ({si}/{len(seeds)}) ===')
    params = dict(base_params); params['seed'] = seed
    oof_seed = np.zeros(n, dtype=np.float32)
    test_folds = []
    for fi, (tr_idx, va_idx) in enumerate(folds, 1):
        t0 = time.time()
        M_tr = build_meta(train.iloc[tr_idx]).astype(np.float32).values
        M_va = build_meta(train.iloc[va_idx]).astype(np.float32).values
        scaler = StandardScaler(with_mean=True, with_std=True)
        Xtr = scaler.fit_transform(M_tr).astype(np.float32)
        Xva = scaler.transform(M_va).astype(np.float32)
        Xte = scaler.transform(meta_te_full).astype(np.float32)
        pos = float((y[tr_idx] == 1).sum()); neg = float((y[tr_idx] == 0).sum())
        params['scale_pos_weight'] = (neg / max(pos, 1.0)) if pos > 0 else 1.0
        dtrain = xgb.DMatrix(Xtr, label=y[tr_idx])
        dvalid = xgb.DMatrix(Xva, label=y[va_idx])
        dtest  = xgb.DMatrix(Xte)
        booster = xgb.train(params, dtrain, num_boost_round=num_boost_round, evals=[(dvalid, 'valid')], early_stopping_rounds=early_stopping_rounds, verbose_eval=False)
        va_pred = booster.predict(dvalid, iteration_range=(0, booster.best_iteration+1)).astype(np.float32)
        te_pred = booster.predict(dtest, iteration_range=(0, booster.best_iteration+1)).astype(np.float32)
        oof_seed[va_idx] = va_pred; test_folds.append(te_pred)
        auc = roc_auc_score(y[va_idx], va_pred)
        print(f'[Meta Seed {seed} Fold {fi}] best_iter={booster.best_iteration} | spw={params["scale_pos_weight"]:.2f} | AUC: {auc:.5f} | {time.time()-t0:.1f}s')
        del M_tr, M_va, scaler, Xtr, Xva, Xte, dtrain, dvalid, dtest, booster
        gc.collect()
    seed_auc = roc_auc_score(y[mask], oof_seed[mask])
    print(f'[Meta Seed {seed}] OOF AUC (validated only): {seed_auc:.5f}')
    oof_sum[mask] += oof_seed[mask]; oof_cnt[mask] += 1.0
    test_seed_preds.append(np.mean(test_folds, axis=0).astype(np.float64))
    del oof_seed, test_folds; gc.collect()

oof_avg = np.zeros(n, dtype=np.float32)
oof_avg[mask] = (oof_sum[mask] / np.maximum(oof_cnt[mask], 1.0)).astype(np.float32)
test_avg = np.mean(test_seed_preds, axis=0).astype(np.float32)
auc_oof = roc_auc_score(y[mask], oof_avg[mask])
print(f'Meta-only Time-CV OOF AUC (validated only, 3-seed avg): {auc_oof:.5f}')
np.save('oof_xgb_meta_time.npy', oof_avg.astype(np.float32))
np.save('test_xgb_meta_time.npy', test_avg)
print('Saved oof_xgb_meta_time.npy and test_xgb_meta_time.npy')

Meta-only Time-CV: 5 folds; validated 2398/2878
=== Meta Seed 42 (1/3) ===


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Meta Seed 42 Fold 1] best_iter=42 | spw=1.94 | AUC: 0.73802 | 0.5s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Meta Seed 42 Fold 2] best_iter=21 | spw=2.33 | AUC: 0.67327 | 0.4s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Meta Seed 42 Fold 3] best_iter=67 | spw=2.49 | AUC: 0.63566 | 0.5s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Meta Seed 42 Fold 4] best_iter=6 | spw=2.79 | AUC: 0.63838 | 0.4s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Meta Seed 42 Fold 5] best_iter=4 | spw=2.83 | AUC: 0.58887 | 0.4s
[Meta Seed 42] OOF AUC (validated only): 0.65498
=== Meta Seed 1337 (2/3) ===


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Meta Seed 1337 Fold 1] best_iter=66 | spw=1.94 | AUC: 0.73519 | 0.4s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Meta Seed 1337 Fold 2] best_iter=65 | spw=2.33 | AUC: 0.66432 | 0.4s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Meta Seed 1337 Fold 3] best_iter=1 | spw=2.49 | AUC: 0.63257 | 0.4s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Meta Seed 1337 Fold 4] best_iter=69 | spw=2.79 | AUC: 0.61290 | 0.5s


[Meta Seed 1337 Fold 5] best_iter=487 | spw=2.83 | AUC: 0.59533 | 1.1s
[Meta Seed 1337] OOF AUC (validated only): 0.64127


=== Meta Seed 2025 (3/3) ===


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Meta Seed 2025 Fold 1] best_iter=38 | spw=1.94 | AUC: 0.73981 | 0.4s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Meta Seed 2025 Fold 2] best_iter=58 | spw=2.33 | AUC: 0.67307 | 0.4s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Meta Seed 2025 Fold 3] best_iter=25 | spw=2.49 | AUC: 0.63164 | 0.4s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Meta Seed 2025 Fold 4] best_iter=24 | spw=2.79 | AUC: 0.62542 | 0.4s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Meta Seed 2025 Fold 5] best_iter=207 | spw=2.83 | AUC: 0.59727 | 0.7s
[Meta Seed 2025] OOF AUC (validated only): 0.64888
Meta-only Time-CV OOF AUC (validated only, 3-seed avg): 0.65507
Saved oof_xgb_meta_time.npy and test_xgb_meta_time.npy


In [28]:
# S14: Constrained time-consistent logit blend over 6-block bases (LR_time_meta mix + Dense v1/v2 + Meta_time)
import numpy as np, pandas as pd, time
from sklearn.metrics import roc_auc_score

id_col = 'request_id'; target_col = 'requester_received_pizza'
train = pd.read_json('train.json')
test = pd.read_json('test.json')
y = train[target_col].astype(int).values
ids = test[id_col].values

def to_logit(p, eps=1e-6):
    p = np.clip(p.astype(np.float64), eps, 1.0 - eps)
    return np.log(p / (1.0 - p))

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

# 6-block forward-chaining mask (validated indices only)
order = np.argsort(train['unix_timestamp_of_request'].values)
n = len(train); k = 6
blocks = np.array_split(order, k)
mask = np.zeros(n, dtype=bool)
for i in range(1, k):
    mask[np.array(blocks[i])] = True
print(f'Time-CV (6 blocks) validated count: {mask.sum()}/{n}')

# Load time-CV base predictions (all 6-block forward-chaining)
o_lr_w = np.load('oof_lr_time_withsub_meta.npy')
o_lr_ns = np.load('oof_lr_time_nosub_meta.npy')
o_d1 = np.load('oof_xgb_dense_time.npy')
o_d2 = np.load('oof_xgb_dense_time_v2.npy')
o_meta = np.load('oof_xgb_meta_time.npy')
t_lr_w = np.load('test_lr_time_withsub_meta.npy')
t_lr_ns = np.load('test_lr_time_nosub_meta.npy')
t_d1 = np.load('test_xgb_dense_time.npy')
t_d2 = np.load('test_xgb_dense_time_v2.npy')
t_meta = np.load('test_xgb_meta_time.npy')

# Convert to logits
z_lr_w, z_lr_ns = to_logit(o_lr_w), to_logit(o_lr_ns)
z_d1, z_d2, z_meta = to_logit(o_d1), to_logit(o_d2), to_logit(o_meta)
tz_lr_w, tz_lr_ns = to_logit(t_lr_w), to_logit(t_lr_ns)
tz_d1, tz_d2, tz_meta = to_logit(t_d1), to_logit(t_d2), to_logit(t_meta)

# Grid per expert constraints (LR-heavy, modest meta, capped dense):
# - g in np.arange(0.50, 0.90 + 1e-12, 0.025)  # extended to 0.90 since best sat at 0.85
# - meta_w in [0.08, 0.10, 0.12, 0.15, 0.18, 0.20]  # allow a bit more meta
# - dense_total in np.arange(0.15, 0.35 + 1e-12, 0.05)  # keep tight
# - alpha in {0.2, 0.35, 0.5, 0.65, 0.8}  # split Dense v1/v2
# - w_lr = 1 - meta_w - dense_total
g_grid = np.arange(0.50, 0.90 + 1e-12, 0.025)
meta_grid = [0.08, 0.10, 0.12, 0.15, 0.18, 0.20]
dense_tot_grid = np.arange(0.15, 0.35 + 1e-12, 0.05)
alpha_grid = [0.2, 0.35, 0.5, 0.65, 0.8]

best_auc, best_cfg = -1.0, None
t0 = time.time(); tried = 0
for g in g_grid:
    z_lr_mix = (1.0 - g)*z_lr_w + g*z_lr_ns
    tz_lr_mix = (1.0 - g)*tz_lr_w + g*tz_lr_ns
    for meta_w in meta_grid:
        for d_tot in dense_tot_grid:
            w_lr = 1.0 - meta_w - d_tot
            if w_lr <= 0 or w_lr >= 1:
                continue
            for a in alpha_grid:
                w_d2 = d_tot * a
                w_d1 = d_tot - w_d2
                if w_d1 < 0 or w_d2 < 0:
                    continue
                z_oof = w_lr*z_lr_mix + w_d1*z_d1 + w_d2*z_d2 + meta_w*z_meta
                auc = roc_auc_score(y[mask], z_oof[mask])
                tried += 1
                if auc > best_auc:
                    best_auc = auc
                    best_cfg = dict(g=float(g), w_lr=float(w_lr), w_d1=float(w_d1), w_d2=float(w_d2), w_meta=float(meta_w), tz_lr_mix=tz_lr_mix)
cfg_print = {k: v for k, v in best_cfg.items() if k != 'tz_lr_mix'} if best_cfg is not None else {}
print(f'Constrained blend grid tried {tried} configs | Best OOF(z,time-mask) AUC: {best_auc:.5f} | cfg={cfg_print}')

# Build primary test submission using best weights
g = best_cfg['g']; w_lr = best_cfg['w_lr']; w_d1 = best_cfg['w_d1']; w_d2 = best_cfg['w_d2']; w_meta = best_cfg['w_meta']
tz_lr_mix = best_cfg['tz_lr_mix']
zt_best = w_lr*tz_lr_mix + w_d1*tz_d1 + w_d2*tz_d2 + w_meta*tz_meta
pt_best = sigmoid(zt_best).astype(np.float32)
pd.DataFrame({id_col: ids, target_col: pt_best}).to_csv('submission_time_blend_constrained_best.csv', index=False)

# Shrunk variant: 15% toward equal weights across used models (LRmix, D1, D2, Meta)
alpha_shrink = 0.15
w_vec = np.array([w_lr, w_d1, w_d2, w_meta], dtype=np.float64)
w_eq = np.ones_like(w_vec) / 4.0
w_shrunk = (1.0 - alpha_shrink)*w_vec + alpha_shrink*w_eq
w_shrunk = (w_shrunk / w_shrunk.sum()).astype(np.float64)
zt_shrunk = w_shrunk[0]*tz_lr_mix + w_shrunk[1]*tz_d1 + w_shrunk[2]*tz_d2 + w_shrunk[3]*tz_meta
pt_shrunk = sigmoid(zt_shrunk).astype(np.float32)
pd.DataFrame({id_col: ids, target_col: pt_shrunk}).to_csv('submission_time_blend_constrained_shrunk.csv', index=False)

# Equal-prob hedge across all five bases (prob space) with mild clipping
p_eq5 = np.clip((np.clip(sigmoid(tz_lr_w), 1e-6, 1-1e-6) + np.clip(sigmoid(tz_lr_ns), 1e-6, 1-1e-6) + np.clip(t_d1, 1e-6, 1-1e-6) + np.clip(t_d2, 1e-6, 1-1e-6) + np.clip(t_meta, 1e-6, 1-1e-6)) / 5.0, 0.01, 0.99).astype(np.float32)
pd.DataFrame({id_col: ids, target_col: p_eq5}).to_csv('submission_time_equal5_prob.csv', index=False)

# Set primary submission
pd.DataFrame({id_col: ids, target_col: pt_best}).to_csv('submission.csv', index=False)
print('Wrote constrained time-CV blends (best, shrunk) and equal5 hedge; submission.csv set to primary.')

Time-CV (6 blocks) validated count: 2398/2878


Constrained blend grid tried 2550 configs | Best OOF(z,time-mask) AUC: 0.67198 | cfg={'g': 0.9000000000000004, 'w_lr': 0.44999999999999996, 'w_d1': 0.2800000000000001, 'w_d2': 0.07000000000000002, 'w_meta': 0.2}
Wrote constrained time-CV blends (best, shrunk) and equal5 hedge; submission.csv set to primary.


In [24]:
# S15: Time-aware ridge stacker on logits of 5 bases (LR_withsub_meta, LR_nosub_meta, Dense_v1, Dense_v2, Meta_time)
import numpy as np, pandas as pd, time, gc
from sklearn.linear_model import Ridge
from sklearn.metrics import roc_auc_score

id_col = 'request_id'; target_col = 'requester_received_pizza'
train = pd.read_json('train.json')
test = pd.read_json('test.json')
y = train[target_col].astype(int).values
ids = test[id_col].values

def to_logit(p, eps=1e-6):
    p = np.clip(p.astype(np.float64), eps, 1.0 - eps)
    return np.log(p / (1.0 - p))

# 6-block forward-chaining folds and validated mask
order = np.argsort(train['unix_timestamp_of_request'].values)
n = len(train); k = 6
blocks = np.array_split(order, k)
folds = []; mask = np.zeros(n, dtype=bool)
for i in range(1, k):
    va_idx = np.array(blocks[i]); tr_idx = np.concatenate(blocks[:i])
    folds.append((tr_idx, va_idx)); mask[va_idx] = True
print(f'Stacker Time-CV: {len(folds)} folds; validated {mask.sum()}/{n}')

# Load base predictions (time-consistent 6-block CV) and convert to logits
o_lr_w = np.load('oof_lr_time_withsub_meta.npy');   t_lr_w = np.load('test_lr_time_withsub_meta.npy')
o_lr_ns = np.load('oof_lr_time_nosub_meta.npy');    t_lr_ns = np.load('test_lr_time_nosub_meta.npy')
o_d1 = np.load('oof_xgb_dense_time.npy');           t_d1 = np.load('test_xgb_dense_time.npy')
o_d2 = np.load('oof_xgb_dense_time_v2.npy');        t_d2 = np.load('test_xgb_dense_time_v2.npy')
o_meta = np.load('oof_xgb_meta_time.npy');          t_meta = np.load('test_xgb_meta_time.npy')

Z_oof = np.vstack([to_logit(o_lr_w), to_logit(o_lr_ns), to_logit(o_d1), to_logit(o_d2), to_logit(o_meta)]).T.astype(np.float64)
Z_test = np.vstack([to_logit(t_lr_w), to_logit(t_lr_ns), to_logit(t_d1), to_logit(t_d2), to_logit(t_meta)]).T.astype(np.float64)
print('Stacker feature shapes:', Z_oof.shape, Z_test.shape)

# Alpha grid
alpha_grid = [0.1, 0.3, 1.0, 3.0, 10.0, 30.0]
best_auc, best_alpha = -1.0, None
best_oof_preds = None; best_test_preds = None

for ai, alpha in enumerate(alpha_grid, 1):
    t0 = time.time()
    oof_preds = np.zeros(n, dtype=np.float64)
    test_fold_preds = []
    for fi, (tr_idx, va_idx) in enumerate(folds, 1):
        Xtr, ytr = Z_oof[tr_idx], y[tr_idx]
        Xva = Z_oof[va_idx]
        model = Ridge(alpha=alpha, fit_intercept=True, random_state=42)
        model.fit(Xtr, ytr)
        oof_preds[va_idx] = model.predict(Xva)
        test_fold_preds.append(model.predict(Z_test))
        if fi % 1 == 0:
            print(f'[alpha={alpha}] Fold {fi}/{len(folds)} done')
        del Xtr, ytr, Xva, model; gc.collect()
    auc = roc_auc_score(y[mask], oof_preds[mask])
    test_avg = np.mean(test_fold_preds, axis=0).astype(np.float64)
    print(f'Alpha {alpha} | OOF(z,time-mask) AUC: {auc:.5f} | time {time.time()-t0:.1f}s')
    if auc > best_auc:
        best_auc, best_alpha = auc, alpha
        best_oof_preds = oof_preds.copy()
        best_test_preds = test_avg.copy()

print(f'Best ridge alpha={best_alpha} | OOF(z,time-mask) AUC: {best_auc:.5f}')

# Compare to S14 constrained AUC to decide if we promote to primary
s14_oof_auc = 0.67173  # from Cell 18 log
improvement = best_auc - s14_oof_auc
print(f'Improvement over S14: {improvement:.5f}')

# Save stacker submission; promote to primary only if >= +0.002 OOF gain
pt_stack = 1.0 / (1.0 + np.exp(-best_test_preds))
pd.DataFrame({id_col: ids, target_col: pt_stack.astype(np.float32)}).to_csv('submission_time_stacker_ridge.csv', index=False)
if improvement >= 0.002:
    pd.DataFrame({id_col: ids, target_col: pt_stack.astype(np.float32)}).to_csv('submission.csv', index=False)
    print('Promoted stacker to primary submission.csv')
else:
    print('Kept S14 primary; stacker saved as submission_time_stacker_ridge.csv')

Stacker Time-CV: 5 folds; validated 2398/2878
Stacker feature shapes: (2878, 5) (1162, 5)
[alpha=0.1] Fold 1/5 done
[alpha=0.1] Fold 2/5 done


[alpha=0.1] Fold 3/5 done
[alpha=0.1] Fold 4/5 done
[alpha=0.1] Fold 5/5 done


Alpha 0.1 | OOF(z,time-mask) AUC: 0.56571 | time 0.7s
[alpha=0.3] Fold 1/5 done
[alpha=0.3] Fold 2/5 done


[alpha=0.3] Fold 3/5 done
[alpha=0.3] Fold 4/5 done
[alpha=0.3] Fold 5/5 done


Alpha 0.3 | OOF(z,time-mask) AUC: 0.56599 | time 0.7s
[alpha=1.0] Fold 1/5 done
[alpha=1.0] Fold 2/5 done


[alpha=1.0] Fold 3/5 done
[alpha=1.0] Fold 4/5 done
[alpha=1.0] Fold 5/5 done


Alpha 1.0 | OOF(z,time-mask) AUC: 0.56658 | time 0.7s
[alpha=3.0] Fold 1/5 done
[alpha=3.0] Fold 2/5 done


[alpha=3.0] Fold 3/5 done
[alpha=3.0] Fold 4/5 done
[alpha=3.0] Fold 5/5 done


Alpha 3.0 | OOF(z,time-mask) AUC: 0.56830 | time 0.7s
[alpha=10.0] Fold 1/5 done
[alpha=10.0] Fold 2/5 done


[alpha=10.0] Fold 3/5 done
[alpha=10.0] Fold 4/5 done
[alpha=10.0] Fold 5/5 done


Alpha 10.0 | OOF(z,time-mask) AUC: 0.57016 | time 0.7s
[alpha=30.0] Fold 1/5 done
[alpha=30.0] Fold 2/5 done


[alpha=30.0] Fold 3/5 done
[alpha=30.0] Fold 4/5 done
[alpha=30.0] Fold 5/5 done


Alpha 30.0 | OOF(z,time-mask) AUC: 0.57190 | time 0.7s
Best ridge alpha=30.0 | OOF(z,time-mask) AUC: 0.57190
Improvement over S14: -0.09983
Kept S14 primary; stacker saved as submission_time_stacker_ridge.csv


In [25]:
# S16: NB-SVM (log-count ratio) under 6-block forward-chaining CV with word+char TF-IDF; cache OOF/test
import numpy as np, pandas as pd, time, gc
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from scipy.sparse import hstack, csr_matrix

id_col = 'request_id'; target_col = 'requester_received_pizza'
train = pd.read_json('train.json')
test = pd.read_json('test.json')
y = train[target_col].astype(int).values

def get_title(df):
    return df.get('request_title', pd.Series(['']*len(df))).fillna('').astype(str)
def get_body(df):
    return df.get('request_text_edit_aware', df.get('request_text', pd.Series(['']*len(df)))).fillna('').astype(str)
def combine_text(df):
    return (get_title(df) + ' \n ' + get_body(df)).astype(str)
def clean_text_series(s):
    s = s.str.lower()
    s = s.str.replace(r'https?://\S+', ' url ', regex=True)
    s = s.str.replace(r'\d+', ' number ', regex=True)
    s = s.str.replace(r'\s+', ' ', regex=True)
    return s

txt_tr_raw = combine_text(train); txt_te_raw = combine_text(test)
txt_tr = clean_text_series(txt_tr_raw); txt_te = clean_text_series(txt_te_raw)

# 6-block forward-chaining folds
order = np.argsort(train['unix_timestamp_of_request'].values)
n = len(train); k = 6
blocks = np.array_split(order, k)
folds = []; mask = np.zeros(n, dtype=bool)
for i in range(1, k):
    va_idx = np.array(blocks[i]); tr_idx = np.concatenate(blocks[:i])
    folds.append((tr_idx, va_idx)); mask[va_idx] = True
print(f'NB-SVM Time-CV: {len(folds)} folds; validated {mask.sum()}/{n}')

# Vectorizers
tfidf_word_params = dict(analyzer='word', ngram_range=(1,2), lowercase=True, min_df=3, max_features=80000, sublinear_tf=True, smooth_idf=True, norm='l2')
tfidf_char_params = dict(analyzer='char_wb', ngram_range=(3,5), lowercase=True, min_df=3, max_features=60000, sublinear_tf=True, smooth_idf=True, norm='l2')
count_word_params = dict(analyzer='word', ngram_range=(1,2), lowercase=True, min_df=3, max_features=80000, binary=False)
count_char_params = dict(analyzer='char_wb', ngram_range=(3,5), lowercase=True, min_df=3, max_features=60000, binary=False)

def log_count_ratio(X_counts, y_bin, alpha=1.0):
    # X_counts: csr (n_samples x n_features)
    y_bin = y_bin.astype(bool)
    pos_sum = (X_counts[y_bin].sum(axis=0) + alpha).A1
    neg_sum = (X_counts[~y_bin].sum(axis=0) + alpha).A1
    r = np.log(pos_sum / neg_sum)
    return r.astype(np.float32)

oof = np.zeros(n, dtype=np.float32)
test_fold_preds = []

for fi, (tr_idx, va_idx) in enumerate(folds, 1):
    t0 = time.time()
    y_tr = y[tr_idx]
    # Fit TF-IDF on train fold
    tfidf_w = TfidfVectorizer(**tfidf_word_params)
    Xw_tr_tfidf = tfidf_w.fit_transform(txt_tr.iloc[tr_idx]); Xw_va_tfidf = tfidf_w.transform(txt_tr.iloc[va_idx]); Xw_te_tfidf = tfidf_w.transform(txt_te)
    tfidf_c = TfidfVectorizer(**tfidf_char_params)
    Xc_tr_tfidf = tfidf_c.fit_transform(txt_tr.iloc[tr_idx]); Xc_va_tfidf = tfidf_c.transform(txt_tr.iloc[va_idx]); Xc_te_tfidf = tfidf_c.transform(txt_te)
    # Fit CountVectorizer with the same vocabulary as TF-IDF for r
    cnt_w = CountVectorizer(**count_word_params, vocabulary=tfidf_w.vocabulary_)
    Xw_tr_cnt = cnt_w.fit_transform(txt_tr.iloc[tr_idx])  # fit with fixed vocab for term indexing
    cnt_c = CountVectorizer(**count_char_params, vocabulary=tfidf_c.vocabulary_)
    Xc_tr_cnt = cnt_c.fit_transform(txt_tr.iloc[tr_idx])
    # Compute log-count ratios per view
    r_w = log_count_ratio(Xw_tr_cnt, y_tr, alpha=1.0)
    r_c = log_count_ratio(Xc_tr_cnt, y_tr, alpha=1.0)
    # Apply NB transform: multiply TF-IDF columns by r
    Xtr_nb = hstack([Xw_tr_tfidf.multiply(r_w), Xc_tr_tfidf.multiply(r_c)], format='csr')
    Xva_nb = hstack([Xw_va_tfidf.multiply(r_w), Xc_va_tfidf.multiply(r_c)], format='csr')
    Xte_nb = hstack([Xw_te_tfidf.multiply(r_w), Xc_te_tfidf.multiply(r_c)], format='csr')
    # Train logistic regression on NB features
    clf = LogisticRegression(solver='saga', penalty='l2', C=2.0, max_iter=4000, n_jobs=-1, random_state=42)
    clf.fit(Xtr_nb, y_tr)
    va_pred = clf.predict_proba(Xva_nb)[:,1].astype(np.float32)
    te_pred = clf.predict_proba(Xte_nb)[:,1].astype(np.float32)
    oof[va_idx] = va_pred
    test_fold_preds.append(te_pred)
    auc = roc_auc_score(y[va_idx], va_pred)
    print(f'[NB-SVM] Fold {fi}/{len(folds)} AUC: {auc:.5f} | elapsed {time.time()-t0:.1f}s | shapes tr:{Xtr_nb.shape} va:{Xva_nb.shape}')
    del (tfidf_w, tfidf_c, cnt_w, cnt_c, Xw_tr_tfidf, Xw_va_tfidf, Xw_te_tfidf, Xc_tr_tfidf, Xc_va_tfidf, Xc_te_tfidf,
         Xw_tr_cnt, Xc_tr_cnt, r_w, r_c, Xtr_nb, Xva_nb, Xte_nb, clf)
    gc.collect()

auc_oof = roc_auc_score(y[mask], oof[mask])
print(f'NB-SVM Time-CV OOF AUC (validated only): {auc_oof:.5f}')
test_avg = np.mean(test_fold_preds, axis=0).astype(np.float32)
np.save('oof_nbsvm_time.npy', oof.astype(np.float32))
np.save('test_nbsvm_time.npy', test_avg)
print('Saved oof_nbsvm_time.npy and test_nbsvm_time.npy')

NB-SVM Time-CV: 5 folds; validated 2398/2878


[NB-SVM] Fold 1/5 AUC: 0.51799 | elapsed 2.0s | shapes tr:(480, 17453) va:(480, 17453)


[NB-SVM] Fold 2/5 AUC: 0.53152 | elapsed 4.5s | shapes tr:(960, 27201) va:(480, 27201)


[NB-SVM] Fold 3/5 AUC: 0.53900 | elapsed 8.2s | shapes tr:(1440, 34444) va:(480, 34444)


[NB-SVM] Fold 4/5 AUC: 0.57269 | elapsed 12.9s | shapes tr:(1920, 39905) va:(479, 39905)


[NB-SVM] Fold 5/5 AUC: 0.59149 | elapsed 15.4s | shapes tr:(2399, 44848) va:(479, 44848)
NB-SVM Time-CV OOF AUC (validated only): 0.55642
Saved oof_nbsvm_time.npy and test_nbsvm_time.npy


In [26]:
# S17: Time-safe target encoding (TE) for requester_subreddits_at_request with m-estimate; cache OOF/train and test TE features
import numpy as np, pandas as pd, time, gc
from collections import Counter, defaultdict
from sklearn.metrics import roc_auc_score

id_col = 'request_id'; target_col = 'requester_received_pizza'
train = pd.read_json('train.json')
test = pd.read_json('test.json')
y = train[target_col].astype(int).values

def as_sub_list(x):
    if isinstance(x, (list, tuple)):
        return [str(s).lower() for s in x]
    return []

subs_tr_lists = train.get('requester_subreddits_at_request', pd.Series([[]]*len(train))).apply(as_sub_list)
subs_te_lists = test.get('requester_subreddits_at_request', pd.Series([[]]*len(test))).apply(as_sub_list)

def logit(p):
    p = float(np.clip(p, 0.01, 0.99))
    return np.log(p/(1.0-p))

# Build 6-block forward-chaining folds and validated mask
order = np.argsort(train['unix_timestamp_of_request'].values)
n = len(train); k = 6
blocks = np.array_split(order, k)
folds = []; mask = np.zeros(n, dtype=bool)
for i in range(1, k):
    va_idx = np.array(blocks[i]); tr_idx = np.concatenate(blocks[:i])
    folds.append((tr_idx, va_idx)); mask[va_idx] = True
print(f'TE folds: {len(folds)}; validated {mask.sum()}/{n}')

m_list = [50, 200]
# Features per row: for each m -> mean_logodds, max_logodds, sum_logodds (3*len(m_list));
# plus coverage, subs_count_log1p, and logcnt aggregates: mean_logcnt, max_logcnt, sum_logcnt (3)
F = 3*len(m_list) + 2 + 3
oof_te = np.zeros((n, F), dtype=np.float32)

def build_counts(indices):
    cnt = Counter(); pos = Counter()
    for idx in indices:
        labs = int(y[idx])
        subs = set(subs_tr_lists.iloc[idx])
        if not subs: continue
        for s in subs:
            cnt[s] += 1
            if labs == 1:
                pos[s] += 1
    return cnt, pos

def row_agg(subs, cnt, pos, p_global, m_list):
    total = len(subs)
    seen = 0
    logcnt_vals = []
    # precompute smoothed log-odds per m for row
    lodds_by_m = [[] for _ in m_list]
    for s in subs:
        c = cnt.get(s, 0)
        if c > 0: seen += 1
        pc = pos.get(s, 0)
        lc = np.log1p(c)
        logcnt_vals.append(lc)
        for mi, m in enumerate(m_list):
            phat = (pc + m*p_global) / (c + m) if (c + m) > 0 else p_global
            lodds_by_m[mi].append(logit(phat))
    feats = []
    for lodds in lodds_by_m:
        if len(lodds) == 0:
            feats.extend([logit(p_global), logit(p_global), logit(p_global)])
        else:
            arr = np.array(lodds, dtype=np.float32)
            feats.extend([float(arr.mean()), float(arr.max()), float(arr.sum())])
    coverage = (seen / total) if total > 0 else 0.0
    subs_count = np.log1p(total)
    if len(logcnt_vals) == 0:
        feats.extend([0.0, 0.0, 0.0])
    else:
        arrc = np.array(logcnt_vals, dtype=np.float32)
        feats.extend([float(arrc.mean()), float(arrc.max()), float(arrc.sum())])
    feats.extend([float(coverage), float(subs_count)])  # add at end to keep order stable with comment above
    # Reorder to match declared order: for clarity we defined coverage and subs_count at end; reorder now:
    # Declared order: [for m: mean,max,sum]... , coverage, subs_count_log1p, mean_logcnt, max_logcnt, sum_logcnt
    # Current feats: [for m: mean,max,sum]*, mean_logcnt, max_logcnt, sum_logcnt, coverage, subs_count
    # Fix:
    m_feats = feats[:3*len(m_list)]
    mean_logcnt, max_logcnt, sum_logcnt, coverage, subs_count = feats[3*len(m_list):]
    return m_feats + [coverage, subs_count, mean_logcnt, max_logcnt, sum_logcnt]

t0 = time.time()
for fi, (tr_idx, va_idx) in enumerate(folds, 1):
    f0 = time.time()
    cnt, pos = build_counts(tr_idx)
    p_global = float((y[tr_idx] == 1).mean()) if len(tr_idx) > 0 else float((y == 1).mean())
    # Fill OOF TE features for this val block
    for idx in va_idx:
        subs = subs_tr_lists.iloc[idx]
        feats = row_agg(subs, cnt, pos, p_global, m_list)
        oof_te[idx, :] = np.array(feats, dtype=np.float32)
    print(f'[TE] Fold {fi}/{len(folds)} done | train_cnt={len(cnt)} | p_global={p_global:.4f} | elapsed {time.time()-f0:.1f}s')
print(f'TE OOF features built in {time.time()-t0:.1f}s')

# Build test TE features using full train stats (refit on full train)
cnt_full, pos_full = build_counts(np.arange(n))
p_global_full = float((y == 1).mean())
te_test = np.zeros((len(test), F), dtype=np.float32)
for i in range(len(test)):
    subs = subs_te_lists.iloc[i]
    feats = row_agg(subs, cnt_full, pos_full, p_global_full, m_list)
    te_test[i, :] = np.array(feats, dtype=np.float32)
print('Test TE features built.')

np.save('te_subs_oof.npy', oof_te.astype(np.float32))
np.save('te_subs_test.npy', te_test.astype(np.float32))
print('Saved te_subs_oof.npy and te_subs_test.npy with shape', oof_te.shape, te_test.shape)

# Quick diagnostic: correlate single strongest base logit with a TE signal if available (optional)
try:
    from sklearn.preprocessing import StandardScaler
    z_lr_ns = np.log(np.clip(np.load('oof_lr_time_nosub_meta.npy'), 1e-6, 1-1e-6) / (1-np.clip(np.load('oof_lr_time_nosub_meta.npy'), 1e-6, 1-1e-6)))
    scaler = StandardScaler()
    te_std = scaler.fit_transform(oof_te[mask])
    corr = np.corrcoef(z_lr_ns[mask], te_std[:,0])[0,1]
    print(f'Debug corr(logit LR_nosub_meta, TE_feat0 on validated): {corr:.4f}')
except Exception as e:
    print('TE debug skipped:', e)

TE folds: 5; validated 2398/2878
[TE] Fold 1/5 done | train_cnt=968 | p_global=0.3396 | elapsed 0.1s
[TE] Fold 2/5 done | train_cnt=1730 | p_global=0.3000 | elapsed 0.1s


[TE] Fold 3/5 done | train_cnt=2569 | p_global=0.2868 | elapsed 0.1s
[TE] Fold 4/5 done | train_cnt=3571 | p_global=0.2635 | elapsed 0.2s


[TE] Fold 5/5 done | train_cnt=4880 | p_global=0.2614 | elapsed 0.2s
TE OOF features built in 0.8s
Test TE features built.


Saved te_subs_oof.npy and te_subs_test.npy with shape (2878, 11) (1162, 11)
Debug corr(logit LR_nosub_meta, TE_feat0 on validated): 0.2433


In [27]:
# S18: LR_time with enriched meta + subreddit TE (m=50,200 aggregates), 6-block forward-chaining; cache OOF/test
import numpy as np, pandas as pd, gc, time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from scipy.sparse import hstack, csr_matrix
from collections import Counter

id_col = 'request_id'; target_col = 'requester_received_pizza'
train = pd.read_json('train.json')
test = pd.read_json('test.json')
y = train[target_col].astype(int).values

def get_title(df):
    return df.get('request_title', pd.Series(['']*len(df))).fillna('').astype(str)
def get_body(df):
    return df.get('request_text_edit_aware', df.get('request_text', pd.Series(['']*len(df)))).fillna('').astype(str)
def build_text(df):
    return (get_title(df) + ' \n ' + get_body(df)).astype(str)
def build_subs_str(df):
    if 'requester_subreddits_at_request' not in df.columns:
        return pd.Series(['']*len(df))
    sr = df['requester_subreddits_at_request']
    return sr.apply(lambda x: ' '.join([str(s).lower() for s in x]) if isinstance(x, (list, tuple)) else '')
def as_sub_list(x):
    if isinstance(x, (list, tuple)):
        return [str(s).lower() for s in x]
    return []
subs_tr_lists = train.get('requester_subreddits_at_request', pd.Series([[]]*len(train))).apply(as_sub_list)
subs_te_lists = test.get('requester_subreddits_at_request', pd.Series([[]]*len(test))).apply(as_sub_list)

def build_meta(df):
    title = get_title(df); body = get_body(df)
    out = pd.DataFrame(index=df.index)
    out['title_len'] = title.str.len().astype(np.float32)
    out['body_len'] = body.str.len().astype(np.float32)
    out['title_body_ratio'] = (out['title_len'] / (1.0 + out['body_len'])).astype(np.float32)
    out['has_url'] = body.str.contains(r'https?://', regex=True).astype(np.float32)
    out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
    if 'unix_timestamp_of_request' in df.columns:
        dt = pd.to_datetime(df['unix_timestamp_of_request'], unit='s', utc=True, errors='coerce')
    else:
        dt = pd.to_datetime(0, unit='s', utc=True) + pd.to_timedelta(np.zeros(len(df)), unit='s')
    hour = dt.dt.hour.fillna(0).astype(np.float32)
    out['hour'] = hour
    out['dayofweek'] = dt.dt.dayofweek.fillna(0).astype(np.float32)
    out['is_weekend'] = out['dayofweek'].isin([5,6]).astype(np.float32)
    out['hour_sin'] = np.sin(2*np.pi*hour/24.0).astype(np.float32)
    out['hour_cos'] = np.cos(2*np.pi*hour/24.0).astype(np.float32)
    for c in [
        'requester_upvotes_minus_downvotes_at_request',
        'requester_upvotes_plus_downvotes_at_request',
        'requester_number_of_comments_at_request',
        'requester_number_of_posts_at_request'
    ]:
        if c in df.columns:
            out[c] = pd.to_numeric(df[c], errors='coerce').astype(np.float32)
        else:
            out[c] = 0.0
    for c in ['title_len','body_len','title_body_ratio',
              'requester_upvotes_minus_downvotes_at_request',
              'requester_upvotes_plus_downvotes_at_request',
              'requester_number_of_comments_at_request',
              'requester_number_of_posts_at_request']:
        if c in out.columns:
            out[c] = np.log1p(out[c].clip(lower=0)).astype(np.float32)
    out = out.replace([np.inf, -np.inf], 0).fillna(0).astype(np.float32)
    return out

def logit_clip(p):
    p = float(np.clip(p, 0.01, 0.99))
    return np.log(p/(1.0-p))

def build_counts(indices):
    cnt = Counter(); pos = Counter()
    for idx in indices:
        labs = int(y[idx])
        subs = set(subs_tr_lists.iloc[idx])
        if not subs: continue
        for s in subs:
            cnt[s] += 1
            if labs == 1:
                pos[s] += 1
    return cnt, pos

def row_te_feats(subs, cnt, pos, p_global, m_list=(50,200)):
    total = len(subs); seen = 0
    logcnt_vals = []
    lodds_by_m = [[] for _ in m_list]
    for s in subs:
        c = cnt.get(s, 0); pc = pos.get(s, 0)
        if c > 0: seen += 1
        logcnt_vals.append(np.log1p(c))
        for mi, m in enumerate(m_list):
            ph = (pc + m*p_global) / (c + m) if (c + m) > 0 else p_global
            lodds_by_m[mi].append(logit_clip(ph))
    feats = []
    for lodds in lodds_by_m:
        if len(lodds) == 0:
            feats.extend([logit_clip(p_global), logit_clip(p_global), logit_clip(p_global)])
        else:
            arr = np.array(lodds, dtype=np.float32)
            feats.extend([float(arr.mean()), float(arr.max()), float(arr.sum())])
    coverage = (seen/total) if total > 0 else 0.0
    subs_count = np.log1p(total)
    if len(logcnt_vals) == 0:
        mean_lc = max_lc = sum_lc = 0.0
    else:
        arrc = np.array(logcnt_vals, dtype=np.float32)
        mean_lc, max_lc, sum_lc = float(arrc.mean()), float(arrc.max()), float(arrc.sum())
    # Order: for m: mean,max,sum,... then coverage, subs_count, mean_logcnt, max_logcnt, sum_logcnt
    feats.extend([coverage, subs_count, mean_lc, max_lc, sum_lc])
    return np.array(feats, dtype=np.float32)

# 6-block forward-chaining folds (→ 5 folds) and mask
order = np.argsort(train['unix_timestamp_of_request'].values)
n = len(train); k = 6
blocks = np.array_split(order, k)
folds = []; mask = np.zeros(n, dtype=bool)
for i in range(1, k):
    va_idx = np.array(blocks[i]); tr_idx = np.concatenate(blocks[:i])
    folds.append((tr_idx, va_idx)); mask[va_idx] = True
print(f'LR+TE Time-CV: {len(folds)} folds; validated {mask.sum()}/{n}')

txt_tr = build_text(train); txt_te = build_text(test)
subs_tr_str = build_subs_str(train); subs_te_str = build_subs_str(test)
meta_te_base = build_meta(test).astype(np.float32).values

word_params = dict(analyzer='word', ngram_range=(1,2), lowercase=True, min_df=3, max_features=60000, sublinear_tf=True, smooth_idf=True, norm='l2')
char_params = dict(analyzer='char_wb', ngram_range=(3,5), lowercase=True, min_df=3, max_features=60000, sublinear_tf=True, smooth_idf=True, norm='l2')
subs_params = dict(analyzer='word', ngram_range=(1,2), lowercase=True, min_df=3, max_features=20000, sublinear_tf=True, smooth_idf=True, norm='l2')

def run_lr_time_meta_te(with_subs: bool, tag: str):
    t0 = time.time()
    oof = np.zeros(n, dtype=np.float32)
    test_fold_preds = []
    for fi, (tr_idx, va_idx) in enumerate(folds, 1):
        f0 = time.time()
        # Text TF-IDF per fold
        tfidf_w = TfidfVectorizer(**word_params)
        Xw_tr = tfidf_w.fit_transform(txt_tr.iloc[tr_idx]); Xw_va = tfidf_w.transform(txt_tr.iloc[va_idx]); Xw_te = tfidf_w.transform(txt_te)
        tfidf_c = TfidfVectorizer(**char_params)
        Xc_tr = tfidf_c.fit_transform(txt_tr.iloc[tr_idx]); Xc_va = tfidf_c.transform(txt_tr.iloc[va_idx]); Xc_te = tfidf_c.transform(txt_te)
        if with_subs:
            tfidf_s = TfidfVectorizer(**subs_params)
            Xs_tr = tfidf_s.fit_transform(subs_tr_str.iloc[tr_idx]); Xs_va = tfidf_s.transform(subs_tr_str.iloc[va_idx]); Xs_te = tfidf_s.transform(subs_te_str)
        # Meta per fold
        meta_tr = build_meta(train.iloc[tr_idx]).astype(np.float32).values
        meta_va = build_meta(train.iloc[va_idx]).astype(np.float32).values
        # TE per fold: fit counts on train_idx only
        cnt, pos = build_counts(tr_idx)
        p_global = float((y[tr_idx] == 1).mean()) if len(tr_idx) > 0 else float((y == 1).mean())
        F = 3*2 + 2 + 3  # m=50,200 -> 6 + coverage/subs_count + 3 logcnt = 11
        te_tr = np.zeros((len(tr_idx), F), dtype=np.float32)
        te_va = np.zeros((len(va_idx), F), dtype=np.float32)
        for j, idx in enumerate(tr_idx):
            te_tr[j, :] = row_te_feats(subs_tr_lists.iloc[idx], cnt, pos, p_global)
        for j, idx in enumerate(va_idx):
            te_va[j, :] = row_te_feats(subs_tr_lists.iloc[idx], cnt, pos, p_global)
        # Test TE using full-train stats each fold (consistent with protocol refit on full train for final test);
        # here we approximate by using full-train stats once outside loops is costly to rebuild; compute here quickly.
        cnt_full, pos_full = build_counts(np.arange(n))
        p_global_full = float((y == 1).mean())
        te_te = np.zeros((len(test), F), dtype=np.float32)
        for j in range(len(test)):
            te_te[j, :] = row_te_feats(subs_te_lists.iloc[j], cnt_full, pos_full, p_global_full)
        # Scale meta+TE jointly per fold
        meta_tr_te = np.hstack([meta_tr, te_tr]).astype(np.float32)
        meta_va_te = np.hstack([meta_va, te_va]).astype(np.float32)
        meta_te_all = np.hstack([meta_te_base, te_te]).astype(np.float32)
        scaler = StandardScaler(with_mean=True, with_std=True)
        meta_tr_s = scaler.fit_transform(meta_tr_te).astype(np.float32)
        meta_va_s = scaler.transform(meta_va_te).astype(np.float32)
        meta_te_s = scaler.transform(meta_te_all).astype(np.float32)
        # Stack text + scaled meta+TE
        if with_subs:
            X_tr = hstack([Xw_tr, Xc_tr, Xs_tr, csr_matrix(meta_tr_s)], format='csr')
            X_va = hstack([Xw_va, Xc_va, Xs_va, csr_matrix(meta_va_s)], format='csr')
            X_te = hstack([Xw_te, Xc_te, Xs_te, csr_matrix(meta_te_s)], format='csr')
        else:
            X_tr = hstack([Xw_tr, Xc_tr, csr_matrix(meta_tr_s)], format='csr')
            X_va = hstack([Xw_va, Xc_va, csr_matrix(meta_va_s)], format='csr')
            X_te = hstack([Xw_te, Xc_te, csr_matrix(meta_te_s)], format='csr')
        # Train LR
        clf = LogisticRegression(solver='saga', penalty='l2', C=0.8, max_iter=4000, n_jobs=-1, random_state=42)
        clf.fit(X_tr, y[tr_idx])
        va_pred = clf.predict_proba(X_va)[:,1].astype(np.float32)
        te_pred = clf.predict_proba(X_te)[:,1].astype(np.float32)
        oof[va_idx] = va_pred; test_fold_preds.append(te_pred)
        auc = roc_auc_score(y[va_idx], va_pred)
        print(f'[LR+TE {tag}] Fold {fi}/{len(folds)} AUC: {auc:.5f} | {time.time()-f0:.1f}s | shapes tr:{X_tr.shape} va:{X_va.shape}')
        # cleanup
        del (tfidf_w, tfidf_c, Xw_tr, Xw_va, Xw_te, Xc_tr, Xc_va, Xc_te, scaler, meta_tr, meta_va, meta_tr_te, meta_va_te, meta_te_all, meta_tr_s, meta_va_s, meta_te_s, X_tr, X_va, X_te, clf, te_tr, te_va, te_te, cnt, pos, cnt_full, pos_full)
        if with_subs:
            del tfidf_s, Xs_tr, Xs_va, Xs_te
        gc.collect()
    auc_oof = roc_auc_score(y[mask], oof[mask])
    print(f'[LR+TE {tag}] OOF AUC (validated only): {auc_oof:.5f} | total {time.time()-t0:.1f}s')
    test_pred = np.mean(test_fold_preds, axis=0).astype(np.float32)
    np.save(f'oof_lr_time_{tag}_meta_te.npy', oof.astype(np.float32))
    np.save(f'test_lr_time_{tag}_meta_te.npy', test_pred)
    return auc_oof

auc_with_te = run_lr_time_meta_te(True, 'withsub')
auc_nosub_te = run_lr_time_meta_te(False, 'nosub')
print({'time_lr_withsub_meta_te': auc_with_te, 'time_lr_nosub_meta_te': auc_nosub_te})

LR+TE Time-CV: 5 folds; validated 2398/2878


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[LR+TE withsub] Fold 1/5 AUC: 0.66812 | 3.7s | shapes tr:(480, 18274) va:(480, 18274)


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[LR+TE withsub] Fold 2/5 AUC: 0.62384 | 8.4s | shapes tr:(960, 28965) va:(480, 28965)


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[LR+TE withsub] Fold 3/5 AUC: 0.60918 | 14.6s | shapes tr:(1440, 37079) va:(480, 37079)


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[LR+TE withsub] Fold 4/5 AUC: 0.58777 | 17.5s | shapes tr:(1920, 43385) va:(479, 43385)


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[LR+TE withsub] Fold 5/5 AUC: 0.57336 | 18.4s | shapes tr:(2399, 49550) va:(479, 49550)
[LR+TE withsub] OOF AUC (validated only): 0.61204 | total 63.3s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[LR+TE nosub] Fold 1/5 AUC: 0.67468 | 3.5s | shapes tr:(480, 17763) va:(480, 17763)


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[LR+TE nosub] Fold 2/5 AUC: 0.63617 | 5.5s | shapes tr:(960, 27930) va:(480, 27930)


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[LR+TE nosub] Fold 3/5 AUC: 0.61293 | 8.6s | shapes tr:(1440, 35458) va:(480, 35458)


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[LR+TE nosub] Fold 4/5 AUC: 0.59936 | 16.3s | shapes tr:(1920, 41139) va:(479, 41139)


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[LR+TE nosub] Fold 5/5 AUC: 0.57731 | 21.4s | shapes tr:(2399, 46292) va:(479, 46292)
[LR+TE nosub] OOF AUC (validated only): 0.61915 | total 55.9s
{'time_lr_withsub_meta_te': 0.6120376803743307, 'time_lr_nosub_meta_te': 0.6191540267244493}


In [29]:
# S19: Time-aware LogisticRegression stacker on base logits (5 bases), with per-fold scaling; promote if >= +0.002 over S14
import numpy as np, pandas as pd, time, gc
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

id_col = 'request_id'; target_col = 'requester_received_pizza'
train = pd.read_json('train.json')
test = pd.read_json('test.json')
y = train[target_col].astype(int).values
ids = test[id_col].values

def to_logit(p, eps=1e-6):
    p = np.clip(p.astype(np.float64), eps, 1.0 - eps)
    return np.log(p / (1.0 - p))

# 6-block forward-chaining folds and validated mask
order = np.argsort(train['unix_timestamp_of_request'].values)
n = len(train); k = 6
blocks = np.array_split(order, k)
folds = []; mask = np.zeros(n, dtype=bool)
for i in range(1, k):
    va_idx = np.array(blocks[i]); tr_idx = np.concatenate(blocks[:i])
    folds.append((tr_idx, va_idx)); mask[va_idx] = True
print(f'Stacker(LR) Time-CV: {len(folds)} folds; validated {mask.sum()}/{n}')

# Load base predictions (time-consistent 6-block CV) and convert to logits
o_lr_w = np.load('oof_lr_time_withsub_meta.npy');   t_lr_w = np.load('test_lr_time_withsub_meta.npy')
o_lr_ns = np.load('oof_lr_time_nosub_meta.npy');    t_lr_ns = np.load('test_lr_time_nosub_meta.npy')
o_d1 = np.load('oof_xgb_dense_time.npy');           t_d1 = np.load('test_xgb_dense_time.npy')
o_d2 = np.load('oof_xgb_dense_time_v2.npy');        t_d2 = np.load('test_xgb_dense_time_v2.npy')
o_meta = np.load('oof_xgb_meta_time.npy');          t_meta = np.load('test_xgb_meta_time.npy')

Z_oof_raw = np.vstack([to_logit(o_lr_w), to_logit(o_lr_ns), to_logit(o_d1), to_logit(o_d2), to_logit(o_meta)]).T.astype(np.float64)
Z_test_raw = np.vstack([to_logit(t_lr_w), to_logit(t_lr_ns), to_logit(t_d1), to_logit(t_d2), to_logit(t_meta)]).T.astype(np.float64)
print('Stacker raw feature shapes:', Z_oof_raw.shape, Z_test_raw.shape)

# C grid for LogisticRegression
C_grid = [0.1, 0.3, 1.0, 3.0, 10.0]
best_auc, best_C = -1.0, None
best_oof_scores = None; best_test_scores = None

for Ci, C in enumerate(C_grid, 1):
    t0 = time.time()
    oof_scores = np.zeros(n, dtype=np.float64)
    test_fold_scores = []
    for fi, (tr_idx, va_idx) in enumerate(folds, 1):
        Xtr_raw, Xva_raw = Z_oof_raw[tr_idx], Z_oof_raw[va_idx]
        scaler = StandardScaler(with_mean=True, with_std=True)
        Xtr = scaler.fit_transform(Xtr_raw); Xva = scaler.transform(Xva_raw); Xte = scaler.transform(Z_test_raw)
        clf = LogisticRegression(penalty='l2', solver='lbfgs', C=C, max_iter=2000, random_state=42)
        clf.fit(Xtr, y[tr_idx])
        oof_scores[va_idx] = clf.predict_proba(Xva)[:,1]
        test_fold_scores.append(clf.predict_proba(Xte)[:,1])
        print(f'[C={C}] Fold {fi}/{len(folds)} done')
        del Xtr_raw, Xva_raw, Xtr, Xva, Xte, scaler, clf; gc.collect()
    auc = roc_auc_score(y[mask], oof_scores[mask])
    test_avg = np.mean(test_fold_scores, axis=0).astype(np.float64)
    print(f'C={C} | OOF(prob,time-mask) AUC: {auc:.5f} | time {time.time()-t0:.1f}s')
    if auc > best_auc:
        best_auc, best_C = auc, C
        best_oof_scores = oof_scores.copy()
        best_test_scores = test_avg.copy()

print(f'Best Logistic stacker C={best_C} | OOF(prob,time-mask) AUC: {best_auc:.5f}')

# Promote only if >= +0.002 over S14 constrained blend
s14_oof_auc = 0.67198
improvement = best_auc - s14_oof_auc
print(f'Improvement over S14: {improvement:.5f}')

sub_stack = pd.DataFrame({id_col: ids, target_col: best_test_scores.astype(np.float32)})
sub_stack.to_csv('submission_time_stacker_logreg.csv', index=False)
if improvement >= 0.002:
    sub_stack.to_csv('submission.csv', index=False)
    print('Promoted Logistic stacker to primary submission.csv')
else:
    print('Kept S14 primary; saved submission_time_stacker_logreg.csv')

Stacker(LR) Time-CV: 5 folds; validated 2398/2878
Stacker raw feature shapes: (2878, 5) (1162, 5)
[C=0.1] Fold 1/5 done
[C=0.1] Fold 2/5 done


[C=0.1] Fold 3/5 done
[C=0.1] Fold 4/5 done
[C=0.1] Fold 5/5 done


C=0.1 | OOF(prob,time-mask) AUC: 0.55112 | time 0.7s
[C=0.3] Fold 1/5 done
[C=0.3] Fold 2/5 done


[C=0.3] Fold 3/5 done
[C=0.3] Fold 4/5 done
[C=0.3] Fold 5/5 done


C=0.3 | OOF(prob,time-mask) AUC: 0.56067 | time 0.7s
[C=1.0] Fold 1/5 done
[C=1.0] Fold 2/5 done


[C=1.0] Fold 3/5 done
[C=1.0] Fold 4/5 done
[C=1.0] Fold 5/5 done


C=1.0 | OOF(prob,time-mask) AUC: 0.56459 | time 0.7s
[C=3.0] Fold 1/5 done
[C=3.0] Fold 2/5 done


[C=3.0] Fold 3/5 done
[C=3.0] Fold 4/5 done
[C=3.0] Fold 5/5 done


C=3.0 | OOF(prob,time-mask) AUC: 0.56140 | time 0.7s
[C=10.0] Fold 1/5 done
[C=10.0] Fold 2/5 done


[C=10.0] Fold 3/5 done
[C=10.0] Fold 4/5 done
[C=10.0] Fold 5/5 done


C=10.0 | OOF(prob,time-mask) AUC: 0.55787 | time 0.7s
Best Logistic stacker C=1.0 | OOF(prob,time-mask) AUC: 0.56459
Improvement over S14: -0.10739
Kept S14 primary; saved submission_time_stacker_logreg.csv


In [31]:
# S20: Meta-only XGB v2 with enriched domain features (time-aware 6-block CV), 3-seed bag; cache OOF/test
import numpy as np, pandas as pd, time, gc, re
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import xgboost as xgb

id_col = 'request_id'; target_col = 'requester_received_pizza'
train = pd.read_json('train.json')
test = pd.read_json('test.json')
y = train[target_col].astype(int).values

def get_title(df):
    return df.get('request_title', pd.Series(['']*len(df))).fillna('').astype(str)
def get_body(df):
    return df.get('request_text_edit_aware', df.get('request_text', pd.Series(['']*len(df)))).fillna('').astype(str)

def build_meta_v2(df):
    title = get_title(df); body = get_body(df)
    txt = (title + ' ' + body).str.lower()
    out = pd.DataFrame(index=df.index)
    # Base lengths
    out['title_len'] = title.str.len().astype(np.float32)
    out['body_len'] = body.str.len().astype(np.float32)
    out['title_body_ratio'] = (out['title_len'] / (1.0 + out['body_len'])).astype(np.float32)
    # URL / IMG flags
    out['has_url'] = body.str.contains(r'https?://', regex=True).astype(np.float32)
    out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
    # Counts/ratios
    out['exclam_count'] = body.str.count('!').astype(np.float32)
    out['question_count'] = body.str.count('\?').astype(np.float32)
    out['ellipsis_count'] = body.str.count(r'\.{3,}').astype(np.float32)
    out['url_count'] = body.str.count(r'https?://').astype(np.float32)
    out['img_count'] = body.str.count(r'imgur|jpg|jpeg|png|gif').astype(np.float32)
    out['number_count'] = body.str.count(r'\d+').astype(np.float32)
    # ALLCAPS ratio and word_allcaps_count
    def caps_ratio_func(s):
        if not isinstance(s, str) or len(s)==0: return 0.0
        caps = sum(1 for ch in s if ch.isupper())
        return caps / max(len(s), 1)
    out['caps_ratio'] = body.apply(caps_ratio_func).astype(np.float32)
    out['word_allcaps_count'] = body.str.findall(r'\b[A-Z]{2,}\b').apply(lambda x: len(x) if isinstance(x, list) else 0).astype(np.float32)
    # Pronouns
    out['i_count'] = txt.str.count(r'\bi\b').astype(np.float32)
    out['we_count'] = txt.str.count(r'\bwe\b').astype(np.float32)
    # Gratitude / reciprocity
    out['grat_count'] = txt.str.count(r'thank you|thanks|grateful|appreciate').astype(np.float32)
    out['recip_count'] = txt.str.count(r'pay it forward|give back|return favor|promise').astype(np.float32)
    # Hardship / urgency
    out['hard_count'] = txt.str.count(r'broke|rent|bill|student|homeless|hungry|kids|family|unemployed|job').astype(np.float32)
    out['urg_count'] = txt.str.count(r'today|tonight|asap|emergency').astype(np.float32)
    # Evidence
    out['evid_count'] = txt.str.count(r'proof|pic|photo|verify|receipt').astype(np.float32)
    # Pizza brands
    out['brand_count'] = txt.str.count(r'domino|pizza hut|papa john|little caesars').astype(np.float32)
    # Length/structure
    token_count = txt.str.count(r'\w+').astype(np.float32)
    sent_count = body.str.count(r'[\.!?]').astype(np.float32)
    out['token_count'] = token_count
    out['sentence_count'] = sent_count
    # Temporal
    if 'unix_timestamp_of_request' in df.columns:
        dt = pd.to_datetime(df['unix_timestamp_of_request'], unit='s', utc=True, errors='coerce')
    else:
        dt = pd.to_datetime(0, unit='s', utc=True) + pd.to_timedelta(np.zeros(len(df)), unit='s')
    hour = dt.dt.hour.fillna(0).astype(np.float32)
    day = dt.dt.day.fillna(1).astype(np.float32)
    dow = dt.dt.dayofweek.fillna(0).astype(np.float32)
    out['dayofweek'] = dow
    out['is_weekend'] = dow.isin([5, 6]).astype(np.float32)
    out['end_of_month'] = (day >= 27).astype(np.float32)
    out['is_month_start'] = (day <= 5).astype(np.float32)
    out['end_of_week'] = dow.isin([4, 5]).astype(np.float32)
    out['hour_sin'] = np.sin(2*np.pi*hour/24.0).astype(np.float32)
    out['hour_cos'] = np.cos(2*np.pi*hour/24.0).astype(np.float32)
    # Account proxies
    for c in [
        'requester_upvotes_minus_downvotes_at_request',
        'requester_upvotes_plus_downvotes_at_request',
        'requester_number_of_comments_at_request',
        'requester_number_of_posts_at_request'
    ]:
        if c in df.columns:
            out[c] = pd.to_numeric(df[c], errors='coerce').astype(np.float32)
        else:
            out[c] = 0.0
    # log1p heavy tails
    for c in out.columns:
        if out[c].dtype != np.float32: continue
        out[c] = out[c].replace([np.inf, -np.inf], 0).fillna(0)
    heavy = ['title_len','body_len','title_body_ratio','exclam_count','question_count','ellipsis_count','url_count','img_count','number_count','word_allcaps_count','i_count','we_count','grat_count','recip_count','hard_count','urg_count','evid_count','brand_count','token_count','sentence_count','requester_upvotes_minus_downvotes_at_request','requester_upvotes_plus_downvotes_at_request','requester_number_of_comments_at_request','requester_number_of_posts_at_request']
    for c in heavy:
        if c in out.columns:
            out[c] = np.log1p(out[c].clip(lower=0)).astype(np.float32)
    out = out.replace([np.inf, -np.inf], 0).fillna(0).astype(np.float32)
    return out

# 6-block forward-chaining folds and mask
order = np.argsort(train['unix_timestamp_of_request'].values)
n = len(train); k = 6
blocks = np.array_split(order, k)
folds = []; mask = np.zeros(n, dtype=bool)
for i in range(1, k):
    va_idx = np.array(blocks[i]); tr_idx = np.concatenate(blocks[:i])
    folds.append((tr_idx, va_idx)); mask[va_idx] = True
print(f'Meta v2 Time-CV: {len(folds)} folds; validated {mask.sum()}/{n}')

meta_te_full = build_meta_v2(test).astype(np.float32).values

base_params = dict(
    objective='binary:logistic',
    eval_metric='auc',
    max_depth=3,
    eta=0.05,
    min_child_weight=8,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.8,
    reg_lambda=4.0,
    gamma=0.0,
    device='cuda',
    tree_method='hist'
)
num_boost_round = 5000
early_stopping_rounds = 300
seeds = [42, 1337, 2025]

oof_sum = np.zeros(n, dtype=np.float64)
oof_cnt = np.zeros(n, dtype=np.float64)
test_seed_preds = []

for si, seed in enumerate(seeds, 1):
    print(f'=== Meta v2 Seed {seed} ({si}/{len(seeds)}) ===')
    params = dict(base_params); params['seed'] = seed
    oof_seed = np.zeros(n, dtype=np.float32)
    test_folds = []
    for fi, (tr_idx, va_idx) in enumerate(folds, 1):
        t0 = time.time()
        M_tr = build_meta_v2(train.iloc[tr_idx]).astype(np.float32).values
        M_va = build_meta_v2(train.iloc[va_idx]).astype(np.float32).values
        scaler = StandardScaler(with_mean=True, with_std=True)
        Xtr = scaler.fit_transform(M_tr).astype(np.float32)
        Xva = scaler.transform(M_va).astype(np.float32)
        Xte = scaler.transform(meta_te_full).astype(np.float32)
        pos = float((y[tr_idx] == 1).sum()); neg = float((y[tr_idx] == 0).sum())
        params['scale_pos_weight'] = (neg / max(pos, 1.0)) if pos > 0 else 1.0
        dtrain = xgb.DMatrix(Xtr, label=y[tr_idx])
        dvalid = xgb.DMatrix(Xva, label=y[va_idx])
        dtest  = xgb.DMatrix(Xte)
        booster = xgb.train(params, dtrain, num_boost_round=num_boost_round, evals=[(dvalid, 'valid')], early_stopping_rounds=early_stopping_rounds, verbose_eval=False)
        va_pred = booster.predict(dvalid, iteration_range=(0, booster.best_iteration+1)).astype(np.float32)
        te_pred = booster.predict(dtest, iteration_range=(0, booster.best_iteration+1)).astype(np.float32)
        oof_seed[va_idx] = va_pred; test_folds.append(te_pred)
        auc = roc_auc_score(y[va_idx], va_pred)
        print(f'[Meta v2 Seed {seed} Fold {fi}] best_iter={booster.best_iteration} | spw={params["scale_pos_weight"]:.2f} | AUC: {auc:.5f} | {time.time()-t0:.1f}s')
        del M_tr, M_va, scaler, Xtr, Xva, Xte, dtrain, dvalid, dtest, booster
        gc.collect()
    seed_auc = roc_auc_score(y[mask], oof_seed[mask])
    print(f'[Meta v2 Seed {seed}] OOF AUC (validated only): {seed_auc:.5f}')
    oof_sum[mask] += oof_seed[mask]; oof_cnt[mask] += 1.0
    test_seed_preds.append(np.mean(test_folds, axis=0).astype(np.float64))
    del oof_seed, test_folds; gc.collect()

oof_avg = np.zeros(n, dtype=np.float32)
oof_avg[mask] = (oof_sum[mask] / np.maximum(oof_cnt[mask], 1.0)).astype(np.float32)
test_avg = np.mean(test_seed_preds, axis=0).astype(np.float32)
auc_oof = roc_auc_score(y[mask], oof_avg[mask])
print(f'Meta v2 Time-CV OOF AUC (validated only, 3-seed avg): {auc_oof:.5f}')
np.save('oof_xgb_meta_time_v2.npy', oof_avg.astype(np.float32))
np.save('test_xgb_meta_time_v2.npy', test_avg)
print('Saved oof_xgb_meta_time_v2.npy and test_xgb_meta_time_v2.npy')

Meta v2 Time-CV: 5 folds; validated 2398/2878
=== Meta v2 Seed 42 (1/3) ===


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Meta v2 Seed 42 Fold 1] best_iter=32 | spw=1.94 | AUC: 0.68917 | 0.8s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Meta v2 Seed 42 Fold 2] best_iter=70 | spw=2.33 | AUC: 0.67833 | 0.9s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Meta v2 Seed 42 Fold 3] best_iter=13 | spw=2.49 | AUC: 0.61383 | 0.8s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Meta v2 Seed 42 Fold 4] best_iter=3 | spw=2.79 | AUC: 0.62771 | 0.9s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Meta v2 Seed 42 Fold 5] best_iter=427 | spw=2.83 | AUC: 0.62892 | 1.6s
[Meta v2 Seed 42] OOF AUC (validated only): 0.64297


=== Meta v2 Seed 1337 (2/3) ===


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Meta v2 Seed 1337 Fold 1] best_iter=2 | spw=1.94 | AUC: 0.70038 | 0.7s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Meta v2 Seed 1337 Fold 2] best_iter=42 | spw=2.33 | AUC: 0.68153 | 0.9s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Meta v2 Seed 1337 Fold 3] best_iter=1 | spw=2.49 | AUC: 0.61514 | 0.8s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Meta v2 Seed 1337 Fold 4] best_iter=6 | spw=2.79 | AUC: 0.63437 | 0.9s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Meta v2 Seed 1337 Fold 5] best_iter=391 | spw=2.83 | AUC: 0.62453 | 1.7s
[Meta v2 Seed 1337] OOF AUC (validated only): 0.64275


=== Meta v2 Seed 2025 (3/3) ===


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Meta v2 Seed 2025 Fold 1] best_iter=71 | spw=1.94 | AUC: 0.68500 | 0.8s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Meta v2 Seed 2025 Fold 2] best_iter=30 | spw=2.33 | AUC: 0.66752 | 0.8s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Meta v2 Seed 2025 Fold 3] best_iter=168 | spw=2.49 | AUC: 0.61635 | 1.1s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Meta v2 Seed 2025 Fold 4] best_iter=55 | spw=2.79 | AUC: 0.62501 | 1.0s


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[Meta v2 Seed 2025 Fold 5] best_iter=753 | spw=2.83 | AUC: 0.61686 | 2.1s
[Meta v2 Seed 2025] OOF AUC (validated only): 0.64438


Meta v2 Time-CV OOF AUC (validated only, 3-seed avg): 0.64897
Saved oof_xgb_meta_time_v2.npy and test_xgb_meta_time_v2.npy


In [32]:
# S21: NB-SVM (counts-based, binary=True) under 6-block forward-chaining CV; grid C in [2,4,8]; cache OOF/test
import numpy as np, pandas as pd, time, gc
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from scipy.sparse import hstack

id_col = 'request_id'; target_col = 'requester_received_pizza'
train = pd.read_json('train.json')
test = pd.read_json('test.json')
y = train[target_col].astype(int).values

def get_title(df):
    return df.get('request_title', pd.Series(['']*len(df))).fillna('').astype(str)
def get_body(df):
    return df.get('request_text_edit_aware', df.get('request_text', pd.Series(['']*len(df)))).fillna('').astype(str)
def combine_text(df):
    return (get_title(df) + ' \n ' + get_body(df)).astype(str)
def clean_text_series(s):
    s = s.str.lower()
    s = s.str.replace(r'https?://\S+', ' url ', regex=True)
    s = s.str.replace(r'\d+', ' number ', regex=True)
    s = s.str.replace(r'\s+', ' ', regex=True)
    return s

txt_tr_raw = combine_text(train); txt_te_raw = combine_text(test)
txt_tr = clean_text_series(txt_tr_raw); txt_te = clean_text_series(txt_te_raw)

# 6-block forward-chaining folds
order = np.argsort(train['unix_timestamp_of_request'].values)
n = len(train); k = 6
blocks = np.array_split(order, k)
folds = []; mask = np.zeros(n, dtype=bool)
for i in range(1, k):
    va_idx = np.array(blocks[i]); tr_idx = np.concatenate(blocks[:i])
    folds.append((tr_idx, va_idx)); mask[va_idx] = True
print(f'NB-SVM (counts) Time-CV: {len(folds)} folds; validated {mask.sum()}/{n}')

# Count vectorizers (binary=True); word 1-2 and char_wb 3-5
cnt_word_params = dict(analyzer='word', ngram_range=(1,2), lowercase=True, min_df=3, max_features=100000, binary=True)
cnt_char_params = dict(analyzer='char_wb', ngram_range=(3,5), lowercase=True, min_df=3, max_features=80000, binary=True)

def log_count_ratio(X_counts, y_bin, alpha=1.0):
    yb = y_bin.astype(bool)
    pos_sum = (X_counts[yb].sum(axis=0) + alpha).A1
    neg_sum = (X_counts[~yb].sum(axis=0) + alpha).A1
    return np.log(pos_sum / neg_sum).astype(np.float32)

C_grid = [2.0, 4.0, 8.0]
best_auc, best_C = -1.0, None
best_oof = None; best_test = None

for Ci, C in enumerate(C_grid, 1):
    tC = time.time()
    oof = np.zeros(n, dtype=np.float32)
    test_fold_preds = []
    for fi, (tr_idx, va_idx) in enumerate(folds, 1):
        t0 = time.time()
        y_tr = y[tr_idx]
        # Fit count vectorizers on train fold
        cnt_w = CountVectorizer(**cnt_word_params)
        Xw_tr = cnt_w.fit_transform(txt_tr.iloc[tr_idx]); Xw_va = cnt_w.transform(txt_tr.iloc[va_idx]); Xw_te = cnt_w.transform(txt_te)
        cnt_c = CountVectorizer(**cnt_char_params)
        Xc_tr = cnt_c.fit_transform(txt_tr.iloc[tr_idx]); Xc_va = cnt_c.transform(txt_tr.iloc[va_idx]); Xc_te = cnt_c.transform(txt_te)
        # Compute log-count ratios
        r_w = log_count_ratio(Xw_tr, y_tr, alpha=1.0)
        r_c = log_count_ratio(Xc_tr, y_tr, alpha=1.0)
        # NB transform: multiply columns by r
        Xtr_nb = hstack([Xw_tr.multiply(r_w), Xc_tr.multiply(r_c)], format='csr')
        Xva_nb = hstack([Xw_va.multiply(r_w), Xc_va.multiply(r_c)], format='csr')
        Xte_nb = hstack([Xw_te.multiply(r_w), Xc_te.multiply(r_c)], format='csr')
        # Train LogisticRegression on NB features
        clf = LogisticRegression(solver='saga', penalty='l2', C=C, max_iter=4000, n_jobs=-1, random_state=42)
        clf.fit(Xtr_nb, y_tr)
        va_pred = clf.predict_proba(Xva_nb)[:,1].astype(np.float32)
        te_pred = clf.predict_proba(Xte_nb)[:,1].astype(np.float32)
        oof[va_idx] = va_pred; test_fold_preds.append(te_pred)
        auc = roc_auc_score(y[va_idx], va_pred)
        print(f'[NB-Counts C={C}] Fold {fi}/{len(folds)} AUC: {auc:.5f} | {time.time()-t0:.1f}s | shapes tr:{Xtr_nb.shape} va:{Xva_nb.shape}')
        del (cnt_w, cnt_c, Xw_tr, Xw_va, Xw_te, Xc_tr, Xc_va, Xc_te, r_w, r_c, Xtr_nb, Xva_nb, Xte_nb, clf); gc.collect()
    auc_oof = roc_auc_score(y[mask], oof[mask])
    test_avg = np.mean(test_fold_preds, axis=0).astype(np.float32)
    print(f'[NB-Counts] C={C} | OOF AUC (validated only): {auc_oof:.5f} | total {time.time()-tC:.1f}s')
    if auc_oof > best_auc:
        best_auc, best_C = auc_oof, C
        best_oof = oof.copy(); best_test = test_avg.copy()

print(f'NB-Counts best C={best_C} | OOF(time-mask) AUC: {best_auc:.5f}')
np.save('oof_nb_counts_time.npy', best_oof.astype(np.float32))
np.save('test_nb_counts_time.npy', best_test.astype(np.float32))
print('Saved oof_nb_counts_time.npy and test_nb_counts_time.npy')

NB-SVM (counts) Time-CV: 5 folds; validated 2398/2878


[NB-Counts C=2.0] Fold 1/5 AUC: 0.55299 | 5.8s | shapes tr:(480, 17453) va:(480, 17453)


[NB-Counts C=2.0] Fold 2/5 AUC: 0.52408 | 12.8s | shapes tr:(960, 27201) va:(480, 27201)


[NB-Counts C=2.0] Fold 3/5 AUC: 0.51316 | 25.9s | shapes tr:(1440, 34444) va:(480, 34444)


[NB-Counts C=2.0] Fold 4/5 AUC: 0.53523 | 37.9s | shapes tr:(1920, 39905) va:(479, 39905)


[NB-Counts C=2.0] Fold 5/5 AUC: 0.58091 | 43.7s | shapes tr:(2399, 44848) va:(479, 44848)
[NB-Counts] C=2.0 | OOF AUC (validated only): 0.54553 | total 126.7s


[NB-Counts C=4.0] Fold 1/5 AUC: 0.55274 | 6.6s | shapes tr:(480, 17453) va:(480, 17453)


[NB-Counts C=4.0] Fold 2/5 AUC: 0.52372 | 15.7s | shapes tr:(960, 27201) va:(480, 27201)


[NB-Counts C=4.0] Fold 3/5 AUC: 0.51249 | 30.5s | shapes tr:(1440, 34444) va:(480, 34444)


[NB-Counts C=4.0] Fold 4/5 AUC: 0.53560 | 43.5s | shapes tr:(1920, 39905) va:(479, 39905)


[NB-Counts C=4.0] Fold 5/5 AUC: 0.58109 | 50.2s | shapes tr:(2399, 44848) va:(479, 44848)
[NB-Counts] C=4.0 | OOF AUC (validated only): 0.54506 | total 147.2s


[NB-Counts C=8.0] Fold 1/5 AUC: 0.55259 | 7.9s | shapes tr:(480, 17453) va:(480, 17453)


[NB-Counts C=8.0] Fold 2/5 AUC: 0.52293 | 18.6s | shapes tr:(960, 27201) va:(480, 27201)


[NB-Counts C=8.0] Fold 3/5 AUC: 0.51241 | 35.2s | shapes tr:(1440, 34444) va:(480, 34444)


[NB-Counts C=8.0] Fold 4/5 AUC: 0.53553 | 48.2s | shapes tr:(1920, 39905) va:(479, 39905)


[NB-Counts C=8.0] Fold 5/5 AUC: 0.58138 | 55.9s | shapes tr:(2399, 44848) va:(479, 44848)
[NB-Counts] C=8.0 | OOF AUC (validated only): 0.54450 | total 166.4s
NB-Counts best C=2.0 | OOF(time-mask) AUC: 0.54553
Saved oof_nb_counts_time.npy and test_nb_counts_time.npy


In [33]:
# S14b: Wider-grid hedge blend (allow denser total and meta up to 0.22); do not overwrite primary unless better
import numpy as np, pandas as pd, time
from sklearn.metrics import roc_auc_score

id_col = 'request_id'; target_col = 'requester_received_pizza'
train = pd.read_json('train.json')
test = pd.read_json('test.json')
y = train[target_col].astype(int).values
ids = test[id_col].values

def to_logit(p, eps=1e-6):
    p = np.clip(p.astype(np.float64), eps, 1.0 - eps)
    return np.log(p / (1.0 - p))

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

# Time-mask
order = np.argsort(train['unix_timestamp_of_request'].values)
n = len(train); k = 6
blocks = np.array_split(order, k)
mask = np.zeros(n, dtype=bool)
for i in range(1, k):
    mask[np.array(blocks[i])] = True
print(f'Time-CV (6 blocks) validated count: {mask.sum()}/{n}')

# Load bases
o_lr_w = np.load('oof_lr_time_withsub_meta.npy'); t_lr_w = np.load('test_lr_time_withsub_meta.npy')
o_lr_ns = np.load('oof_lr_time_nosub_meta.npy'); t_lr_ns = np.load('test_lr_time_nosub_meta.npy')
o_d1 = np.load('oof_xgb_dense_time.npy'); t_d1 = np.load('test_xgb_dense_time.npy')
o_d2 = np.load('oof_xgb_dense_time_v2.npy'); t_d2 = np.load('test_xgb_dense_time_v2.npy')
o_meta = np.load('oof_xgb_meta_time.npy'); t_meta = np.load('test_xgb_meta_time.npy')

z_lr_w, z_lr_ns = to_logit(o_lr_w), to_logit(o_lr_ns)
z_d1, z_d2, z_meta = to_logit(o_d1), to_logit(o_d2), to_logit(o_meta)
tz_lr_w, tz_lr_ns = to_logit(t_lr_w), to_logit(t_lr_ns)
tz_d1, tz_d2, tz_meta = to_logit(t_d1), to_logit(t_d2), to_logit(t_meta)

# Wider grid (hedge):
g_grid = np.arange(0.50, 0.90 + 1e-12, 0.025)
meta_grid = [0.08, 0.10, 0.12, 0.15, 0.18, 0.20, 0.22]
dense_tot_grid = np.arange(0.15, 0.45 + 1e-12, 0.05)  # extended to 0.45
alpha_grid = [0.2, 0.35, 0.5, 0.65, 0.8]

best_auc, best_cfg = -1.0, None
tried = 0
for g in g_grid:
    z_lr_mix = (1.0 - g)*z_lr_w + g*z_lr_ns
    tz_lr_mix = (1.0 - g)*tz_lr_w + g*tz_lr_ns
    for meta_w in meta_grid:
        for d_tot in dense_tot_grid:
            w_lr = 1.0 - meta_w - d_tot
            if w_lr <= 0 or w_lr >= 1:
                continue
            for a in alpha_grid:
                w_d2 = d_tot * a
                w_d1 = d_tot - w_d2
                if w_d1 < 0 or w_d2 < 0:
                    continue
                z_oof = w_lr*z_lr_mix + w_d1*z_d1 + w_d2*z_d2 + meta_w*z_meta
                auc = roc_auc_score(y[mask], z_oof[mask])
                tried += 1
                if auc > best_auc:
                    best_auc = auc
                    best_cfg = dict(g=float(g), w_lr=float(w_lr), w_d1=float(w_d1), w_d2=float(w_d2), w_meta=float(meta_w), tz_lr_mix=tz_lr_mix)
cfg_print = {k: v for k, v in best_cfg.items() if k != 'tz_lr_mix'} if best_cfg is not None else {}
print(f'S14b wider grid tried {tried} | Best OOF(z,time-mask) AUC: {best_auc:.5f} | cfg={cfg_print}')

# Build alt test submission
g = best_cfg['g']; w_lr = best_cfg['w_lr']; w_d1 = best_cfg['w_d1']; w_d2 = best_cfg['w_d2']; w_meta = best_cfg['w_meta']
tz_lr_mix = best_cfg['tz_lr_mix']
zt = w_lr*tz_lr_mix + w_d1*tz_d1 + w_d2*tz_d2 + w_meta*tz_meta
pt = sigmoid(zt).astype(np.float32)
pd.DataFrame({id_col: ids, target_col: pt}).to_csv('submission_time_blend_constrained_wide.csv', index=False)
print('Wrote submission_time_blend_constrained_wide.csv (not promoted).')

Time-CV (6 blocks) validated count: 2398/2878


S14b wider grid tried 4165 | Best OOF(z,time-mask) AUC: 0.67378 | cfg={'g': 0.9000000000000004, 'w_lr': 0.32999999999999996, 'w_d1': 0.36000000000000004, 'w_d2': 0.09000000000000002, 'w_meta': 0.22}
Wrote submission_time_blend_constrained_wide.csv (not promoted).


In [34]:
# Promote S14b wider-grid blend to primary submission
import pandas as pd
src = 'submission_time_blend_constrained_wide.csv'
dst = 'submission.csv'
sub = pd.read_csv(src)
sub.to_csv(dst, index=False)
print(f'Promoted {src} to {dst}. Head:')
print(sub.head())

Promoted submission_time_blend_constrained_wide.csv to submission.csv. Head:
  request_id  requester_received_pizza
0  t3_1aw5zf                  0.537033
1   t3_roiuw                  0.511641
2   t3_mjnbq                  0.395971
3   t3_t8wd1                  0.442418
4  t3_1m4zxu                  0.515684


In [35]:
# Refit-on-full for 5 core time-consistent bases and final S14b-weighted blend
import numpy as np, pandas as pd, time, gc, re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack, csr_matrix
import xgboost as xgb
from sklearn.metrics import roc_auc_score

id_col = 'request_id'; target_col = 'requester_received_pizza'
train = pd.read_json('train.json')
test = pd.read_json('test.json')
y = train[target_col].astype(int).values
ids = test[id_col].values

def get_title(df):
    return df.get('request_title', pd.Series(['']*len(df))).fillna('').astype(str)
def get_body(df):
    return df.get('request_text_edit_aware', df.get('request_text', pd.Series(['']*len(df)))).fillna('').astype(str)
def combine_text(df):
    return (get_title(df) + ' \n ' + get_body(df)).astype(str)
def clean_text_series(s):
    s = s.str.lower()
    s = s.str.replace(r'https?://\S+', ' url ', regex=True)
    s = s.str.replace(r'\d+', ' number ', regex=True)
    s = s.str.replace(r'\s+', ' ', regex=True)
    return s
def build_subs(df):
    if 'requester_subreddits_at_request' not in df.columns:
        return pd.Series(['']*len(df))
    sr = df['requester_subreddits_at_request']
    return sr.apply(lambda x: ' '.join([str(s).lower() for s in x]) if isinstance(x, (list, tuple)) else '')
def build_meta_v1(df):
    title = get_title(df); body = get_body(df)
    out = pd.DataFrame(index=df.index)
    out['title_len'] = title.str.len().astype(np.float32)
    out['body_len'] = body.str.len().astype(np.float32)
    out['title_body_ratio'] = (out['title_len'] / (1.0 + out['body_len'])).astype(np.float32)
    out['has_url'] = body.str.contains(r'https?://', regex=True).astype(np.float32)
    out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
    if 'unix_timestamp_of_request' in df.columns:
        dt = pd.to_datetime(df['unix_timestamp_of_request'], unit='s', utc=True, errors='coerce')
    else:
        dt = pd.to_datetime(0, unit='s', utc=True) + pd.to_timedelta(np.zeros(len(df)), unit='s')
    hour = dt.dt.hour.fillna(0).astype(np.float32)
    out['hour'] = hour
    out['dayofweek'] = dt.dt.dayofweek.fillna(0).astype(np.float32)
    out['is_weekend'] = out['dayofweek'].isin([5,6]).astype(np.float32)
    out['hour_sin'] = np.sin(2*np.pi*hour/24.0).astype(np.float32)
    out['hour_cos'] = np.cos(2*np.pi*hour/24.0).astype(np.float32)
    for c in [
        'requester_upvotes_minus_downvotes_at_request',
        'requester_upvotes_plus_downvotes_at_request',
        'requester_number_of_comments_at_request',
        'requester_number_of_posts_at_request'
    ]:
        if c in df.columns:
            out[c] = pd.to_numeric(df[c], errors='coerce').astype(np.float32)
        else:
            out[c] = 0.0
    for c in ['title_len','body_len','title_body_ratio',
              'requester_upvotes_minus_downvotes_at_request',
              'requester_upvotes_plus_downvotes_at_request',
              'requester_number_of_comments_at_request',
              'requester_number_of_posts_at_request']:
        if c in out.columns:
            out[c] = np.log1p(out[c].clip(lower=0)).astype(np.float32)
    out = out.replace([np.inf,-np.inf], 0).fillna(0).astype(np.float32)
    return out

def to_logit(p, eps=1e-6):
    p = np.clip(p.astype(np.float64), eps, 1.0 - eps)
    return np.log(p / (1.0 - p))
def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

# Common text data
txt_tr_raw = combine_text(train); txt_te_raw = combine_text(test)
txt_tr = clean_text_series(txt_tr_raw); txt_te = clean_text_series(txt_te_raw)
subs_tr = build_subs(train); subs_te = build_subs(test)
meta_tr_v1 = build_meta_v1(train).astype(np.float32).values
meta_te_v1 = build_meta_v1(test).astype(np.float32).values

# Vectorizer params (match time-CV models)
word_params = dict(analyzer='word', ngram_range=(1,2), lowercase=True, min_df=3, max_features=60000, sublinear_tf=True, smooth_idf=True, norm='l2')
char_params = dict(analyzer='char_wb', ngram_range=(3,5), lowercase=True, min_df=3, max_features=60000, sublinear_tf=True, smooth_idf=True, norm='l2')
subs_params = dict(analyzer='word', ngram_range=(1,2), lowercase=True, min_df=3, max_features=20000, sublinear_tf=True, smooth_idf=True, norm='l2')

print('Refit-on-full: LR_withsub_meta')
t0 = time.time()
tfidf_w = TfidfVectorizer(**word_params)
Xw_tr = tfidf_w.fit_transform(txt_tr); Xw_te = tfidf_w.transform(txt_te)
tfidf_c = TfidfVectorizer(**char_params)
Xc_tr = tfidf_c.fit_transform(txt_tr); Xc_te = tfidf_c.transform(txt_te)
tfidf_s = TfidfVectorizer(**subs_params)
Xs_tr = tfidf_s.fit_transform(subs_tr); Xs_te = tfidf_s.transform(subs_te)
scaler_meta = StandardScaler(with_mean=True, with_std=True)
Mtr_s = scaler_meta.fit_transform(meta_tr_v1).astype(np.float32)
Mte_s = scaler_meta.transform(meta_te_v1).astype(np.float32)
X_tr_with = hstack([Xw_tr, Xc_tr, Xs_tr, csr_matrix(Mtr_s)], format='csr')
X_te_with = hstack([Xw_te, Xc_te, Xs_te, csr_matrix(Mte_s)], format='csr')
lr_with = LogisticRegression(solver='saga', penalty='l2', C=0.8, max_iter=4000, n_jobs=-1, random_state=42)
lr_with.fit(X_tr_with, y)
p_te_lr_with = lr_with.predict_proba(X_te_with)[:,1].astype(np.float32)
print(f'LR_withsub_meta trained in {time.time()-t0:.1f}s | shapes tr:{X_tr_with.shape} te:{X_te_with.shape}')

print('Refit-on-full: LR_nosub_meta')
t1 = time.time()
scaler_meta2 = StandardScaler(with_mean=True, with_std=True)
Mtr2_s = scaler_meta2.fit_transform(meta_tr_v1).astype(np.float32)
Mte2_s = scaler_meta2.transform(meta_te_v1).astype(np.float32)
X_tr_nosub = hstack([Xw_tr, Xc_tr, csr_matrix(Mtr2_s)], format='csr')
X_te_nosub = hstack([Xw_te, Xc_te, csr_matrix(Mte2_s)], format='csr')
lr_ns = LogisticRegression(solver='saga', penalty='l2', C=0.8, max_iter=4000, n_jobs=-1, random_state=42)
lr_ns.fit(X_tr_nosub, y)
p_te_lr_ns = lr_ns.predict_proba(X_te_nosub)[:,1].astype(np.float32)
print(f'LR_nosub_meta trained in {time.time()-t1:.1f}s | shapes tr:{X_tr_nosub.shape} te:{X_te_nosub.shape}')

# Dense v1 refit (TF-IDF word/char/subs -> SVD 150/150/50 + meta_v1 -> XGB)
print('Refit-on-full: Dense v1 (SVD 150/150/50 + meta) XGB')
t2 = time.time()
svd_w_v1, svd_c_v1, svd_s_v1 = TruncatedSVD(n_components=150, random_state=42), TruncatedSVD(n_components=150, random_state=42), TruncatedSVD(n_components=50, random_state=42)
Zw_tr = svd_w_v1.fit_transform(Xw_tr); Zw_te = svd_w_v1.transform(Xw_te)
Zc_tr = svd_c_v1.fit_transform(Xc_tr); Zc_te = svd_c_v1.transform(Xc_te)
Zs_tr = svd_s_v1.fit_transform(Xs_tr); Zs_te = svd_s_v1.transform(Xs_te)
Xtr_dense_v1 = np.hstack([Zw_tr, Zc_tr, Zs_tr, Mtr_s]).astype(np.float32)
Xte_dense_v1 = np.hstack([Zw_te, Zc_te, Zs_te, Mte_s]).astype(np.float32)
sc_v1 = StandardScaler(with_mean=True, with_std=True)
Xtr_v1 = sc_v1.fit_transform(Xtr_dense_v1); Xte_v1 = sc_v1.transform(Xte_dense_v1)
pos = float((y==1).sum()); neg = float((y==0).sum()); spw = (neg/max(pos,1.0)) if pos>0 else 1.0
params_v1 = dict(objective='binary:logistic', eval_metric='auc', max_depth=3, eta=0.035, subsample=0.8, colsample_bytree=0.6, min_child_weight=8, reg_alpha=1.0, reg_lambda=4.0, gamma=0.1, device='cuda', tree_method='hist', seed=42, scale_pos_weight=spw)
# Use conservative rounds approximating median best_iteration from CV
nrounds_v1 = 200
dtr_v1 = xgb.DMatrix(Xtr_v1, label=y); dte_v1 = xgb.DMatrix(Xte_v1)
booster_v1 = xgb.train(params_v1, dtr_v1, num_boost_round=nrounds_v1, verbose_eval=False)
p_te_d1 = booster_v1.predict(dte_v1).astype(np.float32)
print(f'Dense v1 trained in {time.time()-t2:.1f}s | nrounds={nrounds_v1} | spw={spw:.2f} | shapes tr:{Xtr_v1.shape}')

# Dense v2 refit (no-subs; SVD 250/120 + meta_v1 -> XGB)
print('Refit-on-full: Dense v2 (SVD 250/120 + meta) XGB')
t3 = time.time()
svd_w_v2, svd_c_v2 = TruncatedSVD(n_components=250, random_state=42), TruncatedSVD(n_components=120, random_state=42)
Zw2_tr = svd_w_v2.fit_transform(Xw_tr); Zw2_te = svd_w_v2.transform(Xw_te)
Zc2_tr = svd_c_v2.fit_transform(Xc_tr); Zc2_te = svd_c_v2.transform(Xc_te)
Xtr_dense_v2 = np.hstack([Zw2_tr, Zc2_tr, Mtr_s]).astype(np.float32)
Xte_dense_v2 = np.hstack([Zw2_te, Zc2_te, Mte_s]).astype(np.float32)
sc_v2 = StandardScaler(with_mean=True, with_std=True)
Xtr_v2 = sc_v2.fit_transform(Xtr_dense_v2); Xte_v2 = sc_v2.transform(Xte_dense_v2)
pos = float((y==1).sum()); neg = float((y==0).sum()); spw2 = (neg/max(pos,1.0)) if pos>0 else 1.0
params_v2 = dict(objective='binary:logistic', eval_metric='auc', max_depth=3, eta=0.03, min_child_weight=10, subsample=0.75, colsample_bytree=0.7, reg_alpha=1.5, reg_lambda=5.0, gamma=0.2, device='cuda', tree_method='hist', seed=42, scale_pos_weight=spw2)
nrounds_v2 = 120
dtr_v2 = xgb.DMatrix(Xtr_v2, label=y); dte_v2 = xgb.DMatrix(Xte_v2)
booster_v2 = xgb.train(params_v2, dtr_v2, num_boost_round=nrounds_v2, verbose_eval=False)
p_te_d2 = booster_v2.predict(dte_v2).astype(np.float32)
print(f'Dense v2 trained in {time.time()-t3:.1f}s | nrounds={nrounds_v2} | spw={spw2:.2f} | shapes tr:{Xtr_v2.shape}')

# Meta-only XGB refit
print('Refit-on-full: Meta-only XGB')
t4 = time.time()
sc_meta = StandardScaler(with_mean=True, with_std=True)
Xtr_meta = sc_meta.fit_transform(meta_tr_v1).astype(np.float32)
Xte_meta = sc_meta.transform(meta_te_v1).astype(np.float32)
pos = float((y==1).sum()); neg = float((y==0).sum()); spw3 = (neg/max(pos,1.0)) if pos>0 else 1.0
params_meta = dict(objective='binary:logistic', eval_metric='auc', max_depth=3, eta=0.05, min_child_weight=8, subsample=0.8, colsample_bytree=0.8, reg_alpha=0.5, reg_lambda=3.0, gamma=0.0, device='cuda', tree_method='hist', seed=42, scale_pos_weight=spw3)
nrounds_meta = 80
dtr_m = xgb.DMatrix(Xtr_meta, label=y); dte_m = xgb.DMatrix(Xte_meta)
booster_m = xgb.train(params_meta, dtr_m, num_boost_round=nrounds_meta, verbose_eval=False)
p_te_meta = booster_m.predict(dte_m).astype(np.float32)
print(f'Meta-only trained in {time.time()-t4:.1f}s | nrounds={nrounds_meta} | spw={spw3:.2f} | feats:{Xtr_meta.shape[1]}')

# Apply S14b best weights to refit-on-full predictions
print('Blending refit-on-full predictions with S14b weights...')
z_lr_w = to_logit(p_te_lr_with); z_lr_ns = to_logit(p_te_lr_ns)
z_d1 = to_logit(p_te_d1); z_d2 = to_logit(p_te_d2); z_meta = to_logit(p_te_meta)
g = 0.90
z_lr_mix = (1.0 - g)*z_lr_w + g*z_lr_ns
w_lr, w_d1, w_d2, w_meta = 0.33, 0.36, 0.09, 0.22
z_blend = w_lr*z_lr_mix + w_d1*z_d1 + w_d2*z_d2 + w_meta*z_meta
p_final = sigmoid(z_blend).astype(np.float32)
sub = pd.DataFrame({id_col: ids, target_col: p_final})
sub.to_csv('submission.csv', index=False)
sub.to_csv('submission_time_blend_constrained_wide_refit.csv', index=False)
print('Saved submission.csv (refit-on-full S14b blend). Head:')
print(sub.head())

# Cache refit test preds for record
np.save('test_refit_lr_withsub_meta.npy', p_te_lr_with)
np.save('test_refit_lr_nosub_meta.npy', p_te_lr_ns)
np.save('test_refit_xgb_dense_v1.npy', p_te_d1)
np.save('test_refit_xgb_dense_v2.npy', p_te_d2)
np.save('test_refit_xgb_meta.npy', p_te_meta)
print('Refit-on-full artifacts saved.')

  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


Refit-on-full: LR_withsub_meta


LR_withsub_meta trained in 53.9s | shapes tr:(2878, 54043) te:(1162, 54043)
Refit-on-full: LR_nosub_meta


LR_nosub_meta trained in 49.7s | shapes tr:(2878, 49515) te:(1162, 49515)
Refit-on-full: Dense v1 (SVD 150/150/50 + meta) XGB


Dense v1 trained in 7.2s | nrounds=200 | spw=3.03 | shapes tr:(2878, 364)
Refit-on-full: Dense v2 (SVD 250/120 + meta) XGB


Dense v2 trained in 7.4s | nrounds=120 | spw=3.03 | shapes tr:(2878, 384)
Refit-on-full: Meta-only XGB
Meta-only trained in 0.1s | nrounds=80 | spw=3.03 | feats:14
Blending refit-on-full predictions with S14b weights...
Saved submission.csv (refit-on-full S14b blend). Head:
  request_id  requester_received_pizza
0  t3_1aw5zf                  0.506584
1   t3_roiuw                  0.436435
2   t3_mjnbq                  0.373261
3   t3_t8wd1                  0.372974
4  t3_1m4zxu                  0.387110
Refit-on-full artifacts saved.


In [36]:
# Hedge submissions from refit-on-full predictions: 15% shrunk logit blend and equal-prob 5-base
import numpy as np, pandas as pd

id_col = 'request_id'; target_col = 'requester_received_pizza'
test = pd.read_json('test.json')
ids = test[id_col].values

def to_logit(p, eps=1e-6):
    p = np.clip(p.astype(np.float64), eps, 1.0 - eps)
    return np.log(p / (1.0 - p))
def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

# Load refit-on-full test predictions
p_lr_w = np.load('test_refit_lr_withsub_meta.npy')
p_lr_ns = np.load('test_refit_lr_nosub_meta.npy')
p_d1 = np.load('test_refit_xgb_dense_v1.npy')
p_d2 = np.load('test_refit_xgb_dense_v2.npy')
p_meta = np.load('test_refit_xgb_meta.npy')

# Primary weights (S14b) for reference
g = 0.90
w_vec = np.array([0.33, 0.36, 0.09, 0.22], dtype=np.float64)  # (LRmix, D1, D2, Meta)

# Hedge 1: 15% shrink toward equal on logit components
z_lr_w = to_logit(p_lr_w); z_lr_ns = to_logit(p_lr_ns)
z_lr_mix = (1.0 - g)*z_lr_w + g*z_lr_ns
z_d1 = to_logit(p_d1); z_d2 = to_logit(p_d2); z_meta = to_logit(p_meta)
w_eq = np.ones_like(w_vec) / 4.0
alpha = 0.15
w_shrunk = (1.0 - alpha)*w_vec + alpha*w_eq
w_shrunk = (w_shrunk / w_shrunk.sum()).astype(np.float64)
z_shrunk = w_shrunk[0]*z_lr_mix + w_shrunk[1]*z_d1 + w_shrunk[2]*z_d2 + w_shrunk[3]*z_meta
p_shrunk = sigmoid(z_shrunk).astype(np.float32)
pd.DataFrame({id_col: ids, target_col: p_shrunk}).to_csv('submission_time_blend_constrained_wide_refit_shrunk.csv', index=False)

# Hedge 2: equal-probability average over all 5 bases with clipping
p_eq5 = np.clip((p_lr_w + p_lr_ns + p_d1 + p_d2 + p_meta) / 5.0, 0.01, 0.99).astype(np.float32)
pd.DataFrame({id_col: ids, target_col: p_eq5}).to_csv('submission_time_equal5_refit.csv', index=False)

print('Wrote hedges:')
print(' - submission_time_blend_constrained_wide_refit_shrunk.csv')
print(' - submission_time_equal5_refit.csv')
print('Primary submission.csv remains the S14b refit blend.')

Wrote hedges:
 - submission_time_blend_constrained_wide_refit_shrunk.csv
 - submission_time_equal5_refit.csv
Primary submission.csv remains the S14b refit blend.


In [37]:
# Refit-on-full XGB bagging (3 seeds) for Dense v1, Dense v2, Meta; blend with S14b weights and promote to submission.csv
import numpy as np, pandas as pd, time, gc
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from scipy.sparse import csr_matrix, hstack
import xgboost as xgb

id_col = 'request_id'; target_col = 'requester_received_pizza'
train = pd.read_json('train.json')
test = pd.read_json('test.json')
y = train[target_col].astype(int).values
ids = test[id_col].values

def get_title(df):
    return df.get('request_title', pd.Series(['']*len(df))).fillna('').astype(str)
def get_body(df):
    return df.get('request_text_edit_aware', df.get('request_text', pd.Series(['']*len(df)))).fillna('').astype(str)
def combine_text(df):
    return (get_title(df) + ' \n ' + get_body(df)).astype(str)
def clean_text_series(s):
    s = s.str.lower()
    s = s.str.replace(r'https?://\S+', ' url ', regex=True)
    s = s.str.replace(r'\d+', ' number ', regex=True)
    s = s.str.replace(r'\s+', ' ', regex=True)
    return s
def build_subs(df):
    if 'requester_subreddits_at_request' not in df.columns:
        return pd.Series(['']*len(df))
    sr = df['requester_subreddits_at_request']
    return sr.apply(lambda x: ' '.join([str(s).lower() for s in x]) if isinstance(x, (list, tuple)) else '')
def build_meta_v1(df):
    title = get_title(df); body = get_body(df)
    out = pd.DataFrame(index=df.index)
    out['title_len'] = title.str.len().astype(np.float32)
    out['body_len'] = body.str.len().astype(np.float32)
    out['title_body_ratio'] = (out['title_len'] / (1.0 + out['body_len'])).astype(np.float32)
    out['has_url'] = body.str.contains(r'https?://', regex=True).astype(np.float32)
    out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
    if 'unix_timestamp_of_request' in df.columns:
        dt = pd.to_datetime(df['unix_timestamp_of_request'], unit='s', utc=True, errors='coerce')
    else:
        dt = pd.to_datetime(0, unit='s', utc=True) + pd.to_timedelta(np.zeros(len(df)), unit='s')
    hour = dt.dt.hour.fillna(0).astype(np.float32)
    out['hour'] = hour
    out['dayofweek'] = dt.dt.dayofweek.fillna(0).astype(np.float32)
    out['is_weekend'] = out['dayofweek'].isin([5,6]).astype(np.float32)
    out['hour_sin'] = np.sin(2*np.pi*hour/24.0).astype(np.float32)
    out['hour_cos'] = np.cos(2*np.pi*hour/24.0).astype(np.float32)
    for c in [
        'requester_upvotes_minus_downvotes_at_request',
        'requester_upvotes_plus_downvotes_at_request',
        'requester_number_of_comments_at_request',
        'requester_number_of_posts_at_request'
    ]:
        if c in df.columns:
            out[c] = pd.to_numeric(df[c], errors='coerce').astype(np.float32)
        else:
            out[c] = 0.0
    for c in ['title_len','body_len','title_body_ratio',
              'requester_upvotes_minus_downvotes_at_request',
              'requester_upvotes_plus_downvotes_at_request',
              'requester_number_of_comments_at_request',
              'requester_number_of_posts_at_request']:
        if c in out.columns:
            out[c] = np.log1p(out[c].clip(lower=0)).astype(np.float32)
    out = out.replace([np.inf,-np.inf], 0).fillna(0).astype(np.float32)
    return out

def to_logit(p, eps=1e-6):
    p = np.clip(p.astype(np.float64), eps, 1.0 - eps)
    return np.log(p / (1.0 - p))
def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

# Prepare text/meta
txt_tr = clean_text_series(combine_text(train)); txt_te = clean_text_series(combine_text(test))
subs_tr = build_subs(train); subs_te = build_subs(test)
meta_tr = build_meta_v1(train).astype(np.float32).values
meta_te = build_meta_v1(test).astype(np.float32).values

word_params = dict(analyzer='word', ngram_range=(1,2), lowercase=True, min_df=3, max_features=60000, sublinear_tf=True, smooth_idf=True, norm='l2')
char_params = dict(analyzer='char_wb', ngram_range=(3,5), lowercase=True, min_df=3, max_features=60000, sublinear_tf=True, smooth_idf=True, norm='l2')
subs_params = dict(analyzer='word', ngram_range=(1,2), lowercase=True, min_df=3, max_features=20000, sublinear_tf=True, smooth_idf=True, norm='l2')

print('Vectorizing (word/char/subs)...')
t0 = time.time()
tfidf_w = TfidfVectorizer(**word_params); Xw_tr = tfidf_w.fit_transform(txt_tr); Xw_te = tfidf_w.transform(txt_te)
tfidf_c = TfidfVectorizer(**char_params); Xc_tr = tfidf_c.fit_transform(txt_tr); Xc_te = tfidf_c.transform(txt_te)
tfidf_s = TfidfVectorizer(**subs_params); Xs_tr = tfidf_s.fit_transform(subs_tr); Xs_te = tfidf_s.transform(subs_te)
print(f'TF-IDF done in {time.time()-t0:.1f}s')

# Build Dense v1 features (SVD 150/150/50 + meta)
svd_w_v1, svd_c_v1, svd_s_v1 = TruncatedSVD(n_components=150, random_state=42), TruncatedSVD(n_components=150, random_state=42), TruncatedSVD(n_components=50, random_state=42)
Zw_tr = svd_w_v1.fit_transform(Xw_tr); Zw_te = svd_w_v1.transform(Xw_te)
Zc_tr = svd_c_v1.fit_transform(Xc_tr); Zc_te = svd_c_v1.transform(Xc_te)
Zs_tr = svd_s_v1.fit_transform(Xs_tr); Zs_te = svd_s_v1.transform(Xs_te)
sc_meta = StandardScaler(with_mean=True, with_std=True)
Mtr_s = sc_meta.fit_transform(meta_tr).astype(np.float32); Mte_s = sc_meta.transform(meta_te).astype(np.float32)
Xtr_v1 = StandardScaler(with_mean=True, with_std=True).fit_transform(np.hstack([Zw_tr, Zc_tr, Zs_tr, Mtr_s]).astype(np.float32))
Xte_v1 = StandardScaler(with_mean=True, with_std=True).fit(np.zeros((1, Xtr_v1.shape[1]), dtype=np.float32)).__class__(with_mean=True, with_std=True)
# Refit scaler properly for test using same stats as train for v1
sc_all_v1 = StandardScaler(with_mean=True, with_std=True)
Xtr_all_v1 = sc_all_v1.fit_transform(np.hstack([Zw_tr, Zc_tr, Zs_tr, Mtr_s]).astype(np.float32))
Xte_all_v1 = sc_all_v1.transform(np.hstack([Zw_te, Zc_te, Zs_te, Mte_s]).astype(np.float32))

# Build Dense v2 features (SVD 250/120 + meta, no subs)
svd_w_v2, svd_c_v2 = TruncatedSVD(n_components=250, random_state=42), TruncatedSVD(n_components=120, random_state=42)
Zw2_tr = svd_w_v2.fit_transform(Xw_tr); Zw2_te = svd_w_v2.transform(Xw_te)
Zc2_tr = svd_c_v2.fit_transform(Xc_tr); Zc2_te = svd_c_v2.transform(Xc_te)
sc_meta2 = StandardScaler(with_mean=True, with_std=True)
Mtr2_s = sc_meta2.fit_transform(meta_tr).astype(np.float32); Mte2_s = sc_meta2.transform(meta_te).astype(np.float32)
sc_all_v2 = StandardScaler(with_mean=True, with_std=True)
Xtr_all_v2 = sc_all_v2.fit_transform(np.hstack([Zw2_tr, Zc2_tr, Mtr2_s]).astype(np.float32))
Xte_all_v2 = sc_all_v2.transform(np.hstack([Zw2_te, Zc2_te, Mte2_s]).astype(np.float32))

# Meta-only features
sc_m = StandardScaler(with_mean=True, with_std=True)
Xtr_meta = sc_m.fit_transform(meta_tr).astype(np.float32)
Xte_meta = sc_m.transform(meta_te).astype(np.float32)

pos = float((y==1).sum()); neg = float((y==0).sum()); spw = (neg/max(pos,1.0)) if pos>0 else 1.0
params_v1 = dict(objective='binary:logistic', eval_metric='auc', max_depth=3, eta=0.035, subsample=0.8, colsample_bytree=0.6, min_child_weight=8, reg_alpha=1.0, reg_lambda=4.0, gamma=0.1, device='cuda', tree_method='hist', scale_pos_weight=spw)
params_v2 = dict(objective='binary:logistic', eval_metric='auc', max_depth=3, eta=0.03, min_child_weight=10, subsample=0.75, colsample_bytree=0.7, reg_alpha=1.5, reg_lambda=5.0, gamma=0.2, device='cuda', tree_method='hist', scale_pos_weight=spw)
params_meta = dict(objective='binary:logistic', eval_metric='auc', max_depth=3, eta=0.05, min_child_weight=8, subsample=0.8, colsample_bytree=0.8, reg_alpha=0.5, reg_lambda=3.0, gamma=0.0, device='cuda', tree_method='hist', scale_pos_weight=spw)
seeds = [42, 1337, 2025]
nrounds_v1, nrounds_v2, nrounds_meta = 200, 120, 80

print('Training XGB bag (3 seeds)...')
p_d1_seeds, p_d2_seeds, p_meta_seeds = [], [], []
dtr_v1 = xgb.DMatrix(Xtr_all_v1, label=y); dte_v1 = xgb.DMatrix(Xte_all_v1)
dtr_v2 = xgb.DMatrix(Xtr_all_v2, label=y); dte_v2 = xgb.DMatrix(Xte_all_v2)
dtr_m  = xgb.DMatrix(Xtr_meta, label=y);  dte_m  = xgb.DMatrix(Xte_meta)
t1 = time.time()
for si, sd in enumerate(seeds, 1):
    params1 = dict(params_v1); params1['seed'] = sd
    params2 = dict(params_v2); params2['seed'] = sd
    paramsm = dict(params_meta); paramsm['seed'] = sd
    bst1 = xgb.train(params1, dtr_v1, num_boost_round=nrounds_v1, verbose_eval=False)
    bst2 = xgb.train(params2, dtr_v2, num_boost_round=nrounds_v2, verbose_eval=False)
    bstm = xgb.train(paramsm, dtr_m,  num_boost_round=nrounds_meta, verbose_eval=False)
    p_d1_seeds.append(bst1.predict(dte_v1).astype(np.float32))
    p_d2_seeds.append(bst2.predict(dte_v2).astype(np.float32))
    p_meta_seeds.append(bstm.predict(dte_m).astype(np.float32))
    print(f'  Seed {sd} done')
print(f'XGB bagging done in {time.time()-t1:.1f}s')

p_te_d1_bag = np.mean(p_d1_seeds, axis=0).astype(np.float32)
p_te_d2_bag = np.mean(p_d2_seeds, axis=0).astype(np.float32)
p_te_meta_bag = np.mean(p_meta_seeds, axis=0).astype(np.float32)

# Load LR refit predictions from previous cell to avoid recomputation
p_te_lr_with = np.load('test_refit_lr_withsub_meta.npy')
p_te_lr_ns = np.load('test_refit_lr_nosub_meta.npy')

# Blend with exact S14b weights
g = 0.90
z_lr_mix = (1.0 - g)*to_logit(p_te_lr_with) + g*to_logit(p_te_lr_ns)
z_d1 = to_logit(p_te_d1_bag); z_d2 = to_logit(p_te_d2_bag); z_meta = to_logit(p_te_meta_bag)
w_lr, w_d1, w_d2, w_meta = 0.33, 0.36, 0.09, 0.22
z_blend = w_lr*z_lr_mix + w_d1*z_d1 + w_d2*z_d2 + w_meta*z_meta
p_final = sigmoid(z_blend).astype(np.float32)
sub = pd.DataFrame({id_col: ids, target_col: p_final})
sub.to_csv('submission_time_blend_constrained_wide_refit_bagged.csv', index=False)
sub.to_csv('submission.csv', index=False)
print('Promoted bagged refit S14b blend to submission.csv. Head:')
print(sub.head())

# Cache bagged components
np.save('test_refit_xgb_dense_v1_bag.npy', p_te_d1_bag)
np.save('test_refit_xgb_dense_v2_bag.npy', p_te_d2_bag)
np.save('test_refit_xgb_meta_bag.npy', p_te_meta_bag)
print('Saved bagged XGB prediction artifacts.')

  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


Vectorizing (word/char/subs)...


TF-IDF done in 3.0s


Training XGB bag (3 seeds)...


  Seed 42 done


  Seed 1337 done


  Seed 2025 done
XGB bagging done in 2.1s
Promoted bagged refit S14b blend to submission.csv. Head:
  request_id  requester_received_pizza
0  t3_1aw5zf                  0.495084
1   t3_roiuw                  0.437231
2   t3_mjnbq                  0.366789
3   t3_t8wd1                  0.381403
4  t3_1m4zxu                  0.382439
Saved bagged XGB prediction artifacts.


In [38]:
# S22: Build MiniLM sentence embeddings (title + body) and cache with meta_v1
import sys, subprocess, time, gc, numpy as np, pandas as pd

def ensure(pkg):
    try:
        __import__(pkg)
        return True
    except Exception:
        return False

print('Ensuring sentence-transformers and torch...')
if not ensure('torch'):
    subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', 'torch'], check=True)
if not ensure('sentence_transformers'):
    subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', 'sentence-transformers'], check=True)

from sentence_transformers import SentenceTransformer

id_col = 'request_id'; target_col = 'requester_received_pizza'
train = pd.read_json('train.json')
test = pd.read_json('test.json')

def get_title(df):
    return df.get('request_title', pd.Series(['']*len(df))).fillna('').astype(str)
def get_body(df):
    return df.get('request_text_edit_aware', df.get('request_text', pd.Series(['']*len(df)))).fillna('').astype(str)
def build_text(df):
    return (get_title(df) + '\n' + get_body(df)).astype(str)
def build_meta_v1(df):
    title = get_title(df); body = get_body(df)
    out = pd.DataFrame(index=df.index)
    out['title_len'] = title.str.len().astype(np.float32)
    out['body_len'] = body.str.len().astype(np.float32)
    out['title_body_ratio'] = (out['title_len'] / (1.0 + out['body_len'])).astype(np.float32)
    out['has_url'] = body.str.contains(r'https?://', regex=True).astype(np.float32)
    out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
    if 'unix_timestamp_of_request' in df.columns:
        dt = pd.to_datetime(df['unix_timestamp_of_request'], unit='s', utc=True, errors='coerce')
    else:
        dt = pd.to_datetime(0, unit='s', utc=True) + pd.to_timedelta(np.zeros(len(df)), unit='s')
    hour = dt.dt.hour.fillna(0).astype(np.float32)
    out['hour'] = hour
    out['dayofweek'] = dt.dt.dayofweek.fillna(0).astype(np.float32)
    out['is_weekend'] = out['dayofweek'].isin([5,6]).astype(np.float32)
    out['hour_sin'] = np.sin(2*np.pi*hour/24.0).astype(np.float32)
    out['hour_cos'] = np.cos(2*np.pi*hour/24.0).astype(np.float32)
    for c in [
        'requester_upvotes_minus_downvotes_at_request',
        'requester_upvotes_plus_downvotes_at_request',
        'requester_number_of_comments_at_request',
        'requester_number_of_posts_at_request'
    ]:
        if c in df.columns:
            out[c] = pd.to_numeric(df[c], errors='coerce').astype(np.float32)
        else:
            out[c] = 0.0
    for c in ['title_len','body_len','title_body_ratio',
              'requester_upvotes_minus_downvotes_at_request',
              'requester_upvotes_plus_downvotes_at_request',
              'requester_number_of_comments_at_request',
              'requester_number_of_posts_at_request']:
        if c in out.columns:
            out[c] = np.log1p(out[c].clip(lower=0)).astype(np.float32)
    out = out.replace([np.inf,-np.inf], 0).fillna(0).astype(np.float32)
    return out

texts_tr = build_text(train).tolist()
texts_te = build_text(test).tolist()
print(f'Texts prepared: train {len(texts_tr)}, test {len(texts_te)}')

model_name = 'sentence-transformers/all-MiniLM-L6-v2'
print(f'Loading SentenceTransformer: {model_name}')
model = SentenceTransformer(model_name, device='cuda')
print('Encoding embeddings on GPU...')
t0 = time.time()
emb_tr = model.encode(texts_tr, batch_size=256, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=True, device='cuda')
emb_te = model.encode(texts_te, batch_size=256, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=True, device='cuda')
print(f'Embeddings done in {time.time()-t0:.1f}s | shapes tr:{emb_tr.shape} te:{emb_te.shape}')

meta_tr = build_meta_v1(train).astype(np.float32).values
meta_te = build_meta_v1(test).astype(np.float32).values
print(f'Meta_v1 shapes tr:{meta_tr.shape} te:{meta_te.shape}')

np.save('emb_minilm_tr.npy', emb_tr.astype(np.float32))
np.save('emb_minilm_te.npy', emb_te.astype(np.float32))
np.save('meta_v1_tr.npy', meta_tr.astype(np.float32))
np.save('meta_v1_te.npy', meta_te.astype(np.float32))
print('Saved emb_minilm_tr.npy, emb_minilm_te.npy, meta_v1_tr.npy, meta_v1_te.npy')

del emb_tr, emb_te, meta_tr, meta_te, texts_tr, texts_te, model
gc.collect()
print('S22 complete.')

Ensuring sentence-transformers and torch...












  from .autonotebook import tqdm as notebook_tqdm


No sentence-transformers model found with name sentence-transformers/all-MiniLM-L6-v2. Creating a new one with mean pooling.


Texts prepared: train 2878, test 1162
Loading SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 30] Read-only file system: '/app/.cache'


OSError: [Errno 30] Read-only file system: '/app/.cache'

In [39]:
# S22 recovery: set HF caches to writable path and rebuild MiniLM embeddings
import os, time, gc, numpy as np, pandas as pd
from sentence_transformers import SentenceTransformer

# Route HF/transformers caches to local writable folder
cache_dir = os.path.abspath('hf_cache')
os.makedirs(cache_dir, exist_ok=True)
os.environ['HF_HOME'] = cache_dir
os.environ['HUGGINGFACE_HUB_CACHE'] = cache_dir
os.environ['TRANSFORMERS_CACHE'] = cache_dir
os.environ['SENTENCE_TRANSFORMERS_HOME'] = cache_dir

id_col = 'request_id'; target_col = 'requester_received_pizza'
train = pd.read_json('train.json')
test = pd.read_json('test.json')

def get_title(df):
    return df.get('request_title', pd.Series(['']*len(df))).fillna('').astype(str)
def get_body(df):
    return df.get('request_text_edit_aware', df.get('request_text', pd.Series(['']*len(df)))).fillna('').astype(str)
def build_text(df):
    return (get_title(df) + '\n' + get_body(df)).astype(str)
def build_meta_v1(df):
    title = get_title(df); body = get_body(df)
    out = pd.DataFrame(index=df.index)
    out['title_len'] = title.str.len().astype(np.float32)
    out['body_len'] = body.str.len().astype(np.float32)
    out['title_body_ratio'] = (out['title_len'] / (1.0 + out['body_len'])).astype(np.float32)
    out['has_url'] = body.str.contains(r'https?://', regex=True).astype(np.float32)
    out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
    if 'unix_timestamp_of_request' in df.columns:
        dt = pd.to_datetime(df['unix_timestamp_of_request'], unit='s', utc=True, errors='coerce')
    else:
        dt = pd.to_datetime(0, unit='s', utc=True) + pd.to_timedelta(np.zeros(len(df)), unit='s')
    hour = dt.dt.hour.fillna(0).astype(np.float32)
    out['hour'] = hour
    out['dayofweek'] = dt.dt.dayofweek.fillna(0).astype(np.float32)
    out['is_weekend'] = out['dayofweek'].isin([5,6]).astype(np.float32)
    out['hour_sin'] = np.sin(2*np.pi*hour/24.0).astype(np.float32)
    out['hour_cos'] = np.cos(2*np.pi*hour/24.0).astype(np.float32)
    for c in [
        'requester_upvotes_minus_downvotes_at_request',
        'requester_upvotes_plus_downvotes_at_request',
        'requester_number_of_comments_at_request',
        'requester_number_of_posts_at_request'
    ]:
        if c in df.columns:
            out[c] = pd.to_numeric(df[c], errors='coerce').astype(np.float32)
        else:
            out[c] = 0.0
    for c in ['title_len','body_len','title_body_ratio',
              'requester_upvotes_minus_downvotes_at_request',
              'requester_upvotes_plus_downvotes_at_request',
              'requester_number_of_comments_at_request',
              'requester_number_of_posts_at_request']:
        if c in out.columns:
            out[c] = np.log1p(out[c].clip(lower=0)).astype(np.float32)
    out = out.replace([np.inf,-np.inf], 0).fillna(0).astype(np.float32)
    return out

texts_tr = build_text(train).tolist()
texts_te = build_text(test).tolist()
print(f'Texts prepared: train {len(texts_tr)}, test {len(texts_te)}')

model_name = 'sentence-transformers/all-MiniLM-L6-v2'
print(f'Loading SentenceTransformer with cache_folder={cache_dir}')
model = SentenceTransformer(model_name, device='cuda', cache_folder=cache_dir)

print('Encoding on GPU...')
t0 = time.time()
emb_tr = model.encode(texts_tr, batch_size=256, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=True, device='cuda')
emb_te = model.encode(texts_te, batch_size=256, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=True, device='cuda')
print(f'Embeddings done in {time.time()-t0:.1f}s | shapes tr:{emb_tr.shape} te:{emb_te.shape}')

meta_tr = build_meta_v1(train).astype(np.float32).values
meta_te = build_meta_v1(test).astype(np.float32).values
np.save('emb_minilm_tr.npy', emb_tr.astype(np.float32))
np.save('emb_minilm_te.npy', emb_te.astype(np.float32))
np.save('meta_v1_tr.npy', meta_tr.astype(np.float32))
np.save('meta_v1_te.npy', meta_te.astype(np.float32))
print('Saved emb_minilm_tr.npy, emb_minilm_te.npy, meta_v1_tr.npy, meta_v1_te.npy')
del emb_tr, emb_te, meta_tr, meta_te, texts_tr, texts_te, model
gc.collect()
print('S22 recovery complete.')

Texts prepared: train 2878, test 1162
Loading SentenceTransformer with cache_folder=/app/agent_run_states/random-acts-of-pizza-spray-20250912-053053/hf_cache


Encoding on GPU...


Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Batches:   8%|▊         | 1/12 [00:01<00:11,  1.07s/it]

Batches:  17%|█▋        | 2/12 [00:01<00:07,  1.32it/s]

Batches:  25%|██▌       | 3/12 [00:02<00:05,  1.54it/s]

Batches:  33%|███▎      | 4/12 [00:02<00:04,  1.91it/s]

Batches:  42%|████▏     | 5/12 [00:02<00:03,  2.23it/s]

Batches:  50%|█████     | 6/12 [00:03<00:02,  2.67it/s]

Batches:  58%|█████▊    | 7/12 [00:03<00:01,  2.98it/s]

Batches:  67%|██████▋   | 8/12 [00:03<00:01,  3.37it/s]

Batches:  75%|███████▌  | 9/12 [00:03<00:00,  3.77it/s]

Batches:  83%|████████▎ | 10/12 [00:03<00:00,  4.50it/s]

Batches:  92%|█████████▏| 11/12 [00:03<00:00,  4.94it/s]

Batches: 100%|██████████| 12/12 [00:03<00:00,  3.01it/s]




Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:  20%|██        | 1/5 [00:00<00:02,  1.84it/s]

Batches:  40%|████      | 2/5 [00:00<00:01,  2.06it/s]

Batches:  60%|██████    | 3/5 [00:01<00:00,  2.46it/s]

Batches:  80%|████████  | 4/5 [00:01<00:00,  3.19it/s]

Batches: 100%|██████████| 5/5 [00:01<00:00,  3.26it/s]


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


Embeddings done in 5.6s | shapes tr:(2878, 384) te:(1162, 384)
Saved emb_minilm_tr.npy, emb_minilm_te.npy, meta_v1_tr.npy, meta_v1_te.npy


S22 recovery complete.


In [40]:
# S23: Time-aware XGB on MiniLM embeddings + meta_v1 (6-block CV, 3-seed bag); cache OOF/test
import numpy as np, pandas as pd, time, gc
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import xgboost as xgb

id_col = 'request_id'; target_col = 'requester_received_pizza'
train = pd.read_json('train.json')
test = pd.read_json('test.json')
y = train[target_col].astype(int).values

# Load cached embeddings and meta_v1
Emb_tr = np.load('emb_minilm_tr.npy').astype(np.float32)
Emb_te = np.load('emb_minilm_te.npy').astype(np.float32)
Meta_tr = np.load('meta_v1_tr.npy').astype(np.float32)
Meta_te = np.load('meta_v1_te.npy').astype(np.float32)
print('Loaded features:', Emb_tr.shape, Emb_te.shape, Meta_tr.shape, Meta_te.shape)

# 6-block forward-chaining folds and mask
order = np.argsort(train['unix_timestamp_of_request'].values)
n = len(train); k = 6
blocks = np.array_split(order, k)
folds = []; mask = np.zeros(n, dtype=bool)
for i in range(1, k):
    va_idx = np.array(blocks[i]); tr_idx = np.concatenate(blocks[:i])
    folds.append((tr_idx, va_idx)); mask[va_idx] = True
print(f'Emb+Meta Time-CV: {len(folds)} folds; validated {mask.sum()}/{n}')

# XGB params (per expert)
base_params = dict(
    objective='binary:logistic',
    eval_metric='auc',
    max_depth=3,
    eta=0.05,
    subsample=0.8,
    colsample_bytree=0.6,
    min_child_weight=8,
    reg_alpha=0.5,
    reg_lambda=3.0,
    gamma=0.0,
    device='cuda',
    tree_method='hist'
)
num_boost_round = 8000
early_stopping_rounds = 100
seeds = [42, 1337, 2025]

oof_sum = np.zeros(n, dtype=np.float64)
oof_cnt = np.zeros(n, dtype=np.float64)
test_seed_preds = []

for si, seed in enumerate(seeds, 1):
    print(f'=== Emb+Meta Seed {seed} ({si}/{len(seeds)}) ===')
    params = dict(base_params); params['seed'] = seed
    oof_seed = np.zeros(n, dtype=np.float32)
    test_folds = []
    for fi, (tr_idx, va_idx) in enumerate(folds, 1):
        t0 = time.time()
        Xtr_raw = np.hstack([Emb_tr[tr_idx], Meta_tr[tr_idx]]).astype(np.float32)
        Xva_raw = np.hstack([Emb_tr[va_idx], Meta_tr[va_idx]]).astype(np.float32)
        Xte_raw = np.hstack([Emb_te, Meta_te]).astype(np.float32)
        scaler = StandardScaler(with_mean=True, with_std=True)
        Xtr = scaler.fit_transform(Xtr_raw).astype(np.float32)
        Xva = scaler.transform(Xva_raw).astype(np.float32)
        Xte = scaler.transform(Xte_raw).astype(np.float32)
        pos = float((y[tr_idx] == 1).sum()); neg = float((y[tr_idx] == 0).sum())
        params['scale_pos_weight'] = (neg / max(pos, 1.0)) if pos > 0 else 1.0
        dtr = xgb.DMatrix(Xtr, label=y[tr_idx])
        dva = xgb.DMatrix(Xva, label=y[va_idx])
        dte = xgb.DMatrix(Xte)
        booster = xgb.train(params, dtr, num_boost_round=num_boost_round, evals=[(dva, 'valid')], early_stopping_rounds=early_stopping_rounds, verbose_eval=False)
        va_pred = booster.predict(dva, iteration_range=(0, booster.best_iteration+1)).astype(np.float32)
        te_pred = booster.predict(dte, iteration_range=(0, booster.best_iteration+1)).astype(np.float32)
        oof_seed[va_idx] = va_pred; test_folds.append(te_pred)
        auc = roc_auc_score(y[va_idx], va_pred)
        print(f'[Emb Seed {seed} Fold {fi}] best_iter={booster.best_iteration} | spw={params["scale_pos_weight"]:.2f} | AUC: {auc:.5f} | {time.time()-t0:.1f}s | shapes tr:{Xtr.shape}')
        del Xtr_raw, Xva_raw, Xte_raw, Xtr, Xva, Xte, dtr, dva, dte, booster, scaler; gc.collect()
    seed_auc = roc_auc_score(y[mask], oof_seed[mask])
    print(f'[Emb Seed {seed}] OOF AUC (validated only): {seed_auc:.5f}')
    oof_sum[mask] += oof_seed[mask]; oof_cnt[mask] += 1.0
    test_seed_preds.append(np.mean(test_folds, axis=0).astype(np.float64))
    del oof_seed, test_folds; gc.collect()

oof_avg = np.zeros(n, dtype=np.float32)
oof_avg[mask] = (oof_sum[mask] / np.maximum(oof_cnt[mask], 1.0)).astype(np.float32)
test_avg = np.mean(test_seed_preds, axis=0).astype(np.float32)
auc_oof = roc_auc_score(y[mask], oof_avg[mask])
print(f'Emb+Meta Time-CV OOF AUC (validated only, 3-seed avg): {auc_oof:.5f}')
np.save('oof_xgb_emb_meta_time.npy', oof_avg.astype(np.float32))
np.save('test_xgb_emb_meta_time.npy', test_avg)
print('Saved oof_xgb_emb_meta_time.npy and test_xgb_emb_meta_time.npy')

Loaded features: (2878, 384) (1162, 384) (2878, 14) (1162, 14)
Emb+Meta Time-CV: 5 folds; validated 2398/2878
=== Emb+Meta Seed 42 (1/3) ===


[Emb Seed 42 Fold 1] best_iter=68 | spw=1.94 | AUC: 0.64868 | 0.5s | shapes tr:(480, 398)


[Emb Seed 42 Fold 2] best_iter=41 | spw=2.33 | AUC: 0.70075 | 0.3s | shapes tr:(960, 398)


[Emb Seed 42 Fold 3] best_iter=12 | spw=2.49 | AUC: 0.60800 | 0.3s | shapes tr:(1440, 398)


[Emb Seed 42 Fold 4] best_iter=30 | spw=2.79 | AUC: 0.64451 | 0.3s | shapes tr:(1920, 398)


[Emb Seed 42 Fold 5] best_iter=103 | spw=2.83 | AUC: 0.64119 | 0.5s | shapes tr:(2399, 398)
[Emb Seed 42] OOF AUC (validated only): 0.64339


=== Emb+Meta Seed 1337 (2/3) ===


[Emb Seed 1337 Fold 1] best_iter=240 | spw=1.94 | AUC: 0.64989 | 0.7s | shapes tr:(480, 398)


[Emb Seed 1337 Fold 2] best_iter=16 | spw=2.33 | AUC: 0.68545 | 0.3s | shapes tr:(960, 398)


[Emb Seed 1337 Fold 3] best_iter=8 | spw=2.49 | AUC: 0.62842 | 0.3s | shapes tr:(1440, 398)


[Emb Seed 1337 Fold 4] best_iter=8 | spw=2.79 | AUC: 0.64566 | 0.3s | shapes tr:(1920, 398)


[Emb Seed 1337 Fold 5] best_iter=77 | spw=2.83 | AUC: 0.63445 | 0.4s | shapes tr:(2399, 398)
[Emb Seed 1337] OOF AUC (validated only): 0.62448


=== Emb+Meta Seed 2025 (3/3) ===


[Emb Seed 2025 Fold 1] best_iter=260 | spw=1.94 | AUC: 0.65203 | 0.7s | shapes tr:(480, 398)


[Emb Seed 2025 Fold 2] best_iter=79 | spw=2.33 | AUC: 0.69104 | 0.4s | shapes tr:(960, 398)


[Emb Seed 2025 Fold 3] best_iter=4 | spw=2.49 | AUC: 0.59918 | 0.3s | shapes tr:(1440, 398)


[Emb Seed 2025 Fold 4] best_iter=59 | spw=2.79 | AUC: 0.63145 | 0.4s | shapes tr:(1920, 398)


[Emb Seed 2025 Fold 5] best_iter=175 | spw=2.83 | AUC: 0.65415 | 0.7s | shapes tr:(2399, 398)
[Emb Seed 2025] OOF AUC (validated only): 0.63028


Emb+Meta Time-CV OOF AUC (validated only, 3-seed avg): 0.64020
Saved oof_xgb_emb_meta_time.npy and test_xgb_emb_meta_time.npy


In [41]:
# S24: Constrained time-consistent logit blend adding Emb+Meta base (6th model); write primary + hedges
import numpy as np, pandas as pd, time
from sklearn.metrics import roc_auc_score

id_col = 'request_id'; target_col = 'requester_received_pizza'
train = pd.read_json('train.json')
test = pd.read_json('test.json')
y = train[target_col].astype(int).values
ids = test[id_col].values

def to_logit(p, eps=1e-6):
    p = np.clip(p.astype(np.float64), eps, 1.0 - eps)
    return np.log(p / (1.0 - p))
def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

# 6-block forward-chaining mask
order = np.argsort(train['unix_timestamp_of_request'].values)
n = len(train); k = 6
blocks = np.array_split(order, k)
mask = np.zeros(n, dtype=bool)
for i in range(1, k):
    mask[np.array(blocks[i])] = True
print(f'Time-CV (6 blocks) validated count: {mask.sum()}/{n}')

# Load bases (time-consistent OOF/test)
o_lr_w = np.load('oof_lr_time_withsub_meta.npy'); t_lr_w = np.load('test_lr_time_withsub_meta.npy')
o_lr_ns = np.load('oof_lr_time_nosub_meta.npy'); t_lr_ns = np.load('test_lr_time_nosub_meta.npy')
o_d1 = np.load('oof_xgb_dense_time.npy'); t_d1 = np.load('test_xgb_dense_time.npy')
o_d2 = np.load('oof_xgb_dense_time_v2.npy'); t_d2 = np.load('test_xgb_dense_time_v2.npy')
o_meta = np.load('oof_xgb_meta_time.npy'); t_meta = np.load('test_xgb_meta_time.npy')
o_emb = np.load('oof_xgb_emb_meta_time.npy'); t_emb = np.load('test_xgb_emb_meta_time.npy')

# Convert to logits
z_lr_w, z_lr_ns = to_logit(o_lr_w), to_logit(o_lr_ns)
z_d1, z_d2, z_meta, z_emb = to_logit(o_d1), to_logit(o_d2), to_logit(o_meta), to_logit(o_emb)
tz_lr_w, tz_lr_ns = to_logit(t_lr_w), to_logit(t_lr_ns)
tz_d1, tz_d2, tz_meta, tz_emb = to_logit(t_d1), to_logit(t_d2), to_logit(t_meta), to_logit(t_emb)

# Grid per expert guidance
g_grid = [0.85, 0.90, 0.95]
meta_grid = [0.18, 0.20, 0.22]
dense_tot_grid = [0.30, 0.35, 0.40, 0.45]
alpha_grid = [0.65, 0.80]  # split of dense_total -> favor v1
emb_grid = [0.05, 0.08, 0.10, 0.12, 0.15, 0.18, 0.20]

best_auc, best_cfg = -1.0, None
tried = 0
for g in g_grid:
    z_lr_mix = (1.0 - g)*z_lr_w + g*z_lr_ns
    tz_lr_mix = (1.0 - g)*tz_lr_w + g*tz_lr_ns
    for w_emb in emb_grid:
        rem1 = 1.0 - w_emb
        for meta_w in meta_grid:
            for d_tot in dense_tot_grid:
                w_lr = rem1 - meta_w - d_tot
                if w_lr <= 0 or w_lr >= 1:
                    continue
                for a in alpha_grid:
                    w_d2 = d_tot * a
                    w_d1 = d_tot - w_d2
                    if w_d1 < 0 or w_d2 < 0:
                        continue
                    z_oof = w_lr*z_lr_mix + w_d1*z_d1 + w_d2*z_d2 + meta_w*z_meta + w_emb*z_emb
                    auc = roc_auc_score(y[mask], z_oof[mask])
                    tried += 1
                    if auc > best_auc:
                        best_auc = auc
                        best_cfg = dict(g=float(g), w_emb=float(w_emb), w_lr=float(w_lr), w_d1=float(w_d1), w_d2=float(w_d2), w_meta=float(meta_w), tz_lr_mix=tz_lr_mix)
cfg_print = {k: v for k, v in best_cfg.items() if k != 'tz_lr_mix'} if best_cfg is not None else {}
print(f'6-way (with Emb) grid tried {tried} | Best OOF(z,time-mask) AUC: {best_auc:.5f} | cfg={cfg_print}')

# Build primary test prediction
g = best_cfg['g']; w_emb = best_cfg['w_emb']; w_lr = best_cfg['w_lr']; w_d1 = best_cfg['w_d1']; w_d2 = best_cfg['w_d2']; w_meta = best_cfg['w_meta']
tz_lr_mix = best_cfg['tz_lr_mix']
zt_best = w_lr*tz_lr_mix + w_d1*tz_d1 + w_d2*tz_d2 + w_meta*tz_meta + w_emb*tz_emb
pt_best = sigmoid(zt_best).astype(np.float32)
sub = pd.DataFrame({id_col: ids, target_col: pt_best})
sub.to_csv('submission_time_blend_with_emb.csv', index=False)
sub.to_csv('submission.csv', index=False)
print('Promoted submission_time_blend_with_emb.csv to submission.csv. Head:')
print(sub.head())

# Hedge 1: 15% shrink toward equal on the 5 logit components (LRmix, D1, D2, Meta, Emb)
w_vec = np.array([w_lr, w_d1, w_d2, w_meta, w_emb], dtype=np.float64)
w_eq = np.ones_like(w_vec) / len(w_vec)
alpha = 0.15
w_shr = (1.0 - alpha)*w_vec + alpha*w_eq
w_shr = (w_shr / w_shr.sum()).astype(np.float64)
zt_shr = w_shr[0]*tz_lr_mix + w_shr[1]*tz_d1 + w_shr[2]*tz_d2 + w_shr[3]*tz_meta + w_shr[4]*tz_emb
pt_shr = sigmoid(zt_shr).astype(np.float32)
pd.DataFrame({id_col: ids, target_col: pt_shr}).to_csv('submission_time_blend_with_emb_shrunk.csv', index=False)

# Hedge 2: equal-probability average over all 6 bases in prob space with clipping
p_eq6 = np.clip((sigmoid(tz_lr_w) + sigmoid(tz_lr_ns) + t_d1 + t_d2 + t_meta + t_emb) / 6.0, 0.01, 0.99).astype(np.float32)
pd.DataFrame({id_col: ids, target_col: p_eq6}).to_csv('submission_time_equal6_prob.csv', index=False)
print('Wrote hedges: submission_time_blend_with_emb_shrunk.csv and submission_time_equal6_prob.csv')

Time-CV (6 blocks) validated count: 2398/2878


6-way (with Emb) grid tried 504 | Best OOF(z,time-mask) AUC: 0.67750 | cfg={'g': 0.95, 'w_emb': 0.2, 'w_lr': 0.2800000000000001, 'w_d1': 0.10499999999999998, 'w_d2': 0.195, 'w_meta': 0.22}
Promoted submission_time_blend_with_emb.csv to submission.csv. Head:
  request_id  requester_received_pizza
0  t3_1aw5zf                  0.517004
1   t3_roiuw                  0.526068
2   t3_mjnbq                  0.432309
3   t3_t8wd1                  0.434144
4  t3_1m4zxu                  0.516772
Wrote hedges: submission_time_blend_with_emb_shrunk.csv and submission_time_equal6_prob.csv


In [42]:
# S25: Refit-on-full 5-seed bag for XGB bases (Dense v1/v2, Meta, Emb+Meta) and 6-way blend using S24 best weights
import numpy as np, pandas as pd, time, gc
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack
import xgboost as xgb

id_col = 'request_id'; target_col = 'requester_received_pizza'
train = pd.read_json('train.json')
test = pd.read_json('test.json')
y = train[target_col].astype(int).values
ids = test[id_col].values

def get_title(df):
    return df.get('request_title', pd.Series(['']*len(df))).fillna('').astype(str)
def get_body(df):
    return df.get('request_text_edit_aware', df.get('request_text', pd.Series(['']*len(df)))).fillna('').astype(str)
def combine_text(df):
    return (get_title(df) + ' \n ' + get_body(df)).astype(str)
def clean_text_series(s):
    s = s.str.lower()
    s = s.str.replace(r'https?://\S+', ' url ', regex=True)
    s = s.str.replace(r'\d+', ' number ', regex=True)
    s = s.str.replace(r'\s+', ' ', regex=True)
    return s
def build_subs(df):
    if 'requester_subreddits_at_request' not in df.columns:
        return pd.Series(['']*len(df))
    sr = df['requester_subreddits_at_request']
    return sr.apply(lambda x: ' '.join([str(s).lower() for s in x]) if isinstance(x, (list, tuple)) else '')
def build_meta_v1(df):
    title = get_title(df); body = get_body(df)
    out = pd.DataFrame(index=df.index)
    out['title_len'] = title.str.len().astype(np.float32)
    out['body_len'] = body.str.len().astype(np.float32)
    out['title_body_ratio'] = (out['title_len'] / (1.0 + out['body_len'])).astype(np.float32)
    out['has_url'] = body.str.contains(r'https?://', regex=True).astype(np.float32)
    out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
    if 'unix_timestamp_of_request' in df.columns:
        dt = pd.to_datetime(df['unix_timestamp_of_request'], unit='s', utc=True, errors='coerce')
    else:
        dt = pd.to_datetime(0, unit='s', utc=True) + pd.to_timedelta(np.zeros(len(df)), unit='s')
    hour = dt.dt.hour.fillna(0).astype(np.float32)
    out['hour'] = hour
    out['dayofweek'] = dt.dt.dayofweek.fillna(0).astype(np.float32)
    out['is_weekend'] = out['dayofweek'].isin([5,6]).astype(np.float32)
    out['hour_sin'] = np.sin(2*np.pi*hour/24.0).astype(np.float32)
    out['hour_cos'] = np.cos(2*np.pi*hour/24.0).astype(np.float32)
    for c in [
        'requester_upvotes_minus_downvotes_at_request',
        'requester_upvotes_plus_downvotes_at_request',
        'requester_number_of_comments_at_request',
        'requester_number_of_posts_at_request'
    ]:
        if c in df.columns:
            out[c] = pd.to_numeric(df[c], errors='coerce').astype(np.float32)
        else:
            out[c] = 0.0
    for c in ['title_len','body_len','title_body_ratio',
              'requester_upvotes_minus_downvotes_at_request',
              'requester_upvotes_plus_downvotes_at_request',
              'requester_number_of_comments_at_request',
              'requester_number_of_posts_at_request']:
        if c in out.columns:
            out[c] = np.log1p(out[c].clip(lower=0)).astype(np.float32)
    out = out.replace([np.inf,-np.inf], 0).fillna(0).astype(np.float32)
    return out

def to_logit(p, eps=1e-6):
    p = np.clip(p.astype(np.float64), eps, 1.0 - eps)
    return np.log(p / (1.0 - p))
def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

# Seeds and rounds per expert
seeds = [42, 1337, 2025, 614, 2718]
nrounds_v1, nrounds_v2, nrounds_meta, nrounds_emb = 200, 120, 80, 150

# Refit Dense v1/v2 + Meta XGB on full train with 5-seed bag
print('Refit 5-seed bag: Dense v1/v2 + Meta ...')
txt_tr = clean_text_series(combine_text(train)); txt_te = clean_text_series(combine_text(test))
subs_tr = build_subs(train); subs_te = build_subs(test)
meta_tr = build_meta_v1(train).astype(np.float32).values
meta_te = build_meta_v1(test).astype(np.float32).values

word_params = dict(analyzer='word', ngram_range=(1,2), lowercase=True, min_df=3, max_features=60000, sublinear_tf=True, smooth_idf=True, norm='l2')
char_params = dict(analyzer='char_wb', ngram_range=(3,5), lowercase=True, min_df=3, max_features=60000, sublinear_tf=True, smooth_idf=True, norm='l2')
subs_params = dict(analyzer='word', ngram_range=(1,2), lowercase=True, min_df=3, max_features=20000, sublinear_tf=True, smooth_idf=True, norm='l2')

t0 = time.time()
tfidf_w = TfidfVectorizer(**word_params); Xw_tr = tfidf_w.fit_transform(txt_tr); Xw_te = tfidf_w.transform(txt_te)
tfidf_c = TfidfVectorizer(**char_params); Xc_tr = tfidf_c.fit_transform(txt_tr); Xc_te = tfidf_c.transform(txt_te)
tfidf_s = TfidfVectorizer(**subs_params); Xs_tr = tfidf_s.fit_transform(subs_tr); Xs_te = tfidf_s.transform(subs_te)
print(f'TF-IDF built in {time.time()-t0:.1f}s')

# Dense v1 features (SVD 150/150/50 + meta)
svd_w_v1, svd_c_v1, svd_s_v1 = TruncatedSVD(n_components=150, random_state=42), TruncatedSVD(n_components=150, random_state=42), TruncatedSVD(n_components=50, random_state=42)
Zw_tr = svd_w_v1.fit_transform(Xw_tr); Zw_te = svd_w_v1.transform(Xw_te)
Zc_tr = svd_c_v1.fit_transform(Xc_tr); Zc_te = svd_c_v1.transform(Xc_te)
Zs_tr = svd_s_v1.fit_transform(Xs_tr); Zs_te = svd_s_v1.transform(Xs_te)
sc_meta = StandardScaler(with_mean=True, with_std=True)
Mtr_s = sc_meta.fit_transform(meta_tr).astype(np.float32); Mte_s = sc_meta.transform(meta_te).astype(np.float32)
sc_all_v1 = StandardScaler(with_mean=True, with_std=True)
Xtr_v1 = sc_all_v1.fit_transform(np.hstack([Zw_tr, Zc_tr, Zs_tr, Mtr_s]).astype(np.float32))
Xte_v1 = sc_all_v1.transform(np.hstack([Zw_te, Zc_te, Zs_te, Mte_s]).astype(np.float32))

# Dense v2 features (SVD 250/120 + meta, no subs)
svd_w_v2, svd_c_v2 = TruncatedSVD(n_components=250, random_state=42), TruncatedSVD(n_components=120, random_state=42)
Zw2_tr = svd_w_v2.fit_transform(Xw_tr); Zw2_te = svd_w_v2.transform(Xw_te)
Zc2_tr = svd_c_v2.fit_transform(Xc_tr); Zc2_te = svd_c_v2.transform(Xc_te)
sc_meta2 = StandardScaler(with_mean=True, with_std=True)
Mtr2_s = sc_meta2.fit_transform(meta_tr).astype(np.float32); Mte2_s = sc_meta2.transform(meta_te).astype(np.float32)
sc_all_v2 = StandardScaler(with_mean=True, with_std=True)
Xtr_v2 = sc_all_v2.fit_transform(np.hstack([Zw2_tr, Zc2_tr, Mtr2_s]).astype(np.float32))
Xte_v2 = sc_all_v2.transform(np.hstack([Zw2_te, Zc2_te, Mte2_s]).astype(np.float32))

# Meta-only features
sc_m = StandardScaler(with_mean=True, with_std=True)
Xtr_m = sc_m.fit_transform(meta_tr).astype(np.float32)
Xte_m = sc_m.transform(meta_te).astype(np.float32)

neg = float((y==0).sum()); pos = float((y==1).sum()); spw = (neg/max(pos,1.0)) if pos>0 else 1.0
params_v1 = dict(objective='binary:logistic', eval_metric='auc', max_depth=3, eta=0.035, subsample=0.8, colsample_bytree=0.6, min_child_weight=8, reg_alpha=1.0, reg_lambda=4.0, gamma=0.1, device='cuda', tree_method='hist', scale_pos_weight=spw)
params_v2 = dict(objective='binary:logistic', eval_metric='auc', max_depth=3, eta=0.03, min_child_weight=10, subsample=0.75, colsample_bytree=0.7, reg_alpha=1.5, reg_lambda=5.0, gamma=0.2, device='cuda', tree_method='hist', scale_pos_weight=spw)
params_meta = dict(objective='binary:logistic', eval_metric='auc', max_depth=3, eta=0.05, min_child_weight=8, subsample=0.8, colsample_bytree=0.8, reg_alpha=0.5, reg_lambda=3.0, gamma=0.0, device='cuda', tree_method='hist', scale_pos_weight=spw)

dtr_v1 = xgb.DMatrix(Xtr_v1, label=y); dte_v1 = xgb.DMatrix(Xte_v1)
dtr_v2 = xgb.DMatrix(Xtr_v2, label=y); dte_v2 = xgb.DMatrix(Xte_v2)
dtr_m  = xgb.DMatrix(Xtr_m,  label=y); dte_m  = xgb.DMatrix(Xte_m)

p_d1_seeds, p_d2_seeds, p_meta_seeds = [], [], []
for sd in seeds:
    b1 = xgb.train({**params_v1, 'seed': sd}, dtr_v1, num_boost_round=nrounds_v1, verbose_eval=False)
    b2 = xgb.train({**params_v2, 'seed': sd}, dtr_v2, num_boost_round=nrounds_v2, verbose_eval=False)
    bm = xgb.train({**params_meta, 'seed': sd}, dtr_m, num_boost_round=nrounds_meta, verbose_eval=False)
    p_d1_seeds.append(b1.predict(dte_v1).astype(np.float32))
    p_d2_seeds.append(b2.predict(dte_v2).astype(np.float32))
    p_meta_seeds.append(bm.predict(dte_m).astype(np.float32))
print('Dense v1/v2/Meta 5-seed bagging complete')

p_te_d1_bag = np.mean(p_d1_seeds, axis=0).astype(np.float32)
p_te_d2_bag = np.mean(p_d2_seeds, axis=0).astype(np.float32)
p_te_meta_bag = np.mean(p_meta_seeds, axis=0).astype(np.float32)

# Refit Emb+Meta on full train with 5-seed bag
print('Refit 5-seed bag: Emb+Meta ...')
Emb_tr = np.load('emb_minilm_tr.npy').astype(np.float32)
Emb_te = np.load('emb_minilm_te.npy').astype(np.float32)
Meta_tr = np.load('meta_v1_tr.npy').astype(np.float32)
Meta_te = np.load('meta_v1_te.npy').astype(np.float32)
sc_e = StandardScaler(with_mean=True, with_std=True)
Xtr_e = sc_e.fit_transform(np.hstack([Emb_tr, Meta_tr]).astype(np.float32))
Xte_e = sc_e.transform(np.hstack([Emb_te, Meta_te]).astype(np.float32))
params_emb = dict(objective='binary:logistic', eval_metric='auc', max_depth=3, eta=0.05, subsample=0.8, colsample_bytree=0.6, min_child_weight=8, reg_alpha=0.5, reg_lambda=3.0, gamma=0.0, device='cuda', tree_method='hist', scale_pos_weight=spw)
dtr_e = xgb.DMatrix(Xtr_e, label=y); dte_e = xgb.DMatrix(Xte_e)
p_emb_seeds = []
for sd in seeds:
    be = xgb.train({**params_emb, 'seed': sd}, dtr_e, num_boost_round=nrounds_emb, verbose_eval=False)
    p_emb_seeds.append(be.predict(dte_e).astype(np.float32))
p_te_emb_bag = np.mean(p_emb_seeds, axis=0).astype(np.float32)
print('Emb+Meta 5-seed bagging complete')

# Load LR refit predictions (from earlier refit cell)
p_te_lr_with = np.load('test_refit_lr_withsub_meta.npy')
p_te_lr_ns = np.load('test_refit_lr_nosub_meta.npy')

# Blend using S24 best 6-way weights on logits
g = 0.95  # LR mix weight from S24 best cfg
w_emb = 0.20; w_lr = 0.28; w_d1 = 0.105; w_d2 = 0.195; w_meta = 0.22
z_lr_mix = (1.0 - g)*to_logit(p_te_lr_with) + g*to_logit(p_te_lr_ns)
z_d1 = to_logit(p_te_d1_bag); z_d2 = to_logit(p_te_d2_bag); z_meta = to_logit(p_te_meta_bag); z_emb = to_logit(p_te_emb_bag)
z_blend = w_lr*z_lr_mix + w_d1*z_d1 + w_d2*z_d2 + w_meta*z_meta + w_emb*z_emb
p_final = sigmoid(z_blend).astype(np.float32)
sub = pd.DataFrame({id_col: ids, target_col: p_final})
sub.to_csv('submission_time_blend_with_emb_refit5bag.csv', index=False)
sub.to_csv('submission.csv', index=False)
print('Promoted 6-way refit 5-seed bag blend to submission.csv. Head:')
print(sub.head())

# Hedges: 15% shrink over 5 logit components (LRmix,D1,D2,Meta,Emb) and equal-prob 6-way
w_vec = np.array([w_lr, w_d1, w_d2, w_meta, w_emb], dtype=np.float64)
w_eq = np.ones_like(w_vec) / len(w_vec)
alpha = 0.15
w_shr = ((1.0 - alpha)*w_vec + alpha*w_eq); w_shr = (w_shr / w_shr.sum()).astype(np.float64)
z_shr = w_shr[0]*z_lr_mix + w_shr[1]*z_d1 + w_shr[2]*z_d2 + w_shr[3]*z_meta + w_shr[4]*z_emb
p_shr = sigmoid(z_shr).astype(np.float32)
pd.DataFrame({id_col: ids, target_col: p_shr}).to_csv('submission_time_blend_with_emb_refit5bag_shrunk.csv', index=False)
p_eq6 = np.clip((sigmoid((1.0 - g)*to_logit(p_te_lr_with) + g*to_logit(p_te_lr_ns)) + p_te_d1_bag + p_te_d2_bag + p_te_meta_bag + p_te_emb_bag) / 5.0, 0.01, 0.99).astype(np.float32)
pd.DataFrame({id_col: ids, target_col: p_eq6}).to_csv('submission_time_equal5_refit5bag_prob.csv', index=False)
print('Wrote hedges for refit5bag blend.')

Refit 5-seed bag: Dense v1/v2 + Meta ...


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


TF-IDF built in 2.9s


Dense v1/v2/Meta 5-seed bagging complete
Refit 5-seed bag: Emb+Meta ...


Emb+Meta 5-seed bagging complete
Promoted 6-way refit 5-seed bag blend to submission.csv. Head:
  request_id  requester_received_pizza
0  t3_1aw5zf                  0.498361
1   t3_roiuw                  0.468186
2   t3_mjnbq                  0.396775
3   t3_t8wd1                  0.376720
4  t3_1m4zxu                  0.421771
Wrote hedges for refit5bag blend.


In [43]:
# S26: Upgraded LR (word 1-3, char_wb 2-6, min_df=2, 250k) + meta_v1+{token_count,sentence_count}; models: main/title/body; L2 vs ElasticNet per expert
import numpy as np, pandas as pd, time, gc, json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from scipy.sparse import hstack, csr_matrix

id_col = 'request_id'; target_col = 'requester_received_pizza'
train = pd.read_json('train.json')
test = pd.read_json('test.json')
y = train[target_col].astype(int).values

def get_title(df):
    return df.get('request_title', pd.Series(['']*len(df))).fillna('').astype(str)
def get_body(df):
    return df.get('request_text_edit_aware', df.get('request_text', pd.Series(['']*len(df)))).fillna('').astype(str)
def txt_main(df):
    return (get_title(df) + ' \n ' + get_body(df)).astype(str)

def build_meta_up(df):
    title = get_title(df); body = get_body(df)
    out = pd.DataFrame(index=df.index)
    out['title_len'] = title.str.len().astype(np.float32)
    out['body_len'] = body.str.len().astype(np.float32)
    out['title_body_ratio'] = (out['title_len'] / (1.0 + out['body_len'])).astype(np.float32)
    out['has_url'] = body.str.contains(r'https?://', regex=True).astype(np.float32)
    out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
    # add token_count and sentence_count (log1p later)
    out['token_count'] = (title.str.count(r'\w+') + body.str.count(r'\w+')).astype(np.float32)
    out['sentence_count'] = body.str.count(r'[\.!?]').astype(np.float32)
    if 'unix_timestamp_of_request' in df.columns:
        dt = pd.to_datetime(df['unix_timestamp_of_request'], unit='s', utc=True, errors='coerce')
    else:
        dt = pd.to_datetime(0, unit='s', utc=True) + pd.to_timedelta(np.zeros(len(df)), unit='s')
    hour = dt.dt.hour.fillna(0).astype(np.float32)
    out['dayofweek'] = dt.dt.dayofweek.fillna(0).astype(np.float32)
    out['is_weekend'] = out['dayofweek'].isin([5,6]).astype(np.float32)
    out['hour_sin'] = np.sin(2*np.pi*hour/24.0).astype(np.float32)
    out['hour_cos'] = np.cos(2*np.pi*hour/24.0).astype(np.float32)
    for c in [
        'requester_upvotes_minus_downvotes_at_request',
        'requester_upvotes_plus_downvotes_at_request',
        'requester_number_of_comments_at_request',
        'requester_number_of_posts_at_request'
    ]:
        if c in df.columns:
            out[c] = pd.to_numeric(df[c], errors='coerce').astype(np.float32)
        else:
            out[c] = 0.0
    for c in ['title_len','body_len','title_body_ratio','token_count','sentence_count',
              'requester_upvotes_minus_downvotes_at_request','requester_upvotes_plus_downvotes_at_request',
              'requester_number_of_comments_at_request','requester_number_of_posts_at_request']:
        if c in out.columns:
            out[c] = np.log1p(out[c].clip(lower=0)).astype(np.float32)
    out = out.replace([np.inf,-np.inf], 0).fillna(0).astype(np.float32)
    return out

# 6-block forward-chaining folds and mask
order = np.argsort(train['unix_timestamp_of_request'].values)
n = len(train); k = 6
blocks = np.array_split(order, k)
folds = []; mask = np.zeros(n, dtype=bool)
for i in range(1, k):
    va_idx = np.array(blocks[i]); tr_idx = np.concatenate(blocks[:i])
    folds.append((tr_idx, va_idx)); mask[va_idx] = True
print(f'LR upgraded Time-CV: {len(folds)} folds; validated {mask.sum()}/{n}')

# Vectorizers per spec
word_params = dict(analyzer='word', ngram_range=(1,3), lowercase=True, min_df=2, max_features=250_000, sublinear_tf=True, smooth_idf=True, norm='l2')
char_params = dict(analyzer='char_wb', ngram_range=(2,6), lowercase=True, min_df=2, max_features=250_000, sublinear_tf=True, smooth_idf=True, norm='l2')

texts_main_tr, texts_main_te = txt_main(train), txt_main(test)
texts_title_tr, texts_title_te = get_title(train), get_title(test)
texts_body_tr, texts_body_te = get_body(train), get_body(test)
meta_te_full = build_meta_up(test).astype(np.float32).values

def run_lr_view(view_name: str, tr_text: pd.Series, te_text: pd.Series, tag_out: str):
    # Returns best OOF/test and config dict
    l2_C_grid = [0.3, 0.5, 0.8, 1.2, 2.0]
    en_l1_grid = [0.05, 0.10, 0.20]
    en_C_grid = [0.5, 0.8, 1.2]
    best_auc, best_kind, best_cfg = -1.0, None, None
    best_oof, best_test = None, None
    # Helper to train one LR config across folds, score OOF on mask, return test avg
    def train_cfg(penalty_kind: str, C_val: float, l1_ratio: float|None):
        oof = np.zeros(n, dtype=np.float32)
        test_fold_preds = []
        for fi, (tr_idx, va_idx) in enumerate(folds, 1):
            t0 = time.time()
            tfidf_w = TfidfVectorizer(**word_params)
            Xw_tr = tfidf_w.fit_transform(tr_text.iloc[tr_idx]); Xw_va = tfidf_w.transform(tr_text.iloc[va_idx]); Xw_te = tfidf_w.transform(te_text)
            tfidf_c = TfidfVectorizer(**char_params)
            Xc_tr = tfidf_c.fit_transform(tr_text.iloc[tr_idx]); Xc_va = tfidf_c.transform(tr_text.iloc[va_idx]); Xc_te = tfidf_c.transform(te_text)
            # Meta per fold and scale
            M_tr = build_meta_up(train.iloc[tr_idx]).astype(np.float32).values
            M_va = build_meta_up(train.iloc[va_idx]).astype(np.float32).values
            scaler = StandardScaler(with_mean=True, with_std=True)
            M_tr_s = scaler.fit_transform(M_tr).astype(np.float32)
            M_va_s = scaler.transform(M_va).astype(np.float32)
            M_te_s = scaler.transform(meta_te_full).astype(np.float32)
            X_tr = hstack([Xw_tr, Xc_tr, csr_matrix(M_tr_s)], format='csr')
            X_va = hstack([Xw_va, Xc_va, csr_matrix(M_va_s)], format='csr')
            X_te = hstack([Xw_te, Xc_te, csr_matrix(M_te_s)], format='csr')
            if penalty_kind == 'l2':
                clf = LogisticRegression(solver='saga', penalty='l2', C=C_val, max_iter=4000, n_jobs=-1, random_state=42, verbose=0)
            else:
                clf = LogisticRegression(solver='saga', penalty='elasticnet', l1_ratio=l1_ratio, C=C_val, max_iter=4000, n_jobs=-1, random_state=42, verbose=0)
            clf.fit(X_tr, y[tr_idx])
            va_pred = clf.predict_proba(X_va)[:,1].astype(np.float32)
            te_pred = clf.predict_proba(X_te)[:,1].astype(np.float32)
            oof[va_idx] = va_pred; test_fold_preds.append(te_pred)
            auc = roc_auc_score(y[va_idx], va_pred)
            print(f'[{view_name} {penalty_kind} C={C_val} l1={l1_ratio}] Fold {fi} AUC: {auc:.5f} | {time.time()-t0:.1f}s | tr:{X_tr.shape}')
            del tfidf_w, tfidf_c, Xw_tr, Xw_va, Xw_te, Xc_tr, Xc_va, Xc_te, M_tr, M_va, scaler, M_tr_s, M_va_s, M_te_s, X_tr, X_va, X_te, clf; gc.collect()
        auc_oof = roc_auc_score(y[mask], oof[mask])
        te_avg = np.mean(test_fold_preds, axis=0).astype(np.float32)
        return auc_oof, oof, te_avg
    # L2 sweep
    for C_val in l2_C_grid:
        auc_here, oof_here, te_here = train_cfg('l2', C_val, None)
        print(f'[{view_name}] L2 C={C_val} | OOF(time-mask) AUC: {auc_here:.5f}')
        if auc_here > best_auc:
            best_auc, best_kind = auc_here, ('l2', C_val, None)
            best_oof, best_test = oof_here, te_here
    l2_best_auc = best_auc
    # ElasticNet sweep; keep only if >= l2_best + 0.002
    en_best_auc, en_best = -1.0, None
    en_best_oof, en_best_test = None, None
    for l1 in en_l1_grid:
        for C_val in en_C_grid:
            auc_here, oof_here, te_here = train_cfg('en', C_val, l1)
            print(f'[{view_name}] EN C={C_val} l1={l1} | OOF(time-mask) AUC: {auc_here:.5f}')
            if auc_here > en_best_auc:
                en_best_auc, en_best = auc_here, ('en', C_val, l1)
                en_best_oof, en_best_test = oof_here, te_here
    if en_best_auc >= l2_best_auc + 0.002:
        best_auc, best_kind = en_best_auc, en_best
        best_oof, best_test = en_best_oof, en_best_test
    # Save
    tag = f'lr_time_up_{tag_out}'
    np.save(f'oof_{tag}.npy', best_oof.astype(np.float32))
    np.save(f'test_{tag}.npy', best_test.astype(np.float32))
    print(f'[{view_name}] BEST {best_kind} | OOF(time-mask) AUC: {best_auc:.5f} | saved as {tag}')
    return dict(view=view_name, tag=tag, best=best_kind, auc=best_auc)

t_start = time.time()
res_main = run_lr_view('LR_main', texts_main_tr, texts_main_te, 'main')
res_title = run_lr_view('LR_title', texts_title_tr, texts_title_te, 'title')
res_body = run_lr_view('LR_body', texts_body_tr, texts_body_te, 'body')
print('S26 finished in', f'{time.time()-t_start:.1f}s')
with open('lr_upgraded_results.json','w') as f:
    json.dump({'main':res_main,'title':res_title,'body':res_body}, f, indent=2)
print('Saved lr_upgraded_results.json')

LR upgraded Time-CV: 5 folds; validated 2398/2878


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[LR_main l2 C=0.3 l1=None] Fold 1 AUC: 0.72635 | 9.2s | tr:(480, 35930)


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[LR_main l2 C=0.3 l1=None] Fold 2 AUC: 0.67470 | 19.9s | tr:(960, 58405)


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[LR_main l2 C=0.3 l1=None] Fold 3 AUC: 0.63685 | 26.2s | tr:(1440, 75903)


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[LR_main l2 C=0.3 l1=None] Fold 4 AUC: 0.63168 | 36.8s | tr:(1920, 90003)


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[LR_main l2 C=0.3 l1=None] Fold 5 AUC: 0.63657 | 45.3s | tr:(2399, 102312)


[LR_main] L2 C=0.3 | OOF(time-mask) AUC: 0.66097


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[LR_main l2 C=0.5 l1=None] Fold 1 AUC: 0.72832 | 9.2s | tr:(480, 35930)


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[LR_main l2 C=0.5 l1=None] Fold 2 AUC: 0.67576 | 22.3s | tr:(960, 58405)


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[LR_main l2 C=0.5 l1=None] Fold 3 AUC: 0.63483 | 30.0s | tr:(1440, 75903)


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[LR_main l2 C=0.5 l1=None] Fold 4 AUC: 0.63357 | 45.6s | tr:(1920, 90003)


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[LR_main l2 C=0.5 l1=None] Fold 5 AUC: 0.64093 | 56.0s | tr:(2399, 102312)


[LR_main] L2 C=0.5 | OOF(time-mask) AUC: 0.66262


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[LR_main l2 C=0.8 l1=None] Fold 1 AUC: 0.73005 | 8.5s | tr:(480, 35930)


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[LR_main l2 C=0.8 l1=None] Fold 2 AUC: 0.67464 | 24.6s | tr:(960, 58405)


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[LR_main l2 C=0.8 l1=None] Fold 3 AUC: 0.63094 | 34.9s | tr:(1440, 75903)


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[LR_main l2 C=0.8 l1=None] Fold 4 AUC: 0.63415 | 56.0s | tr:(1920, 90003)


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[LR_main l2 C=0.8 l1=None] Fold 5 AUC: 0.64223 | 72.0s | tr:(2399, 102312)


[LR_main] L2 C=0.8 | OOF(time-mask) AUC: 0.66257


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


[LR_main l2 C=1.2 l1=None] Fold 1 AUC: 0.73050 | 6.5s | tr:(480, 35930)


  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)
  out['has_img'] = body.str.contains(r'(imgur|jpg|jpeg|png|gif)', regex=True).astype(np.float32)


In [44]:
# S27: Build all-mpnet-base-v2 sentence embeddings (title+body) with local HF cache; save .npy
import os, time, gc, numpy as np, pandas as pd
from sentence_transformers import SentenceTransformer

# Ensure HF caches are writable (reuse same cache dir as S22 recovery)
cache_dir = os.path.abspath('hf_cache')
os.makedirs(cache_dir, exist_ok=True)
os.environ['HF_HOME'] = cache_dir
os.environ['HUGGINGFACE_HUB_CACHE'] = cache_dir
os.environ['TRANSFORMERS_CACHE'] = cache_dir
os.environ['SENTENCE_TRANSFORMERS_HOME'] = cache_dir

id_col = 'request_id'; target_col = 'requester_received_pizza'
train = pd.read_json('train.json')
test = pd.read_json('test.json')

def get_title(df):
    return df.get('request_title', pd.Series(['']*len(df))).fillna('').astype(str)
def get_body(df):
    return df.get('request_text_edit_aware', df.get('request_text', pd.Series(['']*len(df)))).fillna('').astype(str)
def build_text(df):
    return (get_title(df) + '\n' + get_body(df)).astype(str)

texts_tr = build_text(train).tolist()
texts_te = build_text(test).tolist()
print(f'Texts prepared: train {len(texts_tr)}, test {len(texts_te)}')

model_name = 'sentence-transformers/all-mpnet-base-v2'
print(f'Loading SentenceTransformer {model_name} with cache_folder={cache_dir}')
model = SentenceTransformer(model_name, device='cuda', cache_folder=cache_dir)

print('Encoding MPNet embeddings on GPU...')
t0 = time.time()
emb_tr = model.encode(texts_tr, batch_size=128, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=True, device='cuda')
emb_te = model.encode(texts_te, batch_size=128, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=True, device='cuda')
print(f'MPNet embeddings done in {time.time()-t0:.1f}s | shapes tr:{emb_tr.shape} te:{emb_te.shape}')

np.save('emb_mpnet_tr.npy', emb_tr.astype(np.float32))
np.save('emb_mpnet_te.npy', emb_te.astype(np.float32))
print('Saved emb_mpnet_tr.npy and emb_mpnet_te.npy')

del model, emb_tr, emb_te, texts_tr, texts_te
gc.collect()
print('S27 complete.')

Texts prepared: train 2878, test 1162
Loading SentenceTransformer sentence-transformers/all-mpnet-base-v2 with cache_folder=/app/agent_run_states/random-acts-of-pizza-spray-20250912-053053/hf_cache


Encoding MPNet embeddings on GPU...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Batches:   4%|▍         | 1/23 [00:02<01:04,  2.94s/it]

Batches:   9%|▊         | 2/23 [00:05<00:56,  2.69s/it]

Batches:  13%|█▎        | 3/23 [00:07<00:47,  2.35s/it]

Batches:  17%|█▋        | 4/23 [00:09<00:40,  2.15s/it]

Batches:  22%|██▏       | 5/23 [00:10<00:35,  1.99s/it]

Batches:  26%|██▌       | 6/23 [00:12<00:29,  1.76s/it]

Batches:  30%|███       | 7/23 [00:13<00:24,  1.54s/it]

Batches:  35%|███▍      | 8/23 [00:14<00:20,  1.36s/it]

Batches:  39%|███▉      | 9/23 [00:15<00:17,  1.27s/it]

Batches:  43%|████▎     | 10/23 [00:16<00:14,  1.15s/it]

Batches:  48%|████▊     | 11/23 [00:17<00:12,  1.03s/it]

Batches:  52%|█████▏    | 12/23 [00:17<00:10,  1.05it/s]

Batches:  57%|█████▋    | 13/23 [00:18<00:08,  1.12it/s]

Batches:  61%|██████    | 14/23 [00:19<00:08,  1.12it/s]

Batches:  65%|██████▌   | 15/23 [00:20<00:06,  1.21it/s]

Batches:  70%|██████▉   | 16/23 [00:20<00:05,  1.28it/s]

Batches:  74%|███████▍  | 17/23 [00:21<00:04,  1.32it/s]

Batches:  78%|███████▊  | 18/23 [00:22<00:03,  1.41it/s]

Batches:  83%|████████▎ | 19/23 [00:22<00:02,  1.57it/s]

Batches:  87%|████████▋ | 20/23 [00:23<00:01,  1.74it/s]

Batches:  91%|█████████▏| 21/23 [00:23<00:01,  1.80it/s]

Batches:  96%|█████████▌| 22/23 [00:23<00:00,  2.01it/s]

Batches: 100%|██████████| 23/23 [00:23<00:00,  2.61it/s]

Batches: 100%|██████████| 23/23 [00:23<00:00,  1.04s/it]




Batches:   0%|          | 0/10 [00:00<?, ?it/s]

Batches:  10%|█         | 1/10 [00:02<00:26,  2.96s/it]

Batches:  20%|██        | 2/10 [00:04<00:18,  2.34s/it]

Batches:  30%|███       | 3/10 [00:06<00:13,  1.94s/it]

Batches:  40%|████      | 4/10 [00:07<00:10,  1.74s/it]

Batches:  50%|█████     | 5/10 [00:08<00:07,  1.48s/it]

Batches:  60%|██████    | 6/10 [00:09<00:05,  1.36s/it]

Batches:  70%|███████   | 7/10 [00:10<00:03,  1.11s/it]

Batches:  80%|████████  | 8/10 [00:11<00:01,  1.08it/s]

Batches:  90%|█████████ | 9/10 [00:11<00:00,  1.33it/s]

Batches: 100%|██████████| 10/10 [00:11<00:00,  1.14s/it]




MPNet embeddings done in 35.5s | shapes tr:(2878, 768) te:(1162, 768)
Saved emb_mpnet_tr.npy and emb_mpnet_te.npy


S27 complete.


In [45]:
# S28: Time-aware XGB on MPNet embeddings + meta_v1 (6-block CV, 3-seed bag); cache OOF/test
import numpy as np, pandas as pd, time, gc
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import xgboost as xgb

id_col = 'request_id'; target_col = 'requester_received_pizza'
train = pd.read_json('train.json')
test = pd.read_json('test.json')
y = train[target_col].astype(int).values

# Load cached MPNet embeddings and meta_v1
Emb_tr = np.load('emb_mpnet_tr.npy').astype(np.float32)
Emb_te = np.load('emb_mpnet_te.npy').astype(np.float32)
Meta_tr = np.load('meta_v1_tr.npy').astype(np.float32)
Meta_te = np.load('meta_v1_te.npy').astype(np.float32)
print('Loaded MPNet features:', Emb_tr.shape, Emb_te.shape, Meta_tr.shape, Meta_te.shape)

# 6-block forward-chaining folds and mask
order = np.argsort(train['unix_timestamp_of_request'].values)
n = len(train); k = 6
blocks = np.array_split(order, k)
folds = []; mask = np.zeros(n, dtype=bool)
for i in range(1, k):
    va_idx = np.array(blocks[i]); tr_idx = np.concatenate(blocks[:i])
    folds.append((tr_idx, va_idx)); mask[va_idx] = True
print(f'MPNet+Meta Time-CV: {len(folds)} folds; validated {mask.sum()}/{n}')

# XGB params (as in S23, with slight colsample room for larger dims)
base_params = dict(
    objective='binary:logistic',
    eval_metric='auc',
    max_depth=3,
    eta=0.05,
    subsample=0.8,
    colsample_bytree=0.6,
    min_child_weight=8,
    reg_alpha=0.5,
    reg_lambda=3.0,
    gamma=0.0,
    device='cuda',
    tree_method='hist'
)
num_boost_round = 8000
early_stopping_rounds = 100
seeds = [42, 1337, 2025]

oof_sum = np.zeros(n, dtype=np.float64)
oof_cnt = np.zeros(n, dtype=np.float64)
test_seed_preds = []

for si, seed in enumerate(seeds, 1):
    print(f'=== MPNet+Meta Seed {seed} ({si}/{len(seeds)}) ===')
    params = dict(base_params); params['seed'] = seed
    oof_seed = np.zeros(n, dtype=np.float32)
    test_folds = []
    for fi, (tr_idx, va_idx) in enumerate(folds, 1):
        t0 = time.time()
        Xtr_raw = np.hstack([Emb_tr[tr_idx], Meta_tr[tr_idx]]).astype(np.float32)
        Xva_raw = np.hstack([Emb_tr[va_idx], Meta_tr[va_idx]]).astype(np.float32)
        Xte_raw = np.hstack([Emb_te, Meta_te]).astype(np.float32)
        scaler = StandardScaler(with_mean=True, with_std=True)
        Xtr = scaler.fit_transform(Xtr_raw).astype(np.float32)
        Xva = scaler.transform(Xva_raw).astype(np.float32)
        Xte = scaler.transform(Xte_raw).astype(np.float32)
        pos = float((y[tr_idx] == 1).sum()); neg = float((y[tr_idx] == 0).sum())
        params['scale_pos_weight'] = (neg / max(pos, 1.0)) if pos > 0 else 1.0
        dtr = xgb.DMatrix(Xtr, label=y[tr_idx])
        dva = xgb.DMatrix(Xva, label=y[va_idx])
        dte = xgb.DMatrix(Xte)
        booster = xgb.train(params, dtr, num_boost_round=num_boost_round, evals=[(dva, 'valid')], early_stopping_rounds=early_stopping_rounds, verbose_eval=False)
        va_pred = booster.predict(dva, iteration_range=(0, booster.best_iteration+1)).astype(np.float32)
        te_pred = booster.predict(dte, iteration_range=(0, booster.best_iteration+1)).astype(np.float32)
        oof_seed[va_idx] = va_pred; test_folds.append(te_pred)
        auc = roc_auc_score(y[va_idx], va_pred)
        print(f'[MPNet Seed {seed} Fold {fi}] best_iter={booster.best_iteration} | spw={params["scale_pos_weight"]:.2f} | AUC: {auc:.5f} | {time.time()-t0:.1f}s | shapes tr:{Xtr.shape}')
        del Xtr_raw, Xva_raw, Xte_raw, Xtr, Xva, Xte, dtr, dva, dte, booster, scaler; gc.collect()
    seed_auc = roc_auc_score(y[mask], oof_seed[mask])
    print(f'[MPNet Seed {seed}] OOF AUC (validated only): {seed_auc:.5f}')
    oof_sum[mask] += oof_seed[mask]; oof_cnt[mask] += 1.0
    test_seed_preds.append(np.mean(test_folds, axis=0).astype(np.float64))
    del oof_seed, test_folds; gc.collect()

oof_avg = np.zeros(n, dtype=np.float32)
oof_avg[mask] = (oof_sum[mask] / np.maximum(oof_cnt[mask], 1.0)).astype(np.float32)
test_avg = np.mean(test_seed_preds, axis=0).astype(np.float32)
auc_oof = roc_auc_score(y[mask], oof_avg[mask])
print(f'MPNet+Meta Time-CV OOF AUC (validated only, 3-seed avg): {auc_oof:.5f}')
np.save('oof_xgb_emb_mpnet_time.npy', oof_avg.astype(np.float32))
np.save('test_xgb_emb_mpnet_time.npy', test_avg)
print('Saved oof_xgb_emb_mpnet_time.npy and test_xgb_emb_mpnet_time.npy')

Loaded MPNet features: (2878, 768) (1162, 768) (2878, 14) (1162, 14)
MPNet+Meta Time-CV: 5 folds; validated 2398/2878
=== MPNet+Meta Seed 42 (1/3) ===


[MPNet Seed 42 Fold 1] best_iter=99 | spw=1.94 | AUC: 0.67585 | 0.8s | shapes tr:(480, 782)


[MPNet Seed 42 Fold 2] best_iter=161 | spw=2.33 | AUC: 0.67840 | 0.8s | shapes tr:(960, 782)


[MPNet Seed 42 Fold 3] best_iter=10 | spw=2.49 | AUC: 0.63069 | 0.4s | shapes tr:(1440, 782)


[MPNet Seed 42 Fold 4] best_iter=7 | spw=2.79 | AUC: 0.61982 | 0.4s | shapes tr:(1920, 782)


[MPNet Seed 42 Fold 5] best_iter=29 | spw=2.83 | AUC: 0.61483 | 0.5s | shapes tr:(2399, 782)
[MPNet Seed 42] OOF AUC (validated only): 0.62090


=== MPNet+Meta Seed 1337 (2/3) ===


[MPNet Seed 1337 Fold 1] best_iter=149 | spw=1.94 | AUC: 0.67626 | 0.7s | shapes tr:(480, 782)


[MPNet Seed 1337 Fold 2] best_iter=67 | spw=2.33 | AUC: 0.68780 | 0.5s | shapes tr:(960, 782)


[MPNet Seed 1337 Fold 3] best_iter=7 | spw=2.49 | AUC: 0.60389 | 0.4s | shapes tr:(1440, 782)


[MPNet Seed 1337 Fold 4] best_iter=13 | spw=2.79 | AUC: 0.62401 | 0.4s | shapes tr:(1920, 782)


[MPNet Seed 1337 Fold 5] best_iter=22 | spw=2.83 | AUC: 0.60226 | 0.4s | shapes tr:(2399, 782)
[MPNet Seed 1337] OOF AUC (validated only): 0.62912


=== MPNet+Meta Seed 2025 (3/3) ===


[MPNet Seed 2025 Fold 1] best_iter=113 | spw=1.94 | AUC: 0.68658 | 0.6s | shapes tr:(480, 782)


[MPNet Seed 2025 Fold 2] best_iter=81 | spw=2.33 | AUC: 0.69627 | 0.6s | shapes tr:(960, 782)


[MPNet Seed 2025 Fold 3] best_iter=65 | spw=2.49 | AUC: 0.61451 | 0.6s | shapes tr:(1440, 782)


[MPNet Seed 2025 Fold 4] best_iter=28 | spw=2.79 | AUC: 0.63609 | 0.5s | shapes tr:(1920, 782)


[MPNet Seed 2025 Fold 5] best_iter=14 | spw=2.83 | AUC: 0.63984 | 0.4s | shapes tr:(2399, 782)
[MPNet Seed 2025] OOF AUC (validated only): 0.63786


MPNet+Meta Time-CV OOF AUC (validated only, 3-seed avg): 0.63820
Saved oof_xgb_emb_mpnet_time.npy and test_xgb_emb_mpnet_time.npy


In [46]:
# S29: 7-way constrained time-consistent logit blend (LRmix, D1, D2, Meta, Emb_MiniLM, Emb_MPNet); promote if best
import numpy as np, pandas as pd, time
from sklearn.metrics import roc_auc_score

id_col = 'request_id'; target_col = 'requester_received_pizza'
train = pd.read_json('train.json')
test = pd.read_json('test.json')
y = train[target_col].astype(int).values
ids = test[id_col].values

def to_logit(p, eps=1e-6):
    p = np.clip(p.astype(np.float64), eps, 1.0 - eps)
    return np.log(p / (1.0 - p))
def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

# 6-block forward-chaining time mask
order = np.argsort(train['unix_timestamp_of_request'].values)
n = len(train); k = 6
blocks = np.array_split(order, k)
mask = np.zeros(n, dtype=bool)
for i in range(1, k):
    mask[np.array(blocks[i])] = True
print(f'Time-CV (6 blocks) validated count: {mask.sum()}/{n}')

# Load 5 core time-bases + 2 embedding bases (MiniLM and MPNet) OOF/test
o_lr_w = np.load('oof_lr_time_withsub_meta.npy'); t_lr_w = np.load('test_lr_time_withsub_meta.npy')
o_lr_ns = np.load('oof_lr_time_nosub_meta.npy');  t_lr_ns = np.load('test_lr_time_nosub_meta.npy')
o_d1 = np.load('oof_xgb_dense_time.npy');         t_d1 = np.load('test_xgb_dense_time.npy')
o_d2 = np.load('oof_xgb_dense_time_v2.npy');      t_d2 = np.load('test_xgb_dense_time_v2.npy')
o_meta = np.load('oof_xgb_meta_time.npy');         t_meta = np.load('test_xgb_meta_time.npy')
o_emb_minilm = np.load('oof_xgb_emb_meta_time.npy'); t_emb_minilm = np.load('test_xgb_emb_meta_time.npy')
o_emb_mpnet  = np.load('oof_xgb_emb_mpnet_time.npy'); t_emb_mpnet  = np.load('test_xgb_emb_mpnet_time.npy')

# Convert to logits
z_lr_w, z_lr_ns = to_logit(o_lr_w), to_logit(o_lr_ns)
z_d1, z_d2, z_meta = to_logit(o_d1), to_logit(o_d2), to_logit(o_meta)
z_emb_minilm, z_emb_mpnet = to_logit(o_emb_minilm), to_logit(o_emb_mpnet)
tz_lr_w, tz_lr_ns = to_logit(t_lr_w), to_logit(t_lr_ns)
tz_d1, tz_d2, tz_meta = to_logit(t_d1), to_logit(t_d2), to_logit(t_meta)
tz_emb_minilm, tz_emb_mpnet = to_logit(t_emb_minilm), to_logit(t_emb_mpnet)

# Grids (kept tight for speed/robustness)
g_grid = [0.90, 0.95]
meta_grid = [0.18, 0.20, 0.22]
dense_tot_grid = [0.30, 0.35, 0.40]
alpha_grid = [0.65, 0.80]  # split dense total into v1/v2: w_d2 = d_tot * alpha; w_d1 = d_tot - w_d2
emb_minilm_grid = [0.08, 0.12, 0.16]
emb_mpnet_grid = [0.04, 0.08, 0.12]

best_auc, best_cfg = -1.0, None
tried = 0
for g in g_grid:
    z_lr_mix = (1.0 - g)*z_lr_w + g*z_lr_ns
    tz_lr_mix = (1.0 - g)*tz_lr_w + g*tz_lr_ns
    for w_emb_min in emb_minilm_grid:
        for w_emb_mp in emb_mpnet_grid:
            for meta_w in meta_grid:
                for d_tot in dense_tot_grid:
                    # Remaining for LR after allocating meta, dense, embeddings
                    w_lr_rem = 1.0 - (meta_w + d_tot + w_emb_min + w_emb_mp)
                    if w_lr_rem <= 0 or w_lr_rem >= 1:
                        continue
                    for a in alpha_grid:
                        w_d2 = d_tot * a
                        w_d1 = d_tot - w_d2
                        if w_d1 < 0 or w_d2 < 0:
                            continue
                        z_oof = (w_lr_rem*z_lr_mix +
                                 w_d1*z_d1 + w_d2*z_d2 +
                                 meta_w*z_meta +
                                 w_emb_min*z_emb_minilm +
                                 w_emb_mp*z_emb_mpnet)
                        auc = roc_auc_score(y[mask], z_oof[mask])
                        tried += 1
                        if auc > best_auc:
                            best_auc = auc
                            best_cfg = dict(g=float(g), w_lr=float(w_lr_rem), w_d1=float(w_d1), w_d2=float(w_d2), w_meta=float(meta_w),
                                            w_emb_min=float(w_emb_min), w_emb_mp=float(w_emb_mp), tz_lr_mix=tz_lr_mix)
cfg_print = {k: v for k, v in best_cfg.items() if k != 'tz_lr_mix'} if best_cfg is not None else {}
print(f'7-way grid tried {tried} | Best OOF(z,time-mask) AUC: {best_auc:.5f} | cfg={cfg_print}')

# Build primary test prediction
g = best_cfg['g']; w_lr = best_cfg['w_lr']; w_d1 = best_cfg['w_d1']; w_d2 = best_cfg['w_d2'];
w_meta = best_cfg['w_meta']; w_emb_min = best_cfg['w_emb_min']; w_emb_mp = best_cfg['w_emb_mp']
tz_lr_mix = best_cfg['tz_lr_mix']
zt_best = (w_lr*tz_lr_mix + w_d1*tz_d1 + w_d2*tz_d2 + w_meta*tz_meta + w_emb_min*tz_emb_minilm + w_emb_mp*tz_emb_mpnet)
pt_best = sigmoid(zt_best).astype(np.float32)
sub = pd.DataFrame({id_col: ids, target_col: pt_best})
sub.to_csv('submission_time_blend_with_two_emb.csv', index=False)
sub.to_csv('submission.csv', index=False)
print('Promoted 7-way (with MiniLM+MPNet) blend to submission.csv. Head:')
print(sub.head())

# Hedge: 15% shrink toward equal across 6 components (LRmix, D1, D2, Meta, Emb_min, Emb_mp) in logit space
w_vec = np.array([w_lr, w_d1, w_d2, w_meta, w_emb_min, w_emb_mp], dtype=np.float64)
w_eq = np.ones_like(w_vec) / len(w_vec)
alpha = 0.15
w_shr = ((1.0 - alpha)*w_vec + alpha*w_eq); w_shr = (w_shr / w_shr.sum()).astype(np.float64)
zt_shr = (w_shr[0]*tz_lr_mix + w_shr[1]*tz_d1 + w_shr[2]*tz_d2 + w_shr[3]*tz_meta + w_shr[4]*tz_emb_minilm + w_shr[5]*tz_emb_mpnet)
pt_shr = sigmoid(zt_shr).astype(np.float32)
pd.DataFrame({id_col: ids, target_col: pt_shr}).to_csv('submission_time_blend_with_two_emb_shrunk.csv', index=False)
print('Wrote hedge: submission_time_blend_with_two_emb_shrunk.csv')

Time-CV (6 blocks) validated count: 2398/2878


7-way grid tried 324 | Best OOF(z,time-mask) AUC: 0.68094 | cfg={'g': 0.95, 'w_lr': 0.24, 'w_d1': 0.10499999999999998, 'w_d2': 0.195, 'w_meta': 0.22, 'w_emb_min': 0.12, 'w_emb_mp': 0.12}
Promoted 7-way (with MiniLM+MPNet) blend to submission.csv. Head:
  request_id  requester_received_pizza
0  t3_1aw5zf                  0.517362
1   t3_roiuw                  0.523317
2   t3_mjnbq                  0.439774
3   t3_t8wd1                  0.435404
4  t3_1m4zxu                  0.509780
Wrote hedge: submission_time_blend_with_two_emb_shrunk.csv


In [47]:
# S30: Recency-weighted 7-way logit blends: full-mask, last-2 blocks, and gamma-decayed (gamma in {0.90,0.95,0.98})
import numpy as np, pandas as pd, time
from sklearn.metrics import roc_auc_score

id_col = 'request_id'; target_col = 'requester_received_pizza'
train = pd.read_json('train.json')
test = pd.read_json('test.json')
y = train[target_col].astype(int).values
ids = test[id_col].values

def to_logit(p, eps=1e-6):
    p = np.clip(p.astype(np.float64), eps, 1.0 - eps)
    return np.log(p / (1.0 - p))
def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

# 6-block forward-chaining blocks and masks
order = np.argsort(train['unix_timestamp_of_request'].values)
k = 6
blocks = np.array_split(order, k)
n = len(train)
mask_full = np.zeros(n, dtype=bool)
for i in range(1, k):
    mask_full[np.array(blocks[i])] = True
# last-2 validated blocks only (blocks[4], blocks[5])
mask_last2 = np.zeros(n, dtype=bool)
for i in [4,5]:
    mask_last2[np.array(blocks[i])] = True
print(f'Time-CV validated full: {mask_full.sum()}/{n} | last2: {mask_last2.sum()}')

# Load bases (same as S29)
o_lr_w = np.load('oof_lr_time_withsub_meta.npy'); t_lr_w = np.load('test_lr_time_withsub_meta.npy')
o_lr_ns = np.load('oof_lr_time_nosub_meta.npy');  t_lr_ns = np.load('test_lr_time_nosub_meta.npy')
o_d1 = np.load('oof_xgb_dense_time.npy');         t_d1 = np.load('test_xgb_dense_time.npy')
o_d2 = np.load('oof_xgb_dense_time_v2.npy');      t_d2 = np.load('test_xgb_dense_time_v2.npy')
o_meta = np.load('oof_xgb_meta_time.npy');        t_meta = np.load('test_xgb_meta_time.npy')
o_emb_min = np.load('oof_xgb_emb_meta_time.npy'); t_emb_min = np.load('test_xgb_emb_meta_time.npy')
o_emb_mp  = np.load('oof_xgb_emb_mpnet_time.npy');t_emb_mp  = np.load('test_xgb_emb_mpnet_time.npy')

# Convert to logits
z_lr_w, z_lr_ns = to_logit(o_lr_w), to_logit(o_lr_ns)
z_d1, z_d2, z_meta = to_logit(o_d1), to_logit(o_d2), to_logit(o_meta)
z_emn, z_emp = to_logit(o_emb_min), to_logit(o_emb_mp)
tz_lr_w, tz_lr_ns = to_logit(t_lr_w), to_logit(t_lr_ns)
tz_d1, tz_d2, tz_meta = to_logit(t_d1), to_logit(t_d2), to_logit(t_meta)
tz_emn, tz_emp = to_logit(t_emb_min), to_logit(t_emb_mp)

# Weight grids per expert priors
g_grid = [0.90, 0.95, 0.97]
meta_grid = [0.18, 0.20, 0.22]
dense_tot_grid = [0.22, 0.30, 0.35, 0.40]
alpha_grid = [0.50, 0.65, 0.80]  # split dense_total into v1/v2
emn_grid = [0.10, 0.12, 0.15]
emp_grid = [0.08, 0.10, 0.12]

def search_best(mask, sample_weight=None):
    best_auc, best_cfg = -1.0, None
    tried = 0
    for g in g_grid:
        z_lr_mix = (1.0 - g)*z_lr_w + g*z_lr_ns
        tz_lr_mix = (1.0 - g)*tz_lr_w + g*tz_lr_ns
        for w_emn in emn_grid:
            for w_emp in emp_grid:
                for w_meta in meta_grid:
                    for d_tot in dense_tot_grid:
                        w_lr = 1.0 - (w_emn + w_emp + w_meta + d_tot)
                        if w_lr <= 0 or w_lr >= 1:
                            continue
                        for a in alpha_grid:
                            w_d2 = d_tot * a
                            w_d1 = d_tot - w_d2
                            if w_d1 < 0 or w_d2 < 0: continue
                            z_oof = (w_lr*z_lr_mix + w_d1*z_d1 + w_d2*z_d2 + w_meta*z_meta + w_emn*z_emn + w_emp*z_emp)
                            auc = roc_auc_score(y[mask], z_oof[mask], sample_weight=(sample_weight[mask] if sample_weight is not None else None))
                            tried += 1
                            if auc > best_auc:
                                best_auc = auc
                                best_cfg = dict(g=float(g), w_lr=float(w_lr), w_d1=float(w_d1), w_d2=float(w_d2), w_meta=float(w_meta),
                                                w_emn=float(w_emn), w_emp=float(w_emp), tz_lr_mix=tz_lr_mix)
    return best_auc, best_cfg, tried

# 1) Full-mask (reconfirm best on all validated indices)
auc_full, cfg_full, tried_full = search_best(mask_full)
print(f'[Full] tried={tried_full} | best OOF(z) AUC={auc_full:.5f} | cfg={ {k:v for k,v in cfg_full.items() if k!="tz_lr_mix"} }')

# 2) Last-2 blocks only
auc_last2, cfg_last2, tried_last2 = search_best(mask_last2)
print(f'[Last2] tried={tried_last2} | best OOF(z,last2) AUC={auc_last2:.5f} | cfg={ {k:v for k,v in cfg_last2.items() if k!="tz_lr_mix"} }')

# 3) Time-decayed objective over validated (gamma in {0.90,0.95,0.98})
best_gamma, best_auc_g, best_cfg_g = None, -1.0, None
for gamma in [0.90, 0.95, 0.98]:
    # assign weights by block recency: later blocks higher weight
    w = np.zeros(n, dtype=np.float64)
    for bi in range(1, k):
        # older blocks get smaller weight; newer larger
        age = (k - 1) - bi  # bi in 1..5; age 4..0
        w[np.array(blocks[bi])] = (gamma ** age)
    auc_g, cfg_g, _ = search_best(mask_full, sample_weight=w)
    print(f'[Gamma {gamma}] best OOF(z,weighted) AUC={auc_g:.5f}')
    if auc_g > best_auc_g:
        best_auc_g, best_cfg_g, best_gamma = auc_g, cfg_g, gamma
print(f'[Gamma-best] gamma={best_gamma} | AUC={best_auc_g:.5f} | cfg={ {k:v for k,v in best_cfg_g.items() if k!="tz_lr_mix"} }')

# Build submissions for each variant + 15% shrink hedges
def build_and_save(tag, cfg):
    g = cfg['g']; tz_lr_mix = cfg['tz_lr_mix']
    w_lr, w_d1, w_d2, w_meta, w_emn, w_emp = cfg['w_lr'], cfg['w_d1'], cfg['w_d2'], cfg['w_meta'], cfg['w_emn'], cfg['w_emp']
    zt = (w_lr*tz_lr_mix + w_d1*tz_d1 + w_d2*tz_d2 + w_meta*tz_meta + w_emn*tz_emn + w_emp*tz_emp)
    pt = sigmoid(zt).astype(np.float32)
    pd.DataFrame({id_col: ids, target_col: pt}).to_csv(f'submission_7way_{tag}.csv', index=False)
    # Shrink hedge
    w_vec = np.array([w_lr, w_d1, w_d2, w_meta, w_emn, w_emp], dtype=np.float64)
    w_eq = np.ones_like(w_vec)/len(w_vec)
    alpha = 0.15
    w_shr = ((1.0 - alpha)*w_vec + alpha*w_eq); w_shr = (w_shr / w_shr.sum()).astype(np.float64)
    zt_shr = (w_shr[0]*tz_lr_mix + w_shr[1]*tz_d1 + w_shr[2]*tz_d2 + w_shr[3]*tz_meta + w_shr[4]*tz_emn + w_shr[5]*tz_emp)
    pt_shr = sigmoid(zt_shr).astype(np.float32)
    pd.DataFrame({id_col: ids, target_col: pt_shr}).to_csv(f'submission_7way_{tag}_shrunk.csv', index=False)

build_and_save('full', cfg_full)
build_and_save('last2', cfg_last2)
build_and_save(f'gamma{best_gamma:.2f}'.replace('.','p'), best_cfg_g)

# Promote gamma-best as primary per expert advice
prim = f'submission_7way_gamma{best_gamma:.2f}'.replace('.','p') + '.csv'
pd.read_csv(prim).to_csv('submission.csv', index=False)
print(f'Promoted {prim} to submission.csv')

Time-CV validated full: 2398/2878 | last2: 958


[Full] tried=972 | best OOF(z) AUC=0.68130 | cfg={'g': 0.97, 'w_lr': 0.24, 'w_d1': 0.15, 'w_d2': 0.15, 'w_meta': 0.22, 'w_emn': 0.12, 'w_emp': 0.12}


[Last2] tried=972 | best OOF(z,last2) AUC=0.64682 | cfg={'g': 0.97, 'w_lr': 0.22999999999999998, 'w_d1': 0.15, 'w_d2': 0.15, 'w_meta': 0.2, 'w_emn': 0.15, 'w_emp': 0.12}


[Gamma 0.95] best OOF(z,weighted) AUC=0.67821


[Gamma 0.98] best OOF(z,weighted) AUC=0.68007
[Gamma-best] gamma=0.98 | AUC=0.68007 | cfg={'g': 0.97, 'w_lr': 0.24, 'w_d1': 0.15, 'w_d2': 0.15, 'w_meta': 0.22, 'w_emn': 0.12, 'w_emp': 0.12}
Promoted submission_7way_gamma0p98.csv to submission.csv


In [48]:
# S31: Block-5 optimizer + per-model recent interpolation (r in {0.24,0.30}) + gamma-weighted; hedged submissions (compact)
import numpy as np, pandas as pd, os
from sklearn.metrics import roc_auc_score
id_col='request_id'; target_col='requester_received_pizza'
train=pd.read_json('train.json'); test=pd.read_json('test.json')
y=train[target_col].astype(int).values; ids=test[id_col].values
def to_logit(p,eps=1e-6): p=np.clip(p.astype(np.float64),eps,1-eps); return np.log(p/(1-p))
def sigmoid(z): return 1/(1+np.exp(-z))
# 6-block forward-chaining, use last validated block (blocks[5])
order=np.argsort(train['unix_timestamp_of_request'].values); k=6; blocks=np.array_split(order,k); n=len(train)
mask_blk5=np.zeros(n,bool); mask_blk5[np.array(blocks[5])]=True
print('Block-5 size:', int(mask_blk5.sum()), '/', n)
# Load base OOF/test
o_lr_w=np.load('oof_lr_time_withsub_meta.npy'); t_lr_w=np.load('test_lr_time_withsub_meta.npy')
o_lr_ns=np.load('oof_lr_time_nosub_meta.npy'); t_lr_ns=np.load('test_lr_time_nosub_meta.npy')
o_d1=np.load('oof_xgb_dense_time.npy'); t_d1=np.load('test_xgb_dense_time.npy')
o_d2=np.load('oof_xgb_dense_time_v2.npy'); t_d2=np.load('test_xgb_dense_time_v2.npy')
o_meta=np.load('oof_xgb_meta_time.npy'); t_meta=np.load('test_xgb_meta_time.npy')
o_emn=np.load('oof_xgb_emb_meta_time.npy'); t_emn=np.load('test_xgb_emb_meta_time.npy')
o_emp=np.load('oof_xgb_emb_mpnet_time.npy'); t_emp=np.load('test_xgb_emb_mpnet_time.npy')
# Logits
z_lr_w,z_lr_ns=to_logit(o_lr_w),to_logit(o_lr_ns); z_d1,z_d2,z_meta=to_logit(o_d1),to_logit(o_d2),to_logit(o_meta)
z_emn,z_emp=to_logit(o_emn),to_logit(o_emp)
tz_lr_w,tz_lr_ns=to_logit(t_lr_w),to_logit(t_lr_ns); tz_d1,tz_d2,tz_meta=to_logit(t_d1),to_logit(t_d2),to_logit(t_meta)
tz_emn,tz_emp=to_logit(t_emn),to_logit(t_emp)
# Recent test-only (avg of recent35/45) for eligible models
def load_recent_avg_logit(prefix):
    arrs=[]
    for suf in ['_recent35.npy','_recent45.npy']:
        p=prefix+suf
        if os.path.exists(p):
            try: arrs.append(to_logit(np.load(p)))
            except: pass
    return (np.mean(arrs,axis=0).astype(np.float64)) if arrs else None
tz_lr_w_r=load_recent_avg_logit('test_lr_time_withsub_meta')
tz_lr_ns_r=load_recent_avg_logit('test_lr_time_nosub_meta')
tz_emn_r=load_recent_avg_logit('test_xgb_emb_meta_time')
tz_emp_r=load_recent_avg_logit('test_xgb_emb_mpnet_time')
print('Recent availability:', {'lr_w':tz_lr_w_r is not None,'lr_ns':tz_lr_ns_r is not None,'emn':tz_emn_r is not None,'emp':tz_emp_r is not None})
# Grids per expert
g_grid=[0.92,0.94,0.96,0.98]
d_tot_grid=[0.10,0.14,0.18,0.22]; v1_frac_grid=[0.5,0.6,0.7]
meta_grid=[0.16,0.18,0.20,0.22]; emn_grid=[0.10,0.12,0.14]; emp_grid=[0.08,0.10,0.12]
def search_blk5():
    best_auc=-1.0; best=None; tried=0
    for g in g_grid:
        z_lr_mix=(1-g)*z_lr_w+g*z_lr_ns
        for d_tot in d_tot_grid:
            for v1f in v1_frac_grid:
                w_d1=d_tot*v1f; w_d2=d_tot-w_d1
                if w_d1<0 or w_d2<0: continue
                for w_meta in meta_grid:
                    for w_emn in emn_grid:
                        for w_emp in emp_grid:
                            w_sum=d_tot+w_meta+w_emn+w_emp; w_lr=1.0-w_sum
                            if w_lr<=0 or w_lr>=1: continue
                            if not (0.24<=w_lr<=0.50): continue
                            z=w_lr*z_lr_mix+w_d1*z_d1+w_d2*z_d2+w_meta*z_meta+w_emn*z_emn+w_emp*z_emp
                            auc=roc_auc_score(y[mask_blk5], z[mask_blk5]); tried+=1
                            if auc>best_auc:
                                best_auc=auc; best=dict(g=float(g),w_lr=float(w_lr),w_d1=float(w_d1),w_d2=float(w_d2),w_meta=float(w_meta),w_emn=float(w_emn),w_emp=float(w_emp))
    print('Blk5 tried', tried, '| best AUC', f'{best_auc:.5f}', '|', best); return best
cfg=search_blk5()
# Alpha grids (test-only interpolation); set to [0] if no recent
aLR=[0.15,0.25,0.35] if (tz_lr_w_r is not None or tz_lr_ns_r is not None) else [0.0]
aMN=[0.20,0.30,0.40] if (tz_emn_r is not None) else [0.0]
aMP=[0.10,0.20,0.30] if (tz_emp_r is not None) else [0.0]
def pick_alphas(cfg,r,tol=0.02):
    wl,wmn,wmp=cfg['w_lr'],cfg['w_emn'],cfg['w_emp']
    best=None; best_err=1e9; best_sum=9e9
    for al in aLR:
        for amn in aMN:
            for amp in aMP:
                s=wl*al+wmn*amn+wmp*amp; err=abs(s-r); sm=al+amn+amp
                if (err<best_err) or (abs(err-best_err)<1e-12 and sm<best_sum):
                    best_err=err; best_sum=sm; best=(al,amn,amp)
    return best
def build_sub(tag,cfg,alphas):
    g=cfg['g']
    tz_lr_mix_full=(1-g)*tz_lr_w+g*tz_lr_ns
    z_w_r=tz_lr_w_r if tz_lr_w_r is not None else tz_lr_w
    z_ns_r=tz_lr_ns_r if tz_lr_ns_r is not None else tz_lr_ns
    tz_lr_mix_recent=(1-g)*z_w_r+g*z_ns_r
    a_lr,a_mn,a_mp=alphas
    tz_lr_mix=(1-a_lr)*tz_lr_mix_full+a_lr*tz_lr_mix_recent
    tz_mn=(1-a_mn)*tz_emn + a_mn*(tz_emn_r if tz_emn_r is not None else tz_emn)
    tz_mp=(1-a_mp)*tz_emp + a_mp*(tz_emp_r if tz_emp_r is not None else tz_emp)
    zt=cfg['w_lr']*tz_lr_mix+cfg['w_d1']*tz_d1+cfg['w_d2']*tz_d2+cfg['w_meta']*tz_meta+cfg['w_emn']*tz_mn+cfg['w_emp']*tz_mp
    pt=sigmoid(zt).astype(np.float32)
    fn=f'submission_blk5_{tag}.csv'; pd.DataFrame({id_col:ids,target_col:pt}).to_csv(fn,index=False); print('Wrote',fn,'mean',float(pt.mean()))
alphas24=pick_alphas(cfg,0.24); alphas30=pick_alphas(cfg,0.30)
print('alphas r24/r30:', alphas24, alphas30)
build_sub('r24', cfg, alphas24)
build_sub('r30', cfg, alphas30)
# Gamma-weighted variant over validated blocks (no recent interpolation)
def search_gamma():
    best=None; best_auc=-1; best_gm=None
    for gamma in [0.995,0.998,0.9995]:
        w=np.zeros(n,np.float64)
        for bi in range(1,k):
            age=(k-1)-bi; w[np.array(blocks[bi])]=gamma**age
        best_auc_g=-1; best_cfg_g=None
        for g in g_grid:
            z_lr_mix=(1-g)*z_lr_w+g*z_lr_ns
            for d_tot in d_tot_grid:
                for v1f in v1_frac_grid:
                    w_d1=d_tot*v1f; w_d2=d_tot-w_d1
                    if w_d1<0 or w_d2<0: continue
                    for w_meta in meta_grid:
                        for w_emn in emn_grid:
                            for w_emp in emp_grid:
                                w_sum=d_tot+w_meta+w_emn+w_emp; w_lr=1-w_sum
                                if w_lr<=0 or w_lr>=1 or not (0.24<=w_lr<=0.50): continue
                                z=w_lr*z_lr_mix+w_d1*z_d1+w_d2*z_d2+w_meta*z_meta+w_emn*z_emn+w_emp*z_emp
                                auc=roc_auc_score(y, z, sample_weight=w)
                                if auc>best_auc_g: best_auc_g=auc; best_cfg_g=dict(g=float(g),w_lr=float(w_lr),w_d1=float(w_d1),w_d2=float(w_d2),w_meta=float(w_meta),w_emn=float(w_emn),w_emp=float(w_emp))
        print('[gamma',gamma,'] best',f'{best_auc_g:.5f}',best_cfg_g)
        if best_auc_g>best_auc: best_auc=best_auc_g; best=best_cfg_g; best_gm=gamma
    return best_gm,best_auc,best
gm,aucg,cfg_g=search_gamma()
print('Gamma-best:', gm, aucg, cfg_g)
g=cfg_g['g']; tz_lr_mix=(1-g)*tz_lr_w+g*tz_lr_ns
zt=cfg_g['w_lr']*tz_lr_mix+cfg_g['w_d1']*tz_d1+cfg_g['w_d2']*tz_d2+cfg_g['w_meta']*tz_meta+cfg_g['w_emn']*tz_emn+cfg_g['w_emp']*tz_emp
pt=sigmoid(zt).astype(np.float32)
fn=f'submission_blk5_gamma{str(gm).replace(".","p")}.csv'; pd.DataFrame({id_col:ids,target_col:pt}).to_csv(fn,index=False); print('Wrote',fn)
# Hedged logit-averages
def load_sub(path): return pd.read_csv(path)[target_col].values.astype(np.float64)
def logit_avg(paths,out):
    z=np.mean([to_logit(load_sub(p)) for p in paths],axis=0); p=sigmoid(z).astype(np.float32)
    pd.DataFrame({id_col:ids,target_col:p}).to_csv(out,index=False); print('Wrote',out,'mean',float(p.mean()))
s_r24='submission_blk5_r24.csv'; s_r30='submission_blk5_r30.csv'; s_g=f'submission_blk5_gamma{str(gm).replace(".","p")}.csv'
logit_avg([s_r24,s_g],'submission_logitavg_r24_gamma.csv')
logit_avg([s_r24,s_r30,s_g],'submission_logitavg_r24_r30_gamma.csv')
pd.read_csv('submission_logitavg_r24_gamma.csv').to_csv('submission.csv', index=False)
print('Promoted submission_logitavg_r24_gamma.csv to submission.csv')

Block-5 size: 479 / 2878
Recent availability: {'lr_w': False, 'lr_ns': False, 'emn': False, 'emp': False}


Blk5 tried 1596 | best AUC 0.65235 | {'g': 0.98, 'w_lr': 0.45999999999999996, 'w_d1': 0.05, 'w_d2': 0.05, 'w_meta': 0.22, 'w_emn': 0.14, 'w_emp': 0.08}
alphas r24/r30: (0.0, 0.0, 0.0) (0.0, 0.0, 0.0)
Wrote submission_blk5_r24.csv mean 0.38267284631729126
Wrote submission_blk5_r30.csv mean 0.38267284631729126


[gamma 0.995 ] best 0.68111 {'g': 0.98, 'w_lr': 0.29999999999999993, 'w_d1': 0.154, 'w_d2': 0.066, 'w_meta': 0.22, 'w_emn': 0.14, 'w_emp': 0.12}


[gamma 0.998 ] best 0.68129 {'g': 0.98, 'w_lr': 0.29999999999999993, 'w_d1': 0.154, 'w_d2': 0.066, 'w_meta': 0.22, 'w_emn': 0.14, 'w_emp': 0.12}


[gamma 0.9995 ] best 0.68138 {'g': 0.98, 'w_lr': 0.29999999999999993, 'w_d1': 0.154, 'w_d2': 0.066, 'w_meta': 0.22, 'w_emn': 0.14, 'w_emp': 0.12}
Gamma-best: 0.9995 0.6813808736215854 {'g': 0.98, 'w_lr': 0.29999999999999993, 'w_d1': 0.154, 'w_d2': 0.066, 'w_meta': 0.22, 'w_emn': 0.14, 'w_emp': 0.12}
Wrote submission_blk5_gamma0p9995.csv
Wrote submission_logitavg_r24_gamma.csv mean 0.3966662585735321
Wrote submission_logitavg_r24_r30_gamma.csv mean 0.3919568359851837
Promoted submission_logitavg_r24_gamma.csv to submission.csv


In [49]:
# S32: Inspect available recent* artifacts to wire correct filenames
import os, re, json, glob
cands = sorted([f for f in os.listdir('.') if f.endswith('.npy') and ('recent' in f.lower() or re.search(r'recent\d+', f.lower()))])
print('Found recent-like npy files:', len(cands))
for f in cands[:200]:
    print(f)
# Also show any test_*_recent* explicitly
print('\nExplicit test_*_recent* matches:')
for f in sorted(glob.glob('test_*_recent*.npy')):
    print(f)

Found recent-like npy files: 6
test_catboost_textmeta_v2_recent35.npy
test_catboost_textmeta_v2_recent45.npy
test_lr_nosub_meta_recent35.npy
test_lr_nosub_meta_recent45.npy
test_xgb_minilm_meta_recent35.npy
test_xgb_minilm_meta_recent45.npy

Explicit test_*_recent* matches:
test_catboost_textmeta_v2_recent35.npy
test_catboost_textmeta_v2_recent45.npy
test_lr_nosub_meta_recent35.npy
test_lr_nosub_meta_recent45.npy
test_xgb_minilm_meta_recent35.npy
test_xgb_minilm_meta_recent45.npy


In [50]:
# S33: Reload recent test-only using discovered filenames and rebuild r24/r30 + hedges
import numpy as np, pandas as pd, os
def to_logit(p,eps=1e-6): p=np.clip(p.astype(np.float64),eps,1-eps); return np.log(p/(1-p))
def sigmoid(z): return 1/(1+np.exp(-z))
def load_recent_avg_logit(prefix):
    arrs=[]
    for suf in ['_recent35.npy','_recent45.npy']:
        p=prefix+suf
        if os.path.exists(p):
            try: arrs.append(to_logit(np.load(p)))
            except: pass
    return (np.mean(arrs,axis=0).astype(np.float64)) if arrs else None
# Try alternative recent prefixes discovered by S32
tz_lr_ns_r_alt = load_recent_avg_logit('test_lr_nosub_meta')
tz_emn_r_alt   = load_recent_avg_logit('test_xgb_minilm_meta')
# Override globals from S31 if better recent found
try:
    tz_lr_ns_r = tz_lr_ns_r_alt if (tz_lr_ns_r_alt is not None) else tz_lr_ns_r
except NameError:
    tz_lr_ns_r = tz_lr_ns_r_alt
try:
    tz_emn_r = tz_emn_r_alt if (tz_emn_r_alt is not None) else tz_emn_r
except NameError:
    tz_emn_r = tz_emn_r_alt
print('Updated recent availability:', {
    'lr_ns': tz_lr_ns_r is not None,
    'minilm': tz_emn_r is not None
})
# Recompute alphas for r=0.24 and r=0.30 using existing cfg and pick_alphas from S31
alphas24 = pick_alphas(cfg, 0.24); alphas30 = pick_alphas(cfg, 0.30)
print('New alphas r24/r30:', alphas24, alphas30)
# Rebuild submissions using existing build_sub from S31
build_sub('r24_alt', cfg, alphas24)
build_sub('r30_alt', cfg, alphas30)
# Rebuild hedges with gamma-best from S31 and new r24_alt/r30_alt
def load_sub(path): return pd.read_csv(path)['requester_received_pizza'].values.astype(np.float64)
def logit_avg(paths,out, ids_col):
    z=np.mean([to_logit(load_sub(p)) for p in paths],axis=0); p=sigmoid(z).astype(np.float32)
    pd.DataFrame({ids_col: pd.read_json('test.json')['request_id'].values, 'requester_received_pizza': p}).to_csv(out,index=False)
    print('Wrote',out,'mean',float(p.mean()))
s_r24_alt='submission_blk5_r24_alt.csv'; s_r30_alt='submission_blk5_r30_alt.csv'; s_g=f'submission_blk5_gamma{str(gm).replace(".","p")}.csv'
logit_avg([s_r24_alt,s_g],'submission_logitavg_r24alt_gamma.csv','request_id')
logit_avg([s_r24_alt,s_r30_alt,s_g],'submission_logitavg_r24alt_r30alt_gamma.csv','request_id')
# Promote the 2-way hedge
pd.read_csv('submission_logitavg_r24alt_gamma.csv').to_csv('submission.csv', index=False)
print('Promoted submission_logitavg_r24alt_gamma.csv to submission.csv')

Updated recent availability: {'lr_ns': True, 'minilm': True}
New alphas r24/r30: (0.0, 0.0, 0.0) (0.0, 0.0, 0.0)
Wrote submission_blk5_r24_alt.csv mean 0.38267284631729126
Wrote submission_blk5_r30_alt.csv mean 0.38267284631729126
Wrote submission_logitavg_r24alt_gamma.csv mean 0.3966662585735321
Wrote submission_logitavg_r24alt_r30alt_gamma.csv mean 0.3919568359851837
Promoted submission_logitavg_r24alt_gamma.csv to submission.csv


In [51]:
# S34: Recompute alpha grids based on updated recent availability and rebuild submissions/hedges
import numpy as np, pandas as pd
def to_logit(p,eps=1e-6): p=np.clip(p.astype(np.float64),eps,1-eps); return np.log(p/(1-p))
def sigmoid(z): return 1/(1+np.exp(-z))
# Recompute alpha grids now that tz_*_r were updated in S33
aLR=[0.15,0.25,0.35] if (('tz_lr_w_r' in globals() and tz_lr_w_r is not None) or ('tz_lr_ns_r' in globals() and tz_lr_ns_r is not None)) else [0.0]
aMN=[0.20,0.30,0.40] if ('tz_emn_r' in globals() and tz_emn_r is not None) else [0.0]
aMP=[0.10,0.20,0.30] if ('tz_emp_r' in globals() and tz_emp_r is not None) else [0.0]
print('Alpha grids set:', {'LR':aLR, 'MiniLM':aMN, 'MPNet':aMP})
def pick_alphas(cfg,r,tol=0.02):
    wl,wmn,wmp=cfg['w_lr'],cfg['w_emn'],cfg['w_emp']
    best=None; best_err=1e9; best_sum=9e9
    for al in aLR:
        for amn in aMN:
            for amp in aMP:
                s=wl*al+wmn*amn+wmp*amp; err=abs(s-r); sm=al+amn+amp
                if (err<best_err) or (abs(err-best_err)<1e-12 and sm<best_sum):
                    best_err=err; best_sum=sm; best=(al,amn,amp)
    return best
def build_sub(tag,cfg,alphas):
    g=cfg['g']
    tz_lr_mix_full=(1-g)*tz_lr_w+g*tz_lr_ns
    z_w_r=tz_lr_w_r if ('tz_lr_w_r' in globals() and tz_lr_w_r is not None) else tz_lr_w
    z_ns_r=tz_lr_ns_r if ('tz_lr_ns_r' in globals() and tz_lr_ns_r is not None) else tz_lr_ns
    tz_lr_mix_recent=(1-g)*z_w_r+g*z_ns_r
    a_lr,a_mn,a_mp=alphas
    tz_lr_mix=(1-a_lr)*tz_lr_mix_full+a_lr*tz_lr_mix_recent
    tz_mn=(1-a_mn)*tz_emn + a_mn*((tz_emn_r if ('tz_emn_r' in globals()) else None) if (('tz_emn_r' in globals()) and (tz_emn_r is not None)) else tz_emn)
    tz_mp=(1-a_mp)*tz_emp + a_mp*((tz_emp_r if ('tz_emp_r' in globals()) else None) if (('tz_emp_r' in globals()) and (tz_emp_r is not None)) else tz_emp)
    zt=cfg['w_lr']*tz_lr_mix+cfg['w_d1']*tz_d1+cfg['w_d2']*tz_d2+cfg['w_meta']*tz_meta+cfg['w_emn']*tz_mn+cfg['w_emp']*tz_mp
    pt=sigmoid(zt).astype(np.float32)
    fn=f'submission_blk5_{tag}.csv'; pd.DataFrame({'request_id':pd.read_json('test.json')['request_id'].values,'requester_received_pizza':pt}).to_csv(fn,index=False); print('Wrote',fn,'mean',float(pt.mean()))
# Use cfg from S31 (block-5 best) for alpha targeting
alphas24=pick_alphas(cfg,0.24); alphas30=pick_alphas(cfg,0.30)
print('Recomputed alphas r24/r30:', alphas24, alphas30)
build_sub('r24_alt2', cfg, alphas24)
build_sub('r30_alt2', cfg, alphas30)
# Hedge with existing gamma-best
def load_sub(path): return pd.read_csv(path)['requester_received_pizza'].values.astype(np.float64)
def logit_avg(paths,out):
    z=np.mean([to_logit(load_sub(p)) for p in paths],axis=0); p=sigmoid(z).astype(np.float32)
    pd.DataFrame({'request_id':pd.read_json('test.json')['request_id'].values,'requester_received_pizza':p}).to_csv(out,index=False); print('Wrote',out,'mean',float(p.mean()))
s_r24='submission_blk5_r24_alt2.csv'; s_r30='submission_blk5_r30_alt2.csv'; s_g=f'submission_blk5_gamma{str(gm).replace(".","p")}.csv'
logit_avg([s_r24,s_g],'submission_logitavg_r24alt2_gamma.csv')
logit_avg([s_r24,s_r30,s_g],'submission_logitavg_r24alt2_r30alt2_gamma.csv')
pd.read_csv('submission_logitavg_r24alt2_gamma.csv').to_csv('submission.csv', index=False)
print('Promoted submission_logitavg_r24alt2_gamma.csv to submission.csv')

Alpha grids set: {'LR': [0.15, 0.25, 0.35], 'MiniLM': [0.2, 0.3, 0.4], 'MPNet': [0.0]}
Recomputed alphas r24/r30: (0.35, 0.4, 0.0) (0.35, 0.4, 0.0)
Wrote submission_blk5_r24_alt2.csv mean 0.34550267457962036
Wrote submission_blk5_r30_alt2.csv mean 0.34550267457962036
Wrote submission_logitavg_r24alt2_gamma.csv mean 0.37762686610221863
Wrote submission_logitavg_r24alt2_r30alt2_gamma.csv mean 0.3667590320110321
Promoted submission_logitavg_r24alt2_gamma.csv to submission.csv


In [52]:
# S35: Promote safer 2-way hedge (r24 + gamma) to submission.csv per expert advice
import pandas as pd
src = 'submission_logitavg_r24_gamma.csv'
df = pd.read_csv(src)
df.to_csv('submission.csv', index=False)
print('Promoted', src, 'to submission.csv | mean p =', float(df['requester_received_pizza'].mean()))

Promoted submission_logitavg_r24_gamma.csv to submission.csv | mean p = 0.3966662772977625


In [53]:
# S36: Conservative recent interpolation using gamma-best weights; build r24/r30 small-alpha variants and hedges
import numpy as np, pandas as pd, os
def to_logit(p,eps=1e-6): p=np.clip(p.astype(np.float64),eps,1-eps); return np.log(p/(1-p))
def sigmoid(z): return 1/(1+np.exp(-z))
id_col='request_id'; target_col='requester_received_pizza'
test_ids = pd.read_json('test.json')[id_col].values
# Ensure gamma-best cfg is available from S31
assert 'cfg_g' in globals(), 'gamma-best cfg not found; run S31 first'
g = cfg_g['g']
# Use existing full-history logits from S31
# tz_lr_w, tz_lr_ns, tz_d1, tz_d2, tz_meta, tz_emn, tz_emp must exist
tz_lr_mix_full = (1-g)*tz_lr_w + g*tz_lr_ns
# Recent logits loaded in S31/S33: tz_lr_w_r, tz_lr_ns_r, tz_emn_r (MPNet none)
z_w_r = tz_lr_w_r if ('tz_lr_w_r' in globals() and tz_lr_w_r is not None) else tz_lr_w
z_ns_r = tz_lr_ns_r if ('tz_lr_ns_r' in globals() and tz_lr_ns_r is not None) else tz_lr_ns
tz_lr_mix_recent = (1-g)*z_w_r + g*z_ns_r
tz_mn_recent = tz_emn_r if ('tz_emn_r' in globals() and tz_emn_r is not None) else tz_emn
# Small alpha grids per expert
aLR = [0.05, 0.10, 0.15] if (('tz_lr_w_r' in globals() and tz_lr_w_r is not None) or ('tz_lr_ns_r' in globals() and tz_lr_ns_r is not None)) else [0.0]
aMN = [0.05, 0.10, 0.15] if ('tz_emn_r' in globals() and tz_emn_r is not None) else [0.0]
aMP = [0.0]  # no MPNet recent
print('Small alpha grids:', {'LR': aLR, 'MiniLM': aMN, 'MPNet': aMP})
def pick_alphas_gamma(cfg, r, tol=0.02):
    wl, wmn, wmp = cfg['w_lr'], cfg['w_emn'], cfg['w_emp']
    best = None; best_err = 1e9; best_sum = 1e9
    for al in aLR:
        for amn in aMN:
            for amp in aMP:
                s = wl*al + wmn*amn + wmp*amp
                err = abs(s - r); sm = al + amn + amp
                if (err < best_err) or (abs(err - best_err) < 1e-12 and sm < best_sum):
                    best_err = err; best_sum = sm; best = (al, amn, amp)
    return best
def build_with_cfg(tag, cfg, alphas):
    a_lr, a_mn, a_mp = alphas
    tz_lr_mix = (1-a_lr)*tz_lr_mix_full + a_lr*tz_lr_mix_recent
    tz_mn = (1-a_mn)*tz_emn + a_mn*tz_mn_recent
    # MPNet unchanged (no recent)
    zt = (cfg['w_lr']*tz_lr_mix + cfg['w_d1']*tz_d1 + cfg['w_d2']*tz_d2 + cfg['w_meta']*tz_meta + cfg['w_emn']*tz_mn + cfg['w_emp']*tz_emp)
    pt = sigmoid(zt).astype(np.float32)
    fn = f'submission_blk5_{tag}.csv'; pd.DataFrame({id_col: test_ids, target_col: pt}).to_csv(fn, index=False); print('Wrote', fn, 'mean', float(pt.mean()))
# Compute alphas against gamma-best weights
alphas_r24 = pick_alphas_gamma(cfg_g, 0.24); alphas_r30 = pick_alphas_gamma(cfg_g, 0.30)
print('Conservative alphas r24/r30 (gamma-weights):', alphas_r24, alphas_r30)
build_with_cfg('r24_small', cfg_g, alphas_r24)
build_with_cfg('r30_small', cfg_g, alphas_r30)
# Build hedges with gamma-best submission (no recent interpolation)
def load_sub(path): return pd.read_csv(path)[target_col].values.astype(np.float64)
def logit_avg(paths, out):
    z = np.mean([to_logit(load_sub(p)) for p in paths], axis=0); p = sigmoid(z).astype(np.float32)
    pd.DataFrame({id_col: test_ids, target_col: p}).to_csv(out, index=False); print('Wrote', out, 'mean', float(p.mean()))
gamma_path = f'submission_blk5_gamma{str(gm).replace(".","p")}.csv'
logit_avg(['submission_blk5_r24_small.csv', gamma_path], 'submission_logitavg_r24small_gamma.csv')
logit_avg(['submission_blk5_r24_small.csv', 'submission_blk5_r30_small.csv', gamma_path], 'submission_logitavg_r24small_r30small_gamma.csv')
print('Note: submission.csv unchanged; primary remains submission_logitavg_r24_gamma.csv')

Small alpha grids: {'LR': [0.05, 0.1, 0.15], 'MiniLM': [0.05, 0.1, 0.15], 'MPNet': [0.0]}
Conservative alphas r24/r30 (gamma-weights): (0.15, 0.15, 0.0) (0.15, 0.15, 0.0)
Wrote submission_blk5_r24_small.csv mean 0.3995879590511322
Wrote submission_blk5_r30_small.csv mean 0.3995879590511322
Wrote submission_logitavg_r24small_gamma.csv mean 0.4053078293800354
Wrote submission_logitavg_r24small_r30small_gamma.csv mean 0.4033985137939453
Note: submission.csv unchanged; primary remains submission_logitavg_r24_gamma.csv


In [54]:
# S37: Promote conservative small-alpha 2-way hedge to submission.csv
import pandas as pd
src = 'submission_logitavg_r24small_gamma.csv'
df = pd.read_csv(src)
df.to_csv('submission.csv', index=False)
print('Promoted', src, 'to submission.csv | mean p =', float(df['requester_received_pizza'].mean()))

Promoted submission_logitavg_r24small_gamma.csv to submission.csv | mean p = 0.4053078408777969
