In [3]:
# CE CPU smoke test: cross-encoder/ms-marco-MiniLM-L-6-v2 with CPC-prepended inputs
import os, sys, time, math, numpy as np, pandas as pd
from pathlib import Path

# 1) Install compatible deps (pin transformers/tokenizers; honor torch constraints to avoid drift)
import subprocess
pip_args = [sys.executable, '-m', 'pip', 'install', '-q', '-c', 'constraints.txt',
            'transformers==4.44.2', 'tokenizers==0.19.1', 'sentence-transformers==2.7.0',
            '--upgrade', '--upgrade-strategy', 'only-if-needed']
subprocess.run(pip_args, check=True)
from sentence_transformers import CrossEncoder
from scipy.stats import pearsonr

SEED = 42
np.random.seed(SEED)

# 2) Load data and folds
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
folds = pd.read_csv('folds_by_id.csv')  # id, fold (StratifiedGroupKFold on original train.csv, group=anchor)
train = train.merge(folds, on='id', how='left', validate='one_to_one')
assert train['fold'].notna().all(), 'Fold merge by id failed'
train['fold'] = train['fold'].astype(int)

# 3) Build CPC-prepended input pairs
def pairify(df: pd.DataFrame):
    ctx = df['context'].astype(str).tolist()
    a = df['anchor'].astype(str).tolist()
    b = df['target'].astype(str).tolist()
    s1 = [f"[CPC {c}] {aa}" for c, aa in zip(ctx, a)]
    s2 = [f"[CPC {c}] {bb}" for c, bb in zip(ctx, b)]
    return list(zip(s1, s2))

# 4) 2k/2-fold smoke OOF using existing folds parity (even vs odd) to define 2 splits
# Choose up to 2000 rows (stratified by fold parity) for a quick diagnostic
train = train.sample(frac=1.0, random_state=SEED).reset_index(drop=True)
parity = (train['fold'] % 2).values  # 0 or 1
idx0 = np.where(parity == 0)[0]
idx1 = np.where(parity == 1)[0]
n0 = min(1000, len(idx0))
n1 = min(1000, len(idx1))
sel_idx0 = idx0[:n0]
sel_idx1 = idx1[:n1]
sel_idx = np.concatenate([sel_idx0, sel_idx1])
sel = train.iloc[sel_idx].reset_index(drop=True)
sel_parity = (sel['fold'] % 2).values

# 5) Load CE model on CPU
model_name = 'cross-encoder/ms-marco-MiniLM-L-6-v2'
print('Loading CE model:', model_name, 'on CPU ...', flush=True)
t0 = time.time()
ce = CrossEncoder(model_name, max_length=128, device='cpu')
print('Loaded in', round(time.time()-t0, 2), 's', flush=True)

APPLY_SIGMOID = True  # ms-marco models are often trained with BCE; sigmoid can improve correlation
def _sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def predict_pairs(pairs, batch_size=32):
    # CrossEncoder returns float scores; optionally pass through sigmoid
    p = ce.predict(pairs, batch_size=batch_size, show_progress_bar=False).astype(np.float32)
    if APPLY_SIGMOID:
        p = _sigmoid(p).astype(np.float32)
    return p

# 6) 2-fold OOF on the selected subset
y = sel['score'].astype(np.float32).values
oof = np.zeros(len(sel), dtype=np.float32)
for f in [0, 1]:
    va_idx = np.where(sel_parity == f)[0]
    if len(va_idx) == 0:
        continue
    pairs_va = pairify(sel.iloc[va_idx])
    t1 = time.time()
    preds_va = predict_pairs(pairs_va, batch_size=32)
    oof[va_idx] = preds_va
    r = pearsonr(preds_va, y[va_idx])[0]
    print(f'[Smoke CE FoldParity={f}] val_n={len(va_idx)} raw r={r:.6f}; elapsed {time.time()-t1:.1f}s', flush=True)

r_all = pearsonr(oof, y)[0] if np.std(oof) > 0 else float('nan')
print('Smoke CE OOF Pearson (2k/2-fold parity subset):', round(float(r_all), 6))
print('Pred stats: min', float(oof.min()), 'max', float(oof.max()), 'mean', float(oof.mean()))

# 7) Sanity orientation: if negative correlation but decent magnitude, flip sign (diagnostic only)
if np.isfinite(r_all) and r_all < 0 and abs(r_all) > 0.3:
    oof = -oof
    r_all = pearsonr(oof, y)[0]
    print('Flipped sign. New r:', round(float(r_all), 6))

print('DONE smoke test. If r >= 0.65, proceed to full 5-fold OOF+test generation in next cell.', flush=True)

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datasets 2.21.0 requires fsspec[http]<=2024.6.1,>=2023.1.0, but you have fsspec 2025.9.0 which is incompatible.


Loading CE model: cross-encoder/ms-marco-MiniLM-L-6-v2 on CPU ...


Loaded in 0.36 s


[Smoke CE FoldParity=0] val_n=1000 raw r=0.188156; elapsed 1.1s


[Smoke CE FoldParity=1] val_n=1000 raw r=0.222761; elapsed 1.2s


Smoke CE OOF Pearson (2k/2-fold parity subset): 0.206057
Pred stats: min 0.08257093280553818 max 0.9999490976333618 mean 0.9687997698783875
DONE smoke test. If r >= 0.65, proceed to full 5-fold OOF+test generation in next cell.


In [5]:
# Full 5-fold CE OOF + test generation (strict [id,pred] schema) using BAAI/bge-reranker-base (CPU), no CPC, sigmoid
import sys, time, numpy as np, pandas as pd
from pathlib import Path
from scipy.stats import pearsonr
from sentence_transformers import CrossEncoder

SEED = 42
np.random.seed(SEED)

# Load data and folds
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
folds = pd.read_csv('folds_by_id.csv')
train = train.merge(folds, on='id', how='left', validate='one_to_one')
assert train['fold'].notna().all(), 'Fold merge by id failed'
train['fold'] = train['fold'].astype(int)
NUM_FOLDS = int(train['fold'].max()) + 1

def pairify_no_cpc(df: pd.DataFrame):
    a = df['anchor'].astype(str).tolist()
    b = df['target'].astype(str).tolist()
    return list(zip(a, b))

def _sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

model_name = 'BAAI/bge-reranker-base'  # best from smoke test on CPU
print('Loading CE model:', model_name, 'on CPU ...', flush=True)
t0 = time.time()
ce = CrossEncoder(model_name, max_length=128, device='cpu')
print('Loaded in', round(time.time()-t0, 2), 's', flush=True)

y = train['score'].astype(np.float32).values
oof = np.zeros(len(train), dtype=np.float32)
te_acc = np.zeros(len(test), dtype=np.float64)

for f in range(NUM_FOLDS):
    f0 = time.time()
    va_idx = np.where(train['fold'].values == f)[0]
    va_df = train.iloc[va_idx]
    pairs_va = pairify_no_cpc(va_df)
    logits_va = ce.predict(pairs_va, batch_size=32, show_progress_bar=False).astype(np.float32)
    preds_va = _sigmoid(logits_va).astype(np.float32)
    oof[va_idx] = preds_va
    r = pearsonr(preds_va, y[va_idx])[0]
    print(f'[CE Fold {f}] n={len(va_idx)} raw(sigmoid) r={r:.6f}; elapsed {time.time()-f0:.1f}s', flush=True)
    # test predictions for this fold
    te_pairs = pairify_no_cpc(test)
    logits_te = ce.predict(te_pairs, batch_size=32, show_progress_bar=False).astype(np.float32)
    te_acc += _sigmoid(logits_te).astype(np.float64)

# Global OOF r
r_all = pearsonr(oof, y)[0] if np.std(oof) > 0 else float('nan')
print('CE OOF Pearson (sigmoid):', round(float(r_all), 6))

te_pred_mean = (te_acc / NUM_FOLDS).astype(np.float32)

# Save artifacts with strict schema [id, pred]
oof_df = pd.DataFrame({'id': train['id'], 'pred': oof.astype(np.float32)})
sub_df = pd.DataFrame({'id': test['id'], 'pred': np.clip(te_pred_mean, 0.0, 1.0)})
oof_path = 'oof_ce_minilm.csv'; sub_path = 'submission_ce_minilm.csv'
oof_df.to_csv(oof_path, index=False)
sub_df.to_csv(sub_path, index=False)

# Sanity checks
chk_oof = pd.read_csv(oof_path); chk_sub = pd.read_csv(sub_path)
assert list(chk_oof.columns) == ['id','pred'] and list(chk_sub.columns) == ['id','pred']
assert chk_oof['id'].nunique() == len(train) and chk_sub['id'].nunique() == len(test)
assert not chk_oof['pred'].isna().any() and not chk_sub['pred'].isna().any()
print('Saved', oof_path, 'and', sub_path, 'OK.')

Loading CE model: BAAI/bge-reranker-base on CPU ...


Loaded in 0.84 s


[CE Fold 0] n=6086 raw(sigmoid) r=0.444495; elapsed 18.7s


[CE Fold 1] n=6515 raw(sigmoid) r=0.449640; elapsed 19.6s


[CE Fold 2] n=6630 raw(sigmoid) r=0.436400; elapsed 20.6s


[CE Fold 3] n=6941 raw(sigmoid) r=0.454112; elapsed 19.1s


[CE Fold 4] n=6653 raw(sigmoid) r=0.440996; elapsed 18.7s


CE OOF Pearson (sigmoid): 0.445097
Saved oof_ce_minilm.csv and submission_ce_minilm.csv OK.


In [4]:
# Smoke test several alternative cross-encoders (CPU) and formats on the same 2k subset
import time
from sentence_transformers import CrossEncoder
from scipy.stats import pearsonr

assert 'sel' in globals() and 'sel_parity' in globals(), 'Run cell 0 first to define sel subset'

def make_pairs(df, use_cpc: bool):
    if use_cpc:
        ctx = df['context'].astype(str).tolist()
        a = df['anchor'].astype(str).tolist()
        b = df['target'].astype(str).tolist()
        s1 = [f"[CPC {c}] {aa}" for c, aa in zip(ctx, a)]
        s2 = [f"[CPC {c}] {bb}" for c, bb in zip(ctx, b)]
        return list(zip(s1, s2))
    else:
        return list(zip(df['anchor'].astype(str).tolist(), df['target'].astype(str).tolist()))

def _sigmoid(x):
    import numpy as np
    return 1.0 / (1.0 + np.exp(-x))

def eval_model(model_name: str, use_sigmoid: bool, use_cpc: bool, max_length: int = 128, batch_size: int = 32):
    print(f'\nModel={model_name} sigmoid={use_sigmoid} use_cpc={use_cpc}', flush=True)
    t0 = time.time()
    ce = CrossEncoder(model_name, max_length=max_length, device='cpu')
    print('Loaded in', round(time.time()-t0, 2), 's', flush=True)
    y = sel['score'].astype('float32').values
    oof = np.zeros(len(sel), dtype=np.float32)
    for f in [0,1]:
        va_idx = np.where(sel_parity == f)[0]
        if len(va_idx) == 0:
            continue
        pairs = make_pairs(sel.iloc[va_idx], use_cpc=use_cpc)
        t1 = time.time()
        preds = ce.predict(pairs, batch_size=batch_size, show_progress_bar=False).astype(np.float32)
        if use_sigmoid:
            preds = _sigmoid(preds).astype(np.float32)
        oof[va_idx] = preds
        r = pearsonr(preds, y[va_idx])[0]
        print(f'  FoldParity={f} r={r:.6f} elapsed={time.time()-t1:.1f}s', flush=True)
    r_all = pearsonr(oof, y)[0] if np.std(oof) > 0 else float('nan')
    print('  OOF r=', round(float(r_all), 6), 'min/max/mean=', float(oof.min()), float(oof.max()), float(oof.mean()), flush=True)
    return r_all

candidates = [
    ('cross-encoder/stsb-roberta-base', False),  # usually regression 0..1
    ('BAAI/bge-reranker-base', True),           # logits -> sigmoid
    ('jinaai/jina-reranker-v2-base-en', True),  # logits -> sigmoid
]

results = []
for name, use_sig in candidates:
    for use_cpc in (True, False):
        try:
            r = eval_model(name, use_sigmoid=use_sig, use_cpc=use_cpc, max_length=128, batch_size=32)
            results.append((name, use_sig, use_cpc, float(r)))
        except Exception as e:
            print('  ERROR for', name, 'use_cpc', use_cpc, ':', e, flush=True)

print('\nSummary:', results, flush=True)


Model=cross-encoder/stsb-roberta-base sigmoid=False use_cpc=True


  ERROR for cross-encoder/stsb-roberta-base use_cpc True : data did not match any variant of untagged enum ModelWrapper at line 250356 column 3



Model=cross-encoder/stsb-roberta-base sigmoid=False use_cpc=False


  ERROR for cross-encoder/stsb-roberta-base use_cpc False : data did not match any variant of untagged enum ModelWrapper at line 250356 column 3



Model=BAAI/bge-reranker-base sigmoid=True use_cpc=True


Loaded in 3.87 s


  FoldParity=0 r=0.309318 elapsed=5.1s


  FoldParity=1 r=0.268846 elapsed=4.6s


  OOF r= 0.289362 min/max/mean= 0.5024835467338562 0.7310519814491272 0.6799047589302063



Model=BAAI/bge-reranker-base sigmoid=True use_cpc=False


Loaded in 1.05 s


  FoldParity=0 r=0.496222 elapsed=3.6s


  FoldParity=1 r=0.433706 elapsed=3.5s


  OOF r= 0.464706 min/max/mean= 0.5000093579292297 0.7310519814491272 0.6260038614273071



Model=jinaai/jina-reranker-v2-base-en sigmoid=True use_cpc=True


  ERROR for jinaai/jina-reranker-v2-base-en use_cpc True : jinaai/jina-reranker-v2-base-en is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`



Model=jinaai/jina-reranker-v2-base-en sigmoid=True use_cpc=False


  ERROR for jinaai/jina-reranker-v2-base-en use_cpc False : jinaai/jina-reranker-v2-base-en is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`



Summary: [('BAAI/bge-reranker-base', True, True, 0.28936243057250977), ('BAAI/bge-reranker-base', True, False, 0.46470585465431213)]


In [6]:
# Correct CE (MiniLM-L-6) smoke test: raw logits, CPC prefix3 on both sides, CrossEncoder pairs
import time, numpy as np, pandas as pd
from sentence_transformers import CrossEncoder
from scipy.stats import pearsonr

SEED = 42
np.random.seed(SEED)

# Load data + folds deterministically (no shuffle); use existing folds_by_id.csv
train_df = pd.read_csv('train.csv')
folds = pd.read_csv('folds_by_id.csv')
train_df = train_df.merge(folds, on='id', how='left', validate='one_to_one')
train_df['fold'] = train_df['fold'].astype(int)

def make_pairs_cpc_prefix3(df: pd.DataFrame):
    ctx3 = df['context'].astype(str).str[:3].tolist()
    a = df['anchor'].astype(str).tolist()
    b = df['target'].astype(str).tolist()
    s1 = [f"[CPC {c}] {aa}" for c, aa in zip(ctx3, a)]
    s2 = [f"[CPC {c}] {bb}" for c, bb in zip(ctx3, b)]
    return list(zip(s1, s2))

# Build a 2k subset: first 1000 rows from even folds and first 1000 from odd folds (stable order)
parity = (train_df['fold'].values % 2)
idx_even = np.where(parity == 0)[0][:1000]
idx_odd  = np.where(parity == 1)[0][:1000]
sel_idx = np.concatenate([idx_even, idx_odd])
sel = train_df.iloc[sel_idx].reset_index(drop=True)
sel_parity = (sel['fold'].values % 2)

model_name = 'cross-encoder/ms-marco-MiniLM-L-6-v2'
print('Loading CE model:', model_name, 'on CPU ...', flush=True)
t0 = time.time()
ce = CrossEncoder(model_name, device='cpu', max_length=128)  # raw logits
print('Loaded in', round(time.time()-t0,2), 's', flush=True)

y = sel['score'].astype(np.float32).values
oof = np.zeros(len(sel), dtype=np.float32)
for f in [0,1]:
    va_idx = np.where(sel_parity == f)[0]
    if len(va_idx) == 0:
        continue
    pairs_va = make_pairs_cpc_prefix3(sel.iloc[va_idx])
    t1 = time.time()
    logits = ce.predict(pairs_va, batch_size=32, show_progress_bar=False).astype(np.float32)
    oof[va_idx] = logits  # no sigmoid
    r = pearsonr(logits, y[va_idx])[0]
    print(f'[MiniLM-L6 smoke FoldParity={f}] n={len(va_idx)} raw r={r:.6f}; elapsed {time.time()-t1:.1f}s', flush=True)

r_all = pearsonr(oof, y)[0] if np.std(oof) > 0 else float('nan')
print('Smoke OOF r (expected >=0.65):', round(float(r_all), 6))
print('Logit stats min/max/mean:', float(oof.min()), float(oof.max()), float(oof.mean()))

Loading CE model: cross-encoder/ms-marco-MiniLM-L-6-v2 on CPU ...


Loaded in 0.54 s


[MiniLM-L6 smoke FoldParity=0] n=1000 raw r=0.491884; elapsed 1.0s


[MiniLM-L6 smoke FoldParity=1] n=1000 raw r=0.491177; elapsed 0.7s


Smoke OOF r (expected >=0.65): 0.49172
Logit stats min/max/mean: -2.258300542831421 9.818795204162598 5.268746376037598


In [7]:
# MiniLM CE smoke: test formatting/symmetry/model variants to reach r>=0.65
import time, numpy as np, pandas as pd
from sentence_transformers import CrossEncoder
from scipy.stats import pearsonr

assert 'sel' in globals() and 'sel_parity' in globals(), 'Run cell 3 to define sel subset'

def make_pairs(df: pd.DataFrame, use_cpc_prefix3: bool, reverse: bool = False):
    a = df['anchor'].astype(str).tolist()
    b = df['target'].astype(str).tolist()
    if use_cpc_prefix3:
        c3 = df['context'].astype(str).str[:3].tolist()
        if not reverse:
            s1 = [f"[CPC {c}] {aa}" for c, aa in zip(c3, a)]
            s2 = [f"[CPC {c}] {bb}" for c, bb in zip(c3, b)]
        else:
            s1 = [f"[CPC {c}] {bb}" for c, bb in zip(c3, b)]
            s2 = [f"[CPC {c}] {aa}" for c, aa in zip(c3, a)]
    else:
        if not reverse:
            s1, s2 = a, b
        else:
            s1, s2 = b, a
    return list(zip(s1, s2))

def eval_ce(model_name: str, max_len: int, use_cpc_prefix3: bool, symmetry: bool, batch_size: int = 32):
    print(f'CE smoke model={model_name} max_len={max_len} cpc3={use_cpc_prefix3} sym={symmetry}', flush=True)
    t0 = time.time()
    ce = CrossEncoder(model_name, device='cpu', max_length=max_len)
    print('Loaded in', round(time.time()-t0,2), 's')
    y = sel['score'].astype(np.float32).values
    oof = np.zeros(len(sel), dtype=np.float32)
    for fp in [0,1]:
        va_idx = np.where(sel_parity == fp)[0]
        if len(va_idx) == 0: continue
        va_df = sel.iloc[va_idx]
        p_main = make_pairs(va_df, use_cpc_prefix3=use_cpc_prefix3, reverse=False)
        t1 = time.time()
        s_main = ce.predict(p_main, batch_size=batch_size, show_progress_bar=False).astype(np.float32)
        if symmetry:
            p_rev = make_pairs(va_df, use_cpc_prefix3=use_cpc_prefix3, reverse=True)
            s_rev = ce.predict(p_rev, batch_size=batch_size, show_progress_bar=False).astype(np.float32)
            s = (s_main + s_rev) / 2.0
        else:
            s = s_main
        oof[va_idx] = s
        r = pearsonr(s, y[va_idx])[0]
        print(f'  FoldParity={fp} r={r:.6f}; elapsed {time.time()-t1:.1f}s', flush=True)
    r_all = pearsonr(oof, y)[0] if np.std(oof) > 0 else float('nan')
    print('  OOF r=', round(float(r_all),6), 'min/max/mean=', float(oof.min()), float(oof.max()), float(oof.mean()), flush=True)
    return float(r_all)

# Candidates: L-6 and L-12, with/without CPC prefix3, with symmetry averaging
cands = [
    ('cross-encoder/ms-marco-MiniLM-L-6-v2', 128, True, True),
    ('cross-encoder/ms-marco-MiniLM-L-6-v2', 128, False, True),
    ('cross-encoder/ms-marco-MiniLM-L-12-v2', 192, True, True),
    ('cross-encoder/ms-marco-MiniLM-L-12-v2', 192, False, True),
]
results = []
for m, ml, cpc3, sym in cands:
    try:
        r = eval_ce(m, ml, cpc3, sym)
        results.append((m, ml, cpc3, sym, r))
    except Exception as e:
        print('  ERROR', m, ml, cpc3, sym, ':', e, flush=True)

print('Summary:', results, flush=True)
best = None
for row in results:
    if best is None or (row[-1] is not None and row[-1] > best[-1]):
        best = row
print('Best:', best, flush=True)

CE smoke model=cross-encoder/ms-marco-MiniLM-L-6-v2 max_len=128 cpc3=True sym=True


Loaded in 0.35 s


  FoldParity=0 r=0.461687; elapsed 1.9s


  FoldParity=1 r=0.484764; elapsed 1.7s


  OOF r= 0.473114 min/max/mean= -0.8967779874801636 9.818795204162598 5.377791881561279


CE smoke model=cross-encoder/ms-marco-MiniLM-L-6-v2 max_len=128 cpc3=False sym=True


Loaded in 0.64 s


  FoldParity=0 r=0.564724; elapsed 1.4s


  FoldParity=1 r=0.550846; elapsed 1.3s


  OOF r= 0.557887 min/max/mean= -11.372194290161133 8.473657608032227 -5.434383869171143


CE smoke model=cross-encoder/ms-marco-MiniLM-L-12-v2 max_len=192 cpc3=True sym=True


Loaded in 0.38 s


  FoldParity=0 r=0.448868; elapsed 3.4s


  FoldParity=1 r=0.475624; elapsed 3.1s


  OOF r= 0.46215 min/max/mean= -1.187737226486206 9.560964584350586 5.876559734344482


CE smoke model=cross-encoder/ms-marco-MiniLM-L-12-v2 max_len=192 cpc3=False sym=True


Loaded in 0.38 s


  FoldParity=0 r=0.553570; elapsed 2.5s


  FoldParity=1 r=0.534756; elapsed 2.2s


  OOF r= 0.544243 min/max/mean= -11.255294799804688 8.826813697814941 -5.39487361907959


Summary: [('cross-encoder/ms-marco-MiniLM-L-6-v2', 128, True, True, 0.47311434149742126), ('cross-encoder/ms-marco-MiniLM-L-6-v2', 128, False, True, 0.557887077331543), ('cross-encoder/ms-marco-MiniLM-L-12-v2', 192, True, True, 0.4621501863002777), ('cross-encoder/ms-marco-MiniLM-L-12-v2', 192, False, True, 0.5442430377006531)]


Best: ('cross-encoder/ms-marco-MiniLM-L-6-v2', 128, False, True, 0.557887077331543)


In [21]:
# Full OOF+test CE generation (MiniLM-L-6) with CPU recipe: raw logits, no CPC, symmetry ON, max_length=192
import time, numpy as np, pandas as pd
from sentence_transformers import CrossEncoder
from scipy.stats import pearsonr

SEED = 42
np.random.seed(SEED)

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
folds = pd.read_csv('folds_by_id.csv')
train = train.merge(folds, on='id', how='left', validate='one_to_one')
train['fold'] = train['fold'].astype(int)
NUM_FOLDS = int(train['fold'].max()) + 1

def make_pairs(df: pd.DataFrame, reverse: bool = False):
    a = df['anchor'].astype(str).tolist()
    b = df['target'].astype(str).tolist()
    if not reverse:
        s1, s2 = a, b
    else:
        s1, s2 = b, a
    return list(zip(s1, s2))

model_name = 'cross-encoder/ms-marco-MiniLM-L-6-v2'
max_len = 192
batch_size = 64
use_symmetry = True  # per expert advice: ON
print('Loading CE:', model_name, 'cpu, max_len=', max_len, 'symmetry=', use_symmetry, flush=True)
t0 = time.time()
ce = CrossEncoder(model_name, device='cpu', max_length=max_len)
print('Loaded in', round(time.time()-t0,2), 's', flush=True)

y = train['score'].astype(np.float32).values
oof = np.zeros(len(train), dtype=np.float32)

# Precompute test pairs once (no fold dependency) and predict with symmetry if enabled
pairs_main_te = make_pairs(test)
scores_main_te = ce.predict(pairs_main_te, batch_size=batch_size, show_progress_bar=False).astype(np.float32)
if use_symmetry:
    pairs_rev_te = make_pairs(test, reverse=True)
    scores_rev_te = ce.predict(pairs_rev_te, batch_size=batch_size, show_progress_bar=False).astype(np.float32)
    te_scores = (scores_main_te + scores_rev_te) / 2.0
else:
    te_scores = scores_main_te

for f in range(NUM_FOLDS):
    f0 = time.time()
    va_idx = np.where(train['fold'].values == f)[0]
    va_df = train.iloc[va_idx]
    pairs_main = make_pairs(va_df)
    s_main = ce.predict(pairs_main, batch_size=batch_size, show_progress_bar=False).astype(np.float32)
    if use_symmetry:
        pairs_rev = make_pairs(va_df, reverse=True)
        s_rev = ce.predict(pairs_rev, batch_size=batch_size, show_progress_bar=False).astype(np.float32)
        s = (s_main + s_rev) / 2.0
    else:
        s = s_main
    oof[va_idx] = s
    r = pearsonr(s, y[va_idx])[0]
    print(f'[Fold {f}] n={len(va_idx)} raw r={r:.6f}; elapsed {time.time()-f0:.1f}s', flush=True)

r_all = pearsonr(oof, y)[0] if np.std(oof) > 0 else float('nan')
print('Final CE OOF r (raw logits, sym ON, len 192):', round(float(r_all), 6), flush=True)

# Save strict schema for downstream transforms
pd.DataFrame({'id': train['id'], 'pred': oof.astype(np.float32)}).to_csv('oof_ce_minilm.csv', index=False)
pd.DataFrame({'id': test['id'], 'pred': te_scores.astype(np.float32)}).to_csv('submission_ce_minilm.csv', index=False)
print('Saved oof_ce_minilm.csv and submission_ce_minilm.csv', flush=True)

Loading CE: cross-encoder/ms-marco-MiniLM-L-6-v2 cpu, max_len= 192 symmetry= True


Loaded in 0.41 s


[Fold 0] n=6086 raw r=0.564206; elapsed 6.4s


[Fold 1] n=6515 raw r=0.547474; elapsed 6.9s


[Fold 2] n=6630 raw r=0.542416; elapsed 7.7s


[Fold 3] n=6941 raw r=0.545060; elapsed 5.8s


[Fold 4] n=6653 raw r=0.539629; elapsed 6.9s


Final CE OOF r (raw logits, sym ON, len 192): 0.54733


Saved oof_ce_minilm.csv and submission_ce_minilm.csv


In [9]:
# MiniLM-L-6 template sweep (CPU): test query/passage prompts, CPC prefix3, symmetry
import time, numpy as np, pandas as pd
from sentence_transformers import CrossEncoder
from scipy.stats import pearsonr

assert 'sel' in globals() and 'sel_parity' in globals(), 'Run cell 3 to define sel subset'

def make_pairs_template(df: pd.DataFrame, template: str, reverse: bool = False):
    a = df['anchor'].astype(str).tolist()
    b = df['target'].astype(str).tolist()
    c3 = df['context'].astype(str).str[:3].tolist()
    s1 = []; s2 = []
    if template == 'plain':
        s1 = a if not reverse else b
        s2 = b if not reverse else a
    elif template == 'query_passage':
        if not reverse:
            s1 = [f'query: {aa}' for aa in a]
            s2 = [f'passage: {bb}' for bb in b]
        else:
            s1 = [f'query: {bb}' for bb in b]
            s2 = [f'passage: {aa}' for aa in a]
    elif template == 'cpc3_plain':
        if not reverse:
            s1 = [f'[CPC {c}] {aa}' for c, aa in zip(c3, a)]
            s2 = [f'[CPC {c}] {bb}' for c, bb in zip(c3, b)]
        else:
            s1 = [f'[CPC {c}] {bb}' for c, bb in zip(c3, b)]
            s2 = [f'[CPC {c}] {aa}' for c, aa in zip(c3, a)]
    elif template == 'cpc3_query_passage':
        if not reverse:
            s1 = [f'query: [CPC {c}] {aa}' for c, aa in zip(c3, a)]
            s2 = [f'passage: [CPC {c}] {bb}' for c, bb in zip(c3, b)]
        else:
            s1 = [f'query: [CPC {c}] {bb}' for c, bb in zip(c3, b)]
            s2 = [f'passage: [CPC {c}] {aa}' for c, aa in zip(c3, a)]
    else:
        raise ValueError('Unknown template')
    return list(zip(s1, s2))

def eval_template(model_name: str, template: str, max_len: int = 128, symmetry: bool = True, batch_size: int = 32):
    print(f'Model={model_name} tmpl={template} sym={symmetry}', flush=True)
    t0 = time.time()
    ce = CrossEncoder(model_name, device='cpu', max_length=max_len)
    print('Loaded in', round(time.time()-t0,2), 's', flush=True)
    y = sel['score'].astype(np.float32).values
    oof = np.zeros(len(sel), dtype=np.float32)
    for fp in [0,1]:
        va_idx = np.where(sel_parity == fp)[0]
        if len(va_idx) == 0: continue
        va_df = sel.iloc[va_idx]
        p_main = make_pairs_template(va_df, template, reverse=False)
        t1 = time.time()
        s_main = ce.predict(p_main, batch_size=batch_size, show_progress_bar=False).astype(np.float32)
        if symmetry:
            p_rev = make_pairs_template(va_df, template, reverse=True)
            s_rev = ce.predict(p_rev, batch_size=batch_size, show_progress_bar=False).astype(np.float32)
            s = (s_main + s_rev) / 2.0
        else:
            s = s_main
        oof[va_idx] = s
        r = pearsonr(s, y[va_idx])[0]
        print(f'  FoldParity={fp} r={r:.6f}; elapsed {time.time()-t1:.1f}s', flush=True)
    r_all = pearsonr(oof, y)[0] if np.std(oof) > 0 else float('nan')
    print('  OOF r=', round(float(r_all),6), 'min/max/mean=', float(oof.min()), float(oof.max()), float(oof.mean()), flush=True)
    return float(r_all)

model = 'cross-encoder/ms-marco-MiniLM-L-6-v2'
templates = ['plain', 'query_passage', 'cpc3_plain', 'cpc3_query_passage']
results = []
for tmpl in templates:
    try:
        r = eval_template(model, tmpl, max_len=128, symmetry=True, batch_size=32)
        results.append((tmpl, True, r))
    except Exception as e:
        print('  ERROR tmpl', tmpl, e, flush=True)
for tmpl in templates:
    try:
        r = eval_template(model, tmpl, max_len=128, symmetry=False, batch_size=32)
        results.append((tmpl, False, r))
    except Exception as e:
        print('  ERROR tmpl', tmpl, e, flush=True)
print('Summary (tmpl, sym, r):', results, flush=True)
best = max(results, key=lambda x: (x[2] if x[2] is not None else -1.0)) if results else None
print('Best:', best, flush=True)

Model=cross-encoder/ms-marco-MiniLM-L-6-v2 tmpl=plain sym=True


Loaded in 0.57 s


  FoldParity=0 r=0.564724; elapsed 1.4s


  FoldParity=1 r=0.550846; elapsed 1.2s


  OOF r= 0.557887 min/max/mean= -11.372194290161133 8.473657608032227 -5.434383869171143


Model=cross-encoder/ms-marco-MiniLM-L-6-v2 tmpl=query_passage sym=True


Loaded in 0.36 s


  FoldParity=0 r=0.568381; elapsed 1.7s


  FoldParity=1 r=0.546066; elapsed 1.6s


  OOF r= 0.557246 min/max/mean= -11.449594497680664 5.321118354797363 -7.024003028869629


Model=cross-encoder/ms-marco-MiniLM-L-6-v2 tmpl=cpc3_plain sym=True


Loaded in 0.36 s


  FoldParity=0 r=0.461687; elapsed 2.2s


  FoldParity=1 r=0.484764; elapsed 1.9s


  OOF r= 0.473114 min/max/mean= -0.8967779874801636 9.818795204162598 5.377791881561279


Model=cross-encoder/ms-marco-MiniLM-L-6-v2 tmpl=cpc3_query_passage sym=True


Loaded in 0.36 s


  FoldParity=0 r=0.445932; elapsed 2.4s


  FoldParity=1 r=0.481770; elapsed 2.3s


  OOF r= 0.463544 min/max/mean= -1.8124641180038452 7.252917289733887 3.602077007293701


Model=cross-encoder/ms-marco-MiniLM-L-6-v2 tmpl=plain sym=False


Loaded in 0.41 s


  FoldParity=0 r=0.568412; elapsed 0.7s


  FoldParity=1 r=0.548776; elapsed 0.7s


  OOF r= 0.558729 min/max/mean= -11.419656753540039 8.438697814941406 -5.534883975982666


Model=cross-encoder/ms-marco-MiniLM-L-6-v2 tmpl=query_passage sym=False


Loaded in 0.35 s


  FoldParity=0 r=0.567048; elapsed 0.9s


  FoldParity=1 r=0.542985; elapsed 0.8s


  OOF r= 0.555054 min/max/mean= -11.480846405029297 5.321118354797363 -7.06156063079834


Model=cross-encoder/ms-marco-MiniLM-L-6-v2 tmpl=cpc3_plain sym=False


Loaded in 0.37 s


  FoldParity=0 r=0.491884; elapsed 1.1s


  FoldParity=1 r=0.491177; elapsed 1.0s


  OOF r= 0.49172 min/max/mean= -2.258300542831421 9.818795204162598 5.268746376037598


Model=cross-encoder/ms-marco-MiniLM-L-6-v2 tmpl=cpc3_query_passage sym=False


Loaded in 0.37 s


  FoldParity=0 r=0.471256; elapsed 1.2s


  FoldParity=1 r=0.485039; elapsed 1.1s


  OOF r= 0.478248 min/max/mean= -3.049187183380127 7.271217346191406 3.5236079692840576


Summary (tmpl, sym, r): [('plain', True, 0.557887077331543), ('query_passage', True, 0.5572460293769836), ('cpc3_plain', True, 0.47311434149742126), ('cpc3_query_passage', True, 0.4635443091392517), ('plain', False, 0.5587289333343506), ('query_passage', False, 0.5550540089607239), ('cpc3_plain', False, 0.4917197525501251), ('cpc3_query_passage', False, 0.4782479405403137)]


Best: ('plain', False, 0.5587289333343506)


In [10]:
# BGE reranker raw-logit smoke + optional full OOF if >= threshold
import time, numpy as np, pandas as pd
from sentence_transformers import CrossEncoder
from scipy.stats import pearsonr

assert 'sel' in globals() and 'sel_parity' in globals(), 'Run cell 3 to define sel subset'

def eval_bge_raw(symmetry: bool, batch_size: int = 32, max_len: int = 128):
    name = 'BAAI/bge-reranker-base'
    print(f'Eval {name} raw logits, symmetry={symmetry}', flush=True)
    t0 = time.time()
    ce = CrossEncoder(name, device='cpu', max_length=max_len)
    print('Loaded in', round(time.time()-t0,2), 's', flush=True)
    y = sel['score'].astype(np.float32).values
    oof = np.zeros(len(sel), dtype=np.float32)
    for fp in [0,1]:
        va_idx = np.where(sel_parity == fp)[0]
        if len(va_idx) == 0: continue
        va = sel.iloc[va_idx]
        a = va['anchor'].astype(str).tolist()
        b = va['target'].astype(str).tolist()
        pairs_main = list(zip(a, b))
        t1 = time.time()
        s_main = ce.predict(pairs_main, batch_size=batch_size, show_progress_bar=False).astype(np.float32)
        if symmetry:
            pairs_rev = list(zip(b, a))
            s_rev = ce.predict(pairs_rev, batch_size=batch_size, show_progress_bar=False).astype(np.float32)
            s = (s_main + s_rev) / 2.0
        else:
            s = s_main
        oof[va_idx] = s
        r = pearsonr(s, y[va_idx])[0]
        print(f'  FoldParity={fp} r={r:.6f}; elapsed {time.time()-t1:.1f}s', flush=True)
    r_all = pearsonr(oof, y)[0] if np.std(oof) > 0 else float('nan')
    print('  OOF r=', round(float(r_all),6), 'min/max/mean=', float(oof.min()), float(oof.max()), float(oof.mean()), flush=True)
    return float(r_all), ce

# Smoke both symmetry settings
r_sym, _ = eval_bge_raw(symmetry=True)
r_nosym, _ = eval_bge_raw(symmetry=False)
best_sym = r_sym >= r_nosym
best_r = max(r_sym, r_nosym)
print('BGE raw best smoke r=', round(best_r,6), 'symmetry=', best_sym, flush=True)

# If good enough, run full OOF+test and overwrite ce_minilm artifacts for stacker
THRESH = 0.58
if best_r >= THRESH:
    print('Running full OOF with BGE raw logits; symmetry=', best_sym, flush=True)
    train = pd.read_csv('train.csv')
    test = pd.read_csv('test.csv')
    folds = pd.read_csv('folds_by_id.csv')
    train = train.merge(folds, on='id', how='left', validate='one_to_one')
    train['fold'] = train['fold'].astype(int)
    NUM_FOLDS = int(train['fold'].max()) + 1
    ce_full = CrossEncoder('BAAI/bge-reranker-base', device='cpu', max_length=128)
    y = train['score'].astype(np.float32).values
    oof = np.zeros(len(train), dtype=np.float32)
    # test predictions
    a_te = test['anchor'].astype(str).tolist(); b_te = test['target'].astype(str).tolist()
    te_main = list(zip(a_te, b_te))
    te_main_scores = ce_full.predict(te_main, batch_size=32, show_progress_bar=False).astype(np.float32)
    if best_sym:
        te_rev = list(zip(b_te, a_te))
        te_rev_scores = ce_full.predict(te_rev, batch_size=32, show_progress_bar=False).astype(np.float32)
        te_scores = (te_main_scores + te_rev_scores) / 2.0
    else:
        te_scores = te_main_scores
    for f in range(NUM_FOLDS):
        f0 = time.time()
        va_idx = np.where(train['fold'].values == f)[0]
        va = train.iloc[va_idx]
        a = va['anchor'].astype(str).tolist(); b = va['target'].astype(str).tolist()
        va_main = list(zip(a, b))
        s_main = ce_full.predict(va_main, batch_size=32, show_progress_bar=False).astype(np.float32)
        if best_sym:
            va_rev = list(zip(b, a))
            s_rev = ce_full.predict(va_rev, batch_size=32, show_progress_bar=False).astype(np.float32)
            s = (s_main + s_rev) / 2.0
        else:
            s = s_main
        oof[va_idx] = s
        r = pearsonr(s, y[va_idx])[0]
        print(f'  [Full Fold {f}] r={r:.6f}; elapsed {time.time()-f0:.1f}s', flush=True)
    r_all = pearsonr(oof, y)[0] if np.std(oof) > 0 else float('nan')
    print('Full BGE raw OOF r=', round(float(r_all),6), flush=True)
    # Overwrite CE artifacts expected by stacker
    pd.DataFrame({'id': train['id'], 'pred': oof.astype(np.float32)}).to_csv('oof_ce_minilm.csv', index=False)
    pd.DataFrame({'id': test['id'], 'pred': te_scores.astype(np.float32)}).to_csv('submission_ce_minilm.csv', index=False)
    print('Overwrote oof_ce_minilm.csv and submission_ce_minilm.csv with BGE raw logits.', flush=True)
else:
    print('BGE raw did not meet threshold; skip full OOF.', flush=True)

Eval BAAI/bge-reranker-base raw logits, symmetry=True


Loaded in 0.8 s


  FoldParity=0 r=0.464254; elapsed 6.6s


  FoldParity=1 r=0.475629; elapsed 6.2s


  OOF r= 0.470034 min/max/mean= 3.728556475834921e-05 0.9999665021896362 0.5484459400177002


Eval BAAI/bge-reranker-base raw logits, symmetry=False


Loaded in 1.01 s


  FoldParity=0 r=0.469314; elapsed 3.0s


  FoldParity=1 r=0.483053; elapsed 2.9s


  OOF r= 0.476314 min/max/mean= 3.728599403984845e-05 0.9999665021896362 0.5459015965461731


BGE raw best smoke r= 0.476314 symmetry= False


BGE raw did not meet threshold; skip full OOF.


In [11]:
# Full OOF+test for additional CEs: MiniLM-L-12 (raw, no CPC, symmetry) and BGE raw (no sigmoid, symmetry)
import time, numpy as np, pandas as pd
from sentence_transformers import CrossEncoder
from scipy.stats import pearsonr

SEED = 42
np.random.seed(SEED)

def gen_ce_full(model_name: str, out_tag: str, max_len: int = 192, symmetry: bool = True, batch_size: int = 32):
    train = pd.read_csv('train.csv')
    test = pd.read_csv('test.csv')
    folds = pd.read_csv('folds_by_id.csv')
    train = train.merge(folds, on='id', how='left', validate='one_to_one')
    train['fold'] = train['fold'].astype(int)
    NUM_FOLDS = int(train['fold'].max()) + 1
    def make_pairs(df, reverse=False):
        a = df['anchor'].astype(str).tolist()
        b = df['target'].astype(str).tolist()
        if not reverse:
            return list(zip(a, b))
        else:
            return list(zip(b, a))
    print(f'Loading CE: {model_name} (cpu) max_len={max_len} symmetry={symmetry}', flush=True)
    t0 = time.time()
    ce = CrossEncoder(model_name, device='cpu', max_length=max_len)
    print('Loaded in', round(time.time()-t0,2), 's', flush=True)
    y = train['score'].astype(np.float32).values
    oof = np.zeros(len(train), dtype=np.float32)
    # test predictions
    te_main = make_pairs(test)
    te_main_scores = ce.predict(te_main, batch_size=batch_size, show_progress_bar=False).astype(np.float32)
    if symmetry:
        te_rev = make_pairs(test, reverse=True)
        te_rev_scores = ce.predict(te_rev, batch_size=batch_size, show_progress_bar=False).astype(np.float32)
        te_scores = (te_main_scores + te_rev_scores) / 2.0
    else:
        te_scores = te_main_scores
    for f in range(NUM_FOLDS):
        f0 = time.time()
        va_idx = np.where(train['fold'].values == f)[0]
        va_df = train.iloc[va_idx]
        s_main = ce.predict(make_pairs(va_df), batch_size=batch_size, show_progress_bar=False).astype(np.float32)
        if symmetry:
            s_rev = ce.predict(make_pairs(va_df, reverse=True), batch_size=batch_size, show_progress_bar=False).astype(np.float32)
            s = (s_main + s_rev) / 2.0
        else:
            s = s_main
        oof[va_idx] = s
        r = pearsonr(s, y[va_idx])[0]
        print(f'  [Fold {f}] r={r:.6f}; elapsed {time.time()-f0:.1f}s', flush=True)
    r_all = pearsonr(oof, y)[0] if np.std(oof) > 0 else float('nan')
    print(f'Final OOF r for {out_tag} =', round(float(r_all), 6), flush=True)
    pd.DataFrame({'id': train['id'], 'pred': oof.astype(np.float32)}).to_csv(f'oof_ce_{out_tag}.csv', index=False)
    pd.DataFrame({'id': test['id'], 'pred': te_scores.astype(np.float32)}).to_csv(f'submission_ce_{out_tag}.csv', index=False)
    print(f'Saved oof_ce_{out_tag}.csv and submission_ce_{out_tag}.csv', flush=True)

# Generate MiniLM-L-12-v2 (raw logits, no CPC, symmetry) -> files oof_ce_l12.csv/submission_ce_l12.csv
try:
    gen_ce_full('cross-encoder/ms-marco-MiniLM-L-12-v2', out_tag='l12', max_len=192, symmetry=True, batch_size=32)
except Exception as e:
    print('MiniLM-L-12 generation failed:', e, flush=True)

# Generate BGE raw logits (no sigmoid), symmetry -> files oof_ce_bge_rerank.csv/submission_ce_bge_rerank.csv
try:
    gen_ce_full('BAAI/bge-reranker-base', out_tag='bge_rerank', max_len=128, symmetry=True, batch_size=32)
except Exception as e:
    print('BGE raw generation failed:', e, flush=True)

Loading CE: cross-encoder/ms-marco-MiniLM-L-12-v2 (cpu) max_len=192 symmetry=True


Loaded in 0.43 s


  [Fold 0] r=0.551037; elapsed 15.4s


  [Fold 1] r=0.532257; elapsed 14.0s


  [Fold 2] r=0.532294; elapsed 15.7s


  [Fold 3] r=0.533919; elapsed 15.4s


  [Fold 4] r=0.517023; elapsed 14.4s


Final OOF r for l12 = 0.532922


Saved oof_ce_l12.csv and submission_ce_l12.csv


Loading CE: BAAI/bge-reranker-base (cpu) max_len=128 symmetry=True


Loaded in 0.99 s


  [Fold 0] r=0.453725; elapsed 37.0s


  [Fold 1] r=0.455899; elapsed 40.3s


  [Fold 2] r=0.442429; elapsed 41.4s


  [Fold 3] r=0.468451; elapsed 42.7s


  [Fold 4] r=0.456696; elapsed 39.3s


Final OOF r for bge_rerank = 0.455499


Saved oof_ce_bge_rerank.csv and submission_ce_bge_rerank.csv


In [14]:
# MiniLM-L-6 Separator Trick (context after [SEP]) smoke + optional full OOF (overwrite ce_minilm artifacts)
import time, numpy as np, pandas as pd
from sentence_transformers import CrossEncoder
from scipy.stats import pearsonr

SEED = 42
np.random.seed(SEED)

# Ensure subset exists; if not, build a 2k parity subset deterministically
if 'sel' not in globals() or 'sel_parity' not in globals():
    train_df = pd.read_csv('train.csv')
    folds = pd.read_csv('folds_by_id.csv')
    train_df = train_df.merge(folds, on='id', how='left', validate='one_to_one')
    train_df['fold'] = train_df['fold'].astype(int)
    parity = (train_df['fold'].values % 2)
    idx_even = np.where(parity == 0)[0][:1000]
    idx_odd  = np.where(parity == 1)[0][:1000]
    sel_idx = np.concatenate([idx_even, idx_odd])
    sel = train_df.iloc[sel_idx].reset_index(drop=True)
    sel_parity = (sel['fold'].values % 2)

# Global SEP token placeholder; will be set from tokenizer after model load
SEP = '[SEP]'

def make_pairs_sep(df: pd.DataFrame, reverse: bool = False):
    a = df['anchor'].astype(str).tolist()
    b = df['target'].astype(str).tolist()
    c = df['context'].astype(str).tolist()
    if not reverse:
        s1 = a
        s2 = [f"{bb} {SEP} {cc}" for bb, cc in zip(b, c)]
    else:
        s1 = b
        s2 = [f"{aa} {SEP} {cc}" for aa, cc in zip(a, c)]
    return list(zip(s1, s2))

model_name = 'cross-encoder/ms-marco-MiniLM-L-6-v2'
max_len = 192
batch_size = 32
print('Separator Trick smoke:', model_name, 'max_len=', max_len, flush=True)
t0 = time.time()
ce = CrossEncoder(model_name, device='cpu', max_length=max_len)  # raw logits
# Use the tokenizer's true separator token if available
tok_sep = getattr(getattr(ce, 'tokenizer', None), 'sep_token', None)
if tok_sep and isinstance(tok_sep, str):
    SEP = tok_sep
print('Loaded in', round(time.time()-t0,2), 's; sep_token=', repr(SEP), flush=True)

y = sel['score'].astype(np.float32).values
oof = np.zeros(len(sel), dtype=np.float32)
for fp in [0,1]:
    va_idx = np.where(sel_parity == fp)[0]
    if len(va_idx) == 0: continue
    va_df = sel.iloc[va_idx]
    p_f = make_pairs_sep(va_df, reverse=False)
    p_r = make_pairs_sep(va_df, reverse=True)
    t1 = time.time()
    s_f = ce.predict(p_f, batch_size=batch_size, show_progress_bar=False).astype(np.float32)
    s_r = ce.predict(p_r, batch_size=batch_size, show_progress_bar=False).astype(np.float32)
    s = (s_f + s_r) / 2.0
    oof[va_idx] = s
    r = pearsonr(s, y[va_idx])[0]
    print(f'  Parity={fp} r={r:.6f}; elapsed {time.time()-t1:.1f}s', flush=True)

r_all = pearsonr(oof, y)[0] if np.std(oof) > 0 else float('nan')
print('Smoke OOF r (Separator Trick):', round(float(r_all), 6), flush=True)

# If strong enough, run full OOF+test and overwrite ce_minilm artifacts
THRESH = 0.65
if np.isfinite(r_all) and r_all >= THRESH:
    print('Running full 5-fold OOF+test with Separator Trick; overwriting ce_minilm artifacts', flush=True)
    train = pd.read_csv('train.csv')
    test = pd.read_csv('test.csv')
    folds = pd.read_csv('folds_by_id.csv')
    train = train.merge(folds, on='id', how='left', validate='one_to_one')
    train['fold'] = train['fold'].astype(int)
    NUM_FOLDS = int(train['fold'].max()) + 1
    # Reuse ce; ensure same settings
    ce_full = ce
    y_full = train['score'].astype(np.float32).values
    oof_full = np.zeros(len(train), dtype=np.float32)
    # Test predictions (forward+reverse)
    pte_f = make_pairs_sep(test, reverse=False)
    pte_r = make_pairs_sep(test, reverse=True)
    s_te_f = ce_full.predict(pte_f, batch_size=batch_size, show_progress_bar=False).astype(np.float32)
    s_te_r = ce_full.predict(pte_r, batch_size=batch_size, show_progress_bar=False).astype(np.float32)
    te_scores = (s_te_f + s_te_r) / 2.0
    for f in range(NUM_FOLDS):
        f0 = time.time()
        va_idx = np.where(train['fold'].values == f)[0]
        va_df = train.iloc[va_idx]
        pf = make_pairs_sep(va_df, reverse=False)
        pr = make_pairs_sep(va_df, reverse=True)
        s_f = ce_full.predict(pf, batch_size=batch_size, show_progress_bar=False).astype(np.float32)
        s_r = ce_full.predict(pr, batch_size=batch_size, show_progress_bar=False).astype(np.float32)
        s = (s_f + s_r) / 2.0
        oof_full[va_idx] = s
        r = pearsonr(s, y_full[va_idx])[0]
        print(f'  [Fold {f}] r={r:.6f}; elapsed {time.time()-f0:.1f}s', flush=True)
    r_fin = pearsonr(oof_full, y_full)[0] if np.std(oof_full) > 0 else float('nan')
    print('Final CE OOF r (Separator Trick):', round(float(r_fin), 6), flush=True)
    pd.DataFrame({'id': train['id'], 'pred': oof_full.astype(np.float32)}).to_csv('oof_ce_minilm.csv', index=False)
    pd.DataFrame({'id': test['id'], 'pred': te_scores.astype(np.float32)}).to_csv('submission_ce_minilm.csv', index=False)
    print('Saved oof_ce_minilm.csv and submission_ce_minilm.csv', flush=True)
else:
    print('Separator Trick smoke < threshold; not running full OOF.', flush=True)

Separator Trick smoke: cross-encoder/ms-marco-MiniLM-L-6-v2 max_len= 192


Loaded in 0.49 s; sep_token= '[SEP]'


  Parity=0 r=0.564983; elapsed 1.6s


  Parity=1 r=0.545467; elapsed 1.5s


Smoke OOF r (Separator Trick): 0.555326


Separator Trick smoke < threshold; not running full OOF.


In [13]:
# CE smoke: MiniLM-L-2-v2 and quora-roberta-base (plain pairs, raw logits), then full OOF if >= 0.60
import time, numpy as np, pandas as pd
from sentence_transformers import CrossEncoder
from scipy.stats import pearsonr

SEED = 42
np.random.seed(SEED)

# Ensure 2k parity subset exists
if 'sel' not in globals() or 'sel_parity' not in globals():
    train_df = pd.read_csv('train.csv')
    folds = pd.read_csv('folds_by_id.csv')
    train_df = train_df.merge(folds, on='id', how='left', validate='one_to_one')
    train_df['fold'] = train_df['fold'].astype(int)
    parity = (train_df['fold'].values % 2)
    idx_even = np.where(parity == 0)[0][:1000]
    idx_odd  = np.where(parity == 1)[0][:1000]
    sel_idx = np.concatenate([idx_even, idx_odd])
    sel = train_df.iloc[sel_idx].reset_index(drop=True)
    sel_parity = (sel['fold'].values % 2)

def make_pairs_plain(df: pd.DataFrame, reverse: bool = False):
    a = df['anchor'].astype(str).tolist()
    b = df['target'].astype(str).tolist()
    if not reverse:
        return list(zip(a, b))
    else:
        return list(zip(b, a))

def smoke_eval(model_name: str, max_len: int = 256, batch_size: int = 32):
    print(f"Smoke CE: {model_name} max_len={max_len}", flush=True)
    t0 = time.time()
    ce = CrossEncoder(model_name, device='cpu', max_length=max_len)
    print('Loaded in', round(time.time()-t0,2), 's', flush=True)
    y = sel['score'].astype(np.float32).values
    results = []  # (symmetry, r)
    for symmetry in (False, True):
        oof = np.zeros(len(sel), dtype=np.float32)
        for fp in [0,1]:
            va_idx = np.where(sel_parity == fp)[0]
            if len(va_idx) == 0: continue
            va = sel.iloc[va_idx]
            p_main = make_pairs_plain(va, reverse=False)
            t1 = time.time()
            s_main = ce.predict(p_main, batch_size=batch_size, show_progress_bar=False).astype(np.float32)
            if symmetry:
                p_rev = make_pairs_plain(va, reverse=True)
                s_rev = ce.predict(p_rev, batch_size=batch_size, show_progress_bar=False).astype(np.float32)
                s = (s_main + s_rev) / 2.0
            else:
                s = s_main
            oof[va_idx] = s
            r = pearsonr(s, y[va_idx])[0]
            print(f'  sym={symmetry} parity={fp} r={r:.6f}; elapsed {time.time()-t1:.1f}s', flush=True)
        r_all = pearsonr(oof, y)[0] if np.std(oof) > 0 else float('nan')
        print(f'  -> OOF r (sym={symmetry}) =', round(float(r_all), 6), 'min/max/mean=', float(oof.min()), float(oof.max()), float(oof.mean()), flush=True)
        results.append((symmetry, float(r_all)))
    best = max(results, key=lambda x: (x[1] if np.isfinite(x[1]) else -1.0))
    print('Best smoke for', model_name, '=>', best, flush=True)
    return best, ce

def run_full_oof(model_name: str, max_len: int, symmetry: bool, batch_size: int = 32):
    print(f'Running full OOF+test for {model_name} max_len={max_len} symmetry={symmetry}', flush=True)
    train = pd.read_csv('train.csv')
    test = pd.read_csv('test.csv')
    folds = pd.read_csv('folds_by_id.csv')
    train = train.merge(folds, on='id', how='left', validate='one_to_one')
    train['fold'] = train['fold'].astype(int)
    NUM_FOLDS = int(train['fold'].max()) + 1
    ce = CrossEncoder(model_name, device='cpu', max_length=max_len)
    y = train['score'].astype(np.float32).values
    oof = np.zeros(len(train), dtype=np.float32)
    # test predictions
    te_main = make_pairs_plain(test, reverse=False)
    te_main_scores = ce.predict(te_main, batch_size=batch_size, show_progress_bar=False).astype(np.float32)
    if symmetry:
        te_rev = make_pairs_plain(test, reverse=True)
        te_rev_scores = ce.predict(te_rev, batch_size=batch_size, show_progress_bar=False).astype(np.float32)
        te_scores = (te_main_scores + te_rev_scores) / 2.0
    else:
        te_scores = te_main_scores
    for f in range(NUM_FOLDS):
        f0 = time.time()
        va_idx = np.where(train['fold'].values == f)[0]
        va = train.iloc[va_idx]
        s_main = ce.predict(make_pairs_plain(va, reverse=False), batch_size=batch_size, show_progress_bar=False).astype(np.float32)
        if symmetry:
            s_rev = ce.predict(make_pairs_plain(va, reverse=True), batch_size=batch_size, show_progress_bar=False).astype(np.float32)
            s = (s_main + s_rev) / 2.0
        else:
            s = s_main
        oof[va_idx] = s
        r = pearsonr(s, y[va_idx])[0]
        print(f'  [Fold {f}] r={r:.6f}; elapsed {time.time()-f0:.1f}s', flush=True)
    r_all = pearsonr(oof, y)[0] if np.std(oof) > 0 else float('nan')
    print('Full OOF r =', round(float(r_all), 6), flush=True)
    # Overwrite canonical CE artifacts for stacker ingestion
    pd.DataFrame({'id': train['id'], 'pred': oof.astype(np.float32)}).to_csv('oof_ce_minilm.csv', index=False)
    pd.DataFrame({'id': test['id'], 'pred': te_scores.astype(np.float32)}).to_csv('submission_ce_minilm.csv', index=False)
    print('Saved oof_ce_minilm.csv and submission_ce_minilm.csv (overwritten with best CE).', flush=True)
    return float(r_all)

candidates = [
    ('cross-encoder/ms-marco-MiniLM-L-2-v2', 256),
    ('cross-encoder/quora-roberta-base', 256),
]

best_overall = (-1.0, None, None)  # (r, model_name, symmetry)
for name, ml in candidates:
    try:
        (sym_best, r_best), _ = smoke_eval(name, max_len=ml, batch_size=32)
        if np.isfinite(r_best) and r_best > best_overall[0]:
            best_overall = (r_best, name, sym_best)
    except Exception as e:
        print(f'ERROR loading/evaluating {name}: {e}', flush=True)

print('Best across candidates:', best_overall, flush=True)
THRESH = 0.60
if best_overall[0] >= THRESH and best_overall[1] is not None:
    final_r = run_full_oof(best_overall[1], max_len=256, symmetry=best_overall[2], batch_size=32)
    print('Final CE (best candidate) full OOF r=', round(float(final_r), 6), flush=True)
else:
    print('No candidate reached threshold; skipping full OOF.', flush=True)

Smoke CE: cross-encoder/ms-marco-MiniLM-L-2-v2 max_len=256


Loaded in 2.36 s


  sym=False parity=0 r=0.517818; elapsed 0.3s


  sym=False parity=1 r=0.493694; elapsed 0.3s


  -> OOF r (sym=False) = 0.505715 min/max/mean= -12.00761890411377 9.67796802520752 -6.1005449295043945


  sym=True parity=0 r=0.515791; elapsed 0.6s


  sym=True parity=1 r=0.498911; elapsed 0.6s


  -> OOF r (sym=True) = 0.507319 min/max/mean= -11.90707778930664 9.119239807128906 -5.993034362792969


Best smoke for cross-encoder/ms-marco-MiniLM-L-2-v2 => (True, 0.5073191523551941)


Smoke CE: cross-encoder/quora-roberta-base max_len=256


ERROR loading/evaluating cross-encoder/quora-roberta-base: data did not match any variant of untagged enum ModelWrapper at line 250356 column 3


Best across candidates: (0.5073191523551941, 'cross-encoder/ms-marco-MiniLM-L-2-v2', True)


No candidate reached threshold; skipping full OOF.


In [15]:
# CE diagnostic with transformers (AutoTokenizer/AutoModelForSequenceClassification), plain (anchor,target) raw logits
import numpy as np, pandas as pd, torch, time
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.stats import pearsonr

train = pd.read_csv('train.csv')
folds = pd.read_csv('folds_by_id.csv')
train = train.merge(folds, on='id', how='left', validate='one_to_one')
train['fold'] = train['fold'].astype(int)

# Deterministic 1k sample (first 500 even-fold + 500 odd-fold)
parity = (train['fold'].values % 2)
idx_even = np.where(parity == 0)[0][:500]
idx_odd  = np.where(parity == 1)[0][:500]
sel_idx = np.concatenate([idx_even, idx_odd])
sel = train.iloc[sel_idx].reset_index(drop=True)

name = 'cross-encoder/ms-marco-MiniLM-L-6-v2'
print('Loading', name, 'with transformers on CPU...', flush=True)
t0 = time.time()
tok = AutoTokenizer.from_pretrained(name)
mdl = AutoModelForSequenceClassification.from_pretrained(name)
mdl.eval(); mdl.to('cpu')
print('Loaded in', round(time.time()-t0, 2), 's')

# Build plain (anchor,target) pairs
pairs = list(zip(sel['anchor'].astype(str).tolist(), sel['target'].astype(str).tolist()))
y = sel['score'].astype(np.float32).values

# Batch inference for raw logits
def batched_logits(pairs, bs=64, max_length=192):
    out = []
    for i in range(0, len(pairs), bs):
        a, b = zip(*pairs[i:i+bs])
        enc = tok(list(a), list(b), padding=True, truncation=True, max_length=max_length, return_tensors='pt')
        with torch.no_grad():
            logits = mdl(**{k: v.to('cpu') for k, v in enc.items()}).logits.squeeze(-1).cpu().numpy()
        out.append(logits.astype(np.float32))
    return np.concatenate(out, axis=0)

logit = batched_logits(pairs, bs=64, max_length=192)
r_raw = pearsonr(logit, y)[0] if np.std(logit) > 0 else float('nan')
sig = 1.0/(1.0 + np.exp(-logit))
r_sig = pearsonr(sig.astype(np.float32), y)[0] if np.std(sig) > 0 else float('nan')
print('diag raw r=', round(float(r_raw), 6), 'sig r=', round(float(r_sig), 6), 'logit stats:', float(logit.min()), float(logit.max()), float(logit.mean()), flush=True)

Loading cross-encoder/ms-marco-MiniLM-L-6-v2 with transformers on CPU...


Loaded in 0.42 s


diag raw r= 0.55069 sig r= 0.516399 logit stats: -11.419656753540039 8.316144943237305 -5.684374809265137


In [17]:
# Full 5-fold CE (transformers) using STS-B regression head: cross-encoder/stsb-roberta-base
import time, numpy as np, pandas as pd, torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.stats import pearsonr

SEED = 42
np.random.seed(SEED)
torch.set_grad_enabled(False)

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
folds = pd.read_csv('folds_by_id.csv')
train = train.merge(folds, on='id', how='left', validate='one_to_one')
train['fold'] = train['fold'].astype(int)
NUM_FOLDS = int(train['fold'].max()) + 1

MODEL = 'cross-encoder/stsb-roberta-base'
MAX_LEN = 256
BATCH = 64
print('Loading', MODEL, 'on CPU...', flush=True)
t0 = time.time()
tok = AutoTokenizer.from_pretrained(MODEL, use_fast=False)
mdl = AutoModelForSequenceClassification.from_pretrained(MODEL)
mdl.eval(); mdl.to('cpu')
print('Loaded in', round(time.time()-t0, 2), 's', flush=True)

def batched_scores(pairs, bs=BATCH, max_length=MAX_LEN):
    out = []
    for i in range(0, len(pairs), bs):
        a, b = zip(*pairs[i:i+bs])
        enc = tok(list(a), list(b), padding=True, truncation=True, max_length=max_length, return_tensors='pt')
        with torch.no_grad():
            logits = mdl(**{k: v.to('cpu') for k, v in enc.items()}).logits.squeeze(-1)
        out.append(logits.cpu().numpy().astype(np.float32))
    return np.concatenate(out, axis=0)

def make_pairs(df: pd.DataFrame, reverse: bool = False):
    a = df['anchor'].astype(str).tolist()
    b = df['target'].astype(str).tolist()
    return list(zip(b, a)) if reverse else list(zip(a, b))

y = train['score'].astype(np.float32).values
oof = np.zeros(len(train), dtype=np.float32)

# Precompute test predictions (symmetry avg for stability)
pairs_te_f = make_pairs(test, reverse=False)
pairs_te_r = make_pairs(test, reverse=True)
s_te_f = batched_scores(pairs_te_f)
s_te_r = batched_scores(pairs_te_r)
te_scores = ((s_te_f + s_te_r) / 2.0).astype(np.float32)

for f in range(NUM_FOLDS):
    f0 = time.time()
    va_idx = np.where(train['fold'].values == f)[0]
    va_df = train.iloc[va_idx]
    s_f = batched_scores(make_pairs(va_df, reverse=False))
    s_r = batched_scores(make_pairs(va_df, reverse=True))
    s = ((s_f + s_r) / 2.0).astype(np.float32)
    oof[va_idx] = s
    r = pearsonr(s, y[va_idx])[0]
    print(f'[STS-B Fold {f}] r={r:.6f}; elapsed {time.time()-f0:.1f}s', flush=True)

r_all = pearsonr(oof, y)[0] if np.std(oof) > 0 else float('nan')
print('STS-B full OOF r=', round(float(r_all), 6), flush=True)

# Save with strict schemas
pd.DataFrame({'id': train['id'], 'pred': oof}).to_csv('oof_ce_stsb_plain.csv', index=False)
pd.DataFrame({'id': test['id'], 'pred': te_scores}).to_csv('submission_ce_stsb_plain.csv', index=False)
print('Saved oof_ce_stsb_plain.csv and submission_ce_stsb_plain.csv', flush=True)

Loading cross-encoder/stsb-roberta-base on CPU...


Loaded in 0.28 s


[STS-B Fold 0] r=0.505154; elapsed 29.7s


[STS-B Fold 1] r=0.518127; elapsed 30.2s


[STS-B Fold 2] r=0.505230; elapsed 33.7s


[STS-B Fold 3] r=0.522861; elapsed 34.2s


[STS-B Fold 4] r=0.507379; elapsed 32.0s


STS-B full OOF r= 0.51171


Saved oof_ce_stsb_plain.csv and submission_ce_stsb_plain.csv


In [22]:
# Build fold-safe CE feature transforms (raw, iso, z, rank) from existing oof_ce_minilm.csv/submission_ce_minilm.csv
import numpy as np, pandas as pd
from sklearn.isotonic import IsotonicRegression

train = pd.read_csv('train.csv')
test  = pd.read_csv('test.csv')
folds = pd.read_csv('folds_by_id.csv')
oof   = pd.read_csv('oof_ce_minilm.csv')  # [id,pred]
sub   = pd.read_csv('submission_ce_minilm.csv')  # [id,pred]

# Merge folds
train = train.merge(folds, on='id', how='left', validate='one_to_one')
train['fold'] = train['fold'].astype(int)
NUM_FOLDS = int(train['fold'].max()) + 1

y = train['score'].astype(np.float32).values
oof_raw = train[['id']].merge(oof, on='id', how='left', validate='one_to_one')['pred'].astype(np.float32).values
te_raw  = test[['id']].merge(sub, on='id', how='left', validate='one_to_one')['pred'].astype(np.float32).values

# Allocate
oof_iso  = np.zeros(len(train), dtype=np.float32)
oof_z    = np.zeros(len(train), dtype=np.float32)
oof_rank = np.zeros(len(train), dtype=np.float32)
te_iso_acc  = np.zeros(len(test), dtype=np.float64)
te_z_acc    = np.zeros(len(test), dtype=np.float64)
te_rank_acc = np.zeros(len(test), dtype=np.float64)

fold_arr = train['fold'].values.astype(int)
for f in range(NUM_FOLDS):
    tr = fold_arr != f
    va = fold_arr == f
    # Isotonic on train-only
    iso = IsotonicRegression(increasing=True, out_of_bounds='clip')
    iso.fit(oof_raw[tr], y[tr])
    oof_iso[va] = iso.transform(oof_raw[va]).astype(np.float32)
    te_iso_acc += iso.transform(te_raw).astype(np.float64)
    # z-score using train-only stats
    mu = float(oof_raw[tr].mean()); sd = float(oof_raw[tr].std()) or 1.0
    oof_z[va] = (oof_raw[va] - mu) / sd
    te_z_acc += (te_raw - mu) / sd
    # rank within train-only reference
    ref = np.sort(oof_raw[tr].astype(np.float32))
    if ref.size > 0:
        j_va = np.searchsorted(ref, oof_raw[va], side='right')
        oof_rank[va] = j_va / max(ref.size - 1, 1)
        j_te = np.searchsorted(ref, te_raw, side='right')
        te_rank_acc += (j_te / max(ref.size - 1, 1))

te_iso  = (te_iso_acc / NUM_FOLDS).astype(np.float32)
te_z    = (te_z_acc / NUM_FOLDS).astype(np.float32)
te_rank = (te_rank_acc / NUM_FOLDS).astype(np.float32)

# Save feature blocks with consistent schemas
oof_df = pd.DataFrame({
    'id': train['id'],
    'ce_plain_raw': oof_raw,
    'ce_plain_iso': oof_iso,
    'ce_plain_z': oof_z,
    'ce_plain_rank': oof_rank,
})
te_df = pd.DataFrame({
    'id': test['id'],
    'ce_plain_raw': te_raw,
    'ce_plain_iso': te_iso,
    'ce_plain_z': te_z,
    'ce_plain_rank': te_rank,
})
oof_df.to_csv('oof_ce_plain_feats.csv', index=False)
te_df.to_csv('ce_plain_feats_test.csv', index=False)
print('Saved oof_ce_plain_feats.csv and ce_plain_feats_test.csv')

Saved oof_ce_plain_feats.csv and ce_plain_feats_test.csv
