In [2]:
# BM25 Okapi/L/Plus features (A->B and B->A), fold-safe with stemming, ranks, squares, on normalized text.
# Outputs: oof_bm25_var_norm.csv and bm25_var_norm_test.csv
import time, re, math, gc, unicodedata, numpy as np, pandas as pd
from collections import Counter, defaultdict

# ---------- Normalization utilities (NFKC + confusables + number-unit split + de-hyphen) ----------
# Symbol -> ASCII replacements, applied in insertion order by normalize_text.
# The two-character degree entries must stay ahead of the bare degree sign so
# that they are matched first.
REPL = dict([
    ('\u00b5', 'u'),        # micro sign
    ('\u03bc', 'u'),        # Greek mu
    ('\u03a9', 'ohm'),      # Omega
    ('\u2126', 'ohm'),      # Ohm symbol
    ('\u00b0C', 'deg C'),   # degree C
    ('\u00b0F', 'deg F'),   # degree F
    ('\u00b0', 'deg'),      # bare degree
    ('\u00d7', 'x'),        # multiplication sign
    ('\u2032', "'"),        # prime
    ('\u2033', '"'),        # double prime
])
# Translation table mapping the subscript digits U+2080..U+2089 to ASCII '0'..'9'.
_SUBS = {0x2080 + digit: ord('0') + digit for digit in range(10)}
# Superscript characters and their ASCII equivalents.
_SUPS_MAP = dict(zip('²³⁺⁻', '23+-'))
# A number (optionally with a '.' or '/' fraction part) immediately followed by
# a unit-like run of letters, '%' or '/'; used to insert a space between them.
NUM_UNIT_ATTACH = re.compile(r'(?i)(\d+(?:[\./]\d+)?)([a-zA-Z%][a-zA-Z%/]*)')

def nfkc(s: str) -> str:
    """Return *s* converted to Unicode normalization form NFKC."""
    folded = unicodedata.normalize('NFKC', s)
    return folded

def normalize_text(s: str) -> str:
    """Canonicalise raw text before tokenisation.

    Steps, in this order: NFKC folding, symbol replacements from REPL,
    superscript and subscript folding, hyphen/underscore to space, splitting
    of a number glued to a unit ("10mm" -> "10 mm"), lower-casing, and
    collapsing of whitespace runs to single spaces with the ends trimmed.

    Non-string input is converted with ``str()`` first.
    """
    text = nfkc(str(s))
    # REPL first, then superscripts; each table is walked in insertion order.
    for table in (REPL, _SUPS_MAP):
        for needle, replacement in table.items():
            text = text.replace(needle, replacement)
    text = text.translate(_SUBS)
    for joiner in ('-', '_'):
        text = text.replace(joiner, ' ')
    text = NUM_UNIT_ATTACH.sub(r'\1 \2', text).lower()
    return re.sub(r'\s+', ' ', text).strip()

# ---------- Tokenizer with Porter stemming ----------
# Porter stemming is optional: without nltk the tokens are passed through
# unchanged. That changes every downstream feature, so the fallback is
# announced with a warning instead of being taken silently.
try:
    from nltk.stem import PorterStemmer
except ImportError:
    import warnings
    warnings.warn(
        'nltk is not available; tokens will NOT be stemmed',
        RuntimeWarning,
        stacklevel=2,
    )
    _stemmer = None
else:
    _stemmer = PorterStemmer()


def stem_token(tok: str) -> str:
    """Return the Porter stem of *tok*, or *tok* itself when nltk is missing."""
    if _stemmer is None:
        return tok
    return _stemmer.stem(tok)

# A token is a run of [a-z0-9], optionally followed by one '.' or '/' and
# another such run (so "1.5" and "and/or" stay single tokens).
_word_re = re.compile(r"[a-z0-9]+(?:[./][a-z0-9]+)?")


def tokenize_stems(text: str):
    """Normalise *text*, split it into word tokens and stem each token.

    Anything that is not a ``str`` is treated as the empty string, which
    yields an empty list.
    """
    cleaned = normalize_text(text if isinstance(text, str) else '')
    # Every regex match is non-empty, so no further filtering is needed.
    return [stem_token(match.group(0)) for match in _word_re.finditer(cleaned)]

# ---------- BM25 IDF and scoring ----------
def build_corpus_stats(docs_tokens):
    """Collect the corpus statistics needed by the BM25 scorers.

    Args:
        docs_tokens: sequence of token lists, one per document.

    Returns:
        dict with
        'N'     - number of documents,
        'df'    - Counter mapping term -> number of documents containing it,
        'idf'   - dict mapping term -> log((N - df + 0.5) / (df + 0.5) + 1),
        'dl'    - int32 array of document lengths,
        'avgdl' - mean document length as a float (0.0 for an empty corpus).
    """
    n_docs = len(docs_tokens)
    lengths = np.fromiter(
        (len(tokens) for tokens in docs_tokens), dtype=np.int32, count=n_docs
    )
    doc_freq = Counter()
    for tokens in docs_tokens:
        # Each document contributes at most once per distinct term.
        doc_freq.update(set(tokens))
    mean_len = float(lengths.mean()) if n_docs > 0 else 0.0
    idf = {}
    for term, c in doc_freq.items():
        idf[term] = math.log((n_docs - c + 0.5)/(c + 0.5) + 1.0)
    return {'N': n_docs, 'df': doc_freq, 'idf': idf, 'dl': lengths, 'avgdl': mean_len}

def tf_counts(tokens):
    """Return a Counter mapping each token in *tokens* to its frequency."""
    counts = Counter()
    counts.update(tokens)
    return counts

def bm25_okapi_score(query_tokens, doc_tokens, stats, k1=1.5, b=0.75):
    """Score one document against one query with Okapi BM25.

    Args:
        query_tokens: query terms; each distinct term is counted once.
        doc_tokens: tokens of the document being scored.
        stats: mapping with 'idf' (term -> IDF weight) and 'avgdl' (mean
            document length), as returned by build_corpus_stats.
        k1: term-frequency saturation parameter.
        b: document-length normalisation parameter.

    Returns:
        The score as a float; 0.0 when either token list is empty or when no
        query term with a known IDF occurs in the document.
    """
    if not query_tokens or not doc_tokens:
        return 0.0
    idf = stats['idf']
    # Guard against a zero average length (empty reference corpus).
    avgdl = stats['avgdl'] if stats['avgdl'] > 0 else 1.0
    tf = Counter(doc_tokens)
    # The length-normalisation term does not depend on the query term.
    norm = k1 * (1 - b + b * len(doc_tokens) / avgdl)
    score = 0.0
    # dict.fromkeys de-duplicates in first-seen order, so the floating-point
    # summation order is reproducible; iterating a set of strings would make
    # it depend on per-process hash randomisation.
    for t in dict.fromkeys(query_tokens):
        f = tf.get(t, 0)
        if f == 0 or t not in idf:
            continue
        score += idf[t] * (f * (k1 + 1)) / (f + norm)
    return float(score)

def bm25l_score(query_tokens, doc_tokens, stats, k1=1.5, b=0.75, delta=0.5):
    """Score one document against one query with BM25L.

    BM25L shifts the *length-normalised* term frequency
    ``c = f / (1 - b + b * dl / avgdl)`` by ``delta``::

        idf * (k1 + 1) * (c + delta) / (k1 + c + delta)

    The previous implementation added ``delta`` to the raw term frequency and
    to the Okapi denominator instead, which only coincides with BM25L when the
    document has exactly average length.

    Args:
        query_tokens: query terms; each distinct term is counted once.
        doc_tokens: tokens of the document being scored.
        stats: mapping with 'idf' (term -> IDF weight) and 'avgdl' (mean
            document length), as returned by build_corpus_stats.
        k1: term-frequency saturation parameter.
        b: document-length normalisation parameter.
        delta: shift applied to the normalised term frequency.

    Returns:
        The score as a float; 0.0 when either token list is empty or when no
        query term with a known IDF occurs in the document.
    """
    if not query_tokens or not doc_tokens:
        return 0.0
    idf = stats['idf']
    # Guard against a zero average length (empty reference corpus).
    avgdl = stats['avgdl'] if stats['avgdl'] > 0 else 1.0
    tf = Counter(doc_tokens)
    # Length normalisation is the same for every query term.
    norm = 1 - b + b * len(doc_tokens) / avgdl
    score = 0.0
    # dict.fromkeys de-duplicates in first-seen order, keeping the summation
    # order independent of string hash randomisation.
    for t in dict.fromkeys(query_tokens):
        f = tf.get(t, 0)
        if f == 0 or t not in idf:
            continue
        ctd = f / norm
        score += idf[t] * (k1 + 1) * (ctd + delta) / (k1 + ctd + delta)
    return float(score)

def bm25plus_score(query_tokens, doc_tokens, stats, k1=1.2, b=0.75, delta=1.0):
    """Score one document against one query with BM25+.

    Each matching term contributes its Okapi BM25 term-frequency component
    plus a constant ``delta``, weighted by the term's IDF.

    Args:
        query_tokens: query terms; each distinct term is counted once.
        doc_tokens: tokens of the document being scored.
        stats: mapping with 'idf' (term -> IDF weight) and 'avgdl' (mean
            document length), as returned by build_corpus_stats.
        k1: term-frequency saturation parameter.
        b: document-length normalisation parameter.
        delta: constant added to the component of every matching term.

    Returns:
        The score as a float; 0.0 when either token list is empty or when no
        query term with a known IDF occurs in the document.
    """
    if not query_tokens or not doc_tokens:
        return 0.0
    idf = stats['idf']
    # Guard against a zero average length (empty reference corpus).
    avgdl = stats['avgdl'] if stats['avgdl'] > 0 else 1.0
    tf = Counter(doc_tokens)
    # The length-normalisation term does not depend on the query term.
    norm = k1 * (1 - b + b * len(doc_tokens) / avgdl)
    score = 0.0
    # dict.fromkeys de-duplicates in first-seen order, keeping the summation
    # order independent of string hash randomisation.
    for t in dict.fromkeys(query_tokens):
        f = tf.get(t, 0)
        if f == 0 or t not in idf:
            continue
        score += idf[t] * ((f * (k1 + 1)) / (f + norm) + delta)
    return float(score)

# ---------- Load data and folds ----------
t0 = time.time()
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
folds = pd.read_csv('folds_by_id.csv')
train = train.merge(folds, on='id', how='left', validate='one_to_one')
# The left merge leaves NaN for ids that are absent from folds_by_id.csv.
# Casting NaN to int through numpy does not raise; it produces a garbage
# integer, so such rows would never be validated and their OOF features would
# silently stay zero. Fail loudly instead.
n_unassigned = int(train['fold'].isna().sum())
if n_unassigned:
    raise ValueError(
        f'{n_unassigned} train rows have no fold assignment in folds_by_id.csv'
    )
fold_arr = train['fold'].values.astype(int)
NUM_FOLDS = int(train['fold'].max()) + 1
print('Found folds:', NUM_FOLDS, flush=True)

# Pre-tokenize once (normalized + stemmed)
A_tr_tok = [tokenize_stems(x) for x in train['anchor'].astype(str).tolist()]
B_tr_tok = [tokenize_stems(x) for x in train['target'].astype(str).tolist()]
A_te_tok = [tokenize_stems(x) for x in test['anchor'].astype(str).tolist()]
B_te_tok = [tokenize_stems(x) for x in test['target'].astype(str).tolist()]

n_tr = len(train); n_te = len(test)
# Column layout: the six raw scores, then their squares, ranks and percentiles,
# each group in the same (variant, direction) order.
_base_cols = [
    'bm25_okapi_ab', 'bm25_okapi_ba',
    'bm25l_ab', 'bm25l_ba',
    'bm25p_ab', 'bm25p_ba',
]
cols = list(_base_cols)
for _suffix in ('_sq', '_rank', '_pct'):
    cols.extend(name + _suffix for name in _base_cols)
oof = np.zeros((n_tr, len(cols)), dtype=np.float32)
te_accum = np.zeros((n_te, len(cols)), dtype=np.float32)

def _pair_scores(a_tok, b_tok, stats_a, stats_b):
    """Return the six raw BM25 scores of one (anchor, target) pair.

    Order matches the first six entries of ``cols``: Okapi, BM25L and BM25+,
    each as A->B (anchor as query, target as document, target-corpus stats)
    followed by B->A (target as query, anchor as document, anchor-corpus stats).
    """
    return (
        bm25_okapi_score(a_tok, b_tok, stats_b, k1=1.5, b=0.75),
        bm25_okapi_score(b_tok, a_tok, stats_a, k1=1.5, b=0.75),
        bm25l_score(a_tok, b_tok, stats_b, k1=1.5, b=0.75, delta=0.5),
        bm25l_score(b_tok, a_tok, stats_a, k1=1.5, b=0.75, delta=0.5),
        bm25plus_score(a_tok, b_tok, stats_b, k1=1.2, b=0.75, delta=1.0),
        bm25plus_score(b_tok, a_tok, stats_a, k1=1.2, b=0.75, delta=1.0),
    )


def _feature_row(scores, ref_sorted):
    """Expand raw scores into [scores, squares, ranks, percentiles].

    ``ref_sorted`` holds, column by column in ascending order, the same scores
    for the training rows of the current fold that share the anchor; it is
    None when the anchor has no such rows, in which case ranks and percentiles
    stay NaN. A rank is the number of reference scores <= the score; the
    percentile is that rank divided by the reference group size.
    """
    n = len(scores)
    row = np.full(4 * n, np.nan, dtype=np.float64)
    row[:n] = scores
    row[n:2 * n] = row[:n] * row[:n]
    if ref_sorted is not None and len(ref_sorted) > 0:
        group_size = len(ref_sorted)
        for k in range(n):
            pos = np.searchsorted(ref_sorted[:, k], row[k], side='right')
            row[2 * n + k] = pos
            row[3 * n + k] = pos / group_size
    return row


anchors_tr = train['anchor'].tolist()
anchors_te = test['anchor'].tolist()
# Number of folds that produced a non-NaN value for every test cell.
te_cnt = np.zeros(te_accum.shape, dtype=np.float32)

for f in range(NUM_FOLDS):
    f0 = time.time()
    tr_idx = np.where(fold_arr != f)[0]
    va_idx = np.where(fold_arr == f)[0]
    print(f'Fold {f}: train {len(tr_idx)} val {len(va_idx)}', flush=True)

    # Build train-only corpus stats for B (for A->B scoring) and for A (for B->A scoring)
    stats_B = build_corpus_stats([B_tr_tok[i] for i in tr_idx])
    stats_A = build_corpus_stats([A_tr_tok[i] for i in tr_idx])

    # Reference scores of the training rows, grouped by anchor. They are kept
    # in float64 (the precision the scores are computed in) so that an equal
    # score ties exactly in searchsorted instead of depending on float32
    # rounding of the reference side only.
    ref_rows = defaultdict(list)
    for i in tr_idx:
        ref_rows[anchors_tr[i]].append(
            _pair_scores(A_tr_tok[i], B_tr_tok[i], stats_A, stats_B)
        )
    ref_sorted = {
        anchor: np.sort(np.asarray(rows, dtype=np.float64), axis=0)
        for anchor, rows in ref_rows.items()
    }

    # Compute OOF for validation rows
    for idx in va_idx:
        scores = _pair_scores(A_tr_tok[idx], B_tr_tok[idx], stats_A, stats_B)
        oof[idx, :] = _feature_row(scores, ref_sorted.get(anchors_tr[idx]))

    # Test features using this fold's stats
    te_vals = np.empty((n_te, len(cols)), dtype=np.float32)
    for j, (a_tok, b_tok, anchor) in enumerate(zip(A_te_tok, B_te_tok, anchors_te)):
        scores = _pair_scores(a_tok, b_tok, stats_A, stats_B)
        te_vals[j, :] = _feature_row(scores, ref_sorted.get(anchor))
    # Accumulate only the defined cells: a plain `+=` would let one fold in
    # which the anchor is missing from the training split turn the averaged
    # rank/percentile into NaN for good.
    defined = ~np.isnan(te_vals)
    te_accum[defined] += te_vals[defined]
    te_cnt += defined

    print(f'Fold {f} done in {time.time()-f0:.2f}s', flush=True)
    gc.collect()

# Average test across the folds that produced a value; cells that no fold
# could fill stay NaN.
te_mean = np.full(te_accum.shape, np.nan, dtype=np.float32)
np.divide(te_accum, te_cnt, out=te_mean, where=te_cnt > 0)

# Save
oof_df = pd.DataFrame({'id': train['id']})
for k, c in enumerate(cols):
    oof_df[c] = oof[:, k]
oof_df.to_csv('oof_bm25_var_norm.csv', index=False)

te_df = pd.DataFrame({'id': test['id']})
for k, c in enumerate(cols):
    te_df[c] = te_mean[:, k]
te_df.to_csv('bm25_var_norm_test.csv', index=False)

print('Saved oof_bm25_var_norm.csv and bm25_var_norm_test.csv; elapsed', round((time.time()-t0)/60,2), 'min', flush=True)

Found folds: 5


Fold 0: train 26739 val 6086


Fold 0 done in 0.58s


Fold 1: train 26310 val 6515


Fold 1 done in 0.51s


Fold 2: train 26195 val 6630


Fold 2 done in 0.51s


Fold 3: train 25884 val 6941


Fold 3 done in 0.50s


Fold 4: train 26172 val 6653


Fold 4 done in 0.51s


Saved oof_bm25_var_norm.csv and bm25_var_norm_test.csv; elapsed 0.06 min
