In [1]:
# IDF-weighted overlap features (1-2 grams), fold-safe. Outputs: oof_idf_overlap.csv, idf_overlap_test.csv
import time, re, math, numpy as np, pandas as pd
from collections import Counter
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

SEED = 42
np.random.seed(SEED)

def tokenize_words(s: str):
    return re.findall(r"\w+", str(s).lower())

STOP = set(ENGLISH_STOP_WORDS)

def filter_tokens(tokens):
    # remove stopwords and single-character tokens (mostly noise)
    return [t for t in tokens if (t not in STOP and len(t) > 1)]

def gen_ngrams(tokens, n):
    if n == 1:
        return tokens
    return [' '.join(tokens[i:i+n]) for i in range(len(tokens)-n+1)]

def build_df(corpus_docs):
    # corpus_docs: iterable of sets of terms
    df = Counter()
    for terms in corpus_docs:
        if terms:
            df.update(set(terms))
    return df

def idf_from_df(df_counter, N):
    # log((N - df + 0.5)/(df + 0.5) + 1)
    idf = {}
    for t, df in df_counter.items():
        idf[t] = math.log((N - df + 0.5)/(df + 0.5) + 1.0)
    return idf

def weighted_overlap_metrics(A_terms, B_terms, idf):
    # IDF-weighted precision/recall/F1 and Jaccard
    A_set, B_set = set(A_terms), set(B_terms)
    if not A_set and not B_set:
        return (0.0, 0.0, 0.0, 0.0, 0.0, 0, 0)
    inter = A_set & B_set
    union = A_set | B_set
    w_inter = sum(idf.get(t, 0.0) for t in inter)
    w_A = sum(idf.get(t, 0.0) for t in A_set) + 1e-12
    w_B = sum(idf.get(t, 0.0) for t in B_set) + 1e-12
    w_union = sum(idf.get(t, 0.0) for t in union) + 1e-12
    prec = w_inter / w_A
    rec = w_inter / w_B
    f1 = 0.0 if (prec+rec) == 0 else (2*prec*rec)/(prec+rec)
    jac = w_inter / w_union
    return (prec, rec, f1, jac, w_inter, len(A_set), len(B_set))

t0 = time.time()
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
folds = pd.read_csv('folds_by_id.csv')
train = train.merge(folds, on='id', how='left', validate='one_to_one')
assert (train['fold']>=0).all(), 'Fold merge by id failed'

# Pre-tokenize and build 1-2 gram bags per row (stopword-removed)
A_tr_tok = [filter_tokens(tokenize_words(x)) for x in train['anchor'].astype(str).tolist()]
B_tr_tok = [filter_tokens(tokenize_words(x)) for x in train['target'].astype(str).tolist()]
A_te_tok = [filter_tokens(tokenize_words(x)) for x in test['anchor'].astype(str).tolist()]
B_te_tok = [filter_tokens(tokenize_words(x)) for x in test['target'].astype(str).tolist()]

def make_ngrams_pairs(tokens_list):
    uni = [gen_ngrams(t,1) for t in tokens_list]
    bi = [gen_ngrams(t,2) for t in tokens_list]
    return uni, bi

A_tr_uni, A_tr_bi = make_ngrams_pairs(A_tr_tok)
B_tr_uni, B_tr_bi = make_ngrams_pairs(B_tr_tok)
A_te_uni, A_te_bi = make_ngrams_pairs(A_te_tok)
B_te_uni, B_te_bi = make_ngrams_pairs(B_te_tok)

fold_arr = train['fold'].values.astype(int)
n_tr = len(train); n_te = len(test)

# Allocate OOF arrays
cols = [
    'idf1_prec','idf1_rec','idf1_f1','idf1_jac','idf1_wi',
    'idf2_prec','idf2_rec','idf2_f1','idf2_jac','idf2_wi'
]
oof = np.zeros((n_tr, len(cols)), dtype=np.float32)
te_fold_preds = []  # list of (n_te, len(cols)) arrays

for f in sorted(np.unique(fold_arr)):
    f0 = time.time()
    tr_idx = np.where(fold_arr != f)[0]
    va_idx = np.where(fold_arr == f)[0]
    # Build corpus for IDF on train-only (union of anchor+target docs for each n-gram level)
    # Unigrams
    corpus_uni_docs = [set(A_tr_uni[i]) for i in tr_idx] + [set(B_tr_uni[i]) for i in tr_idx]
    N_uni = len(corpus_uni_docs)
    df_uni = build_df(corpus_uni_docs)
    idf_uni = idf_from_df(df_uni, N_uni)
    # Bigrams
    corpus_bi_docs = [set(A_tr_bi[i]) for i in tr_idx] + [set(B_tr_bi[i]) for i in tr_idx]
    N_bi = len(corpus_bi_docs)
    df_bi = build_df(corpus_bi_docs)
    idf_bi = idf_from_df(df_bi, N_bi)

    # Compute OOF for this fold
    for i in va_idx:
        p1, r1, f1, j1, wi1, _, _ = weighted_overlap_metrics(A_tr_uni[i], B_tr_uni[i], idf_uni)
        p2, r2, f2, j2, wi2, _, _ = weighted_overlap_metrics(A_tr_bi[i], B_tr_bi[i], idf_bi)
        oof[i, :] = [p1, r1, f1, j1, wi1, p2, r2, f2, j2, wi2]

    # Compute test features for this fold
    te_mat = np.zeros((n_te, len(cols)), dtype=np.float32)
    for j in range(n_te):
        p1, r1, f1, j1, wi1, _, _ = weighted_overlap_metrics(A_te_uni[j], B_te_uni[j], idf_uni)
        p2, r2, f2, j2, wi2, _, _ = weighted_overlap_metrics(A_te_bi[j], B_te_bi[j], idf_bi)
        te_mat[j, :] = [p1, r1, f1, j1, wi1, p2, r2, f2, j2, wi2]
    te_fold_preds.append(te_mat)
    print(f'IDF-overlap fold {int(f)} done in {time.time()-f0:.1f}s', flush=True)

# Aggregate test across folds (mean)
te_mean = np.mean(np.stack(te_fold_preds, axis=0), axis=0).astype(np.float32)

# Save artifacts
oof_df = pd.DataFrame({'id': train['id']})
for k, c in enumerate(cols):
    oof_df[c] = oof[:, k]
oof_df.to_csv('oof_idf_overlap.csv', index=False)

te_df = pd.DataFrame({'id': test['id']})
for k, c in enumerate(cols):
    te_df[c] = te_mean[:, k]
te_df.to_csv('idf_overlap_test.csv', index=False)

print('Saved oof_idf_overlap.csv and idf_overlap_test.csv; elapsed', round((time.time()-t0)/60,2), 'min')

IDF-overlap fold 0 done in 0.2s


IDF-overlap fold 1 done in 0.2s


IDF-overlap fold 2 done in 0.2s


IDF-overlap fold 3 done in 0.2s


IDF-overlap fold 4 done in 0.2s


Saved oof_idf_overlap.csv and idf_overlap_test.csv; elapsed 0.02 min


In [3]:
# Ensure rapidfuzz is installed (CPU-only, safe to add)
import sys, subprocess
subprocess.run([sys.executable, '-m', 'pip', 'install', 'rapidfuzz==3.9.7'], check=True)
print('rapidfuzz installed')

Collecting rapidfuzz==3.9.7
  Downloading rapidfuzz-3.9.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.4/3.4 MB 87.9 MB/s eta 0:00:00


Installing collected packages: rapidfuzz
Successfully installed rapidfuzz-3.9.7
rapidfuzz installed


In [4]:
# RapidFuzz token-set features (fold-safe; no fitting). Outputs: oof_fuzz.csv, fuzz_test.csv
import time, numpy as np, pandas as pd
from rapidfuzz import fuzz, utils

t0 = time.time()
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
folds = pd.read_csv('folds_by_id.csv')
train = train.merge(folds, on='id', how='left', validate='one_to_one')
assert (train['fold']>=0).all(), 'Fold merge by id failed'

def prep_texts(df):
    a = df['anchor'].astype(str).str.lower().tolist()
    b = df['target'].astype(str).str.lower().tolist()
    return a, b

A_tr, B_tr = prep_texts(train)
A_te, B_te = prep_texts(test)

def compute_rf(a_list, b_list):
    n = len(a_list)
    s1 = np.zeros(n, dtype=np.float32)
    s2 = np.zeros(n, dtype=np.float32)
    proc = utils.default_process
    for i, (a, b) in enumerate(zip(a_list, b_list)):
        s1[i] = fuzz.token_set_ratio(a, b, processor=proc) / 100.0
        s2[i] = fuzz.partial_token_set_ratio(a, b, processor=proc) / 100.0
    return s1, s2

tr_set, tr_partial = compute_rf(A_tr, B_tr)
te_set, te_partial = compute_rf(A_te, B_te)

pd.DataFrame({'id': train['id'], 'rf_token_set': tr_set, 'rf_partial_token_set': tr_partial}).to_csv('oof_fuzz.csv', index=False)
pd.DataFrame({'id': test['id'], 'rf_token_set': te_set, 'rf_partial_token_set': te_partial}).to_csv('fuzz_test.csv', index=False)
print('Saved oof_fuzz.csv and fuzz_test.csv; elapsed', round((time.time()-t0)/60,2), 'min')

Saved oof_fuzz.csv and fuzz_test.csv; elapsed 0.0 min


In [5]:
# LCS (char/token) and char n-gram (3-5) similarities. Outputs: oof_lcs_char_ngrams.csv, lcs_char_ngrams_test.csv
import time, numpy as np, pandas as pd, re
from rapidfuzz.distance import LCSseq

t0 = time.time()
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
folds = pd.read_csv('folds_by_id.csv')
train = train.merge(folds, on='id', how='left', validate='one_to_one')
assert (train['fold']>=0).all(), 'Fold merge by id failed'

def char_lcs_ratio(a: str, b: str) -> float:
    # normalized_similarity returns [0,100]
    return float(LCSseq.normalized_similarity(str(a), str(b))) / 100.0

def tokenize_words(s: str):
    return re.findall(r"\w+", str(s).lower())

def token_lcs_ratio(a: str, b: str) -> float:
    ta = tokenize_words(a); tb = tokenize_words(b)
    na, nb = len(ta), len(tb)
    if na == 0 and nb == 0:
        return 0.0
    # DP LCS on tokens (phrases are short, so this is fine)
    dp = [0] * (nb + 1)
    for i in range(1, na + 1):
        prev = 0
        ai = ta[i-1]
        for j in range(1, nb + 1):
            tmp = dp[j]
            if ai == tb[j-1]:
                dp[j] = prev + 1
            else:
                if dp[j] < dp[j-1]:
                    dp[j] = dp[j-1]
            prev = tmp
    lcs_len = dp[nb]
    return float(lcs_len) / float(max(na, nb) if max(na, nb) > 0 else 1)

def shingles(s: str, k: int):
    s = str(s).lower()
    if k <= 0:
        return set()
    if len(s) < k:
        return {s} if s else set()
    return {s[i:i+k] for i in range(len(s)-k+1)}

def jaccard(a: set, b: set) -> float:
    if not a and not b: return 0.0
    return len(a & b) / (len(a | b) + 1e-12)

def dice(a: set, b: set) -> float:
    if not a and not b: return 0.0
    return 2.0 * len(a & b) / (len(a) + len(b) + 1e-12)

def compute_features(df: pd.DataFrame):
    A = df['anchor'].astype(str).tolist()
    B = df['target'].astype(str).tolist()
    n = len(df)
    lcs_char = np.zeros(n, dtype=np.float32)
    lcs_tok  = np.zeros(n, dtype=np.float32)
    jac3 = np.zeros(n, dtype=np.float32); jac4 = np.zeros(n, dtype=np.float32); jac5 = np.zeros(n, dtype=np.float32)
    dice3 = np.zeros(n, dtype=np.float32); dice4 = np.zeros(n, dtype=np.float32); dice5 = np.zeros(n, dtype=np.float32)
    for i, (a, b) in enumerate(zip(A, B)):
        lcs_char[i] = char_lcs_ratio(a, b)
        lcs_tok[i]  = token_lcs_ratio(a, b)
        a3 = shingles(a, 3); b3 = shingles(b, 3)
        a4 = shingles(a, 4); b4 = shingles(b, 4)
        a5 = shingles(a, 5); b5 = shingles(b, 5)
        jac3[i] = jaccard(a3, b3); jac4[i] = jaccard(a4, b4); jac5[i] = jaccard(a5, b5)
        dice3[i] = dice(a3, b3); dice4[i] = dice(a4, b4); dice5[i] = dice(a5, b5)
    return pd.DataFrame({
        'lcs_char': lcs_char, 'lcs_tok': lcs_tok,
        'char3_jac': jac3, 'char4_jac': jac4, 'char5_jac': jac5,
        'char3_dice': dice3, 'char4_dice': dice4, 'char5_dice': dice5
    })

tr_feats = compute_features(train)
te_feats = compute_features(test)

tr_out = pd.concat([train[['id']].reset_index(drop=True), tr_feats.reset_index(drop=True)], axis=1)
te_out = pd.concat([test[['id']].reset_index(drop=True), te_feats.reset_index(drop=True)], axis=1)
tr_out.to_csv('oof_lcs_char_ngrams.csv', index=False)
te_out.to_csv('lcs_char_ngrams_test.csv', index=False)
print('Saved oof_lcs_char_ngrams.csv and lcs_char_ngrams_test.csv; elapsed', round((time.time()-t0)/60,2), 'min')

Saved oof_lcs_char_ngrams.csv and lcs_char_ngrams_test.csv; elapsed 0.01 min


In [7]:
# Quick health check for latest LGBM stacker outputs
import pandas as pd, numpy as np
from scipy.stats import pearsonr

train = pd.read_csv('train.csv')
oof = pd.read_csv('oof_stack_lgbm.csv')  # expects columns: id, oof
df = train[['id','score']].merge(oof, on='id', how='inner')
p = pearsonr(df['oof'].astype(float).values, df['score'].astype(float).values)[0]
print('OOF Pearson (oof_stack_lgbm.csv vs train score):', round(float(p), 6))

sub = pd.read_csv('submission_stack_lgbm.csv')
print('Submission stats: n=', len(sub), 'min=', float(sub['score'].min()), 'max=', float(sub['score'].max()), 'mean=', float(sub['score'].mean()))
print(sub.head())

OOF Pearson (oof_stack_lgbm.csv vs train score): 0.752741
Submission stats: n= 3648 min= 0.0 max= 1.0 mean= 0.35697559763779885
                 id     score
0  2a988c7d98568627  0.128831
1  75a3ae03b26e2f7e  0.301520
2  0126c870aede9858  0.076760
3  2cf662e1cc9b354e  0.314683
4  8dfee5874de0b408  0.149585


In [8]:
# Numeric and units normalization + overlap features (fold-safe, no fitting). Outputs: oof_numeric_units.csv, numeric_units_test.csv
import re, unicodedata, time, numpy as np, pandas as pd

t0 = time.time()
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# -------- Normalization utilities --------
def nfkc(s: str) -> str:
    return unicodedata.normalize('NFKC', s)

REPLACEMENTS = {
    '\u00b5': 'u',  # micro sign
    '\u03bc': 'u',  # Greek mu
    '\u03a9': 'ohm',  # Omega
    '\u2126': 'ohm',  # Ohm symbol
    '\u00b0C': 'C',   # degree C -> C
    '\u00b0F': 'F',   # degree F -> F
}

def normalize_symbols(s: str) -> str:
    s = nfkc(str(s))
    for k, v in REPLACEMENTS.items():
        s = s.replace(k, v)
    # unify separators
    s = s.replace('-', ' ').replace('_', ' ')
    return s

# Split attached number+unit like 10nm -> '10 nm'; 3.3kV -> '3.3 kV'
NUM_UNIT_ATTACH = re.compile(r'(?i)(\d+(?:[\./]\d+)?)([a-zA-Z%][a-zA-Z%/]*)')
def split_attached_num_unit(s: str) -> str:
    return NUM_UNIT_ATTACH.sub(r'\1 \2', s)

def basic_clean(s: str) -> str:
    s = normalize_symbols(s)
    s = split_attached_num_unit(s)
    return s

# -------- Extraction --------
NUM_RE = re.compile(r'(?i)\b\d+(?:[\./]\d+)?(?:e[+-]?\d+)?\b')
UNIT_TOKEN_RE = re.compile(r'(?i)^[a-z][a-z0-9%/^-]{0,6}$')  # short-ish unit-like tokens

def extract_numbers(text: str):
    return [m.group(0) for m in NUM_RE.finditer(text)]

def extract_tokens(text: str):
    # Keep alnum and % / ^
    return re.findall(r'[A-Za-z0-9%/^.]+', text.lower())

def is_unit(tok: str) -> bool:
    return bool(UNIT_TOKEN_RE.match(tok)) and not tok[0].isdigit()

def parse_num(s: str) -> float | None:
    try:
        # replace '/' in decimals already handled; just use float safely
        return float(s.replace('/', '.')) if '/' in s and s.count('/') == 1 else float(s)
    except Exception:
        return None

def extract_num_unit_pairs(tokens):
    pairs = []
    for i, tok in enumerate(tokens):
        if NUM_RE.fullmatch(tok):
            # look ahead for a unit token
            if i+1 < len(tokens) and is_unit(tokens[i+1]):
                pairs.append((tok, tokens[i+1]))
    return pairs

def features_for_row(a: str, b: str):
    a_txt = basic_clean(a)
    b_txt = basic_clean(b)
    nums_a = extract_numbers(a_txt)
    nums_b = extract_numbers(b_txt)
    toks_a = extract_tokens(a_txt)
    toks_b = extract_tokens(b_txt)
    # Units: collect tokens that look like units; prioritize ones following numbers
    units_a = set([t for t in toks_a if is_unit(t)])
    units_b = set([t for t in toks_b if is_unit(t)])
    # Number counts and overlaps
    cnt_num_a = len(nums_a); cnt_num_b = len(nums_b)
    overlap_num_exact = len(set(nums_a) & set(nums_b))
    # numeric deltas
    vals_a = [parse_num(x) for x in nums_a]; vals_a = [v for v in vals_a if v is not None]
    vals_b = [parse_num(x) for x in nums_b]; vals_b = [v for v in vals_b if v is not None]
    if vals_a and vals_b:
        # pairwise min absolute difference
        mins = []
        for va in vals_a:
            md = min(abs(va - vb) for vb in vals_b)
            mins.append(md)
        min_abs_delta = float(min(mins))
        mean_abs_delta = float(np.mean([abs(va - vb) for va in vals_a for vb in vals_b]))
        any_equal_round0 = any(int(round(va)) == int(round(vb)) for va in vals_a for vb in vals_b)
    else:
        min_abs_delta = np.nan; mean_abs_delta = np.nan; any_equal_round0 = False
    # unit overlaps
    unit_overlap_cnt = len(units_a & units_b)
    unit_union = len(units_a | units_b)
    unit_jaccard = (unit_overlap_cnt / unit_union) if unit_union > 0 else 0.0
    # number+unit bigram exact overlap
    pairs_a = set(extract_num_unit_pairs(toks_a))
    pairs_b = set(extract_num_unit_pairs(toks_b))
    pair_overlap = len(pairs_a & pairs_b)
    # simple ratios
    num_count_diff = abs(cnt_num_a - cnt_num_b)
    num_count_ratio = (min(cnt_num_a, cnt_num_b) / max(cnt_num_a, cnt_num_b)) if max(cnt_num_a, cnt_num_b) > 0 else 1.0
    return {
        'num_cnt_a': float(cnt_num_a),
        'num_cnt_b': float(cnt_num_b),
        'num_overlap_exact': float(overlap_num_exact),
        'num_min_abs_delta': float(min_abs_delta) if min_abs_delta == min_abs_delta else np.nan,
        'num_mean_abs_delta': float(mean_abs_delta) if mean_abs_delta == mean_abs_delta else np.nan,
        'num_equal_round_int': float(any_equal_round0),
        'unit_cnt_a': float(len(units_a)),
        'unit_cnt_b': float(len(units_b)),
        'unit_overlap_cnt': float(unit_overlap_cnt),
        'unit_jaccard': float(unit_jaccard),
        'numunit_pair_overlap': float(pair_overlap),
        'num_count_diff': float(num_count_diff),
        'num_count_ratio': float(num_count_ratio),
    }

def compute_df(df: pd.DataFrame) -> pd.DataFrame:
    A = df['anchor'].astype(str).tolist()
    B = df['target'].astype(str).tolist()
    rows = []
    for i, (a, b) in enumerate(zip(A, B)):
        rows.append(features_for_row(a, b))
        if (i+1) % 5000 == 0:
            print(f'.. {i+1} rows', flush=True)
    out = pd.DataFrame(rows)
    # cheap non-linearities
    out['unit_overlap_cnt_sq'] = out['unit_overlap_cnt'] ** 2
    out['num_overlap_exact_sq'] = out['num_overlap_exact'] ** 2
    # replace NaNs in deltas with large sentinel (will be median-imputed in stacker anyway)
    out['num_min_abs_delta'] = out['num_min_abs_delta'].fillna(1e6)
    out['num_mean_abs_delta'] = out['num_mean_abs_delta'].fillna(1e6)
    return out.astype('float32')

tr_feats = compute_df(train)
te_feats = compute_df(test)

pd.concat([train[['id']].reset_index(drop=True), tr_feats.reset_index(drop=True)], axis=1).to_csv('oof_numeric_units.csv', index=False)
pd.concat([test[['id']].reset_index(drop=True), te_feats.reset_index(drop=True)], axis=1).to_csv('numeric_units_test.csv', index=False)
print('Saved oof_numeric_units.csv and numeric_units_test.csv; elapsed', round((time.time()-t0)/60,2), 'min')

.. 5000 rows


.. 10000 rows


.. 15000 rows


.. 20000 rows


.. 25000 rows


.. 30000 rows


Saved oof_numeric_units.csv and numeric_units_test.csv; elapsed 0.01 min


In [9]:
# Abbreviation/acronym detection + expansion overlap features. Outputs: oof_acronym.csv, acronym_test.csv
import re, time, numpy as np, pandas as pd

t0 = time.time()
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Patterns: LONG (SHORT) | SHORT (LONG)
# SHORT: 2-10 uppercase letters/digits with no spaces; allow hyphen optional inside
# LONG: 3+ tokens letters/digits/-, at least as long as SHORT
P_LONG_SHORT = re.compile(r"([A-Za-z][A-Za-z0-9\- ]{2,}?)\s*\(([A-Z0-9][A-Z0-9\-]{1,9})\)")
P_SHORT_LONG = re.compile(r"\b([A-Z0-9][A-Z0-9\-]{1,9})\b\s*\(([A-Za-z][A-Za-z0-9\- ]{2,}?)\)")

def extract_pairs(text: str):
    pairs = {}  # short -> long
    if not isinstance(text, str):
        return pairs
    for m in P_LONG_SHORT.finditer(text):
        long = m.group(1).strip()
        short = m.group(2).strip()
        if len(short) >= 2 and len(long) >= len(short):
            pairs[short] = long
    for m in P_SHORT_LONG.finditer(text):
        short = m.group(1).strip()
        long = m.group(2).strip()
        if len(short) >= 2 and len(long) >= len(short):
            pairs[short] = long
    return pairs

def tokenize_simple(s: str):
    return re.findall(r"[A-Za-z0-9]+", s.lower())

def expand_text(s: str, pairs: dict):
    # Replace whole-word SHORT with LONG (case-sensitive for SHORT); guard boundaries
    if not pairs or not isinstance(s, str):
        return s if isinstance(s, str) else ''
    out = s
    for short, long in pairs.items():
        # whole word boundary replace; avoid catastrophic overlap by using regex
        try:
            out = re.sub(rf"\b{re.escape(short)}\b", long, out)
        except re.error:
            pass
    return out

def jaccard(a:set, b:set):
    if not a and not b: return 0.0
    return len(a & b) / (len(a | b) + 1e-12)

def dice(a:set, b:set):
    if not a and not b: return 0.0
    return 2.0 * len(a & b) / (len(a) + len(b) + 1e-12)

def features_for_row(a: str, b: str):
    m_a = extract_pairs(a)
    m_b = extract_pairs(b)
    # union map prioritizing longer expansions when conflict
    union = dict(m_a)
    for k, v in m_b.items():
        if k in union:
            union[k] = v if len(v) > len(union[k]) else union[k]
        else:
            union[k] = v
    a_exp = expand_text(a, union)
    b_exp = expand_text(b, union)
    ta = set(tokenize_simple(a))
    tb = set(tokenize_simple(b))
    tae = set(tokenize_simple(a_exp))
    tbe = set(tokenize_simple(b_exp))
    # base vs expanded overlaps
    jac_base = jaccard(ta, tb)
    dice_base = dice(ta, tb)
    jac_exp = jaccard(tae, tbe)
    dice_exp = dice(tae, tbe)
    gain_jac = jac_exp - jac_base
    gain_dice = dice_exp - dice_base
    # acronym stats
    n_acr_a = len(m_a); n_acr_b = len(m_b); n_acr_union = len(union)
    acr_overlap = len(set(m_a.keys()) & set(m_b.keys()))
    any_def = 1.0 if n_acr_union > 0 else 0.0
    return {
        'acr_any_def': float(any_def),
        'acr_cnt_a': float(n_acr_a),
        'acr_cnt_b': float(n_acr_b),
        'acr_cnt_union': float(n_acr_union),
        'acr_overlap_cnt': float(acr_overlap),
        'acr_jaccard_base': float(jac_base),
        'acr_dice_base': float(dice_base),
        'acr_jaccard_exp': float(jac_exp),
        'acr_dice_exp': float(dice_exp),
        'acr_jaccard_gain': float(gain_jac),
        'acr_dice_gain': float(gain_dice),
    }

def compute_df(df: pd.DataFrame) -> pd.DataFrame:
    A = df['anchor'].astype(str).tolist()
    B = df['target'].astype(str).tolist()
    rows = []
    for i, (a, b) in enumerate(zip(A, B)):
        rows.append(features_for_row(a, b))
        if (i+1) % 5000 == 0:
            print(f'.. {i+1} rows', flush=True)
    out = pd.DataFrame(rows)
    # simple non-linearities
    out['acr_overlap_cnt_sq'] = (out['acr_overlap_cnt'] ** 2).astype(np.float32)
    out['acr_cnt_union_log1p'] = np.log1p(out['acr_cnt_union']).astype(np.float32)
    return out.astype('float32')

tr_feats = compute_df(train)
te_feats = compute_df(test)

pd.concat([train[['id']].reset_index(drop=True), tr_feats.reset_index(drop=True)], axis=1).to_csv('oof_acronym.csv', index=False)
pd.concat([test[['id']].reset_index(drop=True), te_feats.reset_index(drop=True)], axis=1).to_csv('acronym_test.csv', index=False)
print('Saved oof_acronym.csv and acronym_test.csv; elapsed', round((time.time()-t0)/60,2), 'min')

.. 5000 rows


.. 10000 rows


.. 15000 rows


.. 20000 rows


.. 25000 rows


.. 30000 rows


Saved oof_acronym.csv and acronym_test.csv; elapsed 0.01 min


In [10]:
# Soft token alignment (local alignment over tokens with JW/Stem matches). Outputs: oof_soft_align.csv, soft_align_test.csv
import time, re, numpy as np, pandas as pd
from rapidfuzz.distance import JaroWinkler

t0 = time.time()
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Optional stemming
try:
    from nltk.stem import PorterStemmer
    _stemmer = PorterStemmer()
    def stem_token(tok: str) -> str:
        return _stemmer.stem(tok)
except Exception:
    def stem_token(tok: str) -> str:
        return tok

_word_re = re.compile(r"[a-zA-Z0-9]+(?:[-_./][a-zA-Z0-9]+)?")
def tokenize(text: str):
    if not isinstance(text, str):
        text = ''
    text = text.lower()
    toks = _word_re.findall(text)
    return toks

def jw_sim(a: str, b: str) -> float:
    return float(JaroWinkler.normalized_similarity(a, b))  # 0..1

def local_align_score(tokens_a, tokens_b, jw_thresh=0.90, match_exact=2.0, match_soft=1.2, mismatch=-0.5, gap=-0.7):
    # Smith-Waterman style local alignment on token sequences with soft matches
    na, nb = len(tokens_a), len(tokens_b)
    if na == 0 or nb == 0:
        return 0.0, 0.0, 0.0
    stems_a = [stem_token(t) for t in tokens_a]
    stems_b = [stem_token(t) for t in tokens_b]
    H = np.zeros((na+1, nb+1), dtype=np.float32)
    best = 0.0
    matches = 0
    for i in range(1, na+1):
        ta = tokens_a[i-1]; sa = stems_a[i-1]
        for j in range(1, nb+1):
            tb = tokens_b[j-1]; sb = stems_b[j-1]
            if ta == tb:
                s = match_exact
                is_match = True
            elif sa == sb or jw_sim(ta, tb) >= jw_thresh:
                s = match_soft
                is_match = True
            else:
                s = mismatch
                is_match = False
            h_diag = H[i-1, j-1] + s
            h_up = H[i-1, j] + gap
            h_left = H[i, j-1] + gap
            H[i, j] = max(0.0, h_diag, h_up, h_left)
            if H[i, j] > best:
                best = float(H[i, j])
            if is_match and H[i, j] > 0:
                matches += 1
    # Normalize scores
    norm_len = float(max(na, nb))
    norm_best = best / (match_exact * norm_len + 1e-6)
    match_ratio = matches / norm_len
    return float(best), float(norm_best), float(match_ratio)

def compute_soft_align(df: pd.DataFrame) -> pd.DataFrame:
    A = df['anchor'].astype(str).tolist()
    B = df['target'].astype(str).tolist()
    n = len(df)
    raw = np.zeros(n, dtype=np.float32)
    norm = np.zeros(n, dtype=np.float32)
    mrat = np.zeros(n, dtype=np.float32)
    for i, (a, b) in enumerate(zip(A, B)):
        ta = tokenize(a); tb = tokenize(b)
        r, z, mr = local_align_score(ta, tb)
        raw[i] = r; norm[i] = z; mrat[i] = mr
        if (i+1) % 5000 == 0:
            print(f'.. {i+1} rows', flush=True)
    out = pd.DataFrame({
        'soft_align_raw': raw,
        'soft_align_norm': norm,
        'soft_align_match_ratio': mrat,
    })
    # simple non-linearities
    out['soft_align_norm_sq'] = out['soft_align_norm'] ** 2
    out['soft_align_raw_log1p'] = np.log1p(np.maximum(out['soft_align_raw'].values, 0.0)).astype(np.float32)
    return out.astype('float32')

tr_feats = compute_soft_align(train)
te_feats = compute_soft_align(test)

pd.concat([train[['id']].reset_index(drop=True), tr_feats.reset_index(drop=True)], axis=1).to_csv('oof_soft_align.csv', index=False)
pd.concat([test[['id']].reset_index(drop=True), te_feats.reset_index(drop=True)], axis=1).to_csv('soft_align_test.csv', index=False)
print('Saved oof_soft_align.csv and soft_align_test.csv; elapsed', round((time.time()-t0)/60,2), 'min')

.. 5000 rows


.. 10000 rows


.. 15000 rows


.. 20000 rows


.. 25000 rows


.. 30000 rows


Saved oof_soft_align.csv and soft_align_test.csv; elapsed 0.02 min


In [11]:
# Unicode/confusables normalization + normalized lexical overlaps
# Outputs: oof_norm_text.csv, norm_text_test.csv
import re, unicodedata, time, numpy as np, pandas as pd

t0 = time.time()
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# -------- Normalization utilities (NFKC + confusables + split number+unit + de-hyphen) --------
REPLACEMENTS = {
    '\u00b5': 'u',   # micro sign
    '\u03bc': 'u',   # Greek mu
    '\u03a9': 'ohm', # Omega
    '\u2126': 'ohm', # Ohm symbol
    '\u00b0': 'deg', # degree symbol
}

NUM_UNIT_ATTACH = re.compile(r'(?i)(\d+(?:[\./]\d+)?)([a-zA-Z%][a-zA-Z%/]*)')

def nfkc(s: str) -> str:
    return unicodedata.normalize('NFKC', s)

def normalize_text(s: str) -> str:
    s = nfkc(str(s))
    for k, v in REPLACEMENTS.items():
        s = s.replace(k, v)
    # unify hyphen/underscore to space, collapse whitespace
    s = s.replace('-', ' ').replace('_', ' ')
    # split attached number+unit 10nm -> '10 nm'
    s = NUM_UNIT_ATTACH.sub(r'\1 \2', s)
    # lowercase
    s = s.lower()
    # collapse multiple spaces
    s = re.sub(r'\s+', ' ', s).strip()
    return s

# Optional stemming
try:
    from nltk.stem import PorterStemmer
    _stemmer = PorterStemmer()
    def stem_token(tok: str) -> str:
        return _stemmer.stem(tok)
except Exception:
    def stem_token(tok: str) -> str:
        return tok

_word_re = re.compile(r"[a-z0-9]+(?:[./][a-z0-9]+)?")
def tokenize_stems(text: str):
    if not isinstance(text, str):
        text = ''
    toks = _word_re.findall(text)
    return [stem_token(t) for t in toks if t]

def shingles(s: str, k: int):
    if k <= 0:
        return set()
    if len(s) < k:
        return {s} if s else set()
    return {s[i:i+k] for i in range(len(s)-k+1)}

def jaccard(a: set, b: set) -> float:
    if not a and not b: return 0.0
    return len(a & b) / (len(a | b) + 1e-12)

def dice(a: set, b: set) -> float:
    if not a and not b: return 0.0
    return 2.0 * len(a & b) / (len(a) + len(b) + 1e-12)

def compute_norm_feats(df: pd.DataFrame) -> pd.DataFrame:
    A = df['anchor'].astype(str).tolist()
    B = df['target'].astype(str).tolist()
    n = len(df)
    tok_jac = np.zeros(n, dtype=np.float32)
    tok_dice = np.zeros(n, dtype=np.float32)
    c3_jac = np.zeros(n, dtype=np.float32)
    c4_jac = np.zeros(n, dtype=np.float32)
    c5_jac = np.zeros(n, dtype=np.float32)
    for i, (a, b) in enumerate(zip(A, B)):
        na = normalize_text(a); nb = normalize_text(b)
        ta = set(tokenize_stems(na)); tb = set(tokenize_stems(nb))
        tok_jac[i] = jaccard(ta, tb)
        tok_dice[i] = dice(ta, tb)
        a3 = shingles(na, 3); b3 = shingles(nb, 3)
        a4 = shingles(na, 4); b4 = shingles(nb, 4)
        a5 = shingles(na, 5); b5 = shingles(nb, 5)
        c3_jac[i] = jaccard(a3, b3)
        c4_jac[i] = jaccard(a4, b4)
        c5_jac[i] = jaccard(a5, b5)
        if (i+1) % 5000 == 0:
            print(f'.. {i+1} rows', flush=True)
    out = pd.DataFrame({
        'norm_tok_jaccard': tok_jac,
        'norm_tok_dice': tok_dice,
        'norm_char3_jac': c3_jac,
        'norm_char4_jac': c4_jac,
        'norm_char5_jac': c5_jac,
    })
    # simple non-linearities
    out['norm_tok_jaccard_sq'] = out['norm_tok_jaccard'] ** 2
    out['norm_tok_dice_sq'] = out['norm_tok_dice'] ** 2
    return out.astype('float32')

tr_feats = compute_norm_feats(train)
te_feats = compute_norm_feats(test)

pd.concat([train[['id']].reset_index(drop=True), tr_feats.reset_index(drop=True)], axis=1).to_csv('oof_norm_text.csv', index=False)
pd.concat([test[['id']].reset_index(drop=True), te_feats.reset_index(drop=True)], axis=1).to_csv('norm_text_test.csv', index=False)
print('Saved oof_norm_text.csv and norm_text_test.csv; elapsed', round((time.time()-t0)/60,2), 'min')

.. 5000 rows


.. 10000 rows


.. 15000 rows


.. 20000 rows


.. 25000 rows


.. 30000 rows


Saved oof_norm_text.csv and norm_text_test.csv; elapsed 0.01 min


In [12]:
# Normalized IDF-overlap (1-3 grams), fold-safe. Outputs: oof_idf_overlap_norm.csv, idf_overlap_norm_test.csv
import time, re, math, unicodedata, numpy as np, pandas as pd
from collections import Counter

t0 = time.time()
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
folds = pd.read_csv('folds_by_id.csv')
train = train.merge(folds, on='id', how='left', validate='one_to_one')
train['fold'] = train['fold'].astype(int)

# -------- Normalization utilities (NFKC + confusables + sub/superscripts + number-unit split + de-hyphen) --------
REPL = {
    '\u00b5': 'u',   # micro sign
    '\u03bc': 'u',   # Greek mu
    '\u03a9': 'ohm', # Omega
    '\u2126': 'ohm', # Ohm symbol
    '\u00b0C': 'deg C',  # degree C
    '\u00b0F': 'deg F',  # degree F
    '\u00b0': 'deg',     # bare degree
    '\u00d7': 'x',       # multiplication sign
    '\u2032': "'",      # prime
    '\u2033': '"',      # double prime
}
_SUBS = str.maketrans('₀₁₂₃₄₅₆₇₈₉', '0123456789')
_SUPS_MAP = { '²': '2', '³': '3', '⁺': '+', '⁻': '-' }
NUM_UNIT_ATTACH = re.compile(r'(?i)(\d+(?:[\./]\d+)?)([a-zA-Z%][a-zA-Z%/]*)')

def nfkc(s: str) -> str:
    return unicodedata.normalize('NFKC', s)

def normalize_text(s: str) -> str:
    s = nfkc(str(s))
    for k, v in REPL.items():
        s = s.replace(k, v)
    for k, v in _SUPS_MAP.items():
        s = s.replace(k, v)
    s = s.translate(_SUBS)
    s = s.replace('-', ' ').replace('_', ' ')
    s = NUM_UNIT_ATTACH.sub(r'\1 \2', s)
    s = s.lower()
    s = re.sub(r'\s+', ' ', s).strip()
    return s

def tokenize_words(s: str):
    # after normalization, allow a-z0-9 and ./ inside tokens
    return re.findall(r"[a-z0-9]+(?:[./][a-z0-9]+)?", normalize_text(s))

def filter_tokens(tokens):
    # no stopword removal here; keep technical tokens; drop empty/single char
    return [t for t in tokens if len(t) > 1]

def gen_ngrams(tokens, n):
    if n == 1:
        return tokens
    return [' '.join(tokens[i:i+n]) for i in range(len(tokens)-n+1)]

def build_df(corpus_docs):
    df = Counter()
    for terms in corpus_docs:
        if terms:
            df.update(set(terms))
    return df

def idf_from_df(df_counter, N):
    idf = {}
    for t, df in df_counter.items():
        idf[t] = math.log((N - df + 0.5)/(df + 0.5) + 1.0)
    return idf

def weighted_overlap_metrics(A_terms, B_terms, idf):
    A_set, B_set = set(A_terms), set(B_terms)
    if not A_set and not B_set:
        return (0.0, 0.0, 0.0, 0.0, 0.0, 0, 0)
    inter = A_set & B_set
    union = A_set | B_set
    w_inter = sum(idf.get(t, 0.0) for t in inter)
    w_A = sum(idf.get(t, 0.0) for t in A_set) + 1e-12
    w_B = sum(idf.get(t, 0.0) for t in B_set) + 1e-12
    w_union = sum(idf.get(t, 0.0) for t in union) + 1e-12
    prec = w_inter / w_A
    rec = w_inter / w_B
    f1 = 0.0 if (prec+rec) == 0 else (2*prec*rec)/(prec+rec)
    jac = w_inter / w_union
    return (prec, rec, f1, jac, w_inter, len(A_set), len(B_set))

# Pre-tokenize normalized 1-3 grams per row
def make_ng123(tokens_list):
    uni = [gen_ngrams(t,1) for t in tokens_list]
    bi  = [gen_ngrams(t,2) for t in tokens_list]
    tri = [gen_ngrams(t,3) for t in tokens_list]
    return uni, bi, tri

A_tr_tok = [filter_tokens(tokenize_words(x)) for x in train['anchor'].astype(str).tolist()]
B_tr_tok = [filter_tokens(tokenize_words(x)) for x in train['target'].astype(str).tolist()]
A_te_tok = [filter_tokens(tokenize_words(x)) for x in test['anchor'].astype(str).tolist()]
B_te_tok = [filter_tokens(tokenize_words(x)) for x in test['target'].astype(str).tolist()]

A_tr_u, A_tr_b, A_tr_t = make_ng123(A_tr_tok)
B_tr_u, B_tr_b, B_tr_t = make_ng123(B_tr_tok)
A_te_u, A_te_b, A_te_t = make_ng123(A_te_tok)
B_te_u, B_te_b, B_te_t = make_ng123(B_te_tok)

fold_arr = train['fold'].values.astype(int)
n_tr = len(train); n_te = len(test)

cols = [
    'nidf1_prec','nidf1_rec','nidf1_f1','nidf1_jac','nidf1_wi',
    'nidf2_prec','nidf2_rec','nidf2_f1','nidf2_jac','nidf2_wi',
    'nidf3_prec','nidf3_rec','nidf3_f1','nidf3_jac','nidf3_wi'
]
oof = np.zeros((n_tr, len(cols)), dtype=np.float32)
te_fold_preds = []

for f in sorted(np.unique(fold_arr)):
    f0 = time.time()
    tr_idx = np.where(fold_arr != f)[0]
    va_idx = np.where(fold_arr == f)[0]
    # Build corpus IDF on train-only for each n-gram level
    corpus_uni_docs = [set(A_tr_u[i]) for i in tr_idx] + [set(B_tr_u[i]) for i in tr_idx]
    corpus_bi_docs  = [set(A_tr_b[i]) for i in tr_idx] + [set(B_tr_b[i]) for i in tr_idx]
    corpus_tri_docs = [set(A_tr_t[i]) for i in tr_idx] + [set(B_tr_t[i]) for i in tr_idx]
    idf_uni = idf_from_df(build_df(corpus_uni_docs), len(corpus_uni_docs))
    idf_bi  = idf_from_df(build_df(corpus_bi_docs),  len(corpus_bi_docs))
    idf_tri = idf_from_df(build_df(corpus_tri_docs), len(corpus_tri_docs))

    # OOF for this fold
    for i in va_idx:
        p1, r1, f1, j1, wi1, _, _ = weighted_overlap_metrics(A_tr_u[i], B_tr_u[i], idf_uni)
        p2, r2, f2, j2, wi2, _, _ = weighted_overlap_metrics(A_tr_b[i], B_tr_b[i], idf_bi)
        p3, r3, f3, j3, wi3, _, _ = weighted_overlap_metrics(A_tr_t[i], B_tr_t[i], idf_tri)
        oof[i, :] = [p1, r1, f1, j1, wi1, p2, r2, f2, j2, wi2, p3, r3, f3, j3, wi3]

    # Test for this fold
    te_mat = np.zeros((n_te, len(cols)), dtype=np.float32)
    for j in range(n_te):
        p1, r1, f1, j1, wi1, _, _ = weighted_overlap_metrics(A_te_u[j], B_te_u[j], idf_uni)
        p2, r2, f2, j2, wi2, _, _ = weighted_overlap_metrics(A_te_b[j], B_te_b[j], idf_bi)
        p3, r3, f3, j3, wi3, _, _ = weighted_overlap_metrics(A_te_t[j], B_te_t[j], idf_tri)
        te_mat[j, :] = [p1, r1, f1, j1, wi1, p2, r2, f2, j2, wi2, p3, r3, f3, j3, wi3]
    te_fold_preds.append(te_mat)
    print(f'Normalized IDF-overlap fold {int(f)} done in {time.time()-f0:.1f}s', flush=True)

# Aggregate test across folds (mean)
te_mean = np.mean(np.stack(te_fold_preds, axis=0), axis=0).astype(np.float32)

# Save
oof_df = pd.DataFrame({'id': train['id']})
for k, c in enumerate(cols):
    oof_df[c] = oof[:, k]
oof_df.to_csv('oof_idf_overlap_norm.csv', index=False)

te_df = pd.DataFrame({'id': test['id']})
for k, c in enumerate(cols):
    te_df[c] = te_mean[:, k]
te_df.to_csv('idf_overlap_norm_test.csv', index=False)

print('Saved oof_idf_overlap_norm.csv and idf_overlap_norm_test.csv; elapsed', round((time.time()-t0)/60,2), 'min', flush=True)

Normalized IDF-overlap fold 0 done in 0.3s


Normalized IDF-overlap fold 1 done in 0.2s


Normalized IDF-overlap fold 2 done in 0.3s


Normalized IDF-overlap fold 3 done in 0.3s


Normalized IDF-overlap fold 4 done in 0.3s


Saved oof_idf_overlap_norm.csv and idf_overlap_norm_test.csv; elapsed 0.04 min


In [13]:
# Fold-safe target encoding with shrinkage (m=10): TE_anchor, TE_anchor_ctx3, TE_target_ctx3 (+ counts)
# Outputs: oof_te.csv, te_test.csv
import time, numpy as np, pandas as pd

t0 = time.time()
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
folds = pd.read_csv('folds_by_id.csv')
train = train.merge(folds, on='id', how='left', validate='one_to_one')
train['fold'] = train['fold'].astype(int)
NUM_FOLDS = int(train['fold'].max()) + 1

# Keys
anch = train['anchor'].astype(str).values
targ = train['target'].astype(str).values
ctx3_tr = train['context'].astype(str).str[:3].values
ctx3_te = test['context'].astype(str).str[:3].values
anch_te = test['anchor'].astype(str).values
targ_te = test['target'].astype(str).values
y = train['score'].astype(np.float32).values

# Outputs
oof_te_anchor = np.zeros(len(train), dtype=np.float32)
oof_te_anchor_ctx3 = np.zeros(len(train), dtype=np.float32)
oof_te_target_ctx3 = np.zeros(len(train), dtype=np.float32)
oof_cnt_anchor = np.zeros(len(train), dtype=np.float32)
oof_cnt_anchor_ctx3 = np.zeros(len(train), dtype=np.float32)
oof_cnt_target_ctx3 = np.zeros(len(train), dtype=np.float32)

te_te_anchor_acc = np.zeros(len(test), dtype=np.float64)
te_te_anchor_ctx3_acc = np.zeros(len(test), dtype=np.float64)
te_te_target_ctx3_acc = np.zeros(len(test), dtype=np.float64)
te_cnt_anchor_acc = np.zeros(len(test), dtype=np.float64)
te_cnt_anchor_ctx3_acc = np.zeros(len(test), dtype=np.float64)
te_cnt_target_ctx3_acc = np.zeros(len(test), dtype=np.float64)

m_anchor = 10.0
m_ctx = 10.0

for f in range(NUM_FOLDS):
    f0 = time.time()
    tr_idx = np.where(train['fold'].values != f)[0]
    va_idx = np.where(train['fold'].values == f)[0]
    # Global mean
    gmean = float(y[tr_idx].mean()) if len(tr_idx) else float(y.mean())
    # Group stats on train-only
    df_tr = pd.DataFrame({
        'anchor': anch[tr_idx],
        'target': targ[tr_idx],
        'ctx3': ctx3_tr[tr_idx],
        'y': y[tr_idx],
    })
    # anchor only
    grp_a = df_tr.groupby('anchor')['y']
    mean_a = grp_a.mean().to_dict()
    cnt_a = grp_a.size().to_dict()
    # anchor+ctx3
    df_tr['a_c'] = df_tr['anchor'] + '||' + df_tr['ctx3']
    grp_ac = df_tr.groupby('a_c')['y']
    mean_ac = grp_ac.mean().to_dict()
    cnt_ac = grp_ac.size().to_dict()
    # target+ctx3
    df_tr['t_c'] = df_tr['target'] + '||' + df_tr['ctx3']
    grp_tc = df_tr.groupby('t_c')['y']
    mean_tc = grp_tc.mean().to_dict()
    cnt_tc = grp_tc.size().to_dict()

    # Assign to val with shrinkage enc = (sum + m*gmean)/(cnt + m)
    for idx in va_idx:
        a = anch[idx]
        c3 = ctx3_tr[idx]
        t = targ[idx]
        key_ac = a + '||' + c3
        key_tc = t + '||' + c3
        ca = float(cnt_a.get(a, 0.0)); ma = float(mean_a.get(a, gmean))
        cac = float(cnt_ac.get(key_ac, 0.0)); mac = float(mean_ac.get(key_ac, gmean))
        ctc = float(cnt_tc.get(key_tc, 0.0)); mtc = float(mean_tc.get(key_tc, gmean))
        oof_cnt_anchor[idx] = ca
        oof_cnt_anchor_ctx3[idx] = cac
        oof_cnt_target_ctx3[idx] = ctc
        # back out sum = mean * cnt
        enc_a = ((ma * ca) + m_anchor * gmean) / (ca + m_anchor) if (ca + m_anchor) > 0 else gmean
        enc_ac = ((mac * cac) + m_ctx * gmean) / (cac + m_ctx) if (cac + m_ctx) > 0 else gmean
        enc_tc = ((mtc * ctc) + m_ctx * gmean) / (ctc + m_ctx) if (ctc + m_ctx) > 0 else gmean
        oof_te_anchor[idx] = enc_a
        oof_te_anchor_ctx3[idx] = enc_ac
        oof_te_target_ctx3[idx] = enc_tc

    # Test encodings using same train-only stats; accumulate to average across folds
    gmean_te = gmean
    # anchor only
    ca_te = np.array([float(cnt_a.get(a, 0.0)) for a in anch_te], dtype=np.float64)
    ma_te = np.array([float(mean_a.get(a, gmean_te)) for a in anch_te], dtype=np.float64)
    te_cnt_anchor_acc += ca_te
    te_te_anchor_acc += (((ma_te * ca_te) + m_anchor * gmean_te) / (ca_te + m_anchor + 1e-12))
    # anchor+ctx3
    keys_ac_te = (pd.Series(anch_te) + '||' + pd.Series(ctx3_te)).tolist()
    cac_te = np.array([float(cnt_ac.get(k, 0.0)) for k in keys_ac_te], dtype=np.float64)
    mac_te = np.array([float(mean_ac.get(k, gmean_te)) for k in keys_ac_te], dtype=np.float64)
    te_cnt_anchor_ctx3_acc += cac_te
    te_te_anchor_ctx3_acc += (((mac_te * cac_te) + m_ctx * gmean_te) / (cac_te + m_ctx + 1e-12))
    # target+ctx3
    keys_tc_te = (pd.Series(targ_te) + '||' + pd.Series(ctx3_te)).tolist()
    ctc_te = np.array([float(cnt_tc.get(k, 0.0)) for k in keys_tc_te], dtype=np.float64)
    mtc_te = np.array([float(mean_tc.get(k, gmean_te)) for k in keys_tc_te], dtype=np.float64)
    te_cnt_target_ctx3_acc += ctc_te
    te_te_target_ctx3_acc += (((mtc_te * ctc_te) + m_ctx * gmean_te) / (ctc_te + m_ctx + 1e-12))

    print(f'TE fold {f} done in {time.time()-f0:.1f}s', flush=True)

# Average test across folds
te_anchor = (te_te_anchor_acc / NUM_FOLDS).astype(np.float32)
te_anchor_ctx3 = (te_te_anchor_ctx3_acc / NUM_FOLDS).astype(np.float32)
te_target_ctx3 = (te_te_target_ctx3_acc / NUM_FOLDS).astype(np.float32)
te_cnt_a = (te_cnt_anchor_acc / NUM_FOLDS).astype(np.float32)
te_cnt_ac = (te_cnt_anchor_ctx3_acc / NUM_FOLDS).astype(np.float32)
te_cnt_tc = (te_cnt_target_ctx3_acc / NUM_FOLDS).astype(np.float32)

# Save
oof_df = pd.DataFrame({
    'id': train['id'],
    'te_anchor': oof_te_anchor,
    'te_anchor_ctx3': oof_te_anchor_ctx3,
    'te_target_ctx3': oof_te_target_ctx3,
    'te_cnt_anchor': oof_cnt_anchor,
    'te_cnt_anchor_ctx3': oof_cnt_anchor_ctx3,
    'te_cnt_target_ctx3': oof_cnt_target_ctx3,
})
oof_df.to_csv('oof_te.csv', index=False)

te_df = pd.DataFrame({
    'id': test['id'],
    'te_anchor': te_anchor,
    'te_anchor_ctx3': te_anchor_ctx3,
    'te_target_ctx3': te_target_ctx3,
    'te_cnt_anchor': te_cnt_a,
    'te_cnt_anchor_ctx3': te_cnt_ac,
    'te_cnt_target_ctx3': te_cnt_tc,
})
te_df.to_csv('te_test.csv', index=False)
print('Saved oof_te.csv and te_test.csv; elapsed', round((time.time()-t0)/60,2), 'min', flush=True)

TE fold 0 done in 0.0s


TE fold 1 done in 0.0s


TE fold 2 done in 0.0s


TE fold 3 done in 0.0s


TE fold 4 done in 0.0s


Saved oof_te.csv and te_test.csv; elapsed 0.01 min


In [14]:
# Monge–Elkan over normalized tokens with Jaro–Winkler base
# Outputs: oof_monge.csv, monge_test.csv
import re, unicodedata, time, numpy as np, pandas as pd
from rapidfuzz.distance import JaroWinkler

t0 = time.time()
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Normalization (same spirit as prior sprints)
REPL = {
    '\u00b5': 'u',
    '\u03bc': 'u',
    '\u03a9': 'ohm',
    '\u2126': 'ohm',
    '\u00b0C': 'deg C',
    '\u00b0F': 'deg F',
    '\u00b0': 'deg',
    '\u00d7': 'x',
    '\u2032': "'",
    '\u2033': '"',
}
_SUBS = str.maketrans('₀₁₂₃₄₅₆₇₈₉', '0123456789')
_SUPS_MAP = { '²': '2', '³': '3', '⁺': '+', '⁻': '-' }
NUM_UNIT_ATTACH = re.compile(r'(?i)(\d+(?:[\./]\d+)?)([a-zA-Z%][a-zA-Z%/]*)')

def nfkc(s: str) -> str:
    return unicodedata.normalize('NFKC', s)

def normalize_text(s: str) -> str:
    s = nfkc(str(s))
    for k, v in REPL.items():
        s = s.replace(k, v)
    for k, v in _SUPS_MAP.items():
        s = s.replace(k, v)
    s = s.translate(_SUBS)
    s = s.replace('-', ' ').replace('_', ' ')
    s = NUM_UNIT_ATTACH.sub(r'\1 \2', s)
    s = s.lower()
    s = re.sub(r'\s+', ' ', s).strip()
    return s

_word_re = re.compile(r"[a-z0-9]+(?:[./][a-z0-9]+)?")
def tokenize(text: str):
    if not isinstance(text, str):
        text = ''
    text = normalize_text(text)
    return _word_re.findall(text)

def jw_sim(a: str, b: str) -> float:
    return float(JaroWinkler.normalized_similarity(a, b))  # 0..1

def monge_elkan(A: list[str], B: list[str], jw_thresh: float = 0.0):
    # Asymmetric Monge–Elkan: for each token in A, take max JW to any token in B, then mean
    if not A:
        return 0.0
    if not B:
        return 0.0
    m = 0.0
    for ta in A:
        best = 0.0
        for tb in B:
            s = jw_sim(ta, tb)
            if s > best:
                best = s
        if best >= jw_thresh:
            m += best
    return float(m / max(len(A), 1))

def compute_df(df: pd.DataFrame) -> pd.DataFrame:
    A = df['anchor'].astype(str).tolist()
    B = df['target'].astype(str).tolist()
    n = len(df)
    me_ab = np.zeros(n, dtype=np.float32)
    me_ba = np.zeros(n, dtype=np.float32)
    me_sym_mean = np.zeros(n, dtype=np.float32)
    me_sym_max = np.zeros(n, dtype=np.float32)
    me_sym_min = np.zeros(n, dtype=np.float32)
    for i, (a, b) in enumerate(zip(A, B)):
        ta = tokenize(a); tb = tokenize(b)
        s_ab = monge_elkan(ta, tb, jw_thresh=0.0)
        s_ba = monge_elkan(tb, ta, jw_thresh=0.0)
        me_ab[i] = s_ab
        me_ba[i] = s_ba
        me_sym_mean[i] = 0.5 * (s_ab + s_ba)
        me_sym_max[i] = max(s_ab, s_ba)
        me_sym_min[i] = min(s_ab, s_ba)
        if (i+1) % 5000 == 0:
            print(f'.. {i+1} rows', flush=True)
    out = pd.DataFrame({
        'me_jw_ab': me_ab,
        'me_jw_ba': me_ba,
        'me_jw_mean': me_sym_mean,
        'me_jw_max': me_sym_max,
        'me_jw_min': me_sym_min,
    })
    # simple non-linearities
    out['me_jw_mean_sq'] = out['me_jw_mean'] ** 2
    return out.astype('float32')

tr_feats = compute_df(train)
te_feats = compute_df(test)

pd.concat([train[['id']].reset_index(drop=True), tr_feats.reset_index(drop=True)], axis=1).to_csv('oof_monge.csv', index=False)
pd.concat([test[['id']].reset_index(drop=True), te_feats.reset_index(drop=True)], axis=1).to_csv('monge_test.csv', index=False)
print('Saved oof_monge.csv and monge_test.csv; elapsed', round((time.time()-t0)/60,2), 'min', flush=True)

.. 5000 rows


.. 10000 rows


.. 15000 rows


.. 20000 rows


.. 25000 rows


.. 30000 rows


Saved oof_monge.csv and monge_test.csv; elapsed 0.01 min


In [15]:
# Char 3-gram TF-IDF cosine similarity (fold-safe). Outputs: oof_char3_tfidf_cos.csv, char3_tfidf_cos_test.csv
import time, re, unicodedata, numpy as np, pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

t0 = time.time()
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
folds = pd.read_csv('folds_by_id.csv')
train = train.merge(folds, on='id', how='left', validate='one_to_one')
train['fold'] = train['fold'].astype(int)
NUM_FOLDS = int(train['fold'].max()) + 1

def nfkc(s: str) -> str:
    return unicodedata.normalize('NFKC', s)

def normalize_text(s: str) -> str:
    s = nfkc(str(s)).lower()
    s = s.replace('-', ' ').replace('_', ' ')
    s = re.sub(r'\s+', ' ', s).strip()
    return s

A_tr = train['anchor'].astype(str).apply(normalize_text).tolist()
B_tr = train['target'].astype(str).apply(normalize_text).tolist()
A_te = test['anchor'].astype(str).apply(normalize_text).tolist()
B_te = test['target'].astype(str).apply(normalize_text).tolist()

n_tr = len(train); n_te = len(test)
oof = np.zeros(n_tr, dtype=np.float32)
te_acc = np.zeros(n_te, dtype=np.float64)

def rowwise_cosine(X, Y):
    # X, Y are sparse matrices with same shape and aligned rows
    # cosine = (x·y) / (||x|| ||y||)
    num = (X.multiply(Y)).sum(axis=1).A1.astype(np.float64)
    x2 = X.multiply(X).sum(axis=1).A1.astype(np.float64)
    y2 = Y.multiply(Y).sum(axis=1).A1.astype(np.float64)
    den = np.sqrt(np.maximum(x2, 1e-12)) * np.sqrt(np.maximum(y2, 1e-12))
    c = num / np.maximum(den, 1e-12)
    # Clamp to [0,1] numeric safety
    c = np.clip(c, 0.0, 1.0)
    return c.astype(np.float32)

for f in range(NUM_FOLDS):
    f0 = time.time()
    tr_idx = np.where(train['fold'].values != f)[0]
    va_idx = np.where(train['fold'].values == f)[0]
    vec = TfidfVectorizer(analyzer='char', ngram_range=(3,3), min_df=3)
    corpus = [A_tr[i] for i in tr_idx] + [B_tr[i] for i in tr_idx]
    V = vec.fit_transform(corpus)
    Va = vec.transform([A_tr[i] for i in va_idx])
    Vb = vec.transform([B_tr[i] for i in va_idx])
    oof[va_idx] = rowwise_cosine(Va, Vb)
    # Test for this fold
    Ta = vec.transform(A_te)
    Tb = vec.transform(B_te)
    te_acc += rowwise_cosine(Ta, Tb).astype(np.float64)
    print(f'char3 tfidf fold {f} done in {time.time()-f0:.1f}s', flush=True)

te_mean = (te_acc / NUM_FOLDS).astype(np.float32)

pd.DataFrame({'id': train['id'], 'char3_tfidf_cos': oof}).to_csv('oof_char3_tfidf_cos.csv', index=False)
pd.DataFrame({'id': test['id'], 'char3_tfidf_cos': te_mean}).to_csv('char3_tfidf_cos_test.csv', index=False)
print('Saved oof_char3_tfidf_cos.csv and char3_tfidf_cos_test.csv; elapsed', round((time.time()-t0)/60,2), 'min', flush=True)

char3 tfidf fold 0 done in 0.4s


char3 tfidf fold 1 done in 0.4s


char3 tfidf fold 2 done in 0.4s


char3 tfidf fold 3 done in 0.4s


char3 tfidf fold 4 done in 0.4s


Saved oof_char3_tfidf_cos.csv and char3_tfidf_cos_test.csv; elapsed 0.03 min


In [16]:
# Fold-safe transforms (iso, z, rank) for embedding single-column OOFs: mpnet_st, e5_asym, bge
import numpy as np, pandas as pd
from sklearn.isotonic import IsotonicRegression
from pathlib import Path

train = pd.read_csv('train.csv')
test  = pd.read_csv('test.csv')
folds = pd.read_csv('folds_by_id.csv')
train = train.merge(folds, on='id', how='left', validate='one_to_one')
fold_arr = train['fold'].values.astype(int)
NUM_FOLDS = int(train['fold'].max()) + 1
y = train['score'].astype(np.float32).values

def load_single(oof_path, sub_path):
    if (not Path(oof_path).exists()) or (not Path(sub_path).exists()):
        return None, None
    oof = pd.read_csv(oof_path); sub = pd.read_csv(sub_path)
    oof_cols = [c for c in oof.columns if c != 'id']
    sub_cols = [c for c in sub.columns if c != 'id']
    if not oof_cols or not sub_cols:
        return None, None
    oc = oof_cols[-1]; sc = sub_cols[-1]
    o = train[['id']].merge(oof[['id', oc]], on='id', how='left')[oc].astype(np.float32).values
    t = test[['id']].merge(sub[['id', sc]], on='id', how='left')[sc].astype(np.float32).values
    return o, t

cands = [
    ('mpnet', 'oof_mpnet_st.csv', 'submission_mpnet_st.csv'),
    ('e5',    'oof_e5_asym.csv',   'submission_e5_asym.csv'),
    ('bge',   'oof_bge.csv',       'submission_bge.csv'),
]

out_oof = {'id': train['id'].values}
out_te  = {'id': test['id'].values}

for tag, oofp, subp in cands:
    o, t = load_single(oofp, subp)
    if o is None:
        print(f'Skip {tag}: missing {oofp} or {subp}', flush=True)
        continue
    print(f'Embedding transforms for {tag}: source {oofp}, {subp}', flush=True)
    o_iso  = np.zeros(len(train), dtype=np.float32)
    o_z    = np.zeros(len(train), dtype=np.float32)
    o_rank = np.zeros(len(train), dtype=np.float32)
    t_iso_acc  = np.zeros(len(test), dtype=np.float64)
    t_z_acc    = np.zeros(len(test), dtype=np.float64)
    t_rank_acc = np.zeros(len(test), dtype=np.float64)
    for f in range(NUM_FOLDS):
        tr = fold_arr != f
        va = fold_arr == f
        # isotonic
        iso = IsotonicRegression(increasing=True, out_of_bounds='clip')
        iso.fit(o[tr], y[tr])
        o_iso[va] = iso.transform(o[va]).astype(np.float32)
        t_iso_acc += iso.transform(t).astype(np.float64)
        # z-score
        mu = float(o[tr].mean()); sd = float(o[tr].std()) or 1.0
        o_z[va] = (o[va] - mu) / sd
        t_z_acc += (t - mu) / sd
        # rank
        ref = np.sort(o[tr].astype(np.float32))
        if ref.size > 0:
            j_va = np.searchsorted(ref, o[va], side='right')
            o_rank[va] = j_va / max(ref.size - 1, 1)
            j_te = np.searchsorted(ref, t, side='right')
            t_rank_acc += (j_te / max(ref.size - 1, 1))
    out_oof[f'{tag}_z'] = o_z.astype(np.float32)
    out_oof[f'{tag}_rank'] = o_rank.astype(np.float32)
    out_oof[f'{tag}_iso'] = o_iso.astype(np.float32)
    out_te[f'{tag}_z'] = (t_z_acc / NUM_FOLDS).astype(np.float32)
    out_te[f'{tag}_rank'] = (t_rank_acc / NUM_FOLDS).astype(np.float32)
    out_te[f'{tag}_iso'] = (t_iso_acc / NUM_FOLDS).astype(np.float32)

pd.DataFrame(out_oof).to_csv('oof_embed_transforms.csv', index=False)
pd.DataFrame(out_te).to_csv('embed_transforms_test.csv', index=False)
print('Saved oof_embed_transforms.csv and embed_transforms_test.csv')

Embedding transforms for mpnet: source oof_mpnet_st.csv, submission_mpnet_st.csv


Embedding transforms for e5: source oof_e5_asym.csv, submission_e5_asym.csv


Embedding transforms for bge: source oof_bge.csv, submission_bge.csv


Saved oof_embed_transforms.csv and embed_transforms_test.csv


In [17]:
# PatentSBERTa cosine + fold-safe transforms (raw, iso, z, rank)
import numpy as np, pandas as pd, time
from pathlib import Path
from sklearn.isotonic import IsotonicRegression
from sentence_transformers import SentenceTransformer

t0 = time.time()
train = pd.read_csv('train.csv')
test  = pd.read_csv('test.csv')
folds = pd.read_csv('folds_by_id.csv')
train = train.merge(folds, on='id', how='left', validate='one_to_one')
train['fold'] = train['fold'].astype(int)
F = int(train['fold'].max()) + 1
y = train['score'].astype(np.float32).values

MODEL = 'AI-Growth-Lab/PatentSBERTa'
print('Loading SentenceTransformer:', MODEL, 'on CPU...', flush=True)
st = SentenceTransformer(MODEL, device='cpu')

def enc(texts, bs=128):
    return st.encode(texts, batch_size=bs, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=False).astype(np.float32)

# Precompute embeddings for all rows once
a_tr = enc(train['anchor'].astype(str).tolist())
b_tr = enc(train['target'].astype(str).tolist())
a_te = enc(test['anchor'].astype(str).tolist())
b_te = enc(test['target'].astype(str).tolist())

# Cosine since normalized -> dot product
raw_tr = (a_tr * b_tr).sum(axis=1).astype(np.float32)
raw_te = (a_te * b_te).sum(axis=1).astype(np.float32)

# Fold-safe iso/z/rank
oof_raw  = np.zeros(len(train), dtype=np.float32)
oof_iso  = np.zeros(len(train), dtype=np.float32)
oof_z    = np.zeros(len(train), dtype=np.float32)
oof_rank = np.zeros(len(train), dtype=np.float32)
te_iso_acc  = np.zeros(len(test), dtype=np.float64)
te_z_acc    = np.zeros(len(test), dtype=np.float64)
te_rank_acc = np.zeros(len(test), dtype=np.float64)

fold_arr = train['fold'].values.astype(int)
for f in range(F):
    tr = fold_arr != f; va = fold_arr == f
    # assign raw directly (no fit needed)
    oof_raw[va] = raw_tr[va]
    # isotonic
    iso = IsotonicRegression(increasing=True, out_of_bounds='clip')
    iso.fit(raw_tr[tr], y[tr])
    oof_iso[va] = iso.transform(raw_tr[va]).astype(np.float32)
    te_iso_acc += iso.transform(raw_te).astype(np.float64)
    # z
    mu = float(raw_tr[tr].mean()); sd = float(raw_tr[tr].std()) or 1.0
    oof_z[va] = (raw_tr[va] - mu) / sd
    te_z_acc += (raw_te - mu) / sd
    # rank
    ref = np.sort(raw_tr[tr].astype(np.float32))
    if ref.size > 0:
        j_va = np.searchsorted(ref, raw_tr[va], side='right')
        oof_rank[va] = j_va / max(ref.size - 1, 1)
        j_te = np.searchsorted(ref, raw_te, side='right')
        te_rank_acc += (j_te / max(ref.size - 1, 1))

te_iso  = (te_iso_acc / F).astype(np.float32)
te_z    = (te_z_acc / F).astype(np.float32)
te_rank = (te_rank_acc / F).astype(np.float32)

# Save
pd.DataFrame({
    'id': train['id'],
    'patentsberta_raw': oof_raw,
    'patentsberta_iso': oof_iso,
    'patentsberta_z': oof_z,
    'patentsberta_rank': oof_rank,
}).to_csv('oof_patentsberta.csv', index=False)
pd.DataFrame({
    'id': test['id'],
    'patentsberta_raw': raw_te.astype(np.float32),
    'patentsberta_iso': te_iso,
    'patentsberta_z': te_z,
    'patentsberta_rank': te_rank,
}).to_csv('patentsberta_test.csv', index=False)
print('Saved oof_patentsberta.csv and patentsberta_test.csv; elapsed', round((time.time()-t0)/60,2), 'min', flush=True)

  from .autonotebook import tqdm as notebook_tqdm


Loading SentenceTransformer: AI-Growth-Lab/PatentSBERTa on CPU...




Saved oof_patentsberta.csv and patentsberta_test.csv; elapsed 1.59 min


In [18]:
# Char 4/5-gram TF-IDF cosine similarities (fold-safe). Outputs: oof_char45_tfidf_cos.csv, char45_tfidf_cos_test.csv
import time, re, unicodedata, numpy as np, pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

t0 = time.time()
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
folds = pd.read_csv('folds_by_id.csv')
train = train.merge(folds, on='id', how='left', validate='one_to_one')
train['fold'] = train['fold'].astype(int)
NUM_FOLDS = int(train['fold'].max()) + 1

def nfkc(s: str) -> str:
    return unicodedata.normalize('NFKC', s)

def normalize_text(s: str) -> str:
    s = nfkc(str(s)).lower()
    s = s.replace('-', ' ').replace('_', ' ')
    s = re.sub(r'\s+', ' ', s).strip()
    return s

A_tr = train['anchor'].astype(str).apply(normalize_text).tolist()
B_tr = train['target'].astype(str).apply(normalize_text).tolist()
A_te = test['anchor'].astype(str).apply(normalize_text).tolist()
B_te = test['target'].astype(str).apply(normalize_text).tolist()

n_tr = len(train); n_te = len(test)
oof4 = np.zeros(n_tr, dtype=np.float32); oof5 = np.zeros(n_tr, dtype=np.float32)
te4_acc = np.zeros(n_te, dtype=np.float64); te5_acc = np.zeros(n_te, dtype=np.float64)

def rowwise_cosine(X, Y):
    num = (X.multiply(Y)).sum(axis=1).A1.astype(np.float64)
    x2 = X.multiply(X).sum(axis=1).A1.astype(np.float64)
    y2 = Y.multiply(Y).sum(axis=1).A1.astype(np.float64)
    den = np.sqrt(np.maximum(x2, 1e-12)) * np.sqrt(np.maximum(y2, 1e-12))
    c = num / np.maximum(den, 1e-12)
    return np.clip(c, 0.0, 1.0).astype(np.float32)

for f in range(NUM_FOLDS):
    f0 = time.time()
    tr_idx = np.where(train['fold'].values != f)[0]
    va_idx = np.where(train['fold'].values == f)[0]
    # 4-gram
    vec4 = TfidfVectorizer(analyzer='char', ngram_range=(4,4), min_df=3)
    corp4 = [A_tr[i] for i in tr_idx] + [B_tr[i] for i in tr_idx]
    V4 = vec4.fit_transform(corp4)
    Va4 = vec4.transform([A_tr[i] for i in va_idx]); Vb4 = vec4.transform([B_tr[i] for i in va_idx])
    oof4[va_idx] = rowwise_cosine(Va4, Vb4)
    Ta4 = vec4.transform(A_te); Tb4 = vec4.transform(B_te)
    te4_acc += rowwise_cosine(Ta4, Tb4).astype(np.float64)
    # 5-gram
    vec5 = TfidfVectorizer(analyzer='char', ngram_range=(5,5), min_df=3)
    corp5 = corp4  # reuse
    V5 = vec5.fit_transform(corp5)
    Va5 = vec5.transform([A_tr[i] for i in va_idx]); Vb5 = vec5.transform([B_tr[i] for i in va_idx])
    oof5[va_idx] = rowwise_cosine(Va5, Vb5)
    Ta5 = vec5.transform(A_te); Tb5 = vec5.transform(B_te)
    te5_acc += rowwise_cosine(Ta5, Tb5).astype(np.float64)
    print(f'char45 tfidf fold {f} done in {time.time()-f0:.1f}s', flush=True)

te4 = (te4_acc / NUM_FOLDS).astype(np.float32)
te5 = (te5_acc / NUM_FOLDS).astype(np.float32)

pd.DataFrame({'id': train['id'], 'char4_tfidf_cos': oof4, 'char5_tfidf_cos': oof5}).to_csv('oof_char45_tfidf_cos.csv', index=False)
pd.DataFrame({'id': test['id'], 'char4_tfidf_cos': te4, 'char5_tfidf_cos': te5}).to_csv('char45_tfidf_cos_test.csv', index=False)
print('Saved oof_char45_tfidf_cos.csv and char45_tfidf_cos_test.csv; elapsed', round((time.time()-t0)/60,2), 'min', flush=True)

char45 tfidf fold 0 done in 0.7s


char45 tfidf fold 1 done in 0.7s


char45 tfidf fold 2 done in 0.7s


char45 tfidf fold 3 done in 0.7s


char45 tfidf fold 4 done in 0.7s


Saved oof_char45_tfidf_cos.csv and char45_tfidf_cos_test.csv; elapsed 0.06 min


In [19]:
# anferico/bert-for-patents cosine + fold-safe transforms (raw, iso, z, rank)
import numpy as np, pandas as pd, time
from pathlib import Path
from sklearn.isotonic import IsotonicRegression
from sentence_transformers import SentenceTransformer

t0 = time.time()
train = pd.read_csv('train.csv')
test  = pd.read_csv('test.csv')
folds = pd.read_csv('folds_by_id.csv')
train = train.merge(folds, on='id', how='left', validate='one_to_one')
train['fold'] = train['fold'].astype(int)
F = int(train['fold'].max()) + 1
y = train['score'].astype(np.float32).values

MODEL = 'anferico/bert-for-patents'
print('Loading SentenceTransformer:', MODEL, 'on CPU...', flush=True)
st = SentenceTransformer(MODEL, device='cpu')

def enc(texts, bs=128):
    return st.encode(texts, batch_size=bs, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=False).astype(np.float32)

# Precompute embeddings for all rows once
a_tr = enc(train['anchor'].astype(str).tolist())
b_tr = enc(train['target'].astype(str).tolist())
a_te = enc(test['anchor'].astype(str).tolist())
b_te = enc(test['target'].astype(str).tolist())

# Cosine since normalized -> dot product
raw_tr = (a_tr * b_tr).sum(axis=1).astype(np.float32)
raw_te = (a_te * b_te).sum(axis=1).astype(np.float32)

# Fold-safe iso/z/rank
oof_raw  = np.zeros(len(train), dtype=np.float32)
oof_iso  = np.zeros(len(train), dtype=np.float32)
oof_z    = np.zeros(len(train), dtype=np.float32)
oof_rank = np.zeros(len(train), dtype=np.float32)
te_iso_acc  = np.zeros(len(test), dtype=np.float64)
te_z_acc    = np.zeros(len(test), dtype=np.float64)
te_rank_acc = np.zeros(len(test), dtype=np.float64)

fold_arr = train['fold'].values.astype(int)
for f in range(F):
    tr = fold_arr != f; va = fold_arr == f
    oof_raw[va] = raw_tr[va]
    iso = IsotonicRegression(increasing=True, out_of_bounds='clip')
    iso.fit(raw_tr[tr], y[tr])
    oof_iso[va] = iso.transform(raw_tr[va]).astype(np.float32)
    te_iso_acc += iso.transform(raw_te).astype(np.float64)
    mu = float(raw_tr[tr].mean()); sd = float(raw_tr[tr].std()) or 1.0
    oof_z[va] = (raw_tr[va] - mu) / sd
    te_z_acc += (raw_te - mu) / sd
    ref = np.sort(raw_tr[tr].astype(np.float32))
    if ref.size > 0:
        j_va = np.searchsorted(ref, raw_tr[va], side='right')
        oof_rank[va] = j_va / max(ref.size - 1, 1)
        j_te = np.searchsorted(ref, raw_te, side='right')
        te_rank_acc += (j_te / max(ref.size - 1, 1))

te_iso  = (te_iso_acc / F).astype(np.float32)
te_z    = (te_z_acc / F).astype(np.float32)
te_rank = (te_rank_acc / F).astype(np.float32)

# Save
pd.DataFrame({
    'id': train['id'],
    'bertpat_raw': oof_raw,
    'bertpat_iso': oof_iso,
    'bertpat_z': oof_z,
    'bertpat_rank': oof_rank,
}).to_csv('oof_bertpat.csv', index=False)
pd.DataFrame({
    'id': test['id'],
    'bertpat_raw': raw_te.astype(np.float32),
    'bertpat_iso': te_iso,
    'bertpat_z': te_z,
    'bertpat_rank': te_rank,
}).to_csv('bertpat_test.csv', index=False)
print('Saved oof_bertpat.csv and bertpat_test.csv; elapsed', round((time.time()-t0)/60,2), 'min', flush=True)

Loading SentenceTransformer: anferico/bert-for-patents on CPU...


No sentence-transformers model found with name anferico/bert-for-patents. Creating a new one with MEAN pooling.




Saved oof_bertpat.csv and bertpat_test.csv; elapsed 4.29 min


In [20]:
# Length and stopword-stripped overlap features (no fitting). Outputs: oof_len_stop.csv, len_stop_test.csv
import re, time, numpy as np, pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

t0 = time.time()
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

STOP = set(ENGLISH_STOP_WORDS)
word_re = re.compile(r"\w+")

def toks_no_stop(s: str):
    toks = [t.lower() for t in word_re.findall(str(s))]
    return [t for t in toks if (t not in STOP and len(t) > 1)]

def jaccard(a:set, b:set) -> float:
    if not a and not b: return 0.0
    return len(a & b) / (len(a | b) + 1e-12)

def dice(a:set, b:set) -> float:
    if not a and not b: return 0.0
    return 2.0 * len(a & b) / (len(a) + len(b) + 1e-12)

def compute(df: pd.DataFrame) -> pd.DataFrame:
    A = df['anchor'].astype(str).tolist()
    B = df['target'].astype(str).tolist()
    n = len(df)
    len_ca = np.zeros(n, dtype=np.float32); len_cb = np.zeros(n, dtype=np.float32)
    len_wa = np.zeros(n, dtype=np.float32); len_wb = np.zeros(n, dtype=np.float32)
    len_cdiff = np.zeros(n, dtype=np.float32); len_cminmax = np.zeros(n, dtype=np.float32)
    len_wdiff = np.zeros(n, dtype=np.float32); len_wminmax = np.zeros(n, dtype=np.float32)
    jac = np.zeros(n, dtype=np.float32); di = np.zeros(n, dtype=np.float32)
    ov_cnt = np.zeros(n, dtype=np.float32); uni_cnt = np.zeros(n, dtype=np.float32)
    for i, (a, b) in enumerate(zip(A, B)):
        ta = toks_no_stop(a); tb = toks_no_stop(b)
        sa, sb = set(ta), set(tb)
        ca = len(a); cb = len(b)
        wa = len(ta); wb = len(tb)
        len_ca[i] = ca; len_cb[i] = cb
        len_wa[i] = wa; len_wb[i] = wb
        len_cdiff[i] = abs(ca - cb)
        mx = max(ca, cb) or 1.0; len_cminmax[i] = min(ca, cb) / mx
        len_wdiff[i] = abs(wa - wb)
        mxw = max(wa, wb) or 1.0; len_wminmax[i] = min(wa, wb) / mxw
        jac[i] = jaccard(sa, sb)
        di[i] = dice(sa, sb)
        inter = sa & sb; union = sa | sb
        ov_cnt[i] = float(len(inter)); uni_cnt[i] = float(len(union))
    out = pd.DataFrame({
        'len_char_a': len_ca, 'len_char_b': len_cb,
        'len_char_diff': len_cdiff, 'len_char_minmax': len_cminmax,
        'len_word_a': len_wa, 'len_word_b': len_wb,
        'len_word_diff': len_wdiff, 'len_word_minmax': len_wminmax,
        'nostop_jaccard': jac, 'nostop_dice': di,
        'nostop_overlap_cnt': ov_cnt, 'nostop_union_cnt': uni_cnt,
    })
    # simple non-linearities
    out['len_char_minmax_sq'] = (out['len_char_minmax'] ** 2).astype(np.float32)
    out['len_word_minmax_sq'] = (out['len_word_minmax'] ** 2).astype(np.float32)
    out['nostop_jaccard_sq'] = (out['nostop_jaccard'] ** 2).astype(np.float32)
    return out.astype('float32')

tr_feats = compute(train)
te_feats = compute(test)

pd.concat([train[['id']].reset_index(drop=True), tr_feats.reset_index(drop=True)], axis=1).to_csv('oof_len_stop.csv', index=False)
pd.concat([test[['id']].reset_index(drop=True), te_feats.reset_index(drop=True)], axis=1).to_csv('len_stop_test.csv', index=False)
print('Saved oof_len_stop.csv and len_stop_test.csv; elapsed', round((time.time()-t0)/60,2), 'min', flush=True)

Saved oof_len_stop.csv and len_stop_test.csv; elapsed 0.01 min


In [21]:
# Fold-safe KNN regression meta-features from patent embeddings (PatentSBERTa, bert-for-patents)
# Outputs: oof_knn_meta.csv, knn_meta_test.csv
import numpy as np, pandas as pd, time, re
from pathlib import Path
from sklearn.neighbors import NearestNeighbors
from sentence_transformers import SentenceTransformer

t0 = time.time()
train = pd.read_csv('train.csv')
test  = pd.read_csv('test.csv')
folds = pd.read_csv('folds_by_id.csv')
train = train.merge(folds, on='id', how='left', validate='one_to_one')
train['fold'] = train['fold'].astype(int)
NUM_FOLDS = int(train['fold'].max()) + 1
y = train['score'].astype(np.float32).values

def l2norm(mat: np.ndarray) -> np.ndarray:
    n = np.linalg.norm(mat, axis=1, keepdims=True) + 1e-12
    return (mat / n).astype(np.float32)

def pair_rep(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    # Construct pair vector [a, b, |a-b|, a*b] then L2-normalize row-wise
    return l2norm(np.concatenate([a, b, np.abs(a-b), a*b], axis=1))

def knn_wmean_from_index(nn: NearestNeighbors, Xq: np.ndarray, y_ref: np.ndarray, k: int, p: int = 1) -> np.ndarray:
    # cosine metric returns distance d = 1 - cos_sim => sim = 1 - d
    dists, inds = nn.kneighbors(Xq, n_neighbors=k, return_distance=True)
    sims = np.maximum(0.0, 1.0 - dists)
    w = (sims ** p)
    wy = (w * y_ref[inds])
    denom = np.sum(w, axis=1, keepdims=True) + 1e-12
    return (np.sum(wy, axis=1, keepdims=True) / denom).astype(np.float32).ravel()

def build_group_indices(keys: np.ndarray):
    mp = {}
    for i, k in enumerate(keys):
        mp.setdefault(k, []).append(i)
    return mp

def compute_knn_meta_for_encoder(model_name: str, tag: str, bs: int = 128):
    print(f'Encoding with {model_name} ...', flush=True)
    st = SentenceTransformer(model_name, device='cpu')
    def enc(texts):
        return st.encode(texts, batch_size=bs, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=False).astype(np.float32)
    a_tr = enc(train['anchor'].astype(str).tolist())
    b_tr = enc(train['target'].astype(str).tolist())
    a_te = enc(test['anchor'].astype(str).tolist())
    b_te = enc(test['target'].astype(str).tolist())
    V_tr = pair_rep(a_tr, b_tr)
    V_te = pair_rep(a_te, b_te)

    # Outputs per encoder
    oof_g10 = np.zeros(len(train), dtype=np.float32)
    oof_g25 = np.zeros(len(train), dtype=np.float32)
    oof_anch = np.zeros(len(train), dtype=np.float32)
    oof_cpc3 = np.zeros(len(train), dtype=np.float32)
    te_g10_acc = np.zeros(len(test), dtype=np.float64)
    te_g25_acc = np.zeros(len(test), dtype=np.float64)
    te_anch_acc = np.zeros(len(test), dtype=np.float64)
    te_cpc3_acc = np.zeros(len(test), dtype=np.float64)

    fold_arr = train['fold'].values.astype(int)
    anchors_tr = train['anchor'].astype(str).values
    cpc3_tr = train['context'].astype(str).str[:3].values
    anchors_te = test['anchor'].astype(str).values
    cpc3_te = test['context'].astype(str).str[:3].values

    for f in range(NUM_FOLDS):
        f0 = time.time()
        tr_idx = np.where(fold_arr != f)[0]
        va_idx = np.where(fold_arr == f)[0]
        V_ref = V_tr[tr_idx]
        y_ref = y[tr_idx]
        # Global KNN on train-only
        nn_global = NearestNeighbors(metric='cosine', algorithm='brute', n_jobs=-1).fit(V_ref)
        V_va = V_tr[va_idx]
        oof_g10[va_idx] = knn_wmean_from_index(nn_global, V_va, y_ref, k=10, p=1)
        oof_g25[va_idx] = knn_wmean_from_index(nn_global, V_va, y_ref, k=25, p=1)

        # Grouped indices on train-only for within-anchor and within-cpc3
        grp_anchor = build_group_indices(anchors_tr[tr_idx])
        grp_cpc3 = build_group_indices(cpc3_tr[tr_idx])
        # Prebuild per-group NN for groups with >=3 instances
        nn_anchor = {}
        for g, inds in grp_anchor.items():
            if len(inds) >= 3:
                nn_anchor[g] = NearestNeighbors(metric='cosine', algorithm='brute', n_jobs=-1).fit(V_ref[np.array(inds)])
        nn_cpc3 = {}
        for g, inds in grp_cpc3.items():
            if len(inds) >= 3:
                nn_cpc3[g] = NearestNeighbors(metric='cosine', algorithm='brute', n_jobs=-1).fit(V_ref[np.array(inds)])

        # Within-anchor
        for loc, idx in enumerate(va_idx):
            a = anchors_tr[idx]
            if a in nn_anchor:
                oof_anch[idx] = knn_wmean_from_index(nn_anchor[a], V_tr[idx:idx+1], y_ref, k=min(25, nn_anchor[a].n_samples_fit_), p=1)[0]
            else:
                oof_anch[idx] = oof_g10[idx]
        # Within-cpc3
        for loc, idx in enumerate(va_idx):
            c3 = cpc3_tr[idx]
            if c3 in nn_cpc3:
                oof_cpc3[idx] = knn_wmean_from_index(nn_cpc3[c3], V_tr[idx:idx+1], y_ref, k=min(25, nn_cpc3[c3].n_samples_fit_), p=1)[0]
            else:
                oof_cpc3[idx] = oof_g10[idx]

        # Test predictions using train-only index; average across folds
        te_g10_acc += knn_wmean_from_index(nn_global, V_te, y_ref, k=10, p=1).astype(np.float64)
        te_g25_acc += knn_wmean_from_index(nn_global, V_te, y_ref, k=25, p=1).astype(np.float64)
        # For grouped test, use available group NN else fallback to global k10
        te_anchor_tmp = np.zeros(len(test), dtype=np.float64)
        for j in range(len(test)):
            a = anchors_te[j]
            if a in nn_anchor:
                te_anchor_tmp[j] = float(knn_wmean_from_index(nn_anchor[a], V_te[j:j+1], y_ref, k=min(25, nn_anchor[a].n_samples_fit_), p=1)[0])
            else:
                te_anchor_tmp[j] = float(knn_wmean_from_index(nn_global, V_te[j:j+1], y_ref, k=10, p=1)[0])
        te_anch_acc += te_anchor_tmp
        te_cpc3_tmp = np.zeros(len(test), dtype=np.float64)
        for j in range(len(test)):
            c3 = cpc3_te[j]
            if c3 in nn_cpc3:
                te_cpc3_tmp[j] = float(knn_wmean_from_index(nn_cpc3[c3], V_te[j:j+1], y_ref, k=min(25, nn_cpc3[c3].n_samples_fit_), p=1)[0])
            else:
                te_cpc3_tmp[j] = float(knn_wmean_from_index(nn_global, V_te[j:j+1], y_ref, k=10, p=1)[0])
        te_cpc3_acc += te_cpc3_tmp
        print(f'[{tag}] fold {f} done in {time.time()-f0:.1f}s', flush=True)

    te_g10 = (te_g10_acc / NUM_FOLDS).astype(np.float32)
    te_g25 = (te_g25_acc / NUM_FOLDS).astype(np.float32)
    te_anch = (te_anch_acc / NUM_FOLDS).astype(np.float32)
    te_cpc3 = (te_cpc3_acc / NUM_FOLDS).astype(np.float32)

    cols = {
        f'knn_{tag}_wmean10': oof_g10,
        f'knn_{tag}_wmean25': oof_g25,
        f'knn_{tag}_anchor': oof_anch,
        f'knn_{tag}_cpc3': oof_cpc3,
    }
    cols_te = {
        f'knn_{tag}_wmean10': te_g10,
        f'knn_{tag}_wmean25': te_g25,
        f'knn_{tag}_anchor': te_anch,
        f'knn_{tag}_cpc3': te_cpc3,
    }
    return cols, cols_te

# Compute for both encoders
cols_patberta, cols_te_patberta = compute_knn_meta_for_encoder('AI-Growth-Lab/PatentSBERTa', tag='patberta')
cols_bertpat, cols_te_bertpat = compute_knn_meta_for_encoder('anferico/bert-for-patents', tag='bertpat')

# Save
oof_df = pd.DataFrame({'id': train['id']})
te_df = pd.DataFrame({'id': test['id']})
for k, v in {**cols_patberta, **cols_bertpat}.items():
    oof_df[k] = v.astype(np.float32)
for k, v in {**cols_te_patberta, **cols_te_bertpat}.items():
    te_df[k] = v.astype(np.float32)
oof_df.to_csv('oof_knn_meta.csv', index=False)
te_df.to_csv('knn_meta_test.csv', index=False)
print('Saved oof_knn_meta.csv and knn_meta_test.csv; elapsed', round((time.time()-t0)/60,2), 'min', flush=True)

Encoding with AI-Growth-Lab/PatentSBERTa ...




[patberta] fold 0 done in 477.6s


[patberta] fold 1 done in 494.8s


[patberta] fold 2 done in 503.0s


[patberta] fold 3 done in 505.6s


[patberta] fold 4 done in 497.3s


Encoding with anferico/bert-for-patents ...


No sentence-transformers model found with name anferico/bert-for-patents. Creating a new one with MEAN pooling.




[bertpat] fold 0 done in 496.8s


[bertpat] fold 1 done in 507.3s


[bertpat] fold 2 done in 515.1s


[bertpat] fold 3 done in 527.0s


[bertpat] fold 4 done in 517.5s


Saved oof_knn_meta.csv and knn_meta_test.csv; elapsed 89.81 min
