In [3]:
# Soft TF-IDF (1-2g stems) with JW=0.90, fold-safe, on normalized text. Outputs: oof_soft_tfidf_norm.csv and soft_tfidf_norm_test.csv
import time, math, re, gc, sys, unicodedata
from collections import defaultdict
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from rapidfuzz.distance import JaroWinkler

# Load the train/test tables and attach each training row's CV fold id.
t0 = time.time()
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
folds = pd.read_csv('folds_by_id.csv')  # columns: id, fold
# validate='many_to_one' makes pandas raise if an id occurs more than once in
# `folds`; without it a duplicated id would silently duplicate training rows.
train = train.merge(folds, on='id', how='left', validate='many_to_one')
# A left merge leaves NaN for ids absent from the fold file. Such rows would be
# part of the training split of every fold and never receive OOF values, so
# fail loudly instead of writing silently wrong features.
_n_missing_fold = int(train['fold'].isna().sum())
if _n_missing_fold:
    raise ValueError(
        f'{_n_missing_fold} train rows have no fold assignment in folds_by_id.csv'
    )

# ---------- Normalization utilities (NFKC + confusables + number-unit split + de-hyphen) ----------
# Literal replacements applied in insertion order AFTER NFKC normalization and
# BEFORE lowercasing. The multi-character degree entries must stay ahead of the
# bare degree sign so they win when both could match.
REPL = {
    '\u00b5': 'u',   # micro sign
    '\u03bc': 'u',   # Greek mu
    '\u03a9': 'ohm', # Omega
    '\u2126': 'ohm', # Ohm symbol
    '\u00b0C': 'deg C',  # degree C
    '\u00b0F': 'deg F',  # degree F
    # Bare degree: the trailing space keeps 'deg' from fusing with whatever
    # follows (e.g. a lowercase 'c' that the case-sensitive entries above miss).
    # Surplus whitespace is collapsed at the end of normalize_text.
    '\u00b0': 'deg ',
    '\u00d7': 'x',       # multiplication sign
    '\u2032': "'",      # prime
    '\u2033': '"',      # double prime
}

# Sub/superscripts
# NOTE(review): NFKC runs first and presumably folds most of these already;
# they are kept as a fallback.
_SUBS = str.maketrans('₀₁₂₃₄₅₆₇₈₉', '0123456789')
_SUPS_MAP = { '²': '2', '³': '3', '⁺': '+', '⁻': '-' }

# A number (optionally with a '.' or '/' fractional part) immediately followed
# by a letter/percent run, e.g. '10nm' or '1.5V'.
NUM_UNIT_ATTACH = re.compile(r'(?i)(\d+(?:[\./]\d+)?)([a-zA-Z%][a-zA-Z%/]*)')

def nfkc(s: str) -> str:
    """Return ``s`` in Unicode NFKC normal form."""
    return unicodedata.normalize('NFKC', s)

def normalize_text(s: str) -> str:
    """Canonicalize free text before tokenization.

    Steps, in order: coerce to ``str`` and apply NFKC; apply the ``REPL``
    symbol replacements; fold super/subscripts; turn ``-`` and ``_`` into
    spaces; insert a space between a number and an attached unit; lowercase;
    collapse whitespace runs and strip the ends.

    Upper- and lower-case degree units normalize identically
    (``'25°C'`` and ``'25°c'`` both become ``'25 deg c'``).
    """
    s = nfkc(str(s))
    for k, v in REPL.items():
        s = s.replace(k, v)
    for k, v in _SUPS_MAP.items():
        s = s.replace(k, v)
    s = s.translate(_SUBS)
    # unify hyphen/underscore to space
    s = s.replace('-', ' ').replace('_', ' ')
    # split attached number+unit 10nm -> '10 nm'
    s = NUM_UNIT_ATTACH.sub(r'\1 \2', s)
    # lowercase and collapse spaces
    s = s.lower()
    s = re.sub(r'\s+', ' ', s).strip()
    return s

# ---------- Tokenizer with Porter stemming (no external corpora needed) ----------
try:
    from nltk.stem import PorterStemmer
    _stemmer = PorterStemmer()
except Exception as _stem_err:
    # Best-effort fallback: keep running without stemming, but say so. A silent
    # fallback would change every downstream feature without any trace in the
    # run log.
    print(
        f'WARNING: Porter stemmer unavailable ({_stem_err!r}); tokens will NOT be stemmed',
        file=sys.stderr,
        flush=True,
    )

    def stem_token(tok: str) -> str:
        """Identity fallback used when the Porter stemmer could not be set up."""
        return tok
else:
    def stem_token(tok: str) -> str:
        """Return the Porter stem of ``tok``."""
        return _stemmer.stem(tok)

_word_re = re.compile(r"[a-z0-9]+(?:[./][a-z0-9]+)?")  # normalized regex (post-normalize_text)
def tokenize_stems(text: str):
    """Normalize ``text``, split it into word tokens and stem each token.

    Anything that is not a ``str`` is treated as the empty string. A token is
    a lowercase alphanumeric run, optionally followed by one ``.`` or ``/``
    and another alphanumeric run (so ``1.5`` and ``a/b`` stay whole).
    Returns the list of stemmed tokens in order of appearance.
    """
    raw = text if isinstance(text, str) else ''
    pieces = _word_re.findall(normalize_text(raw))
    # Every regex match is non-empty, so no extra filtering is needed.
    return list(map(stem_token, pieces))

# sklearn will build ngrams over tokens returned by tokenizer
def analyzer(text: str):
    """Tokenizer callable for the vectorizer: stemmed tokens of ``text``."""
    stems = tokenize_stems(text)
    return stems

# ---------- Soft TF-IDF computation ----------
# Minimum Jaro-Winkler similarity for two terms to count as a soft match.
JW_THRESH = 0.90

def jw_sim(a: str, b: str) -> float:
    """Normalized Jaro-Winkler similarity of ``a`` and ``b`` as a Python float."""
    similarity = JaroWinkler.normalized_similarity(a, b)
    return float(similarity)

def sparse_to_weight_dict(vec, feature_names):
    """Map each stored entry of ``vec`` to ``{feature name: weight}``.

    ``vec`` must expose parallel ``indices`` / ``data`` sequences;
    ``feature_names`` is indexed by the values in ``vec.indices``. Weights are
    converted to Python floats.
    """
    names = (feature_names[col] for col in vec.indices)
    weights = map(float, vec.data)
    return dict(zip(names, weights))

def vec_norm_squared(vec):
    """Return the sum of the squared entries of ``vec`` as a Python float.

    ``vec`` must provide an element-wise ``power`` method whose result has
    ``sum`` (as scipy sparse matrices do).
    """
    squares = vec.power(2)
    return float(squares.sum())

def soft_tfidf_score(wA: dict, wB: dict, cache: dict, thresh: "float | None" = None) -> float:
    """Unnormalized soft TF-IDF numerator between two weighted term bags.

    For every term in ``wA`` the most similar term in ``wB`` is looked up
    (first one wins on ties). If that similarity reaches ``thresh`` the product
    ``weight_A * similarity * weight_B`` is added to the result. The sum runs
    over the terms of ``wA`` only, so the score is not symmetric in its
    arguments.

    Args:
        wA: term -> weight for the first text.
        wB: term -> weight for the second text.
        cache: memo of pairwise similarities keyed by the ordered pair
            ``(min(term), max(term))``; it is read and extended in place.
        thresh: minimum similarity for a match; ``None`` (default) uses the
            module-level ``JW_THRESH``.

    Returns:
        The accumulated score, ``0.0`` when either bag is empty.
    """
    if thresh is None:
        thresh = JW_THRESH
    if not wA or not wB:
        return 0.0
    # An identical term has similarity 1.0, which no other term can beat, so it
    # can be matched directly without scanning wB (valid whenever 1.0 passes
    # the threshold).
    exact_ok = thresh <= 1.0
    num = 0.0
    for ta, wa in wA.items():
        if exact_ok and ta in wB:
            num += wa * wB[ta]
            continue
        best = 0.0
        best_w = None
        for tb, wb in wB.items():
            key = (ta, tb) if ta <= tb else (tb, ta)
            s = cache.get(key)
            if s is None:
                s = jw_sim(ta, tb)
                cache[key] = s
            if s >= thresh and s > best:
                best = s
                best_w = wb
        if best_w is not None:
            num += wa * best * best_w
    return num

def compute_soft_pair(vecA, vecB, feature_names, jw_cache):
    """Soft TF-IDF similarity of two sparse row vectors.

    Both vectors are converted to term->weight dicts via ``feature_names`` and
    the soft numerator is computed (extending ``jw_cache``). The numerator is
    then divided by the product of the two vectors' L2 norms. Returns ``0.0``
    when either norm is zero.
    """
    weights_a = sparse_to_weight_dict(vecA, feature_names)
    weights_b = sparse_to_weight_dict(vecB, feature_names)
    numerator = soft_tfidf_score(weights_a, weights_b, jw_cache)
    norm_a = math.sqrt(vec_norm_squared(vecA))
    norm_b = math.sqrt(vec_norm_squared(vecB))
    if norm_a != 0.0 and norm_b != 0.0:
        return float(numerator / (norm_a * norm_b))
    return 0.0

# ---------- Fold-safe computation ----------
# Fold ids are taken to be 0..max, so the fold count is max + 1.
NUM_FOLDS = 1 + int(train['fold'].max())
print(f'Found {NUM_FOLDS} folds', flush=True)

_n_train = len(train)

# Out-of-fold soft TF-IDF score and anchor/target token-count difference per train row
oof_vals = np.zeros(_n_train, dtype=np.float32)
len_diff_arr = np.zeros(_n_train, dtype=np.float32)

# For per-anchor train-only ranks/stats per fold (NaN = no reference available)
oof_rank, oof_pct, oof_gap_top = (
    np.full(_n_train, np.nan, dtype=np.float32) for _ in range(3)
)

# For test, we'll average per-fold test predictions
test_soft_matrix, test_len_diff_list = [], []

def doc_len_tokens(text):
    """Number of stemmed tokens that ``tokenize_stems`` yields for ``text``."""
    tokens = tokenize_stems(text)
    return len(tokens)

def _score_rows(anchor_mat, target_mat, feature_names, jw_cache, label, every):
    """Soft TF-IDF score for each aligned row pair of two sparse matrices.

    Prints a progress line tagged with ``label`` every ``every`` rows and
    returns the scores as a float32 array. ``jw_cache`` is extended in place.
    """
    n_rows = anchor_mat.shape[0]
    scores = np.zeros(n_rows, dtype=np.float32)
    for i in range(n_rows):
        if (i + 1) % every == 0:
            print(f'    {label} row {i+1}/{n_rows}', flush=True)
        scores[i] = compute_soft_pair(anchor_mat[i], target_mat[i], feature_names, jw_cache)
    return scores


def _token_len_diffs(df):
    """Absolute anchor/target token-count difference per row, as float32."""
    return np.array(
        [abs(doc_len_tokens(a) - doc_len_tokens(t)) for a, t in zip(df['anchor'], df['target'])],
        dtype=np.float32,
    )


# Token counts do not depend on the fold's vectorizer, so the test-side length
# differences are computed once and reused for every fold.
te_len_diff = _token_len_diffs(test)

for fold in range(NUM_FOLDS):
    t_fold0 = time.time()
    fold_col = train['fold'].values
    tr_idx = np.where(fold_col != fold)[0]
    va_idx = np.where(fold_col == fold)[0]
    tr_df = train.iloc[tr_idx].reset_index(drop=True)
    va_df = train.iloc[va_idx].reset_index(drop=True)
    print(f'Fold {fold}: train {len(tr_df)} val {len(va_df)}', flush=True)

    # Build corpus on train-only (A ∪ B), on normalized text
    corpus = pd.concat([tr_df['anchor'], tr_df['target']], axis=0).astype(str).tolist()
    # token_pattern=None: a custom tokenizer is supplied, so the default
    # pattern is unused; passing None avoids sklearn's unused-parameter warning.
    vectorizer = TfidfVectorizer(analyzer='word', tokenizer=analyzer, token_pattern=None, preprocessor=None, lowercase=False, ngram_range=(1,2), min_df=3)
    V = vectorizer.fit_transform(corpus)
    feature_names = np.array(vectorizer.get_feature_names_out())
    print(f'  Vocab size: {len(feature_names)}', flush=True)

    # One similarity memo per fold, shared by the val/train/test passes: it is
    # keyed by the term strings themselves, so entries are valid for all three.
    jw_cache = {}

    # Transform val (anchor/target separately) with the train-only vectorizer
    va_anchor = vectorizer.transform(va_df['anchor'].astype(str).tolist())
    va_target = vectorizer.transform(va_df['target'].astype(str).tolist())

    # Compute per-row soft tfidf for val
    vals = _score_rows(va_anchor, va_target, feature_names, jw_cache, 'val', 1000)
    v_len_diff = _token_len_diffs(va_df)

    oof_vals[va_idx] = vals
    len_diff_arr[va_idx] = v_len_diff

    # Per-anchor ranks/pct/gap computed on train-only within this fold
    tr_anchor = vectorizer.transform(tr_df['anchor'].astype(str).tolist())
    tr_target = vectorizer.transform(tr_df['target'].astype(str).tolist())
    tr_soft = _score_rows(tr_anchor, tr_target, feature_names, jw_cache, 'train ref', 5000)
    tr_anchor_col = tr_df['anchor'].values

    # Build per-anchor sorted lists from train-only
    ref_scores = defaultdict(list)
    for a, s in zip(tr_anchor_col, tr_soft):
        ref_scores[a].append(float(s))
    for a in ref_scores:
        ref_scores[a].sort()

    # Assign ranks/pct/gap for OOF val rows using train-only reference.
    # Rows whose anchor has no train-side reference keep their NaN defaults.
    for idx in va_idx:
        arr = ref_scores.get(train.at[idx, 'anchor'])
        if not arr:
            continue
        s = oof_vals[idx]
        # number of reference scores <= s
        j = np.searchsorted(arr, s, side='right')
        oof_pct[idx] = j / len(arr)
        oof_rank[idx] = j
        oof_gap_top[idx] = (arr[-1] - s)

    # Test predictions for this fold (will be averaged across folds)
    te_anchor = vectorizer.transform(test['anchor'].astype(str).tolist())
    te_target = vectorizer.transform(test['target'].astype(str).tolist())
    te_vals = _score_rows(te_anchor, te_target, feature_names, jw_cache, 'test', 2000)
    test_soft_matrix.append(te_vals)
    test_len_diff_list.append(te_len_diff)

    dt = time.time() - t_fold0
    print(f'Fold {fold} done in {dt/60:.2f} min', flush=True)
    del vectorizer, V, va_anchor, va_target, tr_anchor, tr_target, jw_cache
    gc.collect()

# ---------- Build OOF dataframe with transforms ----------
# Base columns first; 'anchor' is carried temporarily and dropped before saving.
_oof_base = {
    'id': train['id'],
    'anchor': train['anchor'],
    'soft_tfidf': oof_vals,
    'len_diff': len_diff_arr,
    'rank': oof_rank,
    'pct': oof_pct,
    'gap_to_top': oof_gap_top,
}
oof = pd.DataFrame(_oof_base)

# Derived transforms of the raw score
_oof_soft = oof['soft_tfidf']
oof['soft_tfidf_sq'] = _oof_soft ** 2
oof['one_minus_soft'] = 1.0 - _oof_soft
# map [0,1]->[-1,1] then Fisher-z; the clip keeps arctanh finite
_oof_centered = np.clip(_oof_soft * 2 - 1, -0.999999, 0.999999)
oof['fisher_z'] = np.arctanh(_oof_centered)
oof['soft_x_len_diff'] = _oof_soft * oof['len_diff']
oof = oof.drop(columns=['anchor'])

# Save OOF (normalized-text variant)
oof.to_csv('oof_soft_tfidf_norm.csv', index=False)
print('Saved oof_soft_tfidf_norm.csv', oof.shape, flush=True)

# ---------- Aggregate test across folds and compute ranks vs OOF distribution ----------
# Column-wise mean over the per-fold rows
test_soft = np.vstack(test_soft_matrix).mean(axis=0).astype(np.float32)
test_len_diff = np.vstack(test_len_diff_list).mean(axis=0).astype(np.float32)
te = pd.DataFrame({'id': test['id'], 'anchor': test['anchor'], 'soft_tfidf': test_soft, 'len_diff': test_len_diff})

# Same derived transforms as on the OOF side
_te_soft = te['soft_tfidf']
te['soft_tfidf_sq'] = _te_soft ** 2
te['one_minus_soft'] = 1.0 - _te_soft
_te_centered = np.clip(_te_soft * 2 - 1, -0.999999, 0.999999)
te['fisher_z'] = np.arctanh(_te_centered)
te['soft_x_len_diff'] = _te_soft * te['len_diff']

# Build per-anchor reference from OOF for percentile and gap_to_top
_anchor_by_id = train[['id','anchor']].set_index('id')
ref = oof[['id','soft_tfidf']].join(_anchor_by_id, on='id')
ref_groups = defaultdict(list)
for anchor_txt, score in zip(ref['anchor'].values, ref['soft_tfidf'].values):
    ref_groups[anchor_txt].append(float(score))
for scores in ref_groups.values():
    scores.sort()

_n_te = len(te)
rank_list = np.full(_n_te, np.nan, dtype=np.float32)
pct_list = np.full(_n_te, np.nan, dtype=np.float32)
gap_list = np.full(_n_te, np.nan, dtype=np.float32)
# Test rows whose anchor has no OOF reference keep NaN in all three columns.
for i, (anchor_txt, score) in enumerate(zip(te['anchor'].values, te['soft_tfidf'].values)):
    arr = ref_groups.get(anchor_txt)
    if not arr:
        continue
    # number of reference scores <= this row's score
    j = np.searchsorted(arr, score, side='right')
    pct_list[i] = j / len(arr)
    rank_list[i] = j
    gap_list[i] = (arr[-1] - score)

te['rank'] = rank_list
te['pct'] = pct_list
te['gap_to_top'] = gap_list
te = te.drop(columns=['anchor'])
te.to_csv('soft_tfidf_norm_test.csv', index=False)
print('Saved soft_tfidf_norm_test.csv', te.shape, flush=True)

print('All done in', round((time.time()-t0)/60,2), 'min', flush=True)

Found 5 folds


Fold 0: train 26739 val 6086




  Vocab size: 5224


    val row 1000/6086


    val row 2000/6086


    val row 3000/6086


    val row 4000/6086


    val row 5000/6086


    val row 6000/6086


    train ref row 5000/26739


    train ref row 10000/26739


    train ref row 15000/26739


    train ref row 20000/26739


    train ref row 25000/26739


    test row 2000/3648


Fold 0 done in 0.12 min


Fold 1: train 26310 val 6515




  Vocab size: 5177


    val row 1000/6515


    val row 2000/6515


    val row 3000/6515


    val row 4000/6515


    val row 5000/6515


    val row 6000/6515


    train ref row 5000/26310


    train ref row 10000/26310


    train ref row 15000/26310


    train ref row 20000/26310


    train ref row 25000/26310


    test row 2000/3648


Fold 1 done in 0.12 min


Fold 2: train 26195 val 6630




  Vocab size: 5146


    val row 1000/6630


    val row 2000/6630


    val row 3000/6630


    val row 4000/6630


    val row 5000/6630


    val row 6000/6630


    train ref row 5000/26195


    train ref row 10000/26195


    train ref row 15000/26195


    train ref row 20000/26195


    train ref row 25000/26195


    test row 2000/3648


Fold 2 done in 0.12 min


Fold 3: train 25884 val 6941




  Vocab size: 5052


    val row 1000/6941


    val row 2000/6941


    val row 3000/6941


    val row 4000/6941


    val row 5000/6941


    val row 6000/6941


    train ref row 5000/25884


    train ref row 10000/25884


    train ref row 15000/25884


    train ref row 20000/25884


    train ref row 25000/25884


    test row 2000/3648


Fold 3 done in 0.12 min


Fold 4: train 26172 val 6653




  Vocab size: 5159


    val row 1000/6653


    val row 2000/6653


    val row 3000/6653


    val row 4000/6653


    val row 5000/6653


    val row 6000/6653


    train ref row 5000/26172


    train ref row 10000/26172


    train ref row 15000/26172


    train ref row 20000/26172


    train ref row 25000/26172


    test row 2000/3648


Fold 4 done in 0.12 min


Saved oof_soft_tfidf_norm.csv (32825, 10)


Saved soft_tfidf_norm_test.csv (3648, 10)


All done in 0.61 min
