In [1]:
# Char edit similarity features: Jaro-Winkler, Levenshtein, Damerau-Levenshtein (train/test)
import time, pandas as pd
from rapidfuzz.distance import JaroWinkler, Levenshtein, DamerauLevenshtein

# Start the wall-clock timer before any I/O so the final report includes loading.
t0 = time.time()
# Read both splits from the working directory (train first, then test).
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

def compute_char_edit(df: pd.DataFrame) -> pd.DataFrame:
    """Build character-level edit-similarity features for each anchor/target pair.

    The ``anchor`` and ``target`` columns of ``df`` are cast to ``str`` and
    compared row by row with the normalized Jaro-Winkler, Levenshtein and
    Damerau-Levenshtein similarities. For each base similarity the result
    also carries its square and its complement (``1 - similarity``).

    Args:
        df: Frame containing ``anchor`` and ``target`` columns.

    Returns:
        A float32 DataFrame with a fresh 0..n-1 index and nine columns:
        the three base similarities, their squares, then their complements.
    """
    anchors = df['anchor'].astype(str).tolist()
    targets = df['target'].astype(str).tolist()

    jw_scores = []
    lev_scores = []
    dlev_scores = []
    for done, (left, right) in enumerate(zip(anchors, targets), start=1):
        # normalized_similarity returns a score in 0..1
        jw_scores.append(float(JaroWinkler.normalized_similarity(left, right)))
        lev_scores.append(float(Levenshtein.normalized_similarity(left, right)))
        dlev_scores.append(float(DamerauLevenshtein.normalized_similarity(left, right)))
        # Periodic progress line so long runs show signs of life.
        if done % 50000 == 0:
            print(f'.. {done} rows done', flush=True)

    feats = pd.DataFrame({
        'jaro_winkler': jw_scores,
        'levenshtein_norm': lev_scores,
        'damerau_lev_norm': dlev_scores,
    })

    # Cheap non-linear views of each base similarity: squares first, then
    # complements, so the column order stays stable for downstream readers.
    squared = (
        ('jaro_winkler_sq', 'jaro_winkler'),
        ('levenshtein_norm_sq', 'levenshtein_norm'),
        ('damerau_lev_norm_sq', 'damerau_lev_norm'),
    )
    complements = (
        ('one_minus_jw', 'jaro_winkler'),
        ('one_minus_lev', 'levenshtein_norm'),
        ('one_minus_dlev', 'damerau_lev_norm'),
    )
    for new_col, base_col in squared:
        feats[new_col] = feats[base_col] ** 2
    for new_col, base_col in complements:
        feats[new_col] = 1.0 - feats[base_col]

    return feats.astype('float32')

# Train split: compute the features, attach the row ids, and persist to CSV.
print('Computing train char-edit features...', flush=True)
tr_feats = compute_char_edit(train)
train_ids = train[['id']].reset_index(drop=True)
# Both sides are re-indexed 0..n-1 so the column-wise concat aligns by position.
tr_out = pd.concat([train_ids, tr_feats.reset_index(drop=True)], axis=1)
tr_out.to_csv('oof_char_edit.csv', index=False)
print('Saved oof_char_edit.csv with shape', tr_out.shape, flush=True)

# Test split: same pipeline, written to its own file.
print('Computing test char-edit features...', flush=True)
te_feats = compute_char_edit(test)
test_ids = test[['id']].reset_index(drop=True)
te_out = pd.concat([test_ids, te_feats.reset_index(drop=True)], axis=1)
te_out.to_csv('char_edit_test.csv', index=False)
print('Saved char_edit_test.csv with shape', te_out.shape, flush=True)

# Report total wall-clock time since t0, in minutes.
elapsed_min = (time.time() - t0) / 60
print('Done in', round(elapsed_min, 2), 'min', flush=True)

Computing train char-edit features...


Saved oof_char_edit.csv with shape (32825, 10)


Computing test char-edit features...


Saved char_edit_test.csv with shape (3648, 10)


Done in 0.0 min
