In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Remove bad pairs from the similar-pairs file, based on a manual review of several thousand borderline pairs from train
The bad pairs are borderline pairs (pairs with low similarity) that were manually reviewed and found not to be good matches.

I don't recall exactly how the borderline pairs that went to review were generated, but most likely we simply identified
training pairs that had low Levenshtein similarity. We don't have a notebook for this.

In [None]:
from collections import namedtuple

import matplotlib.pyplot as plt
import pandas as pd
import wandb

from src.models.utils import add_padding

In [None]:
# Name type handled by this run; it is interpolated into every S3 path below.
# NOTE(review): presumably the alternative value is "surname" — confirm.
given_surname = "given"

# Immutable record of the three locations this notebook reads from / writes to.
# Values are passed positionally, in the field order declared on the namedtuple.
Config = namedtuple("Config", "in_path bad_pairs_path out_path")
config = Config(
    # in_path: unfiltered training pairs
    f"s3://familysearch-names/processed/tree-hr-{given_surname}-train-unfiltered.csv.gz",
    # bad_pairs_path: manually reviewed pairs to remove
    f"s3://familysearch-names/interim/{given_surname}_variants_clorinda_reviewed.tsv",
    # out_path: filtered training pairs
    f"s3://familysearch-names/processed/tree-hr-{given_surname}-train.csv.gz",
)

In [None]:
# Start a Weights & Biases run for this notebook. Runs are grouped by the value of
# given_surname, and the configured paths are logged as the run config.
wandb_run_kwargs = {
    "project": "nama",
    "entity": "nama",
    "name": "47a_filter_bad_pairs",
    "group": given_surname,
    "notes": "",
    "config": config._asdict(),
}
wandb.init(**wandb_run_kwargs)

### Read data

In [None]:
# Load the unfiltered training pairs from the configured input location and report
# (rows, columns) as a baseline for the shape printed after filtering.
df = pd.read_csv(filepath_or_buffer=config.in_path)
print(df.shape)

In [None]:
# Preview the first five training pairs.
df.head(n=5)

In [None]:
# Load the manually reviewed bad pairs. The file is read as tab-separated with no
# header row; the two columns are labelled "name" and "alt_name".
# na_filter=False keeps every field as a literal string. With pandas' default NA
# handling, a name that happens to equal an NA token (e.g. "nan", "null", "NA")
# would be parsed as float NaN, so it could neither be padded as a string nor
# matched against the training pairs further down.
bad_pairs_df = pd.read_csv(
    config.bad_pairs_path,
    sep="\t",
    names=["name", "alt_name"],
    na_filter=False,
)
print(bad_pairs_df.shape)

In [None]:
# Apply add_padding element-wise to both name columns, writing the result back into
# bad_pairs_df.
# NOTE(review): presumably this puts the reviewed names in the same padded form as
# name1/name2 in the training file — confirm against add_padding.
# Re-running this cell applies add_padding to the already-padded values again.
for column in ("name", "alt_name"):
    bad_pairs_df.loc[:, column] = bad_pairs_df.loc[:, column].map(add_padding)

In [None]:
# Preview the first five reviewed pairs after padding.
bad_pairs_df.head(n=5)

### Remove bad pairs

In [None]:
# Collect every reviewed pair in both orientations, (name, alt_name) and
# (alt_name, name), so a membership test does not depend on the order of the pair.
bad_pairs = {
    ordered_pair
    for name, alt_name in zip(bad_pairs_df["name"], bad_pairs_df["alt_name"])
    for ordered_pair in ((name, alt_name), (alt_name, name))
}
print(len(bad_pairs))

In [None]:
def is_bad_pair(row):
    """Return True when the row's (name1, name2) tuple is a member of ``bad_pairs``.

    ``row`` must support lookup by the keys "name1" and "name2" (for example a
    DataFrame row). ``bad_pairs`` is the notebook-level collection of pairs and is
    read at call time.
    """
    candidate = (row["name1"], row["name2"])
    return candidate in bad_pairs

In [None]:
# Drop every row whose (name1, name2) tuple is in bad_pairs.
# The mask is built by zipping the two name columns rather than with
# df.apply(..., axis=1): apply constructs a Series for every row, which is slow on a
# large pair file, and on an empty frame it is not guaranteed to return a boolean
# Series, so the `~` mask would fail. The explicit index keeps the mask aligned with
# df's rows, and dtype=bool keeps it a valid mask even when df has no rows.
bad_pair_mask = pd.Series(
    [pair in bad_pairs for pair in zip(df["name1"], df["name2"])],
    index=df.index,
    dtype=bool,
)
df = df[~bad_pair_mask]

In [None]:
# Shape after filtering; compare with the shape printed right after loading to see
# how many rows were removed.
print(df.shape)

In [None]:
# Persist the filtered training pairs to the configured output location,
# leaving the DataFrame index out of the file.
df.to_csv(path_or_buf=config.out_path, index=False)

In [None]:
# Close the Weights & Biases run that was started with wandb.init earlier in the notebook.
wandb.finish()