In [None]:
# Enable IPython's autoreload extension in mode 2, which re-imports all loaded modules
# before each cell executes, so edits to the project code take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

# Generate pairs of best-matching name pieces from multi-word given names or surnames

In [None]:
from os.path import join

from pathlib import Path
from pandarallel import pandarallel
import pandas as pd
from tqdm import tqdm

from src.data.filesystem import glob
from src.data.prepare import match_name_pairs

In [None]:
# configure
# location scanned for input "*.parquet" files
in_path = "s3://familysearch-names/interim/tree-hr-surname/"
# destination for the generated pairs; one parquet file is written per input file, keeping its stem
out_path = "s3://familysearch-names/interim/tree-hr-surname-pairs/"
# NOTE(review): not referenced by any cell visible here -- presumably selects surname vs.
# given-name processing; confirm whether it is still needed
is_surname = True

In [None]:
# Create the output directory only when out_path is a local filesystem path.
# Wrapping a URL such as "s3://bucket/prefix/" in pathlib.Path collapses it to the
# relative path "s3:/bucket/prefix", so mkdir would create a stray local directory
# tree instead of touching the remote store (which needs no directory creation).
if "://" not in out_path:
    Path(out_path).mkdir(parents=True, exist_ok=True)

# set up pandarallel so DataFrame.parallel_apply is available in the cells below
pandarallel.initialize()

In [None]:
# TODO switch this over to mpire someday
filenames = glob(in_path + "*.parquet")
for parquet_file in tqdm(filenames):
    # read one input parquet file
    names_df = pd.read_parquet(parquet_file)

    # whitespace-split both name columns so every row carries its list of name pieces
    for column in ("name", "alt_name"):
        names_df[column + "_pieces"] = names_df[column].str.split()

    # row-wise, in parallel: match pieces in name with the nearest alt_name pieces
    names_df["pairs"] = names_df.parallel_apply(match_name_pairs, axis=1)

    # one row per pair; empty or missing pair lists become NaN after explode and are dropped
    exploded = names_df[["pairs"]].explode("pairs", ignore_index=True)
    pair_records = exploded.dropna()["pairs"].tolist()
    pairs_df = pd.DataFrame(pair_records, columns=["name", "alt_name"])

    # keep only pairs whose two sides are both present and not identical
    both_present = pairs_df["name"].notna() & pairs_df["alt_name"].notna()
    differs = pairs_df["name"] != pairs_df["alt_name"]
    pairs_df = pairs_df[differs & both_present]

    # write the result under the same file stem as the input
    stem = Path(parquet_file).stem
    pairs_df.to_parquet(join(out_path, stem + ".parquet"))