In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Generate pairs of best-matching name pieces from multi-word given names or surnames
For each pair of a multi-word tree name <-> a multi-word record name, determine which tree-name word
should be associated with which record-name word.

In [2]:
from os.path import join

from mpire import WorkerPool
from pathlib import Path
import pandas as pd

from nama.data.filesystem import glob, download_file_from_s3, save_file
from nama.data.match import match_name_pairs

In [3]:
# Run this notebook once per name type: set given_surname to "given" for one run
# and to "surname" for the other.
# given_surname = "given"
given_surname = "surname"

# S3 prefixes derived from the selected name type: a later cell globs *.parquet
# under in_path and hands out_path to process_file as the save location.
in_path = f"s3://fs-nama-data/2024/familysearch-names/interim/tree-hr-{given_surname}/"
out_path = f"s3://fs-nama-data/2024/familysearch-names/interim/tree-hr-{given_surname}-pairs/"

In [4]:
def process_file(out_path, filename):
    """Convert one parquet file of (name, alt_name) rows into word-level pairs.

    Both columns are split on whitespace, ``match_name_pairs`` is applied to
    every row to produce that row's list of pairs, and the pairs of all rows
    are flattened into a two-column (name, alt_name) frame. Rows with a
    missing value on either side are discarded before the frame is handed to
    ``save_file``.

    Args:
        out_path: Directory/prefix under which the result is saved. The output
            is named after the input file's stem with a ``.parquet`` suffix.
        filename: Path of the input parquet file. Paths starting with
            ``s3://`` are first passed through ``download_file_from_s3``
            (presumably yielding a local path -- confirm against that helper).

    Returns:
        None. The result is written via ``save_file``.
    """
    stem = Path(filename).stem

    # S3 inputs are fetched first; any other path is read as-is
    if filename.startswith("s3://"):
        source = download_file_from_s3(filename)
    else:
        source = filename
    names_df = pd.read_parquet(source)

    # tokenise both name columns on whitespace
    names_df["name_pieces"] = names_df["name"].str.split()
    names_df["alt_name_pieces"] = names_df["alt_name"].str.split()

    # per row: pair each name piece with its nearest alt_name piece
    names_df["pairs"] = names_df.apply(match_name_pairs, axis=1)

    # one row per pair; rows left without a pair after exploding are dropped
    exploded = names_df[["pairs"]].explode("pairs", ignore_index=True)
    pair_rows = exploded.dropna()["pairs"].tolist()
    pairs_df = pd.DataFrame(pair_rows, columns=["name", "alt_name"])

    # keep only pairs where both sides are present
    has_both_sides = pairs_df["name"].notna() & pairs_df["alt_name"].notna()
    pairs_df = pairs_df[has_both_sides]

    def write_parquet(local_out_path):
        # callback for save_file: dump the pairs to the path it supplies
        pairs_df.to_parquet(local_out_path)

    save_file(join(out_path, stem) + ".parquet", write_parquet)

In [5]:
# Collect every parquet file under the input prefix
input_pattern = join(in_path, "*.parquet")
filenames = glob(input_pattern)

# Fan the files out over worker processes; out_path is the pool's shared object,
# which process_file takes as its first parameter ahead of each filename
with WorkerPool(shared_objects=out_path) as pool:
    pool.map(process_file, filenames, progress_bar=True)

100%|██████████| 12000/12000 [44:19<00:00,  1.41it/s] 
