In [1]:
%load_ext autoreload
%autoreload 2

# Clean preferred names and split into separate given and surname datasets

In [4]:
from os.path import join

import csv
from mpire import WorkerPool
import pandas as pd
from pathlib import Path

from nama.data.filesystem import glob, download_file_from_s3, save_file
from nama.data.normalize import normalize

In [5]:
# Raw input prefix: the processing cell globs "*.gz" files under this location.
in_path = "s3://fs-nama-data/2024/familysearch-names/raw/tree-preferred/"
# Output prefixes: process_file writes one "<input stem>.parquet" file under each
# of these, one for given names and one for surnames.
given_out_path = "s3://fs-nama-data/2024/familysearch-names/interim/tree-preferred-given/"
surname_out_path = "s3://fs-nama-data/2024/familysearch-names/interim/tree-preferred-surname/"

In [6]:
def normalize_given_and_join(name):
    """Normalize a given name and return its pieces as a single string.

    The name is handed to ``normalize`` with ``is_surname=False`` and
    ``dont_return_empty=False``. The pieces it returns are joined with single
    spaces, and leading/trailing whitespace is stripped from the result.
    """
    pieces = normalize(name, is_surname=False, dont_return_empty=False)
    joined = " ".join(pieces)
    return joined.strip()

def normalize_surname_and_join(name):
    """Normalize a surname and return its pieces as a single string.

    The name is handed to ``normalize`` with ``is_surname=True`` and
    ``dont_return_empty=False``. The pieces it returns are joined with single
    spaces, and leading/trailing whitespace is stripped from the result.
    """
    pieces = normalize(name, is_surname=True, dont_return_empty=False)
    joined = " ".join(pieces)
    return joined.strip()

In [7]:
def _split_clean_names(names, strip_pattern, normalize_and_join):
    """Extract one half (given or surname) of each raw name and clean it.

    Args:
        names: pandas Series of raw name strings.
        strip_pattern: regex whose match is deleted from every raw name,
            leaving only the wanted half.
        normalize_and_join: callable mapping one lowercased name to its
            normalized, space-joined form.

    Returns:
        Single-column DataFrame (column "name") holding the non-empty cleaned
        names; rows keep the index labels they had in ``names``.
    """
    part = names.str.replace(strip_pattern, "", regex=True)
    # filter out non-latin names (only values carrying the "~Latn" suffix are kept)
    part = part[part.str.endswith("~Latn")]
    # remove ~Latn suffix and lowercase
    part = part.str.replace("~Latn$", "", regex=True).str.lower()
    # normalize names and join the pieces back into a single space-separated string
    part = part.map(normalize_and_join)
    # remove empty names
    return part[part != ""].to_frame()


def process_file(shared, filename):
    """Clean one raw preferred-name file and write given/surname parquet files.

    Args:
        shared: ``(given_out_path, surname_out_path)`` tuple of output prefixes.
        filename: path or ``s3://`` URL of a gzip-compressed input file with
            one raw name per line. ``s3://`` inputs are first fetched with
            ``download_file_from_s3``.

    Each raw name is split on "^": the given part is what precedes the first
    "^", the surname part is what follows the last "^". Both halves go through
    the same cleaning steps (see ``_split_clean_names``) and are written to
    ``<out_path>/<input stem>.parquet`` via ``save_file``.

    NOTE(review): a row containing no "^" is left whole by both split patterns
    and is therefore processed as both a given name and a surname. Confirm
    that this is intended for the raw data.
    """
    given_out_path, surname_out_path = shared
    basename = Path(filename).stem

    filename = download_file_from_s3(filename) if filename.startswith("s3://") else filename

    # Only the single "name" column is needed, so keep it as a Series instead of
    # copying the whole frame once per output.
    names = pd.read_csv(
        filename,
        sep="|",
        compression="gzip",
        names=["name"],
        dtype={"name": str},
        na_filter=False,
        encoding="utf-8",
        encoding_errors="replace",
        on_bad_lines="warn",
        quoting=csv.QUOTE_NONE,
    )["name"]

    # split names into given and surname, then clean each half
    given_df = _split_clean_names(names, r"\^.*$", normalize_given_and_join)
    surname_df = _split_clean_names(names, r"^.*\^", normalize_surname_and_join)
    del names

    # write files
    save_file(
        join(given_out_path, basename) + ".parquet",
        lambda local_out_path: given_df.to_parquet(local_out_path, engine="pyarrow", compression="snappy"),
    )
    save_file(
        join(surname_out_path, basename) + ".parquet",
        lambda local_out_path: surname_df.to_parquet(local_out_path, engine="pyarrow", compression="snappy"),
    )

In [12]:
# process files
# Glob the "*.gz" inputs under in_path, print how many were found, then map
# process_file over them with an mpire WorkerPool. The two output prefixes are
# supplied as the pool's shared objects, which process_file receives as its
# first (`shared`) argument.
# The recorded run below covered 12000 files in roughly 6h18m, so re-running
# this cell is expensive.
filenames = glob(join(in_path,"*.gz"))
print(len(filenames))
with WorkerPool(shared_objects=(given_out_path, surname_out_path)) as pool:
    pool.map(process_file, filenames, progress_bar=True)

12000


100%|██████████| 12000/12000 [6:18:14<00:00,  5.88s/it]  
