In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so that edits to imported source files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Clean raw names and split into separate given and surname datasets

In [22]:
from os.path import join

import csv
from mpire import WorkerPool
import pandas as pd
from pathlib import Path

from nama.data.filesystem import glob, download_file_from_s3, save_file
from nama.data.normalize import normalize

In [3]:
# S3 prefix that is globbed for the raw "*.gz" input files.
in_path = "s3://fs-nama-data/2024/familysearch-names/raw/tree-hr/"
# S3 prefixes that receive one parquet file per input file: given-name pairs
# and surname pairs respectively.
given_out_path = "s3://fs-nama-data/2024/familysearch-names/interim/tree-hr-given/"
surname_out_path = "s3://fs-nama-data/2024/familysearch-names/interim/tree-hr-surname/"

In [4]:
def normalize_given_and_join(name):
    """Normalize a given name and return its pieces as one string.

    The pieces produced by ``normalize`` (called with ``is_surname=False``)
    are joined with single spaces and surrounding whitespace is stripped.
    The result can be an empty string when there are no non-blank pieces.
    """
    pieces = normalize(name, is_surname=False, dont_return_empty=False)
    joined = " ".join(pieces)
    return joined.strip()

def normalize_surname_and_join(name):
    """Normalize a surname and return its pieces as one string.

    The pieces produced by ``normalize`` (called with ``is_surname=True``)
    are joined with single spaces and surrounding whitespace is stripped.
    The result can be an empty string when there are no non-blank pieces.
    """
    pieces = normalize(name, is_surname=True, dont_return_empty=False)
    joined = " ".join(pieces)
    return joined.strip()

In [23]:
def _clean_name_pairs(pairs_df, discard_pattern, normalize_and_join):
    """Isolate one name part from every raw pair and clean it.

    Args:
        pairs_df: DataFrame with string columns "name" and "alt_name".
        discard_pattern: regular expression removed from every value; what is
            left is the name part of interest, still carrying its script tag.
        normalize_and_join: callable mapping one name string to its normalized,
            space-joined form.

    Returns:
        A new DataFrame with columns "name" and "alt_name" containing only the
        rows where both values were tagged "~Latn", are non-empty after
        normalization, and differ from each other. Index labels of the
        surviving rows are those of ``pairs_df``.
    """
    columns = ["name", "alt_name"]
    part_df = pairs_df[columns].copy()

    # drop the unwanted half of each "^"-separated name
    for column in columns:
        part_df[column] = part_df[column].str.replace(discard_pattern, "", regex=True)

    # filter out non-latin names
    is_latin = (
        part_df["name"].str.endswith("~Latn")
        & part_df["alt_name"].str.endswith("~Latn")
    )
    # Take an explicit copy: assigning columns on a boolean-filtered frame is
    # the pattern that raises pandas' SettingWithCopyWarning.
    part_df = part_df[is_latin].copy()

    # remove the ~Latn suffix, then normalize and re-join the pieces
    for column in columns:
        part_df[column] = (
            part_df[column]
            .str.replace("~Latn$", "", regex=True)
            .map(normalize_and_join)
        )

    # remove empty names and exact matches
    keep = (
        (part_df["name"] != "")
        & (part_df["alt_name"] != "")
        & (part_df["name"] != part_df["alt_name"])
    )
    return part_df[keep]


def process_file(shared, filename):
    """Convert one raw name-pair file into given-name and surname parquet files.

    The input is read as a gzip-compressed, "|"-separated, unquoted UTF-8 file
    with two columns (name, alt_name). In each value the text before the first
    "^" is used as the given part and the text after the last "^" as the
    surname part. Each part is cleaned independently (Latin-script filter,
    normalization, removal of empty and identical pairs) and written to
    ``<out_path>/<basename>.parquet`` via ``save_file``.

    Args:
        shared: tuple ``(given_out_path, surname_out_path)`` of output prefixes.
        filename: local path or "s3://" URL of the input file. S3 URLs are
            first passed through ``download_file_from_s3`` (presumably
            returning a local path -- confirm against that helper).

    Returns:
        None. The results are the two files written.
    """
    given_out_path, surname_out_path = shared
    # derive the output name from the original filename before it is replaced
    # by the downloaded copy
    basename = Path(filename).stem

    filename = download_file_from_s3(filename) if filename.startswith("s3://") else filename

    df = pd.read_csv(
        filename,
        sep="|",
        compression="gzip",
        names=["name", "alt_name"],
        dtype={"name": str, "alt_name": str},
        na_filter=False,  # keep literal strings such as "NA" instead of NaN
        encoding="utf-8",
        quoting=csv.QUOTE_NONE,
    )

    # split names into given and surname and clean each side
    given_df = _clean_name_pairs(df, r"\^.*$", normalize_given_and_join)
    surname_df = _clean_name_pairs(df, r"^.*\^", normalize_surname_and_join)
    del df

    # write files
    save_file(
        join(given_out_path, basename) + ".parquet",
        lambda local_out_path: given_df.to_parquet(
            local_out_path, engine="pyarrow", compression="snappy"
        ),
    )
    save_file(
        join(surname_out_path, basename) + ".parquet",
        lambda local_out_path: surname_df.to_parquet(
            local_out_path, engine="pyarrow", compression="snappy"
        ),
    )

In [26]:
# Process every raw input file, fanning the work out across a worker pool.
filenames = glob(join(in_path, "*.gz"))
print(len(filenames))
shared_out_paths = (given_out_path, surname_out_path)
with WorkerPool(shared_objects=shared_out_paths) as pool:
    # the shared tuple is handed to process_file as its first argument
    pool.map(process_file, filenames, progress_bar=True)

12000


100%|██████████| 12000/12000 [15:54:24<00:00, 28.94s/it]  


In [25]:
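# Debugging aid: uncomment to run process_file on a single input file in this
# kernel, without the worker pool.
# filename = "s3://fs-nama-data/2024/familysearch-names/raw/tree-hr/part-00063.gz"
# process_file((given_out_path, surname_out_path), filename)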