In [None]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so edits to imported project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Clean raw names and split into separate given and surname datasets

In [None]:
from os.path import join
from pathlib import Path
from tqdm import tqdm
import pandas as pd

from src.data.prepare import normalize

In [None]:
# Input directory: scanned for "*.gz" files by the processing cell below.
# All three paths are relative, so they resolve against the kernel's working directory.
in_path = "../data-raw/tree-hr/"
# Output directory for the given-name parquet files (one per input file).
given_out_path = "../data-raw/tree-hr-given/"
# Output directory for the surname parquet files (one per input file).
surname_out_path = "../data-raw/tree-hr-surname/"

In [None]:
def normalize_given_and_join(name):
    """Normalize a given name and return its pieces as one space-separated string.

    Calls ``normalize(name, False)`` and joins the resulting pieces with a
    single space. The ``False`` flag presumably selects given-name (non-surname)
    handling -- confirm against ``src.data.prepare.normalize``.
    """
    pieces = normalize(name, False)
    separator = " "
    return separator.join(pieces)

def normalize_surname_and_join(name):
    """Normalize a surname and return its pieces as one space-separated string.

    Calls ``normalize(name, True)`` and joins the resulting pieces with a
    single space. The ``True`` flag presumably selects surname handling --
    confirm against ``src.data.prepare.normalize``.
    """
    pieces = normalize(name, True)
    separator = " "
    return separator.join(pieces)

In [None]:
# process files
# TODO switch this over to mpire

# Columns present in every input file and in both outputs.
_NAME_COLUMNS = ["name", "alt_name"]


def _extract_name_part(pairs_df, drop_pattern, normalize_and_join):
    """Build the cleaned given-name or surname pairs from raw full-name pairs.

    Parameters
    ----------
    pairs_df : pandas.DataFrame
        Frame with string columns "name" and "alt_name" holding raw names.
        It is not modified.
    drop_pattern : str
        Regular expression matching the part of each raw name to delete, so
        that only the wanted piece (given name or surname) remains.
    normalize_and_join : callable
        Maps one name string to its normalized, space-joined form.

    Returns
    -------
    pandas.DataFrame
        Rows (original index labels preserved) where both names carried the
        "~Latn" suffix and the normalized "name" differs from "alt_name".
    """
    part_df = pairs_df[_NAME_COLUMNS].copy()

    # keep only the requested piece of each name
    for col in _NAME_COLUMNS:
        part_df[col] = part_df[col].str.replace(drop_pattern, "", regex=True)

    # filter out non-latin names; the explicit copy makes the column
    # assignments below operate on an independent frame instead of a slice
    # (avoids pandas' SettingWithCopyWarning)
    is_latin = (
        part_df["name"].str.endswith("~Latn")
        & part_df["alt_name"].str.endswith("~Latn")
    )
    part_df = part_df[is_latin].copy()

    for col in _NAME_COLUMNS:
        # remove ~Latn suffix
        without_suffix = part_df[col].str.replace(r"~Latn$", "", regex=True)
        # normalize names and join the pieces back into a single space-separated string
        part_df[col] = without_suffix.map(normalize_and_join)

    # remove exact matches
    return part_df[part_df["name"] != part_df["alt_name"]]


# to_parquet does not create missing directories, so make sure they exist
for out_dir in (given_out_path, surname_out_path):
    Path(out_dir).mkdir(parents=True, exist_ok=True)

filenames = list(Path(in_path).glob("*.gz"))
for f in tqdm(filenames):
    # Path.stem drops only the final ".gz" suffix
    basename = f.stem

    df = pd.read_csv(
        f,
        sep="|",
        compression="gzip",
        names=["name", "alt_name"],
        dtype={"name": str, "alt_name": str},
        na_filter=False,  # keep empty fields as "" rather than NaN
        encoding="utf-8",
    )

    # split names into given and surname:
    #   given   = what is left after deleting everything from the first "^" onward
    #   surname = what is left after deleting everything up to the last "^"
    # (raw strings: "\^" is not a valid escape in a normal string literal)
    given_df = _extract_name_part(df, r"\^.*$", normalize_given_and_join)
    surname_df = _extract_name_part(df, r"^.*\^", normalize_surname_and_join)
    del df

    # write files
    given_df.to_parquet(
        join(given_out_path, basename) + ".parquet", engine="pyarrow", compression="snappy"
    )
    surname_df.to_parquet(
        join(surname_out_path, basename) + ".parquet", engine="pyarrow", compression="snappy"
    )