In [None]:
%load_ext autoreload
%autoreload 2

# Clean preferred names and split into separate given and surname datasets

In [None]:
from os.path import join

import pandas as pd
from pathlib import Path
from tqdm import tqdm
import unidecode

from src.data.filesystem import glob

In [None]:
# S3 locations used by the processing cell below.
# Files matching "*.gz" under in_path are read as input.
in_path = "s3://familysearch-names/raw/tree-preferred/"
# One parquet file per input file is written under each of these prefixes:
# given-name parts go to given_out_path, surname parts to surname_out_path.
given_out_path = "s3://familysearch-names/interim/tree-preferred-given/"
surname_out_path = "s3://familysearch-names/interim/tree-preferred-surname/"

In [None]:
# process files


def _clean_name_part(names: pd.Series) -> pd.Series:
    """Filter one name part (given or surname) to Latin script and normalize it.

    Parameters
    ----------
    names : pd.Series
        String series holding a single name part per row, each still carrying
        its trailing script tag (rows are kept only if they end with "~Latn").

    Returns
    -------
    pd.Series
        The rows that ended with "~Latn", with that suffix removed, lowercased,
        apostrophe-like characters deleted, every other character outside
        ``[ a-z]`` turned into a space, runs of spaces collapsed, and leading/
        trailing whitespace stripped. The original index labels and the series
        name are preserved. A row may end up as an empty string.
    """
    # filter out non-latin names; boolean indexing returns a new Series, so the
    # chained transformations below never assign into a slice of another frame
    latin = names[names.str.endswith("~Latn")]
    return (
        # remove ~Latn suffix and lowercase
        latin.str.replace(r"~Latn$", "", regex=True)
        .str.lower()
        # remove apostrophes entirely so e.g. o'brien becomes obrien
        .str.replace("[`'´‘’]", "", regex=True)
        # replace every other non-alpha character with a space
        .str.replace("[^ a-z]", " ", regex=True)
        # collapse runs of spaces and strip
        .str.replace(" +", " ", regex=True)
        .str.strip()
    )


filenames = glob(in_path + "*.gz")
for filename in tqdm(filenames):
    # Path.stem drops only the final suffix (".gz")
    basename = Path(filename).stem

    # every input line is loaded into the single "name" column;
    # na_filter=False keeps strings such as "NA" or "" as text instead of NaN
    # NOTE(review): presumably "|" never occurs in the data, so each line stays
    # in one field -- confirm against the raw files
    names = pd.read_csv(
        filename,
        sep="|",
        compression="gzip",
        names=["name"],
        dtype={"name": str},
        na_filter=False,
        encoding="utf-8",
    )["name"]

    # remove diacritics (do it now so we just have to map once for the full name)
    # NOTE(review): transliteration runs before the "^" / "~Latn" parsing below;
    # if unidecode can emit "^" or "~" for some input characters, that could
    # affect the split -- confirm
    names = names.map(unidecode.unidecode)

    # split names into given and surname:
    #   given   = text before the first "^" (drop from the first "^" to the end)
    #   surname = text after the last "^"   (greedy match drops through the last "^")
    # a line without "^" is left unchanged by both replacements
    # raw strings are required: "\^" in a normal string literal is an invalid escape
    given_df = _clean_name_part(
        names.str.replace(r"\^.*$", "", regex=True)
    ).to_frame()
    surname_df = _clean_name_part(
        names.str.replace(r"^.*\^", "", regex=True)
    ).to_frame()
    del names

    # write files
    given_df.to_parquet(
        join(given_out_path, basename) + ".parquet", engine="pyarrow", compression="snappy"
    )
    surname_df.to_parquet(
        join(surname_out_path, basename) + ".parquet", engine="pyarrow", compression="snappy"
    )