In [None]:
%load_ext autoreload
%autoreload 2

# Create dataset for training the base RoBERTa masked language model

Use the preferred tree names and the tree-record name pairs to create two lists of names for training a RoBERTa masked language model.

In [None]:
import os
import random

from mpire import WorkerPool
import pandas as pd
from tqdm import tqdm

from src.data.filesystem import glob
from src.data.normalize import normalize

In [None]:
# Name part handled by this run. It is substituted into every input/output path
# below and compared against 'surname' when normalizing.
# NOTE(review): presumably the only valid values are 'given' and 'surname' - confirm.
given_surname = 'given'

# Preferred tree names file (gzip-compressed CSV on S3); read later with pandas.
pref_path = f"s3://familysearch-names/processed/tree-preferred-{given_surname}-aggr.csv.gz"
# copy tree_hr files locally to improve performance: aws s3 sync s3://familysearch-names/interim/tree-hr-{given_surname} tree-hr-{given_surname}
# tree_hr_path = f"s3://familysearch-names/interim/tree-hr-{given_surname}/"
tree_hr_path = f"../data/tree-hr-{given_surname}/"

# Directory that receives the generated name lists.
output_dir = "../data/processed/"

In [None]:
def normalize_name(name):
    """Normalize one raw name using this notebook's fixed settings.

    Delegates to ``normalize``; the name is treated as a surname only when the
    notebook-level ``given_surname`` setting equals ``'surname'``.

    Args:
        name: The raw name string to normalize.

    Returns:
        Whatever ``normalize`` returns. Callers in this notebook treat it as a
        possibly-empty list of normalized names.
    """
    treat_as_surname = given_surname == 'surname'
    options = dict(
        is_surname=treat_as_surname,
        preserve_wildcards=False,
        handle_patronymics=True,
        dont_return_empty=False,
    )
    return normalize(name, **options)

def save_names(output_path, names):
    """Write names to a UTF-8 text file, one name per line.

    The destination directory is created if it does not exist yet, so the
    notebook also works on a fresh checkout. An existing file is overwritten.

    Args:
        output_path: Path of the text file to write.
        names: Iterable of name strings; each is written followed by a newline.
    """
    # open() does not create missing directories, so create them up front.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        for name in names:
            f.write(name + '\n')

### Save preferred tree names

In [None]:
# na_filter=False disables NA detection, so names that look like missing-value
# markers (e.g. "NA", "null") stay as strings instead of becoming NaN.
pref_df = pd.read_csv(pref_path, na_filter=False)
# Row count, then the sum of the frequency column.
print(len(pref_df))
print(pref_df['frequency'].sum())
pref_df.head(3)

In [None]:
# Build the frequency-weighted list of normalized preferred names: every
# normalized form of a name is repeated `frequency` times.
all_names = []
# zip() has no length, so pass the row count explicitly to give tqdm a total/ETA.
for name, frequency in tqdm(
    zip(pref_df['name'], pref_df['frequency']),
    total=len(pref_df),
):
    normalized_names = normalize_name(name)
    # Skip names for which normalization produced nothing.
    if len(normalized_names) == 0:
        continue
    all_names.extend(normalized_names * frequency)
print(len(all_names))
all_names[:10]

In [None]:
# The frame is not used again below; drop the reference so its memory can be reclaimed.
del pref_df

In [None]:
%%time
# Shuffle in place with a fixed seed so that re-running the notebook on the same
# input produces the same training file (an unseeded shuffle differs on every run).
random.Random(42).shuffle(all_names)
all_names[:10]

In [None]:
# Write the shuffled preferred names, one per line, into output_dir.
save_names(os.path.join(output_dir, f"all-tree-preferred-{given_surname}.txt"), all_names)

### Save tree-record matches

In [None]:
# List the parquet files under tree_hr_path using the project's glob helper
# (src.data.filesystem), then show how many were found and the first few.
filenames = glob(os.path.join(tree_hr_path,"*.parquet"))
print(len(filenames))
filenames[:5]

In [None]:
def process_file(filename):
    """Collect the normalized record names from one tree-record parquet file.

    Args:
        filename: Path of a parquet file that has an ``alt_name`` column.

    Returns:
        A flat list with the normalized forms of every ``alt_name`` value, in
        file order.
    """
    # we already have tree names, so just grab alt_name; loading only that
    # column avoids reading and decoding data that is never used.
    df = pd.read_parquet(filename, columns=['alt_name'])
    tree_record_names = []
    for alt_name in df['alt_name']:
        tree_record_names.extend(normalize_name(alt_name))
    return tree_record_names

In [None]:
# read tree-record names
# Run process_file over all files with a default-configured mpire WorkerPool and
# show a progress bar.
# NOTE(review): results is expected to hold one list of names per input file - confirm.
with WorkerPool() as pool:
    results = pool.map(process_file, filenames, progress_bar=True)
len(results)

In [None]:
# Flatten the per-file results into a single list of tree-record names.
all_names = []
for file_names in tqdm(results):
    all_names += file_names
len(all_names)

In [None]:
# The per-file lists have been merged into all_names; drop the reference to free memory.
del results

In [None]:
%%time
# Shuffle in place with a fixed seed so that re-running the notebook on the same
# input produces the same training file (an unseeded shuffle differs on every run).
random.Random(42).shuffle(all_names)
all_names[:10]

In [None]:
# Write the shuffled tree-record names, one per line, into output_dir.
save_names(os.path.join(output_dir, f"all-tree-hr-{given_surname}.txt"), all_names)