In [None]:
%load_ext autoreload
%autoreload 2

# Split similar name pairs into train (frequent) and test (rare)

Take the similar pairs generated by notebook 40, remove bad pairs identified by Clorinda, and split into train and test. Train contains only rows where both the tree name and the record name are among the most frequent tree preferred names; every other row (at least one of its names is not among the most frequent) goes to test. The assumption is that more-frequent names will have more-reliable tree-record attachment frequencies for training purposes.

NOTE: `tree-hr-{given_surname}-train-v2.csv.gz` is not the same as `tree-hr-{given_surname}-train.csv.gz`. The v2 file has a different file format, and v2 uses a different approach to splitting train and test.

In [None]:
from collections import Counter

import pandas as pd

In [None]:
# --- Parameters ---
# interpolated into every path below
given_surname = "given"
# name whose frequency is displayed as a sanity check once frequencies are loaded
sample_name = "john"
# fraction of the distinct names (most frequent first) that count as training names
train_size = 0.45

# --- Inputs ---
in_path = f"s3://familysearch-names/processed/tree-hr-{given_surname}-similar-v2.csv.gz"
pref_path = f"s3://familysearch-names/processed/tree-preferred-{given_surname}-aggr.csv.gz"
bad_pairs_path = f"s3://familysearch-names/interim/{given_surname}_variants_clorinda_reviewed.tsv"

# --- Outputs ---
train_path = f"s3://familysearch-names/processed/tree-hr-{given_surname}-train-v2.csv.gz"
test_path = f"s3://familysearch-names/processed/tree-hr-{given_surname}-test-v2.csv.gz"

## Read similar name pairs

In [None]:
# Load the similar-name pairs. na_filter=False turns off NA detection, so
# strings such as "na" or "null" stay literal names instead of becoming NaN.
df = pd.read_csv(in_path, na_filter=False)
# sanity check: row/column count and a short preview
print(df.shape)
df.head(3)

In [None]:
# Keep only the three columns used downstream and give the name columns
# explicit roles: "name" is the tree name, "alt_name" is the record name.
df = df.loc[:, ["name", "alt_name", "frequency"]].rename(
    columns={"name": "tree_name", "alt_name": "record_name"}
)
print(df.shape)
df.head(3)

## Remove non-alpha names

In [None]:
# A pair survives only if BOTH names consist entirely of lowercase a-z
# (at least one character); anything else is dropped.
alpha = r"[a-z]+"
tree_is_alpha = df["tree_name"].str.fullmatch(alpha)
record_is_alpha = df["record_name"].str.fullmatch(alpha)
df = df[tree_is_alpha & record_is_alpha]
print(df.shape)

## Remove bad pairs

In [None]:
# Load the reviewed bad pairs from a tab-separated file. Column names are
# supplied here, so the first line of the file is read as data, not a header.
# NOTE(review): assumes the file has exactly two columns and no header row - confirm.
bad_pairs_df = pd.read_csv(bad_pairs_path, 
                           na_filter=False,
                           sep='\t', 
                           names=["name1", "name2"])

In [None]:
# sanity check: number of bad pairs loaded and a short preview
print(bad_pairs_df.shape)
bad_pairs_df.head(3)

In [None]:
# Anti-join against the bad pairs in both orientations: first
# (tree_name, record_name) == (name1, name2), then the swapped order.
for bad_keys in (["name1", "name2"], ["name2", "name1"]):
    df = df.merge(
        bad_pairs_df,
        how="left",
        left_on=["tree_name", "record_name"],
        right_on=bad_keys,
        indicator=True,  # adds "_merge"; "both" marks rows that matched a bad pair
    )
    # keep the unmatched rows and discard the helper columns the merge added
    df = df.loc[df["_merge"].ne("both")].drop(columns=["name1", "name2", "_merge"])
print(df.shape)
df.head(20)

## Read name frequencies

In [None]:
# Load the aggregated tree preferred-name frequencies. na_filter=False turns off
# NA detection, so name strings are never converted to NaN.
pref_df = pd.read_csv(pref_path, na_filter=False)

In [None]:
# sanity check: number of rows/columns loaded and a short preview
print(pref_df.shape)
pref_df.head(3)

## Split based upon frequency as a preferred name

In [None]:
# Build a name -> frequency lookup: the first column of pref_df supplies the
# keys and the second column supplies the values.
key_col = pref_df.columns[0]
freq_col = pref_df.columns[1]
name_frequency = pref_df.set_index(key_col)[freq_col].to_dict()
# sanity check: show the frequency recorded for the sample name
name_frequency[sample_name]

In [None]:
# Get the preferred-name frequency of every name (tree or record) in df;
# names with no entry in name_frequency count as 0.
#
# Insert the names in sorted order. Counter.most_common() breaks ties by
# insertion order, and the iteration order of a set of strings can change from
# one kernel session to the next (string hash randomization). Without sorting,
# the names with equal frequency at the train/test cut-off would differ between
# runs, making the split non-reproducible.
all_names = sorted(set(df["tree_name"]) | set(df["record_name"]))
counter = Counter({name: name_frequency.get(name, 0) for name in all_names})
len(counter)

In [None]:
# Which names sit right at the train/test cut-off?
# size = how many of the most frequent names count as training names
size = int(train_size * len(counter))
print(size)
# Show the five names on each side of the cut-off. The start is clamped at 0:
# a negative start would wrap around to the end of the list and show the
# wrong window when size < 5.
counter.most_common()[max(size - 5, 0):size + 5]

In [None]:
# spot check: show every pair that has "weober" as either the tree or the record name
df.loc[df["tree_name"].eq("weober") | df["record_name"].eq("weober")]

In [None]:
# The `size` highest-frequency names are the training names; every other name
# can only appear in test pairs.
train_names = {name for name, _ in counter.most_common(size)}
len(train_names)

In [None]:
# a pair goes to train only when BOTH of its names are training names
tree_in_train = df["tree_name"].isin(train_names)
record_in_train = df["record_name"].isin(train_names)
train_df = df[tree_in_train & record_in_train]
train_df.shape

In [None]:
# test gets every pair that is NOT made up of two training names, i.e. at
# least one of its names falls outside train_names
both_in_train = df["tree_name"].isin(train_names) & df["record_name"].isin(train_names)
test_df = df[~both_in_train]
test_df.shape

## Write train and test 

In [None]:
# write the training pairs to train_path; index=False leaves the row index out of the file
train_df.to_csv(train_path, index=False)

In [None]:
# write the test pairs to test_path; index=False leaves the row index out of the file
test_df.to_csv(test_path, index=False)