In [1]:
%load_ext autoreload
%autoreload 2

# Split similar name pairs into train (frequent) and test (rare)

Take the similar pairs generated by notebook 40, remove bad pairs identified by Clorinda, and split into train and test. Train contains only rows where both the tree name and the record name are among the most frequent tree preferred names. The assumption is that more-frequent names will have more-reliable tree-record attachment frequencies for training purposes.

In [2]:
from collections import Counter

import pandas as pd

from nama.data.filesystem import download_file_from_s3, save_file

In [3]:
# TODO run for given and surname
given_surname = "given"
# given_surname = "surname"
sample_name = "john"
train_size = 0.45

in_path = f"s3://fs-nama-data/2024/familysearch-names/processed/tree-hr-{given_surname}-similar.csv.gz"
pref_path = f"s3://fs-nama-data/2024/familysearch-names/processed/tree-preferred-{given_surname}-aggr.csv.gz"
# re-use Clorinda's bad pairs from last year
bad_pairs_path=f"s3://fs-nama-data/2023/familysearch-names/interim/{given_surname}_variants_clorinda_reviewed.tsv"

train_path = f"s3://fs-nama-data/2024/familysearch-names/processed/tree-hr-{given_surname}-train.csv.gz"
test_path = f"s3://fs-nama-data/2024/familysearch-names/processed/tree-hr-{given_surname}-test.csv.gz"

## Read similar name pairs

In [4]:
in_path = download_file_from_s3(in_path) if in_path.startswith("s3://") else in_path
df = pd.read_csv(in_path, na_filter=False)
print(df.shape)
df.head(3)

(4797249, 10)


Unnamed: 0,name,alt_name,frequency,reverse_frequency,sum_name_frequency,total_name_frequency,total_alt_name_frequency,ordered_prob,unordered_prob,similarity
0,a,a,1622927,1622927,2578937,36295683,36295683,0.629301,0.046807,1.0
1,aa,a,139,154,482,5067,36295683,0.288382,8e-06,0.5
2,aa,aa,45,45,482,5067,5067,0.093361,0.008961,1.0


In [5]:
# name = tree_name, alt_name = record_name
df = df[["name", "alt_name", "frequency"]]
df.rename(columns={"name": "tree_name", "alt_name": "record_name"}, inplace=True)
print(df.shape)
df.head(3)

(4797249, 3)


Unnamed: 0,tree_name,record_name,frequency
0,a,a,1622927
1,aa,a,139
2,aa,aa,45


## Remove non-alpha names

In [6]:
alpha = r"[a-z]+"
df = df[(df["tree_name"].str.fullmatch(alpha)) & (df["record_name"].str.fullmatch(alpha))]
print(df.shape)

(4797249, 3)


## Remove bad pairs

In [7]:
bad_pairs_path = download_file_from_s3(bad_pairs_path) if bad_pairs_path.startswith("s3://") else bad_pairs_path
bad_pairs_df = pd.read_csv(bad_pairs_path, 
                           na_filter=False,
                           sep='\t', 
                           names=["name1", "name2"])

In [8]:
print(bad_pairs_df.shape)
bad_pairs_df.head(3)

(19526, 2)


Unnamed: 0,name1,name2
0,aage,angel
1,aage,angie
2,aaltje,adaline


In [9]:
# remove tree_name, record_name matches
df = pd.merge(df, bad_pairs_df, 
              left_on=['tree_name','record_name'],right_on=['name1','name2'],
              how='left', indicator=True)
df = df[df['_merge'] != 'both'].drop(columns=['name1', 'name2', '_merge'])
# remove record_name, tree_name matches
df = pd.merge(df, bad_pairs_df, 
              left_on=['tree_name','record_name'],right_on=['name2','name1'],
              how='left', indicator=True)
df = df[df['_merge'] != 'both'].drop(columns=['name1', 'name2', '_merge'])
print(df.shape)
df.head(20)

(4775571, 3)


Unnamed: 0,tree_name,record_name,frequency
0,a,a,1622927
1,aa,a,139
2,aa,aa,45
3,aa,aae,5
4,aa,ana,12
5,aa,anna,76
6,aaaard,aagaard,12
7,aaafke,aafke,6
8,aaage,aage,2
9,aaage,age,9


## Read name frequencies

In [10]:
pref_path = download_file_from_s3(pref_path) if pref_path.startswith("s3://") else pref_path
print(pref_path)
pref_df = pd.read_csv(pref_path, na_filter=False)

/tmp/tmpol_vbars.gz


BadGzipFile: Not a gzipped file (b'PA')

In [None]:
print(pref_df.shape)
pref_df.head(3)

## Split based upon frequency as a preferred name

In [None]:
# map name -> frequency
name_frequency = pref_df.set_index(pref_df.columns[0]).to_dict()[pref_df.columns[1]]
name_frequency[sample_name]

In [None]:
# get frequency of all_names (tree or record) in df
counter = Counter()
for name in set(df["tree_name"]) | set(df["record_name"]):
    counter[name] = name_frequency.get(name, 0)
len(counter)

In [None]:
# what is the least most-common name?
size = int(train_size * len(counter))
print(size)
counter.most_common()[size-5:size+5]

In [None]:
df[(df["tree_name"] == "weober") | (df["record_name"] == "weober")]

In [None]:
# keep the top train_size as training names, put the rest in test
train_names = set(name for name, _ in counter.most_common(size))
len(train_names)

In [None]:
train_df = df[(df["tree_name"].isin(train_names)) & (df["record_name"].isin(train_names))]
train_df.shape

In [None]:
test_df = df[(~df["tree_name"].isin(train_names)) | (~df["record_name"].isin(train_names))]
test_df.shape

## Write train and test 

In [None]:
save_file(train_path,
          lambda local_out_path : train_df.to_csv(local_out_path, index=False))

In [None]:
save_file(test_path,
          lambda local_out_path : test_df.to_csv(local_out_path, index=False))