In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Augment the training dataset with other matching pairs based upon names having the same code or high Levenshtein similarity
We found that even though we had millions of pairs of names, it wasn't nearly enough.
Many good name pairs were not represented in the data.
So we augmented the data with additional pairs and gave them frequencies based upon how many coding algorithms assigned them the same code and how high their Levenshtein similarity was.
This turns out to be key to the success of the algorithm.
Without augmentation, these pairs would be represented as negatives, which is a significant issue.

This notebook takes about 24 hours to run for given names, and probably about 72 hours for surnames.

TODO: Currently, we don't add scores for having the same Soundex code, high Levenshtein similarity, etc. to a name paired with itself.
Maybe we should?

In [2]:
from collections import namedtuple

import pandas as pd

from nama.data.augment import generate_augmented_name_pairs, augment_dataset
from nama.data.filesystem import download_file_from_s3, save_file
from nama.data.utils import load_dataset

In [4]:
# Configuration
#
# Hardware note: run on 1536Gb machine high-memory instance

# TODO do for given and surname
# Which name type to process; swap in the commented-out line to do surnames.
given_surname = "given"
# given_surname = "surname"

# Tunable settings for the augmentation run.
threshold = 3
augment_discount = 1
original_data_multiplier = 8
batch_size = 5000
n_jobs = 64

# Immutable bundle of the input/output locations and augmentation settings.
Config = namedtuple(
    "Config",
    [
        "in_path",
        "threshold",
        "augment_discount",
        "original_data_multiplier",
        "augments_path",
        "combined_path",
    ],
)

config = Config(
    # Locations: the training input, the generated augments, and the merged result.
    in_path=f"s3://fs-nama-data/2024/familysearch-names/processed/tree-hr-{given_surname}-train.csv.gz",
    augments_path=f"s3://fs-nama-data/2024/familysearch-names/processed/tree-hr-{given_surname}-train-augments.csv.gz",
    combined_path=f"s3://fs-nama-data/2024/familysearch-names/processed/tree-hr-{given_surname}-train-augmented.csv.gz",
    # Settings copied from the variables above.
    threshold=threshold,
    augment_discount=augment_discount,
    original_data_multiplier=original_data_multiplier,
)

In [5]:
# Load the training data. An s3:// path goes through download_file_from_s3;
# any other path is handed to load_dataset as-is.
if config.in_path.startswith("s3://"):
    in_path = download_file_from_s3(config.in_path)
else:
    in_path = config.in_path
input_names, record_name_frequencies, candidate_names = load_dataset(in_path)

In [6]:
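# sample to make things go faster
# NOTE: select_frequent_k is not imported in the import cell above; import it before
# enabling these lines. sample_size is not used by the call below.
# sample_size = 10000
# batch_size = 1000
# input_names, record_name_frequencies, candidate_names = select_frequent_k(input_names, record_name_frequencies, candidate_names, k=batch_size)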

In [7]:
# Count the distinct names that occur as either an input name or a candidate name.
all_names = set(input_names) | set(candidate_names)
print(len(all_names))

836285


In [None]:
# Generate the augmented name pairs. The threshold and discount come from
# `config`; batch_size and n_jobs are the module-level values set in the config cell.
name_pairs_df = generate_augmented_name_pairs(
    input_names,
    record_name_frequencies,
    candidate_names,
    batch_size=batch_size,
    n_jobs=n_jobs,
    threshold=config.threshold,
    discount=config.augment_discount,
)

  0%|                                                                                         | 0/168 [2:01:50<?, ?it/s]

In [None]:
# Rename the generated columns to the tree_name / record_name / frequency
# names that the later cells of this notebook use.
column_renames = {
    "name1": "tree_name",
    "name2": "record_name",
    "co_occurrence": "frequency",
}
name_pairs_df = name_pairs_df.rename(columns=column_renames)

In [None]:
# Report how many augmented pairs were generated, then preview the first ten.
print(len(name_pairs_df.index))
name_pairs_df.head(10)

In [None]:
# Release the references to the loaded dataset so its memory can be reclaimed.
input_names = None
record_name_frequencies = None
candidate_names = None

In [None]:
# Write the augmented pairs as CSV (index omitted) via save_file, targeting
# config.augments_path.
def write_augments_csv(local_out_path):
    """Write name_pairs_df as CSV to the local path supplied by save_file."""
    return name_pairs_df.to_csv(local_out_path, index=False)


save_file(config.augments_path, write_augments_csv)

In [None]:
# Read the original training file into a DataFrame. An s3:// path goes through
# download_file_from_s3; any other path is read directly.
source_is_s3 = config.in_path.startswith("s3://")
if source_is_s3:
    in_path = download_file_from_s3(config.in_path)
else:
    in_path = config.in_path
raw_input_df = pd.read_csv(in_path)

In [None]:
# Summarise the raw input: row count, distinct tree names, distinct record names,
# and distinct names overall, followed by a preview of the first ten rows.
print(len(raw_input_df))
for column in ("tree_name", "record_name"):
    print(len(set(raw_input_df[column])))
print(len(set(raw_input_df["tree_name"]) | set(raw_input_df["record_name"])))
raw_input_df.head(10)

In [None]:
# Combine the raw input pairs with the augmented pairs, passing
# config.original_data_multiplier as the multiplier.
augmented_df = augment_dataset(
    raw_input_df,
    name_pairs_df,
    multiplier=config.original_data_multiplier,
)

In [None]:
# Release the references to the two intermediate frames so their memory can be reclaimed.
raw_input_df = None
name_pairs_df = None

In [None]:
# Summarise the augmented dataset: row count, distinct tree names, distinct
# record names, and distinct names overall, followed by the first ten rows.
print(len(augmented_df))
for column in ("tree_name", "record_name"):
    print(len(set(augmented_df[column])))
print(len(set(augmented_df["tree_name"]) | set(augmented_df["record_name"])))
augmented_df.head(10)

In [None]:
# Write the merged dataset as CSV (index omitted) via save_file, targeting
# config.combined_path.
def write_combined_csv(local_out_path):
    """Write augmented_df as CSV to the local path supplied by save_file."""
    return augmented_df.to_csv(local_out_path, index=False)


save_file(config.combined_path, write_combined_csv)

## Analyze results

In [None]:
# Total number of rows in the augmented dataset.
print(len(augmented_df.index))

In [None]:
# Largest frequency value in the augmented dataset.
highest_frequency = augmented_df["frequency"].max()
print(highest_frequency)

In [None]:
# Number of pairs whose frequency is exactly 2.
frequency_is_2 = augmented_df["frequency"] == 2
print(len(augmented_df[frequency_is_2]))

In [None]:
# Number of pairs whose frequency is exactly 3.
frequency_is_3 = augmented_df["frequency"] == 3
print(len(augmented_df[frequency_is_3]))

In [None]:
# Number of pairs whose frequency is exactly 4.
frequency_is_4 = augmented_df["frequency"] == 4
print(len(augmented_df[frequency_is_4]))

In [None]:
# Number of pairs whose frequency is greater than 1000.
frequency_over_1000 = augmented_df["frequency"] > 1000
print(len(augmented_df[frequency_over_1000]))

In [None]:
# Histogram of the frequencies that are at most 100, using 100 bins and a
# log-scaled count axis.
low_frequencies = augmented_df.loc[augmented_df["frequency"] <= 100, "frequency"]
low_frequencies.plot.hist(bins=100, figsize=(20, 6), logy=True)

In [None]:
# Show up to 100 randomly chosen pairs whose frequency is exactly 4.
# The sample size is capped at the number of matching rows because
# DataFrame.sample raises ValueError when asked for more rows than exist.
frequency_4_pairs = augmented_df[augmented_df["frequency"] == 4]
frequency_4_pairs.sample(min(100, len(frequency_4_pairs)))

In [None]:
# Show up to 100 randomly chosen pairs whose frequency is exactly 3.
# The sample size is capped at the number of matching rows because
# DataFrame.sample raises ValueError when asked for more rows than exist.
frequency_3_pairs = augmented_df[augmented_df["frequency"] == 3]
frequency_3_pairs.sample(min(100, len(frequency_3_pairs)))

In [None]:
# Show up to 100 randomly chosen pairs whose frequency is exactly 2.
# The sample size is capped at the number of matching rows because
# DataFrame.sample raises ValueError when asked for more rows than exist.
frequency_2_pairs = augmented_df[augmented_df["frequency"] == 2]
frequency_2_pairs.sample(min(100, len(frequency_2_pairs)))