In [None]:
%load_ext autoreload
%autoreload 2

# Augment the training dataset with additional matching pairs based on names sharing the same code or having high Levenshtein similarity
We found that even though we had millions of name pairs, that was not nearly enough.
Many good name pairs were not represented in the data.
So we augmented the data with additional pairs and gave them frequencies based on how many coding algorithms assigned both names the same code and on how similar the names were by Levenshtein similarity.
This turned out to be key to the success of the algorithm.
Without augmentation, these missing pairs are treated as negatives, which is a significant issue.

In [None]:
from collections import namedtuple

import pandas as pd
import wandb

from src.data.augment import generate_augmented_name_pairs, augment_dataset
from src.data.utils import load_dataset

In [None]:
# run on m1024

In [None]:
# Config

# Name type being processed; it selects the S3 file names below and the W&B run group.
given_surname = "given"
if given_surname == "given":
    vocab_size = 610000
else:
    vocab_size = 2100000

# threshold, augment_discount and batch_size are passed to generate_augmented_name_pairs;
# original_data_multiplier is passed to augment_dataset as `multiplier`.
threshold = 3
augment_discount = 1
original_data_multiplier = 8
batch_size = 5000

# Settings that are logged to W&B as the run configuration.
Config = namedtuple(
    "Config",
    [
        "in_path",
        "threshold",
        "augment_discount",
        "original_data_multiplier",
        "augments_path",
        "combined_path",
    ],
)
config = Config(
    # input: original training pairs
    in_path=f"s3://familysearch-names/processed/tree-hr-{given_surname}-train.csv.gz",
    threshold=threshold,
    augment_discount=augment_discount,
    original_data_multiplier=original_data_multiplier,
    # output: generated pairs only
    augments_path=f"s3://familysearch-names/processed/tree-hr-{given_surname}-train-augments.csv.gz",
    # output: original pairs merged with the generated pairs
    combined_path=f"s3://familysearch-names/processed/tree-hr-{given_surname}-train-augmented.csv.gz",
)

In [None]:
# Start the Weights & Biases run for this notebook. The run is grouped by name type
# and every field of the Config namedtuple is recorded as the run configuration.
run_config = config._asdict()
wandb.init(
    project="nama",
    entity="nama",
    group=given_surname,
    name="49_augment_dataset",
    notes="",
    config=run_config,
)

In [None]:
# load data
# load_dataset returns three values from the training file at config.in_path; all three
# are passed unchanged to generate_augmented_name_pairs below and released afterwards.
input_names, weighted_actual_names, candidate_names = load_dataset(config.in_path)

In [None]:
# sample to make things go faster
# NOTE: select_frequent_k is not imported in this notebook's import cell; import it
# before enabling the lines below. sample_size is not used by the call as written.
# sample_size = 10000
# batch_size = 1000
# input_names, weighted_actual_names, candidate_names = select_frequent_k(input_names, weighted_actual_names, candidate_names, k=batch_size)

In [None]:
# Distinct names that appear as an input name, a candidate name, or both.
all_names = set(input_names) | set(candidate_names)
print(len(all_names))

In [None]:
# get augmented name-pair data
# The threshold and discount come from the logged config; batch_size is the
# notebook-level setting from the config cell.
name_pairs_df = generate_augmented_name_pairs(
    input_names,
    weighted_actual_names,
    candidate_names,
    threshold=config.threshold,
    discount=config.augment_discount,
    batch_size=batch_size,
)

In [None]:
# Number of generated pairs, followed by a preview of the first ten.
print(name_pairs_df.shape[0])
name_pairs_df.head(n=10)

In [None]:
# free memory
# Rebind the three loaded-dataset names to None so the objects they referred to are
# no longer kept alive through these names.
input_names = None
weighted_actual_names = None
candidate_names = None

In [None]:
# write augments
# Persist only the generated pairs (without the DataFrame index) to config.augments_path.
name_pairs_df.to_csv(config.augments_path, index=False)

In [None]:
# read raw input dataset
# pandas' default NA parsing converts strings such as "nan", "null" and "NA" into
# missing values. In a table of personal names those strings can be legitimate names,
# so only empty fields are treated as missing here.
raw_input_df = pd.read_csv(config.in_path, keep_default_na=False, na_values=[""])

In [None]:
# Row count, then the number of distinct values in name1, in name2, and in both
# columns together, followed by a preview of the first ten rows.
print(raw_input_df.shape[0])
for name_columns in (["name1"], ["name2"], ["name1", "name2"]):
    print(len(set().union(*(raw_input_df[col] for col in name_columns))))
raw_input_df.head(10)

In [None]:
# augment raw input dataset with augmented name pairs
# The multiplier applied to the original data comes from the logged config.
augmented_df = augment_dataset(
    raw_input_df,
    name_pairs_df,
    multiplier=config.original_data_multiplier,
)

In [None]:
# free memory
# Only augmented_df is used from here on, so rebind the two input frames to None.
raw_input_df = None
name_pairs_df = None

In [None]:
# Row count, then the number of distinct values in name1, in name2, and in both
# columns together, followed by a preview of the first ten rows.
print(augmented_df.shape[0])
for name_columns in (["name1"], ["name2"], ["name1", "name2"]):
    print(len(set().union(*(augmented_df[col] for col in name_columns))))
augmented_df.head(10)

In [None]:
# write merged dataset
# Persist the combined frame (without the DataFrame index) to config.combined_path.
augmented_df.to_csv(config.combined_path, index=False)

In [None]:
# Close the W&B run opened by wandb.init above; the analysis below is not logged.
wandb.finish()

## Analyze results

In [None]:
# Reload the merged dataset from the location it was written to.
# Config has no `out_path` field (accessing it raises AttributeError); the merged
# dataset is stored at `combined_path`.
augmented_df = pd.read_csv(config.combined_path)

In [None]:
# Total number of rows in the combined dataset.
print(augmented_df.shape[0])

In [None]:
# Largest value in the co_occurrence column of the combined dataset.
print(augmented_df["co_occurrence"].max())

In [None]:
# Number of rows whose co_occurrence equals 2 (counted from the boolean mask).
print(int((augmented_df["co_occurrence"] == 2).sum()))

In [None]:
# Number of rows whose co_occurrence equals 3 (counted from the boolean mask).
print(int((augmented_df["co_occurrence"] == 3).sum()))

In [None]:
# Number of rows whose co_occurrence equals 4 (counted from the boolean mask).
print(int((augmented_df["co_occurrence"] == 4).sum()))

In [None]:
# Number of rows whose co_occurrence is greater than 1000 (counted from the boolean mask).
print(int((augmented_df["co_occurrence"] > 1000).sum()))

In [None]:
# Histogram (log-scaled y axis, 100 bins) of co_occurrence values that are at most 100.
augmented_df.loc[augmented_df["co_occurrence"] <= 100, "co_occurrence"].plot.hist(
    bins=100,
    figsize=(20, 6),
    logy=True,
)

In [None]:
# Show up to 100 random pairs whose co_occurrence equals 4.
# DataFrame.sample raises ValueError when asked for more rows than exist (without
# replacement), so the sample size is capped at the number of matching rows; the fixed
# random_state makes the displayed sample reproducible across runs.
(
    augmented_df[augmented_df["co_occurrence"] == 4]
    .pipe(lambda pairs: pairs.sample(n=min(100, len(pairs)), random_state=42))
)

In [None]:
# Show up to 100 random pairs whose co_occurrence equals 3.
# DataFrame.sample raises ValueError when asked for more rows than exist (without
# replacement), so the sample size is capped at the number of matching rows; the fixed
# random_state makes the displayed sample reproducible across runs.
(
    augmented_df[augmented_df["co_occurrence"] == 3]
    .pipe(lambda pairs: pairs.sample(n=min(100, len(pairs)), random_state=42))
)

In [None]:
# Show up to 100 random pairs whose co_occurrence equals 2.
# DataFrame.sample raises ValueError when asked for more rows than exist (without
# replacement), so the sample size is capped at the number of matching rows; the fixed
# random_state makes the displayed sample reproducible across runs.
(
    augmented_df[augmented_df["co_occurrence"] == 2]
    .pipe(lambda pairs: pairs.sample(n=min(100, len(pairs)), random_state=42))
)