In [None]:
%load_ext autoreload
%autoreload 2

# Generate common non-negatives

Collect name pairs from the existing standard buckets, the triplets file, and (for given names only) the nicknames file into the set of common non-negatives.

In [None]:
import re

import pandas as pd
from tqdm.auto import tqdm

from src.data.utils import read_csv

In [None]:
# Which kind of name this run processes ("given" or "surname"); the value is
# interpolated into most of the paths below.
given_surname = "surname"

# Only the first `num_common_names` rows of the preferred-names file are
# considered when building the set of common names.
num_common_names = 10000

# --- inputs ---
pref_path = f"s3://familysearch-names/processed/tree-preferred-{given_surname}-aggr.csv.gz"
std_path = f"../references/std_{given_surname}.txt"
triplets_path = f"../data/processed/tree-hr-{given_surname}-triplets-v2-1000.csv.gz"
given_nicknames_path = "../references/givenname_nicknames.csv"

# --- output ---
non_negatives_path = f"../data/processed/common_{given_surname}_non_negatives.csv"

## Load data

### read preferred names

In [None]:
# Load the preferred-names table and look only at its first
# `num_common_names` rows (presumably ordered most-frequent first -- TODO confirm).
pref_df = read_csv(pref_path)
top_names = pref_df['name'][:num_common_names].tolist()

# Keep names longer than one character that consist solely of lowercase a-z.
common_names = {
    candidate
    for candidate in top_names
    if len(candidate) > 1 and re.fullmatch(r'[a-z]+', candidate)
}
len(common_names)

## Start with FamilySearch (FS) buckets

In [None]:
# Seed the non-negatives with every ordered pair of distinct common names that
# appear on the same line (bucket) of the standard file.
# Each line is expected to look like "head names: tail names", with names
# separated by whitespace on both sides of the colon.
common_names_set = set(common_names)
common_non_negatives = set()

with open(std_path) as f:
    for line in f:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline at EOF) carry no bucket;
            # unpacking the result of split(':') used to raise on them.
            continue
        # partition() tolerates a missing colon (everything becomes the head)
        # instead of failing on tuple unpacking.
        head_names, _, tail_names = line.partition(':')
        # split() with no argument drops empty tokens from repeated whitespace.
        bucket = {
            name
            for name in head_names.split() + tail_names.split()
            if name in common_names_set
        }
        # Both (a, b) and (b, a) are produced because the two loops each range
        # over the whole bucket; self-pairs are excluded.
        common_non_negatives.update(
            (name1, name2)
            for name1 in bucket
            for name2 in bucket
            if name1 != name2
        )
print(len(common_non_negatives))

### add triplets

In [None]:
# Read the triplets table, report its row count, and preview the first rows.
triplets_df = pd.read_csv(triplets_path)
print(triplets_df.shape[0])
triplets_df.head(n=3)

In [None]:
# For every triplet, pair the anchor with both its positive and its negative
# and record each pair in both directions.
triplet_columns = (triplets_df[col] for col in ('anchor', 'positive', 'negative'))
for anchor_name, positive_name, negative_name in zip(*triplet_columns):
    for other_name in (positive_name, negative_name):
        common_non_negatives.add((anchor_name, other_name))
        common_non_negatives.add((other_name, anchor_name))
len(common_non_negatives)

### add given nicknames

In [None]:
# Nicknames only exist for given names, so this step is skipped for surnames.
# Each line of the nicknames file is a comma-separated group of names; every
# pair of distinct names in a group is recorded in both directions.
if given_surname == "given":
    with open(given_nicknames_path, "rt") as f:
        for line in f:
            # Strip each field: previously the last name on every line kept its
            # trailing newline, so pairs containing it could never match a real
            # name. Empty fields (blank lines, trailing commas) are dropped.
            names = [name.strip() for name in line.split(',')]
            names = [name for name in names if name]
            for name1 in names:
                for name2 in names:
                    # Strict ordering visits each unordered pair once and
                    # excludes self-pairs; both directions are added below.
                    if name1 > name2:
                        common_non_negatives.add((name1, name2))
                        common_non_negatives.add((name2, name1))
len(common_non_negatives)

## Save common non-negatives

In [None]:
# Write the pairs as a two-column CSV (name1, name2).
# Set iteration order is not stable across kernel runs, so sort the pairs to
# make the output file reproducible; the str() key keeps the sort from failing
# should a non-string value (e.g. NaN from the triplets table) be present.
sorted_pairs = sorted(
    common_non_negatives,
    key=lambda pair: (str(pair[0]), str(pair[1])),
)
# Passing the columns explicitly keeps the header even when there are no pairs.
df = pd.DataFrame(sorted_pairs, columns=['name1', 'name2'])
df.to_csv(non_negatives_path, index=False)