In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Evaluate Coder PR
Calculate precision and recall for name coders (NYSIIS, Soundex, Nama, etc.).

## Tiny Query

| Experiment | Threshold     | Precision | Recall | F1     | F2     |
|------------|---------------|-----------|--------|--------|--------|
|Soundex     |               | 0.142     | 0.522  | 0.22   | 0.34   |
|Nysiis      |               | 0.147     | 0.358  | 0.20   | 0.28   |
|Nama        | 0.10 @40      | 0.148     | 0.586  | 0.24   | 0.37   |

In [9]:
from collections import defaultdict
import json
import os
import re

import boto3
import jellyfish
from mpire import WorkerPool
import numpy as np
import pandas as pd
from py4j.java_gateway import JavaGateway
import torch
from tqdm import tqdm

from nama.data.filesystem import download_file_from_s3
from nama.data.utils import read_csv
from nama.eval.freq_metrics import calc_avg_precision_recall
from nama.models.tokenizer import get_tokenize_function_and_vocab
from nama.models.utils import top_similar_names

os.environ['TOKENIZERS_PARALLELISM'] = 'true'

In [17]:
# config
# Which name part to evaluate; this value is interpolated into every data/model path below.
# TODO run both given and surname
given_surname = "given"
# given_surname = "surname"

# Defaults read by nama_coder() at call time: similarity threshold and maximum number of codes.
# NOTE(review): the evaluation cell near the end of the notebook reassigns both values before
# running, so the values here are not necessarily the ones behind the reported numbers.
nama_threshold = 0.1
nama_limit = 40

# The settings below are only used to build the artifact file names further down
# (max_tokens is additionally passed to the tokenizer loader).
linkage = "average"  # average, complete
similarity_threshold = 0.1 if given_surname == "given" else 0.25
cluster_freq_normalizer = "none"  # log, log10, none
max_tokens = 10
bi_encoder_vocab_size = 2048
num_epochs = 8
embedding_dim = 256
learning_rate = 0.00005 

# Cluster / model artifacts.
clusters_path = f"s3://fs-nama-data/2024/nama-data/data/processed/clusters_{given_surname}-{linkage}-{similarity_threshold}-{cluster_freq_normalizer}-augmented.json"
super_clusters_path = f"s3://fs-nama-data/2024/nama-data/data/processed/super_clusters_{given_surname}-{linkage}-{similarity_threshold}-{cluster_freq_normalizer}.json"
tokenizer_path=f"s3://fs-nama-data/2024/nama-data/data/models/fs-{given_surname}-subword-tokenizer-{bi_encoder_vocab_size}.json"
# NOTE(review): {num_epochs} appears twice in this file name -- confirm it matches the name
# the training notebook saved the model under.
bi_encoder_path = f"s3://fs-nama-data/2024/nama-data/data/models/bi_encoder-ce-{given_surname}-{num_epochs}-{embedding_dim}-{num_epochs}-{bi_encoder_vocab_size}-{learning_rate}.pth"

# Evaluation data.
train_path = f"s3://fs-nama-data/2024/familysearch-names/processed/tree-hr-{given_surname}-train.csv.gz"
test_path = f"s3://fs-nama-data/2024/familysearch-names/processed/tree-hr-{given_surname}-test.csv.gz"
# NOTE(review): the query names come from the 2023 prefix while everything else uses 2024 -- confirm intended.
query_path = f"s3://fs-nama-data/2023/familysearch-names/processed/query-names-{given_surname}-v2.csv.gz"
pref_path = f"s3://fs-nama-data/2024/familysearch-names/processed/tree-preferred-{given_surname}-aggr.csv.gz"
# Local CSV; each line is read as "headname,nickname,nickname,..." by the nickname-loading cell.
nickname_path = "../references/givenname_nicknames.csv"

### Load data

In [8]:
# Build nickname -> {head names}. Each line of the nickname file is a comma-separated
# list whose first entry is the head name; the head name also maps to itself
# (e.g., john -> john). Only populated when evaluating given names.
nicknames = defaultdict(set)
if given_surname == "given":
    with open(nickname_path, 'r') as nickname_file:
        for row in nickname_file:
            head, *variants = row.strip().split(',')
            nicknames[head].add(head)
            for variant in variants:
                nicknames[variant].add(head)
print(len(nicknames))
print(nicknames['zachary'])
print(nicknames['zachariah'])

1201
{'zachariah'}
{'zachariah'}


In [11]:
# Load the query names: download the file first when query_path is an s3:// URL,
# then keep the "name" column as a plain list.
path = download_file_from_s3(query_path) if query_path.startswith("s3://") else query_path
query_names = read_csv(path)["name"].tolist()
print(len(query_names))
# show the first three query names
query_names[0:3]

5000


['james', 'janos', 'caroline']

In [12]:
# load pref names (downloaded from S3 first when pref_path is an s3:// URL);
# the 'name' column is used below to build common_names
path = download_file_from_s3(pref_path) if pref_path.startswith("s3://") else pref_path
pref_df = read_csv(path)

In [13]:
# Candidate "common" names: the first 10,000 preferred names that are longer than one
# character and consist solely of the letters a-z.
# NOTE(review): no frequency cut-off is applied here; taking the first 10,000 presumably
# relies on pref_df already being ordered by descending frequency -- confirm.
common_names = [name for name in pref_df['name']
                if len(name) > 1 and re.fullmatch(r'[a-z]+', name)]
common_names = common_names[:10000]
len(common_names)

10000

In [14]:
# Load the training pairs (downloaded from S3 first when train_path is an s3:// URL).
path = download_file_from_s3(train_path) if train_path.startswith("s3://") else train_path
train_df = read_csv(path)
print(train_df.shape)
# preview: columns are tree_name, record_name, frequency
train_df.head(3)

(3580532, 3)


Unnamed: 0,tree_name,record_name,frequency
0,a,a,1622927
1,aa,a,139
2,aa,aa,45


In [15]:
# Load the test pairs (downloaded from S3 first when test_path is an s3:// URL).
path = download_file_from_s3(test_path) if test_path.startswith("s3://") else test_path
# NOTE(review): the other CSVs in this notebook are read with nama's read_csv helper, this one
# with pandas directly (na_filter=False keeps strings such as "na" from becoming NaN) --
# confirm the two are equivalent, or switch to read_csv for consistency.
test_df = pd.read_csv(path, na_filter=False)
print(test_df.shape)
# preview: columns are tree_name, record_name, frequency
test_df.head(3)

(1195039, 3)


Unnamed: 0,tree_name,record_name,frequency
0,aaaard,aagaard,12
1,aaagot,aagot,8
2,aaassiena,aassiena,2


In [16]:
# Stack train and test into one frame for the coverage checks and the sampled evaluation below.
# The original row labels are kept (no ignore_index), so index values repeat across the two parts.
all_df = pd.concat([train_df, test_df])
print(all_df.shape)
all_df.head(3)

(4775571, 3)


Unnamed: 0,tree_name,record_name,frequency
0,a,a,1622927
1,aa,a,139
2,aa,aa,45


In [18]:
# Read the cluster definitions: cluster label -> {'names': [...], 'centroid': [...]}
path = clusters_path
if path.startswith("s3://"):
    path = download_file_from_s3(path)
with open(path, 'r') as clusters_file:
    nama_clusters = json.load(clusters_file)

# Read the super-cluster definitions: super_cluster label -> list of cluster labels
path = super_clusters_path
if path.startswith("s3://"):
    path = download_file_from_s3(path)
with open(path, 'r') as super_clusters_file:
    nama_super_clusters = json.load(super_clusters_file)

# name -> position of its cluster in nama_cluster_labels / nama_cluster_centroids
# (when a name is listed in several clusters, the last cluster in file order wins)
nama_name_cluster = {}
for position, cluster in enumerate(nama_clusters.values()):
    for member in cluster['names']:
        nama_name_cluster[member] = position

# label and centroid for each cluster, in the same order as the positions above
nama_cluster_labels = np.array(list(nama_clusters))
# NOTE(review): kept as a Python list of arrays and handed to top_similar_names on every
# call; if that helper converts it to a 2-D array each time, stacking once here may be
# noticeably faster -- confirm against the helper before changing.
nama_cluster_centroids = [np.array(cluster['centroid']) for cluster in nama_clusters.values()]

# cluster label -> super_cluster label
nama_cluster_super_cluster = {
    cluster_label: super_cluster_label
    for super_cluster_label, member_labels in nama_super_clusters.items()
    for cluster_label in member_labels
}

In [20]:
# number of distinct names that have an entry in the name -> cluster dictionary
len(nama_name_cluster)

154032

In [24]:
# How many distinct names in train+test have no entry in the cluster dictionary?
# Prints: distinct names, names without a cluster, and the uncovered fraction.
names = set(all_df["tree_name"]).union(all_df["record_name"])
cnt = sum(1 for name in names if name not in nama_name_cluster)
print(len(names), cnt, cnt/len(names))

1897758 1751954 0.9231703936961404


In [25]:
# Frequency-weighted coverage of tree names: the share of total pair frequency whose
# tree_name has an entry in the cluster dictionary.
total_freq = sum(all_df['frequency'])
cluster_freq = sum(
    freq
    for tree_name, freq in zip(all_df['tree_name'], all_df['frequency'])
    if tree_name in nama_name_cluster
)
print(total_freq, cluster_freq, cluster_freq/total_freq)

1076069127 1054256502 0.9797293459567863


In [26]:
# Frequency-weighted coverage of record names: the share of total pair frequency whose
# record_name has an entry in the cluster dictionary.
total_freq = sum(all_df['frequency'])
cluster_freq = sum(
    freq
    for record_name, freq in zip(all_df['record_name'], all_df['frequency'])
    if record_name in nama_name_cluster
)
print(total_freq, cluster_freq, cluster_freq/total_freq)

1076069127 1046111160 0.9721598118110492


In [27]:
# load tokenizer (downloaded from S3 first when tokenizer_path is an s3:// URL);
# `tokenize` is later applied to each name in get_embedding, max_tokens comes from the config cell
path = download_file_from_s3(tokenizer_path) if tokenizer_path.startswith("s3://") else tokenizer_path
tokenize, tokenizer_vocab = get_tokenize_function_and_vocab(tokenizer_path=path, max_tokens=max_tokens)
# vocabulary size
len(tokenizer_vocab)

2048

In [28]:
# load bi-encoder (downloaded from S3 first when bi_encoder_path is an s3:// URL)
path = download_file_from_s3(bi_encoder_path) if bi_encoder_path.startswith("s3://") else bi_encoder_path
# The checkpoint is a fully pickled model object (torch.load returns the BiEncoder module
# itself, not a state_dict), so it must be loaded with weights_only=False. Being explicit
# silences the FutureWarning and keeps this working on PyTorch versions where the default
# is weights_only=True.
# SECURITY: weights_only=False unpickles arbitrary Python objects -- only load model files
# from a trusted location.
bi_encoder_model = torch.load(path, weights_only=False)
# switch to inference mode; as the last expression this also displays the model structure
bi_encoder_model.eval()

  bi_encoder_model = torch.load(path)


BiEncoder(
  (embedding): Embedding(2048, 256)
  (positional_embedding): Embedding(10, 256)
  (pooling): AdaptiveAvgPool1d(output_size=1)
)

## Set up FamilySearch coder

In [None]:
# match-spark/pipeline mexico-dup-classifier branch
# java -cp target/spark-pipeline.jar org.familysearch.search.spark.py4j.Py4JGateway

# Connects with py4j's default settings, so the Java process above must already be running.
# NOTE(review): the "FS Nama coder" section also assigns `gateway` with the same default
# settings; both coder functions look the name up at call time, so they talk to whichever
# Java process is listening -- presumably only one is run at a time; confirm.
gateway = JavaGateway()

def fs_coder(name):
    """Return the code(s) the Java FamilySearch coder assigns to `name`.

    The second argument passed to getClusters is True when the notebook is configured
    for surnames and False for given names.
    """
    # can result ever contain multiple comma-separated codes?
    # if so, do we index both and query one, or index one and query both?
    return gateway.getClusters(name, given_surname == 'surname')

In [None]:
# smoke test: requires the Java gateway process for the FamilySearch coder to be running
fs_coder('ebbie')

## Set up FS Nama coder

In [18]:
# searchng-standards-wrapper py4j branch
# java -classpath target/searchng-standards-wrapper.jar org.familysearch.recordsearch.standards.Py4JGateway

# Connects with py4j's default settings, so the Java process above must already be running.
# NOTE(review): this rebinds the same global `gateway` that fs_coder uses -- confirm that
# only one of the two Java processes is meant to be running at a time.
gateway = JavaGateway()

def fs_nama_coder(name):
    """Return the code(s) the Java FS Nama coder assigns to `name`.

    The second argument passed to getClusters is True when the notebook is configured
    for surnames and False for given names.
    """
    return gateway.getClusters(name, given_surname == 'surname')

In [19]:
# smoke test: requires the Java gateway process for the FS Nama coder to be running
fs_nama_coder('ebbie')

'abbey/eby'

## Set up nama coder

In [21]:
def get_embedding(name):
    """Return the bi-encoder embedding of `name`, divided by its norm.

    The name is tokenized with the loaded tokenizer, embedded by the bi-encoder model,
    and the resulting array is scaled in place by 1 / np.linalg.norm.
    """
    tokens = tokenize(name)
    vector = bi_encoder_model.get_embedding(tokens)
    vector /= np.linalg.norm(vector)
    return vector

In [22]:
def nama_coder_threshold_limit(name, threshold, limit):
    """Return the comma-joined cluster labels (codes) for `name`, primary cluster first.

    The codes are collected in three steps:
      1. the primary cluster: the cluster listing `name`, or the nearest centroid when
         the name is not in the cluster dictionary;
      2. every other cluster in the primary cluster's super-cluster (regardless of `limit`);
      3. if fewer than `limit` codes were collected, nearby clusters whose similarity to
         the name's embedding is at least `threshold`.

    Args:
        name: the name to code.
        threshold: minimum similarity for a nearby cluster to be added in step 3.
        limit: number of codes above which step 3 adds nothing; the super-cluster
            members from step 2 may already exceed it.

    Returns:
        A string of cluster labels separated by commas.
    """
    # The embedding is needed by step 1 (unknown names) and step 3; compute it at most once.
    emb = None

    # get the primary (indexed) cluster
    if name in nama_name_cluster:
        # if it is in the cluster dictionary, get that cluster
        cluster_label = nama_cluster_labels[nama_name_cluster[name]]
    else:
        # if it isn't, get the nearest cluster
        emb = get_embedding(name)
        cluster_label = top_similar_names(emb, nama_cluster_centroids, nama_cluster_labels,
                                          threshold=0, top_n=1)[0][0]
    # index it under this cluster
    codes = [cluster_label]

    # include additional clusters in this cluster's super-cluster
    super_cluster_clusters = nama_super_clusters.get(nama_cluster_super_cluster.get(cluster_label, None), [])
    for nearby_cluster in super_cluster_clusters:
        # don't check length, because we want all clusters in the super-cluster
        if nearby_cluster not in codes:
            codes.append(nearby_cluster)

    # include additional clusters near this cluster
    if limit > len(codes):
        if emb is None:
            emb = get_embedding(name)
        # NOTE(review): top_n is the number of free slots, but the returned neighbours can
        # include labels already in `codes` (those are skipped below), so fewer than `limit`
        # codes may be returned even when more clusters pass the threshold -- confirm intended.
        nearby_clusters, similarities = top_similar_names(emb, nama_cluster_centroids, nama_cluster_labels,
                                                          threshold=threshold, top_n=limit-len(codes))
        for nearby_cluster, similarity in zip(nearby_clusters, similarities):
            if len(codes) >= limit or similarity < threshold:
                break
            if nearby_cluster not in codes:
                codes.append(nearby_cluster)

    return ','.join(codes)

def nama_coder(name):
    """Code `name` with nama_coder_threshold_limit using the notebook-level settings.

    nama_threshold and nama_limit are looked up as globals on every call, so reassigning
    them in a later cell changes what this function returns.
    """
    return nama_coder_threshold_limit(name, nama_threshold, nama_limit)

In [29]:
def _sample(code):
    """Return up to the first eight names of cluster `code`, separated by spaces."""
    first_names = nama_clusters[code]['names'][:8]
    return ' '.join(first_names)

# Spot-check the nama coder on a few names: print each name with its number of codes,
# then every code followed by a sample of that cluster's names, and finally the grand total.
total_codes = 0
for name in ['dallan', 'richard', 'solveig', 'evelyn', 'barbara', 'susan', 'henry', 'becca']:
    codes = nama_coder_threshold_limit(name, threshold=0.65, limit=40).split(',')
    n_codes = len(codes)
    print(name, n_codes)
    total_codes += n_codes
    for code in codes:
        print('   ', code, _sample(code))
print(total_codes)

dallan 4
    dillon/dillon dillan dalen dillion dillin dillian dellujan dallan dallin
    dillon/dylan diran dyann dilan dylan
    dallas/dallas dallas dalis dalay dallace dalles dallice dalyce dalit
    dolan/dolan dolling dolen doljin dalling dolon dollin dollen dolena
richard 17
    richard/richard richards richaurd richerd ritchard richardae recardo riccarda richeard
    richard/dack dec deak daek dack dek deck
    richard/dick dyck dickie dich diack dickey dick dziecko dika
    richard/ricky richelieu rische richert ritchie rickie ricky richa ricki
    richard/richardson ricketson richeson richarson rickerson richardson richison richerson richardsen
    richard/rd ryd rd
    richard/record records record secord recorded recherd
    richard/ryszard ryszard buzzard blizard blizzard
    richard/dk leduc duc dk
    rchd/richdi ruchard rickhard richarad rochard rashid rchd richford rachard
    richan/richenda richenda richens richan richins richenza richland
    frederick/rychli richo 

## Evaluate

In [30]:
def get_all_name_codes(coder_name, coder, names):
    """Apply `coder` to every name and return a list of (name, codes) pairs.

    Args:
        coder_name: label of the coder; decides whether the work runs serially or in a
            worker pool.
        coder: callable mapping a name to its code string.
        names: iterable of names to code.

    Returns:
        A list of (name, coder(name)) tuples.
    """
    # Coders that run in this process instead of a worker pool. 'familysearch' and
    # 'fs-nama' both call the py4j `gateway` connection and 'nama' uses the in-memory
    # bi-encoder model -- presumably none of these should be shared with forked workers.
    serial_coders = ('familysearch', 'fs-nama', 'nama')

    def _wrapped_coder(name):
        return name, coder(name)

    if coder_name in serial_coders:
        results = [_wrapped_coder(name) for name in tqdm(names, mininterval=5.0)]
    else:
        with WorkerPool() as pool:
            results = pool.map(_wrapped_coder, names, progress_bar=True, progress_bar_options={'mininterval': 5.0})
    return results

In [31]:
def get_codes(coder_name, coder, nicknames, names):
    """Build the simulated query and index tables for a coder.

    Returns a pair (name2codes, code2names):
      * name2codes -- query side: name -> set of codes to look up for that name;
      * code2names -- index side: code -> set of names indexed under that code.

    A name is indexed under its first code only but queried under all of its codes.
    When evaluating given names, a name found in `nicknames` is additionally queried
    under every code of each of its nickname heads, and each head is indexed under
    all of its own codes.
    """
    name2codes = defaultdict(set)
    code2names = defaultdict(set)
    use_nicknames = given_surname == "given"

    for name, joined_codes in get_all_name_codes(coder_name, coder, names):
        code_list = joined_codes.split(',')
        # index: first code only
        code2names[code_list[0]].add(name)
        # query: every code
        name2codes[name].update(code_list)

        if not (use_nicknames and name in nicknames):
            continue
        # query codes for each nickhead of nickname
        for nickhead in nicknames[name]:
            for head_code in coder(nickhead).split(','):
                name2codes[name].add(head_code)
                # make sure nickhead is added to the code bucket
                code2names[head_code].add(nickhead)

    return name2codes, code2names

def eval_clusters(coder_name, coder, nicknames, data_df, query_names):
    """Evaluate one coder on `data_df` and report precision, recall, F1 and F2.

    Codes every distinct tree/record name in `data_df`, prints index and query statistics,
    computes the averaged metrics for `query_names`, prints them, and appends a line
    labelled with `coder_name` to results.txt. Appending (rather than overwriting) keeps
    the results of every coder when this is called in a loop.

    Args:
        coder_name: label of the coder (also selects nickname handling and execution mode).
        coder: callable mapping a name to its code string.
        nicknames: nickname -> head names; ignored for coders that handle nicknames themselves.
        data_df: frame with tree_name and record_name columns; passed on to the metric function.
        query_names: names to evaluate as queries.

    Returns:
        The tuple (precision, recall, f1, f2).
    """
    # familysearch and nama coders handle nicknames
    coder_nicknames = [] if coder_name in ['familysearch', 'nama'] else nicknames
    all_names = set(data_df["tree_name"]) | set(data_df["record_name"])
    name2codes, code2names = get_codes(coder_name, coder, coder_nicknames, all_names)

    print("total names", len(name2codes))
    print("total index entries", sum(len(names) for names in code2names.values()))
    print("total codes", len(code2names))
    print("total queries", len(query_names))
    print("total lookups", sum(len(name2codes[query]) for query in query_names))

    precision, recall, f1, f2 = calc_avg_precision_recall(query_names,
                                                          name2codes,
                                                          code2names,
                                                          data_df)
    summary = f"precision={precision}, recall={recall} f1={f1} f2={f2}"
    with open('results.txt', 'a') as f:
        f.write(f"{coder_name}: {summary}\n")
    print(summary)
    return precision, recall, f1, f2

In [32]:
# Fixed-seed random sample of 100,000 name pairs for a quick evaluation run.
tiny_df = all_df.sample(n=100_000, random_state=42)
len(tiny_df)

100000

In [34]:
# Override the nama coder settings from the config cell; nama_coder reads these globals at call time.
# NOTE(review): with nama_limit = 0 the "nearby clusters" step of nama_coder_threshold_limit
# never runs (it requires limit > number of codes already collected), so nama_threshold has
# no effect and only the primary cluster plus its super-cluster are returned. This does not
# match the "0.10 @40" setting listed for Nama in the summary table -- confirm which
# configuration produced the reported numbers.
nama_threshold = 0.75
nama_limit = 0

# (label, coder) pairs to evaluate; uncomment the py4j-backed coders when their Java gateway is running
coders = [
    ('soundex', jellyfish.soundex), 
    ('nysiis', jellyfish.nysiis), 
    ('nama', nama_coder),
#     ('familysearch', fs_coder),
#     ('fs-nama', fs_nama_coder),    
    ]
# (label, frame) pairs to evaluate on; the larger sets are disabled here
data_sources = [
    ('tiny', tiny_df),
#     ('train', train_df),
#     ('test', test_df),
#     ('all', all_df),
    ]
# run every coder on every data source, printing the source and coder label before each report
for label, data_df in data_sources:
    print(label)
    for coder_name, coder in coders:
        print(coder_name)
        eval_clusters(coder_name, coder, nicknames, data_df, query_names)

tiny
soundex


100%|██████████| 125464/125464 [00:01<00:00, 73316.54it/s]


total names 125464
total index entries 125482
total codes 4834
total queries 5000
total lookups 6220


100%|██████████| 5000/5000 [02:41<00:00, 31.02it/s]

precision=0.1421674428294558, recall=0.5217846354629473 f1=0.22345223324626126 f2=0.3401370422742517
nysiis



100%|██████████| 125464/125464 [00:01<00:00, 77673.04it/s]


total names 125464
total index entries 125482
total codes 37257
total queries 5000
total lookups 6506


100%|██████████| 5000/5000 [02:48<00:00, 29.63it/s]


precision=0.14733403965154154, recall=0.3583130964271831 f1=0.20880852353309892 f2=0.27854037596859293
nama


100%|██████████| 125464/125464 [2:25:34<00:00, 14.36it/s] 


total names 125464
total index entries 125464
total codes 15019
total queries 5000
total lookups 82091


100%|██████████| 5000/5000 [02:28<00:00, 33.68it/s]

precision=0.14783639884435323, recall=0.5862144545660852 f1=0.2361248774820221 f2=0.367980528541687



