In [None]:
# Reload edited modules automatically before each cell runs (autoreload mode 2)
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output
%matplotlib inline

# Evaluate Coder PR
Calculate precision and recall for name-coding schemes: Soundex, NYSIIS, FamilySearch, and Nama.

### TODO
- Re-run FS, but use the bi-encoder to assign OOV (out-of-vocabulary) names to clusters, just as we do with the Nama (unaugmented) clusters, to make sure that we don't have a bug

#### Tiny Query

| Experiment | Threshold   |Precision|Recall| F1 | F2 | Lookups |
|------------|-------------|---------|------|----|----|---------|
|Soundex     |             | 0.152   | 0.608|0.24|0.38|         |
|Nysiis      |             | 0.157   | 0.425|0.23|0.32|         |
|FamilySearch|             | 0.167   | 0.648|0.27|0.41|         |
|Nama        |0.55/0.8 @0  | 0.164   | 0.617|0.26|0.40|         |
|Nama        |0.55/0.8 @20 | 0.131   | 0.778|0.22|0.39|         |
|Nama        |0.65/0.8 @20 | 0.147   | 0.729|0.24|0.41|         |
|Nama        |0.65/0.85 @20| 0.152   | 0.723|0.25|0.41|         |
|Nama        |0.65/1.0 @20 | 0.154   | 0.712|0.25|0.41|         |
|Nama        |0.70/1.0 @20 | 0.160   | 0.689|0.26|0.41|         |
|Nama        |0.75/0.9 @20 | 0.164   | 0.657|0.26|0.41|         |
|Nama        |0.75/1.0 @20 | 0.165   | 0.655|0.26|0.41|         |
|Nama none   |0.75/1.0 @20 | 0.164   | 0.651|0.26|0.41|         |
|Nama CE     |0.10/1.0 @40 | 0.137   | 0.749|0.23|0.39|         |
|Nama CE     |0.20/1.0 @40 | 0.159   | 0.649|0.25|0.40|         |
|Nama CE     |0.30/1.0 @40 | 0.165   | 0.619|0.26|0.40| 40k     |
|Nama CE     |0.30/1.0 @0  | 0.161   | 0.602|0.25|0.39| 39k     |
|Nama CE     |0.15/1.0 @20 | 0.153   | 0.676|0.25|0.40| 56k     |
|Nama BE     |0.65/1.0 @20 | 0.153   | 0.725|0.26|0.42| 54k     |
|Nama BE     |0.75/1.0 @20 | 0.165   | 0.656|0.26|0.41| 43k     |
|Nama BE     |0.75/1.0 @40 | 0.165   | 0.657|0.26|0.41| 44k     |

#### Tiny Common

| Experiment | Threshold  |Precision|Recall| F1 | F2 |
|------------|------------|---------|------|----|----|
|FamilySearch|            | 0.251   | 0.675|0.37|0.50|
|Nama        |0.55/0.8 @20| 0.205   | 0.809|0.33|0.51|

#### All Query

| Experiment | Threshold  |Precision|Recall| F1 | F2 | Lookups |
|------------|------------|---------|------|----|----|---------|
|Soundex     |            | 0.343   | 0.920|0.50|0.69|         |
|Nysiis      |            | 0.413   | 0.877|0.56|0.72|         |
|FamilySearch|            | 0.379   | 0.953|0.54|0.73|         |
|Nama        |0.55/0.8 @20| 0.301   | 0.977|0.46|0.67|         |
|Nama BE     |0.75/1.0 @40| 0.376   | 0.957|0.54|0.73| 45k     |
|Nama BE     |0.60/1.0 @40| 0.327   | 0.973|0.49|0.70| 74k     |

#### Compare Nama clustering approaches on Tiny Query

| Experiment | Threshold  |Precision|Recall| F1  | F2  |
|------------|------------|---------|------|-----|-----|
|BE          | 0.3        | 0.174   | 0.583|0.268|0.396|
|CE          | 0.15       | 0.188   | 0.534|0.278|0.390|
|CE          | 0.08       | 0.179   | 0.567|0.272|0.395|
|CE          | 0.10       | 0.180   | 0.560|0.273|0.394|

### Surname Tiny Query
| Experiment | Threshold  |Precision|Recall| F1  | F2  |
|------------|------------|---------|------|-----|-----|
|FamilySearch|            | 0.355   | 0.545|0.430|0.492|
|Nama BE     |0.75/1.0 @40|


In [None]:
from collections import defaultdict
import json
import os
import re

import boto3
import jellyfish
from mpire import WorkerPool
import numpy as np
import pandas as pd
from py4j.java_gateway import JavaGateway
import torch
from tqdm import tqdm

from src.data.utils import read_csv
from src.eval.freq_metrics import calc_avg_precision_recall
from src.models.tokenizer import get_tokenize_function_and_vocab
from src.models.utils import top_similar_names

# Tokenizer parallelism flag; presumably read by the HuggingFace `tokenizers` backend — TODO confirm
os.environ['TOKENIZERS_PARALLELISM'] = 'true'

In [None]:
# configure
# Name type under evaluation ("given" or "surname"); it is interpolated into every path below
given_surname = "surname"

# Defaults read by nama_coder() at call time; both are reassigned in the final
# evaluation cell before the run
nama_threshold = 0.1
nama_limit = 40

# The next four settings are only used to build the cluster / super-cluster file names
linkage = "average"  # average, complete
similarity_threshold = 0.1 if given_surname == "given" else 0.25
scorer = "ce"
cluster_freq_normalizer = "none"  # log, log10, none
# cluster label -> names + centroid (the "-augmented" variant of the cluster file)
clusters_path = f"../data/processed/clusters_{given_surname}-{scorer}-{linkage}-{similarity_threshold}-{cluster_freq_normalizer}-augmented.json"
# super_cluster label -> cluster labels
super_clusters_path = f"../data/processed/super_clusters_{given_surname}-{scorer}-{linkage}-{similarity_threshold}-{cluster_freq_normalizer}.json"
# Tokenizer settings passed to get_tokenize_function_and_vocab()
max_tokens = 10
nama_subwords_path=f"../data/models/fs-{given_surname}-subword-tokenizer-2000f.json"
# Bi-encoder checkpoint used to embed names that are not in the cluster dictionary
model_type = 'cecommon+0+aug-0-1'
nama_model_path = f"../data/models/bi_encoder-{given_surname}-{model_type}.pth"

# Evaluation data on S3
train_path = f"s3://familysearch-names/processed/tree-hr-{given_surname}-train-v2.csv.gz"
test_path = f"s3://familysearch-names/processed/tree-hr-{given_surname}-test-v2.csv.gz"
query_path = f"s3://familysearch-names/processed/query-names-{given_surname}-v2.csv.gz"
pref_path = f"s3://familysearch-names/processed/tree-preferred-{given_surname}-aggr.csv.gz"
# Nickname file location (only read when given_surname == "given")
nickname_bucket = "familysearch-names"
nickname_path = "processed/givenname_nicknames.csv"

### Load data

In [None]:
s3 = boto3.resource('s3')

# Map each name to the set of nickname head names it is listed under.
# Each CSV line is "headname,nickname,..."; the head name is also mapped to itself
# (e.g., john -> john) because it is the first entry on its own line.
nicknames = defaultdict(set)
if given_surname == "given":
    obj = s3.Object(nickname_bucket, nickname_path)
    contents = obj.get()['Body'].read().decode('utf-8')
    for line in contents.split('\n'):
        line = line.strip()
        if not line:
            # skip blank lines (e.g., after a trailing newline) so that the empty
            # string is never registered as a name or a head name
            continue
        names = line.split(',')
        headname = names[0]
        for name in names:
            nicknames[name].add(headname)
print(len(nicknames))
# use .get() for the spot checks: indexing a defaultdict would insert empty entries
print(nicknames.get('zachery', set()))
print(nicknames.get('zachariah', set()))

In [None]:
# na_filter=False disables missing-value detection, so names that look like NA markers
# (e.g. "NA", "null") stay as literal strings instead of becoming NaN
query_names = pd.read_csv(query_path, na_filter=False)["name"].tolist()
print(len(query_names))
query_names[0:3]

In [None]:
# load pref names
# NOTE(review): this uses the project helper src.data.utils.read_csv, not pd.read_csv;
# its parsing options are not visible in this notebook
pref_df = read_csv(pref_path)

In [None]:
# Keep the first 10,000 preferred names that are longer than one character and
# consist only of the lowercase letters a-z.
# NOTE(review): no frequency cutoff is applied here; the slice yields the most common
# names only if pref_df is already sorted by descending frequency — confirm.
common_names = [name for name in pref_df['name']
                if len(name) > 1 and re.fullmatch(r'[a-z]+', name)]
common_names = common_names[:10000]
len(common_names)

In [None]:
# na_filter=False keeps names that look like NA markers as literal strings
train_df = pd.read_csv(train_path, na_filter=False)
print(train_df.shape)
train_df.head(3)

In [None]:
# na_filter=False keeps names that look like NA markers as literal strings
test_df = pd.read_csv(test_path, na_filter=False)
print(test_df.shape)
test_df.head(3)

In [None]:
# Stack train and test rows; the index is not reset, so index labels from the two
# frames can repeat in all_df
all_df = pd.concat([train_df, test_df])
print(all_df.shape)
all_df.head(3)

In [None]:
# Lookup structures derived from the cluster and super-cluster files
nama_name_cluster = {}           # name -> position of its cluster in the two lists below
nama_cluster_centroids = []      # position -> centroid vector
nama_cluster_labels = []         # position -> cluster label
nama_cluster_super_cluster = {}  # cluster label -> super_cluster label

with open(clusters_path, 'r') as f:
    # cluster label -> {'names': [...], 'centroid': [...]}
    nama_clusters = json.load(f)

with open(super_clusters_path, 'r') as f:
    # super_cluster label -> list of cluster labels
    nama_super_clusters = json.load(f)

# a cluster's position is simply its order of appearance in the clusters file
for position, (label, cluster) in enumerate(nama_clusters.items()):
    for name in cluster['names']:
        nama_name_cluster[name] = position
    nama_cluster_labels.append(label)
    nama_cluster_centroids.append(np.array(cluster['centroid']))
nama_cluster_labels = np.array(nama_cluster_labels)

# invert the super-cluster mapping so each cluster label points at its super-cluster
for super_cluster_label, member_labels in nama_super_clusters.items():
    nama_cluster_super_cluster.update(dict.fromkeys(member_labels, super_cluster_label))

In [None]:
# number of distinct names that have a cluster assignment
len(nama_name_cluster)

In [None]:
# How many distinct tree/record names have no cluster assignment (out-of-vocabulary)?
names = set(all_df["tree_name"]) | set(all_df["record_name"])
cnt = sum(1 for name in names if name not in nama_name_cluster)
# prints: distinct names, OOV names, OOV fraction
print(len(names), cnt, cnt/len(names))

In [None]:
# Frequency-weighted share of tree names found in the cluster dictionary
total_freq = sum(all_df['frequency'])
cluster_freq = sum(
    freq
    for name, freq in zip(all_df['tree_name'], all_df['frequency'])
    if name in nama_name_cluster
)
print(total_freq, cluster_freq, cluster_freq/total_freq)

In [None]:
# Frequency-weighted share of record names found in the cluster dictionary
total_freq = sum(all_df['frequency'])
cluster_freq = sum(
    freq
    for name, freq in zip(all_df['record_name'], all_df['frequency'])
    if name in nama_name_cluster
)
print(total_freq, cluster_freq, cluster_freq/total_freq)

In [None]:
# load tokenize function
# `tokenize` is the callable applied to each name in get_embedding();
# `tokenizer_vocab` is only inspected here for its size
tokenize, tokenizer_vocab = get_tokenize_function_and_vocab(
    max_tokens=max_tokens,
    subwords_path=nama_subwords_path,
)
len(tokenizer_vocab)

In [None]:
# load model
# NOTE: torch.load unpickles the file, which can execute arbitrary code — only load
# checkpoints from a trusted source
nama_model = torch.load(nama_model_path)
# presumably an nn.Module: eval() switches it to inference mode (and, being the last
# expression, its return value is displayed by the notebook) — confirm
nama_model.eval()

## Set up FamilySearch coder

In [None]:
# The Java side must already be running before this cell is executed. Start it from
# match-spark/pipeline with:
# java -cp target/spark-pipeline.jar org.familysearch.search.spark.py4j.Py4JGateway

# JavaGateway() with no arguments connects to the gateway server on py4j's default address/port
gateway = JavaGateway()

def fs_coder(name):
    """Return the FamilySearch cluster code(s) for `name` via the py4j gateway.

    The second argument passed to the Java side is a boolean that is True when the
    notebook-level `given_surname` setting is 'surname'.
    """
    # Open questions:
    # can result ever contain multiple comma-separated codes?
    # if so, do we index both and query one, or index one and query both?
    # NOTE(review): py4j usually exposes the Java entry point as gateway.entry_point —
    # confirm that gateway.getClusters resolves in this setup
    is_surname = given_surname == 'surname'
    return gateway.getClusters(name, is_surname)

In [None]:
# smoke test: fs_coder calls the Java gateway, so the Java process must be running
fs_coder('ab')

## Set up nama coder

In [None]:
def get_embedding(name):
    """Embed `name` with the Nama model and scale the result to unit L2 norm."""
    vector = nama_model.get_embedding(tokenize(name))
    # in-place division: the object returned by the model is itself rescaled
    vector /= np.linalg.norm(vector)
    return vector

In [None]:
def nama_coder_threshold_limit(name, threshold, limit):
    """Return the Nama cluster codes for `name` as a comma-separated string.

    The codes are collected in this order, without duplicates:
      1. the primary cluster: the name's own cluster if it is in the cluster
         dictionary, otherwise the cluster whose centroid is most similar to the
         name's embedding;
      2. every other cluster in the primary cluster's super-cluster (not capped by
         `limit`);
      3. nearby clusters with centroid similarity >= `threshold`, added only while
         fewer than `limit` codes have been collected.

    Args:
        name: the name to code.
        threshold: minimum similarity for a nearby cluster to be included.
        limit: maximum number of codes once nearby clusters are being added.

    Returns:
        The cluster labels joined with ','; the primary cluster is always first.
    """
    codes = []
    # the embedding is expensive, so compute it lazily and at most once per call
    emb = None

    # get the primary (indexed) cluster
    if name in nama_name_cluster:
        # if it is in the cluster dictionary, get that cluster
        cluster_label = nama_cluster_labels[nama_name_cluster[name]]
    else:
        # if it isn't, get the nearest cluster
        emb = get_embedding(name)
        cluster_label = top_similar_names(emb, nama_cluster_centroids, nama_cluster_labels,
                                          threshold=0, top_n=1)[0][0]
    # index it under this cluster
    codes.append(cluster_label)

    # include additional clusters in this cluster's super-cluster
    super_cluster_label = nama_cluster_super_cluster.get(cluster_label, None)
    for nearby_cluster in nama_super_clusters.get(super_cluster_label, []):
        # don't check length, because we want all clusters in the super-cluster
        if nearby_cluster not in codes:
            codes.append(nearby_cluster)

    # include additional clusters near this cluster
    if limit > len(codes):
        if emb is None:
            emb = get_embedding(name)
        # NOTE(review): the neighbours returned here may include clusters that are
        # already in `codes` (e.g. the primary cluster), in which case fewer than
        # `limit` codes are produced — confirm that this is intended
        nearby_clusters, similarities = top_similar_names(emb, nama_cluster_centroids, nama_cluster_labels,
                                                          threshold=threshold, top_n=limit-len(codes))
        for nearby_cluster, similarity in zip(nearby_clusters, similarities):
            if len(codes) >= limit or similarity < threshold:
                break
            if nearby_cluster not in codes:
                codes.append(nearby_cluster)

    return ','.join(codes)

def nama_coder(name):
    """Code `name` using the notebook-level `nama_threshold` and `nama_limit`.

    Both globals are looked up on every call, so reassigning them in a later cell
    changes the behaviour of this coder.
    """
    return nama_coder_threshold_limit(name, threshold=nama_threshold, limit=nama_limit)

In [None]:
def _sample(code):
    """Return up to the first eight names of cluster `code`, separated by spaces."""
    first_names = nama_clusters[code]['names'][:8]
    return ' '.join(first_names)

# Spot check: list the codes (with a few sample names each) for some example names
total_codes = 0
for name in ['dallan', 'richard', 'solveig', 'evelyn', 'barbara', 'susan', 'henry', 'becca']:
    codes = nama_coder_threshold_limit(name, threshold=0.65, limit=40).split(',')
    total_codes += len(codes)
    print(name, len(codes))
    for code in codes:
        print('   ', code, _sample(code))
print(total_codes)

## Evaluate

In [None]:
def get_all_name_codes(coder_name, coder, names):
    """Apply `coder` to every name and return a list of (name, codes) pairs.

    The 'familysearch' and 'nama' coders are run sequentially in this process with a
    tqdm progress bar; every other coder is mapped over an mpire WorkerPool.
    """

    def _wrapped_coder(name):
        # keep the name next to its codes so results can be matched up afterwards
        return name, coder(name)

    if coder_name in ('familysearch', 'nama'):
        return [_wrapped_coder(name) for name in tqdm(names, mininterval=5.0)]

    with WorkerPool() as pool:
        return pool.map(_wrapped_coder, names, progress_bar=True,
                        progress_bar_options={'mininterval': 5.0})

In [None]:
def get_codes(coder_name, coder, nicknames, names):
    """Build the simulated query and index tables for a coder.

    Returns:
        (name2codes, code2names), both defaultdict(set):
        - name2codes simulates the query side: name -> all codes to look up;
        - code2names simulates the index side: code -> names indexed under it.
        A name is indexed under its first code only and queried under all of its
        codes. For given names with nickname entries, the codes of each nickname
        head are added to the name's query codes, and the head itself is indexed
        under each of those codes.
    """
    name2codes = defaultdict(set)
    code2names = defaultdict(set)

    for name, codes in get_all_name_codes(coder_name, coder, names):
        code_list = codes.split(',')
        # query under every code ...
        name2codes[name].update(code_list)
        # ... but index under the first code only
        code2names[code_list[0]].add(name)

        if given_surname == "given" and name in nicknames:
            # query codes for each nickhead of nickname
            for nickhead in nicknames[name]:
                nickhead_codes = coder(nickhead).split(',')
                name2codes[name].update(nickhead_codes)
                # make sure nickhead is added to the code bucket
                for code in nickhead_codes:
                    code2names[code].add(nickhead)

    return name2codes, code2names

def eval_clusters(coder_name, coder, nicknames, data_df, query_names):
    """Code every tree/record name in `data_df`, then print index statistics and the
    average precision, recall, F1 and F2 over `query_names`.

    Nothing is returned; all results are printed.
    """
    # familysearch and nama coders handle nicknames
    coder_nicknames = [] if coder_name in ['familysearch', 'nama'] else nicknames
    data_names = set(data_df["tree_name"]) | set(data_df["record_name"])
    name2codes, code2names = get_codes(coder_name, coder, coder_nicknames, data_names)

    n_index_entries = sum(len(names) for names in code2names.values())
    print("total names", len(name2codes))
    print("total index entries", n_index_entries)
    print("total codes", len(code2names))
    print("total queries", len(query_names))
    print("total lookups", sum(len(name2codes[query]) for query in query_names))

    precision, recall, f1, f2 = calc_avg_precision_recall(
        query_names, name2codes, code2names, data_df
    )
    print(f"precision={precision}, recall={recall} f1={f1} f2={f2}")

In [None]:
# Fixed-seed sample of up to 100,000 rows for quick experiments. The size is capped at
# len(all_df) because DataFrame.sample raises when n exceeds the number of rows.
tiny_df = all_df.sample(n=min(100_000, len(all_df)), random_state=42)
len(tiny_df)

In [None]:
# These assignments override the defaults from the configuration cell; nama_coder()
# reads both globals at call time, so they apply to the run below.
nama_threshold = 0.75
nama_limit = 40

# Coders to evaluate as (name, callable) pairs; uncomment an entry to include it
coders = [
#     ('soundex', jellyfish.soundex), 
#     ('nysiis', jellyfish.nysiis), 
    ('nama', nama_coder),
#     ('familysearch', fs_coder),
    ]
# Data sets to evaluate on as (label, DataFrame) pairs; uncomment an entry to include it
data_sources = [
    ('tiny', tiny_df),
#     ('train', train_df),
#     ('test', test_df),
#     ('all', all_df),
    ]
# Evaluate every selected coder on every selected data set; results are printed
for label, data_df in data_sources:
    print(label)
    for coder_name, coder in coders:
        print(coder_name)
        eval_clusters(coder_name, coder, nicknames, data_df, query_names)