In [None]:
# Reload imported modules before each cell runs, so edits to project code
# (e.g. src.eval.freq_metrics) are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output
%matplotlib inline

# Evaluate Coder Precision and Recall
Calculate average precision and recall (plus F1 and F2) for name coders such as NYSIIS, Soundex, and the FamilySearch coder.

In [None]:
from collections import defaultdict

import boto3
import jellyfish
import pandas as pd
from py4j.java_gateway import JavaGateway
from tqdm.autonotebook import tqdm

from src.eval.freq_metrics import calc_avg_precision_recall

In [None]:
# configure
# Name type under evaluation. The code below compares this against "given"
# (enables nickname handling) and 'surname' (flag passed to the FamilySearch coder).
given_surname = "given"

# Train/test datasets and query names for the selected name type (read with pandas)
train_path = f"s3://familysearch-names/processed/tree-hr-{given_surname}-train-v2.csv.gz"
test_path = f"s3://familysearch-names/processed/tree-hr-{given_surname}-test-v2.csv.gz"
query_path = f"s3://familysearch-names/processed/query-names-{given_surname}-v2.csv.gz"
# The nickname file is fetched through boto3, so it is addressed as bucket + key
# rather than as an s3:// URL
nickname_bucket = "familysearch-names"
nickname_path = "processed/givenname_nicknames.csv"

## Load data

In [None]:
s3 = boto3.resource('s3')

# Build a map from each name in the nickname file to the set of head names it
# belongs to. Every non-blank line of the file is a comma-separated group whose
# first entry is the head name. Only loaded when evaluating given names.
# these nicknames include nickname heads going to themselves (e.g., john -> john)
nicknames = defaultdict(set)
if given_surname == "given":
    obj = s3.Object(nickname_bucket, nickname_path)
    contents = obj.get()['Body'].read().decode('utf-8')
    for line in contents.split('\n'):
        # Strip padding and drop empty fields; without this, blank lines (such
        # as the one produced by a trailing newline) would register '' as a
        # nickname whose head name is ''.
        names = [name.strip() for name in line.split(',')]
        names = [name for name in names if name]
        if not names:
            continue
        headname = names[0]
        for name in names:
            nicknames[name].add(headname)
print(len(nicknames))
# Use .get() so that peeking at examples does not insert empty entries into
# the defaultdict (plain indexing would create the key if it is missing).
print(nicknames.get('zachery', set()))
print(nicknames.get('zachariah', set()))

In [None]:
# Names to use as queries in the evaluation.
# keep_default_na=False stops pandas from converting values such as "NA" or "null" to NaN,
# so every entry in the "name" column stays a string.
query_names = pd.read_csv(query_path, keep_default_na=False)["name"].tolist()
print(len(query_names))
query_names[0:3]

In [None]:
# Training split; keep_default_na=False keeps values such as "NA" as strings instead of NaN
train_df = pd.read_csv(train_path, keep_default_na=False)
print(train_df.shape)
train_df.head(3)

In [None]:
# Test split; keep_default_na=False keeps values such as "NA" as strings instead of NaN
test_df = pd.read_csv(test_path, keep_default_na=False)
print(test_df.shape)
test_df.head(3)

In [None]:
# Train and test rows stacked into one frame for evaluating on the full dataset.
# NOTE: the row index is not reset, so index labels from the two frames can repeat.
all_df = pd.concat([train_df, test_df])
print(all_df.shape)
all_df.head(3)

## Set up FamilySearch coder

In [None]:
# py4j client used by fs_coder to call getClusters on the Java side, created with
# default connection settings.
# NOTE(review): presumably the Java gateway server must already be running when
# fs_coder is first called — TODO document how to start it.
gateway = JavaGateway()

In [None]:
def fs_coder(name):
    """Return the FamilySearch cluster code(s) for ``name`` via the py4j gateway.

    The value returned by ``gateway.getClusters`` is passed through unchanged.
    Callers in this notebook split it on commas, so it is treated as a string
    holding one or more codes.
    """
    # Open questions: can the result ever contain multiple comma-separated codes?
    # If so, do we index both and query one, or index one and query both?
    is_surname = given_surname == 'surname'
    # NOTE(review): the meaning of the trailing True flag is not visible from
    # this notebook — confirm against the Java-side getClusters signature.
    return gateway.getClusters(name, is_surname, True)

In [None]:
# Quick sanity check: exercises the gateway call and shows what the coder returns
fs_coder('ab')

## Evaluate

In [None]:
def get_codes(coder, nicknames, names):
    """Build the query and index tables for a coder.

    Args:
        coder: callable mapping a name to a string of comma-separated codes.
        nicknames: container mapping a name to its nickname head names; only
            consulted when the notebook-level ``given_surname`` is "given".
        names: iterable of names to code.

    Returns:
        (name2codes, code2names), both ``defaultdict(set)``:
        name2codes simulates the query side (name -> codes to look up) and
        code2names simulates the index (code -> names stored under that code).
    """
    name2codes = defaultdict(set)
    code2names = defaultdict(set)

    def _register(query_name, indexed_name):
        # Code indexed_name; query_name looks up each resulting code, and
        # indexed_name is stored in each code's bucket.
        for code in coder(indexed_name).split(','):
            name2codes[query_name].add(code)
            code2names[code].add(indexed_name)

    for name in tqdm(names, mininterval=2.0):
        # the name queries, and is indexed under, its own codes
        _register(name, name)
        if given_surname == "given" and name in nicknames:
            # the name also queries the codes of each of its nickname heads,
            # and each head is guaranteed to be present in the index
            for nickhead in nicknames[name]:
                _register(name, nickhead)
    return name2codes, code2names

def eval_clusters(coder_name, coder, nicknames, data_df, query_names):
    """Code every name in ``data_df`` with ``coder`` and print summary metrics.

    Args:
        coder_name: label of the coder; 'familysearch' disables nickname expansion.
        coder: callable passed through to ``get_codes``.
        nicknames: nickname map passed to ``get_codes`` (unless disabled).
        data_df: frame with "tree_name" and "record_name" columns.
        query_names: names used as queries.

    Prints index/query statistics and the precision, recall, f1 and f2 values
    returned by ``calc_avg_precision_recall``. Returns nothing.
    """
    # familysearch coder handles nicknames
    nickname_map = [] if coder_name == 'familysearch' else nicknames
    all_names = set(data_df["tree_name"]) | set(data_df["record_name"])
    name2codes, code2names = get_codes(coder, nickname_map, all_names)

    # Report the name count before touching name2codes[query] below: indexing a
    # defaultdict with a query it has not seen inserts that query as a new key.
    print("total names", len(name2codes))
    index_entries = sum(len(names) for names in code2names.values())
    print("total index entries", index_entries)
    print("total codes", len(code2names))
    print("total queries", len(query_names))
    lookups = sum(len(name2codes[query]) for query in query_names)
    print("total lookups", lookups)

    metrics = calc_avg_precision_recall(query_names, name2codes, code2names, data_df)
    precision, recall, f1, f2 = metrics
    print(f"precision={precision}, recall={recall} f1={f1} f2={f2}")

In [None]:
# Coders to evaluate as (name, callable) pairs; each callable maps a name to a
# string of comma-separated codes. Uncomment entries to include the jellyfish coders.
coders = [
#     ('soundex', jellyfish.soundex), 
#     ('nysiis', jellyfish.nysiis), 
    ('familysearch', fs_coder),
    ]
# Datasets to evaluate on as (label, DataFrame) pairs
data_sources = [
#    ('train', train_df),
    ('all', all_df),
    ]
# Evaluate every coder on every dataset; eval_clusters prints the metrics
for label, data_df in data_sources:
    print(label)
    for coder_name, coder in coders:
        print(coder_name)
        eval_clusters(coder_name, coder, nicknames, data_df, query_names)