In [None]:
%load_ext autoreload
%autoreload 2

# Compute Frequency-based Precision & Recall for various codes

In [None]:
from collections import defaultdict

import boto3
import jellyfish
import pandas as pd
from py4j.java_gateway import JavaGateway
from tqdm.autonotebook import tqdm

from src.eval.freq_metrics import calc_avg_precision_recall

In [None]:
# Which name type to evaluate: "given" or "surname". It selects the data files below,
# enables nickname lookups when "given", and is passed to the FS gateway as an
# is-surname flag (given_surname == "surname").
given_surname = "surname"
# These two values are only used to build the cluster file name below.
# NOTE(review): presumably the number of names clustered and the clustering threshold — confirm.
n_to_cluster = 200000
cluster_threshold = 0.1

# Tree-name / record-name pair files and the list of query names, all read with pandas from S3.
train_path = f"s3://familysearch-names/processed/tree-hr-{given_surname}-train-v2.csv.gz"
test_path = f"s3://familysearch-names/processed/tree-hr-{given_surname}-test-v2.csv.gz"
query_path = f"s3://familysearch-names/processed/query-names-{given_surname}-v2.csv.gz"

# CSV with "name" and "cluster" columns giving the Nama cluster assignment(s) of each name.
cluster_path = f"s3://nama-data/data/models/fs-{given_surname}-cluster-names-{n_to_cluster}-{cluster_threshold}.csv"

# Bucket and key of the nickname file; it is only read when given_surname == "given".
nickname_bucket = "familysearch-names"
nickname_path = "processed/givenname_nicknames.csv"

## Read data

In [None]:
# Load the training pairs. keep_default_na=False stops pandas from turning
# name strings such as "NA" or "null" into NaN.
train_df = pd.read_csv(train_path, keep_default_na=False)
print(train_df.shape)
train_df.head(3)

In [None]:
# Load the held-out test pairs with the same NaN handling as the training pairs.
test_df = pd.read_csv(test_path, keep_default_na=False)
print(test_df.shape)
test_df.head(3)

In [None]:
# Names used as queries when scoring; only the "name" column of the query file is kept, as a list.
query_names = pd.read_csv(query_path, keep_default_na=False)["name"].tolist()
print(len(query_names))
query_names[0:3]

### Read Nama Clusters

In [None]:
df = pd.read_csv(cluster_path, na_filter=False)
print(len(df))
df.head(3)

In [None]:
nama_name2clusters = defaultdict(set)
clusters = set()
for name, cluster in zip(df['name'], df['cluster']):
    nama_name2clusters[name].add(cluster)
    clusters.add(cluster)
    
print(len(nama_name2clusters))
print(len(clusters))
nama_name2clusters['ronald']

### Read Nicknames

In [None]:
s3 = boto3.resource('s3')

# nickname -> set of head names it is listed under.
# Only populated for given names; stays empty when evaluating surnames.
nama_nicknames = defaultdict(set)
if given_surname == "given":
    obj = s3.Object(nickname_bucket, nickname_path)
    contents = obj.get()['Body'].read().decode('utf-8')
    # Each line is read as "headname,nickname,nickname,...".
    for line in contents.split('\n'):
        line = line.strip()
        if not line:
            # skip blank lines (e.g. the empty string after a trailing newline)
            continue
        names = line.split(',')
        headname = names[0]
        for name in names[1:]:
            if name != headname:
                nama_nicknames[name].add(headname)
print(len(nama_nicknames))
# Preview with .get() instead of [] so the lookup does not add a 'zachery' key
# to the defaultdict (which would happen on every surname run).
nama_nicknames.get('zachery', set())

## Compute codes

In [None]:
# Py4J connection to the Java side; get_fs, get_fs_soundex and get_fs_nysiis call
# getClusters* methods through this object.
# NOTE(review): presumably requires the Java gateway server to be running already — confirm.
gateway = JavaGateway()

In [None]:
def get_identity(names):
    """Treat every name as its own code (exact-match baseline).

    Args:
        names: iterable of name strings.

    Returns:
        (name2codes, code2names), two defaultdict(set) objects. Each name maps
        to the one-element set containing itself, and each code maps back to
        that same name.
    """
    codes_by_name = defaultdict(set)
    names_by_code = defaultdict(set)
    for current in names:
        # the code is simply the name itself
        codes_by_name[current].add(current)
        names_by_code[current].add(current)
    return codes_by_name, names_by_code

def get_nysiis(names):
    """Bucket names by their NYSIIS code.

    Args:
        names: iterable of name strings.

    Returns:
        (name2codes, code2names), two defaultdict(set) objects: name -> {its
        NYSIIS code} and NYSIIS code -> {names that produce that code}.
    """
    codes_by_name, names_by_code = defaultdict(set), defaultdict(set)
    for current in names:
        nysiis_code = jellyfish.nysiis(current)
        codes_by_name[current].add(nysiis_code)
        names_by_code[nysiis_code].add(current)
    return codes_by_name, names_by_code

def get_soundex(names):
    """Bucket names by their Soundex code.

    Args:
        names: iterable of name strings.

    Returns:
        (name2codes, code2names), two defaultdict(set) objects: name -> {its
        Soundex code} and Soundex code -> {names that produce that code}.
    """
    codes_by_name, names_by_code = defaultdict(set), defaultdict(set)
    for current in names:
        soundex_code = jellyfish.soundex(current)
        codes_by_name[current].add(soundex_code)
        names_by_code[soundex_code].add(current)
    return codes_by_name, names_by_code

def get_fs(names):
    """Map names to the cluster codes returned by the FS gateway.

    For each name, gateway.getClusters(name, is_surname) is called; the result
    is treated as a comma-separated list of codes, and the name is associated
    with every one of them in both directions.

    Args:
        names: iterable of name strings.

    Returns:
        (name2codes, code2names), two defaultdict(set) objects.
    """
    codes_by_name = defaultdict(set)
    names_by_code = defaultdict(set)
    for current in tqdm(names):
        reply = gateway.getClusters(current, given_surname == "surname")
        fs_codes = reply.split(',')
        codes_by_name[current].update(fs_codes)
        for fs_code in fs_codes:
            names_by_code[fs_code].add(current)
    return codes_by_name, names_by_code

def get_fs_soundex(names):
    """Map names to FS cluster codes, using the gateway's Soundex variant.

    gateway.getClustersUsingSoundex(name, is_surname) returns a comma-separated
    list of codes. Every returned code is a code to *query* for the name
    (name2codes). For the *index* side (code2names), a code starting with "_"
    (the soundex code) is skipped when the name also has other codes, because
    the name is then already indexed under another cluster.

    Args:
        names: iterable of name strings.

    Returns:
        (name2codes, code2names), two defaultdict(set) objects.
    """
    codes_by_name = defaultdict(set)
    names_by_code = defaultdict(set)
    for current in tqdm(names):
        reply = gateway.getClustersUsingSoundex(current, given_surname == "surname")
        fs_codes = reply.split(',')
        # query side: always look up every returned code
        codes_by_name[current].update(fs_codes)
        has_several = len(fs_codes) > 1
        for fs_code in fs_codes:
            # index side: keep the soundex ("_"-prefixed) bucket only when it is the sole code
            if not (has_several and fs_code.startswith("_")):
                names_by_code[fs_code].add(current)
    return codes_by_name, names_by_code

def get_fs_nysiis(names):
    """Map names to FS cluster codes, using the gateway's NYSIIS variant.

    gateway.getClustersUsingNysiis(name, is_surname) returns a comma-separated
    list of codes. Every returned code is a code to *query* for the name
    (name2codes). For the *index* side (code2names), a code starting with "_"
    (the nysiis code) is skipped when the name also has other codes, because
    the name is then already indexed under another cluster.

    Args:
        names: iterable of name strings.

    Returns:
        (name2codes, code2names), two defaultdict(set) objects.
    """
    codes_by_name = defaultdict(set)
    names_by_code = defaultdict(set)
    for current in tqdm(names):
        reply = gateway.getClustersUsingNysiis(current, given_surname == "surname")
        fs_codes = reply.split(',')
        # query side: always look up every returned code
        codes_by_name[current].update(fs_codes)
        has_several = len(fs_codes) > 1
        for fs_code in fs_codes:
            # index side: keep the nysiis ("_"-prefixed) bucket only when it is the sole code
            if not (has_several and fs_code.startswith("_")):
                names_by_code[fs_code].add(current)
    return codes_by_name, names_by_code

def _get_nama_standards(name):
    """Return the set of Nama cluster ids for a name.

    The name itself is always looked up. When evaluating given names, the head
    names recorded for it in nama_nicknames are looked up as well. Names that
    are not in nama_name2clusters contribute nothing, so the result may be empty.
    """
    candidates = {name}
    if given_surname == "given" and name in nama_nicknames:
        candidates.update(nama_nicknames[name])
    found = set()
    for candidate in candidates:
        # .get() keeps the lookup read-only on the defaultdict, like a membership test
        found.update(nama_name2clusters.get(candidate, ()))
    return found

def get_nama_soundex(names):
    """Map names to Nama clusters, with Soundex as the fallback code.

    name2codes simulates a query (given a name, which codes to look up): it
    holds the name's Nama clusters plus, always, its Soundex code.
    code2names simulates the index (given a code, which names are stored under
    it): a name is stored under its Nama clusters, and under its Soundex code
    only when it has no Nama cluster at all.

    Args:
        names: iterable of name strings.

    Returns:
        (name2codes, code2names), two defaultdict(set) objects.
    """
    codes_by_name = defaultdict(set)
    names_by_code = defaultdict(set)
    for current in tqdm(names):
        standards = _get_nama_standards(current)
        fallback = jellyfish.soundex(current)
        # query side: cluster ids plus the soundex code, unconditionally
        codes_by_name[current].update(standards, [fallback])
        if standards:
            for standard in standards:
                names_by_code[standard].add(current)
        else:
            # index side: the soundex bucket is used only for otherwise unclustered names
            names_by_code[fallback].add(current)
    return codes_by_name, names_by_code

def get_nama_nysiis(names):
    """Map names to Nama clusters, with NYSIIS as the fallback code.

    name2codes simulates a query (given a name, which codes to look up): it
    holds the name's Nama clusters plus, always, its NYSIIS code.
    code2names simulates the index (given a code, which names are stored under
    it): a name is stored under its Nama clusters, and under its NYSIIS code
    only when it has no Nama cluster at all.

    Args:
        names: iterable of name strings.

    Returns:
        (name2codes, code2names), two defaultdict(set) objects.
    """
    codes_by_name = defaultdict(set)
    names_by_code = defaultdict(set)
    for current in tqdm(names):
        standards = _get_nama_standards(current)
        fallback = jellyfish.nysiis(current)
        # query side: cluster ids plus the nysiis code, unconditionally
        codes_by_name[current].update(standards, [fallback])
        if standards:
            for standard in standards:
                names_by_code[standard].add(current)
        else:
            # index side: the nysiis bucket is used only for otherwise unclustered names
            names_by_code[fallback].add(current)
    return codes_by_name, names_by_code

In [None]:
# Quick sanity check: display the (name2codes, code2names) pair for a handful of names.
get_nama_soundex(['john', 'dallan', 'johnny', 'elizabeth', 'quass'])

# Compute Precision and Recall on Train

## Identity

In [None]:
# Identity baseline on the train pairs: codes are built for every name that occurs
# in the tree_name or record_name column, then scored against the query names.
df = train_df
name2codes, code2names = get_identity(set(df["tree_name"]) | set(df["record_name"]))
precision, recall, f1, f2 = calc_avg_precision_recall(query_names, name2codes, code2names, df)
print(f"precision={precision}, recall={recall} f1={f1} f2={f2}")

## NYSIIS

In [None]:
# NYSIIS codes on the train pairs, built over every tree_name / record_name value.
df = train_df
name2codes, code2names = get_nysiis(set(df["tree_name"]) | set(df["record_name"]))
precision, recall, f1, f2 = calc_avg_precision_recall(query_names, name2codes, code2names, df)
print(f"precision={precision}, recall={recall} f1={f1} f2={f2}")

## Soundex

In [None]:
# Soundex codes on the train pairs, built over every tree_name / record_name value.
df = train_df
name2codes, code2names = get_soundex(set(df["tree_name"]) | set(df["record_name"]))
precision, recall, f1, f2 = calc_avg_precision_recall(query_names, name2codes, code2names, df)
print(f"precision={precision}, recall={recall} f1={f1} f2={f2}")

## Nama-Soundex

In [None]:
# Nama clusters with Soundex fallback on the train pairs.
df = train_df
name2codes, code2names = get_nama_soundex(set(df["tree_name"]) | set(df["record_name"]))
precision, recall, f1, f2 = calc_avg_precision_recall(query_names, name2codes, code2names, df)
print(f"precision={precision}, recall={recall} f1={f1} f2={f2}")

## Nama-NYSIIS

In [None]:
# Nama clusters with NYSIIS fallback on the train pairs.
df = train_df
name2codes, code2names = get_nama_nysiis(set(df["tree_name"]) | set(df["record_name"]))
precision, recall, f1, f2 = calc_avg_precision_recall(query_names, name2codes, code2names, df)
print(f"precision={precision}, recall={recall} f1={f1} f2={f2}")

## FS-Soundex

In [None]:
# FS clusters (Soundex variant) on the train pairs; get_fs_soundex calls the Py4J gateway once per name.
df = train_df
name2codes, code2names = get_fs_soundex(set(df["tree_name"]) | set(df["record_name"]))
precision, recall, f1, f2 = calc_avg_precision_recall(query_names, name2codes, code2names, df)
print(f"precision={precision}, recall={recall} f1={f1} f2={f2}")

## FS-NYSIIS

In [None]:
# FS clusters (NYSIIS variant) on the train pairs; get_fs_nysiis calls the Py4J gateway once per name.
df = train_df
name2codes, code2names = get_fs_nysiis(set(df["tree_name"]) | set(df["record_name"]))
precision, recall, f1, f2 = calc_avg_precision_recall(query_names, name2codes, code2names, df)
print(f"precision={precision}, recall={recall} f1={f1} f2={f2}")

## FS

In [None]:
# Plain FS clusters on the train pairs; get_fs calls the Py4J gateway once per name.
df = train_df
name2codes, code2names = get_fs(set(df["tree_name"]) | set(df["record_name"]))
precision, recall, f1, f2 = calc_avg_precision_recall(query_names, name2codes, code2names, df)
print(f"precision={precision}, recall={recall} f1={f1} f2={f2}")

# Compute Precision and Recall on Train and Test

## Identity

In [None]:
# Identity baseline on train and test pairs stacked into one frame (index renumbered by ignore_index=True).
df = pd.concat([train_df, test_df], ignore_index=True)
name2codes, code2names = get_identity(set(df["tree_name"]) | set(df["record_name"]))
precision, recall, f1, f2 = calc_avg_precision_recall(query_names, name2codes, code2names, df)
print(f"precision={precision}, recall={recall} f1={f1} f2={f2}")

## NYSIIS

In [None]:
# NYSIIS codes on train and test pairs stacked into one frame.
df = pd.concat([train_df, test_df], ignore_index=True)
name2codes, code2names = get_nysiis(set(df["tree_name"]) | set(df["record_name"]))
precision, recall, f1, f2 = calc_avg_precision_recall(query_names, name2codes, code2names, df)
print(f"precision={precision}, recall={recall} f1={f1} f2={f2}")

## Soundex

In [None]:
# Soundex codes on train and test pairs stacked into one frame.
df = pd.concat([train_df, test_df], ignore_index=True)
name2codes, code2names = get_soundex(set(df["tree_name"]) | set(df["record_name"]))
precision, recall, f1, f2 = calc_avg_precision_recall(query_names, name2codes, code2names, df)
print(f"precision={precision}, recall={recall} f1={f1} f2={f2}")

## Nama-Soundex

In [None]:
# Nama clusters with Soundex fallback on train and test pairs stacked into one frame.
df = pd.concat([train_df, test_df], ignore_index=True)
name2codes, code2names = get_nama_soundex(set(df["tree_name"]) | set(df["record_name"]))
precision, recall, f1, f2 = calc_avg_precision_recall(query_names, name2codes, code2names, df)
print(f"precision={precision}, recall={recall} f1={f1} f2={f2}")

## Nama-NYSIIS

In [None]:
# Nama clusters with NYSIIS fallback on train and test pairs stacked into one frame.
df = pd.concat([train_df, test_df], ignore_index=True)
name2codes, code2names = get_nama_nysiis(set(df["tree_name"]) | set(df["record_name"]))
precision, recall, f1, f2 = calc_avg_precision_recall(query_names, name2codes, code2names, df)
print(f"precision={precision}, recall={recall} f1={f1} f2={f2}")

## FS-Soundex

In [None]:
# FS clusters (Soundex variant) on train and test pairs; calls the Py4J gateway once per name.
df = pd.concat([train_df, test_df], ignore_index=True)
name2codes, code2names = get_fs_soundex(set(df["tree_name"]) | set(df["record_name"]))
precision, recall, f1, f2 = calc_avg_precision_recall(query_names, name2codes, code2names, df)
print(f"precision={precision}, recall={recall} f1={f1} f2={f2}")

## FS-NYSIIS

In [None]:
# FS clusters (NYSIIS variant) on train and test pairs; calls the Py4J gateway once per name.
df = pd.concat([train_df, test_df], ignore_index=True)
name2codes, code2names = get_fs_nysiis(set(df["tree_name"]) | set(df["record_name"]))
precision, recall, f1, f2 = calc_avg_precision_recall(query_names, name2codes, code2names, df)
print(f"precision={precision}, recall={recall} f1={f1} f2={f2}")

## FS

In [None]:
# Plain FS clusters on train and test pairs; calls the Py4J gateway once per name.
df = pd.concat([train_df, test_df], ignore_index=True)
name2codes, code2names = get_fs(set(df["tree_name"]) | set(df["record_name"]))
precision, recall, f1, f2 = calc_avg_precision_recall(query_names, name2codes, code2names, df)
print(f"precision={precision}, recall={recall} f1={f1} f2={f2}")