In [None]:
%load_ext autoreload
%autoreload 2

# Compute Frequency-based Precision & Recall for various codes

In [None]:
from collections import Counter, defaultdict
from itertools import product

import boto3
import jellyfish
import pandas as pd
from py4j.java_gateway import JavaGateway
from tqdm.autonotebook import tqdm

from src.eval.freq_metrics import calc_avg_precision_recall
from src.models.cluster import read_cluster_scores
from src.models.utils import remove_padding

In [None]:
# Which name type to evaluate; later cells compare this value against "given" and "surname".
given_surname = "surname"
# Used as the nlargest() cut-off when selecting common names and as part of the cluster file name.
n_to_cluster = 250000
# In the visible part of this notebook this only appears in the cluster file name below.
cluster_threshold = 0.15
# cluster_scores_threshold = 0.95
# cluster_scores_limit = 20

# Input data sets, all keyed by given_surname.
train_path = f"s3://familysearch-names/processed/tree-hr-{given_surname}-train-v2.csv.gz"
test_path = f"s3://familysearch-names/processed/tree-hr-{given_surname}-test-v2.csv.gz"
query_path = f"s3://familysearch-names/processed/query-names-{given_surname}-v2.csv.gz"
pref_path = f"s3://familysearch-names/processed/tree-preferred-{given_surname}-aggr.csv.gz"

# Alternative (currently disabled) cluster files are kept below for reference.
# upper = 0.95
# lower = 0.6
# m = 0.059133018459459175
# b = 0.10088609567188966
# cluster_path = f"../data/models/fs-{given_surname}-cluster-greedy-{n_to_cluster}-upper_{upper}-lower_{lower}-m_{m}-b_{b}.csv"
cluster_path = f"s3://nama-data/data/models/fs-{given_surname}-cluster-names-{n_to_cluster}-{cluster_threshold}.csv"
# cluster_path=f"s3://nama-data/data/models/fs-{given_surname}-cluster-names.csv"

# vocab_size = 610000 if given_surname == "given" else 2100000
# embed_dim = 100
# cluster_scores_path=f"s3://nama-data/data/processed/fs-{given_surname}-cluster-scores-{vocab_size}-{embed_dim}-precomputed.jsonl.gz"

# S3 location of the nickname file; it is only read when given_surname == "given".
nickname_bucket = "familysearch-names"
nickname_path = "processed/givenname_nicknames.csv"

## Read data

In [None]:
# na_filter=False turns off missing-value detection, so names such as "na" or "null"
# stay strings instead of becoming NaN.
train_df = pd.read_csv(train_path, na_filter=False)
print(train_df.shape)
train_df.head(3)

In [None]:
# Same settings as the training load: na_filter=False keeps every name as a string.
test_df = pd.read_csv(test_path, na_filter=False)
print(test_df.shape)
test_df.head(3)

In [None]:
# Names to evaluate, taken from the "name" column as a plain list
# (na_filter=False so no name is turned into NaN).
query_names = pd.read_csv(query_path, na_filter=False)["name"].tolist()
print(len(query_names))
query_names[0:3]

In [None]:
# Preferred-name table; the next cell reads its 'name' and 'frequency' columns.
pref_df = pd.read_csv(pref_path, na_filter=False)
print(pref_df.shape)
pref_df.head(3)

In [None]:
# Names of the n_to_cluster rows with the highest 'frequency' in pref_df.
# NOTE(review): in the visible part of this notebook, common_names is only referenced
# by the commented-out cluster-scores cell below.
common_names = set(pref_df.nlargest(n_to_cluster, 'frequency')['name'].tolist())
len(common_names)

### Read Nama Cluster Scores

In [None]:
# all_cluster_scores = read_cluster_scores(cluster_scores_path)
# print(len(all_cluster_scores))

In [None]:
# cluster_scores = {}
# clustered_names = set(remove_padding(name) for name in all_cluster_scores.keys() \
#                       if remove_padding(name) in common_names)
# total_cluster_scores = 0
# for name, all_scores in all_cluster_scores.items():
#     name = remove_padding(name)
#     if name not in clustered_names:
#         continue
#     scores = [(name, score) for name, score in all_scores if name in clustered_names and score >= cluster_scores_threshold]
#     if len(scores) == 0:
#         continue
#     cluster_scores[name] = scores
#     total_cluster_scores += len(scores)
# print(len(cluster_scores))
# print(total_cluster_scores)

### Read Nama Clusters

In [None]:
# Cluster assignments; the next cell reads the 'name' and 'cluster' columns.
# NOTE: `df` is rebound by the evaluation cells further down, so re-run this cell
# before rebuilding nama_name2clusters.
df = pd.read_csv(cluster_path, na_filter=False)
print(len(df))
df.head(3)

In [None]:
# Map every name to the set of Nama cluster ids it belongs to, and collect all cluster ids.
# Relies on `df` still being the cluster table loaded in the previous cell.
nama_name2clusters = defaultdict(set)
clusters = set()
for name, cluster in zip(df['name'], df['cluster']):
    nama_name2clusters[name].add(cluster)
    clusters.add(cluster)

print(len(nama_name2clusters))
print(len(clusters))
# Peek with .get(): indexing a defaultdict with a missing key inserts an empty entry,
# which would silently grow the mapping that _get_nama_standards() tests with `in`.
nama_name2clusters.get('ronald', set())

### Read Nicknames

In [None]:
# Build nickname -> {head names} from the nickname file. Each line is read as
# "headname,name1,name2,..."; the file is only loaded when evaluating given names.
s3 = boto3.resource('s3')

nama_nicknames = defaultdict(set)
if given_surname == "given":
    obj = s3.Object(nickname_bucket, nickname_path)
    contents = obj.get()['Body'].read().decode('utf-8')
    for line in contents.split('\n'):
        line = line.strip()
        if not line:
            # blank lines (e.g. after the final newline) carry no names
            continue
        headname, *nicknames = line.split(',')
        for name in nicknames:
            if name != headname:
                nama_nicknames[name].add(headname)
print(len(nama_nicknames))
# Peek with .get(): indexing a defaultdict with a missing key would insert an empty entry.
nama_nicknames.get('zachery', set())

## Compute codes

In [None]:
# py4j gateway used by the get_fs* functions below to call the Java-side clustering methods.
# NOTE(review): presumably requires a Java gateway server to be running already - confirm launch steps.
gateway = JavaGateway()

In [None]:
def get_identity(names):
    """Use every name as its own code.

    Returns a pair of defaultdict(set): name -> codes to look up, and
    code -> names indexed under that code.
    """
    by_name = defaultdict(set)
    by_code = defaultdict(set)
    for nm in names:
        # identity coding: the lookup key is the name itself
        by_name[nm].add(nm)
        by_code[nm].add(nm)
    return by_name, by_code

def get_nysiis(names):
    """Code every name with jellyfish.nysiis.

    Returns a pair of defaultdict(set): name -> codes to look up, and
    code -> names indexed under that code.
    """
    by_name = defaultdict(set)
    by_code = defaultdict(set)
    coded = ((nm, jellyfish.nysiis(nm)) for nm in names)
    for nm, key in coded:
        by_name[nm].add(key)
        by_code[key].add(nm)
    return by_name, by_code

def get_soundex(names):
    """Code every name with jellyfish.soundex.

    Returns a pair of defaultdict(set): name -> codes to look up, and
    code -> names indexed under that code.
    """
    by_name = defaultdict(set)
    by_code = defaultdict(set)
    coded = ((nm, jellyfish.soundex(nm)) for nm in names)
    for nm, key in coded:
        by_name[nm].add(key)
        by_code[key].add(nm)
    return by_name, by_code

def get_fs(names):
    """Code every name with the clusters returned by gateway.getClusters.

    The gateway result is a comma-separated string; each element becomes a
    code for the name. Returns (name -> codes, code -> names) as
    defaultdict(set).
    """
    by_name = defaultdict(set)
    by_code = defaultdict(set)
    for nm in tqdm(names):
        cluster_ids = gateway.getClusters(nm, given_surname == "surname").split(',')
        for cid in cluster_ids:
            by_name[nm].add(cid)
            by_code[cid].add(nm)
    return by_name, by_code

def get_fs_soundex(names):
    """Code every name with gateway.getClustersUsingSoundex.

    All returned codes are query codes for the name (name -> codes). For
    the index side (code -> names), a code starting with "_" (a soundex
    code) is skipped when the name has more than one code, because the
    name is already indexed under another cluster.
    """
    by_name = defaultdict(set)
    by_code = defaultdict(set)
    for nm in tqdm(names):
        cluster_ids = gateway.getClustersUsingSoundex(nm, given_surname == "surname").split(',')
        has_several = len(cluster_ids) > 1
        for cid in cluster_ids:
            by_name[nm].add(cid)
            # index under this code unless it is a redundant soundex fallback
            if not (has_several and cid.startswith("_")):
                by_code[cid].add(nm)
    return by_name, by_code

def get_fs_nysiis(names):
    """Code every name with gateway.getClustersUsingNysiis.

    All returned codes are query codes for the name (name -> codes). For
    the index side (code -> names), a code starting with "_" (a nysiis
    code) is skipped when the name has more than one code, because the
    name is already indexed under another cluster.
    """
    by_name = defaultdict(set)
    by_code = defaultdict(set)
    for nm in tqdm(names):
        cluster_ids = gateway.getClustersUsingNysiis(nm, given_surname == "surname").split(',')
        has_several = len(cluster_ids) > 1
        for cid in cluster_ids:
            by_name[nm].add(cid)
            # index under this code unless it is a redundant nysiis fallback
            if not (has_several and cid.startswith("_")):
                by_code[cid].add(nm)
    return by_name, by_code

def _get_nama_standards(name):
    """Return the Nama cluster ids for a name.

    Looks up the name itself and, when evaluating given names, every head
    name recorded for it in nama_nicknames; the result is the union of the
    clusters found in nama_name2clusters (empty set if none).
    """
    candidates = {name}
    if given_surname == "given" and name in nama_nicknames:
        candidates.update(nama_nicknames[name])
    # membership test first so the defaultdict is never grown by a lookup
    matches = (nama_name2clusters[cand] for cand in candidates if cand in nama_name2clusters)
    return set().union(*matches)

def get_nama_soundex(names):
    """Code names with Nama clusters, falling back to soundex.

    name -> codes simulates a query (which codes to look up): the Nama
    clusters plus, always, the soundex code.
    code -> names simulates the index (which names are stored under a
    code): the Nama clusters, or the soundex code only when the name has
    no Nama cluster.
    """
    by_name = defaultdict(set)
    by_code = defaultdict(set)
    for nm in tqdm(names):
        standards = _get_nama_standards(nm)
        fallback = jellyfish.soundex(nm)
        # query side: clusters plus the soundex code
        by_name[nm].update(standards)
        by_name[nm].add(fallback)
        # index side: clusters if there are any, otherwise the soundex bucket
        index_codes = standards if standards else [fallback]
        for key in index_codes:
            by_code[key].add(nm)
    return by_name, by_code

def get_nama_nysiis(names):
    """Code names with Nama clusters, falling back to nysiis.

    name -> codes simulates a query (which codes to look up): the Nama
    clusters plus, always, the nysiis code.
    code -> names simulates the index (which names are stored under a
    code): the Nama clusters, or the nysiis code only when the name has
    no Nama cluster.
    """
    by_name = defaultdict(set)
    by_code = defaultdict(set)
    for nm in tqdm(names):
        standards = _get_nama_standards(nm)
        fallback = jellyfish.nysiis(nm)
        # query side: clusters plus the nysiis code
        by_name[nm].update(standards)
        by_name[nm].add(fallback)
        # index side: clusters if there are any, otherwise the nysiis bucket
        index_codes = standards if standards else [fallback]
        for key in index_codes:
            by_code[key].add(nm)
    return by_name, by_code

# def _get_multi_nama_standards(name, limit):
#     standards = set()
#     lookups = set([name])
#     if given_surname == "given" and name in nama_nicknames:
#         lookups.update(nama_nicknames[name])
#     for lookup in lookups:
#         if lookup in cluster_scores:
#             clusters = [cluster for cluster, _ in cluster_scores[lookup]]
#             lim = limit if lookup == name else 1
#             standards.update(clusters[0:limit])
#     return standards

# def get_multi_nama_nysiis(names):
#     name2codes = defaultdict(set)
#     code2names = defaultdict(set)
#     for name in tqdm(names):
#         nysiis_code = jellyfish.nysiis(name)
#         # get index codes
#         index_codes = _get_multi_nama_standards(name, 1)
#         if len(index_codes) == 0:
#             code2names[nysiis_code].add(name)
#         else:
#             for code in index_codes:
#                 code2names[code].add(name)
#         # get query codes
#         query_codes = _get_multi_nama_standards(name, cluster_scores_limit)
#         for code in query_codes:
#             name2codes[name].add(code)
#         name2codes[name].add(nysiis_code)
#     return name2codes, code2names
    

In [None]:
# Spot-check: show the (name -> codes, code -> names) pair for a few similar names.
get_nama_nysiis(['ann', 'anna', 'anne'])

In [None]:
# Same spot-check through the Java gateway (needs `gateway` from the cell above).
get_fs_nysiis(['ann', 'anna', 'anne'])

In [None]:
# Spot-check the Nama + soundex coding on a handful of names.
get_nama_soundex(['john', 'dallan', 'johnny', 'elizabeth', 'quass'])

# Compute Precision and Recall on Train

## Identity

In [None]:
# Baseline on the training pairs: every tree/record name is its own code.
df = train_df
name2codes, code2names = get_identity(set(df["tree_name"]) | set(df["record_name"]))
precision, recall, f1, f2 = calc_avg_precision_recall(query_names, name2codes, code2names, df)
print(f"precision={precision}, recall={recall} f1={f1} f2={f2}")

## NYSIIS

In [None]:
# Training pairs, names coded with NYSIIS.
df = train_df
name2codes, code2names = get_nysiis(set(df["tree_name"]) | set(df["record_name"]))
precision, recall, f1, f2 = calc_avg_precision_recall(query_names, name2codes, code2names, df)
print(f"precision={precision}, recall={recall} f1={f1} f2={f2}")

## Soundex

In [None]:
# Training pairs, names coded with Soundex.
df = train_df
name2codes, code2names = get_soundex(set(df["tree_name"]) | set(df["record_name"]))
precision, recall, f1, f2 = calc_avg_precision_recall(query_names, name2codes, code2names, df)
print(f"precision={precision}, recall={recall} f1={f1} f2={f2}")

## Nama-Soundex

In [None]:
# Training pairs, Nama clusters with Soundex fallback; also prints index/query size statistics.
df = train_df
name2codes, code2names = get_nama_soundex(set(df["tree_name"]) | set(df["record_name"]))
print("total names", len(name2codes))
print("total index entries", sum(len(names) for names in code2names.values()))
print("total queries", len(query_names))
# NOTE(review): name2codes is a defaultdict, so indexing a query name that is absent
# inserts an empty set for it before the mapping is passed on below.
print("total lookups", sum(len(name2codes[query]) for query in query_names))
precision, recall, f1, f2 = calc_avg_precision_recall(query_names, name2codes, code2names, df)
print(f"precision={precision}, recall={recall} f1={f1} f2={f2}")

## Nama-NYSIIS

In [None]:
# Training pairs, Nama clusters with NYSIIS fallback; also prints index/query size statistics.
df = train_df
name2codes, code2names = get_nama_nysiis(set(df["tree_name"]) | set(df["record_name"]))
print("total names", len(name2codes))
print("total index entries", sum(len(names) for names in code2names.values()))
print("total codes", len(code2names))
print("total queries", len(query_names))
# NOTE(review): name2codes is a defaultdict, so indexing a query name that is absent
# inserts an empty set for it before the mapping is passed on below.
print("total lookups", sum(len(name2codes[query]) for query in query_names))
precision, recall, f1, f2 = calc_avg_precision_recall(query_names, name2codes, code2names, df)
print(f"precision={precision}, recall={recall} f1={f1} f2={f2}")

## Multi-Nama-NYSIIS

In [None]:
# df = train_df
# name2codes, code2names = get_multi_nama_nysiis(set(df["tree_name"]) | set(df["record_name"]))
# print("total names", len(name2codes))
# print("total index entries", sum(len(names) for names in code2names.values()))
# print("total queries", len(query_names))
# print("total lookups", sum(len(name2codes[query]) for query in query_names))
# precision, recall, f1, f2 = calc_avg_precision_recall(query_names, name2codes, code2names, df)
# print(f"precision={precision}, recall={recall} f1={f1} f2={f2}")

## FS-Soundex

In [None]:
# Training pairs, gateway clusters with Soundex (needs the Java gateway).
df = train_df
name2codes, code2names = get_fs_soundex(set(df["tree_name"]) | set(df["record_name"]))
precision, recall, f1, f2 = calc_avg_precision_recall(query_names, name2codes, code2names, df)
print(f"precision={precision}, recall={recall} f1={f1} f2={f2}")

## FS-NYSIIS

In [None]:
# Training pairs, gateway clusters with NYSIIS (needs the Java gateway).
# Only the size statistics are printed here; the scoring lines are currently commented out.
df = train_df
name2codes, code2names = get_fs_nysiis(set(df["tree_name"]) | set(df["record_name"]))
print("total names", len(name2codes))
print("total index entries", sum(len(names) for names in code2names.values()))
print("total codes", len(code2names))
print("total queries", len(query_names))
# NOTE(review): name2codes is a defaultdict, so indexing an absent query name inserts an empty set.
print("total lookups", sum(len(name2codes[query]) for query in query_names))
# precision, recall, f1, f2 = calc_avg_precision_recall(query_names, name2codes, code2names, df)
# print(f"precision={precision}, recall={recall} f1={f1} f2={f2}")

## FS

In [None]:
# Training pairs, gateway clusters only (needs the Java gateway).
df = train_df
name2codes, code2names = get_fs(set(df["tree_name"]) | set(df["record_name"]))
precision, recall, f1, f2 = calc_avg_precision_recall(query_names, name2codes, code2names, df)
print(f"precision={precision}, recall={recall} f1={f1} f2={f2}")

# Compute Precision and Recall on Train and Test

## Identity

In [None]:
# Train and test rows stacked into one frame; every name is its own code.
df = pd.concat([train_df, test_df], ignore_index=True)
name2codes, code2names = get_identity(set(df["tree_name"]) | set(df["record_name"]))
precision, recall, f1, f2 = calc_avg_precision_recall(query_names, name2codes, code2names, df)
print(f"precision={precision}, recall={recall} f1={f1} f2={f2}")

## NYSIIS

In [None]:
# Train and test rows stacked into one frame; names coded with NYSIIS.
df = pd.concat([train_df, test_df], ignore_index=True)
name2codes, code2names = get_nysiis(set(df["tree_name"]) | set(df["record_name"]))
precision, recall, f1, f2 = calc_avg_precision_recall(query_names, name2codes, code2names, df)
print(f"precision={precision}, recall={recall} f1={f1} f2={f2}")

## Soundex

In [None]:
# Train and test rows stacked into one frame; names coded with Soundex.
df = pd.concat([train_df, test_df], ignore_index=True)
name2codes, code2names = get_soundex(set(df["tree_name"]) | set(df["record_name"]))
precision, recall, f1, f2 = calc_avg_precision_recall(query_names, name2codes, code2names, df)
print(f"precision={precision}, recall={recall} f1={f1} f2={f2}")

## Nama-Soundex

In [None]:
# Train and test rows stacked into one frame; Nama clusters with Soundex fallback.
df = pd.concat([train_df, test_df], ignore_index=True)
name2codes, code2names = get_nama_soundex(set(df["tree_name"]) | set(df["record_name"]))
precision, recall, f1, f2 = calc_avg_precision_recall(query_names, name2codes, code2names, df)
print(f"precision={precision}, recall={recall} f1={f1} f2={f2}")

## Nama-NYSIIS

In [None]:
# Train and test rows stacked into one frame; Nama clusters with NYSIIS fallback.
df = pd.concat([train_df, test_df], ignore_index=True)
name2codes, code2names = get_nama_nysiis(set(df["tree_name"]) | set(df["record_name"]))
precision, recall, f1, f2 = calc_avg_precision_recall(query_names, name2codes, code2names, df)
print(f"precision={precision}, recall={recall} f1={f1} f2={f2}")

## FS-Soundex

In [None]:
# Train and test rows stacked into one frame; gateway clusters with Soundex (needs the Java gateway).
df = pd.concat([train_df, test_df], ignore_index=True)
name2codes, code2names = get_fs_soundex(set(df["tree_name"]) | set(df["record_name"]))
precision, recall, f1, f2 = calc_avg_precision_recall(query_names, name2codes, code2names, df)
print(f"precision={precision}, recall={recall} f1={f1} f2={f2}")

## FS-NYSIIS

In [None]:
# Train and test rows stacked into one frame; gateway clusters with NYSIIS (needs the Java gateway).
df = pd.concat([train_df, test_df], ignore_index=True)
name2codes, code2names = get_fs_nysiis(set(df["tree_name"]) | set(df["record_name"]))
precision, recall, f1, f2 = calc_avg_precision_recall(query_names, name2codes, code2names, df)
print(f"precision={precision}, recall={recall} f1={f1} f2={f2}")

## FS

In [None]:
# Train and test rows stacked into one frame; gateway clusters only (needs the Java gateway).
df = pd.concat([train_df, test_df], ignore_index=True)
name2codes, code2names = get_fs(set(df["tree_name"]) | set(df["record_name"]))
precision, recall, f1, f2 = calc_avg_precision_recall(query_names, name2codes, code2names, df)
print(f"precision={precision}, recall={recall} f1={f1} f2={f2}")

# Remove: scratch sweep over greedy-cluster files

In [None]:
# Scratch sweep: score Nama-NYSIIS on the training pairs for several greedy-cluster
# files, using only the first 2000 query names to keep each run short.
n_to_cluster_values = [50000]
upper_values = [0.9]
lower_values = [0.6]
high_freq_ix_values = [100]  # 100, 500, 2000
low_freq_ix_values = [25000, 100000]  # 10000, 25000, 100000

query_names_sample = query_names[:2000]

# The set of names to code does not depend on the cluster file, so build it once.
sweep_names = set(train_df["tree_name"]) | set(train_df["record_name"])

# _get_nama_standards() reads the module-level nama_name2clusters, so the sweep has to
# rebind that global for every cluster file. Keep the configured mapping and restore it
# afterwards so cells run after this one do not silently use the last sweep file.
# The loop variables are prefixed with sweep_ so they do not overwrite the n_to_cluster
# setting from the configuration cell.
configured_name2clusters = nama_name2clusters
sweep_grid = product(
    n_to_cluster_values, high_freq_ix_values, low_freq_ix_values, upper_values, lower_values
)
try:
    for sweep_n, sweep_high_ix, sweep_low_ix, sweep_upper, sweep_lower in sweep_grid:
        print(sweep_n, sweep_high_ix, sweep_low_ix, sweep_upper, sweep_lower)
        path = f"../data/models/fs-{given_surname}-cluster-greedy-{sweep_n}-upper_{sweep_upper}-lower_{sweep_lower}-high_freq_ix_{sweep_high_ix}-low_freq_ix_{sweep_low_ix}.csv"
        sweep_df = pd.read_csv(path, na_filter=False)
        nama_name2clusters = defaultdict(set)
        for name, cluster in zip(sweep_df['name'], sweep_df['cluster']):
            nama_name2clusters[name].add(cluster)
        name2codes, code2names = get_nama_nysiis(sweep_names)
        print("total names", len(name2codes))
        print("total index entries", sum(len(names) for names in code2names.values()))
        print("total codes", len(code2names))
        print("total queries", len(query_names_sample))
        print("total lookups", sum(len(name2codes[query]) for query in query_names_sample))
        precision, recall, f1, f2 = calc_avg_precision_recall(query_names_sample, name2codes, code2names, train_df)
        print(f"precision={precision}, recall={recall} f1={f1} f2={f2}")
finally:
    nama_name2clusters = configured_name2clusters
