In [None]:
# Notebook setup: load the autoreload extension, set it to mode 2, and render matplotlib figures inline.
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
from collections import namedtuple
import cologne_phonetics
import jellyfish
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
from metaphone import doublemetaphone
from mpire import WorkerPool
import numpy as np
from pyphonetics import RefinedSoundex
from spellwise import CaverphoneOne, CaverphoneTwo
from tqdm import tqdm
import wandb

from src.data.utils import load_train_test
from src.eval import metrics
from src.models.utils import remove_padding

In [None]:
# Run configuration.
#
# Alternative local-CSV settings, currently disabled:
# from src.data.ancestry import load_train_test
# train_path = "../data/raw/records25k_data_train.csv"
# test_path = "../data/raw/records25k_data_test.csv"
# sample_all_names = True

# Name type; interpolated into the data paths below.
given_surname = "given"

Config = namedtuple(
    "Config",
    ["train_path", "test_path", "triplet_model_path", "sample_all_names"],
)

config = Config(
    train_path=f"s3://familysearch-names/processed/tree-hr-{given_surname}-similar-train.csv.gz",
    # The active test path is the "-train-freq" file; the "-test-freq" variant is kept on the next line.
    test_path=f"s3://familysearch-names/processed/tree-hr-{given_surname}-similar-train-freq.csv.gz",
    # test_path=f"s3://familysearch-names/processed/tree-hr-{given_surname}-similar-test-freq.csv.gz",
    triplet_model_path="s3://nama-data/data/models/anc-triplet-bilstm-100-512-40-05.pth",
    # False: evaluate on the test split only; True: evaluate on train + test combined.
    sample_all_names=False,
)

In [None]:
# Print numpy floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)
# Register tqdm's pandas integration.
tqdm.pandas()
# Start a Weights & Biases run; it is grouped by the name type and records the config tuple as a dict.
wandb.init(
    project="nama",
    entity="nama",
    name="90_compare_coders",
    group=given_surname,
    notes="fs in-vocab",
    config=config._asdict(),
)

### Load data

In [None]:
# read data

# Load both files; each result unpacks into (input names, weighted actual names, candidate names).
train, test = load_train_test([config.train_path, config.test_path])

input_names_train, weighted_actual_names_train, candidate_names_train = train
input_names_test, weighted_actual_names_test, candidate_names_test = test

# Combined train + test views. Candidates are concatenated with numpy; the other two are joined
# with `+` (assumes they are Python lists, so `+` concatenates -- TODO confirm).
candidate_names_all = np.concatenate((candidate_names_train, candidate_names_test))
input_names_all = input_names_train + input_names_test
weighted_actual_names_all = weighted_actual_names_train + weighted_actual_names_test

In [None]:
# sample

# Evaluate on the combined train + test data when config.sample_all_names is set,
# otherwise on the test split only.
if config.sample_all_names:
    input_names_sample = input_names_all
    weighted_actual_names_sample = weighted_actual_names_all
    candidate_names_sample = candidate_names_all
else:
    input_names_sample = input_names_test
    weighted_actual_names_sample = weighted_actual_names_test
    candidate_names_sample = candidate_names_test

In [None]:
# Report the number of entries in every split, one "<label> <length>" line each.
for _label, _values in [
    ("input_names_train", input_names_train),
    ("input_names_test", input_names_test),
    ("input_names_all", input_names_all),
    ("input_names_sample", input_names_sample),
    ("weighted_actual_names_train", weighted_actual_names_train),
    ("weighted_actual_names_test", weighted_actual_names_test),
    ("weighted_actual_names_all", weighted_actual_names_all),
    ("weighted_actual_names_sample", weighted_actual_names_sample),
    ("candidate_names_train", candidate_names_train),
    ("candidate_names_test", candidate_names_test),
    ("candidate_names_all", candidate_names_all),
    ("candidate_names_sample", candidate_names_sample),
]:
    print(_label, len(_values))

### Models

In [None]:
# various coders
caverphone_one = CaverphoneOne()
caverphone_two = CaverphoneTwo()
refined_soundex = RefinedSoundex()

In [None]:
# Names of the phonetic coders to evaluate; each is passed as `algo` to calc_similarity_to,
# so it must match one of the branches there.
coding_algos = [
    "soundex",
    "nysiis",
    "metaphone",
    "caverphone1",
    "caverphone2",
    "refined_soundex",
    #     "double_metaphone",  # disabled -- author's note: bad implementation? (unverified)
    "cologne_phonetics",
    "match_rating",
]

In [None]:
def calc_similarity_to(name, algo="levenshtein"):
    """Build a scorer that compares candidate names against ``name`` with a phonetic coder.

    Parameters
    ----------
    name : str
        Probe name. Padding is stripped with ``remove_padding`` before use.
    algo : str
        One of "soundex", "nysiis", "metaphone", "caverphone1", "caverphone2",
        "refined_soundex", "cologne_phonetics", "double_metaphone" or "match_rating".
        Any other value -- including the default "levenshtein", which has no branch
        here -- produces a scorer that always returns 0.0.

    Returns
    -------
    callable
        ``calc_similarity(row)``, where ``row[0]`` is a (padded) candidate name. It returns
        1.0 when the coder considers probe and candidate a match and 0.0 otherwise.

    Notes
    -----
    The probe name is encoded once, when the scorer is built, rather than once per
    candidate row. For "double_metaphone", empty codes are ignored, so two names that both
    lack a secondary code are not treated as matching on the empty string.
    """
    name = remove_padding(name)

    if algo == "match_rating":
        # Pairwise comparison: there is no per-name key that could be precomputed.
        def calc_similarity(row):
            cand_name = remove_padding(row[0])
            return 1.0 if jellyfish.match_rating_comparison(name, cand_name) else 0.0

        return calc_similarity

    if algo == "double_metaphone":
        # doublemetaphone yields (primary, secondary) with "" standing in for a missing
        # secondary code; keep only the non-empty probe codes so "" can never match.
        name_codes = [code for code in doublemetaphone(name) if code]

        def calc_similarity(row):
            cand_codes = doublemetaphone(remove_padding(row[0]))
            return 1.0 if any(code in cand_codes for code in name_codes) else 0.0

        return calc_similarity

    # Coders that map a name to a single key: two names match when their keys are equal.
    # Lambdas defer the lookup of each coder until that coder is actually selected.
    encoders = {
        "caverphone1": lambda text: caverphone_one._pre_process(text),
        "caverphone2": lambda text: caverphone_two._pre_process(text),
        "refined_soundex": lambda text: refined_soundex.phonetics(text),
        # Only element [0][1] of the encode() result is compared.
        "cologne_phonetics": lambda text: cologne_phonetics.encode(text)[0][1],
        "soundex": lambda text: jellyfish.soundex(text),
        "nysiis": lambda text: jellyfish.nysiis(text),
        "metaphone": lambda text: jellyfish.metaphone(text),
    }
    encode = encoders.get(algo)

    if encode is None:
        # Unknown coder: nothing is ever considered similar.
        def calc_similarity(row):
            return 0.0

        return calc_similarity

    # Loop-invariant: the probe's key does not depend on the candidate.
    name_key = encode(name)

    def calc_similarity(row):
        cand_name = remove_padding(row[0])
        return 1.0 if encode(cand_name) == name_key else 0.0

    return calc_similarity

In [None]:
# test double metaphone
# Sanity check of the double-metaphone comparison on a single pair of names.
name = "smith"
cand_name = "schmidt"
dm1 = doublemetaphone(name)
dm2 = doublemetaphone(cand_name)
# doublemetaphone returns "" when a name has no secondary code; skip empty codes so that
# two missing secondary codes are not counted as a shared code.
similarity = 1.0 if any(code and code in dm2 for code in dm1) else 0.0
print("dm1", dm1)
print("dm2", dm2)
print("similarity", similarity)

#### Similarity Function

In [None]:
def get_similars(shared, name=""):
    """Return the best-scoring candidates for ``name`` as ``(candidate, score)`` pairs.

    ``shared`` is the tuple ``(candidate names array, k, algo)``. Every candidate is scored
    with the scorer built by ``calc_similarity_to(name, algo)``; the result lists at most
    ``k`` pairs, ordered by reversing an ascending argsort of the scores.
    """
    candidates, top_k, coder = shared
    scorer = calc_similarity_to(name, coder)
    # View the 1-D candidate array as a column so the scorer receives one-element rows.
    all_scores = np.apply_along_axis(scorer, 1, candidates[:, None])
    # Ascending argsort, reversed, then truncated to the first top_k indices.
    best = np.argsort(all_scores)[::-1][:top_k]
    return list(zip(candidates[best], all_scores[best]))

#### Demo

## Test Soundex

In [None]:
k = 1000  # Number of candidates to consider

# Use one test name as the probe (index 251 looks like an arbitrary pick -- TODO confirm).
ix = 251
probe_name = input_names_test[ix]
print(probe_name)
# Wrap the single result in a list so it has the same nesting as a batch of results.
similar_names_scores = [get_similars((candidate_names_sample, k, "soundex"), probe_name)]
# Show the first five (name, score) pairs.
similar_names_scores[0][:5]

In [None]:
# Split the per-probe (name, score) pairs into two parallel 2D arrays and stack them along a
# third axis. (Open question: is there a way to build this without taking the pairs apart?)
# names: [probe, similar-name] -> candidate name, stored as objects
names = np.array([[pair[0] for pair in row] for row in similar_names_scores], dtype="O")
# scores: [probe, similar-name] -> score, stored as float64
scores = np.array([[pair[1] for pair in row] for row in similar_names_scores], dtype="f8")
# similar_names_scores: [probe, similar-name, (name, score)]
similar_names_scores = np.dstack((names, scores))

In [None]:
# Weighted recall for the probe name; the last argument (0.5) is presumably the score threshold.
metrics.weighted_recall_at_threshold(weighted_actual_names_test[ix], similar_names_scores[0], 0.5)

In [None]:
# Precision for the probe name; the last argument (0.5) is presumably the score threshold.
metrics.precision_at_threshold(weighted_actual_names_test[ix], similar_names_scores[0], 0.5)

# Evaluate each algorithm

In [None]:
k = 5000  # Number of candidates to consider
extra_algos = 0  # Number of leading color slots reserved for the reference points commented out below
# Actual names without their weights. Not used further down in this cell (kept as-is).
actual_names_sample = [[name for name, _, _ in name_weights] for name_weights in weighted_actual_names_sample]
figure, ax = plt.subplots(1, 1, figsize=(20, 15))
ax.set_title("PR at threshold")
ax.set_xlabel("recall")
ax.set_ylabel("precision")
colors = cm.rainbow(np.linspace(0, 1, len(coding_algos) + extra_algos))

# plot anc-triplet-bilstm-100-512-40-05 model
# ax.plot([.809], [.664], "o--", color=colors[0], label="triplet-cluster")
# ax.plot([.594], [.543], "o--", color=colors[1], label="dam-lev-cluster")

# One precision/recall point per coder.
for algo, color in zip(coding_algos, colors[extra_algos:]):
    print(algo)
    # Score every sampled input name against the candidates in parallel; the shared tuple is
    # what get_similars unpacks as (candidate names, k, algo).
    with WorkerPool(shared_objects=(candidate_names_sample, k, algo)) as pool:
        similar_names_scores = pool.map(get_similars, input_names_sample, progress_bar=True)
    # Split the (name, score) pairs into parallel 2D arrays: [input name, similar-name].
    names = np.array([[cell[0] for cell in row] for row in similar_names_scores], dtype="O")
    scores = np.array([[cell[1] for cell in row] for row in similar_names_scores], dtype="f8")
    # Scores are 0.0 or 1.0, so a row sum is the number of matches found for that input name.
    total = scores.sum(axis=1).max()
    print("max sum of scores", total)
    if total == k:
        # Every one of the k returned candidates matched, so further matches may have been cut off.
        print("WARNING!!! need to increase k!!!")
    # Re-assemble as a 3D array: [input name, similar-name, (name, score)].
    similar_names_scores = np.dstack((names, scores))
    precision = metrics.avg_precision_at_threshold(weighted_actual_names_sample, similar_names_scores, 0.5)
    recall = metrics.avg_weighted_recall_at_threshold(weighted_actual_names_sample, similar_names_scores, 0.5)
    print(f"precision={precision} recall={recall}")
    precisions = [precision]
    recalls = [recall]
    ax.plot(recalls, precisions, "o--", color=color, label=algo)

ax.legend()
ax.set_xlim([0, 1.0])
ax.set_ylim([0, 1.0])
plt.show()

In [None]:
# Close the Weights & Biases run opened by wandb.init earlier in the notebook.
wandb.finish()