In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
from collections import namedtuple
import cologne_phonetics
import jellyfish
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
from metaphone import doublemetaphone
from mpire import WorkerPool
import numpy as np
from pyphonetics import RefinedSoundex
from scipy.sparse import vstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils.extmath import safe_sparse_dot
from spellwise import CaverphoneOne, CaverphoneTwo
import torch
from tqdm import tqdm

from src.data.ancestry import load_train_test
from src.metrics import metrics
from src.models.utils import remove_padding, get_best_matches, build_token_idx_maps, convert_names_to_model_inputs

In [None]:
# Print floats in fixed-point notation rather than scientific notation.
np.set_printoptions(suppress=True)
# Register tqdm's progress-bar variants of the pandas apply/map methods.
tqdm.pandas()

### Load and process data

In [None]:
# Load the ancestry train and test splits from the raw CSV files.
train, test = load_train_test(
    "../data/raw/records25k_data_train.csv",
    "../data/raw/records25k_data_test.csv",
)

# Each split unpacks into (input names, weighted actual names, candidate names).
(input_names_train, weighted_actual_names_train, candidate_names_train) = train
(input_names_test, weighted_actual_names_test, candidate_names_test) = test

# Combined train + test views used by the evaluation cells further down.
# NOTE(review): the `+` below concatenates only if these are Python lists — confirm
# against load_train_test's return types.
input_names_all = input_names_train + input_names_test
weighted_actual_names_all = weighted_actual_names_train + weighted_actual_names_test
candidate_names_all = np.concatenate([candidate_names_train, candidate_names_test])

### Model

In [None]:
# Phonetic coder instances used by calc_similarity_to.
caverphone_one = CaverphoneOne()
caverphone_two = CaverphoneTwo()
refined_soundex = RefinedSoundex()

# tf-idf over character 1- to 3-grams (within word boundaries); the vocabulary is
# fitted on the training candidates only and then applied to the test candidates.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3), analyzer="char_wb", min_df=10, max_df=0.5)
tfidf_X_train = tfidf_vectorizer.fit_transform(candidate_names_train)
tfidf_X_test = tfidf_vectorizer.transform(candidate_names_test)
# Row order matches candidate_names_all (train candidates first, then test).
tfidf_X_all = vstack((tfidf_X_train, tfidf_X_test))

# Autoencoder trained with triplet loss.
# map_location="cpu" deserialises the checkpoint straight onto the CPU, so a model
# saved from a GPU never touches GPU memory and also loads on machines without CUDA.
# NOTE(review): torch.load unpickles arbitrary Python objects — only load trusted files.
# NOTE(review): newer PyTorch releases default to weights_only=True, which rejects
# whole pickled model objects; if loading fails there, pass weights_only=False.
triplet_model = torch.load("../data/models/anc-triplet-bilstm-100-512-40-05.pth", map_location="cpu")
# Kept as a harmless no-op safeguard; the model reads its own `device` attribute.
triplet_model.to("cpu")
triplet_model.device = "cpu"


In [None]:
# Description of a score-based algorithm: its name, the threshold range swept when
# evaluating it, and the `distances` flag forwarded to the metrics code
# (presumably True when scores are distances rather than similarities — confirm).
SimilarityAlgo = namedtuple(
    "SimilarityAlgo",
    ["name", "min_threshold", "max_threshold", "distances"],
)

# Algorithms that produce a graded score per candidate.
similarity_algos = [
    SimilarityAlgo(*spec)
    for spec in (
        ("tfidf", 0.5, 1.0, False),
        ("levenshtein", 0.5, 1.0, False),
        ("damerau_levenshtein", 0.5, 1.0, False),
        ("jaro_winkler", 0.5, 1.0, False),
        ("triplet", 0.01, 1.0, True),
    )
]

# Algorithms that only report match / no match.
coding_algos = [
    "soundex", "nysiis", "metaphone",
    "caverphone1", "caverphone2", "refined_soundex",
    "double_metaphone", "cologne_phonetics", "match_rating",
]

In [None]:
def calc_similarity_to(name, algo="levenshtein"):
    """Build a scoring function that compares candidate names against ``name``.

    Args:
        name: The reference name; it is passed through ``remove_padding`` first.
        algo: Which comparison to use.
            * "levenshtein" / "damerau_levenshtein": 1 - distance / longer length.
            * "jaro_winkler": the value of ``jellyfish.jaro_winkler_similarity``.
            * "soundex", "nysiis", "metaphone", "caverphone1", "caverphone2",
              "refined_soundex", "cologne_phonetics": 1.0 when both names get the
              same code, else 0.0.
            * "double_metaphone": 1.0 when the names share a non-empty code.
            * "match_rating": 1.0 when ``match_rating_comparison`` is truthy.
            Any other value yields a scorer that always returns 0.0.

    Returns:
        ``calc_similarity(row)``: takes a sequence whose first element is a
        candidate name (padding is removed) and returns a float similarity.
    """
    name = remove_padding(name)

    def phonetic_coder():
        """Return the name -> code function for code-equality algorithms, else None."""
        # Resolved lazily so that only the object the chosen algorithm needs is touched.
        if algo == "soundex":
            return jellyfish.soundex
        if algo == "nysiis":
            return jellyfish.nysiis
        if algo == "metaphone":
            return jellyfish.metaphone
        if algo == "caverphone1":
            return caverphone_one._pre_process
        if algo == "caverphone2":
            return caverphone_two._pre_process
        if algo == "refined_soundex":
            return refined_soundex.phonetics
        if algo == "cologne_phonetics":
            # Compares element [0][1] of encode()'s result (presumably the code of the
            # first word — confirm against the cologne_phonetics documentation).
            return lambda text: cologne_phonetics.encode(text)[0][1]
        return None

    def edit_similarity(distance_fn, cand_name):
        """Normalise an edit distance into a similarity by the longer name's length."""
        longest = max(len(name), len(cand_name))
        if longest == 0:
            # Both names are empty: identical, and dividing by zero must be avoided.
            return 1.0
        return 1 - (distance_fn(name, cand_name) / longest)

    encode = phonetic_coder()
    if encode is not None:
        # The reference name's code does not depend on the candidate: compute it once.
        name_code = encode(name)

        def score(cand_name):
            return 1.0 if encode(cand_name) == name_code else 0.0

    elif algo == "double_metaphone":
        # doublemetaphone returns (primary, secondary) where the secondary is "" when
        # there is no alternate code. Empty codes are dropped so that two names which
        # both lack a secondary code are not reported as a match via "" == "".
        name_codes = [code for code in doublemetaphone(name) if code]

        def score(cand_name):
            cand_codes = doublemetaphone(cand_name)
            return 1.0 if any(code in cand_codes for code in name_codes) else 0.0

    elif algo == "levenshtein":

        def score(cand_name):
            return edit_similarity(jellyfish.levenshtein_distance, cand_name)

    elif algo == "damerau_levenshtein":

        def score(cand_name):
            return edit_similarity(jellyfish.damerau_levenshtein_distance, cand_name)

    elif algo == "jaro_winkler":

        def score(cand_name):
            return jellyfish.jaro_winkler_similarity(name, cand_name)

    elif algo == "match_rating":

        def score(cand_name):
            return 1.0 if jellyfish.match_rating_comparison(name, cand_name) else 0.0

    else:
        # Unknown algorithm names keep their previous behaviour: every score is 0.0.
        def score(cand_name):
            return 0.0

    def calc_similarity(row):
        return score(remove_padding(row[0]))

    return calc_similarity

In [None]:
# Sanity-check the double metaphone matching rule on a known spelling variant pair.
name = "smith"
cand_name = "schmidt"
dm1 = doublemetaphone(name)
dm2 = doublemetaphone(cand_name)
# doublemetaphone returns (primary, secondary); the secondary is "" when a name has
# no alternate code, so empty codes are skipped to stop "" == "" counting as a match.
similarity = 1.0 if any(code in dm2 for code in dm1 if code) else 0.0
print("dm1", dm1)
print("dm2", dm2)
print("similarity", similarity)

#### Similarity Function

In [None]:
def get_similars(shared, name=''):
    """Return the k best-scoring candidate names for ``name``.

    Args:
        shared: Tuple of (candidate names array, k, algorithm name,
            tf-idf vectorizer, tf-idf matrix of the candidates). The last two
            entries are only read when the algorithm is "tfidf".
        name: The query name.

    Returns:
        A list of (candidate name, score) pairs, highest score first.
    """
    candidates, top_k, algo_name, vectorizer, candidate_matrix = shared

    if algo_name != "tfidf":
        scorer = calc_similarity_to(name, algo_name)
        # The extra axis makes apply_along_axis hand the scorer one single-element
        # row per candidate.
        scores = np.apply_along_axis(scorer, 1, candidates[:, None])
    else:
        # Dot product of every candidate's tf-idf vector with the query's vector.
        query = vectorizer.transform([name]).toarray()
        scores = safe_sparse_dot(candidate_matrix, query.T).flatten()

    # Indices in descending score order, truncated to the k best.
    best_idx = np.argsort(scores)[::-1][:top_k]
    return list(zip(candidates[best_idx], scores[best_idx]))

#### Demo

In [None]:
# Top-10 Levenshtein matches for one name. The shared tuple is
# (candidates, k, algo, tfidf_vectorizer, tfidf_X_all); the two tf-idf entries
# can be None because only the "tfidf" algorithm reads them.
get_similars((candidate_names_all, 10, "levenshtein", None, None), "<bostelman>")

## Test tfidf

In [None]:
# Top-10 candidates by tf-idf dot-product score; "tfidf" is the only algorithm that
# needs the vectorizer and the candidate matrix in the shared tuple.
get_similars((candidate_names_all, 10, "tfidf", tfidf_vectorizer, tfidf_X_all), "<schumacher>")

## Test levenshtein

In [None]:
# Test input name at index 251, used as the example query in the cells below.
input_names_test[251]

In [None]:
# Weighted actual names for test input 251.
weighted_actual_names_test[251]

In [None]:
k = 100  # Number of candidates to consider
# The single query's result is wrapped in a list: one entry per query name.
similar_names_scores = [
    get_similars(
        (candidate_names_all, k, "levenshtein", None, None),
        input_names_test[251],
    )
]
# Peek at the five best (name, score) pairs.
similar_names_scores[0][:5]

In [None]:
# Split the per-query (name, score) pairs into two parallel 2D arrays, then stack
# them along a new third axis.
# names: axis 0 = query names, axis 1 = names of the k similar candidates
names = np.array([[pair[0] for pair in matches] for matches in similar_names_scores], dtype="O")
# scores: axis 0 = query names, axis 1 = scores of the k similar candidates
scores = np.array([[pair[1] for pair in matches] for matches in similar_names_scores], dtype="f8")
# similar_names_scores becomes 3D: axis 0 = query names, axis 1 = k similar names,
# axis 2 = (name, score)
similar_names_scores = np.dstack((names, scores))

In [None]:
# Weighted recall for test input 251 with the Levenshtein candidates at threshold 0.85.
metrics.weighted_recall_at_threshold(weighted_actual_names_test[251], similar_names_scores[0], 0.85)

In [None]:
# Same check at the lower threshold 0.75, for comparison with the 0.85 result.
metrics.weighted_recall_at_threshold(weighted_actual_names_test[251], similar_names_scores[0], 0.75)

## Test Soundex

In [None]:
k = 1000  # Number of candidates to consider
# The single query's result is wrapped in a list: one entry per query name.
similar_names_scores = [
    get_similars(
        (candidate_names_all, k, "soundex", None, None),
        input_names_test[251],
    )
]
# Peek at the five best (name, score) pairs.
similar_names_scores[0][:5]

In [None]:
# Split the per-query (name, score) pairs into two parallel 2D arrays, then stack
# them along a new third axis.
# names: axis 0 = query names, axis 1 = names of the k similar candidates
names = np.array([[pair[0] for pair in matches] for matches in similar_names_scores], dtype="O")
# scores: axis 0 = query names, axis 1 = scores of the k similar candidates
scores = np.array([[pair[1] for pair in matches] for matches in similar_names_scores], dtype="f8")
# similar_names_scores becomes 3D: axis 0 = query names, axis 1 = k similar names,
# axis 2 = (name, score)
similar_names_scores = np.dstack((names, scores))

In [None]:
# Soundex scores are either 0.0 or 1.0, so a 0.5 threshold separates matches from non-matches.
metrics.weighted_recall_at_threshold(weighted_actual_names_test[251], similar_names_scores[0], 0.5)

In [None]:
# Precision for test input 251 with the Soundex candidates at threshold 0.5.
metrics.precision_at_threshold(weighted_actual_names_test[251], similar_names_scores[0], 0.5)

# Evaluate each algorithm

In [None]:
def triplet_eval(triplet_model, input_names, candidate_names_all, k):
    """Find the ``k`` best candidates for each input name using the triplet model.

    Both the input names and the candidate names are embedded with the model's
    encoder, then ``get_best_matches`` ranks candidates by euclidean distance.

    Args:
        triplet_model: The loaded model; it is called with ``just_encoder=True``.
        input_names: Names to find matches for.
        candidate_names_all: Names to search among.
        k: Number of candidates to return per input name.

    Returns:
        Whatever ``get_best_matches`` returns for the two embedding sets.
    """
    MAX_NAME_LENGTH = 30
    char_to_idx_map, _ = build_token_idx_maps()

    def encode(names):
        """Convert names to model inputs and return their encoder output as numpy."""
        names_X, _ = convert_names_to_model_inputs(names, char_to_idx_map, MAX_NAME_LENGTH)
        return triplet_model(names_X, just_encoder=True).detach().numpy()

    # Evaluate in eval mode so layers such as dropout behave deterministically, and
    # without autograd so no graph is retained for the (large) candidate batch.
    was_training = triplet_model.training
    triplet_model.eval()
    try:
        with torch.no_grad():
            input_names_encoded = encode(input_names)
            candidate_names_all_encoded = encode(candidate_names_all)
    finally:
        # Leave the model in whatever mode the caller had it in.
        triplet_model.train(was_training)

    return get_best_matches(
        input_names_encoded, candidate_names_all_encoded, candidate_names_all, num_candidates=k, metric="euclidean"
    )

In [None]:
k = 1000  # Number of candidates to consider
# NOTE(review): actual_names_all is not read by any visible cell; kept in case other cells use it.
actual_names_all = [[name for name, _, _ in name_weights] for name_weights in weighted_actual_names_all]
figure, ax = plt.subplots(1, 1, figsize=(20, 15))
ax.set_title("PR at threshold")
# Axis labels match the ax.plot(recalls, precisions, ...) call below.
ax.set_xlabel("weighted recall")
ax.set_ylabel("precision")
colors = cm.rainbow(np.linspace(0, 1, len(similarity_algos)))
# TODO use input_names_test and weighted_actual_names_test
input_names_sample = input_names_all

# One precision / weighted-recall curve per score-based algorithm.
for algo, color in zip(similarity_algos, colors):
    print(algo.name)
    if algo.name == "triplet":
        similar_names_scores = triplet_eval(triplet_model, input_names_sample, candidate_names_all, k)
    else:
        # Each worker receives the shared tuple plus one input name (see get_similars).
        with WorkerPool(shared_objects=(candidate_names_all, k, algo.name, tfidf_vectorizer, tfidf_X_all)) as pool:
            similar_names_scores = pool.map(get_similars, input_names_sample, progress_bar=True)
        # NOTE(review): similar_names is not read by any visible cell; kept in case other cells use it.
        similar_names = [[name for name, _ in name_similarities] for name_similarities in similar_names_scores]
        # Convert the per-input (name, score) pairs to a 3D array: inputs x k x (name, score).
        names = np.array(list(list(cell[0] for cell in row) for row in similar_names_scores), dtype="O")
        scores = np.array(list(list(cell[1] for cell in row) for row in similar_names_scores), dtype="f8")
        similar_names_scores = np.dstack((names, scores))
    precisions, recalls = metrics.precision_weighted_recall_at_threshold(
        weighted_actual_names_all, similar_names_scores,
        min_threshold=algo.min_threshold, max_threshold=algo.max_threshold, distances=algo.distances
    )
    ax.plot(recalls, precisions, "o--", color=color, label=algo.name)

ax.legend()
plt.show()

In [None]:
k = 1000  # Number of candidates to consider
# NOTE(review): actual_names_all is not read by any visible cell; kept in case other cells use it.
actual_names_all = [[name for name, _, _ in name_weights] for name_weights in weighted_actual_names_all]
figure, ax = plt.subplots(1, 1, figsize=(20, 15))
ax.set_title("PR at threshold")
# Axis labels match the ax.plot(recalls, precisions, ...) call in the loop below.
ax.set_xlabel("weighted recall")
ax.set_ylabel("precision")
# Two extra colors are reserved for the hard-coded reference points.
colors = cm.rainbow(np.linspace(0, 1, len(coding_algos)+2))
input_names_sample = input_names_all

# plot anc-triplet-bilstm-100-512-40-05 model
# NOTE(review): these two points are hard-coded; they are not computed in any
# visible cell — record where they were measured.
ax.plot([.809], [.664], "o--", color=colors[0], label="triplet-cluster")
ax.plot([.594], [.543], "o--", color=colors[1], label="dam-lev-cluster")

# Coding algorithms score 0.0 or 1.0, so each contributes a single point at threshold 0.5.
for algo, color in zip(coding_algos, colors[2:]):
    print(algo)
    # Each worker receives the shared tuple plus one input name (see get_similars).
    with WorkerPool(shared_objects=(candidate_names_all, k, algo, tfidf_vectorizer, tfidf_X_all)) as pool:
        similar_names_scores = pool.map(get_similars, input_names_sample, progress_bar=True)
    # NOTE(review): similar_names is not read by any visible cell; kept in case other cells use it.
    similar_names = [[name for name, _ in name_similarities] for name_similarities in similar_names_scores]
    # Convert the per-input (name, score) pairs to a 3D array: inputs x k x (name, score).
    names = np.array(list(list(cell[0] for cell in row) for row in similar_names_scores), dtype="O")
    scores = np.array(list(list(cell[1] for cell in row) for row in similar_names_scores), dtype="f8")
    similar_names_scores = np.dstack((names, scores))
    precision = metrics.avg_precision_at_threshold(weighted_actual_names_all, similar_names_scores, 0.5)
    recall = metrics.avg_weighted_recall_at_threshold(weighted_actual_names_all, similar_names_scores, 0.5)
    print(f"precision={precision} recall={recall}")
    precisions = [precision]
    recalls = [recall]
    ax.plot(recalls, precisions, "o--", color=color, label=algo)

ax.legend()
plt.show()