In [None]:
# Enable IPython's autoreload extension (mode 2) so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from abydos import distance
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
import numpy as np
import jellyfish
from tqdm import tqdm

from src.data.ancestry import load_train_test
from src.metrics import metrics
from src.models import utils

In [None]:
# Print NumPy floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)
# Register tqdm's pandas integration (progress_apply etc.).
# NOTE(review): nothing visible in this notebook uses pandas progress bars - confirm this is still needed.
tqdm.pandas()

### Load and process data

In [None]:
# Only need to run this once
# It splits the data into train/test and persists the result on disk
# train_test_split(test_size=0.1)

In [None]:
# Load the train and test splits from disk.
# Plain string literals are used here: the paths contain no placeholders, so an f-prefix is unnecessary.
train, test = load_train_test("../data/raw/records25k_data_train.csv", "../data/raw/records25k_data_test.csv")

# Each split unpacks into three parts: input names, weighted actual names, candidate names.
input_names_train, weighted_actual_names_train, candidate_names_train = train
input_names_test, weighted_actual_names_test, candidate_names_test = test

# Pool both splits so the models can be scored against every candidate name.
# candidate_names_* are joined with np.concatenate (array-like); the other two are joined with `+`.
# NOTE(review): `+` presumably concatenates Python lists here - confirm against load_train_test.
candidate_names_all = np.concatenate((candidate_names_train, candidate_names_test))
input_names_all = input_names_train + input_names_test
weighted_actual_names_all = weighted_actual_names_train + weighted_actual_names_test

### Model

In [None]:
# Abydos distance objects, created once here and reused by calc_similarity_to
# through their .sim() method.
# Only strcmp95 and dice appear in the algorithm lists defined in this notebook;
# the others are reachable by passing their name as `algo` explicitly.
sc95 = distance.Strcmp95()
nw = distance.NeedlemanWunsch()
sw = distance.SmithWaterman()
gotoh = distance.Gotoh()
dice = distance.Dice()
me = distance.MongeElkan(symmetric=True)

In [None]:
# Algorithms for which calc_similarity_to produces a graded similarity score.
continuous_algos = ["levenshtein", "damerau_levenshtein", "jaro_winkler", "strcmp95", "dice"]
# Algorithms for which calc_similarity_to only produces a match / no-match result (1 or 0).
boolean_algos = ["match_rating", "soundex", "nysiis", "metaphone"]
# Elasticsearch has metaphone double_metaphone, soundex, refined_soundex, caverphone1, caverphone2, cologne, nysiis, koelnerphonetik, haasephonetik, beider_morse, daitch_mokotoff
# algos = continuous_algos + boolean_algos
# Algorithms to evaluate; currently only the continuous ones are selected.
algos = continuous_algos

In [None]:
def calc_similarity_to(name, algo="levenshtein"):
    """Build a function that scores candidate names against ``name``.

    Parameters
    ----------
    name : str
        Reference name. It is passed through ``utils.remove_padding`` once,
        up front, before any comparison.
    algo : str
        Name of the similarity algorithm. One of: "levenshtein",
        "damerau_levenshtein", "jaro_winkler", "strcmp95", "dice",
        "needleman_wunsch", "smith_waterman", "gotoh", "monge_elkan",
        "match_rating", "soundex", "nysiis", "metaphone".

    Returns
    -------
    callable
        ``calc_similarity(row)`` where ``row[0]`` is a candidate name. It
        returns the similarity as a ``float``; the phonetic / match-rating
        algorithms return exactly 1.0 or 0.0.

    Raises
    ------
    ValueError
        If ``algo`` is not one of the supported names. Previously an unknown
        name silently scored every candidate as 0.
    """
    supported_algos = (
        "levenshtein",
        "damerau_levenshtein",
        "jaro_winkler",
        "strcmp95",
        "dice",
        "needleman_wunsch",
        "smith_waterman",
        "gotoh",
        "monge_elkan",
        "match_rating",
        "soundex",
        "nysiis",
        "metaphone",
    )
    if algo not in supported_algos:
        raise ValueError(f"Unknown similarity algorithm: {algo!r}")

    name = utils.remove_padding(name)

    def edit_similarity(dist, cand_name):
        # Normalise an edit distance by the longer of the two names.
        # Two empty names are identical, so score them 1.0 instead of dividing by zero.
        longest = max(len(name), len(cand_name))
        if longest == 0:
            return 1.0
        return 1 - (dist / longest)

    def calc_similarity(row):
        cand_name = utils.remove_padding(row[0])
        similarity = 0
        if algo == "levenshtein":
            similarity = edit_similarity(jellyfish.levenshtein_distance(name, cand_name), cand_name)
        elif algo == "damerau_levenshtein":
            similarity = edit_similarity(jellyfish.damerau_levenshtein_distance(name, cand_name), cand_name)
        elif algo == "jaro_winkler":
            similarity = jellyfish.jaro_winkler_similarity(name, cand_name)
        elif algo == "strcmp95":
            similarity = sc95.sim(name, cand_name)
        elif algo == "dice":
            similarity = dice.sim(name, cand_name)
        elif algo == "needleman_wunsch":
            similarity = nw.sim(name, cand_name)
        elif algo == "smith_waterman":
            similarity = sw.sim(name, cand_name)
        elif algo == "gotoh":
            similarity = gotoh.sim(name, cand_name)
        elif algo == "monge_elkan":
            similarity = me.sim(name, cand_name)
        elif algo == "match_rating":
            similarity = 1 if jellyfish.match_rating_comparison(name, cand_name) else 0
        elif algo == "soundex":
            similarity = 1 if jellyfish.soundex(name) == jellyfish.soundex(cand_name) else 0
        elif algo == "nysiis":
            similarity = 1 if jellyfish.nysiis(name) == jellyfish.nysiis(cand_name) else 0
        elif algo == "metaphone":
            similarity = 1 if jellyfish.metaphone(name) == jellyfish.metaphone(cand_name) else 0
        # Always hand back a float: np.apply_along_axis sizes its output buffer
        # from the dtype of the first result, so mixed int/float returns are fragile.
        return float(similarity)

    return calc_similarity

#### Similarity Function

In [None]:
def get_similars(name, k=10, algo="levenshtein", demo_mode=False):
    """Return the ``k`` highest-scoring candidate names for ``name``.

    Every entry of the notebook-level ``candidate_names_all`` array is scored
    against ``name`` with the scorer built by ``calc_similarity_to(name, algo)``.

    Parameters
    ----------
    name : str
        Name to look up.
    k : int
        Maximum number of (candidate, score) pairs to return.
    algo : str
        Similarity algorithm name, forwarded to ``calc_similarity_to``.
    demo_mode : bool
        When true, ``name`` is first passed through ``utils.add_padding`` and
        the returned candidate names are passed through ``utils.remove_padding``.

    Returns
    -------
    list of (candidate_name, score) tuples, highest score first.
    """
    query = utils.add_padding(name) if demo_mode else name

    # Score each candidate; the extra axis turns every name into a 1-element row.
    scorer = calc_similarity_to(query, algo)
    scores = np.apply_along_axis(scorer, 1, candidate_names_all[:, None])

    # Indices of the k best scores, in descending score order.
    top_idx = np.argsort(scores)[::-1][:k]
    top_names = candidate_names_all[top_idx]
    top_scores = scores[top_idx]

    if demo_mode:
        top_names = [utils.remove_padding(top_name) for top_name in top_names]

    return list(zip(top_names, top_scores))

#### Demo

In [None]:
# Demo lookup: the 10 closest candidates by Levenshtein similarity.
# demo_mode=True makes get_similars pad the raw input name and strip padding from the results.
# get_similars('schumacher', 10, 'jaro_winkler', True)
get_similars("bostelman", 10, "levenshtein", True)

## Test Levenshtein

In [None]:
# Show the test input name at index 251, the sample used for the spot check in the cells that follow.
input_names_test[251]

In [None]:
# Show the weighted actual names recorded for the same test entry (index 251).
weighted_actual_names_test[251]

In [None]:
k = 100  # Number of candidates to consider
# Wrap the single result in a list so it has the same nested shape
# (one row of (name, score) pairs per input name) as a multi-name run.
similar_names_scores = [get_similars(input_names_test[251], k=k, algo="levenshtein")]
# Preview the five best-scoring candidates.
similar_names_scores[0][:5]

In [None]:
# Convert the nested list of (name, score) pairs into a single 3D array by
# building the name plane and the score plane separately and stacking them depth-wise.
# names: 2D object array, axis 0 = input names, axis 1 = the k similar names for that input
names = np.array([[pair[0] for pair in pairs] for pairs in similar_names_scores], dtype="O")
# scores: 2D float array, axis 0 = input names, axis 1 = the scores of those k similar names
scores = np.array([[pair[1] for pair in pairs] for pairs in similar_names_scores], dtype="f8")
# similar_names_scores: 3D array, axis 0 = input names, axis 1 = k similar names, axis 2 = (name, score)
similar_names_scores = np.dstack((names, scores))

In [None]:
# Weighted recall for the sample name (test index 251) using a score threshold of 0.85.
metrics.weighted_recall_at_threshold(weighted_actual_names_test[251], similar_names_scores[0], 0.85)

In [None]:
# Same check with a lower score threshold of 0.75.
metrics.weighted_recall_at_threshold(weighted_actual_names_test[251], similar_names_scores[0], 0.75)

# Evaluate each algorithm

In [None]:
k = 100  # Number of candidates to consider
min_threshold = 0.5  # Score threshold passed to the threshold-based metrics
# Keep only the name from each weighted-actual triple; precision/recall at k needs the names alone.
actual_names_all = [[name for name, _, _ in name_weights] for name_weights in weighted_actual_names_all]

# Two stacked panels: precision/recall at k (top) and at score thresholds (bottom).
figure, axis = plt.subplots(2, 1, figsize=(20, 30))
axis[0].set_title("PR at k")
axis[0].set_xlabel("recall")
axis[0].set_ylabel("precision")
axis[1].set_title("PR at threshold")
axis[1].set_xlabel("weighted recall")
axis[1].set_ylabel("precision")
# One distinct colour per algorithm.
colors = cm.rainbow(np.linspace(0, 1, len(algos)))

for algo, color in zip(algos, colors):
    print(algo)
    # Top-k (name, score) pairs for every input name (train and test pooled).
    similar_names_scores = [get_similars(input_name, k=k, algo=algo) for input_name in tqdm(input_names_all)]
    similar_names = [[name for name, _ in name_similarities] for name_similarities in similar_names_scores]
    # Re-pack the pairs as a 3D array: axis 0 = input names, axis 1 = k similar names, axis 2 = (name, score).
    names = np.array([[cell[0] for cell in row] for row in similar_names_scores], dtype="O")
    scores = np.array([[cell[1] for cell in row] for row in similar_names_scores], dtype="f8")
    similar_names_scores = np.dstack((names, scores))
    if algo in continuous_algos:
        precisions, recalls = metrics.precision_recall_at_k(actual_names_all, similar_names, k)
        axis[0].plot(recalls, precisions, "o--", color=color, label=algo)

        precisions, recalls = metrics.precision_weighted_recall_at_threshold(
            weighted_actual_names_all, similar_names_scores, min_threshold
        )
        # metrics.get_auc(all_weighted_actuals, similar_names, step=.01)
    else:
        # Boolean algorithms give a single precision/recall point.
        # The scores above cover input_names_all, so the actuals must be the pooled
        # weighted_actual_names_all as well (this branch previously passed the test split only).
        precisions = [
            metrics.avg_precision_at_threshold(weighted_actual_names_all, similar_names_scores, min_threshold)
        ]
        recalls = [
            metrics.avg_weighted_recall_at_threshold(weighted_actual_names_all, similar_names_scores, min_threshold)
        ]

    axis[1].plot(recalls, precisions, "o--", color=color, label=algo)

axis[0].legend()
axis[1].legend()
plt.show()