In [None]:
# Reload imported project modules automatically before each cell runs, so edits
# made under src/ are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
from collections import namedtuple
import jellyfish
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
from mpire import WorkerPool
import numpy as np
from scipy.sparse import vstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils.extmath import safe_sparse_dot
import torch
from tqdm import tqdm
import wandb

from src.data.filesystem import fopen
from src.data.utils import load_train_test
from src.eval import metrics
from src.eval.triplet import eval_triplet
from src.eval.utils import similar_names_scores_to_ndarray
from src.models.utils import remove_padding, get_best_matches, build_token_idx_maps, convert_names_to_model_inputs

In [None]:
# config

# Alternative configuration (Ancestry records25k CSVs), kept for reference:
# from src.data.ancestry import load_train_test
# train_path = "../data/raw/records25k_data_train.csv"
# test_path = "../data/raw/records25k_data_test.csv"
# sample_all_names = True

# Selects which FamilySearch name dataset to use; it is interpolated into the S3
# paths below and also picks the probe names in the demo cells.
given_surname = "given"
Config = namedtuple("Config", "train_path test_path triplet_model_path sample_all_names")
config = Config(
    train_path=f"s3://familysearch-names/processed/tree-hr-{given_surname}-similar-train.csv.gz",
    # NOTE(review): test_path points at the *train*-freq file while the test-freq
    # path is commented out below; presumably a deliberate in-vocabulary run - confirm.
    test_path=f"s3://familysearch-names/processed/tree-hr-{given_surname}-similar-train-freq.csv.gz",
    # test_path=f"s3://familysearch-names/processed/tree-hr-{given_surname}-similar-test-freq.csv.gz",
    # Plain string: this path has no placeholders, so no f-prefix is needed.
    triplet_model_path="s3://nama-data/data/models/anc-triplet-bilstm-100-512-40-05.pth",
    # False: evaluate on the test split only; True: evaluate on train + test combined.
    sample_all_names=False,
)

In [None]:
# Print floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)
# Register tqdm's progress_* methods with pandas.
tqdm.pandas()
# Start a Weights & Biases run; the whole Config namedtuple is recorded as the
# run config and the run is grouped by dataset (given vs surname).
wandb.init(
    project="nama",
    entity="nama",
    name="60_compare_similarity",
    group=given_surname,
    notes="fs in-vocab",
    config=config._asdict(),
)

### Load data

In [None]:
# read data

# Both paths are passed in one list and the result is unpacked as (train, test).
train, test = load_train_test([config.train_path, config.test_path])

# Each split unpacks into three parts: input names, weighted actual names and
# candidate names.
input_names_train, weighted_actual_names_train, candidate_names_train = train
input_names_test, weighted_actual_names_test, candidate_names_test = test

# Combined train + test views, used when config.sample_all_names is set.
# Candidate names are joined with np.concatenate; the other two use `+`, which
# assumes they are Python lists (on ndarrays `+` would add elementwise) - TODO confirm.
candidate_names_all = np.concatenate((candidate_names_train, candidate_names_test))
input_names_all = input_names_train + input_names_test
weighted_actual_names_all = weighted_actual_names_train + weighted_actual_names_test

In [None]:
# sample

# Pick the collections every later step works on: the combined train + test data
# when config.sample_all_names is set, otherwise the test split only.
if config.sample_all_names:
    input_names_sample = input_names_all
    weighted_actual_names_sample = weighted_actual_names_all
    candidate_names_sample = candidate_names_all
else:
    input_names_sample = input_names_test
    weighted_actual_names_sample = weighted_actual_names_test
    candidate_names_sample = candidate_names_test

In [None]:
# Report the size of every split (train / test / all / sample) of the three
# collections, one "label length" line each, in the same order as before.
for _label, _collection in (
    ("input_names_train", input_names_train),
    ("input_names_test", input_names_test),
    ("input_names_all", input_names_all),
    ("input_names_sample", input_names_sample),
    ("weighted_actual_names_train", weighted_actual_names_train),
    ("weighted_actual_names_test", weighted_actual_names_test),
    ("weighted_actual_names_all", weighted_actual_names_all),
    ("weighted_actual_names_sample", weighted_actual_names_sample),
    ("candidate_names_train", candidate_names_train),
    ("candidate_names_test", candidate_names_test),
    ("candidate_names_all", candidate_names_all),
    ("candidate_names_sample", candidate_names_sample),
):
    print(_label, len(_collection))

### Models

In [None]:
# tfidf
# Character n-gram (1-3, word-boundary aware) TF-IDF, fitted on the training
# candidates only; the test candidates are projected with the fitted vocabulary.
tfidf_vectorizer = TfidfVectorizer(analyzer="char_wb", ngram_range=(1, 3), min_df=10, max_df=0.5)
tfidf_X_train = tfidf_vectorizer.fit_transform(candidate_names_train)
tfidf_X_test = tfidf_vectorizer.transform(candidate_names_test)
if config.sample_all_names:
    # Rows are stacked train-then-test.
    tfidf_X_sample = vstack((tfidf_X_train, tfidf_X_test))
else:
    tfidf_X_sample = tfidf_X_test

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# autoencoder with triplet loss
# NOTE: torch.load unpickles the file, which can execute arbitrary code; only point
# config.triplet_model_path at a trusted location.
# The handle is opened in a `with` block so it is closed once the model has been
# read (assumes fopen returns a regular file object that supports `with`).
with fopen(config.triplet_model_path, "rb") as model_file:
    triplet_model = torch.load(model_file, map_location=device)

In [None]:
# One entry per algorithm to evaluate:
#   name          - identifier used to dispatch to the implementation
#   min_threshold - lower end of the threshold range passed to the PR metric
#   max_threshold - upper end of the threshold range passed to the PR metric
#   distances     - passed to the PR metric as `distances`; True only for triplet
SimilarityAlgo = namedtuple(
    "SimilarityAlgo",
    ["name", "min_threshold", "max_threshold", "distances"],
)
similarity_algos = [
    SimilarityAlgo(name="tfidf", min_threshold=0.5, max_threshold=1.0, distances=False),
    SimilarityAlgo(name="levenshtein", min_threshold=0.5, max_threshold=1.0, distances=False),
    SimilarityAlgo(name="damerau_levenshtein", min_threshold=0.5, max_threshold=1.0, distances=False),
    SimilarityAlgo(name="jaro_winkler", min_threshold=0.5, max_threshold=1.0, distances=False),
    SimilarityAlgo(name="triplet", min_threshold=0.01, max_threshold=1.0, distances=True),
]

In [None]:
def calc_similarity_to(name, algo="levenshtein"):
    """Build a scorer that compares candidate names against ``name``.

    Parameters
    ----------
    name : str
        Probe name. It is passed through ``remove_padding`` once, up front.
    algo : str
        One of ``"levenshtein"``, ``"damerau_levenshtein"`` or ``"jaro_winkler"``.

    Returns
    -------
    callable
        ``calc_similarity(row)``: takes an indexable whose first element is a
        candidate name (the 1-element slices ``np.apply_along_axis`` passes in)
        and returns a similarity score where larger means more similar. For the
        two edit distances the score is ``1 - distance / longer_length``.

    Raises
    ------
    ValueError
        If ``algo`` is not one of the supported names. Such values used to score
        every candidate 0.0 silently, which hid typos in the algorithm name.
    """
    supported = ("levenshtein", "damerau_levenshtein", "jaro_winkler")
    if algo not in supported:
        raise ValueError(f"unsupported similarity algo {algo!r}; expected one of {supported}")

    name = remove_padding(name)

    # The algorithm is resolved once here rather than on every candidate row.
    if algo == "jaro_winkler":

        def calc_similarity(row):
            return jellyfish.jaro_winkler_similarity(name, remove_padding(row[0]))

        return calc_similarity

    if algo == "levenshtein":
        distance = jellyfish.levenshtein_distance
    else:
        distance = jellyfish.damerau_levenshtein_distance

    def calc_similarity(row):
        cand_name = remove_padding(row[0])
        longest = max(len(name), len(cand_name))
        if longest == 0:
            # Both names are empty: identical strings, and this avoids 0 / 0.
            return 1.0
        return 1 - (distance(name, cand_name) / longest)

    return calc_similarity

#### Similarity Function

In [None]:
def get_similars(shared, name=""):
    """Return the best-scoring candidates for ``name`` under one algorithm.

    Parameters
    ----------
    shared : tuple
        ``(candidate_names, k, algo, tfidf_vectorizer, tfidf_matrix)``. The last
        two entries are only read when ``algo == "tfidf"``.
    name : str
        The name to find matches for.

    Returns
    -------
    list of (candidate_name, score)
        At most ``k`` pairs, ordered by descending score.
    """
    candidates, top_k, algo_name, vectorizer, candidate_matrix = shared

    if algo_name != "tfidf":
        # String metrics: score every candidate with the scorer built for `name`.
        scorer = calc_similarity_to(name, algo_name)
        scores = np.apply_along_axis(scorer, 1, candidates[:, None])
    else:
        # TF-IDF: dot product of the query vector with every candidate row.
        query_vec = vectorizer.transform([name]).toarray()
        scores = safe_sparse_dot(candidate_matrix, query_vec.T).flatten()

    # Ascending argsort, reversed, then truncated to the top k.
    top_idx = np.argsort(scores)[::-1][:top_k]
    return list(zip(candidates[top_idx], scores[top_idx]))

#### Demo

In [None]:
# Top-10 Levenshtein matches for a probe name chosen to suit the dataset.
if given_surname == "surname":
    probe_name = "<bostelman>"
else:
    probe_name = "<richard>"
get_similars((candidate_names_sample, 10, "levenshtein", None, None), name=probe_name)

## Test TF-IDF

In [None]:
# Top-10 TF-IDF matches for a probe name chosen to suit the dataset.
if given_surname == "surname":
    probe_name = "<schumacher>"
else:
    probe_name = "<richard>"
get_similars((candidate_names_sample, 10, "tfidf", tfidf_vectorizer, tfidf_X_sample), name=probe_name)

## Test Levenshtein

In [None]:
# Index of the single test name inspected in the cells below (arbitrary example).
ix = 251
input_names_test[ix]

In [None]:
# The weighted actual names recorded for the same test name.
weighted_actual_names_test[ix]

In [None]:
k = 100  # Number of candidates to consider
# Score the one test name with Levenshtein. The result is wrapped in a
# one-element list because it is handed to similar_names_scores_to_ndarray
# afterwards, the same way the list returned by pool.map is in the evaluation loop.
similar_names_scores = [get_similars((candidate_names_sample, k, "levenshtein", None, None), input_names_test[ix])]
# Show the five best (candidate, score) pairs.
similar_names_scores[0][:5]

In [None]:
# NOTE: this rebinds similar_names_scores, so running the cell twice in a row
# feeds the already-converted value back into the converter; re-run the cell
# that builds the list first.
similar_names_scores = similar_names_scores_to_ndarray(similar_names_scores)

In [None]:
# Weighted recall for the inspected name; the last argument (0.85) is presumably
# the score threshold - confirm against metrics.weighted_recall_at_threshold.
metrics.weighted_recall_at_threshold(weighted_actual_names_test[ix], similar_names_scores[0], 0.85)

In [None]:
# Same measurement with a lower last argument (0.75), presumably a looser
# score threshold - confirm against metrics.weighted_recall_at_threshold.
metrics.weighted_recall_at_threshold(weighted_actual_names_test[ix], similar_names_scores[0], 0.75)

# Evaluate each algorithm

In [None]:
k = 1000  # Number of candidates to consider
n_pr_jobs = 16  # Number of jobs for precision_weighted_recall_at_threshold

# Plain list of actual names per input (weights dropped); not referenced again in this cell.
actual_names_sample = [[name for name, _, _ in name_weights] for name_weights in weighted_actual_names_sample]
figure, ax = plt.subplots(1, 1, figsize=(20, 15))
ax.set_title("PR at threshold")
# Label both axes so the figure can be read on its own.
ax.set_xlabel("weighted recall")
ax.set_ylabel("precision")
# One distinct colour per algorithm.
colors = cm.rainbow(np.linspace(0, 1, len(similarity_algos)))

for algo, color in zip(similarity_algos, colors):
    print(algo.name)
    if algo.name == "triplet":
        # The neural model scores all inputs itself; the trailing 512 is
        # presumably a batch size - confirm against eval_triplet.
        similar_names_scores = eval_triplet(triplet_model, input_names_sample, candidate_names_sample, k, 512)
    else:
        # Every other algorithm goes through get_similars, one input name per
        # task; the shared tuple is what get_similars unpacks as `shared`.
        with WorkerPool(
            shared_objects=(candidate_names_sample, k, algo.name, tfidf_vectorizer, tfidf_X_sample)
        ) as pool:
            similar_names_scores = pool.map(get_similars, input_names_sample, progress_bar=True)
        similar_names_scores = similar_names_scores_to_ndarray(similar_names_scores)
    precisions, recalls = metrics.precision_weighted_recall_at_threshold(
        weighted_actual_names_sample,
        similar_names_scores,
        min_threshold=algo.min_threshold,
        max_threshold=algo.max_threshold,
        distances=algo.distances,
        n_jobs=n_pr_jobs,
        progress_bar=True,
    )
    ax.plot(recalls, precisions, "o--", color=color, label=algo.name)

ax.legend()
# Use the Axes API throughout instead of the pyplot state machine.
ax.set_xlim([0, 1.0])
ax.set_ylim([0, 1.0])
plt.show()

In [None]:
# Close the Weights & Biases run opened by wandb.init near the top of the notebook.
wandb.finish()