In [None]:
# Reload imported project modules automatically before each cell runs (autoreload mode 2).
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
from collections import namedtuple

import jellyfish
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
from mpire import WorkerPool
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.utils.extmath import safe_sparse_dot
import torch
from tqdm import tqdm
import wandb

from src.data.filesystem import fopen
from src.data.utils import load_datasets, select_frequent_k, frequent_k_names
from src.eval import metrics
from src.eval.utils import similars_to_ndarray
from src.models.swivel import SwivelModel, get_best_swivel_matches
from src.models.swivel_encoder import SwivelEncoderModel
from src.models.utils import remove_padding

In [None]:
# config
# Everything a reader might tune lives here: which name type to evaluate,
# sample/embedding sizes, and the S3 locations of datasets and trained models.

given_surname = "given"
sample_size = 1000
embed_dim = 100

# The vocabulary size is part of the model file names and depends on the name type.
if given_surname == "given":
    vocab_size = 600000
else:
    vocab_size = 2100000

Config = namedtuple("Config", "train_path test_path embed_dim swivel_vocab_path swivel_model_path encoder_model_path")

config = Config(
    # processed train / test name pairs
    train_path=f"s3://familysearch-names/processed/tree-hr-{given_surname}-train.csv.gz",
    test_path=f"s3://familysearch-names/processed/tree-hr-{given_surname}-test.csv.gz",
    embed_dim=embed_dim,
    # swivel vocabulary plus the swivel and encoder model checkpoints
    swivel_vocab_path=f"s3://nama-data/data/models/fs-{given_surname}-swivel-vocab-{vocab_size}.csv",
    swivel_model_path=f"s3://nama-data/data/models/fs-{given_surname}-swivel-model-{vocab_size}-{embed_dim}-50.pth",
    encoder_model_path=f"s3://nama-data/data/models/fs-{given_surname}-encoder-model-{vocab_size}-{embed_dim}.pth",
)

In [None]:
# Start a Weights & Biases run for this notebook; runs are grouped by name type
# and the full Config namedtuple is recorded as the run configuration.
# NOTE(review): the notes say "100k" while sample_size in the config cell is 1000 — confirm which is current.
wandb.init(
    project="nama",
    entity="nama",
    name="70_compare_similarity",
    group=given_surname,
    notes="swivel vs lev on 100k",
    config=config._asdict(),
)

### Load data

In [None]:
# Use the first CUDA GPU when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# Load both splits; each one unpacks into (input names, weighted actual names, candidate names).
train, test = load_datasets([config.train_path, config.test_path])
input_names_train, weighted_actual_names_train, candidate_names_train = train
# The *_test variables must come from `test`: unpacking `train` here would make the
# "out-of-vocabulary" evaluation silently run on the training data.
input_names_test, weighted_actual_names_test, candidate_names_test = test

# Swivel vocabulary: maps each name to its row index in the swivel embedding table.
vocab_df = pd.read_csv(fopen(config.swivel_vocab_path, "rb"))
swivel_vocab = dict(zip(vocab_df["name"], vocab_df["index"]))

# Swivel model. map_location remaps the checkpoint tensors onto `device`, so a checkpoint
# saved from a GPU still loads on a CPU-only machine (same as the encoder load below).
swivel_model = SwivelModel(len(swivel_vocab), config.embed_dim)
swivel_model.load_state_dict(torch.load(fopen(config.swivel_model_path, "rb"), map_location=device))
swivel_model.eval()
swivel_model.to(device)

# Encoder model, loaded the same way and switched to inference mode.
encoder_model = SwivelEncoderModel(output_dim=config.embed_dim, device=device)
encoder_model.load_state_dict(torch.load(fopen(config.encoder_model_path, "rb"), map_location=device))
encoder_model.to(device)
encoder_model.eval()

In [None]:
# sample names (train for in-vocab and test for out-of-vocab)
# A fixed seed makes both samples identical across kernel restarts, so the
# precision/recall curves plotted later are reproducible.
sample_seed = 42

# train_test_split is used purely as a sampler: only the "test" side (sample_size rows) is kept.
_, input_names_sample, _, weighted_actual_names_sample = train_test_split(
    input_names_train,
    weighted_actual_names_train,
    test_size=sample_size,
    random_state=sample_seed,
)
# Candidates are not sampled: every input is scored against the full candidate list.
candidate_names_sample = candidate_names_train

_, input_names_test_sample, _, weighted_actual_names_test_sample = train_test_split(
    input_names_test,
    weighted_actual_names_test,
    test_size=sample_size,
    random_state=sample_seed,
)
candidate_names_test_sample = candidate_names_test

In [None]:
# Report the size of every sampled collection, train-derived first, then test-derived.
for label, values in [
    ("input_names_sample", input_names_sample),
    ("weighted_actual_names_sample", weighted_actual_names_sample),
    ("candidate_names_sample", candidate_names_sample),
    ("input_names_test_sample", input_names_test_sample),
    ("weighted_actual_names_test_sample", weighted_actual_names_test_sample),
    ("candidate_names_test_sample", candidate_names_test_sample),
]:
    print(label, len(values))

### Other Models

In [None]:
# tfidf
# Character n-grams of length 1-3 (analyzer "char_wb") are fit on the training candidate
# names only; the test candidates are then projected with that same fitted vocabulary.
# min_df=10 / max_df=0.5 drop n-grams that occur in fewer than 10 names or in more than half of them.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3), analyzer="char_wb", min_df=10, max_df=0.5)
tfidf_X_sample = tfidf_vectorizer.fit_transform(candidate_names_sample)
tfidf_X_test_sample = tfidf_vectorizer.transform(candidate_names_test_sample)

In [None]:
# One entry per algorithm to compare. All of them share the same threshold
# range (0.01 to 1.0) and report similarities rather than distances.
SimilarityAlgo = namedtuple("SimilarityAlgo", "name min_threshold max_threshold distances")
similarity_algos = [
    SimilarityAlgo(algo_name, 0.01, 1.0, False)
    for algo_name in (
        "swivel",
        "swivel_encoder",
        "tfidf",
        "levenshtein",
        "damerau_levenshtein",
        "jaro_winkler",
    )
]

In [None]:
def calc_similarity_to(name, algo="levenshtein"):
    """Build a scorer that rates candidate names against ``name``.

    Args:
        name: Query name; it is passed through ``remove_padding`` once, up front.
        algo: ``"levenshtein"``, ``"damerau_levenshtein"`` or ``"jaro_winkler"``.
            Any other value makes the scorer return ``0.0`` for every candidate.

    Returns:
        A function taking a one-element row (the candidate name at index 0) and
        returning a float similarity. The two edit-distance algorithms are
        normalised as ``1 - distance / max(len(name), len(candidate))``.
    """
    name = remove_padding(name)

    def _normalized_edit_similarity(distance_fn, cand_name):
        # Turn an edit distance into a similarity by scaling with the longer name.
        longest = max(len(name), len(cand_name))
        if longest == 0:
            # Both names are empty after padding removal: treat them as identical
            # instead of dividing by zero.
            return 1.0
        return 1 - (distance_fn(name, cand_name) / longest)

    def calc_similarity(row):
        cand_name = remove_padding(row[0])
        if algo == "levenshtein":
            return _normalized_edit_similarity(jellyfish.levenshtein_distance, cand_name)
        if algo == "damerau_levenshtein":
            return _normalized_edit_similarity(jellyfish.damerau_levenshtein_distance, cand_name)
        if algo == "jaro_winkler":
            return jellyfish.jaro_winkler_similarity(name, cand_name)
        # Unknown algorithm: keep the original best-effort score of zero.
        return 0.0

    return calc_similarity

#### Similarity Function

In [None]:
def get_similars(shared, name=""):
    """Return the top-scoring candidates for ``name`` as ``(candidate, score)`` pairs.

    Args:
        shared: Tuple ``(candidate_names, k, algo, tfidf_vectorizer, tfidf_X)``.
            The last two entries are only used when ``algo == "tfidf"``.
        name: The query name to score every candidate against.

    Returns:
        A list of at most ``k`` ``(candidate_name, score)`` tuples, highest score first.
    """
    candidates, top_k, algo_name, vectorizer, candidate_matrix = shared

    if algo_name != "tfidf":
        # String-similarity algorithms: score each candidate row by row.
        scorer = calc_similarity_to(name, algo_name)
        scores = np.apply_along_axis(scorer, 1, candidates[:, None])
    else:
        # TF-IDF: dot product of the query vector with every candidate row.
        query_vec = vectorizer.transform([name]).toarray()
        scores = safe_sparse_dot(candidate_matrix, query_vec.T).flatten()

    # argsort is ascending, so reverse it before keeping the first top_k indices.
    best_first = np.argsort(scores)[::-1][:top_k]
    return list(zip(candidates[best_first], scores[best_first]))

#### Demo

In [None]:
# Pick a probe name that matches the configured name type, then show its 10 best levenshtein matches.
if given_surname == "surname":
    probe_name = "<bostelman>"
else:
    probe_name = "<richard>"
get_similars((candidate_names_test, 10, "levenshtein", None, None), probe_name)

## Test TF-IDF

In [None]:
# Show the 10 best TF-IDF matches for a probe name of the configured name type.
probe_name = "<schumacher>" if given_surname == "surname" else "<richard>"
# tfidf_X_test_sample is the matrix built from candidate_names_test (via candidate_names_test_sample);
# the previously referenced tfidf_X_test is never defined and raised a NameError.
get_similars((candidate_names_test, 10, "tfidf", tfidf_vectorizer, tfidf_X_test_sample), probe_name)

## Test levenshtein

In [None]:
# Arbitrary row of the test inputs used as a worked example in the cells below.
ix = 251
input_names_test[ix]

In [None]:
# The weighted actual names recorded for that same input row.
weighted_actual_names_test[ix]

In [None]:
k = 100  # Number of candidates to consider
# Wrap the single result in a list so it has the list-of-results form that
# similars_to_ndarray receives in the next cell.
similar_names_scores = [get_similars((candidate_names_test, k, "levenshtein", None, None), input_names_test[ix])]
# Peek at the five highest-scoring (candidate, score) pairs.
similar_names_scores[0][:5]

In [None]:
# Convert the list of (candidate, score) results with the project helper.
# NOTE(review): this rebinds similar_names_scores, so re-running this cell on its own
# feeds the already-converted value back into similars_to_ndarray.
similar_names_scores = similars_to_ndarray(similar_names_scores)

In [None]:
# Weighted recall of the example name's candidates at a score threshold of 0.85.
metrics.weighted_recall_at_threshold(weighted_actual_names_test[ix], similar_names_scores[0], 0.85)

In [None]:
# Same measurement with the threshold lowered to 0.75, for comparison with the 0.85 result.
metrics.weighted_recall_at_threshold(weighted_actual_names_test[ix], similar_names_scores[0], 0.75)

# Evaluate each algorithm

In [None]:
def evaluate_algos(similarity_algos, swivel_vocab, swivel_model, encoder_model, input_names, weighted_actual_names, candidate_names, tfidf_X):
    """Plot one precision / weighted-recall-at-threshold curve per similarity algorithm.

    Args:
        similarity_algos: Iterable of ``SimilarityAlgo`` tuples
            (``name``, ``min_threshold``, ``max_threshold``, ``distances``).
        swivel_vocab: Name-to-index vocabulary, used only by the ``"swivel"`` algorithm.
        swivel_model: Trained swivel model, used only by the ``"swivel"`` algorithm.
        encoder_model: Encoder model, passed to both ``"swivel"`` and ``"swivel_encoder"``.
        input_names: Names to look up.
        weighted_actual_names: Ground-truth matches for each input name.
        candidate_names: Names that every input is scored against.
        tfidf_X: TF-IDF matrix of ``candidate_names``.

    Note:
        The TF-IDF branch reads the notebook-level ``tfidf_vectorizer``; it must be
        the vectorizer that produced ``tfidf_X``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    n_jobs = 4

    figure, ax = plt.subplots(1, 1, figsize=(20, 15))
    ax.set_title("PR at threshold")
    # Label the axes so the figure stands alone: x carries the recalls, y the precisions.
    ax.set_xlabel("weighted recall")
    ax.set_ylabel("precision")
    colors = cm.rainbow(np.linspace(0, 1, len(similarity_algos)))

    for algo, color in zip(similarity_algos, colors):
        print(algo.name)
        if algo.name in ("swivel", "swivel_encoder"):
            # Both variants share one call; "swivel_encoder" simply withholds the
            # swivel model and vocabulary so only the encoder is used.
            use_swivel = algo.name == "swivel"
            similar_names_scores = get_best_swivel_matches(
                model=swivel_model if use_swivel else None,
                vocab=swivel_vocab if use_swivel else None,
                input_names=input_names,
                candidate_names=candidate_names,
                encoder_model=encoder_model,
                k=100,
                batch_size=1024,
                add_context=True,
                n_jobs=n_jobs,
            )
        else:
            # String-based algorithms run get_similars in a worker pool. n_jobs is not
            # passed here, so the pool uses mpire's default number of workers.
            with WorkerPool(
                shared_objects=(candidate_names, 1000, algo.name, tfidf_vectorizer, tfidf_X)
            ) as pool:
                similar_names_scores = pool.map(get_similars, input_names, progress_bar=True)
            similar_names_scores = similars_to_ndarray(similar_names_scores)
        precisions, recalls = metrics.precision_weighted_recall_at_threshold(
            weighted_actual_names,
            similar_names_scores,
            min_threshold=algo.min_threshold,
            max_threshold=algo.max_threshold,
            distances=algo.distances,
            n_jobs=1,
            progress_bar=True,
        )
        ax.plot(recalls, precisions, "o--", color=color, label=algo.name)

    ax.legend()
    # Use the explicit Axes interface rather than the pyplot state machine.
    ax.set_xlim([0, 1.0])
    ax.set_ylim([0, 1.0])
    plt.show()

## On in-vocabulary names (training data)

In [None]:
# Evaluate every algorithm on the sample drawn from the training data.
evaluate_algos(
    similarity_algos=similarity_algos,
    swivel_vocab=swivel_vocab,
    swivel_model=swivel_model,
    encoder_model=encoder_model,
    input_names=input_names_sample,
    weighted_actual_names=weighted_actual_names_sample,
    candidate_names=candidate_names_sample,
    tfidf_X=tfidf_X_sample,
)

## On out-of-vocabulary names (test data)

In [None]:
# Evaluate every algorithm on the sample drawn from the test data.
evaluate_algos(
    similarity_algos=similarity_algos,
    swivel_vocab=swivel_vocab,
    swivel_model=swivel_model,
    encoder_model=encoder_model,
    input_names=input_names_test_sample,
    weighted_actual_names=weighted_actual_names_test_sample,
    candidate_names=candidate_names_test_sample,
    tfidf_X=tfidf_X_test_sample,
)

In [None]:
# Close the Weights & Biases run opened by wandb.init() near the top of the notebook.
wandb.finish()