In [None]:
# Reload imported modules automatically before each cell runs, so edits to the
# project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Compare our approach to handling out-of-vocabulary names to four simpler approaches

In [None]:
from collections import namedtuple, defaultdict
from datetime import datetime

import cologne_phonetics
import jellyfish
import joblib
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
from metaphone import doublemetaphone
from mpire import WorkerPool
import numpy as np
from pyphonetics import RefinedSoundex
from sklearn.model_selection import train_test_split
from spellwise import CaverphoneOne, CaverphoneTwo
import pandas as pd
import torch
from tqdm import tqdm
import wandb

from src.data.filesystem import fopen
from src.data.normalize import normalize_freq_names
from src.data.utils import load_dataset, select_frequent_k
from src.eval import metrics
from src.models.levenshtein import get_best_lev_matches
from src.models.utils import remove_padding, add_padding
from src.models.cluster import read_clusters, get_validation_results, read_cluster_scores
from src.models.swivel import SwivelModel

In [None]:
# config

given_surname = "surname"
# the given-name vocabulary is smaller than the surname vocabulary
vocab_size = 2100000
if given_surname == "given":
    vocab_size = 610000
eval_size = 200000
sample_size = 5000
embed_dim = 100
NAMA_MAX_CLUSTERS = 20
n_jobs = 1
verbose = True
num_matches = 1000  # Number of candidates to consider

# Every setting is declared exactly once; the insertion order of this mapping
# determines the field order of the Config namedtuple built from it.
_config_values = dict(
    eval_path=f"s3://familysearch-names/processed/tree-hr-{given_surname}-train.csv.gz",
    test_path=f"s3://familysearch-names/processed/tree-hr-{given_surname}-test.csv.gz",
    freq_path=f"s3://familysearch-names/processed/tree-preferred-{given_surname}-aggr.csv.gz",
    embed_dim=embed_dim,
    swivel_vocab_path=f"s3://nama-data/data/models/fs-{given_surname}-swivel-vocab-{vocab_size}-augmented.csv",
    swivel_model_path=f"s3://nama-data/data/models/fs-{given_surname}-swivel-model-{vocab_size}-{embed_dim}-augmented.pth",
    tfidf_path=f"s3://nama-data/data/models/fs-{given_surname}-tfidf.joblib",
    ensemble_model_path=f"s3://nama-data/data/models/fs-{given_surname}-ensemble-model-{vocab_size}-{embed_dim}-augmented-100.joblib",
    cluster_path=f"s3://nama-data/data/models/fs-{given_surname}-cluster-names.csv",
    cluster_scores_path=f"s3://nama-data/data/processed/fs-{given_surname}-cluster-scores-{vocab_size}-{embed_dim}-precomputed.jsonl.gz",
    aggr_path=f"s3://familysearch-names/interim/tree-hr-{given_surname}-aggr.parquet",
)
Config = namedtuple("Config", list(_config_values))
config = Config(**_config_values)

In [None]:
# Print numpy floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)
# Register tqdm's progress-bar helpers on pandas objects.
tqdm.pandas()

# Start a Weights & Biases run for this notebook, grouped by name type
# (given vs surname), recording the path/dimension settings from `config`.
wandb.init(
    project="nama",
    entity="nama",
    name="92_compare_oov_approaches",
    group=given_surname,
    notes="",
    config=config._asdict(),
)

### Load data

In [None]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
# Load the test split: input names, their weighted actual names, and the candidate names.
# is_eval=True is forwarded to load_dataset; its exact effect is defined in src.data.utils.
input_names_test, weighted_actual_names_test, candidate_names_test = load_dataset(config.test_path, is_eval=True)

In [None]:
# Report how many input and candidate names the unfiltered test split contains.
print("input_names_test", len(input_names_test))
print("candidate_names_test", len(candidate_names_test))

In [None]:
# Read the name-frequency table (na_filter=False keeps strings such as "NA" or "null" as names
# instead of converting them to NaN) and normalize it into `name_freq`.
freq_df = pd.read_csv(config.freq_path, na_filter=False)
name_freq = normalize_freq_names(freq_df, is_surname=given_surname != "given", add_padding=True)
# Drop the reference to the raw frame so it can be garbage-collected.
freq_df = None

In [None]:
# Read the swivel vocabulary and map each name to the id stored in its "index" column.
vocab_df = pd.read_csv(fopen(config.swivel_vocab_path, "rb"))
swivel_vocab = dict(zip(vocab_df["name"], vocab_df["index"]))
len(swivel_vocab)

In [None]:
# Build a Swivel model sized to the vocabulary, load the saved weights mapped onto `device`,
# move the model there, and switch it to eval mode.
swivel_model = SwivelModel(len(swivel_vocab), config.embed_dim)
swivel_model.load_state_dict(torch.load(fopen(config.swivel_model_path, "rb"), map_location=torch.device(device)))
swivel_model.to(device)
swivel_model.eval()

In [None]:
# Load the fitted TF-IDF vectorizer.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts from storage you trust.
tfidf_vectorizer = joblib.load(fopen(config.tfidf_path, mode='rb'))

In [None]:
# Inspect the vectorizer: number of learned terms, then the term -> column-index mapping.
print(len(tfidf_vectorizer.vocabulary_))
tfidf_vectorizer.vocabulary_

In [None]:
# Show the learned inverse-document-frequency weight of each term.
tfidf_vectorizer.idf_

In [None]:
# Sanity check: vectorize two sample names (written with "<" and ">" markers, presumably the
# padding that add_padding produces -- confirm) and show the dense result.
tfidf_vectorizer.transform(["<richard>", "<dallan>"]).todense()

In [None]:
# Load the ensemble model (passed to get_validation_results for the "nama" algorithms).
# NOTE(review): joblib.load unpickles the file; only load artifacts from storage you trust.
ensemble_model = joblib.load(fopen(config.ensemble_model_path, mode='rb'))

In [None]:
# Load the name -> cluster mapping and report how many names and distinct clusters it holds.
name_cluster = read_clusters(config.cluster_path)
print("name_cluster", len(name_cluster))
print("unique clusters", len(set(name_cluster.values())))

In [None]:
# Preview the first 22 (name, cluster) entries of the mapping.
for _, (key, value) in zip(range(22), name_cluster.items()):
    print(key, value)

In [None]:
# Load the precomputed per-name cluster scores (indexed by name; each value is a sequence of score entries).
cluster_scores = read_cluster_scores(config.cluster_scores_path)

In [None]:
# For every name in cluster_scores, keep the first element of its first score entry as that name's cluster
# (presumably entries are ordered best-first -- confirm in read_cluster_scores).
name_cluster_large = {name: name_scores[0][0] for name, name_scores in cluster_scores.items()}
len(name_cluster_large)

In [None]:
# Preview the first 12 (name, cluster) entries of the larger mapping.
for _, (key, value) in zip(range(12), name_cluster_large.items()):
    print(key, value)

In [None]:
# Keep only in-vocab input names (that also have a cluster assignment) and, for each of them,
# only the out-of-vocab actual names, so the evaluation compares in-vocab against out-of-vocab.
input_names_test_iv = []
weighted_actual_names_test_iv = []
oov_candidates = set()
for input_name, wans in zip(input_names_test, weighted_actual_names_test):
    if input_name not in swivel_vocab or input_name not in name_cluster_large:
        continue
    # out-of-vocab actual names with a non-zero frequency
    oov_freqs = [(name, freq) for name, _weight, freq in wans if name not in swivel_vocab and not freq == 0]
    if len(oov_freqs) == 0:
        continue
    # re-normalize the weights so they sum to 1 over the remaining names
    sum_freq = sum(freq for _name, freq in oov_freqs)
    wans_oov = [(name, freq / sum_freq, freq) for name, freq in oov_freqs]
    input_names_test_iv.append(input_name)
    weighted_actual_names_test_iv.append(wans_oov)
    oov_candidates.update(name for name, _, _ in wans_oov)
candidate_names_test_oov = list(oov_candidates)

In [None]:
# Compare sizes before and after filtering: inputs (all vs in-vocab), candidates (all vs out-of-vocab).
print(len(input_names_test))
print(len(input_names_test_iv))
print(len(candidate_names_test))
print(len(candidate_names_test_oov))

In [None]:
# From here on the *_test names refer to the filtered data (in-vocab inputs, out-of-vocab candidates).
# NOTE(review): this rebinds names created by earlier cells, so the unfiltered lists are no longer
# reachable and re-running the size-printing cell above will report the filtered sizes.
input_names_test = input_names_test_iv
weighted_actual_names_test = weighted_actual_names_test_iv
# candidates are stored as a numpy array because get_similars indexes them with index arrays
candidate_names_test = np.array(candidate_names_test_oov)

### Other Models

In [None]:
# Shared encoder instance used by get_codes for the "refined_soundex" algorithm.
refined_soundex = RefinedSoundex()

In [None]:
# Algorithms to evaluate; each entry selects a branch in evaluate_algos.
# The numeric suffix on the lev* entries is parsed there and passed to get_best_lev_matches
# as the number of matches to request; the plain names are phonetic codings handled by get_codes.
coding_algos = [
    "levclustered-100",
    "levclusters-100",
    "soundex",
    "refined_soundex",
    "nysiis",
    "metaphone",
#     "nama-60",
]

### Similarity functions

In [None]:
def get_codes(name, algo):
    """Return the phonetic code(s) of a name under the given coding algorithm.

    Args:
        name: the name to encode; it is passed through remove_padding before encoding.
        algo: one of "soundex", "nysiis", "metaphone" or "refined_soundex".

    Returns:
        A single-element list containing the code of the name.

    Raises:
        ValueError: if algo is not one of the supported algorithms. Without this check an
            unknown algorithm returned None, which only surfaced later as a confusing
            TypeError when calc_similarity_to tried to build a set from it.
    """
    supported = ("soundex", "nysiis", "metaphone", "refined_soundex")
    if algo not in supported:
        raise ValueError(f"unsupported coding algorithm {algo!r}; expected one of {supported}")
    name = remove_padding(name)
    if algo == "soundex":
        return [jellyfish.soundex(name)]
    if algo == "nysiis":
        return [jellyfish.nysiis(name)]
    if algo == "metaphone":
        return [jellyfish.metaphone(name)]
    return [refined_soundex.phonetics(name)]

In [None]:
def calc_similarity_to(name, name2codes):
    """Build a scoring function that compares candidates against `name`.

    Args:
        name: the input name; must be a key of name2codes.
        name2codes: mapping from name to its list of codes.

    Returns:
        A function taking a row whose first element is a candidate name. It returns 1.0 when
        the candidate's first code (the code the candidate is indexed under) is one of the
        input name's codes, and 0.0 otherwise.
    """
    input_codes = frozenset(name2codes[name])

    def _score_candidate(row):
        candidate_code = name2codes[row[0]][0]
        if candidate_code in input_codes:
            return 1.0
        return 0.0

    return _score_candidate

In [None]:
def get_similars(shared, name=""):
    """Return the k highest-scoring candidates for one input name.

    Args:
        shared: tuple of (candidate_names, k, name2codes), where candidate_names is a 1-d
            numpy array of names and name2codes maps every name to its list of codes.
        name: the input name to find similar candidates for.

    Returns:
        A list of at most k (candidate_name, score) pairs, ordered by descending score.
    """
    candidates, top_k, name2codes = shared
    score_fn = calc_similarity_to(name, name2codes)
    # score every candidate by viewing the names as a column of single-element rows
    scores = np.apply_along_axis(score_fn, 1, candidates[:, None])
    top_idx = np.argsort(scores)[::-1][:top_k]
    return list(zip(candidates[top_idx], scores[top_idx]))

In [None]:
def _get_similar_names_scores(input_names, cluster_candidates, name_cluster_large):
    similar_names = []
    max_names = 0
    similar_scores = []
    # get candidate names in the input name's cluster
    for input_name in input_names:
        candidates = list(cluster_candidates[name_cluster_large[input_name]])
        similar_names.append(candidates)
        similar_scores.append([1.0] * len(candidates))
        if len(candidates) > max_names:
            max_names = len(candidates)
    # pad
    for ix in range(len(input_names)):
        if len(similar_names[ix]) < max_names:
            similar_names[ix] += [''] * (max_names - len(similar_names[ix]))
            similar_scores[ix] += [0.0] * (max_names - len(similar_scores[ix]))
    # turn into np array
    similar_names = np.array(similar_names, dtype="O")
    similar_scores = np.array(similar_scores, dtype="f8")
    # return np.array(input names, candidate names (name, score))
    return np.dstack((similar_names, similar_scores))

# Evaluate each algorithm

In [None]:
def _similars_from_lev_assignment(tfidf_vectorizer,
                                  input_names,
                                  candidate_names,
                                  targets,
                                  num_candidates,
                                  target_to_cluster,
                                  name_cluster_large):
    """Assign each candidate to a cluster via its first levenshtein match, then score per input name.

    Args:
        tfidf_vectorizer: vectorizer forwarded to get_best_lev_matches.
        input_names: input names to build rows for.
        candidate_names: candidate names to assign to clusters.
        targets: array of names that candidates are matched against.
        num_candidates: number of matches to request for each candidate.
        target_to_cluster: function mapping a matched target to its cluster key.
        name_cluster_large: mapping from name to its cluster.

    Returns:
        The array produced by _get_similar_names_scores: for each input name, the
        (candidate_name, 1.0) pairs of the candidates assigned to the input name's cluster.
    """
    print("candidate_names", len(candidate_names))
    print("clusters", len(targets))
    # lev_matches is a list of target matches for each candidate name
    # NOTE(review): n_jobs is hard-coded to 8 here rather than using the notebook-level n_jobs setting
    lev_matches = get_best_lev_matches(tfidf_vectorizer, candidate_names, targets, num_candidates,
                                       n_jobs=8)
    print("lev_matches", len(lev_matches))
    # the first match of each candidate is taken as the target it is closest to
    # (presumably matches are ordered best-first -- confirm in get_best_lev_matches)
    candidate_targets = {}
    for candidate_name, lev_match in zip(candidate_names, lev_matches):
        candidate_targets[candidate_name] = lev_match[0][0]
    # cluster_candidates is the candidates assigned to each cluster
    cluster_candidates = defaultdict(set)
    for candidate, target in candidate_targets.items():
        cluster_candidates[target_to_cluster(target)].add(candidate)
    print("cluster_candidates", len(cluster_candidates))
    # for each input name, similar_names_scores contains (candidate_name, 1.0)
    # for each candidate name in the cluster that the input name has been assigned to
    similar_names_scores = _get_similar_names_scores(input_names, cluster_candidates, name_cluster_large)
    print("similar_names_scores", len(similar_names_scores))
    return similar_names_scores


def _similars_from_phonetic_codes(algo, all_names, input_names, candidate_names):
    """Score candidates for each input name by whether they share a phonetic code.

    Uses the notebook-level num_matches as the maximum number of candidates kept per input name.

    Returns:
        Array of shape (len(input_names), k, 2) whose cells are (candidate_name, score).
    """
    name2codes = {name: get_codes(name, algo) for name in all_names}
    with WorkerPool(shared_objects=(candidate_names, num_matches, name2codes)) as pool:
        per_input = pool.map(get_similars, input_names, progress_bar=True)
    names = np.array([[cell[0] for cell in row] for row in per_input], dtype="O")
    scores = np.array([[cell[1] for cell in row] for row in per_input], dtype="f8")
    # Scores are 0.0 or 1.0 and at most num_matches candidates are kept per input name, so a
    # row summing to num_matches means every kept candidate matched and more may have been cut off.
    total = scores.sum(axis=1).max()
    print("max sum of scores", total)
    if total == num_matches:
        print("WARNING: need to increase num_matches")
    return np.dstack((names, scores))


def evaluate_algos(coding_algos,
                   swivel_vocab,
                   swivel_model,
                   name_freq,
                   name_cluster,
                   tfidf_vectorizer,
                   ensemble_model,
                   input_names,
                   weighted_actual_names,
                   candidate_names,
                   name_cluster_large):
    """Compute precision and recall for each algorithm and plot them as one point per algorithm.

    Algorithm names select the approach:
      * "nama" / "nama-<threshold>": get_validation_results with the swivel + ensemble models;
        the threshold is given in hundredths (e.g. "nama-60" -> 0.60).
      * "levclusters-<n>": each candidate is matched against the padded cluster values of
        name_cluster and assigned to its first match.
      * "levclustered-<n>": each candidate is matched against the keys of name_cluster and
        assigned to the cluster that name_cluster_large gives for its first match.
      * anything else: a phonetic coding understood by get_codes.

    Reads the notebook-level settings num_matches, sample_size, n_jobs, verbose and
    NAMA_MAX_CLUSTERS.

    Args:
        coding_algos: algorithm names to evaluate.
        swivel_vocab, swivel_model, name_freq, ensemble_model: forwarded to
            get_validation_results for the "nama" algorithms.
        name_cluster: mapping from clustered name to cluster.
        tfidf_vectorizer: vectorizer forwarded to get_validation_results and get_best_lev_matches.
        input_names: names to find matches for.
        weighted_actual_names: for each input name, its list of (name, weight, frequency) truths.
        candidate_names: numpy array of names that may be returned as matches.
        name_cluster_large: mapping from name to cluster; must contain every input name for
            the lev* algorithms.

    Returns:
        None. Prints progress and the precision/recall of each algorithm, and shows the plot.
    """
    figure, ax = plt.subplots(1, 1, figsize=(20, 15))
    ax.set_title("PR at threshold")
    ax.set_xlabel("recall")
    ax.set_ylabel("precision")
    colors = cm.rainbow(np.linspace(0, 1, len(coding_algos)))
    # only the phonetic algorithms need this, but it is the same for all of them
    all_names = list(set(input_names).union(set(candidate_names)))

    for algo, color in zip(coding_algos, colors):
        print(algo, datetime.now())
        if algo.startswith("nama"):
            if algo == "nama":
                search_threshold = 0
                max_clusters = 1  # return just one cluster
            else:
                _, search_threshold = algo.split('-')
                search_threshold = int(search_threshold) / 100.0
                max_clusters = NAMA_MAX_CLUSTERS
            results = get_validation_results(
                input_names_eval=input_names,
                weighted_actual_names_eval=weighted_actual_names,
                candidate_names_eval=candidate_names,
                name_freq=name_freq,
                name_cluster=name_cluster,
                swivel_model=swivel_model,
                swivel_vocab=swivel_vocab,
                tfidf_vectorizer=tfidf_vectorizer,
                ensemble_model=ensemble_model,
                search_threshold=search_threshold,
                num_matches=num_matches,
                max_clusters=max_clusters,
                sample_size=sample_size,
                validation_sizes=[0],
                n_jobs=n_jobs,
                verbose=verbose)
            precision = results['precisions'][0][search_threshold]
            recall = results['recalls'][0][search_threshold]
        else:
            if algo.startswith("levclusters"):
                # associate candidates with the closest cluster "root"
                _, num_candidates = algo.split('-')
                # targets are the padded "root" names of the clusters
                targets = np.array([add_padding(cluster) for cluster in set(name_cluster.values())])
                similar_names_scores = _similars_from_lev_assignment(
                    tfidf_vectorizer, input_names, candidate_names, targets, int(num_candidates),
                    remove_padding, name_cluster_large)
            elif algo.startswith("levclustered"):
                # associate candidates with the cluster of the closest clustered name
                _, num_candidates = algo.split('-')
                # targets are the clustered names themselves
                targets = np.array(list(name_cluster.keys()))
                similar_names_scores = _similars_from_lev_assignment(
                    tfidf_vectorizer, input_names, candidate_names, targets, int(num_candidates),
                    lambda clustered_name: name_cluster_large[clustered_name], name_cluster_large)
            else:
                similar_names_scores = _similars_from_phonetic_codes(
                    algo, all_names, input_names, candidate_names)
            # scores are 0.0 or 1.0, so a 0.5 threshold keeps exactly the matching candidates
            precision = metrics.avg_precision_at_threshold(weighted_actual_names, similar_names_scores, 0.5)
            recall = metrics.avg_weighted_recall_at_threshold(weighted_actual_names, similar_names_scores, 0.5)
        print(f"precision={precision} recall={recall}")
        ax.plot([recall], [precision], "o--", color=color, label=algo)
    print("complete", datetime.now())

    ax.legend()
    ax.set_xlim(0, 1.0)
    ax.set_ylim(0, 1.0)
    plt.show()

#### on out-of-vocabulary names (test data)

In [None]:
# Draw a random sample of `sample_size` input names (with their weighted actual names) to evaluate on.
# A fixed seed keeps the sampled input names the same from run to run, so results are comparable.
SAMPLE_SEED = 42
_, input_names_sample, _, weighted_actual_names_sample = \
    train_test_split(input_names_test, weighted_actual_names_test, test_size=sample_size,
                     random_state=SAMPLE_SEED)
# candidates are not sampled: every out-of-vocab candidate stays eligible
candidate_names_sample = candidate_names_test

print("input_names", len(input_names_sample))
print("weighted_actual_names", len(weighted_actual_names_sample))
print("candidate_names", len(candidate_names_sample))
print("all names", len(set(input_names_sample).union(set(candidate_names_sample))))

In [None]:
# Sanity check on the sample: for every (input name, actual name) pair, count how many of the
# two names are in the swivel vocabulary.
n_zero = n_one = n_two = 0
for input_name, wans in zip(input_names_sample, weighted_actual_names_sample):
    input_in_vocab = input_name in swivel_vocab
    for actual_name, _, _ in wans:
        n_in_vocab = int(input_in_vocab) + int(actual_name in swivel_vocab)
        if n_in_vocab == 2:
            n_two += 1
        elif n_in_vocab == 1:
            n_one += 1
        else:
            n_zero += 1
print("two names in vocab (should not be possible)", n_two)
print("one name in vocab", n_one)
print("zero names in vocab", n_zero)

In [None]:
%%time
# Evaluate every configured algorithm on the sampled test data and plot one
# precision/recall point per algorithm; %%time reports how long the full run takes.
evaluate_algos(coding_algos,
               swivel_vocab,
               swivel_model,
               name_freq,
               name_cluster,
               tfidf_vectorizer,
               ensemble_model,
               input_names_sample,
               weighted_actual_names_sample,
               candidate_names_sample,
               name_cluster_large)

In [None]:
# Load the aggregated name pairs (the columns name, alt_name and frequency are used below).
aggr_df = pd.read_parquet(config.aggr_path)

In [None]:
# Show the size of the aggregate table and its first rows.
print(aggr_df.shape)
aggr_df.head(3)

In [None]:
# Total the pair frequencies by whether each name of the pair has precomputed cluster scores.
# Both the name and the alt_name of a row contribute the row's frequency; names of length <= 1 are skipped.
total_in_cluster_scores = 0
total_not_in_cluster_scores = 0
for name, alt_name, frequency in zip(aggr_df['name'], aggr_df['alt_name'], aggr_df['frequency']):
    for variant in (name, alt_name):
        if not len(variant) > 1:
            continue
        if add_padding(variant) in cluster_scores:
            total_in_cluster_scores += frequency
        else:
            total_not_in_cluster_scores += frequency

In [None]:
# Frequency covered by cluster_scores, frequency not covered, and the uncovered fraction of the total.
print(total_in_cluster_scores)
print(total_not_in_cluster_scores)
print(total_not_in_cluster_scores / (total_in_cluster_scores + total_not_in_cluster_scores))

In [None]:
# Close the Weights & Biases run started at the top of the notebook.
wandb.finish()