In [None]:
# Notebook setup: load the autoreload extension, enable it in mode 2, and
# render matplotlib figures inline in the notebook.
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
from collections import namedtuple

import cologne_phonetics
import jellyfish
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
from metaphone import doublemetaphone
from mpire import WorkerPool
import numpy as np
from pyphonetics import RefinedSoundex
from sklearn.model_selection import train_test_split
from spellwise import CaverphoneOne, CaverphoneTwo
import pandas as pd
import torch
from tqdm import tqdm
import wandb

from src.data.filesystem import fopen
from src.data.utils import load_datasets
from src.eval import metrics
from src.models.utils import remove_padding
from src.models.cluster import read_clusters, get_clusters
from src.models.swivel import SwivelModel, get_swivel_embeddings
from src.models.swivel_encoder import SwivelEncoderModel

In [None]:
# config: which name type is evaluated and where its artifacts live

given_surname = "given"
embed_dim = 100

# the vocabulary size differs between the "given" setting and the other one
if given_surname == "given":
    vocab_size = 600000
else:
    vocab_size = 2100000
encoder_vocab_size = vocab_size

# nama settings: minimum cluster score for a cluster to be used as a code, and
# the max_clusters value handed to get_clusters
NAMA_THRESHOLD = 0.55
NAMA_MAX_CLUSTERS = 10

Config = namedtuple("Config", "train_path test_path embed_dim swivel_vocab_path swivel_model_path encoder_model_path cluster_path")
config = Config(
    embed_dim=embed_dim,
    # datasets
    train_path=f"s3://familysearch-names/processed/tree-hr-{given_surname}-train.csv.gz",
    test_path=f"s3://familysearch-names/processed/tree-hr-{given_surname}-test.csv.gz",
    # trained model artifacts
    swivel_vocab_path=f"s3://nama-data/data/models/fs-{given_surname}-swivel-vocab-{vocab_size}.csv",
    # FIX (flag left by the original author)
    # NOTE(review): presumably refers to the hard-coded "-50" suffix below - confirm
    swivel_model_path=f"s3://nama-data/data/models/fs-{given_surname}-swivel-model-{vocab_size}-{embed_dim}-50.pth",
    encoder_model_path=f"s3://nama-data/data/models/fs-{given_surname}-encoder-model-{encoder_vocab_size}-{embed_dim}.pth",
    # precomputed clusters
    cluster_path=f"s3://nama-data/processed/tree-hr-{given_surname}-clusters-{vocab_size}-{embed_dim}.csv.gz",
)

In [None]:
# display / progress-bar setup
np.set_printoptions(suppress=True)
tqdm.pandas()

# start the wandb run for this comparison; the notebook config is attached to it
wandb.init(
    entity="nama",
    project="nama",
    group=given_surname,
    name="90_compare_coders",
    notes="fs in-vocab",
    config=config._asdict(),
)

### Load data

In [None]:
# read data: one dataset per path, each unpacked into
# (input names, weighted actual names, candidate names)

dataset_paths = [config.train_path, config.test_path]
train, test = load_datasets(dataset_paths)

(input_names_train, weighted_actual_names_train, candidate_names_train) = train
(input_names_test, weighted_actual_names_test, candidate_names_test) = test

In [None]:
# report how many input and candidate names each split contains
for _label, _values in (
    ("input_names_train", input_names_train),
    ("candidate_names_train", candidate_names_train),
    ("input_names_test", input_names_test),
    ("candidate_names_test", candidate_names_test),
):
    print(_label, len(_values))

In [None]:
# Load the swivel vocabulary CSV and build the name -> index lookup.
# The handle is closed as soon as the CSV has been read instead of being leaked.
# NOTE(review): assumes fopen returns a file object usable as a context manager
# (as the built-in open does) - confirm against src.data.filesystem.
with fopen(config.swivel_vocab_path, "rb") as vocab_file:
    swivel_vocab_df = pd.read_csv(vocab_file)
swivel_vocab = dict(zip(swivel_vocab_df["name"], swivel_vocab_df["index"]))

In [None]:
# use the first CUDA device when torch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# Build the swivel model sized to the vocabulary and restore its saved weights.
swivel_model = SwivelModel(len(swivel_vocab), embedding_dim=config.embed_dim)
# Close the checkpoint handle once the weights are loaded instead of leaking it.
# NOTE(review): assumes fopen returns a file object usable as a context manager - confirm.
# `device` is already a torch.device, so it is passed to map_location directly.
with fopen(config.swivel_model_path, "rb") as swivel_model_file:
    swivel_model.load_state_dict(torch.load(swivel_model_file, map_location=device))
swivel_model.to(device)
swivel_model.eval()  # evaluation mode: the model is only used to produce embeddings here
print(swivel_model)

In [None]:
# Build the encoder model and restore its saved weights.
encoder_model = SwivelEncoderModel(output_dim=config.embed_dim, device=device)
# Close the checkpoint handle once the weights are loaded instead of leaking it.
# NOTE(review): assumes fopen returns a file object usable as a context manager - confirm.
# `device` is already a torch.device, so it is passed to map_location directly.
with fopen(config.encoder_model_path, "rb") as encoder_model_file:
    encoder_model.load_state_dict(torch.load(encoder_model_file, map_location=device))
encoder_model.to(device)
encoder_model.eval()  # evaluation mode: the model is only used to produce embeddings here
print(encoder_model)

In [None]:
# load the name -> cluster mapping and split it into two position-aligned lists
name2cluster = read_clusters(config.cluster_path)
clustered_names = [*name2cluster.keys()]
# despite its name this is a list: the cluster of clustered_names[i] sits at index i
clustered_name2cluster_id = [*name2cluster.values()]
print("cluster_names", len(clustered_names))

In [None]:
# embeddings for every clustered name; get_clusters later receives these
# together with clustered_name2cluster_id
clustered_name_embeddings = get_swivel_embeddings(
    model=swivel_model,
    vocab=swivel_vocab,
    names=clustered_names,
    encoder_model=encoder_model,
)

### Other models (phonetic coders)

In [None]:
# coder instances that get_codes reuses for every name
caverphone_one, caverphone_two = CaverphoneOne(), CaverphoneTwo()
refined_soundex = RefinedSoundex()

In [None]:
# Coders compared in the evaluation loop, in plotting order.
# Currently disabled (add the string back to the list to re-enable):
#   "caverphone1", "caverphone2", "cologne_phonetics",
#   "double_metaphone"  (bad implementation?)
coding_algos = ["nama", "soundex", "nysiis", "metaphone", "refined_soundex"]

In [None]:
# test double metaphone: two names count as similar when they share a code
name = "smith"
cand_name = "schmidt"
dm1 = doublemetaphone(name)
dm2 = doublemetaphone(cand_name)
# Drop empty codes before comparing: doublemetaphone reports a missing secondary
# code as "", and two names that both lack one must not match on that.
shared_codes = {code for code in dm1 if code} & {code for code in dm2 if code}
similarity = 1.0 if shared_codes else 0.0
print("dm1", dm1)
print("dm2", dm2)
print("similarity", similarity)

### Similarity functions

In [None]:
def get_codes(name, algo):
    """Return the list of codes that coder `algo` assigns to `name`.

    Two names are later treated as similar when their code lists share an entry.

    Args:
        name: the name to encode. For "nama" it is looked up as-is in the
            module-level `name2clusters`; for every other coder it is first
            passed through `remove_padding`.
        algo: one of "nama", "caverphone1", "caverphone2", "refined_soundex",
            "double_metaphone", "cologne_phonetics", "soundex", "nysiis",
            "metaphone".

    Returns:
        A list of codes. "nama" yields the first element of every cluster entry
        whose second element is at least NAMA_THRESHOLD; "double_metaphone"
        yields its non-empty codes; the other coders yield a single code.

    Raises:
        ValueError: if `algo` is not a known coder (this used to fall through
            and return None, which only failed later during comparison).
    """
    if algo == "nama":
        # name2clusters is built by get_clusters before the evaluation loop runs
        return [cluster[0] for cluster in name2clusters[name] if cluster[1] >= NAMA_THRESHOLD]

    name = remove_padding(name)
    if algo == "caverphone1":
        return [caverphone_one._pre_process(name)]
    elif algo == "caverphone2":
        return [caverphone_two._pre_process(name)]
    elif algo == "refined_soundex":
        return [refined_soundex.phonetics(name)]
    elif algo == "double_metaphone":
        # doublemetaphone reports a missing secondary code as ""; keeping it would
        # make every pair of names without a secondary code look like a match
        return [code for code in doublemetaphone(name) if code]
    elif algo == "cologne_phonetics":
        return [cologne_phonetics.encode(name)[0][1]]
    elif algo == "soundex":
        return [jellyfish.soundex(name)]
    elif algo == "nysiis":
        return [jellyfish.nysiis(name)]
    elif algo == "metaphone":
        return [jellyfish.metaphone(name)]

    raise ValueError(f"unknown coding algorithm: {algo!r}")

In [None]:
def calc_similarity_to(name, name2codes):
    """Build a scorer that compares candidate names against `name` by shared codes.

    Args:
        name: the query name; must be a key of `name2codes`.
        name2codes: mapping from name to an iterable of hashable codes.

    Returns:
        A function that takes a row whose first element is a candidate name and
        returns 1.0 if that candidate shares at least one code with `name`,
        otherwise 0.0.
    """
    # Built once per query name: the scorer runs for every candidate, so the
    # query's codes should not be re-scanned in a nested loop on each call.
    query_codes = frozenset(name2codes[name])

    def calc_similarity(row):
        cand_name = row[0]
        return 0.0 if query_codes.isdisjoint(name2codes[cand_name]) else 1.0

    return calc_similarity

In [None]:
def get_similars(shared, name=""):
    """Score every candidate against `name` and return the best k as (candidate, score) pairs.

    Args:
        shared: the (candidate_names, k, name2codes) tuple handed to WorkerPool
            as shared_objects; candidate_names must support numpy-style
            `[:, None]` and index-array indexing.
        name: the query name; must be a key of name2codes.

    Returns:
        A list of at most k (candidate_name, score) tuples, taken from the
        reversed argsort of the scores (highest scores first).
    """
    all_candidates, top_k, codes_by_name = shared
    score_row = calc_similarity_to(name, codes_by_name)
    # apply_along_axis walks rows of a 2-D array, so each candidate gets its own row
    all_scores = np.apply_along_axis(score_row, 1, all_candidates[:, None])
    best_idx = np.argsort(all_scores)[::-1][:top_k]
    return list(zip(all_candidates[best_idx], all_scores[best_idx]))

### Evaluate each algorithm

In [None]:
# eval on training data
# (alternative: evaluate on the full training set instead of a sample)
# input_names = input_names_train
# weighted_actual_names = weighted_actual_names_train
# candidate_names = candidate_names_train

# Evaluate on a 10000-name sample of the training inputs. The fixed random_state
# keeps the sample, and therefore the reported precision/recall, identical
# across runs; without it every run evaluated a different sample.
_, input_names, _, weighted_actual_names = train_test_split(
    input_names_train,
    weighted_actual_names_train,
    test_size=10000,
    random_state=42,
)
candidate_names = candidate_names_train

# every name that needs codes: the sampled inputs plus all candidates
all_names = list(set(input_names).union(set(candidate_names)))

print("input_names", len(input_names))
print("weighted_actual_names", len(weighted_actual_names))
print("candidate_names", len(candidate_names))

In [None]:
# embeddings for every name in all_names, in the same order as all_names
# is passed in; get_clusters receives both together
all_name_embeddings = get_swivel_embeddings(
    model=swivel_model,
    vocab=swivel_vocab,
    names=all_names,
    encoder_model=encoder_model,
)

In [None]:
%%time
# build the name -> clusters lookup that get_codes reads for the "nama" coder;
# the second return value is not needed here
name2clusters, _ = get_clusters(
    all_names,
    all_name_embeddings,
    clustered_name2cluster_id,
    clustered_name_embeddings,
    k=100,
    max_clusters=NAMA_MAX_CLUSTERS,
    verbose=True,
)

In [None]:
# Evaluate every coder in coding_algos and plot one precision/recall point per coder.
k = 5000  # Number of candidates to consider
extra_algos = 0  # number of leading colors reserved for the manually plotted points below
figure, ax = plt.subplots(1, 1, figsize=(20, 15))
ax.set_title("PR at threshold")
ax.set_xlabel("recall")
ax.set_ylabel("precision")
colors = cm.rainbow(np.linspace(0, 1, len(coding_algos) + extra_algos))

# plot anc-triplet-bilstm-100-512-40-05 model
# ax.plot([.809], [.664], "o--", color=colors[0], label="triplet-cluster")
# ax.plot([.594], [.543], "o--", color=colors[1], label="dam-lev-cluster")

for algo, color in zip(coding_algos, colors[extra_algos:]):
    print(algo)
    # codes are computed once per coder and shared with the workers
    name2codes = {name: get_codes(name, algo) for name in all_names}
    with WorkerPool(shared_objects=(candidate_names, k, name2codes)) as pool:
        similar_names_scores = pool.map(get_similars, input_names, progress_bar=True)
    # split the (candidate, score) pairs into two parallel 2-D arrays
    names = np.array([[cand for cand, _ in row] for row in similar_names_scores], dtype="O")
    scores = np.array([[score for _, score in row] for row in similar_names_scores], dtype="f8")
    # scores are 0.0/1.0, so a row summing to k means all k kept candidates matched
    # and further matches may have been cut off
    total = scores.sum(axis=1).max()
    print("max sum of scores", total)
    if total == k:
        print("WARNING!!! need to increase k!!!")
    similar_names_scores = np.dstack((names, scores))
    precision = metrics.avg_precision_at_threshold(weighted_actual_names, similar_names_scores, 0.5)
    recall = metrics.avg_weighted_recall_at_threshold(weighted_actual_names, similar_names_scores, 0.5)
    print(f"precision={precision} recall={recall}")
    ax.plot([recall], [precision], "o--", color=color, label=algo)

ax.legend()
ax.set_xlim([0, 1.0])
ax.set_ylim([0, 1.0])
plt.show()

In [None]:
# finish the wandb run that wandb.init started near the top of the notebook
wandb.finish()