# Train a TfidfVectorizer to filter names sent to levenshtein
Levenshtein is slow, so we use a TfidfVectorizer to reduce the number of candidates that we pass to Levenshtein. We try different hyperparameters to see which yield the best results.

Save the best TfidfVectorizer model so we can reuse it later.

In addition, implement our own TfidfVectorizer (transform only) so we can port it to other languages.

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
from collections import namedtuple
import math

import jellyfish
import joblib
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
from mpire import WorkerPool
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.utils.extmath import safe_sparse_dot
import torch
from tqdm import tqdm
import wandb

from src.data.filesystem import fopen
from src.data.utils import load_dataset, select_frequent_k, frequent_k_names
from src.eval import metrics
from src.eval.utils import similars_to_ndarray
from src.models.ensemble import get_best_ensemble_matches
from src.models.swivel import SwivelModel, get_best_swivel_matches
from src.models.swivel_encoder import SwivelEncoderModel
from src.models.utils import remove_padding, add_padding

In [None]:
# Notebook configuration

# Name type to process; it selects the data files and the vocabulary size.
given_surname = "surname"
if given_surname == "given":
    vocab_size = 610000
else:
    vocab_size = 2100000
sample_size = 1000  # number of input names sampled from each dataset
num_matches = 5000  # number of best-scoring candidates kept per input name
batch_size = 32  # alternative value noted by the author: 256

# Locations of the input datasets and of the saved tf-idf model.
Config = namedtuple("Config", ["train_path", "test_path", "tfidf_path"])
config = Config(
    train_path=f"s3://familysearch-names/processed/tree-hr-{given_surname}-train-augmented.csv.gz",
    test_path=f"s3://familysearch-names/processed/tree-hr-{given_surname}-test.csv.gz",
    tfidf_path=f"s3://nama-data/data/models/fs-{given_surname}-tfidf.joblib",
)

In [None]:
# Start a Weights & Biases run for this notebook. The run is grouped by name
# type and records the data/model paths from `config` as its configuration.
wandb.init(
    project="nama",
    entity="nama",
    name="65_tfidf",
    group=given_surname,
    notes="",
    config=config._asdict(),
)

### Load data

In [None]:
# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
# The CUDA memory statistics can only be queried when a CUDA device exists;
# asking for them on a CPU-only machine raises instead of printing.
if torch.cuda.is_available():
    print("cuda total", torch.cuda.get_device_properties(0).total_memory)
    print("cuda reserved", torch.cuda.memory_reserved(0))
    print("cuda allocated", torch.cuda.memory_allocated(0))

In [None]:
# Load the training dataset as (input names, weighted actual names, candidate names).
input_names_train, weighted_actual_names_train, candidate_names_train = load_dataset(config.train_path, verbose=True)

In [None]:
# Load the test dataset (in evaluation mode) as (input names, weighted actual names, candidate names).
input_names_test, weighted_actual_names_test, candidate_names_test = load_dataset(config.test_path, is_eval=True, verbose=True)

In [None]:
# Draw `sample_size` random input names, together with their weighted actual
# names, from each dataset. The candidate names are not sampled: the
# "*_sample" candidate variables refer to the full candidate arrays.

# train data (in-vocabulary names)
(
    _,
    input_names_train_sample,
    _,
    weighted_actual_names_train_sample,
) = train_test_split(
    input_names_train,
    weighted_actual_names_train,
    test_size=sample_size,
)
candidate_names_train_sample = candidate_names_train

# test data (out-of-vocabulary names)
(
    _,
    input_names_test_sample,
    _,
    weighted_actual_names_test_sample,
) = train_test_split(
    input_names_test,
    weighted_actual_names_test,
    test_size=sample_size,
)
candidate_names_test_sample = candidate_names_test

In [None]:
# Report the size of every sampled collection: train first, then test.
for _label, _collection in (
    ("input_names_train_sample", input_names_train_sample),
    ("weighted_actual_names_train_sample", weighted_actual_names_train_sample),
    ("candidate_names_train_sample", candidate_names_train_sample),
    ("input_names_test_sample", input_names_test_sample),
    ("weighted_actual_names_test_sample", weighted_actual_names_test_sample),
    ("candidate_names_test_sample", candidate_names_test_sample),
):
    print(_label, len(_collection))

In [None]:
# Drop the references to the full input datasets; only the samples taken
# from them are used from here on.
del (
    input_names_train,
    weighted_actual_names_train,
    input_names_test,
    weighted_actual_names_test,
)

### Set up tfidf

In [None]:
# Fit a character n-gram tf-idf model on the training candidate names, then
# vectorize the test candidate names with the fitted model.
tfidf_vectorizer = TfidfVectorizer(
    analyzer="char_wb",
    ngram_range=(1, 2),
    min_df=40000,
    max_df=0.5,
)
tfidf_X_train_sample = tfidf_vectorizer.fit_transform(candidate_names_train_sample)
tfidf_X_test_sample = tfidf_vectorizer.transform(candidate_names_test_sample)

In [None]:
# Shape of the tf-idf matrix produced for the test candidate names.
tfidf_X_test_sample.shape

In [None]:
# Peek at the first three test candidate names.
candidate_names_test_sample[0:3]

In [None]:
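# Results of manual runs, one per line. Format (presumably):
# ngram_range, min_df, max_df => vocabulary size @ tf-idf threshold: run time, AUC
# (1,3), 10, 0.5 => 8936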
# (1,3), 100, 0.5 => 4744
# (1,3), 1000, 0.5 => 1428
# (1,3), 10000, 0.5 => 221
# (1,3), 15000, 0.5 => 160 not as good as bigrams
# (1,2), 10, 0.5 => 700
# (1,2), 100, 0.5 => 592
# (1,2), 1000, 0.5 => 406
# (1,2), 5000, 0.5 => 233 @ threshold=0.4: 30sec, .4193auc
# (1,2), 10000, 0.5 => 160 @ threshold=0.45 28sec .4188auc
# (1,2), 20000, 0.5 => 97 @ threshold=0.55 21sec  .4167auc threshold=0.5 29sec  .4180auc
# (1,2), 40000, 0.5 => 46 @ threshold=0.65 20sec  .411auc  threshold=0.6 30sec  .415auc
# ^^^ winner

In [None]:
# Inspect the fitted vocabulary (used below as the n-gram -> column index mapping).
tfidf_vectorizer.vocabulary_

In [None]:
# Inspect the fitted idf weights (used below indexed by vocabulary column).
tfidf_vectorizer.idf_

In [None]:
# Sanity check: vectorize one hand-made string with the sklearn model and
# show the result as a dense matrix.
test = ["<a<c<a"]
tfidf_vectorizer.transform(test).todense()

### Simple (but slow) Tfidf transformer implementation that is portable to other languages

In [None]:
class MyTfidfVectorizer:
    """Minimal, transform-only tf-idf vectorizer over character n-grams.

    The implementation deliberately uses plain loops so that it is easy to
    port to other languages.

    The input is used exactly as given: it is neither lower-cased nor padded
    before n-grams are extracted.
    NOTE(review): sklearn's "char_wb" analyzer pads words with spaces and
    TfidfVectorizer lower-cases by default -- confirm that the outputs agree
    for the inputs and vocabulary you care about.

    Args:
        vocab: mapping from n-gram string to column index.
        idf: sequence of idf weights, indexed by column index.
        ngram_range: ``(min_n, max_n)`` pair of inclusive n-gram lengths;
            defaults to ``(1, 2)``.
    """

    def __init__(self, vocab, idf, ngram_range=None):
        self.vocab = vocab
        self.idf = idf
        self.ngram_range = (1, 2) if ngram_range is None else ngram_range

    def transform(self, word):
        """Return the L2-normalized tf-idf vector of ``word``.

        The result is a 1-D array with one entry per vocabulary item; it is
        all zeros when no n-gram of ``word`` is in the vocabulary.
        """
        n_features = len(self.vocab)
        min_n, max_n = self.ngram_range

        # get counts
        result = np.zeros(n_features)
        # every length from min_n to max_n inclusive, not just the two endpoints
        for ngram_len in range(min_n, max_n + 1):
            for pos in range(len(word) - ngram_len + 1):
                tok = word[pos:pos + ngram_len]
                ix = self.vocab.get(tok, -1)
                if ix >= 0:
                    result[ix] += 1

        # multiply counts by idf
        sum_squares = 0.0
        for ix in range(n_features):
            tf_idf = result[ix] * self.idf[ix]
            result[ix] = tf_idf
            sum_squares += tf_idf * tf_idf

        # divide by l2 norm (skipped for an all-zero vector)
        norm = math.sqrt(sum_squares)
        if norm > 0.0:
            for ix in range(n_features):
                result[ix] /= norm

        return result

In [None]:
# Build the portable vectorizer from the fitted sklearn model. The n-gram
# range is passed explicitly so that both implementations stay in sync when
# the sklearn model's ngram_range is changed.
tfidf = MyTfidfVectorizer(
    tfidf_vectorizer.vocabulary_,
    tfidf_vectorizer.idf_,
    ngram_range=tfidf_vectorizer.ngram_range,
)

In [None]:
# Vectorize the same probe string with the portable implementation, for
# comparison with the sklearn output shown earlier.
tfidf.transform("<a<c<a")

### Levenshtein similarity

In [None]:
def calc_lev_similarity_to(name):
    """Return a function that scores a candidate against ``name``.

    Padding is removed from ``name`` once, up front. The returned function
    takes a ``row`` whose first element is a candidate name, removes its
    padding, and returns ``1 - distance / longer_length``, where ``distance``
    is the Levenshtein distance between the two unpadded names.
    """
    name = remove_padding(name)

    def calc_similarity(row):
        cand_name = remove_padding(row[0])
        longest = max(len(name), len(cand_name))
        if longest == 0:
            # both names are empty once padding is removed: they are
            # identical, and dividing by the length would raise
            return 1.0
        dist = jellyfish.levenshtein_distance(name, cand_name)
        return 1 - (dist / longest)

    return calc_similarity

In [None]:
def get_similars(shared, names, _=None):
    """Return the best-scoring candidates for every name in ``names``.

    Args:
        shared: tuple ``(candidate_names, k, algo, tfidf_vectorizer, tfidf_X)``:
            candidate_names: array of candidate names; it is indexed with
                NumPy-style index arrays, so it must be a NumPy array.
            k: maximum number of candidates to return per name.
            algo: scoring algorithm:
                ``"tfidf"`` scores with the tf-idf dot product;
                ``"tfidf+lev_<threshold>"`` scores with Levenshtein
                similarity, but only for candidates whose tf-idf score is
                above ``<threshold>`` (all others score 0);
                any other value scores every candidate with Levenshtein
                similarity.
            tfidf_vectorizer: fitted vectorizer (unused for pure Levenshtein).
            tfidf_X: tf-idf matrix of the candidates (unused for pure
                Levenshtein).
        names: iterable of names to look up.
        _: ignored; lets a batch be passed as a ``(names, index)`` tuple.

    Returns:
        One list per input name, each holding up to ``k``
        ``(candidate_name, score)`` tuples ordered by descending score.
    """
    candidate_names_test, k, algo, tfidf_vectorizer, tfidf_X_test = shared

    def tfidf_scores_for(name):
        # dot product of the name's tf-idf vector with every candidate vector
        x = tfidf_vectorizer.transform([name]).toarray()
        return safe_sparse_dot(tfidf_X_test, x.T).flatten()

    def get_similars_for_name(name):
        if algo == "tfidf":
            scores = tfidf_scores_for(name)
        elif algo.startswith("tfidf+lev"):
            # the threshold is encoded in the algorithm name after the "_"
            threshold = float(algo.split("_")[1])
            scores = np.zeros(len(candidate_names_test))
            ixs = (tfidf_scores_for(name) > threshold).nonzero()[0]
            if len(ixs) > 0:
                lev_scores = np.apply_along_axis(calc_lev_similarity_to(name),
                                                 1, candidate_names_test[ixs, None])
                scores[ixs] = lev_scores
        else:
            scores = np.apply_along_axis(calc_lev_similarity_to(name),
                                         1, candidate_names_test[:, None])

        # never ask for more results than there are candidates: argpartition
        # raises when kth is out of bounds, and k == 0 would select everything
        top_k = min(k, len(scores))
        if top_k <= 0:
            return []

        # partial sort: pick the top_k best scores, then order just those
        partitioned_idx = np.argpartition(scores, -top_k)[-top_k:]
        sorted_partitioned_idx = np.argsort(scores[partitioned_idx])[::-1]
        sorted_scores_idx = partitioned_idx[sorted_partitioned_idx]

        candidate_names = candidate_names_test[sorted_scores_idx]
        candidate_scores = scores[sorted_scores_idx]

        return list(zip(candidate_names, candidate_scores))

    return [get_similars_for_name(name) for name in names]

#### Create batches

In [None]:
def create_batches(names, batch_size):
    """Split ``names`` into consecutive slices of at most ``batch_size`` items.

    Returns a list of ``(batch, start_index)`` tuples, where ``start_index``
    is the position in ``names`` at which the batch begins.
    """
    # each batch is wrapped in a tuple to keep mpire from expanding the batch
    return [
        (names[start:start + batch_size], start)
        for start in range(0, len(names), batch_size)
    ]

### Test levenshtein

In [None]:
# Look up the 10 best candidates for one probe name using Levenshtein
# similarity only (no tf-idf objects are needed for this path).
probe_name = "<bostelman>" if given_surname == "surname" else "<richard>"
get_similars((candidate_names_test_sample, 10, "levenshtein", None, None), [probe_name])

### Test tfidf

In [None]:
# Look up the 10 best candidates for the probe name using tf-idf scores only.
probe_name = "<bostelman>" if given_surname == "surname" else "<richard>"
get_similars((candidate_names_test_sample, 10, "tfidf", tfidf_vectorizer, tfidf_X_test_sample), [probe_name])

### Test tfidf+lev

In [None]:
# Look up the 10 best candidates for the probe name using the combined path:
# candidates with a tf-idf score above 0.76 are scored with Levenshtein.
probe_name = "<bostelman>" if given_surname == "surname" else "<richard>"
get_similars((candidate_names_test_sample, 10, "tfidf+lev_0.76", tfidf_vectorizer, tfidf_X_test_sample), [probe_name])

# Evaluate at various thresholds

In [None]:
# One entry per algorithm to evaluate. For "tfidf+lev_<threshold>" names, the
# number after the underscore is the tf-idf filter threshold.
SimilarityAlgo = namedtuple("SimilarityAlgo", "name min_threshold max_threshold distances")
similarity_algos = [
    SimilarityAlgo(name="tfidf+lev_0.7", min_threshold=0.0, max_threshold=1.01, distances=False),
    SimilarityAlgo(name="tfidf+lev_0.65", min_threshold=0.0, max_threshold=1.01, distances=False),
    SimilarityAlgo(name="tfidf+lev_0.6", min_threshold=0.0, max_threshold=1.01, distances=False),
    # uncomment to also evaluate plain Levenshtein over all candidates
    # SimilarityAlgo(name="levenshtein", min_threshold=0.0, max_threshold=1.01, distances=False),
]

In [None]:
def evaluate_algos(similarity_algos,
                   input_names,
                   weighted_actual_names,
                   candidate_names,
                   tfidf_X,
                   n_jobs=1):
    """Evaluate each algorithm and plot its precision/recall curve.

    For every entry of ``similarity_algos`` this finds the best candidates
    for each input name, computes precision and weighted recall over a range
    of thresholds, prints the resulting AUC, and adds one curve to a shared
    precision/recall plot that is shown at the end.

    Reads the module-level ``batch_size``, ``num_matches`` and
    ``tfidf_vectorizer``.

    Args:
        similarity_algos: sequence of ``SimilarityAlgo`` entries.
        input_names: names to look up.
        weighted_actual_names: expected matches for ``input_names``, passed
            to the metrics functions.
        candidate_names: candidate names to search.
        tfidf_X: tf-idf matrix of ``candidate_names``.
        n_jobs: number of worker processes used to find candidates; with the
            default of 1 the batches are processed serially in this process.
    """
    _figure, ax = plt.subplots(1, 1, figsize=(20, 15))
    ax.set_title("PR at threshold")
    colors = cm.rainbow(np.linspace(0, 1, len(similarity_algos)))

    for algo, color in zip(similarity_algos, colors):
        print(algo.name)
        shared = (candidate_names, num_matches, algo.name, tfidf_vectorizer, tfidf_X)
        input_names_batches = create_batches(input_names, batch_size=batch_size)
        if n_jobs == 1:
            similar_names_scores = []
            for input_names_batch, _ in tqdm(input_names_batches):
                similar_names_scores.append(get_similars(shared, input_names_batch))
        else:
            with WorkerPool(shared_objects=shared, n_jobs=n_jobs) as pool:
                similar_names_scores = pool.map(get_similars, input_names_batches, progress_bar=True)
        # drop the reference so the batches can be garbage-collected
        input_names_batches = None
        # flatten
        similar_names_scores = [name_score for batch in similar_names_scores for name_score in batch]
        # convert to ndarray
        similar_names_scores = similars_to_ndarray(similar_names_scores)
        print("calculating precision and recall")
        precisions, recalls = metrics.precision_weighted_recall_at_threshold(
            weighted_actual_names,
            similar_names_scores,
            min_threshold=algo.min_threshold,
            max_threshold=algo.max_threshold,
            step=0.01,
            distances=algo.distances,
            n_jobs=1,
            progress_bar=True,
        )
        # drop the reference so the score array can be garbage-collected
        similar_names_scores = None
        print("auc", metrics.get_auc_from_precisions_recalls(
            precisions,
            recalls,
            distances=algo.distances
        ))
        ax.plot(recalls, precisions, "o--", color=color, label=algo.name)

    ax.legend()
    plt.xlim([0, 1.0])
    plt.ylim([0, 1.0])
    plt.show()

## on out-of-vocabulary names (test data)

In [None]:
# Evaluate the configured algorithms on the sampled test input names against
# the test candidate names and their tf-idf matrix.
evaluate_algos(similarity_algos, 
               input_names_test_sample, 
               weighted_actual_names_test_sample, 
               candidate_names_test_sample, 
               tfidf_X_test_sample)

In [None]:
# Save the fitted vectorizer. The file is opened in a `with` block so the
# handle is closed (and buffered data flushed) even if dump() raises.
# NOTE(review): assumes fopen() returns a file object that supports the
# context-manager protocol -- confirm.
with fopen(config.tfidf_path, mode='wb') as f:
    joblib.dump(tfidf_vectorizer, f)

In [None]:
# Close the Weights & Biases run started at the top of the notebook.
wandb.finish()