## Train an ensemble model over Swivel + Levenshtein

In [None]:
# Enable autoreload so edited project modules are re-imported before each cell runs.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
from collections import namedtuple

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc, PrecisionRecallDisplay, precision_recall_curve
from sklearn.model_selection import train_test_split
import torch
from tqdm import tqdm
import wandb

from src.data.filesystem import fopen
from src.data.utils import load_dataset
from src.models.ensemble import featurize
from src.models.levenshtein import calc_lev_similarity, get_best_lev_matches
from src.models.swivel import SwivelModel, get_best_swivel_matches
from src.models.utils import add_padding, remove_padding

In [None]:
# config

# Which name type to train on; selects the data files and the swivel vocabulary size.
given_surname = "given"
vocab_size = 610000 if given_surname == "given" else 2100000
# Number of input names sampled from the train and test splits respectively.
train_sample_size = 40000
test_sample_size = 40000
# Probability of keeping each negative example when downsampling.
neg_sample_rate = 0.50
embed_dim = 100
# Number of nearest candidates requested per input name, and the scoring batch size.
num_matches = 4500
batch_size = 256
# NOTE(review): not referenced by any of the visible cells — confirm it is still needed.
tfidf_threshold = 0.65

# Suffix of the saved ensemble model: train sample size (thousands), test sample
# size (thousands) and negative sample rate (percent). Derived from the settings
# above so the file name cannot drift out of sync with them.
sample_tag = (
    f"{train_sample_size // 1000}"
    f"-{test_sample_size // 1000}"
    f"-{round(neg_sample_rate * 100)}"
)

Config = namedtuple("Config", [
    "train_path",
    "test_path",
    "freq_path",
    "embed_dim",
    "swivel_vocab_path",
    "swivel_model_path",
    "tfidf_path",
    "ensemble_model_path",
])
config = Config(
    train_path=f"s3://familysearch-names/processed/tree-hr-{given_surname}-train.csv.gz",
    test_path=f"s3://familysearch-names/processed/tree-hr-{given_surname}-test.csv.gz",
    freq_path=f"s3://familysearch-names/processed/tree-preferred-{given_surname}-aggr.csv.gz",
    embed_dim=embed_dim,
    swivel_vocab_path=f"s3://nama-data/data/models/fs-{given_surname}-swivel-vocab-{vocab_size}-augmented.csv",
    swivel_model_path=f"s3://nama-data/data/models/fs-{given_surname}-swivel-model-{vocab_size}-{embed_dim}-augmented.pth",
    tfidf_path=f"s3://nama-data/data/models/fs-{given_surname}-tfidf.joblib",
    ensemble_model_path=(
        f"s3://nama-data/data/models/fs-{given_surname}-ensemble-model-"
        f"{vocab_size}-{embed_dim}-augmented-{sample_tag}.joblib"
    ),
)

In [None]:
# Start the Weights & Biases run; the Config namedtuple is logged as a plain dict.
wandb_run_kwargs = {
    "project": "nama",
    "entity": "nama",
    "name": "66_ensemble",
    "group": given_surname,
    "notes": "",
    "config": config._asdict(),
}
wandb.init(**wandb_run_kwargs)

In [None]:
# Prefer the first CUDA device when one is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

# Report GPU memory figures for device 0 when running on CUDA.
if torch.cuda.is_available():
    print("cuda total", torch.cuda.get_device_properties(0).total_memory)
    print("cuda reserved", torch.cuda.memory_reserved(0))
    print("cuda allocated", torch.cuda.memory_allocated(0))

## Load data

In [None]:
# Each split unpacks into (input names, weighted actual names, candidate names).
(
    input_names_train,
    weighted_actual_names_train,
    candidate_names_train,
) = load_dataset(config.train_path)
(
    input_names_test,
    weighted_actual_names_test,
    candidate_names_test,
) = load_dataset(config.test_path)

In [None]:
# Build a padded-name -> frequency lookup, then drop the dataframe to free memory.
# na_filter=False stops pandas from turning name strings into NaN.
freq_df = pd.read_csv(config.freq_path, na_filter=False)
name_freq = dict(zip(map(add_padding, freq_df["name"]), freq_df["frequency"]))
del freq_df

In [None]:
# Spot-check the frequency lookup with two padded names (0 when a name is absent).
for probe_name in ("<john>", "<dallan>"):
    print(name_freq.get(probe_name, 0))

In [None]:
# Load the swivel vocabulary as a name -> index mapping.
# The handle is opened in a `with` block so it is closed as soon as the CSV is read.
with fopen(config.swivel_vocab_path, "rb") as vocab_file:
    vocab_df = pd.read_csv(vocab_file)
swivel_vocab = dict(zip(vocab_df["name"], vocab_df["index"]))
del vocab_df

In [None]:
# Rebuild the swivel model with the vocabulary size and embedding dimension it was
# saved with, then restore its weights onto the selected device.
swivel_model = SwivelModel(len(swivel_vocab), config.embed_dim)
# Close the checkpoint handle as soon as the state dict has been deserialized.
with fopen(config.swivel_model_path, "rb") as model_file:
    swivel_state = torch.load(model_file, map_location=device)
swivel_model.load_state_dict(swivel_state)
del swivel_state
swivel_model.to(device)
# Inference only: switch the module to evaluation mode.
swivel_model.eval()

In [None]:
# Load the fitted TF-IDF vectorizer; the `with` block closes the handle after reading.
with fopen(config.tfidf_path, mode='rb') as tfidf_file:
    tfidf_vectorizer = joblib.load(tfidf_file)

In [None]:
# train_test_split is used purely as a random sampler: only the "test" halves
# (of size *_sample_size) are kept, the remainders are discarded into `_`.
(
    _,
    input_names_train_sample,
    _,
    weighted_actual_names_train_sample,
) = train_test_split(
    input_names_train,
    weighted_actual_names_train,
    test_size=train_sample_size,
)
# Candidates are not sampled; the full candidate list is used.
candidate_names_train_sample = candidate_names_train

(
    _,
    input_names_test_sample,
    _,
    weighted_actual_names_test_sample,
) = train_test_split(
    input_names_test,
    weighted_actual_names_test,
    test_size=test_sample_size,
)
candidate_names_test_sample = candidate_names_test

In [None]:
# Report the size of every sampled collection.
print(f"input_names_train_sample {len(input_names_train_sample)}")
print(f"weighted_actual_names_train_sample {len(weighted_actual_names_train_sample)}")
print(f"candidate_names_train_sample {len(candidate_names_train_sample)}")
print(f"input_names_test_sample {len(input_names_test_sample)}")
print(f"weighted_actual_names_test_sample {len(weighted_actual_names_test_sample)}")
print(f"candidate_names_test_sample {len(candidate_names_test_sample)}")

In [None]:
# free memory: only the *_sample names are used from here on
del (
    input_names_train,
    input_names_test,
    weighted_actual_names_train,
    weighted_actual_names_test,
    candidate_names_train,
    candidate_names_test,
)

## Generate ensemble training data

In [None]:
# Accumulators for the ensemble training set, filled by the cells that follow.
features, labels = [], []

### from train data

In [None]:
# Retrieve the top `num_matches` swivel candidates for every sampled train input name.
swivel_names_scores = get_best_swivel_matches(
    model=swivel_model,
    vocab=swivel_vocab,
    input_names=input_names_train_sample,
    candidate_names=candidate_names_train_sample,
    k=num_matches,
    batch_size=batch_size,
    add_context=True,
    n_jobs=1,
)

In [None]:
# Total number of (input, candidate) pairs returned across all input names.
print(sum(map(len, swivel_names_scores)))

In [None]:
# free memory: the swivel scores are computed, so the model and candidates can go
del swivel_model, candidate_names_train_sample

In [None]:
# generate features and labels
# zip() has no length, so tqdm is given the total explicitly to show progress and ETA.
for input_name, wans, swivels in tqdm(
    zip(
        input_names_train_sample,
        weighted_actual_names_train_sample,
        swivel_names_scores,
    ),
    total=len(input_names_train_sample),
):
    # names that are true matches for this input (first element of each triple)
    actual_names = {name for name, _, _ in wans}
    # generate features from swivel scores and frequency
    input_name_freq = name_freq.get(input_name, 0)
    input_name_unpadded = remove_padding(input_name)
    for candidate_name, swivel_score in swivels:
        candidate_name_freq = name_freq.get(candidate_name, 0)
        candidate_name_unpadded = remove_padding(candidate_name)
        # levenshtein similarity is computed on the unpadded strings
        lev_score = calc_lev_similarity(input_name_unpadded, candidate_name_unpadded)
        feature = featurize(
            swivel_score,
            lev_score,
            input_name_freq,
            candidate_name_freq,
            # NOTE(review): presumably flags a missing swivel score (the
            # levenshtein-sourced pairs pass True) — confirm against featurize
            False,
        )
        label = 1 if candidate_name in actual_names else 0
        features.append(feature)
        labels.append(label)

In [None]:
# free memory: the train-side samples have been turned into features
del input_names_train_sample, weighted_actual_names_train_sample, swivel_names_scores

### from test data

In [None]:
# test get_best_lev_matches on a handful of hand-picked padded names
smoke_input_names = np.array(["<richard>", "<dallan>", "<william>", "<john>"])
smoke_candidate_names = np.array([
    "<ricardo>",
    "<dallin>",
    "<richaard>",
    "<dalliin>",
    "<willem>",
    "<johnny>",
])
get_best_lev_matches(
    tfidf_vectorizer=tfidf_vectorizer,
    input_names=smoke_input_names,
    candidate_names=smoke_candidate_names,
    k=4,
    batch_size=2,
)

In [None]:
# Retrieve the top `num_matches` levenshtein candidates for every sampled test input name.
lev_names_scores = get_best_lev_matches(
    tfidf_vectorizer=tfidf_vectorizer,
    input_names=input_names_test_sample,
    candidate_names=candidate_names_test_sample,
    k=num_matches,
    batch_size=batch_size,
    n_jobs=1,
)

In [None]:
# free memory: the candidate list is not used again once lev_names_scores is computed
del candidate_names_test_sample

In [None]:
# generate features and labels
# zip() has no length, so tqdm is given the total explicitly to show progress and ETA.
for input_name, wans, levs in tqdm(
    zip(
        input_names_test_sample,
        weighted_actual_names_test_sample,
        lev_names_scores,
    ),
    total=len(input_names_test_sample),
):
    # names that are true matches for this input (first element of each triple)
    actual_names = {name for name, _, _ in wans}
    # generate features from lev scores and frequency
    input_name_freq = name_freq.get(input_name, 0)
    for candidate_name, lev_score in levs:
        candidate_name_freq = name_freq.get(candidate_name, 0)
        feature = featurize(
            0.0,  # no way to get swivel score
            lev_score,
            input_name_freq,
            candidate_name_freq,
            # NOTE(review): presumably flags that the swivel score is missing
            # — confirm against featurize
            True,
        )
        label = 1 if candidate_name in actual_names else 0
        features.append(feature)
        labels.append(label)

In [None]:
# free memory: the test-side samples have been turned into features
del input_names_test_sample, weighted_actual_names_test_sample, lev_names_scores

In [None]:
# Number of feature rows, number of labels, and number of positive labels.
print(len(features), len(labels), sum(labels), sep="\n")

### Downsample negatives

In [None]:
# Keep every positive example; keep each negative with probability neg_sample_rate.
# The random draw only happens for negatives (short-circuit `or`).
kept_pairs = [
    (feature_row, target)
    for feature_row, target in zip(features, labels)
    if target == 1 or random.random() <= neg_sample_rate
]
features_sample = [feature_row for feature_row, _ in kept_pairs]
labels_sample = [target for _, target in kept_pairs]
del kept_pairs

In [None]:
# Downsampled row count, label count, and number of positive labels.
print(len(features_sample), len(labels_sample), sum(labels_sample), sep="\n")

### Train model

In [None]:
# Fit the ensemble classifier on the downsampled data; fit() returns the estimator itself.
clf = LogisticRegression(max_iter=1000).fit(features_sample, labels_sample)

In [None]:
# Show the learned feature weights followed by the bias term.
print(clf.coef_, clf.intercept_, sep="\n")

### Save model

In [None]:
# Persist the trained classifier. Closing the handle explicitly ensures the write
# is flushed before the model is reloaded below.
with fopen(config.ensemble_model_path, mode='wb') as model_file:
    joblib.dump(clf, model_file)

In [None]:
# Reload the classifier from storage to verify that the saved artifact round-trips;
# the `with` block closes the handle after reading.
with fopen(config.ensemble_model_path, mode='rb') as model_file:
    clf = joblib.load(model_file)

In [None]:
# Weights and bias of the reloaded model, for comparison with the values printed after training.
print(clf.coef_, clf.intercept_, sep="\n")

### Evaluate the model on its own training data

In [None]:
# Score every generated pair (the full `features` list, not the negative-downsampled
# one); column 1 of predict_proba is taken as the probability of label 1.
predictions = clf.predict_proba(features)[:, 1]

In [None]:
# Compute and plot the precision-recall curve of the predictions against the labels.
precisions, recalls, thresholds = precision_recall_curve(labels, predictions)
disp = PrecisionRecallDisplay(precision=precisions, recall=recalls).plot()
plt.show()

In [None]:
# Area under the precision-recall curve (recall on the x axis).
pr_auc = auc(recalls, precisions)
print(pr_auc)

In [None]:
# Close the Weights & Biases run opened by wandb.init earlier in the notebook.
wandb.finish()