In [None]:
# Reload edited project modules automatically before each cell runs.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
from collections import namedtuple
import math

import jellyfish
import joblib
import matplotlib.pyplot as plt
from mpire import WorkerPool
import numpy as np
import pandas as pd
import random
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc, PrecisionRecallDisplay, precision_recall_curve
from sklearn.model_selection import train_test_split
import torch
from tqdm import tqdm
import wandb

from src.data.filesystem import fopen
from src.data.utils import load_dataset
from src.eval.utils import similars_to_ndarray
from src.models.ensemble import featurize
from src.models.swivel import SwivelModel, get_best_swivel_matches
from src.models.utils import add_padding, remove_padding

In [None]:
# config

given_surname = "given"
# vocabulary size is 610000 for "given", 2100000 for any other value
vocab_size = {"given": 610000}.get(given_surname, 2100000)
sample_size = 100000  # number of training inputs sampled for the ensemble
embed_dim = 50  # 100
encoder_layers = 2
num_matches = 5000  # k passed to get_best_swivel_matches
batch_size = 256
swivel_threshold = 0.45  # minimum swivel score for a candidate to be kept
lev_threshold = 0.55  # minimum levenshtein similarity for a candidate to be kept

# immutable bundle of the paths and model dimension used by this notebook
Config = namedtuple(
    "Config",
    (
        "train_path",
        "freq_path",
        "embed_dim",
        "swivel_vocab_path",
        "swivel_model_path",
        "ensemble_model_path",
    ),
)
config = Config(
    # training and frequency data
    train_path=f"s3://familysearch-names/processed/tree-hr-{given_surname}-train.csv.gz",
    freq_path=f"s3://familysearch-names/processed/tree-preferred-{given_surname}-aggr.csv.gz",
    embed_dim=embed_dim,
    # model artifacts
    swivel_vocab_path=f"s3://nama-data/data/models/fs-{given_surname}-swivel-vocab-{vocab_size}-augmented.csv",
    swivel_model_path=f"s3://nama-data/data/models/fs-{given_surname}-swivel-model-{vocab_size}-{embed_dim}-augmented.pth",
    ensemble_model_path=f"s3://nama-data/data/models/fs-{given_surname}-ensemble-model-{vocab_size}-{embed_dim}-augmented.pth",
)

In [None]:
# Optional experiment tracking, currently disabled: uncomment to log this run to wandb.
# wandb.init(
#     project="nama",
#     entity="nama",
#     name="65_ensemble",
#     group=given_surname,
#     notes="",
#     config=config._asdict(),
# )

In [None]:
# Use the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
# torch.cuda.get_device_properties raises when no CUDA device is present, so the
# memory report is only printed when the notebook is actually running on a GPU.
if device.type == "cuda":
    print("cuda total", torch.cuda.get_device_properties(device).total_memory)
    print("cuda reserved", torch.cuda.memory_reserved(device))
    print("cuda allocated", torch.cuda.memory_allocated(device))

## Load data

In [None]:
# load the training split: input names, their weighted actual names, and the candidate names
(
    input_names_train,
    weighted_actual_names_train,
    candidate_names_train,
) = load_dataset(config.train_path)

In [None]:
# na_filter=False keeps every name as a string instead of converting some to NaN
freq_df = pd.read_csv(config.freq_path, na_filter=False)
# padded name -> frequency
name_freq = dict(zip(map(add_padding, freq_df["name"]), freq_df["frequency"]))

In [None]:
# spot-check the frequency lookup for two names (0 when a name is missing)
print(name_freq.get("<john>", 0), name_freq.get("<dallan>", 0), sep="\n")

In [None]:
# Read the swivel vocabulary; the `with` block closes the file handle as soon as
# the CSV has been parsed instead of leaving it open for the rest of the session.
# NOTE(review): assumes fopen returns a regular file-like object (context manager).
with fopen(config.swivel_vocab_path, "rb") as vocab_file:
    vocab_df = pd.read_csv(vocab_file)
# name -> value of the "index" column (presumably the name's row in the swivel embeddings)
swivel_vocab = dict(zip(vocab_df["name"], vocab_df["index"]))

In [None]:
swivel_model = SwivelModel(len(swivel_vocab), config.embed_dim)
# Load the trained weights directly onto `device`. The handle is closed once the
# state dict has been read rather than being left open.
# NOTE(review): assumes fopen returns a regular file-like object (context manager).
with fopen(config.swivel_model_path, "rb") as model_file:
    swivel_state = torch.load(model_file, map_location=device)
swivel_model.load_state_dict(swivel_state)
swivel_model.to(device)
# the model is only used for scoring here, so switch it to evaluation mode
swivel_model.eval()

In [None]:
# Draw `sample_size` (input name, weighted actual names) pairs: the "test" side
# of the split is kept as the sample and the "train" side is discarded.
# A fixed seed makes the sample - and therefore the fitted ensemble - repeatable.
sample_seed = 42
_, input_names_train_sample, _, weighted_actual_names_train_sample = train_test_split(
    input_names_train,
    weighted_actual_names_train,
    test_size=sample_size,
    random_state=sample_seed,
)
# the candidate names are not sampled
candidate_names_train_sample = candidate_names_train

### Generate training data

In [None]:
def calc_similarity_to(name):
    """Build a scorer that compares candidate names against ``name``.

    Padding is removed from ``name`` once, up front. The returned callable takes
    a row whose first element is a padded candidate name and returns
    ``1 - levenshtein_distance / max(len(name), len(candidate))``, so identical
    names score 1.0.

    Two names that are both empty after removing padding are identical and score
    1.0 instead of raising ZeroDivisionError.
    """
    name = remove_padding(name)

    def calc_similarity(row):
        cand_name = remove_padding(row[0])
        longest = max(len(name), len(cand_name))
        if longest == 0:
            # both names are empty: nothing to edit, and nothing to divide by
            return 1.0
        dist = jellyfish.levenshtein_distance(name, cand_name)
        return 1 - (dist / longest)

    return calc_similarity

In [None]:
def get_similars_for_name(name, candidate_names):
    """Rank ``candidate_names`` by levenshtein similarity to ``name``.

    :param name: padded input name
    :param candidate_names: 1-d array (or other sequence) of padded candidate names
    :return: list of ``(candidate_name, score)`` tuples ordered from the highest
        score to the lowest; an empty list when there are no candidates
    """
    candidate_names = np.asarray(candidate_names)
    # np.apply_along_axis raises ValueError when there are no rows to iterate
    # over, and with no candidates there is nothing to rank anyway
    if candidate_names.size == 0:
        return []

    # score every candidate; the extra axis turns each name into a one-element row
    scores = np.apply_along_axis(calc_similarity_to(name), 1, candidate_names[:, None])

    # argsort is ascending, so reverse it to put the best match first
    sorted_scores_idx = np.argsort(scores)[::-1]
    ranked_names = candidate_names[sorted_scores_idx]
    ranked_scores = scores[sorted_scores_idx]

    return list(zip(ranked_names, ranked_scores))

In [None]:
# score the sampled input names against the candidate names with the swivel
# model alone (no encoder model), requesting k=num_matches matches per name
swivel_names_scores = get_best_swivel_matches(
    model=swivel_model,
    vocab=swivel_vocab,
    input_names=input_names_train_sample,
    candidate_names=candidate_names_train_sample,
    encoder_model=None,
    k=num_matches,
    batch_size=batch_size,
    add_context=True,
    n_jobs=1,
)

In [None]:
# total number of (name, score) matches across all sampled input names
print(sum(map(len, swivel_names_scores)))

In [None]:
# generate training data: one (feature, label) pair for every candidate that
# passes both the swivel threshold and the levenshtein threshold
features = []
labels = []
all_candidate_names = set(candidate_names_train_sample)
for input_name, wans, swivels in tqdm(
    zip(input_names_train_sample, weighted_actual_names_train_sample, swivel_names_scores),
    # zip has no length, so give tqdm the total to get a real progress bar
    total=len(input_names_train_sample),
):
    # actuals - ensure names are in all_candidate_names
    actual_names = set(name for name, _, _ in wans if name in all_candidate_names)
    # swivel
    swivel_scores = {name: score for name, score in swivels if score >= swivel_threshold}
    if not swivel_scores:
        # No candidate passed the swivel threshold. Scoring an empty array would
        # make np.apply_along_axis (inside get_similars_for_name) raise ValueError.
        continue
    # levenshtein - only computed for the names that passed the swivel threshold
    lev_scores = {
        name: score
        for name, score in get_similars_for_name(input_name, np.array(list(swivel_scores)))
        if score >= lev_threshold
    }

    # generate features from swivel and levenshtein scores and frequency
    input_name_freq = name_freq.get(input_name, 0)
    # every key of lev_scores is also a key of swivel_scores, so lev_scores
    # already holds exactly the candidates that passed both thresholds
    for candidate_name, lev_score in lev_scores.items():
        swivel_score = swivel_scores[candidate_name]
        candidate_name_freq = name_freq.get(candidate_name, 0)
        feature = featurize(swivel_score, lev_score, input_name_freq, candidate_name_freq)
        # positive example when the candidate is one of the input name's actual names
        label = 1 if candidate_name in actual_names else 0
        features.append(feature)
        labels.append(label)

In [None]:
# number of examples, number of labels, number of positive labels
print(len(features), len(labels), sum(labels), sep="\n")

#### Downsample negatives

In [None]:
neg_sample_rate = 1.0
# keep every positive example; keep each negative with probability neg_sample_rate
sampled_pairs = [
    (feature, label)
    for feature, label in zip(features, labels)
    if label == 1 or random.random() <= neg_sample_rate
]
features_sample = [feature for feature, _ in sampled_pairs]
labels_sample = [label for _, label in sampled_pairs]

In [None]:
# number of sampled examples, sampled labels, and sampled positive labels
print(len(features_sample), len(labels_sample), sum(labels_sample), sep="\n")

### Train model

In [None]:
# fit a logistic regression with default settings on the (downsampled) training examples
clf = LogisticRegression()
clf.fit(X=features_sample, y=labels_sample)

In [None]:
# learned feature weights, then the bias term
print(clf.coef_, clf.intercept_, sep="\n")

### Evaluate model on its own training data

In [None]:
# predicted probability of the second class (column 1) for every training example
class_probabilities = clf.predict_proba(features)
predictions = class_probabilities[:, 1]

In [None]:
# precision and recall of the predictions at each score threshold
pr_curve = precision_recall_curve(labels, predictions)
precisions, recalls, thresholds = pr_curve
# plot the precision-recall curve
disp = PrecisionRecallDisplay(recall=recalls, precision=precisions)
disp.plot()
plt.show()

In [None]:
# area under the precision-recall curve
pr_auc = auc(recalls, precisions)
print(pr_auc)

### Save ensemble model

In [None]:
# Write inside a `with` block so the handle is closed as soon as the model is
# dumped; buffered writers generally only flush their remaining data on close,
# and the next cell reads this file straight back.
# NOTE(review): assumes fopen returns a regular file-like object (context manager).
with fopen(config.ensemble_model_path, mode='wb') as ensemble_file:
    joblib.dump(clf, ensemble_file)

In [None]:
# Read the saved model back to confirm it round-trips; the handle is closed once
# the model has been loaded.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code - only
# load model files produced by a trusted run.
# NOTE(review): assumes fopen returns a regular file-like object (context manager).
with fopen(config.ensemble_model_path, mode='rb') as ensemble_file:
    clf = joblib.load(ensemble_file)

In [None]:
# weights and bias of the reloaded model, for comparison with the values printed after training
print(clf.coef_, clf.intercept_, sep="\n")