In [None]:
# Notebook setup: load the autoreload extension, set it to mode 2, and render
# matplotlib figures inline.
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
from collections import namedtuple
from datetime import datetime

import pandas as pd
import math
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
import torch
from tqdm import tqdm
import umap
import wandb

from src.data.filesystem import fopen
from src.data.utils import load_dataset
from src.eval import metrics
from src.models.swivel import SwivelDataset, SwivelModel, train_swivel, get_best_swivel_matches

In [None]:
# Config
#
# All tunable settings for one run live here. `given_surname` selects which
# name type is trained; several hyperparameters differ between the two types.

given_surname = "given"
_is_given = given_surname == "given"

vocab_size = 610000 if _is_given else 2100000
embed_dim = 100
n_epochs = 200
num_matches = 500

# Field order is part of the tuple's identity (it drives _asdict() ordering),
# so it is kept exactly as declared originally.
Config = namedtuple(
    "Config",
    [
        "train_path",
        "eval_path",
        "vocab_size",
        "embed_dim",
        "confidence_base",
        "confidence_scale",
        "confidence_exponent",
        "n_epochs",
        "submatrix_size",
        "lr",
        "vocab_path",
        "model_path",
    ],
)

config = Config(
    # data locations
    train_path=f"s3://familysearch-names/processed/tree-hr-{given_surname}-train-augmented.csv.gz",
    eval_path=f"s3://familysearch-names/processed/tree-hr-{given_surname}-train.csv.gz",
    # model size
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    # confidence-function parameters handed to SwivelModel
    confidence_base=0.18 if _is_given else 0.14,
    confidence_scale=0.5 if _is_given else 0.45,
    confidence_exponent=0.3,  # same value for given names and surnames
    # optimisation
    lr=0.14 if _is_given else 0.24,
    n_epochs=n_epochs,
    submatrix_size=4096,
    # output locations
    vocab_path=f"s3://nama-data/data/models/fs-{given_surname}-swivel-vocab-{vocab_size}-augmented.csv",
    model_path=f"s3://nama-data/data/models/fs-{given_surname}-swivel-model-{vocab_size}-{embed_dim}-augmented.pth",
)

In [None]:
# Release cached GPU memory and report the current CUDA memory situation.
# The device-specific queries are only made when CUDA is actually available:
# the notebook falls back to the CPU further down, so this cell must not
# raise on a machine without a GPU.
torch.cuda.empty_cache()
cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    print("cuda total", torch.cuda.get_device_properties(0).total_memory)
    print("cuda reserved", torch.cuda.memory_reserved(0))
    print("cuda allocated", torch.cuda.memory_allocated(0))

In [None]:
# Start a Weights & Biases run in the "nama" project. The run is grouped by
# the name type being trained and records every config field.
wandb.init(
    project="nama",
    entity="nama",
    name="61_swivel",
    group=given_surname,
    notes="umap",
    config=config._asdict()
)

### Load data

In [None]:
# Read the (augmented) training file and unpack the three values that
# load_dataset returns for it.
(
    input_names_train,
    weighted_actual_names_train,
    candidate_names_train,
) = load_dataset(config.train_path)

In [None]:
# keep only the most-frequent vocab_size names
# (disabled; select_frequent_k is not among the names imported in this
#  notebook's import cell, so it must be imported before re-enabling this)
# input_names_train, weighted_actual_names_train, candidate_names_train = \
#     select_frequent_k(input_names_train, 
#                       weighted_actual_names_train, 
#                       candidate_names_train,
#                       config.vocab_size)

In [None]:
# Size summary of the training data: number of input names, number of
# (input, variant) entries, the summed third field of every entry, number of
# candidate names, and the number of distinct names across inputs and candidates.
n_variant_entries = sum(map(len, weighted_actual_names_train))
n_pairs = sum(freq for wans in weighted_actual_names_train for _, _, freq in wans)
distinct_names = set(input_names_train) | set(candidate_names_train)

print("input_names_train", len(input_names_train))
print("weighted_actual_names_train", n_variant_entries)
print("total pairs", n_pairs)
print("candidate_names_train", len(candidate_names_train))
print("total names", len(distinct_names))

In [None]:
# Use the first GPU when CUDA is usable, otherwise fall back to the CPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(device)

In [None]:
# Build the swivel dataset from the training names and their weighted
# variants (symmetric=True), then fetch its vocabulary. The vocabulary is
# used below as a mapping whose items are (name, id) pairs.
dataset = SwivelDataset(input_names_train, weighted_actual_names_train, config.vocab_size, symmetric=True)
vocab = dataset.get_vocab()

In [None]:
# List the vocabulary names ordered by their id, so that position i in
# vocab_names holds the name with the i-th smallest id.
vocab_names = [name for name, _ in sorted(vocab.items(), key=lambda item: item[1])]
print(len(vocab_names))

In [None]:
# Create the swivel model, sized by len(vocab) and config.embed_dim, with the
# three confidence_* settings taken from the config.
model = SwivelModel(len(vocab), config.embed_dim, config.confidence_base, config.confidence_scale, config.confidence_exponent)

### Initialize vectors

In [None]:
# Turn every vocabulary name into a tf-idf vector of character n-grams
# (lengths 1..max_ngram, "char_wb" analyzer). Rows follow vocab_names order.
max_ngram = 3
min_df = 10
max_df = 0.8
tfidf_vectorizer = TfidfVectorizer(
    analyzer="char_wb",
    ngram_range=(1, max_ngram),
    min_df=min_df,
    max_df=max_df,
)
tfidf_X_train = tfidf_vectorizer.fit_transform(vocab_names)
print(tfidf_X_train.shape)

In [None]:
# Reduce the tf-idf matrix to config.embed_dim columns with UMAP (the
# TruncatedSVD alternative is left commented out).
# NOTE(review): no random_state is passed to UMAP, so the reduced vectors
# presumably differ between runs - confirm whether that is intended.
# NOTE: tfidf_X_train is overwritten in place; re-running only this cell
# feeds UMAP its own previous output instead of the tf-idf matrix.
# reducer = TruncatedSVD(n_components=config.embed_dim)
reducer = umap.UMAP(n_components=config.embed_dim)
tfidf_X_train = reducer.fit_transform(tfidf_X_train)
print(tfidf_X_train.shape)

In [None]:
# Rescale every column into the range [-sqrt(1/embed_dim), sqrt(1/embed_dim)].
# MinMaxScaler only maps each column's min/max onto that range; it does not
# make the values uniformly distributed.
scaler_max = math.sqrt(1 / config.embed_dim)
scaler = MinMaxScaler(feature_range=(-scaler_max, scaler_max))
tfidf_X_train = scaler.fit_transform(tfidf_X_train)

In [None]:
# init weights: hand the dataset's row sums, column sums and the scaled
# tf-idf/UMAP vectors to the model's init_params
model.init_params(dataset.get_row_sums(), dataset.get_col_sums(), tfidf_X_train)

### Train

In [None]:
import torch.optim as optim

# Passed straight through to train_swivel as n_steps.
# NOTE(review): 0 presumably means "one full pass over the dataset" - confirm
# against train_swivel.
n_steps_per_epoch = 0

model.to(device)
# One Adagrad optimizer is created up front and reused for every epoch, so
# its accumulated state carries over between epochs.
optimizer = optim.Adagrad(model.parameters(), lr=config.lr)

# Loss values from all epochs, in order; plotted in the Eval section.
all_loss_values = []
for e in tqdm(range(0, config.n_epochs)):
    print("Epoch", e, datetime.now())
    loss_values = train_swivel(model, dataset, n_steps=n_steps_per_epoch,
                     submatrix_size=config.submatrix_size,
                     lr=config.lr, device=device, optimizer=optimizer)
    all_loss_values.extend(loss_values)
    # Checkpoint after every epoch. The handle is closed explicitly so the
    # write is finalized and handles do not pile up over n_epochs iterations.
    # Assumes fopen returns a file-like object usable as a context manager - confirm.
    with fopen(f"{config.model_path}.{e}", "wb") as checkpoint_file:
        torch.save(model.state_dict(), checkpoint_file)

#### Save vocab

In [None]:
# Write the vocabulary as a two-column CSV (name, index). The handle is
# closed by the with-block so the write is finalized before moving on.
# Assumes fopen returns a file-like object usable as a context manager - confirm.
vocab_df = pd.DataFrame(vocab.items(), columns=["name", "index"])
with fopen(config.vocab_path, "wb") as vocab_file:
    vocab_df.to_csv(vocab_file, index=False)

#### Save model

In [None]:
# Save the final model weights. The handle is closed by the with-block so
# the write is finalized before the notebook reports success.
# Assumes fopen returns a file-like object usable as a context manager - confirm.
with fopen(config.model_path, "wb") as model_file:
    torch.save(model.state_dict(), model_file)

In [None]:
# Marker that both save cells above have run.
print("Vocab and model saved")

#### Reload model

In [None]:
# Rebuild vocab and model from the saved artifacts so evaluation can run
# without retraining.
# Assumes fopen returns a file-like object usable as a context manager - confirm.
with fopen(config.vocab_path, "rb") as vocab_file:
    vocab_df = pd.read_csv(vocab_file)
vocab = dict(zip(vocab_df["name"], vocab_df["index"]))

# NOTE(review): the model is rebuilt without the confidence_* arguments used
# at training time; presumably they only affect training - confirm.
model = SwivelModel(len(vocab), config.embed_dim)
with fopen(config.model_path, "rb") as model_file:
    # map_location="cpu" lets weights saved from a GPU run load on a machine
    # without CUDA; load_state_dict then copies them into the freshly built model.
    model.load_state_dict(torch.load(model_file, map_location="cpu"))
model.eval()

### Eval

In [None]:
# Plot every 1000th recorded training loss with the y-axis limited to [0, 1].
# Needs all_loss_values from the training cell (it is not restored by the
# reload cell).
ax = plt.gca()
ax.set_ylim([0, 1.0])
plt.plot(all_loss_values[::1000])

### PR Curve

In [None]:
# get best matches
# NOTE: only considers as potential matches names in candidate_names_train, not names in input_names_train
eval_batch_size = 1024
add_context = True
n_jobs=1
# Evaluate on every 10th training name; the two sample lists stay aligned
# because both are sliced with the same stride.
input_names_sample = input_names_train[::10]
weighted_actual_names_sample = weighted_actual_names_train[::10]
# The number of matches and the batch size are both passed positionally.
# The original call placed the positional eval_batch_size after k=..., which
# Python rejects as a SyntaxError. Assumes they are the 5th and 6th
# positional parameters of get_best_swivel_matches, as the original argument
# order implies - confirm against its signature.
best_matches = get_best_swivel_matches(model,
                                       vocab,
                                       input_names_sample,
                                       candidate_names_train,
                                       num_matches,
                                       eval_batch_size,
                                       add_context=add_context,
                                       n_jobs=n_jobs)

In [None]:
# Precision / weighted-recall curve for the training sample, sweeping the
# threshold from 0.01 in steps of 0.05.
# NOTE: max_threshold is 2.0 here, while the other curve and AUC cells in
# this notebook use 1.0.
metrics.precision_weighted_recall_curve_at_threshold(
    weighted_actual_names_sample, best_matches, min_threshold=0.01, max_threshold=2.0, step=0.05, distances=False
)

In [None]:
# AUC for the training sample over thresholds 0.01..1.0 (step 0.05).
metrics.get_auc(
    weighted_actual_names_sample, best_matches, min_threshold=0.01, max_threshold=1.0, step=0.05, distances=False
)

### Eval on original (unaugmented) data

In [None]:
# Read the original (unaugmented) file in eval mode and unpack the three
# values that load_dataset returns for it.
(
    input_names_eval,
    weighted_actual_names_eval,
    candidate_names_eval,
) = load_dataset(config.eval_path, is_eval=True)

In [None]:
# make sure all the names are in the model:
# for the eval inputs and then the eval candidates, report the first name
# (if any) that has no entry in vocab.
_no_missing_name = object()
for names_to_check in (input_names_eval, candidate_names_eval):
    first_missing = next(
        (name for name in names_to_check if name not in vocab),
        _no_missing_name,
    )
    if first_missing is not _no_missing_name:
        print("name missing", first_missing)

In [None]:
# get best matches
# NOTE: only considers as potential matches names in candidate_names_eval, not names in input_names_eval
eval_batch_size = 1024
add_context = True
n_jobs=1
# Evaluate on every 10th eval name; best_matches rows therefore line up with
# input_names_sample / weighted_actual_names_sample, not with the full eval lists.
input_names_sample = input_names_eval[::10]
weighted_actual_names_sample = weighted_actual_names_eval[::10]
# The number of matches and the batch size are both passed positionally.
# The original call placed the positional eval_batch_size after k=..., which
# Python rejects as a SyntaxError. Assumes they are the 5th and 6th
# positional parameters of get_best_swivel_matches, as the original argument
# order implies - confirm against its signature.
best_matches = get_best_swivel_matches(model,
                                       vocab,
                                       input_names_sample,
                                       candidate_names_eval,
                                       num_matches,
                                       eval_batch_size,
                                       add_context=add_context,
                                       n_jobs=n_jobs)

### PR Curve

In [None]:
# Pick one position in the eval sample to inspect in detail and show its name.
pos = 120
input_names_sample[pos]
# for ix, name in enumerate(input_names_sample):
#     print(ix, name)

In [None]:
# Cut length-1 slices (not scalars) at `pos` so the single example keeps the
# same container shape as the full sample and can be passed to the curve
# function below.
input_names_test = input_names_sample[pos:pos+1]
weighted_actual_names_test = weighted_actual_names_sample[pos:pos+1]
print(weighted_actual_names_test)
best_matches_test = best_matches[pos:pos+1]
print(best_matches_test)

In [None]:
# Precision for the single example at threshold 0.7725.
# NOTE(review): the trailing False is presumably the same `distances` flag
# the curve cells pass by keyword - confirm.
metrics.precision_at_threshold(weighted_actual_names_test[0], best_matches_test[0], 0.7725, False)

In [None]:
# Weighted recall for the single example at the same 0.7725 threshold.
# NOTE(review): the trailing False is presumably the same `distances` flag
# the curve cells pass by keyword - confirm.
metrics.weighted_recall_at_threshold(weighted_actual_names_test[0], best_matches_test[0], 0.7725, False)

In [None]:
# Precision / weighted-recall curve for the single inspected example.
metrics.precision_weighted_recall_curve_at_threshold(
    weighted_actual_names_test, best_matches_test, min_threshold=0.01, max_threshold=1.0, step=0.05, distances=False
)

In [None]:
# Precision / weighted-recall curve for the whole eval sample.
metrics.precision_weighted_recall_curve_at_threshold(
    weighted_actual_names_sample, best_matches, min_threshold=0.01, max_threshold=1.0, step=0.05, distances=False
)

In [None]:
# AUC for the whole eval sample over thresholds 0.01..1.0 (step 0.05).
metrics.get_auc(
    weighted_actual_names_sample, best_matches, min_threshold=0.01, max_threshold=1.0, step=0.05, distances=False
)

In [None]:
# Close the Weights & Biases run opened near the top of the notebook.
wandb.finish()

## Review

In [None]:
# Spot-check a spread of eval names: which of their known variants score
# above the threshold and which are missed.
threshold = 0.6
# best_matches was computed for input_names_sample (every 10th eval name), so
# its rows line up with input_names_sample / weighted_actual_names_sample.
# Indexing the full eval lists with the same i would pair each row with the
# wrong name. Sample positions 10000..39000 (step 1000) correspond to eval
# positions 100000..390000 (step 10000); the upper bound is clipped so the
# loop never runs past the rows that exist.
for i in range(10000, min(40000, len(best_matches)), 1000):
    print(i, input_names_sample[i])
    row_matches = best_matches[i]
    # column 1 holds the score, column 0 the matched name
    matches_above_threshold = row_matches[row_matches[:, 1] > threshold]
    matched_names = set(matches_above_threshold[:, 0])
    matched_wans = []
    unmatched_wans = []
    for wan in weighted_actual_names_sample[i]:
        if wan[0] in matched_names:
            matched_wans.append(wan)
        elif wan[2] > 0:
            # only variants with a positive third field count as misses
            unmatched_wans.append(wan)
    print("  matched wans", matched_wans)
    print("  unmatched wans", unmatched_wans)
    print("  matches above threshold", len(matches_above_threshold), matches_above_threshold)

In [None]:
# Row to inspect in the following cells.
# NOTE(review): best_matches was built from input_names_eval[::10], so it has
# roughly a tenth as many rows as the eval list - confirm 390000 is in range.
i = 390000

In [None]:
# Show the matches stored for row i.
best_matches[i]

In [None]:
# Training variants of the name whose matches are shown in best_matches[i].
# Rows of best_matches follow input_names_sample (every 10th eval name), so
# the name is looked up there rather than in the full input_names_eval list.
weighted_actual_names_train[input_names_train.index(input_names_sample[i])]

In [None]:
# Name to look up in the next two cells (written with the <...> delimiters
# used by the names in this notebook).
variant = "<shirley>"

In [None]:
# Weighted variants recorded for `variant` in the training data
# (list.index raises ValueError if the name is not an input name).
weighted_actual_names_train[input_names_train.index(variant)]

In [None]:
# Weighted variants recorded for `variant` in the eval data
# (list.index raises ValueError if the name is not an input name).
weighted_actual_names_eval[input_names_eval.index(variant)]

### Test

In [None]:
import numpy as np
from src.models.swivel import get_swivel_embeddings
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

In [None]:
# demo names
input_names_train = ["<john>", "<mary>"]
weighted_actual_names_train = [
    [("<johnny>", 0.2, 20), ("<jonathan>", 0.5, 50), ("<jon>", 0.3, 30)],
    [("<marie>", 0.7, 70), ("<maria>", 0.3, 30)],
    # [("<johnny>", 0.2, 20), ("<jonathan>", 0.5, 50), ("<jon>", 0.3, 30), ("<mary>", 0.0, 0.5)],
    # [("<marie>", 0.7, 70), ("<maria>", 0.3, 30), ("<john>", 0.0, 0.5)],
    # [("<johnny>", 0.2, 20), ("<jonathan>", 0.5, 50), ("<jon>", 0.3, 30), ("<mary>", 0.0, 1), ("<maria>", 0.0, 1), ("<marie>", 0.0, 1)],
    # [("<marie>", 0.7, 70), ("<maria>", 0.3, 30), ("<john>", 0.0, 1), ("<johnny>", 0.0, 1), ("<jonathan>", 0.0, 1), ("<jon>", 0.0, 1)],
]
candidate_names_train = np.array(["<johnny>", "<jonathan>", "<marie>", "<maria>", "<jon>"])

In [None]:
# Use the first GPU when CUDA is usable, otherwise fall back to the CPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(device)

In [None]:
# Rebuild the dataset and vocab from the demo names. config.vocab_size is
# still the full-run value from the config cell.
symmetric = True
dataset = SwivelDataset(input_names_train, weighted_actual_names_train, config.vocab_size, symmetric=symmetric)
vocab = dataset.get_vocab()
print(vocab)

In [None]:
# Inspect the dataset's private _sparse_cooc attribute (debugging aid).
print(dataset._sparse_cooc)

In [None]:
# List the demo vocabulary names ordered by their id, so that position i in
# vocab_names holds the name with the i-th smallest id.
vocab_names = [name for name, _ in sorted(vocab.items(), key=lambda item: item[1])]
print(vocab_names)

In [None]:
# create vectors with tfidf values
# The demo relaxes the n-gram and document-frequency limits; the trailing
# comments record alternative values.
max_ngram = 5  # 3
min_df = 1  # 10
max_df = 1.0  # 0.5
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, max_ngram), analyzer="char_wb", min_df=min_df, max_df=max_df)
tfidf_X_train = tfidf_vectorizer.fit_transform(vocab_names)
print(tfidf_X_train.shape)

In [None]:
# Two dimensions for the demo; the scatter plot further down unpacks each
# embedding into an (x, y) pair. This rebinds the module-level embed_dim only;
# config.embed_dim is unchanged.
embed_dim = 2

In [None]:
# reduce tfidf values to embed_dim
# TruncatedSVD is imported here because no import cell in this notebook
# brings it into scope; without this the cell raises NameError on a fresh kernel.
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=embed_dim)
tfidf_X_train = svd.fit_transform(tfidf_X_train)
tfidf_X_train.shape

In [None]:
# create swivel model for the demo: sized by the demo vocab and the demo
# embed_dim, with the confidence_* settings still taken from config
model = SwivelModel(len(vocab), embed_dim, config.confidence_base, config.confidence_scale, config.confidence_exponent)

In [None]:
# init weights to tfidf values: pass the demo dataset's row sums, column sums
# and the reduced tf-idf vectors to init_params
# model.init_params(dataset.get_row_sums(), dataset.get_col_sums(), tfidf_X_train)
model.init_params(dataset.get_row_sums(), dataset.get_col_sums(), tfidf_X_train)

In [None]:
# Short demo training run: 10 steps on the toy dataset.
# Unlike the main training loop, no optimizer is passed here and the model is
# not moved to `device` in this cell before the call.
# device="cpu"
n_steps = 10
submatrix_size = 64
learning_rate = 0.05
loss_values = train_swivel(model, dataset, n_steps=n_steps, submatrix_size=submatrix_size, lr=learning_rate, device=device)

In [None]:
# Plot the demo loss values (y-axis limit left automatic).
ax = plt.gca()
# ax.set_ylim([0, 0.1])
plt.plot(loss_values)

In [None]:
# Embed every demo name. all_names lists the input names first, followed by
# the candidate names, and all_embeddings is requested for that same list.
k = 10
add_context = True

all_names = np.array(input_names_train + candidate_names_train.tolist())
all_embeddings = get_swivel_embeddings(model, vocab, all_names, add_context=add_context)

In [None]:
# Show the name order used for all_embeddings.
print(all_names)

In [None]:
# Query name for the similarity experiments below.
# demo_name_pos is the position of demo_name in all_names: '<john>' is the
# first input name, and input names come first in all_names.
demo_name = '<john>'
demo_name_pos = 0
demo_embeddings = get_swivel_embeddings(model, vocab, [demo_name], add_context=add_context)

In [None]:
# try cosine similarity
# (two normalized variants are kept below, commented out, for reference)
# totals = all_embeddings.sum(axis=0)
# all_embeddings_norm = all_embeddings / totals
# demo_embeddings_norm = all_embeddings_norm[[demo_name_pos]]
# scores = cosine_similarity(demo_embeddings_norm, all_embeddings_norm)
# ixs = np.argsort(-scores)[:, :k]
# sorted_scores = scores[:, ixs[0]]
# sorted_names = all_names[ixs[0]]
# best_matches = np.dstack((sorted_names, sorted_scores))
# print("cosine_norm_0", best_matches)

# totals = demo_embeddings.sum(axis=1)
# demo_embeddings_norm = demo_embeddings / totals[:, np.newaxis]
# totals = all_embeddings.sum(axis=1)
# all_embeddings_norm = all_embeddings / totals[:, np.newaxis]
# scores = cosine_similarity(demo_embeddings_norm, all_embeddings_norm)
# ixs = np.argsort(-scores)[:, :k]
# sorted_scores = scores[:, ixs[0]]
# sorted_names = all_names[ixs[0]]
# best_matches = np.dstack((sorted_names, sorted_scores))
# print("cosine_norm_1", best_matches)

# Raw cosine similarity between the demo name and every name.
scores = cosine_similarity(demo_embeddings, all_embeddings)
# negate so argsort orders by descending similarity; keep the first k columns
ixs = np.argsort(-scores)[:, :k]
# only one query row exists, so its index row ixs[0] selects the columns
sorted_scores = scores[:, ixs[0]]
sorted_names = all_names[ixs[0]]
# stack names and scores depth-wise so each entry is a (name, score) pair
# NOTE: rebinds best_matches, replacing the eval results computed earlier
best_matches = np.dstack((sorted_names, sorted_scores))
print("cosine", best_matches)

In [None]:
# try euclidean similarity
# Variant 1: divide every embedding column by its sum over all names, then
# measure distances in that normalized space.
totals = all_embeddings.sum(axis=0)
all_embeddings_norm = all_embeddings / totals
# the demo name's row is taken from the normalized matrix by position
demo_embeddings_norm = all_embeddings_norm[[demo_name_pos]]
scores = euclidean_distances(demo_embeddings_norm, all_embeddings_norm)
# ascending argsort: smaller distance means a closer match
ixs = np.argsort(scores)[:, :k]
sorted_scores = scores[:, ixs[0]]
sorted_names = all_names[ixs[0]]
best_matches = np.dstack((sorted_names, sorted_scores))
print("euclidean_norm_0", best_matches)

# totals = demo_embeddings.sum(axis=1)
# demo_embeddings_norm = demo_embeddings / totals[:, np.newaxis]
# totals = all_embeddings.sum(axis=1)
# all_embeddings_norm = all_embeddings / totals[:, np.newaxis]
# scores = euclidean_distances(demo_embeddings_norm, all_embeddings_norm)
# ixs = np.argsort(scores)[:, :k]
# sorted_scores = scores[:, ixs[0]]
# sorted_names = all_names[ixs[0]]
# best_matches = np.dstack((sorted_names, sorted_scores))
# print("euclidean_norm_1", best_matches)

# Variant 2: distances between the raw (unnormalized) embeddings.
scores = euclidean_distances(demo_embeddings, all_embeddings)
ixs = np.argsort(scores)[:, :k]
sorted_scores = scores[:, ixs[0]]
sorted_names = all_names[ixs[0]]
best_matches = np.dstack((sorted_names, sorted_scores))
print("euclidean", best_matches)

In [None]:
# plot embeddings
# Each embedding row is unpacked into exactly two values, so this only works
# for 2-dimensional embeddings (embed_dim was set to 2 for the demo).
xs = list(x for x, _ in all_embeddings)
ys = list(y for _, y in all_embeddings)
plt.scatter(xs, ys)
# label every point with its name, offset slightly from the marker
for ix, name in enumerate(all_names):
    plt.annotate(name, xy=(xs[ix], ys[ix]), xytext=(5, 2),
                 textcoords='offset points', ha='right', va='bottom')

In [None]:
# Toy data for working through the top-k selection logic step by step:
# three labelled source vectors and three query rows with identical values.
source_names = np.array(["tom", "dick", "harry"])
source_names_X = np.arange(1, 10).reshape(3, 3)
# separate array with the same contents, not an alias of source_names_X
rows = source_names_X.copy()

In [None]:
# Cosine similarity of every query row against every source vector.
scores = cosine_similarity(rows, source_names_X)
scores

In [None]:
# Column indices of each row ordered from highest to lowest score:
# argsort ascending along axis 1, then reverse the column order.
sorted_scores_idx = np.argsort(scores, axis=1)[:, ::-1]
sorted_scores_idx

In [None]:
# Keep the indices of the two highest-scoring sources per row.
sorted_scores_idx = sorted_scores_idx[:, :2]
sorted_scores_idx

In [None]:
# Gather the scores that belong to those indices, row by row.
sorted_scores = np.take_along_axis(scores, sorted_scores_idx, axis=1)
sorted_scores

In [None]:
# Look up the source vectors for the kept indices: indexing with the
# 2-D index array yields one group of selected vectors per query row.
sorted_source_names_X = source_names_X[sorted_scores_idx]
sorted_source_names_X

In [None]:
# Zero the score wherever a selected source vector is identical to the query
# row itself, so a row cannot be returned as its own best match.
# The loop variables use their own names: the original reused
# `source_names_X` as a loop target, which overwrote the global source matrix
# and broke any later re-run of the cells that index it.
for i, (row, candidate_rows) in enumerate(zip(rows, sorted_source_names_X)):
    for j, candidate_row in enumerate(candidate_rows):
        if np.array_equal(row, candidate_row):
            sorted_scores[i, j] = 0
sorted_scores

In [None]:
# Re-rank the kept scores after masking (ascending order at this point).
re_sorted_scores_idx = np.argsort(sorted_scores, axis=1)
re_sorted_scores_idx

In [None]:
# Reverse the column order so the highest remaining score comes first.
re_sorted_scores_idx = np.flip(re_sorted_scores_idx, axis=1)
re_sorted_scores_idx

In [None]:
# Keep only the best remaining position per row.
re_sorted_scores_idx = re_sorted_scores_idx[:, :1]
re_sorted_scores_idx

In [None]:
# Score of the best remaining match for every row.
sorted_scores = np.take_along_axis(sorted_scores, re_sorted_scores_idx, axis=1)
sorted_scores

In [None]:
# Map the re-ranked positions back to indices into the source arrays.
sorted_scores_idx = np.take_along_axis(sorted_scores_idx, re_sorted_scores_idx, axis=1)
sorted_scores_idx

In [None]:
# Names of the best remaining match for every row.
sorted_source_names = source_names[sorted_scores_idx]
sorted_source_names