In [None]:
# Load the autoreload extension and reload all modules before each cell runs,
# so edits to the imported project code are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

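# Generate a GloVe model

Train GloVe name embeddings whose weights are initialized from SVD-reduced character n-gram TF-IDF vectors, then save and evaluate the model.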

In [None]:
from collections import namedtuple
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import torch
import wandb

from src.data.filesystem import fopen
from src.data.utils import load_train_test
from src.eval import metrics
from src.models.glove import GloveDataset, GloveModel, train_glove, get_best_glove_matches

In [None]:
# Config

# Dataset selectors; both are interpolated into every path below.
given_surname = "given"
size = "freq"
# Model hyperparameters.
vocab_size = 500000
embed_dim = 200

Config = namedtuple("Config", "train_path vocab_size embed_dim glove_vocab_path glove_model_path")

# Gather the settings by field name first, then freeze them in the namedtuple.
config_values = {
    "train_path": f"s3://familysearch-names/processed/tree-hr-{given_surname}-similar-train-{size}.csv.gz",
    "vocab_size": vocab_size,
    "embed_dim": embed_dim,
    "glove_vocab_path": f"s3://nama-data/data/models/fs-{given_surname}-{size}-glove-{vocab_size}-vocab-tfidf.csv",
    "glove_model_path": f"s3://nama-data/data/models/fs-{given_surname}-{size}-glove-{vocab_size}-{embed_dim}-tfidf.pt",
}
config = Config(**config_values)

In [None]:
# Start a Weights & Biases run and record the notebook configuration with it.
run_settings = dict(
    project="nama",
    entity="nama",
    name="52_glove",
    group=given_surname,
    notes="",
    config=config._asdict(),
)
wandb.init(**run_settings)

### Load data

In [None]:
# One path is passed in, so exactly one dataset is unpacked from the result.
(train,) = load_train_test([config.train_path])

In [None]:
# Split the loaded training set into its three parts.
(
    input_names_train,
    weighted_actual_names_train,
    candidate_names_train,
) = train

In [None]:
# Report the size of each part of the training set; the middle figure counts
# every entry across all per-input lists.
train_sizes = (
    ("input_names_train", len(input_names_train)),
    ("weighted_actual_names_train", sum(map(len, weighted_actual_names_train))),
    ("candidate_names_train", len(candidate_names_train)),
)
for label, count in train_sizes:
    print(label, count)

In [None]:
# Use the first GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# Build the GloVe dataset from the training pairs and take its vocabulary.
symmetric = True
dataset = GloveDataset(
    input_names_train,
    weighted_actual_names_train,
    config.vocab_size,
    device=device,
    symmetric=symmetric,
)
vocab = dataset.get_vocab()

In [None]:
# get vocab names in order by id
# (vocab maps name -> id, so sorting the names by their id gives one name per row)
vocab_names = sorted(vocab.keys(), key=lambda vocab_name: vocab[vocab_name])
print(len(vocab_names))

In [None]:
# create vectors with tfidf values
# Character n-grams (1..max_ngram, word-boundary aware) over the vocabulary names.
max_ngram = 4
min_df = 10
max_df = 0.5
tfidf_vectorizer = TfidfVectorizer(
    analyzer="char_wb",
    ngram_range=(1, max_ngram),
    min_df=min_df,
    max_df=max_df,
)
tfidf_X_train = tfidf_vectorizer.fit_transform(vocab_names)
print(tfidf_X_train.shape)

In [None]:
# reduce tfidf values to embed_dim
# TruncatedSVD's default solver is randomized; pin the seed so the initial
# embeddings (and therefore the trained model) are reproducible across runs.
svd = TruncatedSVD(n_components=config.embed_dim, random_state=42)
# NOTE: this rebinds tfidf_X_train from the tf-idf matrix to the reduced array,
# so re-run the tf-idf cell before re-running this one.
tfidf_X_train = svd.fit_transform(tfidf_X_train)
print(tfidf_X_train.shape)

In [None]:
# create glove model with one entry per vocabulary name and config.embed_dim dimensions
model = GloveModel(len(vocab), config.embed_dim)

In [None]:
# init weights to tfidf values
# torch.from_numpy shares memory with its source array, so building both tensors
# from the same array made wi, wj and tfidf_X_train alias one buffer: on CPU an
# in-place update to one embedding table also rewrote the other (and the array).
# torch.tensor copies, giving each table its own storage.
# NOTE(review): the copy keeps the array's dtype, exactly as from_numpy did;
# confirm that dtype is what GloveModel / train_glove expect.
model.wi.weight.data = torch.tensor(tfidf_X_train)
model.wj.weight.data = torch.tensor(tfidf_X_train)

In [None]:
# Move the model to the device selected earlier in the notebook.
model.to(device=device)

In [None]:
# Training hyperparameters, all forwarded to train_glove by keyword.
n_epochs = 50  # 100
batch_size = 64
learning_rate = 0.05
x_max = 100
alpha = 0.75
loss_values = train_glove(
    model,
    dataset,
    n_epochs=n_epochs,
    batch_size=batch_size,
    x_max=x_max,
    alpha=alpha,
    lr=learning_rate,
    device=device,
)

In [None]:
# Plot every 1000th recorded loss value with the y-axis clipped to [0, 0.01].
ax = plt.gca()
ax.set_ylim(0, 0.01)
ax.plot(loss_values[::1000])

In [None]:
# Number of names in the vocabulary.
len(vocab)

In [None]:
# Tabulate the vocabulary as (name, index) rows so it can be written to CSV.
vocab_records = list(vocab.items())
vocab_df = pd.DataFrame(vocab_records, columns=["name", "index"])

In [None]:
# Write through a context manager so the handle is always flushed and closed;
# the original never closed it, which leaks the handle and may leave the
# remote CSV incomplete.
with fopen(config.glove_vocab_path, "wb") as vocab_file:
    vocab_df.to_csv(vocab_file)

In [None]:

# Save the weights through a context manager so the handle is closed (and the
# write completed) before the next cell reads the same path back.
with fopen(config.glove_model_path, "wb") as model_file:
    torch.save(model.state_dict(), model_file)

In [None]:
# Reload the saved weights. The handle is closed once read, and map_location
# remaps the tensors onto the current device so a checkpoint written on a GPU
# can still be loaded on a CPU-only machine.
with fopen(config.glove_model_path, "rb") as model_file:
    state_dict = torch.load(model_file, map_location=device)
model.load_state_dict(state_dict)
# Switch to evaluation mode for the matching cells that follow.
model.eval()

### Eval

In [None]:
# make sure all the names are in the model
# For each collection, report only the first name missing from the vocabulary.
for names_to_check in (input_names_train, candidate_names_train):
    for name in names_to_check:
        if name in vocab:
            continue
        print("name missing", name)
        break

In [None]:
# get best matches
# NOTE: only considers as potential matches names in candidate_names_train, not names in input_names_train
k = 100
batch_size = 256
add_context = True
best_matches = get_best_glove_matches(
    model,
    vocab,
    input_names_train,
    candidate_names_train,
    k,
    batch_size,
    add_context=add_context,
)

### PR Curve

In [None]:
# Precision / weighted-recall curve over score thresholds from 0.01 to 1.0 in
# steps of 0.05; distances=False because best_matches holds similarity scores.
curve_settings = dict(min_threshold=0.01, max_threshold=1.0, step=0.05, distances=False)
metrics.precision_weighted_recall_curve_at_threshold(
    weighted_actual_names_train,
    best_matches,
    **curve_settings,
)


In [None]:
# Close the Weights & Biases run started by wandb.init earlier in the notebook.
wandb.finish()

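### Test: sanity-check on a small demo dataset

Note: the cells below rebind `input_names_train`, `weighted_actual_names_train`, `candidate_names_train`, `dataset`, `vocab` and `model` to toy values, so re-run the cells above before returning to the full training data.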

In [None]:
import numpy as np
from src.models.glove import get_glove_embeddings
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

In [None]:
# demo names
# Each input name is paired by position with its list of 3-tuples.
# NOTE(review): the tuples look like (name, weight, frequency) — confirm against GloveDataset.
john_variants = [("<johnny>", 0.2, 20), ("<jonathan>", 0.5, 50), ("<jon>", 0.3, 30)]
mary_variants = [("<marie>", 0.7, 70), ("<maria>", 0.3, 30)]

input_names_train = ["<john>", "<mary>"]
weighted_actual_names_train = [john_variants, mary_variants]
# Disabled alternatives that also list the other group's names with weight 0.0:
#     [("<johnny>", 0.2, 20), ("<jonathan>", 0.5, 50), ("<jon>", 0.3, 30), ("<mary>", 0.0, 0.5)],
#     [("<marie>", 0.7, 70), ("<maria>", 0.3, 30), ("<john>", 0.0, 0.5)],
#     [("<johnny>", 0.2, 20), ("<jonathan>", 0.5, 50), ("<jon>", 0.3, 30), ("<mary>", 0.0, 1), ("<maria>", 0.0, 1), ("<marie>", 0.0, 1)],
#     [("<marie>", 0.7, 70), ("<maria>", 0.3, 30), ("<john>", 0.0, 1), ("<johnny>", 0.0, 1), ("<jonathan>", 0.0, 1), ("<jon>", 0.0, 1)],
candidate_names_train = np.array(["<johnny>", "<jonathan>", "<marie>", "<maria>", "<jon>"])

In [None]:
# Same device choice as for the full run: first GPU if CUDA is usable, else CPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(device)

In [None]:
# Rebuild the dataset and vocabulary from the demo names.
symmetric = True
dataset = GloveDataset(
    input_names_train,
    weighted_actual_names_train,
    config.vocab_size,
    device=device,
    symmetric=symmetric,
)
vocab = dataset.get_vocab()
print(vocab)

In [None]:
# Print every stored (name, context) pair with its count, truncated to an int,
# by reading the dataset's private index arrays directly.
for ix, pair_count in enumerate(dataset._xij):
    name = dataset._id2word[int(dataset._i_idx[ix])]
    context = dataset._id2word[int(dataset._j_idx[ix])]
    freq = int(pair_count)
    print(f"{name}/{context} {freq}")

In [None]:
# get vocab names in order by id
vocab_names = sorted(vocab.keys(), key=lambda vocab_name: vocab[vocab_name])
print(vocab_names)

In [None]:
# create vectors with tfidf values
# Looser settings than the full run (trailing comments show other values tried):
# the demo vocabulary is tiny, so no n-gram is filtered out by document frequency.
max_ngram = 5  # 3
min_df = 1  # 10
max_df = 1.0  # 0.5
tfidf_vectorizer = TfidfVectorizer(
    analyzer="char_wb",
    ngram_range=(1, max_ngram),
    min_df=min_df,
    max_df=max_df,
)
tfidf_X_train = tfidf_vectorizer.fit_transform(vocab_names)
print(tfidf_X_train.shape)

In [None]:
# Two dimensions so the demo embeddings can be drawn directly as an x/y scatter plot.
# This rebinds the module-level embed_dim only; config.embed_dim is unchanged.
embed_dim = 2

In [None]:
# reduce tfidf values to embed_dim
# TruncatedSVD's default solver is randomized; pin the seed so the demo
# embeddings (and the plot built from them) are reproducible across runs.
svd = TruncatedSVD(n_components=embed_dim, random_state=42)
# NOTE: this rebinds tfidf_X_train from the tf-idf matrix to the reduced array,
# so re-run the tf-idf cell before re-running this one.
tfidf_X_train = svd.fit_transform(tfidf_X_train)
tfidf_X_train.shape

In [None]:
# create glove model sized to the demo vocabulary, using the 2-d embed_dim set for the demo
model = GloveModel(len(vocab), embed_dim)

In [None]:
# init weights to tfidf values
# torch.from_numpy shares memory with its source array, so building both tensors
# from the same array made wi, wj and tfidf_X_train alias one buffer: on CPU an
# in-place update to one embedding table also rewrote the other (and the array).
# torch.tensor copies, giving each table its own storage.
# NOTE(review): the copy keeps the array's dtype, exactly as from_numpy did;
# confirm that dtype is what GloveModel / train_glove expect.
model.wi.weight.data = torch.tensor(tfidf_X_train)
model.wj.weight.data = torch.tensor(tfidf_X_train)

In [None]:
# Move the demo model to the selected device.
model.to(device=device)

In [None]:
# Demo training run: same hyperparameters as the full run except for the
# much larger epoch count.
n_epochs = 1000
batch_size = 64
learning_rate = 0.05
x_max = 100
alpha = 0.75
loss_values = train_glove(
    model,
    dataset,
    n_epochs=n_epochs,
    batch_size=batch_size,
    x_max=x_max,
    alpha=alpha,
    lr=learning_rate,
    device=device,
)

In [None]:
# Plot every recorded loss value with the y-axis clipped to [0, 0.1].
ax = plt.gca()
ax.set_ylim(0, 0.1)
ax.plot(loss_values)

In [None]:
k = 10
add_context = True

# Input names first, then candidates, so the first input name sits at position 0.
all_names = np.array([*input_names_train, *candidate_names_train.tolist()])
all_embeddings = get_glove_embeddings(
    model,
    vocab,
    all_names,
    add_context=add_context,
)

In [None]:
# Show the order of all_names; presumably a name's position here is also its row in all_embeddings.
print(all_names)

In [None]:
# Query name for the similarity experiments below.
demo_name = '<john>'
# Position of demo_name within all_names (the input names come first).
demo_name_pos = 0
demo_embeddings = get_glove_embeddings(
    model,
    vocab,
    [demo_name],
    add_context=add_context,
)

In [None]:
# try cosine similarity
# Disabled variant: normalise each embedding column by its sum before scoring.
# totals = all_embeddings.sum(axis=0)
# all_embeddings_norm = all_embeddings / totals
# demo_embeddings_norm = all_embeddings_norm[[demo_name_pos]]
# scores = cosine_similarity(demo_embeddings_norm, all_embeddings_norm)
# ixs = np.argsort(-scores)[:, :k]
# sorted_scores = scores[:, ixs[0]]
# sorted_names = all_names[ixs[0]]
# best_matches = np.dstack((sorted_names, sorted_scores))
# print("cosine_norm_0", best_matches)

# Disabled variant: normalise each embedding row by its sum before scoring.
# totals = demo_embeddings.sum(axis=1)
# demo_embeddings_norm = demo_embeddings / totals[:, np.newaxis]
# totals = all_embeddings.sum(axis=1)
# all_embeddings_norm = all_embeddings / totals[:, np.newaxis]
# scores = cosine_similarity(demo_embeddings_norm, all_embeddings_norm)
# ixs = np.argsort(-scores)[:, :k]
# sorted_scores = scores[:, ixs[0]]
# sorted_names = all_names[ixs[0]]
# best_matches = np.dstack((sorted_names, sorted_scores))
# print("cosine_norm_1", best_matches)

# Active variant: raw embeddings, ranked from most to least similar.
scores = cosine_similarity(demo_embeddings, all_embeddings)
# Negate so argsort's ascending order puts the highest similarity first.
top_ixs = np.argsort(-scores)[0, :k]
sorted_names = all_names[top_ixs]
sorted_scores = scores[:, top_ixs]
best_matches = np.dstack((sorted_names, sorted_scores))
print("cosine", best_matches)

In [None]:
# try euclidean similarity
def _closest_matches(query_embeddings, candidate_embeddings, names, top_k):
    """Rank names by euclidean distance from the first query row, closest first.

    Args:
        query_embeddings: 2-D array whose first row is the query embedding.
        candidate_embeddings: 2-D array with one row per entry in names.
        names: numpy array of names, aligned with candidate_embeddings rows.
        top_k: maximum number of matches to keep.

    Returns:
        np.dstack of the selected names and their distances, i.e. the same
        value each variant below previously built inline before printing.
    """
    distances = euclidean_distances(query_embeddings, candidate_embeddings)
    # argsort is ascending, so the smallest distances come first.
    order = np.argsort(distances)[0, :top_k]
    return np.dstack((names[order], distances[:, order]))


# Variant 0: divide each embedding column by its column sum, then compare.
totals = all_embeddings.sum(axis=0)
all_embeddings_norm = all_embeddings / totals
demo_embeddings_norm = all_embeddings_norm[[demo_name_pos]]
best_matches = _closest_matches(demo_embeddings_norm, all_embeddings_norm, all_names, k)
print("euclidean_norm_0", best_matches)

# Disabled variant: normalise each embedding row by its sum before scoring.
# totals = demo_embeddings.sum(axis=1)
# demo_embeddings_norm = demo_embeddings / totals[:, np.newaxis]
# totals = all_embeddings.sum(axis=1)
# all_embeddings_norm = all_embeddings / totals[:, np.newaxis]
# scores = euclidean_distances(demo_embeddings_norm, all_embeddings_norm)
# ixs = np.argsort(scores)[:, :k]
# sorted_scores = scores[:, ixs[0]]
# sorted_names = all_names[ixs[0]]
# best_matches = np.dstack((sorted_names, sorted_scores))
# print("euclidean_norm_1", best_matches)

# Raw embeddings, no normalisation.
best_matches = _closest_matches(demo_embeddings, all_embeddings, all_names, k)
print("euclidean", best_matches)

In [None]:
# plot embeddings
# Transpose the rows into one list per coordinate; the two-way unpack relies on
# the embeddings being exactly 2-dimensional.
xs, ys = (list(column) for column in zip(*all_embeddings))
plt.scatter(xs, ys)
# Label each point with its name, offset slightly from the marker.
for name, x, y in zip(all_names, xs, ys):
    plt.annotate(
        name,
        xy=(x, y),
        xytext=(5, 2),
        textcoords='offset points',
        ha='right',
        va='bottom',
    )