In [None]:
%load_ext autoreload
%autoreload 2

# Train a bi-encoder to learn name-to-vec encodings
Experiment with phoneme, subword, and/or n-gram tokenizations of names, training on anchor-positive-negative triplets with a Margin-MSE loss.

| phon/sub | negs | notes  | uni/bi | size | loss | errors | negs |
| -------- | ---- | ------ | ------ | ---- | ---- | ------ | ---- |
| Phonemes | 100  |        | unigrams |      | 0.102 | ?, 155 | ?, ?, 146 |
| Phonemes | 100  |        | bigrams  |      | 0.073 | 8.5, 94 | 12, 43, 56 |
| Subwords | 100  |        | unigrams | 500  | 0.74 | ?, 89 | 12, 54, 60 |
| Subwords | 100  |        | unigrams | 1000 | 0.063 | 7.6, 69 | 7, 22, 30 |
| Subwords | 100  |        | unigrams | 2000 | 0.052 | 5.5, 61 | 7, 25, 32 |
| Subwords | 200  |        | unigrams | 1000 | 0.055 | 9.6, 80 | 4, 27, 37 |
| Subwords | 200  |        | unigrams | 2000 | 0.047 | 6.2, 56 | 5, 24, 32 |
| Subwords | 200  | noback | unigrams | 2000 | 0.049 | 7.4, 46 | 6, 28, 39 |
| Subwords | 200  | noback9 | unigrams | 2000 | 0.049 | 3.7, 62 | 7, 31, 46 |
| Subwords | 200  | noback9mask | unigrams | 2000 | 0.049 | 7.6, 76 | 6, 30, 46 |
| Subwords | 100+1000 | noback9mask | unigrams | 2000 | 0.050 | 2.6, 66 | 4, 21, 33 |
| Subwords | 100+1000 | noback10mask | unigrams | 2000 | 0.049 | 11.6, 77 | 6, 24, 34 |
| Subwords | 100+1000 | noback10mask 64dim 40epochs | unigrams | 2000 | 0.036 | 17.8, 75 | 2, 12, 21 |
| Subwords | 100+1000 | noback10mask 64dim 10epochs | unigrams | 2000 | 0.037 | 11.2, 80 | 2, 10, 18 |
| Subwords | 100+1000 | noback10mask 64dim 5epochs | unigrams | 2000 | 0.037 | 6, 75 | 1, 11, 22 |
| Subwords | 150+1000 | noback10mask 64dim 5epochs | unigrams | 2000 | 0.035 | 11.2, 80 | 3, 15, 22 |
| Subwords | 100+1500 | noback9mask | unigrams | 2000 | 0.046 | 6.4, 80 | 4, 22, 34 |
| Subwords | 200 | nofront | unigrams | 2000 | 0.048 | 6.7, 56 | 6, 28, 37 |
| Subwords | 200 | noback  | unigrams | 1500 | 0.052 | 6.3, 57 | 8, 31, 47 |
| Subwords | 100  |        | bigrams  | 500  | 0.066 | 19.5, 85 | 13, 23, 31 |
| Subwords | 100  |        | bigrams  | 1000 | 0.56 | 12, 63 | 7, 30, 40 |
| Subwords | 100  |        | bigrams  | 2000 | 0.040 | 9.6, 64 | 16, 56, 66 |
| Subwords | 100  | edit   | unigrams | 500  | | | |
| Subwords | 100  | edit   | unigrams | 1000 | 0.065 | 12.2, 102 | 7, 34, 36 |
| Subwords | 100  | edit   | unigrams | 2000 | 0.055 | 6.9, 77 | 11, 42, 46 |
| Subwords | 100  | edit   | bigrams  | 500  | | | |
| Subwords | 100  | edit   | bigrams  | 1000 | | | |
| Subwords | 100  | edit   | bigrams  | 2000 | | | |

**TODO:**
- fix training data issues

- try sine+cosine (sinusoidal) positional embeddings

In [None]:
import json
import math
import random
import re

import boto3
from hyperopt import hp, fmin, tpe, STATUS_OK, STATUS_FAIL, Trials
import joblib
import pandas as pd
from phonemizer.separator import Separator
from phonemizer.backend import EspeakBackend

import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from transformers import PreTrainedTokenizerFast

from src.data.filesystem import fopen

In [None]:
# --- Data selection ---
# given_surname is interpolated into every data and model path below.
given_surname = "given"
# fraction of the triplets file that is sampled after loading
sample_frac = 1.0
# number of leading rows of the preferred-names file considered as common names
num_common_names = 10000
# number of training batches between running-loss reports in train()
report_size = 10000
# every name is padded or truncated to this many token ids
max_tokens = 10

# --- Tokenization and negative sampling ---
# use_phonemes selects the espeak phoneme tokenizer/vocab instead of the subword ones
use_phonemes = False
# use_edit_tokenizer selects the "edit" variant of the subword tokenizer files
use_edit_tokenizer = False
# use_bigrams selects the bigram vocab files and "context,token" lookups in tokenize()
use_bigrams = False
subword_vocab_size = 2000  # 500, 1000, 1500, 2000
# random common-name negatives attempted per distinct anchor/positive pair
num_easy_negs = 100  # 100, 150, 200
# number of leading common names that are paired with each other as negatives
num_common_negs = 1000

# --- Model and optimizer settings for the final training run ---
embedding_dim = 64
learning_rate = 0.001
batch_size = 64
num_epochs = 5

# --- Input locations on S3 ---
pref_path = f"s3://familysearch-names/processed/tree-preferred-{given_surname}-aggr.csv.gz"
triplets_path=f"s3://familysearch-names/processed/tree-hr-{given_surname}-triplets.csv.gz"
# NOTE(review): tfidf_path is not read by any cell visible in this notebook
tfidf_path=f"s3://nama-data/data/models/fs-{given_surname}-tfidf-v2.joblib"
vocab_path = f"s3://nama-data/data/models/fs-{given_surname}-espeak_phoneme_vocab.json"
bigrams_vocab_path = f"s3://nama-data/data/models/fs-{given_surname}-espeak_phoneme_vocab_bigrams.json"

# tokenizer_path / edit_tokenizer_path are keys inside nama_bucket (no s3:// prefix);
# the same relative path prefixed with "../" is used as the local download target.
nama_bucket = 'nama-data'
tokenizer_path=f"data/models/fs-{given_surname}-subword-tokenizer-{subword_vocab_size}.json"
edit_tokenizer_path=f"data/models/fs-{given_surname}-edit-subword-tokenizer-{subword_vocab_size}.json"
tokenizer_bigrams_vocab_path = f"s3://nama-data/data/models/fs-{given_surname}-tokenizer_vocab_bigrams-{subword_vocab_size}.json"
edit_tokenizer_bigrams_vocab_path = f"s3://nama-data/data/models/fs-{given_surname}-edit_tokenizer_vocab_bigrams-{subword_vocab_size}.json"

# NOTE(review): model_path is only referenced from commented-out checkpoint code in train()
model_path = f"../data/models/bi_encoder-{given_surname}"

In [None]:
# Report GPU availability and memory usage.
# The device-property and memory queries require a CUDA device, so they are
# skipped on CPU-only machines instead of raising and aborting "Run All".
torch.cuda.empty_cache()
cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    print("cuda total", torch.cuda.get_device_properties(0).total_memory)
    print("cuda reserved", torch.cuda.memory_reserved(0))
    print("cuda allocated", torch.cuda.memory_allocated(0))

## Load data

In [None]:
# Load the anchor/positive/negative triplets, then keep a random
# `sample_frac` share of the rows (frac=1.0 keeps all rows, shuffled).
triplets_df = pd.read_csv(triplets_path)
triplets_df = triplets_df.sample(frac=sample_frac)
print(len(triplets_df))
triplets_df.head(3)

### read common names

In [None]:
# Common names = the first `num_common_names` preferred names that are longer
# than one character and consist solely of lowercase a-z.
pref_df = pd.read_csv(pref_path, keep_default_na=False)
top_names = pref_df['name'][:num_common_names].tolist()
common_names = [
    candidate
    for candidate in top_names
    if len(candidate) > 1 and re.fullmatch(r'[a-z]+', candidate)
]
# drop the reference to the dataframe; only the name list is needed from here on
pref_df = None
len(common_names)

### read phoneme vocab

In [None]:
# Load the phoneme vocabulary JSON: the bigram vocab when use_bigrams is set,
# otherwise the plain phoneme vocab.
with fopen(bigrams_vocab_path if use_bigrams else vocab_path, 'r') as f:
    phoneme_vocab = json.load(f)

In [None]:
# Append the special tokens after the loaded entries.
# setdefault keeps this cell idempotent: on a re-run (or if the file already
# contains the tokens) the existing ids are kept. Plain assignment would give
# both tokens the same id, equal to len(phoneme_vocab), on a second run, which
# is out of range for an embedding sized by len(phoneme_vocab).
phoneme_vocab.setdefault('[UNK]', len(phoneme_vocab))
phoneme_vocab.setdefault('[PAD]', len(phoneme_vocab))
len(phoneme_vocab)

### read subword tokenizer and vocab

In [None]:
# Download the selected subword tokenizer definition from S3 into the matching
# local path (one directory up), then load it as a fast tokenizer.
s3 = boto3.client('s3')

if use_edit_tokenizer:
    path = edit_tokenizer_path
else:
    path = tokenizer_path
local_tokenizer_file = f"../{path}"
with open(local_tokenizer_file, 'wb') as f:
    s3.download_fileobj(nama_bucket, path, f)
subword_tokenizer = PreTrainedTokenizerFast(tokenizer_file=local_tokenizer_file)

In [None]:
# Unigram mode takes the vocabulary straight from the tokenizer; bigram mode
# loads a separate vocab file (the edit-tokenizer variant when selected).
if not use_bigrams:
    subword_vocab = subword_tokenizer.get_vocab()
else:
    if use_edit_tokenizer:
        path = edit_tokenizer_bigrams_vocab_path
    else:
        path = tokenizer_bigrams_vocab_path
    with fopen(path, 'r') as f:
        subword_vocab = json.load(f)
# show the vocab size and the ids of the two special tokens
print(len(subword_vocab), subword_vocab['[UNK]'], subword_vocab['[PAD]'])

## Set up generators

In [None]:
# espeak phonemizer backend configured with the 'en-us' language code
espeak = EspeakBackend('en-us')
# output format: phones separated by a space, words by '|', no syllable separator
separator = Separator(phone=' ', syllable=None, word='|')

## Create training data

In [None]:
def tokenize_phonemes(name):
    """Phonemize `name` with espeak and return the space-separated pieces as a list of strings."""
    phonemized = espeak.phonemize([name], separator=separator, strip=True)
    return phonemized[0].split(' ')

def tokenize_subwords(name):
    """Encode `name` with the subword tokenizer and return the token strings for the resulting ids."""
    token_ids = subword_tokenizer.encode(name)
    return subword_tokenizer.convert_ids_to_tokens(token_ids)

# set up tokenizer and tokenizer_vocab
if use_phonemes:
    tokenizer, tokenizer_vocab = tokenize_phonemes, phoneme_vocab
else:
    tokenizer, tokenizer_vocab = tokenize_subwords, subword_vocab

In [None]:
def tokenize(name, max_tokens):
    """Convert `name` into a fixed-length list of vocabulary ids.

    The name is split with the module-level `tokenizer`; each token is mapped
    through `tokenizer_vocab` (falling back to the '[UNK]' id), and the result
    is truncated or right-padded with the '[PAD]' id to exactly `max_tokens`
    entries. When `use_bigrams` is set, an 'END' token is appended and each
    position first tries the "previous,current" bigram (the first token's
    previous is 'START') before falling back to the single token.

    Results are memoized in `name_tokens_cache`, keyed by (name, max_tokens),
    so a call with a different `max_tokens` never receives a list of the wrong
    length. The cached list itself is returned; callers must not mutate it.

    Args:
        name: the name string to tokenize.
        max_tokens: length of the returned id list.

    Returns:
        A list of `max_tokens` integer ids.
    """
    cache_key = (name, max_tokens)
    cached = name_tokens_cache.get(cache_key)
    if cached is not None:
        return cached

    pad = tokenizer_vocab['[PAD]']
    unk = tokenizer_vocab['[UNK]']
    # copy so that appending 'END' can never modify a list owned by the tokenizer
    tokens = list(tokenizer(name))
    if use_bigrams:
        tokens.append('END')

    result = [pad] * max_tokens
    context = 'START'
    for ix, token in enumerate(tokens[:max_tokens]):
        token_id = tokenizer_vocab.get(token, unk)
        if use_bigrams:
            # prefer the bigram id; fall back to the unigram (or UNK) id
            token_id = tokenizer_vocab.get(f"{context},{token}", token_id)
        result[ix] = token_id
        context = token

    name_tokens_cache[cache_key] = result
    return result

In [None]:
# Build the training examples: a list of dicts with 'anchor', 'pos', 'neg'
# (token-id tensors) and 'target' (the margin the model should predict).
all_data = []
name_tokens_cache = {}
seen_anchor_pos = set()
for tup in tqdm(triplets_df.itertuples(), total=len(triplets_df)):
    # the first and last character of each name field are stripped
    anchor = tup.anchor[1:-1]
    pos = tup.positive[1:-1]
    neg = tup.negative[1:-1]
    anchor_tokens = tokenize(anchor, max_tokens)
    pos_tokens = tokenize(pos, max_tokens)
    neg_tokens = tokenize(neg, max_tokens)
    target_margin = tup.positive_score - tup.negative_score
    # Build the anchor/positive tensors once per row and share them between
    # the hard-negative example and all easy-negative examples below, instead
    # of re-allocating identical tensors for every easy negative.
    anchor_tensor = torch.tensor(anchor_tokens)
    pos_tensor = torch.tensor(pos_tokens)
    # anchor, positive, hard-negative
    all_data.append({
        'anchor': anchor_tensor,
        'pos': pos_tensor,
        'neg': torch.tensor(neg_tokens),
        'target': torch.tensor(target_margin, dtype=torch.float),
    })
    # easy negatives are added only the first time an anchor/positive pair is seen
    anchor_pos = f"{anchor},{pos}"
    if anchor_pos in seen_anchor_pos:
        continue
    seen_anchor_pos.add(anchor_pos)
    # the easy-negative target is the positive score itself
    easy_target = torch.tensor(tup.positive_score, dtype=torch.float)
    for ix in range(num_easy_negs):
        # anchor, positive, easy-negative
        easy_neg = random.choice(common_names)
        easy_neg_tokens = tokenize(easy_neg, max_tokens)
        # skip a random name that tokenizes identically to the anchor
        if anchor_tokens == easy_neg_tokens:
            continue
        all_data.append({
            'anchor': anchor_tensor,
            'pos': pos_tensor,
            'neg': torch.tensor(easy_neg_tokens),
            'target': easy_target,
        })
        

In [None]:
# Add (name, name, other-common-name) examples with target 1.0 for every
# ordered pair of the first `num_common_negs` common names, unless the pair
# (in either order) already occurred as an anchor/positive pair.
cnt = 0
# slice once instead of once per outer iteration
common_neg_names = common_names[:num_common_negs]
for pos in tqdm(common_neg_names):
    pos_tokens = tokenize(pos, max_tokens)
    # anchor and positive are the same name: build the tensor once and share
    # it across every example for this name
    pos_tensor = torch.tensor(pos_tokens)
    for neg in common_neg_names:
        if pos == neg:
            continue
        if f"{pos},{neg}" in seen_anchor_pos or f"{neg},{pos}" in seen_anchor_pos:
            continue
        neg_tokens = tokenize(neg, max_tokens)
        all_data.append({
            'anchor': pos_tensor,
            'pos': pos_tensor,
            'neg': torch.tensor(neg_tokens),
            'target': torch.tensor(1.0, dtype=torch.float)
        })
        cnt += 1
print(cnt)

In [None]:
# Hold out 10% of the examples for validation.
# NOTE(review): no random_state is passed, so the split differs between runs.
train_data, val_data = train_test_split(all_data, test_size=0.10)
print(len(train_data), len(val_data))        

In [None]:
train_data[0]

## Train bi-encoder

In [None]:
def loss_fn(anchors, positives, negatives, labels):
    """Margin-MSE loss over a batch of embedding triplets.

    The predicted margin is cos(anchor, positive) - cos(anchor, negative),
    computed along the last dimension; the loss is the mean squared error
    between that margin and `labels`.
    (A plain dot-product similarity was tried earlier in place of cosine.)
    """
    pos_similarity = F.cosine_similarity(anchors, positives, dim=-1)
    neg_similarity = F.cosine_similarity(anchors, negatives, dim=-1)
    return F.mse_loss(pos_similarity - neg_similarity, labels)

In [None]:
# Define your bi-encoder model
class BiEncoder(nn.Module):
    """Encode a fixed-length sequence of token ids into a single vector.

    Each token embedding is multiplied element-wise by a learned embedding of
    its position, pad positions are zeroed out, and the result is averaged
    over all `max_tokens` positions (pad positions count in the denominator).

    Args:
        embedding_dim: size of the token, positional and output vectors.
        vocab_size: number of rows in the token embedding table.
        max_tokens: sequence length; inputs must have exactly this many ids.
        pad_token: id that marks padding positions.
    """

    def __init__(self, embedding_dim, vocab_size, max_tokens, pad_token):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.max_tokens = max_tokens
        self.pad_token = pad_token
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.positional_embedding = nn.Embedding(num_embeddings=max_tokens, embedding_dim=embedding_dim)
        # Earlier experiments used separate forward/backward positional embeddings
        # (max_tokens+1 rows each, with position 0 reserved for padding).
        self.pooling = nn.AdaptiveAvgPool1d(1)  # Pooling layer to create a single vector

    def forward(self, input):
        """Return pooled embeddings of shape (batch_size, embedding_dim).

        Args:
            input: integer tensor of token ids, shape (batch_size, max_tokens).
        """
        # get token embedding
        embedded = self.embedding(input)  # Shape: (batch_size, max_tokens, embedding_dim)
        # get mask: 0 at pad positions, 1 elsewhere
        mask = (input != self.pad_token).unsqueeze(-1).to(embedded.dtype)  # Shape: (batch_size, max_tokens, 1)
        # get positional embedding; the index tensor is created on the input's
        # device so the model also works after being moved to a GPU
        positions = torch.arange(self.max_tokens, device=input.device)
        positional_embedded = self.positional_embedding(positions)  # Shape: (max_tokens, embedding_dim), broadcast over the batch
        # multiply embeddings
        embedded = embedded * positional_embedded * mask
        pooled = self.pooling(embedded.permute(0, 2, 1)).squeeze(2)  # Shape: (batch_size, embedding_dim)
        return pooled

In [None]:
def _triplet_batch_loss(model, loss_fn, data):
    """Embed one batch's anchors, positives and negatives and return its loss."""
    anchor_embeddings = model(data['anchor'])  # Shape: (batch_size, embedding_dim)
    pos_embeddings = model(data['pos'])  # Shape: (batch_size, embedding_dim)
    neg_embeddings = model(data['neg'])  # Shape: (batch_size, embedding_dim)
    return loss_fn(anchor_embeddings, pos_embeddings, neg_embeddings, data['target'])


# Training loop
def train(model, train_loader, val_loader, loss_fn, optimizer, num_epochs, verbose=True):
    """Train `model` for `num_epochs` epochs and return the last validation loss.

    Args:
        model: module mapping a batch of token ids to embeddings.
        train_loader: iterable of batches; each batch is a dict with the keys
            'anchor', 'pos', 'neg' and 'target'.
        val_loader: iterable of batches in the same format, evaluated after
            every epoch without gradient tracking.
        loss_fn: callable(anchor_emb, pos_emb, neg_emb, targets) -> scalar loss.
        optimizer: optimizer over the model's parameters.
        num_epochs: number of passes over `train_loader`.
        verbose: when True, print the mean training loss every `report_size`
            batches (module-level setting) and the validation loss per epoch.

    Returns:
        The mean per-batch validation loss of the final epoch, or NaN when no
        epoch ran or the validation loader produced no batches.
    """
    # defined up front so num_epochs == 0 returns NaN instead of raising
    val_loss = float('nan')
    for epoch in range(num_epochs):
        # make sure gradient tracking is on
        model.train()
        running_loss = 0

        for ix, data in enumerate(train_loader):
            # zero gradients
            optimizer.zero_grad()

            # Forward pass and loss
            loss = _triplet_batch_loss(model, loss_fn, data)

            # Backward pass and optimization step
            loss.backward()
            optimizer.step()

            # Calculate loss and report
            if verbose:
                running_loss += loss.item()
                if ix % report_size == report_size - 1:
                    avg_loss = running_loss / report_size  # loss per batch
                    print(f"Epoch {epoch} batch {ix} loss {avg_loss}")
                    running_loss = 0

        # set model to evaluation mode
        model.eval()

        # disable gradient computation
        running_loss = 0
        num_val_batches = 0
        with torch.no_grad():
            for data in val_loader:
                running_loss += _triplet_batch_loss(model, loss_fn, data).item()
                num_val_batches += 1

        # calculate average validation loss (NaN when there was nothing to validate on)
        val_loss = running_loss / num_val_batches if num_val_batches else float('nan')
        if verbose:
            print(f"VALIDATION: Epoch {epoch} loss {val_loss}")
        # epoch_model_path = f"{model_path}-{epoch}"
        # torch.save(model.state_dict(), epoch_model_path)

    # return final epoch validation loss
    return val_loss

## Hyperparameter search

In [None]:
def hyperopt_objective_function(train_data,
                                val_data,
                                vocab_size,
                                max_tokens,
                                pad_token,
                                verbose=True,
                               ):
    """Build a hyperopt objective bound to the given datasets and model shape.

    The returned `objective(config)` reads 'learning_rate', 'batch_size',
    'embedding_dim' and 'num_epochs' from `config`, trains a fresh BiEncoder
    with AdamW, and returns a dict holding STATUS_OK, the final validation
    loss under 'loss', and the config itself.
    """

    def objective(config):
        # read all hyperparameters up front so a missing key fails before any work
        lr = config['learning_rate']
        n_per_batch = config['batch_size']
        dim = config['embedding_dim']
        n_epochs = config['num_epochs']

        if verbose:
            print('train', config)

        # fresh model and optimizer for every trial (kept on the default device)
        trial_model = BiEncoder(dim, vocab_size, max_tokens, pad_token)
        trial_optimizer = optim.AdamW(trial_model.parameters(), lr=lr)

        # both loaders shuffle their data
        loaders = [
            DataLoader(dataset, batch_size=n_per_batch, shuffle=True)
            for dataset in (train_data, val_data)
        ]

        val_loss = train(trial_model, loaders[0], loaders[1], loss_fn,
                         trial_optimizer, n_epochs, verbose=False)
        if verbose:
            print('val_loss', val_loss)

        return dict(status=STATUS_OK, loss=val_loss, config=config)

    return objective

In [None]:
# HyperOpt search space: log-uniform learning rate between 1e-4 and 1e-2,
# categorical choices for everything else
search_space = dict(
    learning_rate=hp.loguniform('learning_rate', math.log(1e-4), math.log(1e-2)),
    batch_size=hp.choice('batch_size', [8,16,32,64]),
    embedding_dim=hp.choice('embedding_dim', [8,16,32,64]),
    num_epochs=hp.choice('num_epochs', [5,10,20,40]),
)

# objective bound to the current train/validation split and vocabulary
objective = hyperopt_objective_function(
    train_data=train_data,
    val_data=val_data,
    vocab_size=len(tokenizer_vocab),
    max_tokens=max_tokens,
    pad_token=tokenizer_vocab['[PAD]'],
    verbose=True,
)
trials = Trials()

# minimize the objective over the space with TPE, 50 evaluations
best = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    trials=trials,
    max_evals=50,
)

In [None]:
print("best", best)
print("results", trials.results) 

In [None]:
# batch_size = best_result.config['batch_size']
# learning_rate = best_result.config['learning_rate']
# embedding_dim = best_result.config['embedding_dim']
# num_epochs = best_result.config['num_epochs']

## Train model and Review predictions

In [None]:
%%time

# Final training run. It uses the embedding_dim / learning_rate / batch_size /
# num_epochs values from the configuration cell, not the hyperopt result.

# Create an instance of the bi-encoder model
model = BiEncoder(embedding_dim, len(tokenizer_vocab), max_tokens, tokenizer_vocab['[PAD]'])

# Define the optimizer
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

# Create data loader
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=True)

# verbose defaults to True: prints running training loss and per-epoch validation loss
train(model, train_loader, val_loader, loss_fn, optimizer, num_epochs)

In [None]:
def predict(name1, name2):
    """Return the cosine similarity between the model embeddings of two names.

    Both names are tokenized to `max_tokens` ids, embedded together as one
    batch of two with the module-level `model` in eval mode and without
    gradient tracking, and compared with cosine similarity.
    """
    token_rows = [tokenize(name1, max_tokens), tokenize(name2, max_tokens)]
    model.eval()
    with torch.no_grad():
        first, second = model(torch.tensor(token_rows))
    # (a plain dot product was used here before switching to cosine similarity)
    return F.cosine_similarity(first, second, dim=-1).item()

In [None]:
# Review the first 1000 triplets: report every row where the positive scores
# below 0.5, or does not score strictly above the negative ('ERROR').
num_error = 0
for row in triplets_df.head(1000).itertuples():
    anchor, pos, neg = row.anchor[1:-1], row.positive[1:-1], row.negative[1:-1]
    pos_predict = predict(anchor, pos)
    neg_predict = predict(anchor, neg)
    correct = pos_predict > neg_predict
    if correct and pos_predict >= 0.5:
        continue
    num_error += 1
    flag = '' if correct else 'ERROR'
    print(num_error, anchor, pos, pos_predict, row.positive_score, flag)
    print('  ', anchor, neg, neg_predict, row.negative_score)
print(num_error)

In [None]:
%%time

# Count (and print) unordered pairs among the first n_names common names whose
# predicted similarity exceeds 0.5.
cnt = 0
n_names = 1000
review_names = common_names[:n_names]
for ix, pos in enumerate(review_names):
    # only pair each name with the names after it, so each pair is scored once
    for neg in review_names[ix+1:]:
        sim = predict(pos, neg)
        if sim > 0.5:
            # when debugging, also print sim, tokenizer(pos) and tokenizer(neg);
            # they are no longer computed for every pair since nothing used them
            print(pos, neg)
            cnt += 1
print(cnt)