In [1]:
# Reload edited project modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

# Save bi-encoder model weights

Save the embedding weights for each token and each position so that they can be used by the Java implementation of fs-nama.

In [5]:
import json

import torch

from nama.data.filesystem import download_file_from_s3, save_file
from nama.models.biencoder import BiEncoder
from nama.models.tokenizer import get_tokenize_function_and_vocab

In [3]:
# Which name model to export; the notebook has to be rerun with the other value too.
# TODO run both given and surname
given_surname = "given"
# given_surname = "surname"

# Settings of the trained model; they are interpolated into the S3 file names below.
max_tokens = 10
bi_encoder_vocab_size = 2048
num_epochs = 8
embedding_dim = 256
learning_rate = 5e-5  # formatted as "5e-05" inside the file names

# NOTE(review): num_epochs appears twice in the bi-encoder and weights file names,
# and only the bi-encoder name contains "-ce-" -- confirm against the training notebook.
tokenizer_path = f"s3://fs-nama-data/2024/nama-data/data/models/fs-{given_surname}-subword-tokenizer-{bi_encoder_vocab_size}.json"
bi_encoder_path = f"s3://fs-nama-data/2024/nama-data/data/models/bi_encoder-ce-{given_surname}-{num_epochs}-{embedding_dim}-{num_epochs}-{bi_encoder_vocab_size}-{learning_rate}.pth"

# Destination of the exported weights JSON.
weights_path = f"s3://fs-nama-data/2024/nama-data/data/models/bi_encoder-{given_surname}-{num_epochs}-{embedding_dim}-{num_epochs}-{bi_encoder_vocab_size}-{learning_rate}-weights.json"

In [4]:
# Report GPU availability and, when a GPU is present, its memory statistics.
# The device queries are guarded because torch.cuda.get_device_properties(0)
# raises on a machine without CUDA, which would abort the notebook.
cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    # Release cached, unused GPU memory before reporting the numbers.
    torch.cuda.empty_cache()
    print("cuda total", torch.cuda.get_device_properties(0).total_memory)
    print("cuda reserved", torch.cuda.memory_reserved(0))
    print("cuda allocated", torch.cuda.memory_allocated(0))

True
cuda total 8141471744
cuda reserved 0
cuda allocated 0


## Load bi-encoder and vocabulary

In [6]:
# Load the trained bi-encoder, downloading the checkpoint first when it is an s3:// URL.
path = download_file_from_s3(bi_encoder_path) if bi_encoder_path.startswith("s3://") else bi_encoder_path

# The checkpoint is a whole pickled module (torch.load returns the BiEncoder itself),
# so weights_only must be False; newer PyTorch releases default it to True and would
# refuse to unpickle the model.
# SECURITY NOTE: unpickling can execute arbitrary code -- only load checkpoints from
# a trusted location.
model = torch.load(
    path,
    # Keep the default device placement when a GPU exists; otherwise remap to CPU so
    # a checkpoint saved on a GPU can still be loaded.
    map_location=None if torch.cuda.is_available() else "cpu",
    weights_only=False,
)
# Switch to inference mode; as the last expression this also displays the model.
model.eval()

  model = torch.load(path)


BiEncoder(
  (embedding): Embedding(2048, 256)
  (positional_embedding): Embedding(10, 256)
  (pooling): AdaptiveAvgPool1d(output_size=1)
)

In [7]:
# Resolve the tokenizer file to a local path, downloading it when it is an s3:// URL.
if tokenizer_path.startswith("s3://"):
    path = download_file_from_s3(tokenizer_path)
else:
    path = tokenizer_path

# Build the tokenize function and its vocabulary, then show the vocabulary size.
tokenize, tokenizer_vocab = get_tokenize_function_and_vocab(
    tokenizer_path=path,
    max_tokens=max_tokens,
)
len(tokenizer_vocab)

2048

In [8]:
# Invert the vocabulary (token -> id) into an id -> token lookup.
tokenid2token = dict(zip(tokenizer_vocab.values(), tokenizer_vocab.keys()))

## Save weights

In [9]:
# Row `ix` of the embedding matrix is exported under the token whose id is `ix`.
# Check the alignment up front: a gap in the ids would otherwise surface as a bare
# KeyError, and extra vocabulary entries would silently be left out of the export.
assert set(tokenid2token) == set(range(model.embedding.weight.shape[0])), (
    "tokenizer vocabulary ids do not line up with the rows of the embedding matrix"
)
embedding_weights = {tokenid2token[ix]: embedding for ix, embedding in enumerate(model.embedding.weight.tolist())}

In [10]:
# Sanity check: number of tokens that received an embedding vector.
len(embedding_weights)

2048

In [11]:
# Sanity check: length of one embedding vector (looks up the token 'a', which must be in the vocabulary).
len(embedding_weights['a'])

256

In [12]:
# Positional embedding matrix as a nested list (presumably one row per token position -- confirm against BiEncoder).
positional_weights = model.positional_embedding.weight.tolist()

In [13]:
# Sanity check: number of rows in the positional embedding matrix.
len(positional_weights)

10

In [14]:
# Payload written to the JSON file: token embeddings first, then positional embeddings.
weights = dict(tokens=embedding_weights, positions=positional_weights)

In [15]:
# Display the destination the weights JSON is about to be written to.
weights_path

's3://fs-nama-data/2024/nama-data/data/models/bi_encoder-given-8-256-8-2048-5e-05-weights.json'

In [16]:
def _write_weights_json(local_out_path):
    """Write the `weights` dictionary as JSON to `local_out_path`.

    The file is opened in a `with` block so it is flushed and closed before this
    function returns, instead of relying on garbage collection to close it.
    """
    with open(local_out_path, 'w') as out_file:
        json.dump(weights, out_file)


# save_file invokes the writer with a local output path (presumably uploading the
# result to weights_path afterwards -- confirm in nama.data.filesystem).
save_file(weights_path, _write_weights_json)

## Test similarity

In [17]:
# Smoke test: tokenize a pair of names and print the model's similarity score for them.
name1, name2 = "richard", "rickert"
tokens1, tokens2 = tokenize(name1), tokenize(name2)
sim = model.predict(tokens1, tokens2)
print(sim)

0.4984965920448303
