In [1]:
%load_ext autoreload
%autoreload 2

# Train cross encoder
Train a cross-encoder on the name triplets produced in notebook 373, starting from the RoBERTa model trained in notebook 375.


| vocab | notes                           | corr | random / mean | difference | pos-neg / mean | name-name |
|-------| ------------------------------- |------| ------ | ---------- | ------- | --------- |
| 265   |                                 |      |  |         |    |        |


In [6]:
import math
import os
import random

from sentence_transformers import InputExample
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

from nama.data.filesystem import download_file_from_s3, upload_file_to_s3
from nama.data.utils import read_csv

In [17]:
#config
# TODO do for given and surname
# name type to train on; substituted into every data/model path below
given_surname = "given"
# given_surname = "surname"

# passed to model.fit as `epochs` and used to size the warm-up
num_epochs = 1
# batch size of the training DataLoader
train_batch_size = 16
# fraction of all training steps (batches per epoch * epochs) spent in warm-up
warmup_fraction = 0.1
# True: hold out 1% dev + 1% test; False: hold out 10% dev + 10% test
train_all = True
# these flags affect the test data, so when comparing models generated with different flag values,
# make sure you re-generate the test data with the same flag values for the comparisons
# each_pair_once: when True, a pair already emitted in either order is skipped
each_pair_once = False
# add_same_name: when True, identical-name pairs coming from the triplets are dropped and
# every name is instead added paired with itself with label 1.0
add_same_name = True

# in this notebook the vocab size and min frequency are only used to build path names
cross_encoder_vocab_size = 265
# passed to CrossEncoder as max_length
tokenizer_max_length = 32
tree_name_min_freq = 1000
# local and S3 locations of the RoBERTa model the cross-encoder is initialised from
roberta_dir = f"../data/models/roberta-{given_surname}-{cross_encoder_vocab_size}"
roberta_dir_s3 = f"s3://fs-nama-data/2024/nama-data/data/models/roberta-{given_surname}-{cross_encoder_vocab_size}/"
# triplets file (anchor, positive, negative names with scores); may be an s3:// URL or a local path
triplets_path=f"s3://fs-nama-data/2024/familysearch-names/processed/tree-hr-{given_surname}-triplets-{tree_name_min_freq}-augmented.csv.gz"

# local and S3 locations the trained cross-encoder is written to
cross_encoder_dir = f"../data/models/cross-encoder-{given_surname}-{cross_encoder_vocab_size}"
cross_encoder_dir_s3 = f"s3://fs-nama-data/2024/nama-data/data/models/cross-encoder-{given_surname}-{cross_encoder_vocab_size}/"

print(cross_encoder_dir)

../data/models/cross-encoder-given-265


In [4]:
# Make sure both local model directories exist. exist_ok=True replaces the
# check-then-create pattern, which could fail if the directory appeared between
# the existence check and the makedirs call.
for model_dir in (roberta_dir, cross_encoder_dir):
    os.makedirs(model_dir, exist_ok=True)

In [5]:
# Report whether CUDA is usable and, if so, the memory state of device 0.
torch.cuda.empty_cache()
cuda_available = torch.cuda.is_available()
print(cuda_available)
# The device queries below need a CUDA device, so they are skipped when none is available.
if cuda_available:
    print("cuda total", torch.cuda.get_device_properties(0).total_memory)
    print("cuda reserved", torch.cuda.memory_reserved(0))
    print("cuda allocated", torch.cuda.memory_allocated(0))

True
cuda total 8141471744
cuda reserved 0
cuda allocated 0


## Load data

In [None]:
# download roberta: fetch every model file that is not already present locally
roberta_filenames = (
    'vocab.json',
    'merges.txt',
    'config.json',
    'training_args.bin',
    'model.safetensors',
)
for filename in roberta_filenames:
    print(filename)
    local_path = os.path.join(roberta_dir, filename)
    if os.path.exists(local_path):
        # already downloaded on an earlier run
        continue
    download_file_from_s3(roberta_dir_s3+filename, local_path)

In [7]:
# load triplets: an s3:// URL is downloaded first, anything else is read as a local path
if triplets_path.startswith("s3://"):
    path = download_file_from_s3(triplets_path)
else:
    path = triplets_path
triplets_df = read_csv(path)
print(len(triplets_df))
triplets_df.head(30)

15486190


Unnamed: 0,anchor,positive,negative,positive_score,negative_score
0,frits,frith,lisetta,0.426568,0.0
1,penina,penna,devina,0.50662,0.410542
2,cherstin,kerstin,taft,0.622299,0.0
3,severin,severine,ludie,0.501471,0.0
4,carrington,corrington,thoribia,0.609195,0.0
5,irma,irman,albertha,0.59204,0.0
6,martini,martin,marlyn,0.543451,0.400991
7,werner,verner,patience,0.508241,0.0
8,alphonse,alpho,alphens,0.63427,0.408732
9,trijntje,rijntje,bryngel,0.436747,0.0


## Convert triplets into training examples

In [8]:
%%time
# Turn every triplet into up to two scored pairs: (anchor, positive) and (anchor, negative).
data = []
all_names = set()
# Only populated when each_pair_once is set; otherwise it is never consulted and
# filling it would just hold tens of millions of tuples in memory.
seen_pairs = set()
for anchor, pos, pos_score, neg, neg_score in tqdm(zip(
    triplets_df['anchor'],
    triplets_df['positive'],
    triplets_df['positive_score'],
    triplets_df['negative'],
    triplets_df['negative_score'],
), total=len(triplets_df)):
    # the positive pair is handled before the negative pair, as the dedup check depends on order
    for other, score in ((pos, pos_score), (neg, neg_score)):
        # identical-name pairs are skipped here when add_same_name is set
        if add_same_name and anchor == other:
            continue
        if each_pair_once:
            # a pair counts as seen regardless of the order of its two names
            if (anchor, other) in seen_pairs or (other, anchor) in seen_pairs:
                continue
            seen_pairs.add((anchor, other))
        data.append(InputExample(texts=[anchor, other], label=score))
    all_names.update((anchor, pos, neg))
len(data)

15486190it [02:15, 114307.80it/s]


28981013

In [9]:
len(all_names)

66979

In [10]:
# add name, name, 1.0
# Iterate in sorted order: a set of strings has no stable order across Python
# processes, which would change the order of `data` and therefore the outcome
# of any seeded split performed on it. key=str keeps the sort well-defined
# even if a non-string value slipped into the name columns.
if add_same_name:
    data.extend(
        InputExample(texts=[name, name], label=1.0)
        for name in sorted(all_names, key=str)
    )
len(data)

29047992

In [11]:
%%time
# Split the examples into train/dev/test, then mirror every training pair.
split_seed = 42
if train_all:
    dev_size = 0.01
    test_size = 0.01
else:
    dev_size = 0.1
    test_size = 0.1
# First carve off dev+test together, then divide that holdout between dev and test.
holdout_fraction = dev_size + test_size
raw_train_data, holdout_data = train_test_split(data, test_size=holdout_fraction, random_state=split_seed)
dev_data, test_data = train_test_split(holdout_data, test_size=(test_size / holdout_fraction), random_state=split_seed)
del holdout_data
# NOTE(review): the split is per example, not per pair. When each_pair_once is False the
# same pair can be emitted for several triplets, so copies may fall on both sides of the
# split - confirm this is acceptable for the reported test correlation.

# Add each training pair in both orders; identical-name pairs are added once.
train_data = []
for example in raw_train_data:
    name1, name2 = example.texts
    train_data.append(InputExample(texts=[name1, name2], label=example.label))
    if name1 != name2:
        train_data.append(InputExample(texts=[name2, name1], label=example.label))
del raw_train_data

# Seeded shuffle so the training order produced here is the same on every run.
random.Random(split_seed).shuffle(train_data)

print('train', len(train_data))
print('dev', len(dev_data))
print('test', len(test_data))

train 56868373
dev 290480
test 290480


## Train cross-encoder

In [12]:
# Serve train_data (a list of InputExample) in shuffled mini-batches
train_dataloader = DataLoader(train_data, batch_size=train_batch_size, shuffle=True)

# Correlation evaluator over the dev split, used to monitor performance during training
evaluator = CECorrelationEvaluator.from_input_examples(dev_data, name='dev')

# Warm-up lasts warmup_fraction of all training steps (batches per epoch * epochs)
total_train_steps = len(train_dataloader) * num_epochs
warmup_steps = math.ceil(total_train_steps * warmup_fraction)
print("Warmup-steps: {}".format(warmup_steps))

Warmup-steps: 355428


In [13]:
# Initialise the cross-encoder from the local RoBERTa directory with a single output
# (num_labels=1); max_length is taken from tokenizer_max_length in the config cell.
model = CrossEncoder(roberta_dir, num_labels=1, max_length=tokenizer_max_length)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ../data/models/roberta-given-265 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
%%time

# Train for num_epochs with the dev evaluator attached (evaluation_steps=200_000).
# output_path is the local cross-encoder directory; the upload cell later reads
# CECorrelationEvaluator_dev_results.csv from that same directory.
model.fit(train_dataloader=train_dataloader,
          evaluator=evaluator,
          epochs=num_epochs,
          warmup_steps=warmup_steps,
          evaluation_steps=200_000,
          show_progress_bar=True,
          output_path=cross_encoder_dir)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3554274 [00:00<?, ?it/s]

CPU times: user 1d 9h 11min 23s, sys: 2h 36min 28s, total: 1d 11h 47min 52s
Wall time: 1d 5h 56min 53s


## Save cross-encoder

In [16]:
# Write the model in its current (end-of-training) state to the local cross-encoder directory.
# NOTE(review): fit() was given the same directory as output_path - confirm whether this
# overwrites a best-on-dev checkpoint that fit() may have saved there.
model.save(cross_encoder_dir)

In [18]:
# upload to s3: copy the dev-evaluation results and the saved model files to the S3 model folder
cross_encoder_filenames = (
    'CECorrelationEvaluator_dev_results.csv',
    'config.json',
    'merges.txt',
    'special_tokens_map.json',
    'tokenizer_config.json',
    'tokenizer.json',
    'vocab.json',
    'model.safetensors',
)
for filename in cross_encoder_filenames:
    print(filename)
    local_path = os.path.join(cross_encoder_dir, filename)
    upload_file_to_s3(local_path, cross_encoder_dir_s3+filename)

CECorrelationEvaluator_dev_results.csv
config.json
merges.txt
special_tokens_map.json
tokenizer_config.json
tokenizer.json
vocab.json
model.safetensors


## Evaluate cross-encoder

In [19]:
# Reload the cross-encoder from the local directory so the evaluation below runs on the
# saved files rather than the in-memory training object; the last line displays the path used.
model = CrossEncoder(cross_encoder_dir, max_length=tokenizer_max_length)
cross_encoder_dir

'../data/models/cross-encoder-given-265'

In [20]:
%%time

# Build a correlation evaluator over the held-out test split (this rebinds `evaluator`,
# which previously held the dev evaluator) and apply it to the reloaded model;
# the value it returns is the cell output.
evaluator = CECorrelationEvaluator.from_input_examples(test_data, name='test')
evaluator(model)

CPU times: user 2min 17s, sys: 23.9 s, total: 2min 41s
Wall time: 1min 43s


np.float64(0.9566643846517128)

## Test predictions

### Sample

In [21]:
# Show predicted vs. labelled score for the non-zero-label pairs among the first 250 test examples.
# All pairs are scored in one predict call instead of one call per pair.
sample_examples = [example for example in test_data[:250] if example.label != 0.0]
# predict is skipped for an empty sample, which matches the original loop making no calls
sample_scores = model.predict([list(example.texts) for example in sample_examples]) if sample_examples else []
for example, score in zip(sample_examples, sample_scores):
    name1, name2 = example.texts
    print(name1, name2, score, example.label)

celanire celenire 0.88051033 0.8760655737704919
haseltine hasseltine 0.5421073 0.5912704598597038
lavon lavanna 0.40955743 0.4247191011235955
magarita margarita 0.78680915 0.8686695278969957
geraldine gerldine 0.95641774 0.9626445300919064
mintie miltie 0.41366017 0.4272232304900182
wilhelmine guilelminae 0.5805446 0.5665768194070081
aldon alden 0.5766243 0.5488126649076517
absolum absalom 0.7174611 0.5896516393442623
latitia lelitia 0.44683722 0.4159126365054603
rozalia rozi 0.9059107 0.9478287153652392
sylvester sylveste 0.95586234 0.9457954545454546
curtis custis 0.67141306 0.7178479048111743
mareke mareka 0.62239945 0.6006756756756757
patience patient 0.9510035 0.9388034188034188
catarine catharina 0.49673945 0.5015784114052954
marcelia marcella 0.54874957 0.5062755291496472
fred freddie 0.6308613 0.6115419018989663
pilar pear 0.4197967 0.403171552138395
novella nevella 0.6547166 0.6307692307692307
lulia lolia 0.41000655 0.4125797629899726
lois loi 0.9291004 0.921311475409836
arabe

### How many random pairs score above a low threshold?

In [22]:
def harmonic_mean(x,y):
    """Return the harmonic mean of two scores, 2 / (1/x + 1/y).

    A zero score yields 0.0 instead of raising ZeroDivisionError, which plain
    Python numbers would otherwise do. Non-zero inputs are computed exactly as
    before; any other division error (e.g. x == -y) is still raised.
    """
    try:
        return 2 / (1/x+1/y)
    except ZeroDivisionError:
        # the harmonic mean tends to 0 as either argument tends to 0
        if x == 0 or y == 0:
            return 0.0
        raise

In [23]:
%%time

# Pair the first name of each test example with the first name of the next example
# (up to 50000 pairs) and count how many such pairs score above a low threshold.
threshold = 0.41
cnt = 0
seen_pairs = set()
candidate_pairs = []
for i in range(0, min(len(test_data)-1, 50000)):
    name1 = test_data[i].texts[0]
    name2 = test_data[i+1].texts[0]
    # skip identical names and pairs already collected in either order
    if name1 == name2 or (name1,name2) in seen_pairs or (name2,name1) in seen_pairs:
        continue
    seen_pairs.add((name1,name2))
    candidate_pairs.append((name1, name2))

# Score every pair in both orders with a single predict call instead of one call per pair;
# the two orders of pair k sit at positions 2k and 2k+1.
model_inputs = []
for name1, name2 in candidate_pairs:
    model_inputs.append([name1, name2])
    model_inputs.append([name2, name1])
all_scores = model.predict(model_inputs) if model_inputs else []

for k, (name1, name2) in enumerate(candidate_pairs):
    scores = all_scores[2*k:2*k+2]
    if harmonic_mean(scores[0],scores[1]) > threshold:
        print(name1, name2, scores)
        cnt += 1
print(cnt)

arabella arrilla [0.40825588 0.41221616]
serafina seraphine [0.4223442 0.4305678]
friederike fredericka [0.5052289  0.48464355]
christina christophine [0.42018816 0.42238525]
juana johanna [0.4139102  0.42060548]
cleatus claes [0.41839203 0.41968575]
frederik frederic [0.41877 0.41783]
mattia lattie [0.41174743 0.41262403]
marian moriah [0.41334358 0.41407606]
katharine kathrine [0.44278458 0.4286129 ]
catharina catharinae [0.7289712  0.72761184]
minnie miner [0.49804702 0.49664605]
birgitte birthe [0.4144755  0.41168505]
dominick domingas [0.41899586 0.41814163]
kirstine christiane [0.41531363 0.41436496]
filipe philip [0.41795895 0.41812778]
margarita margia [0.41686842 0.41625157]
raimunda amanda [0.41033164 0.4171916 ]
leola lepha [0.41522574 0.41857493]
hessie jessamine [0.41048086 0.4112171 ]
lessie bessie [0.4146549  0.41526413]
sirena lorena [0.4107069  0.41029382]
magrietha margarethe [0.45232892 0.46344233]
lorraine lorance [0.41676113 0.41906658]
lilly lillar [0.4166477  0.4

### How many pairs score significantly differently than their label?

In [24]:
%%time

# Count test pairs (first 1000) whose symmetric score differs from the label by more than threshold.
threshold = 0.1
cnt = 0
checked_examples = test_data[:1000]
# Score both orders of every pair with a single predict call instead of one call per pair;
# the two orders of example k sit at positions 2k and 2k+1.
model_inputs = []
for example in checked_examples:
    name1, name2 = example.texts
    model_inputs.append([name1, name2])
    model_inputs.append([name2, name1])
all_scores = model.predict(model_inputs) if model_inputs else []

for k, example in enumerate(checked_examples):
    name1, name2 = example.texts
    # the pair's score is the harmonic mean of its two directional scores
    score = harmonic_mean(all_scores[2*k], all_scores[2*k+1])
    if abs(score - example.label) > threshold:
        print(name1, name2, score, example.label)
        cnt += 1
print(cnt)

absolum absalom 0.7231626 0.5896516393442623
lavinnia lavinna 0.53864753 0.4382405745062837
christian christiani 0.6152768 0.8080110497237569
damas damase 0.66388303 0.5552709359605912
marion mariona 0.44165045 0.5434094903339192
sarena serena 0.7259096 0.6035149384885765
winfred winnfred 0.5817621 0.4440065681444992
elonzo alonzo 0.68828225 0.5675177898702386
wille willie 0.7405068 0.8575582852845325
delrefugio derefugio 0.7361068 0.8541893362350381
consolacion consuelo 1.7504643e-05 0.4578348035284683
deltransito deltrancito 0.76761097 0.9201817695803262
zachris zacharias 0.64121205 0.5117465224111283
13
CPU times: user 7.71 s, sys: 701 ms, total: 8.41 s
Wall time: 7.18 s


### How many positive pairs score negatively, and how many negative pairs score positively?

In [25]:
%%time

# Count test pairs (first 1000) that are clearly labelled on one side of 0.5
# (by at least `threshold`) but are scored on the other side.
threshold = 0.1
cnt = 0
checked_examples = test_data[:1000]
# Score both orders of every pair with a single predict call instead of one call per pair;
# the two orders of example k sit at positions 2k and 2k+1.
model_inputs = []
for example in checked_examples:
    name1, name2 = example.texts
    model_inputs.append([name1, name2])
    model_inputs.append([name2, name1])
all_scores = model.predict(model_inputs) if model_inputs else []

for k, example in enumerate(checked_examples):
    name1, name2 = example.texts
    score = harmonic_mean(all_scores[2*k], all_scores[2*k+1])
    missed_positive = example.label >= 0.5+threshold and score < 0.5
    false_positive = example.label < 0.5-threshold and score >= 0.5
    if missed_positive or false_positive:
        # '***' marks a clearly-positive pair that the model scored below 0.5
        print(name1, name2, score, example.label, '***' if missed_positive else '')
        cnt += 1
print(cnt)


0
CPU times: user 7.36 s, sys: 656 ms, total: 8.02 s
Wall time: 6.82 s


### How many names don't score highly against themselves?

In [26]:
%%time

# Count names from the first 1000 test examples that score below threshold against themselves.
threshold = 0.75
cnt = 0
seen_names = set()
# Unique names in first-seen order (dict keys keep insertion order), so each name is
# scored once rather than on every occurrence, and output order is unchanged.
unique_names = list(dict.fromkeys(
    name for example in test_data[:1000] for name in example.texts
))
self_scores = model.predict([[name, name] for name in unique_names]) if unique_names else []
for name, score in zip(unique_names, self_scores):
    if score < threshold:
        print(name, score)
        cnt += 1
        # as before, seen_names ends up holding the names that were reported
        seen_names.add(name)
print(cnt)

0
CPU times: user 7.64 s, sys: 712 ms, total: 8.35 s
Wall time: 7.15 s
