In [None]:
# Enable IPython's autoreload extension in mode 2 so that edited project modules
# (e.g. src.data.utils) are re-imported automatically before each cell executes.
%load_ext autoreload
%autoreload 2

# Train cross encoder
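Train a cross-encoder on the triplets from notebook 200, using the RoBERTa model from notebook 221.

The table below records results for different configurations. `corr` appears to be the correlation reported by the evaluator; the remaining columns appear to correspond, in order, to the counts printed by the four checks under "Test predictions" (random pairs above a threshold, score/label differences, positive/negative confusions, and names that score low against themselves).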


| vocab | notes                           | corr | random / mean | difference | pos-neg / mean | name-name |
| ----- | ------------------------------- | ---- | ------ | ---------- | ------- | --------- |
| 261   |                                 | 0.931| 56(41) |    183     |  36(3)  |   766     |
| 261   | 2 epochs                        | 0.964| **28(15)** |     58     |  17(2)  |   802     |
| 261   | each pair once, 6 epochs        | 0.851| 43(25) |    384     | 115(94) |   900     |
| 261   | add same-name                   | 0.937| 43(28) |    165     |  32(2)  |   **149**     |
| 265   |                                 | 0.941| 48(30) / **30(16)** |    156     |  29(0) / **21(0)**  |   750     |
| 265   | 2 epochs                        | 0.960| 41(24) |     71     |  21(1)  |   816     |
| 265   | add same-name          | 0.942| 47(27) / **24(14)** |    141  |  24(1) / **22(0)** |  **394**      |
| 300   | add same-name                   | 0.948| 43(30) |    104     |  23(3)  |   **370**     |
| 400   |                                 | 0.951| 263(??)|     88     |  12(1)  |   892     |
| 265   | new-triplets, add same-name     | 0.897| >0.4 / **57(12)**     | **40**  |  / **7(1)** |  **14**   |
| 265   | all, .38-triplets, add same-name| 0.901| >0.4 / **75(19)**     | **19**  |  / **4(0)** |  **5**   |
| 265   | all, .40-triplets, add same-name| 0.947|>0.4 / 220(49) >0.41 / 146(28)| **9**   |  / **0**    |  **5** |

In [None]:
from collections import defaultdict
import math
import os
import random
import re

import pandas as pd
from sentence_transformers import InputExample
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer

from src.data.utils import read_csv

In [None]:
# Which kind of name this run works on; it is interpolated into every path below.
given_surname = 'surname'

# --- training hyper-parameters ---
num_epochs = 1
train_batch_size = 16
warmup_fraction = 0.1
train_all = True

# NOTE: the next two flags change which examples end up in the test data.
# When comparing models built with different flag values, re-generate the
# test data with matching flag values first.
each_pair_once = False
add_same_name = True

# --- model / data locations ---
vocab_size = 265
tokenizer_max_length = 32
roberta_dir = f"../data/models/roberta-{given_surname}-10m-{vocab_size}"
triplets_path = f"../data/processed/tree-hr-{given_surname}-triplets-v2-1000-augmented.csv.gz"

# The output directory name encodes the run configuration: the epoch count
# (only when > 1), the vocab size, and one suffix per enabled flag.
epochs_tag = str(num_epochs) if num_epochs > 1 else ""
flag_tags = "".join(
    tag
    for enabled, tag in (
        (each_pair_once, "-once"),
        (add_same_name, "-same"),
        (train_all, "-all"),
    )
    if enabled
)
cross_encoder_dir = f"../data/models/cross-encoder-{given_surname}-10m{epochs_tag}-{vocab_size}{flag_tags}"

print(cross_encoder_dir)

In [None]:
# Create the output directory for the trained model (including any missing parents).
# exist_ok=True makes this a no-op when the directory already exists and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(cross_encoder_dir, exist_ok=True)

In [None]:
# Release cached GPU memory held by this process, then report the CUDA status.
torch.cuda.empty_cache()
cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    # Memory figures for device 0. These queries are only made when CUDA is present:
    # get_device_properties() raises on a machine without a usable GPU.
    print("cuda total", torch.cuda.get_device_properties(0).total_memory)
    print("cuda reserved", torch.cuda.memory_reserved(0))
    print("cuda allocated", torch.cuda.memory_allocated(0))
else:
    print("CUDA is not available; skipping GPU memory report")

## Load triplets

In [None]:
# Read the triplets file at triplets_path with the project's read_csv helper,
# print how many rows were loaded, and display the first 30 rows for a sanity check.
# The conversion cell below reads the columns anchor, positive, positive_score,
# negative and negative_score from this frame.
triplets_df = read_csv(triplets_path)
print(len(triplets_df))
triplets_df.head(30)

## Convert triplets into training examples

In [None]:
# Turn every triplet into up to two scored pairs: (anchor, positive) and (anchor, negative).
# Along the way, collect every name seen so same-name pairs can be added afterwards.
data = []
all_names = set()
seen_pairs = set()
triplet_columns = ['anchor', 'positive', 'positive_score', 'negative', 'negative_score']
for anchor, pos, pos_score, neg, neg_score in tqdm(zip(*(triplets_df[column] for column in triplet_columns))):
    for other, other_score in ((pos, pos_score), (neg, neg_score)):
        # identical-name pairs are skipped here when add_same_name is set
        if add_same_name and anchor == other:
            continue
        # with each_pair_once, skip a pair already emitted in either order
        if each_pair_once and ((anchor, other) in seen_pairs or (other, anchor) in seen_pairs):
            continue
        data.append(InputExample(texts=[anchor, other], label=other_score))
        seen_pairs.add((anchor, other))
    all_names.update((anchor, pos, neg))
len(data)

In [None]:
# Number of distinct names (anchors, positives and negatives) collected from the triplets
len(all_names)

In [None]:
# add name, name, 1.0
if add_same_name:
    # Iterate in sorted order rather than raw set order: the iteration order of a set
    # of strings changes between interpreter runs (hash randomisation), which would
    # reorder `data` and make the random_state-seeded train/dev/test split differ
    # from one kernel restart to the next.
    data.extend(InputExample(texts=[name, name], label=1.0) for name in sorted(all_names))
len(data)

In [None]:
# Hold out only 1% + 1% for dev/test when training on (almost) all data, else 10% + 10%.
dev_size, test_size = (0.01, 0.01) if train_all else (0.1, 0.1)
holdout_fraction = dev_size+test_size

# First split off the combined hold-out set, then divide it into dev and test.
raw_train_data, holdout_data = train_test_split(data, test_size=holdout_fraction, random_state=42)
dev_data, test_data = train_test_split(holdout_data, test_size=(test_size / holdout_fraction), random_state=42)

# Make the training set symmetric: every pair of distinct names is added in both orders.
train_data = []
for example in raw_train_data:
    first, second = example.texts
    train_data.append(InputExample(texts=[first, second], label=example.label))
    if first == second:
        continue
    train_data.append(InputExample(texts=[second, first], label=example.label))
del raw_train_data

random.shuffle(train_data)

for split_name, split_examples in (('train', train_data), ('dev', dev_data), ('test', test_data)):
    print(split_name, len(split_examples))

## Train cross-encoder

In [None]:
# Batch the training examples (a List[InputExample]) with a shuffling pytorch DataLoader
train_dataloader = DataLoader(train_data, batch_size=train_batch_size, shuffle=True)

# Evaluator over the dev split, passed to model.fit to monitor performance during training
evaluator = CECorrelationEvaluator.from_input_examples(dev_data, name='dev')

# Warm-up covers warmup_fraction of all training steps (batches per epoch * epochs), rounded up
total_train_steps = len(train_dataloader) * num_epochs
warmup_steps = math.ceil(total_train_steps * warmup_fraction)
print("Warmup-steps: {}".format(warmup_steps))

In [None]:
# Build a single-output (num_labels=1) cross-encoder from the model saved in roberta_dir,
# passing tokenizer_max_length as the maximum input length.
model = CrossEncoder(roberta_dir, num_labels=1, max_length=tokenizer_max_length)

In [None]:
%%time

# Train for num_epochs with the warm-up computed above, writing output to cross_encoder_dir.
# evaluation_steps=200_000 is passed as the interval for running the dev evaluator
# (see the sentence-transformers CrossEncoder.fit documentation for exact semantics).
model.fit(train_dataloader=train_dataloader,
          evaluator=evaluator,
          epochs=num_epochs,
          warmup_steps=warmup_steps,
          evaluation_steps=200_000,
          show_progress_bar=True,
          output_path=cross_encoder_dir)

In [None]:
# Explicitly save the model as it stands after training to cross_encoder_dir
model.save(cross_encoder_dir)

## Evaluate cross-encoder

In [None]:
# Reload the cross-encoder from cross_encoder_dir so the evaluation below runs against
# what was written to disk; the trailing expression displays which directory was loaded.
model = CrossEncoder(cross_encoder_dir, max_length=tokenizer_max_length)
cross_encoder_dir

In [None]:
%%time

# Build a correlation evaluator from the held-out test examples and run it on the model;
# the value it returns is displayed as the cell output.
evaluator = CECorrelationEvaluator.from_input_examples(test_data, name='test')
evaluator(model)

## Test predictions

### Sample

In [None]:
# Print predicted score next to the label for the first 250 test examples,
# leaving out the pairs whose label is exactly 0.0.
for sample in test_data[:250]:
    if sample.label != 0.0:
        left, right = sample.texts
        predicted = model.predict([[left, right]])[0]
        print(left, right, predicted, sample.label)

### How many random pairs score above a low threshold?

In [None]:
def harmonic_mean(x,y):
    """Return the harmonic mean of two scores, 2 / (1/x + 1/y).

    Intended for non-negative similarity scores. The result is dominated by the
    smaller of the two values, so a pair only scores high when both directions do.

    If either score is exactly zero the harmonic mean is defined here as 0.0,
    instead of raising ZeroDivisionError (plain floats) or going through an
    infinite intermediate value (numpy floats).
    """
    if x == 0 or y == 0:
        return 0.0
    return 2 / (1/x+1/y)

In [None]:
%%time

# Pair the first name of each test example with the first name of the next example
# (an effectively random pairing), score each new pair in both orders, and count how
# many have a harmonic-mean score above the threshold. At most 50000 pairs are tried.
threshold = 0.41
cnt = 0
seen_pairs = set()
max_pairs = min(len(test_data)-1, 50000)
for current, following in zip(test_data[:max_pairs], test_data[1:max_pairs + 1]):
    name1 = current.texts[0]
    name2 = following.texts[0]
    pair = (name1, name2)
    # skip identical names and pairs already scored in either order
    already_scored = pair in seen_pairs or (name2, name1) in seen_pairs
    if name1 == name2 or already_scored:
        continue
    scores = model.predict([[name1, name2],[name2, name1]])
    if harmonic_mean(scores[0],scores[1]) > threshold:
        print(name1, name2, scores)
        cnt += 1
    seen_pairs.add(pair)
print(cnt)

### How many pairs score significantly differently than their label?

In [None]:
%%time

# Over the first 1000 test examples, count pairs whose symmetric score (harmonic mean
# of both orderings) is more than `threshold` away from the label.
threshold = 0.1
cnt = 0
for example in test_data[:1000]:
    name1, name2 = example.texts
    forward_score, backward_score = model.predict([[name1, name2],[name2, name1]])
    score = harmonic_mean(forward_score, backward_score)
    deviation = abs(score - example.label)
    if deviation > threshold:
        print(name1, name2, score, example.label)
        cnt += 1
print(cnt)

### How many positive pairs score negatively, and how many negative pairs score positively?

In [None]:
%%time

# Over the first 1000 test examples, count confusions across the 0.5 boundary:
# clearly positive labels (>= 0.5 + threshold) scored below 0.5, and clearly
# negative labels (< 0.5 - threshold) scored at or above 0.5.
# Missed positives are flagged with '***' in the printout.
threshold = 0.1
cnt = 0
for example in test_data[:1000]:
    name1, name2 = example.texts
    forward_score, backward_score = model.predict([[name1, name2],[name2, name1]])
    score = harmonic_mean(forward_score, backward_score)
    missed_positive = example.label >= 0.5+threshold and score < 0.5
    false_positive = example.label < 0.5-threshold and score >= 0.5
    if missed_positive or false_positive:
        print(name1, name2, score, example.label, '***' if missed_positive else '')
        cnt += 1
print(cnt)


### How many names don't score highly against themselves?

In [None]:
%%time

# Over the names in the first 1000 test examples, count distinct names whose score
# against themselves falls below the threshold. Each such name is reported once.
threshold = 0.75
cnt = 0
seen_names = set()
for example in test_data[:1000]:
    name1, name2 = example.texts
    self_scores = model.predict([[name1, name1],[name2, name2]])
    for name, self_score in zip((name1, name2), self_scores):
        if name not in seen_names and self_score < threshold:
            print(name, self_score)
            cnt += 1
            seen_names.add(name)
print(cnt)