<a href="https://colab.research.google.com/github/stellaevat/ontology-mapping/blob/main/colabs/bi_encoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers[torch] datasets evaluate

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
# CUDA-related environment switches. They are set in their own cell, before the
# cell that imports torch, so they are already in the environment at import time.
# NOTE(review): presumably TORCH_USE_CUDA_DSA / CUDA_LAUNCH_BLOCKING are here to get
# usable device-side error reports, and CUBLAS_WORKSPACE_CONFIG to satisfy the
# deterministic-algorithms mode enabled in full_determinism() — confirm against the
# PyTorch reproducibility notes.
os.environ['TORCH_USE_CUDA_DSA'] = "1"
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ":4096:8"

In [4]:
import gc
import random
import itertools
import evaluate
import torch
import numpy as np
from pprint import pprint
from tqdm import tqdm
from sklearn.model_selection import train_test_split, ParameterGrid
from datasets import Dataset, DatasetDict
from transformers import get_scheduler, AutoModel
from transformers.modeling_outputs import SequenceClassifierOutput
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, TrainingArguments, Trainer, EarlyStoppingCallback

# Collate input dataset

In [5]:
def read_bi_tokenized_from_file(filepath):
  """Read pre-tokenized (source, target, label) rows for the bi-encoder.

  Each non-blank line is expected to hold six bracketed, comma-separated integer
  lists (source input ids, token type ids, attention mask, then the same three
  for the target) followed by an integer label, with fields separated by "],".

  Args:
    filepath: Path of the text file to read.

  Returns:
    A tuple (source_tokenized, target_tokenized, labels). The first two are
    dicts with keys "input_ids", "token_type_ids" and "attention_mask", each
    mapping to one list per row; labels is a list of ints.

  Raises:
    ValueError: If a non-blank line does not contain exactly six lists plus a
      label, or if a value is not an integer.
  """
  columns = [[] for _ in range(6)]
  labels = []
  with open(filepath) as f:
    for line_number, line in enumerate(f, start=1):
      line = line.strip()
      # Blank lines (e.g. a trailing newline at the end of the file) carry no row.
      if not line:
        continue

      *encoded, label = line.split("],")
      fields = [list(map(int, s.strip("[]").split(","))) for s in encoded]
      if len(fields) != len(columns):
        raise ValueError(f"{filepath}, line {line_number}: expected {len(columns)} token lists, found {len(fields)}")

      for column, field in zip(columns, fields):
        column.append(field)
      labels.append(int(label))

  s_input_ids, s_token_type_ids, s_attention_mask, t_input_ids, t_token_type_ids, t_attention_mask = columns
  source_tokenized = {"input_ids" : s_input_ids,
                      "token_type_ids" : s_token_type_ids,
                      "attention_mask" : s_attention_mask}
  target_tokenized = {"input_ids" : t_input_ids,
                      "token_type_ids" : t_token_type_ids,
                      "attention_mask" : t_attention_mask}
  return source_tokenized, target_tokenized, labels

def read_cross_tokenized_from_file(filepath):
  """Read pre-tokenized (pair, label) rows for the cross-encoder.

  Each non-blank line is expected to hold three bracketed, comma-separated
  integer lists (input ids, token type ids, attention mask) followed by an
  integer label, with fields separated by "],".

  Args:
    filepath: Path of the text file to read.

  Returns:
    A tuple (tokenized, labels) where tokenized is a dict with keys "input_ids",
    "token_type_ids" and "attention_mask" (one list per row) and labels is a
    list of ints.

  Raises:
    ValueError: If a non-blank line does not contain exactly three lists plus a
      label, or if a value is not an integer.
  """
  input_ids, token_type_ids, attention_mask, labels = [], [], [], []
  with open(filepath) as f:
    for line_number, line in enumerate(f, start=1):
      line = line.strip()
      # Blank lines (e.g. a trailing newline at the end of the file) carry no row.
      if not line:
        continue

      *encoded, label = line.split("],")
      fields = [list(map(int, s.strip("[]").split(","))) for s in encoded]
      if len(fields) != 3:
        raise ValueError(f"{filepath}, line {line_number}: expected 3 token lists, found {len(fields)}")
      input_id, token_type_id, attention = fields

      input_ids.append(input_id)
      token_type_ids.append(token_type_id)
      attention_mask.append(attention)
      labels.append(int(label))

  tokenized = {"input_ids" : input_ids,
               "token_type_ids" : token_type_ids,
               "attention_mask" : attention_mask}
  return tokenized, labels

def read_onto_tokenized_from_file(filepath):
  """Read pre-tokenized ontology entries (no labels).

  Each non-blank line is expected to hold three bracketed, comma-separated
  integer lists (input ids, token type ids, attention mask) separated by "],".

  Args:
    filepath: Path of the text file to read.

  Returns:
    A dict with keys "input_ids", "token_type_ids" and "attention_mask", each
    mapping to one list per row.

  Raises:
    ValueError: If a non-blank line does not contain exactly three lists, or if
      a value is not an integer.
  """
  input_ids, token_type_ids, attention_mask = [], [], []
  with open(filepath) as f:
    for line_number, line in enumerate(f, start=1):
      line = line.strip()
      # Blank lines (e.g. a trailing newline at the end of the file) carry no row.
      if not line:
        continue

      fields = [list(map(int, s.strip("[]").split(","))) for s in line.split("],")]
      if len(fields) != 3:
        raise ValueError(f"{filepath}, line {line_number}: expected 3 token lists, found {len(fields)}")
      input_id, token_type_id, attention = fields

      input_ids.append(input_id)
      token_type_ids.append(token_type_id)
      attention_mask.append(attention)

  tokenized = {"input_ids" : input_ids,
               "token_type_ids" : token_type_ids,
               "attention_mask" : attention_mask}
  return tokenized

In [6]:
def filter_source_target(Xi, source, target):
  """Pick the rows listed in Xi out of every column of `source` and `target`.

  Args:
    Xi: Row positions to keep, in the order they should appear in the result.
    source: Dict mapping a column name to a row-indexable sequence.
    target: Dict with the same layout as `source`.

  Returns:
    A tuple (X_source, X_target) of dicts with the same keys as the inputs,
    each column reduced to the rows in Xi.
  """
  def take_rows(columns):
    return {name : [values[i] for i in Xi] for (name, values) in columns.items()}

  return take_rows(source), take_rows(target)


def collate_dataset(X_train, X_val, X_test, y_train, y_val, y_test, Xi_train, Xi_val, Xi_test, label="labels"):
  """Bundle the three splits into a torch-formatted DatasetDict.

  Args:
    X_train, X_val, X_test: Per-split dicts of feature columns.
    y_train, y_val, y_test: Per-split label sequences.
    Xi_train, Xi_val, Xi_test: Per-split original row positions, stored in an
      "indices" column.
    label: Name of the column the labels are stored under.

  Returns:
    A DatasetDict with the splits 'train', 'val' and 'test'.
  """
  splits = {
      'train' : (X_train, y_train, Xi_train),
      'val' : (X_val, y_val, Xi_val),
      'test' : (X_test, y_test, Xi_test),
  }
  dataset = DatasetDict({
      name : Dataset.from_dict(features | {label : targets, "indices" : rows})
      for (name, (features, targets, rows)) in splits.items()
  })
  dataset.set_format(type="torch")
  return dataset


def collate_ontology_dataset(onto_tokenized, purpose="test"):
  """Wrap tokenized ontology entries in a single-split, torch-formatted DatasetDict.

  Prints the number of entries and adds an "indices" column numbering the rows
  from 0.

  Args:
    onto_tokenized: Dict of token columns; must contain "input_ids".
    purpose: Name of the only split in the returned DatasetDict.

  Returns:
    A DatasetDict holding one split named `purpose`.
  """
  size = len(onto_tokenized["input_ids"])
  print(f"Ontology size: {size}")

  columns = onto_tokenized | {"indices" : list(range(size))}
  dataset = DatasetDict({purpose : Dataset.from_dict(columns)})
  dataset.set_format(type="torch")
  return dataset


def get_bi_datasets_from_tokenized(source_tokenized, target_tokenized, labels):
  """Split tokenized source/target pairs into train/val/test datasets.

  Rows are split 80/20 into (train+val)/test and the first part again 75/25
  into train/val, both with random_state=3. Split sizes are printed.

  Args:
    source_tokenized: Dict of token columns for the source side.
    target_tokenized: Dict of token columns for the target side.
    labels: One label per row.

  Returns:
    A tuple (source_data, target_data) of DatasetDicts that share the same row
    assignment, labels and "indices" column.
  """
  row_ids = np.arange(len(labels))
  targets = np.array(labels)
  Xi_train_val, Xi_test, y_train_val, y_test = train_test_split(row_ids, targets, test_size=0.2, random_state=3)
  Xi_train, Xi_val, y_train, y_val = train_test_split(Xi_train_val, y_train_val, test_size=0.25, random_state=3)

  # One (source, target) pair of column dicts per split, in train/val/test order.
  per_split = [filter_source_target(rows, source_tokenized, target_tokenized) for rows in (Xi_train, Xi_val, Xi_test)]
  source_splits = [source for (source, _) in per_split]
  target_splits = [target for (_, target) in per_split]

  for (title, split_labels) in (("Train", y_train), ("Validation", y_val), ("Test", y_test)):
    print(f"{title}: {len(split_labels)}")

  shared = (y_train, y_val, y_test, Xi_train, Xi_val, Xi_test)
  source_data = collate_dataset(*source_splits, *shared)
  target_data = collate_dataset(*target_splits, *shared)
  return source_data, target_data

def get_cross_dataset_from_tokenized(tokenized, labels):
  """Split tokenized cross-encoder pairs into train/val/test datasets.

  Rows are split 80/20 into (train+val)/test and the first part again 75/25
  into train/val, both with random_state=3. Split sizes are printed.

  Args:
    tokenized: Dict of token columns.
    labels: One label per row.

  Returns:
    A DatasetDict with splits 'train', 'val' and 'test'; labels are stored in a
    column named "label".
  """
  def take_rows(rows):
    return {name : [values[i] for i in rows] for (name, values) in tokenized.items()}

  row_ids = np.arange(len(labels))
  targets = np.array(labels)
  Xi_train_val, Xi_test, y_train_val, y_test = train_test_split(row_ids, targets, test_size=0.2, random_state=3)
  Xi_train, Xi_val, y_train, y_val = train_test_split(Xi_train_val, y_train_val, test_size=0.25, random_state=3)

  X_train, X_val, X_test = take_rows(Xi_train), take_rows(Xi_val), take_rows(Xi_test)

  for (title, split_labels) in (("Train", y_train), ("Validation", y_val), ("Test", y_test)):
    print(f"{title}: {len(split_labels)}")

  return collate_dataset(X_train, X_val, X_test, y_train, y_val, y_test, Xi_train, Xi_val, Xi_test, label="label")

# Bi-encoder

## Implementation

In [7]:
def full_determinism(seed):
  """Seed every random number generator in use and turn on deterministic torch kernels.

  Args:
    seed: Integer seed passed to Python's `random`, NumPy, and torch (CPU and
      all CUDA devices).
  """
  for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed)

  torch.use_deterministic_algorithms(True)
  torch.backends.cudnn.deterministic = True

In [8]:
class BiEncoderForSequenceClassification(torch.nn.Module):
  """Two-encoder matcher: one pretrained encoder per side, scored by a dot product.

  The source and target inputs are encoded by separate copies of the same
  pretrained checkpoint. The logit for a pair is the dot product of the two
  (dropout-regularised) pooled vectors, trained with BCEWithLogitsLoss.
  """

  def __init__(self, model_name, num_labels, id2label=None, label2id=None, token_embeddings_size=None, hidden_layer=-1):
    """Load both encoders and create the dropout / loss modules.

    Args:
      model_name: Checkpoint name or path handed to AutoModel.from_pretrained.
      num_labels: Stored on the instance; not otherwise used by this class.
      id2label, label2id, hidden_layer: Accepted but unused by this class.
      token_embeddings_size: If truthy, both encoders' token embeddings are
        resized to this many entries.
    """
    super().__init__()
    self.source_model = AutoModel.from_pretrained(model_name)
    self.target_model = AutoModel.from_pretrained(model_name)
    if token_embeddings_size:
      for encoder in (self.source_model, self.target_model):
        encoder.resize_token_embeddings(token_embeddings_size)

    self.num_labels = num_labels
    self.dropout = torch.nn.Dropout(0.1)
    self.similarity = torch.nn.CosineSimilarity(dim=-1)
    self.loss = torch.nn.BCEWithLogitsLoss()

  def forward(
      self,
      s_input_ids=None, t_input_ids=None,
      s_attention_mask=None, t_attention_mask=None,
      s_token_type_ids=None, t_token_type_ids=None,
      s_position_ids=None, t_position_ids=None,
      s_head_mask=None, t_head_mask=None,
      s_inputs_embeds=None, t_inputs_embeds=None,
      labels=None
    ):
    """Score each (source, target) pair.

    Arguments prefixed with `s_` go to the source encoder, those prefixed with
    `t_` to the target encoder.

    Returns:
      SequenceClassifierOutput whose `logits` hold one dot-product score per
      pair and whose `loss` is the BCE-with-logits loss against `labels`, or
      None when no labels are given.
    """
    towers = (
      (self.source_model, s_input_ids, {
        "attention_mask" : s_attention_mask,
        "token_type_ids" : s_token_type_ids,
        "position_ids" : s_position_ids,
        "head_mask" : s_head_mask,
        "inputs_embeds" : s_inputs_embeds,
      }),
      (self.target_model, t_input_ids, {
        "attention_mask" : t_attention_mask,
        "token_type_ids" : t_token_type_ids,
        "position_ids" : t_position_ids,
        "head_mask" : t_head_mask,
        "inputs_embeds" : t_inputs_embeds,
      }),
    )
    # Run both encoders before either dropout, source side first.
    source_outputs, target_outputs = [encoder(ids, **extra) for (encoder, ids, extra) in towers]

    # Element 1 of the encoder output is used as the pooled sentence vector.
    source_vector = self.dropout(source_outputs[1])
    target_vector = self.dropout(target_outputs[1])

    # Dot product of source and corresponding target embeddings
    logits = (source_vector * target_vector).sum(dim=-1)

    loss = self.loss(logits.view(-1), labels.view(-1).float()) if labels is not None else None
    return SequenceClassifierOutput(loss=loss, logits=logits)

  def _pooled_without_grad(self, encoder, input_ids, token_type_ids, attention_mask):
    """Run `encoder` under torch.no_grad() and return element 1 of its output."""
    with torch.no_grad():
      outputs = encoder(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids
      )
    return outputs[1]

  def get_source_model_outputs(self, input_ids=None, token_type_ids=None, attention_mask=None):
    """Return the source encoder's pooled vectors, computed without gradients (no dropout applied here)."""
    return self._pooled_without_grad(self.source_model, input_ids, token_type_ids, attention_mask)

  def get_target_model_outputs(self, input_ids=None, token_type_ids=None, attention_mask=None):
    """Return the target encoder's pooled vectors, computed without gradients (no dropout applied here)."""
    return self._pooled_without_grad(self.target_model, input_ids, token_type_ids, attention_mask)

In [9]:
class EarlyStopper:
  """Track the best validation loss and decide when training should stop.

  The stopper also remembers the epoch, metrics, and model/optimizer state that
  belong to the best loss so they can be restored or written to disk.
  """

  def __init__(self, learning_rate, batch_size, patience=1, delta=0):
    """
    Args:
      learning_rate: Recorded alongside results; not used for any decision.
      batch_size: Recorded alongside results; not used for any decision.
      patience: Number of non-improving checks after which early_stop() returns True.
      delta: A check only counts as non-improving when the loss is at least
        `min_loss + delta`.
    """
    self.patience = patience
    self.delta = delta
    self.counter = 0
    self.min_loss = float('inf')

    self.best_epoch = 0
    self.best_metrics = {}
    self.best_model_state = None
    self.best_optimizer_state = None

    self.learning_rate = learning_rate
    self.batch_size = batch_size

  def early_stop(self, loss, epoch, model_state, optimizer_state, metrics):
    """Record one evaluation result and report whether to stop.

    Args:
      loss: Validation loss of this check.
      epoch: Epoch number of this check.
      model_state: Model state to remember if this is the best check so far.
      optimizer_state: Optimizer state to remember if this is the best check so far.
      metrics: Mapping of metric names to values for this check.

    Returns:
      True once `patience` non-improving checks have accumulated since the
      last improvement, otherwise False.
    """
    import copy  # local import keeps this class self-contained

    if loss < self.min_loss:
      self.min_loss = loss
      self.counter = 0

      self.best_epoch = epoch
      self.best_metrics = metrics
      # Snapshot the states. Callers typically pass model.state_dict() and
      # optimizer.state_dict(), which share storage with the live objects;
      # keeping only the reference would let later training steps overwrite
      # the "best" state.
      self.best_model_state = copy.deepcopy(model_state)
      self.best_optimizer_state = copy.deepcopy(optimizer_state)

    elif loss >= (self.min_loss + self.delta):
      self.counter += 1
      if self.counter >= self.patience:
        return True
    return False

  def save_best_performance(self, filepath):
    """Print and append one CSV line describing the best check to `filepath`.

    Column order: batch size, learning rate, best epoch, best loss, accuracy,
    precision, recall, f1. Missing metrics are written as "None".
    """
    performance = f"{self.batch_size},{self.learning_rate},{self.best_epoch},{self.min_loss},{self.best_metrics.get('accuracy')},{self.best_metrics.get('precision')},{self.best_metrics.get('recall')},{self.best_metrics.get('f1')}\n"
    print(performance)
    with open(filepath, 'a') as f:
      f.write(performance)

  def save_best_checkpoint(self, filepath):
    """Save the best model/optimizer state and its summary values with torch.save."""
    torch.save({
      'model_state_dict' : self.best_model_state,
      'optimizer_state_dict' : self.best_optimizer_state,
      'batch_size' : self.batch_size,
      'learning_rate': self.learning_rate,
      'epoch' : self.best_epoch,
      'loss' : self.min_loss,
      'accuracy' : self.best_metrics.get('accuracy'),
      'precision' : self.best_metrics.get('precision'),
      'recall' : self.best_metrics.get('recall'),
      'f1' : self.best_metrics.get('f1'),
      }, filepath)

  def restore_model_state(self):
    """Return the remembered best model state (None if nothing was recorded yet)."""
    return self.best_model_state

In [10]:
def show_results(epoch, loss, metrics):
  """Print an epoch banner, the training loss, and the pretty-printed metrics."""
  print(f"\n\nEPOCH {epoch}\n\nTraining loss: {loss}")
  pprint(metrics)
  print()

def evaluate_biencoder(model, source_data, target_data, batch_size=32, testing=False, verbose=False, threshold=0.5):
  """Evaluate the bi-encoder on the validation or test split.

  Args:
    model: Bi-encoder whose forward pass accepts `s_`/`t_`-prefixed inputs plus
      `labels` and returns an object with `loss` and `logits`.
    source_data: DatasetDict with the source-side splits.
    target_data: DatasetDict with the target-side splits (same row order).
    batch_size: Number of pairs scored per forward pass.
    testing: Evaluate the "test" split when True, otherwise "val".
    verbose: Pretty-print the resulting metrics when True.
    threshold: Probability at or above which a pair is predicted positive.

  Returns:
    A dict with the size-weighted average loss (key "Test loss" or
    "Validation loss"), accuracy, and macro precision/recall/f1. When
    `testing` is True, a tuple of that dict and a list of
    (original row index, prediction) pairs.
  """
  split = "test" if testing else "val"
  eval_dataloader_index = DataLoader(Dataset.from_dict({'index' : range(len(source_data[split]))}), batch_size=batch_size)
  metrics = [evaluate.load('accuracy'), evaluate.load('precision'), evaluate.load('recall'), evaluate.load('f1')]
  all_indices, all_predictions = [], []

  model.eval()
  total_loss = 0
  for batch in eval_dataloader_index:
    batch_index = list(batch["index"])
    source_batch = source_data[split][batch_index]
    target_batch = target_data[split][batch_index]
    labels = source_batch["labels"]
    indices = source_batch["indices"]

    source_batch = {"s_" + k: v.to(device) for (k, v) in source_batch.items() if k not in ["labels", "indices"]}
    target_batch = {"t_" + k: v.to(device) for (k, v) in target_batch.items() if k not in ["labels", "indices"]}
    params = source_batch | target_batch
    params["labels"] = labels.to(device)

    with torch.no_grad():
      outputs = model(**params)

    # Flatten instead of squeeze so a final batch of one pair stays 1-D.
    logits = outputs.logits.detach().cpu().reshape(-1)
    # Weight by batch size: the last batch may be smaller than the others.
    total_loss += outputs.loss.item() * len(batch_index)

    # The model is trained with BCEWithLogitsLoss, so its outputs are logits;
    # convert to probabilities before comparing with the threshold.
    probabilities = torch.sigmoid(logits).numpy()
    predictions = np.where(probabilities >= threshold, 1, 0)
    for metric in metrics:
      metric.add_batch(predictions=predictions, references=labels.cpu())

    all_indices.extend(indices.tolist())
    all_predictions.extend(predictions.tolist())

  avg_loss = total_loss / len(source_data[split])
  metric_dict = {("Test loss" if testing else "Validation loss") : avg_loss}
  metric_dict.update(metrics[0].compute())
  for metric in metrics[1:]:
    metric_dict.update(metric.compute(average='macro'))

  if verbose:
    pprint(metric_dict)

  if testing:
    return metric_dict, list(zip(all_indices, all_predictions))

  return metric_dict


def train_biencoder(model, source_data, target_data,
                    epochs=10, batch_size=32, learning_rate=1e-5,
                    save_performance=False, save_checkpoint=False,
                    early_stopping=True,
                    verbose=True,
                    epoch_cap=3):
  """Fine-tune the bi-encoder and leave it in the state of its best validation epoch.

  Args:
    model: Bi-encoder to train in place.
    source_data: DatasetDict with the source-side "train"/"val" splits.
    target_data: DatasetDict with the target-side splits (same row order).
    epochs: Number of epochs the linear learning-rate schedule is laid out for.
    batch_size: Pairs per optimisation step.
    learning_rate: Initial AdamW learning rate.
    save_performance: Falsy, or a file path to append the best epoch's metrics to.
    save_checkpoint: Falsy, or a file path to save the best checkpoint to.
    early_stopping: Stop as soon as the validation loss stops improving.
    verbose: Print loss and metrics after every epoch.
    epoch_cap: Hard upper bound on the number of epochs actually run,
      independent of `epochs` (which still sizes the schedule). The default of 3
      reproduces the limit that used to be hard-coded; pass None to run up to
      `epochs`.
  """
  train_dataloader_index = DataLoader(Dataset.from_dict({'index' : range(len(source_data["train"]))}), shuffle=True, batch_size=batch_size)
  num_training_steps = epochs * len(train_dataloader_index)

  optimizer = AdamW(model.parameters(), lr=learning_rate)
  scheduler = get_scheduler(name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
  early_stopper = EarlyStopper(patience=1, learning_rate=learning_rate, batch_size=batch_size)

  progress_bar = tqdm(range(num_training_steps), position=0, leave=True)

  for epoch in range(1, epochs+1):
    # evaluate_biencoder() switches the model to eval mode at the end of every
    # epoch, so training mode has to be re-enabled here, not once before the loop.
    model.train()
    total_loss = 0.0
    for batch in train_dataloader_index:
      batch_index = list(batch["index"])
      source_batch = source_data["train"][batch_index]
      target_batch = target_data["train"][batch_index]
      labels = source_batch["labels"]

      source_batch = {"s_" + k: v.to(device) for (k, v) in source_batch.items() if k not in ["labels", "indices"]}
      target_batch = {"t_" + k: v.to(device) for (k, v) in target_batch.items() if k not in ["labels", "indices"]}
      params = source_batch | target_batch
      params["labels"] = labels.to(device)

      outputs = model(**params)
      loss = outputs.loss
      # Accumulate a plain float: summing the loss tensor itself would keep
      # every batch's autograd history referenced until the end of the epoch.
      total_loss += loss.item() * len(batch_index)
      loss.backward()

      optimizer.step()
      scheduler.step()
      optimizer.zero_grad()
      progress_bar.update(1)

    avg_loss = total_loss / len(source_data["train"])
    metrics = evaluate_biencoder(model, source_data, target_data, batch_size=batch_size)
    if verbose:
      show_results(epoch, avg_loss, metrics)

    val_loss = metrics["Validation loss"]
    # The stopper is always updated so the best epoch is tracked even when
    # early stopping itself is disabled.
    if early_stopper.early_stop(val_loss, epoch, model.state_dict(), optimizer.state_dict(), metrics) and early_stopping:
      break
    if epoch_cap is not None and epoch >= epoch_cap:
      break

  progress_bar.close()

  best_state = early_stopper.restore_model_state()
  if best_state is not None:  # None when no epoch was run
    model.load_state_dict(best_state)

  if save_performance:
    early_stopper.save_best_performance(save_performance)
  if save_checkpoint:
    early_stopper.save_best_checkpoint(save_checkpoint)

## Hyperparameter tuning

In [None]:
# features = ['term', 'int', 'ext']
# negative_sampling = ['random', 'multi', 'neighbour']
# direction = "ncit2doid"

# feature = features[1]
# negatives = negative_sampling[2]

# source_tokenized, target_tokenized, labels = read_bi_tokenized_from_file(f"bi_tokenized_{feature}_{negatives}_{direction}.csv")
# source_data, target_data = get_bi_datasets_from_tokenized(source_tokenized, target_tokenized, labels)

# param_grid = ParameterGrid({
#     "batch_size" : [2 ** i for i in range(4, 7)],
#     "learning_rate" : [10 ** i for i in range(-6, -2)]
# })

# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# pretrained = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract"

# for params in list(param_grid)[::-1]:
#   print(params)

#   full_determinism(seed=3)
#   model = BiEncoderForSequenceClassification(pretrained, num_labels=1)
#   model.to(device)

#   save_performance = f"bi_performance_{feature}_{negatives}.csv"
#   train_biencoder(
#       model, source_data, target_data,
#       save_performance=save_performance,
#       verbose=True,
#       **params
#   )

#   model.cpu()
#   del model
#   torch.cuda.empty_cache()
#   gc.collect()

In [None]:
# model.cpu()
# del model
# torch.cuda.empty_cache()
# gc.collect()

## Experiments

In [None]:
# tuned_params = {
#     ("term", "random")    : {"batch_size" : 16, "learning_rate" : 1e-05},  # 4 epochs
#     ("term", "multi")     : {"batch_size" : 16, "learning_rate" : 1e-05},  # 4 epochs
#     ("term", "neighbour") : {"batch_size" : 32, "learning_rate" : 1e-05},  # 4 epochs
#     ("int", "random")     : {"batch_size" : 16, "learning_rate" : 1e-04},  # 3 epochs
#     ("int", "multi")      : {"batch_size" : 16, "learning_rate" : 1e-05},  # 3 epochs
#     ("int", "neighbour")  : {"batch_size" : 16, "learning_rate" : 1e-05},  # 3 epochs
#     ("ext", "random")     : {"batch_size" : 16, "learning_rate" : 1e-05},  # 5 epochs
#     ("ext", "multi")      : {"batch_size" : 32, "learning_rate" : 1e-05},  # 7 epochs
#     ("ext", "neighbour")  : {"batch_size" : 64, "learning_rate" : 1e-05},  # 2 epochs
# }

# features = ['term', 'int', 'ext']
# negative_sampling = ['random', 'multi', 'neighbour']
# direction = "ncit2doid"

# feature = features[1]
# negatives = negative_sampling[2]
# load_from_file = True

# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# pretrained = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract"

# full_determinism(seed=3)
# model = BiEncoderForSequenceClassification(pretrained, num_labels=1)
# if load_from_file:
#   checkpoint = torch.load(f"/content/drive/MyDrive/bi_checkpoint_{feature}_{negatives}.pt")
#   model.load_state_dict(checkpoint['model_state_dict'])
# model.to(device)

# experiment = (feature, negatives)
# params = tuned_params[experiment]

# source_tokenized, target_tokenized, labels = read_bi_tokenized_from_file(f"/content/drive/MyDrive/tokenized/bi_tokenized_{feature}_{negatives}_{direction}.csv")
# source_data, target_data = get_bi_datasets_from_tokenized(source_tokenized, target_tokenized, labels)

# if not load_from_file:
#   save_checkpoint = f"/content/drive/MyDrive/bi_checkpoint_{feature}_{negatives}.pt"
#   train_biencoder(
#       model, source_data, target_data,
#       save_checkpoint=save_checkpoint,
#       verbose=False,
#       **params
#   )
#   torch.save({"model_state_dict" : model.state_dict()}, save_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Train: 7600
Validation: 2534
Test: 2534


## Test set

In [None]:
def write_bi_encoder_predictions(predictions, filepath):
  """Write (index, prediction) pairs to `filepath`, one "index,prediction" line each.

  An existing file at `filepath` is overwritten.
  """
  with open(filepath, "w") as out:
    out.writelines(f"{row},{predicted}\n" for (row, predicted) in predictions)

def write_experiment_metrics(experiment, metrics, filepath):
  """Append one CSV line with the experiment's name parts and its test metrics.

  Args:
    experiment: Iterable of strings identifying the experiment.
    metrics: Mapping read with .get() for "Test loss", "accuracy", "precision",
      "recall" and "f1"; a missing key is written as "None".
    filepath: File to append to.
  """
  metric_fields = [f'{metrics.get(key)}' for key in ("Test loss", "accuracy", "precision", "recall", "f1")]
  entry = ",".join([",".join(experiment), *metric_fields]) + "\n"
  with open(filepath, 'a') as out:
    out.write(entry)

In [None]:
# full_determinism(seed=3)

# metrics, predictions = evaluate_biencoder(
#     model, source_data, target_data,
#     batch_size=params["batch_size"],
#     testing=True,
#     verbose=True
# )

# write_bi_encoder_predictions(predictions, f"/content/drive/MyDrive/bi_predictions_{feature}_{negatives}.csv")
# write_experiment_metrics(experiment, metrics, "/content/drive/MyDrive/bi_test_metrics.csv")

In [None]:
# # model.cpu()
# # del model
# torch.cuda.empty_cache()
# gc.collect()

## Get embeddings

In [12]:
def write_bi_encoder_embeddings(embeddings, filepath):
  """Append embedding rows to `filepath`.

  Args:
    embeddings: Iterable of (index, source, target) triples. `source` may be
      None; the vectors must provide `.tolist()`.
    filepath: File to append to. Each line is "index,target_list" when source
      is None and "index,source_list,target_list" otherwise.
  """
  def as_line(row, source, target):
    vectors = [target] if source is None else [source, target]
    return ",".join([f"{row}", *(f"{vector.tolist()}" for vector in vectors)]) + "\n"

  with open(filepath, "a") as out:
    for (row, source, target) in embeddings:
      out.write(as_line(row, source, target))

def generate_biencoder_embeddings(model, target_data, source_data=None, split="test", batch_size=32, filepath="bi_embeddings.csv"):
  """Embed one split with the bi-encoder and append the vectors to a file.

  When `source_data` is given, only rows whose "labels" value is 1 are kept and
  both the source and target vectors are written. Otherwise every row of the
  target split is embedded and only target vectors are written.

  Args:
    model: Bi-encoder providing get_source_model_outputs / get_target_model_outputs.
    target_data: DatasetDict holding the target-side split.
    source_data: Optional DatasetDict holding the matching source-side split.
    split: Name of the split to embed.
    batch_size: Rows embedded per forward pass.
    filepath: File the rows are appended to via write_bi_encoder_embeddings.
  """
  has_source = source_data is not None
  if has_source:
    target_positive = target_data[split].filter(lambda row: row["labels"] == 1)
    source_positive = source_data[split].filter(lambda row: row["labels"] == 1)
  else:
    target_positive = target_data[split]

  dataloader_index = DataLoader(Dataset.from_dict({'index' : range(len(target_positive))}), shuffle=False, batch_size=batch_size)
  progress_bar = tqdm(range(len(dataloader_index)), position=0, leave=True)

  model.eval()
  for batch in dataloader_index:
    batch_index = list(batch["index"])
    target_batch = target_positive[batch_index]
    if has_source:
      source_batch = source_positive[batch_index]
    # Row ids of THIS batch. Reading the whole "indices" column here would pair
    # every batch's embeddings with the ids of the first `batch_size` rows.
    indices = target_batch["indices"]

    target_batch = {k: v.to(device) for (k, v) in target_batch.items() if k not in ["labels", "indices"]}
    if has_source:
      source_batch = {k: v.to(device) for (k, v) in source_batch.items() if k not in ["labels", "indices"]}

    target_embeddings = model.get_target_model_outputs(**target_batch)
    if has_source:
      source_embeddings = model.get_source_model_outputs(**source_batch)
    else:
      source_embeddings = [None] * len(target_embeddings)

    write_bi_encoder_embeddings(zip(indices, source_embeddings, target_embeddings), filepath)

    # Drop the batch's tensors before the next forward pass to keep memory flat.
    del target_embeddings
    del source_embeddings
    torch.cuda.empty_cache()
    gc.collect()

    progress_bar.update(1)

In [None]:
# Bi-encoder embeddings
split = "test"
bi_embeddings_file = f"/content/drive/MyDrive/bi_{split}_embeddings_{feature}_{negatives}.csv"
generate_biencoder_embeddings(model, target_data, source_data, split=split, batch_size=16, filepath=bi_embeddings_file)

Filter:   0%|          | 0/2534 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2534 [00:00<?, ? examples/s]

100%|██████████| 41/41 [00:54<00:00,  1.33s/it]


In [None]:
# Neighbour embeddings
if negatives != "neighbour":
  source_tokenized, target_tokenized, labels = read_bi_tokenized_from_file(f"/content/drive/MyDrive/tokenized/bi_tokenized_{feature}_neighbour_{direction}.csv")
  source_data, target_data = get_bi_datasets_from_tokenized(source_tokenized, target_tokenized, labels)
  gc.collect()

hard_embeddings_file = f"/content/drive/MyDrive/hard_embeddings_{feature}_{negatives}.csv"
generate_biencoder_embeddings(model, target_data=target_data, split="test", batch_size=32, filepath=hard_embeddings_file)

100%|██████████| 80/80 [01:48<00:00,  1.36s/it]


In [None]:
# Ontology embeddings
purpose = "test"
onto_tokenized = read_onto_tokenized_from_file(f"/content/drive/MyDrive/tokenized/doid_tokenized_{feature}_{negatives}.csv")
onto_data = collate_ontology_dataset(onto_tokenized, purpose=purpose)

onto_embeddings_file = f"/content/drive/MyDrive/doid_embeddings_{feature}_{negatives}.csv"
generate_biencoder_embeddings(model, target_data=onto_data, split=purpose, batch_size=32, filepath=onto_embeddings_file)

Ontology size: 13848


100%|██████████| 433/433 [11:32<00:00,  1.60s/it]


In [None]:
# Release the model and any cached GPU memory. Guarded so the cell also runs
# when no `model` exists in the kernel (e.g. on a fresh top-to-bottom run).
if "model" in globals():
  model.cpu()
  del model
torch.cuda.empty_cache()
gc.collect()

## Embedding generation loop

In [15]:
tuned_params = {
    ("term", "random")    : {"batch_size" : 16, "learning_rate" : 1e-05},  # 4 epochs
    ("term", "multi")     : {"batch_size" : 16, "learning_rate" : 1e-05},  # 4 epochs
    ("term", "neighbour") : {"batch_size" : 32, "learning_rate" : 1e-05},  # 4 epochs
    ("int", "random")     : {"batch_size" : 16, "learning_rate" : 1e-04},  # 3 epochs
    ("int", "multi")      : {"batch_size" : 16, "learning_rate" : 1e-05},  # 3 epochs
    ("int", "neighbour")  : {"batch_size" : 16, "learning_rate" : 1e-05},  # 3 epochs
    ("ext", "random")     : {"batch_size" : 16, "learning_rate" : 1e-05},  # 5 epochs
    ("ext", "multi")      : {"batch_size" : 32, "learning_rate" : 1e-05},  # 7 epochs
    ("ext", "neighbour")  : {"batch_size" : 64, "learning_rate" : 1e-05},  # 2 epochs
}

features = ['term', 'int', 'ext']
negative_sampling = ['random', 'multi', 'neighbour']
direction = "ncit2doid"
load_from_file = True

# Number of leading (feature, negatives) combinations to skip; 4 starts the
# run at ("int", "multi"). Set to 0 to process every combination.
first_experiment = 4

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
pretrained = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract"


def _drop_rows_with_indices(dataset, excluded_indices):
  """Return `dataset` without the rows whose "indices" value is in the set `excluded_indices`."""
  keep = [row for (row, index) in enumerate(dataset["indices"]) if int(index) not in excluded_indices]
  return dataset.select(keep)


def _drop_rows_with_token_ids(dataset, excluded_token_ids):
  """Return `dataset` without the rows whose "input_ids" (as a tuple of ints) is in the set `excluded_token_ids`."""
  keep = [row for (row, ids) in enumerate(tqdm(dataset["input_ids"])) if tuple(np.asarray(ids).tolist()) not in excluded_token_ids]
  return dataset.select(keep)


for (feature, negatives) in list(itertools.product(features, negative_sampling))[first_experiment:]:
  print(feature, negatives)

  full_determinism(seed=3)
  model = BiEncoderForSequenceClassification(pretrained, num_labels=1)
  if load_from_file:
    # Load onto the CPU first; the weights move to `device` with model.to() below.
    checkpoint = torch.load(f"/content/drive/MyDrive/bi-checkpoints/bi_checkpoint_{feature}_{negatives}.csv", map_location="cpu")
    model.load_state_dict(checkpoint['model_state_dict'])
  model.to(device)

  experiment = (feature, negatives)
  params = tuned_params[experiment]
  split = "test"


  # Query embeddings
  source_tokenized, target_tokenized, labels = read_bi_tokenized_from_file(f"/content/drive/MyDrive/tokenized/bi_tokenized_{feature}_{negatives}_{direction}.csv")
  source_data, target_data = get_bi_datasets_from_tokenized(source_tokenized, target_tokenized, labels)
  target_positive = target_data[split].filter(lambda row: row["labels"] == 1)

  bi_embeddings_file = f"/content/drive/MyDrive/embeddings/bi_{split}_embeddings_{feature}_{negatives}.csv"
  generate_biencoder_embeddings(model, target_data, source_data, split=split, batch_size=params["batch_size"], filepath=bi_embeddings_file)

  # Build the lookup sets once per experiment so the filters below are a single
  # pass with constant-time membership checks.
  positive_indices = {int(index) for index in target_positive["indices"]}
  positive_token_ids = {tuple(np.asarray(ids).tolist()) for ids in target_positive["input_ids"]}


  # Hard embeddings
  if negatives != "neighbour":
    hard_source_tokenized, hard_target_tokenized, hard_labels = read_bi_tokenized_from_file(f"/content/drive/MyDrive/tokenized/bi_tokenized_{feature}_neighbour_{direction}.csv")
    hard_source_data, hard_target_data = get_bi_datasets_from_tokenized(hard_source_tokenized, hard_target_tokenized, hard_labels)
  else:
    hard_source_data, hard_target_data = source_data, target_data

  # Remove rows whose original row index matches a positive query row.
  hard_target_data_clean = DatasetDict({split : _drop_rows_with_indices(hard_target_data[split], positive_indices)})

  hard_embeddings_file = f"/content/drive/MyDrive/embeddings/hard_embeddings_{feature}_{negatives}.csv"
  generate_biencoder_embeddings(model, target_data=hard_target_data_clean, split="test", batch_size=params["batch_size"]*2, filepath=hard_embeddings_file)


  # Random embeddings
  if negatives != "multi":
    random_source_tokenized, random_target_tokenized, random_labels = read_bi_tokenized_from_file(f"/content/drive/MyDrive/tokenized/bi_tokenized_{feature}_multi_{direction}.csv")
    random_source_data, random_target_data = get_bi_datasets_from_tokenized(random_source_tokenized, random_target_tokenized, random_labels)
  else:
    random_source_data, random_target_data = source_data, target_data

  random_target_data_clean = DatasetDict({split : _drop_rows_with_indices(random_target_data[split], positive_indices)})

  random_embeddings_file = f"/content/drive/MyDrive/embeddings/random_embeddings_{feature}_{negatives}.csv"
  generate_biencoder_embeddings(model, target_data=random_target_data_clean, split="test", batch_size=params["batch_size"]*2, filepath=random_embeddings_file)


  # Ontology embeddings
  onto_tokenized = read_onto_tokenized_from_file(f"/content/drive/MyDrive/tokenized/doid_tokenized_{feature}_{negatives}.csv")
  onto_data = collate_ontology_dataset(onto_tokenized, purpose=split)

  # Remove ontology entries whose token ids equal those of a positive target.
  onto_data[split] = _drop_rows_with_token_ids(onto_data[split], positive_token_ids)

  onto_embeddings_file = f"/content/drive/MyDrive/embeddings/doid_embeddings_{feature}_{negatives}.csv"
  generate_biencoder_embeddings(model, target_data=onto_data, split=split, batch_size=params["batch_size"]*2, filepath=onto_embeddings_file)

  print()

  # Free the model before the next experiment loads its own checkpoint.
  model.cpu()
  del model
  torch.cuda.empty_cache()
  gc.collect()


 int multi
Train: 7720
Validation: 2574
Test: 2574


Filter:   0%|          | 0/2574 [00:00<?, ? examples/s]

Train: 7600
Validation: 2534
Test: 2534


100%|██████████| 69/69 [01:56<00:00,  1.68s/it]
100%|██████████| 61/61 [01:31<00:00,  1.49s/it]



 int neighbour
Train: 7600
Validation: 2534
Test: 2534


Filter:   0%|          | 0/2534 [00:00<?, ? examples/s]

100%|██████████| 60/60 [01:33<00:00,  1.56s/it]


Train: 7720
Validation: 2574
Test: 2574


100%|██████████| 71/71 [01:45<00:00,  1.49s/it]



 ext random
Train: 3860
Validation: 1287
Test: 1287


Filter:   0%|          | 0/1287 [00:00<?, ? examples/s]

Train: 7600
Validation: 2534
Test: 2534


100%|██████████| 74/74 [01:11<00:00,  1.03it/s]


Train: 7720
Validation: 2574
Test: 2574


100%|██████████| 75/75 [01:08<00:00,  1.10it/s]



 ext multi
Train: 7720
Validation: 2574
Test: 2574


Filter:   0%|          | 0/2574 [00:00<?, ? examples/s]

Train: 7600
Validation: 2534
Test: 2534


100%|██████████| 35/35 [00:40<00:00,  1.17s/it]
100%|██████████| 31/31 [00:36<00:00,  1.17s/it]



 ext neighbour
Train: 7600
Validation: 2534
Test: 2534


Filter:   0%|          | 0/2534 [00:00<?, ? examples/s]

100%|██████████| 15/15 [00:25<00:00,  1.73s/it]


Train: 7720
Validation: 2574
Test: 2574


100%|██████████| 18/18 [00:31<00:00,  1.73s/it]


In [13]:
# Release the model and any cached GPU memory. The embedding generation loop
# already deletes `model` at the end of each iteration, so the deletion is
# guarded to keep this cell runnable after the loop has finished.
if "model" in globals():
  model.cpu()
  del model
torch.cuda.empty_cache()
gc.collect()

75

# Cross-encoder

## Implementation

In [None]:
def compute_cross_metrics(eval_pred):
  """Compute accuracy and macro precision/recall/f1 for the cross-encoder.

  Args:
    eval_pred: Pair (predictions, labels); the predicted class is the argmax of
      `predictions` along axis 1.

  Returns:
    A dict merging the results of the four `evaluate` metrics.
  """
  scores_per_class, references = eval_pred
  predicted = np.argmax(scores_per_class, axis=1)

  results = evaluate.load('accuracy').compute(predictions=predicted, references=references)
  for metric_name in ('precision', 'recall', 'f1'):
    scorer = evaluate.load(metric_name)
    results.update(scorer.compute(predictions=predicted, references=references, average='macro'))

  return results

def prepare_model(model, tokenized_data, learning_rate=1e-5, batch_size=32, epochs=10,
                  pretrained="microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract",
                  output_dir="testing"):
  """Build a `Trainer` that fine-tunes `model` as a cross-encoder classifier.

  Args:
    model: the model to fine-tune. It is modified in place: its token embeddings are
      resized to the tokenizer's vocabulary and `config.pad_token_id` is set.
    tokenized_data: mapping providing a 'train' and a 'val' split.
    learning_rate: learning rate passed to `TrainingArguments`.
    batch_size: per-device batch size used for both training and evaluation.
    epochs: upper bound on training epochs (early stopping may end training sooner).
    pretrained: checkpoint name the tokenizer is loaded from. Should match the
      checkpoint `model` was created from; defaults to the PubMedBERT checkpoint
      that was previously hard-coded here.
    output_dir: directory where the `Trainer` writes its per-epoch checkpoints.

  Returns:
    The configured `Trainer`; training is not started.
  """
  tokenizer = AutoTokenizer.from_pretrained(pretrained)
  tokenizer.add_special_tokens({'pad_token': '[PAD]'})
  data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

  # Keep the model's vocabulary size and padding id in sync with the tokenizer.
  model.resize_token_embeddings(len(tokenizer))
  model.config.pad_token_id = tokenizer.pad_token_id

  # Evaluate and checkpoint once per epoch so the best epoch (lowest eval_loss) can be
  # restored at the end of training.
  # NOTE(review): newer transformers releases rename `evaluation_strategy` to
  # `eval_strategy` and `no_cuda` to `use_cpu` - confirm against the installed version.
  training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_only_model=True,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    logging_steps=1,
    metric_for_best_model='eval_loss',
    load_best_model_at_end=True,
    seed=3,
    data_seed=3,
    full_determinism=True,
    no_cuda=False
  )

  trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['val'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_cross_metrics,
    # Early stopping with a patience of one evaluation round.
    callbacks = [EarlyStoppingCallback(early_stopping_patience=1)]
  )

  return trainer

## Hyperparameter tuning

In [None]:
# Disabled hyperparameter search for the cross-encoder (kept for reference only).
# The grid spans batch sizes 16/32/64 and learning rates 1e-6 to 1e-3 on the
# ("term", "random") ncit2doid data. As written, the loop runs only the grid entry
# at index 5 (`list(param_grid)[5:6]`). Uncomment the lines below to re-run it.

# param_grid = ParameterGrid({
#     "batch_size" : [2 ** i for i in range(4, 7)],
#     "learning_rate" : [10 ** i for i in range(-6, -2)]
# })

# pretrained = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract"
# feature, negatives = "term", "random"
# direction = "ncit2doid"
# experiment = (feature, negatives)

# tokenized, labels = read_cross_tokenized_from_file(f"/content/drive/MyDrive/tokenized/cross_tokenized_{feature}_{negatives}_{direction}.csv")
# dataset = get_cross_dataset_from_tokenized(tokenized, labels)
# indices = dataset["test"]["indices"]
# tokenized_data = {k : v for (k,v) in dataset.items() if k != "indices"}

# for params in list(param_grid)[5:6]:
#   print(params)
#   full_determinism(seed=3)
#   model = AutoModelForSequenceClassification.from_pretrained(pretrained, num_labels=2)
#   trainer = prepare_model(model, tokenized_data, **params)
#   trainer.train()

## Test set

In [None]:
def write_cross_encoder_predictions(predictions, filepath):
  """Write `(index, prediction)` pairs to `filepath`, one `index,prediction` row per line.

  The file is opened in write mode, so any existing content is replaced.
  """
  with open(filepath, "w") as out:
    out.writelines(f"{index},{label}\n" for (index, label) in predictions)

def write_cross_experiment_metrics(experiment, metrics, filepath):
  """Append one CSV row with an experiment's test metrics to `filepath`.

  The row is the comma-joined `experiment` strings followed by the values stored in
  `metrics` under test_loss, test_accuracy, test_precision, test_recall and test_f1
  (a missing key is written as `None`).
  """
  reported_keys = ("test_loss", "test_accuracy", "test_precision", "test_recall", "test_f1")

  label_part = ",".join(experiment)
  value_part = ",".join(f"{metrics.get(key)}" for key in reported_keys)
  row = f"{label_part},{value_part}\n"

  # Append mode: rows from earlier experiments are kept.
  with open(filepath, 'a') as out:
    out.write(row)

In [None]:
# Train each cross-encoder configuration with its tuned hyperparameters, evaluate it on
# the test split, and persist the checkpoint, per-example predictions and metrics to Drive.
# (`itertools` comes from the notebook's top import cell.)

tuned_params = {
    ("term", "random")    : {"batch_size" : 32, "learning_rate" : 1e-05},  # 2 epochs
    ("term", "multi")     : {"batch_size" : 32, "learning_rate" : 1e-05},  # 4 epochs
    ("term", "neighbour") : {"batch_size" : 32, "learning_rate" : 1e-05},  # 4 epochs
    ("int", "random")     : {"batch_size" : 16, "learning_rate" : 1e-05},  # 3 epochs
    ("int", "multi")      : {"batch_size" : 16, "learning_rate" : 1e-05},  # 3 epochs
    ("int", "neighbour")  : {"batch_size" : 16, "learning_rate" : 1e-05},  # 3 epochs
    ("ext", "random")     : {"batch_size" : 32, "learning_rate" : 1e-05},  # 5 epochs
    ("ext", "multi")      : {"batch_size" : 32, "learning_rate" : 1e-05},  # 7 epochs
    ("ext", "neighbour")  : {"batch_size" : 32, "learning_rate" : 1e-05},  # 2 epochs
}

features = ['term', 'int', 'ext']
negative_sampling = ['random', 'multi', 'neighbour']
direction = "ncit2doid"
load_from_file = False  # not read anywhere in this cell

# Position in the feature x negatives grid to start from. 5 resumes at
# ("int", "neighbour"); set to 0 to run every experiment. The metrics file is opened
# in append mode, so re-running earlier experiments adds duplicate rows.
first_experiment = 5

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
pretrained = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract"

experiments = list(itertools.product(features, negative_sampling))
for (feature, negatives) in experiments[first_experiment:]:
  print(feature, negatives)

  experiment = (feature, negatives)
  params = tuned_params[experiment]

  tokenized, labels = read_cross_tokenized_from_file(f"/content/drive/MyDrive/tokenized/cross_tokenized_{feature}_{negatives}_{direction}.csv")
  dataset = get_cross_dataset_from_tokenized(tokenized, labels)
  indices = dataset["test"]["indices"]
  tokenized_data = {k : v for (k,v) in dataset.items() if k != "indices"}

  full_determinism(seed=3)
  model = AutoModelForSequenceClassification.from_pretrained(pretrained, num_labels=2)
  model.to(device)

  trainer = prepare_model(model, tokenized_data, **params)
  trainer.train()
  trainer.save_model(f"/content/drive/MyDrive/cross_checkpoint_{feature}_{negatives}.pt")

  full_determinism(seed=3)
  predictions, label_ids, metrics = trainer.predict(tokenized_data["test"])
  print(metrics)
  predictions = np.argmax(predictions, axis=1).tolist()

  write_cross_encoder_predictions(list(zip(indices, predictions)), f"/content/drive/MyDrive/cross_predictions_{feature}_{negatives}.csv")
  write_cross_experiment_metrics(experiment, metrics, "/content/drive/MyDrive/cross_test_metrics.csv")

  # The trainer still references the model (and its optimizer state), so it has to be
  # dropped as well or that memory stays alive while the next model is loaded.
  model.cpu()
  del trainer
  del model
  # Collect first so memory released by the collector can be returned by empty_cache().
  gc.collect()
  torch.cuda.empty_cache()

In [None]:
# Safety-net cleanup. The test-set loop above already deletes `model` at the end of
# every iteration, so the name only exists here if that loop was interrupted part-way.
# Guarding on it keeps this cell from raising NameError on a clean Restart & Run All.
if "model" in globals():
  model.cpu()
  del model
torch.cuda.empty_cache()
gc.collect()

5202