<a href="https://colab.research.google.com/github/stellaevat/ontology-mapping/blob/main/colabs/tokenization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers[torch]

In [None]:
import gc
import random
import torch
import numpy as np
from transformers import AutoTokenizer

# Read/Write to file

In [None]:
def read_cross_sentences(filepath):
  """Read cross-encoder sentences and their labels from a quoted CSV file.

  Every non-blank line is expected to look like ``"<sentence>","<label>"``.
  The line is split on the literal ``","`` sequence and the surrounding double
  quotes are stripped from each field; no CSV escaping is interpreted.

  Args:
    filepath: Path of the file to read. It is decoded as UTF-8.

  Returns:
    A ``(sentences, labels)`` tuple of two equally long lists of strings.

  Raises:
    ValueError: If a non-blank line does not split into exactly two fields.
  """
  sentences, labels = [], []
  with open(filepath, encoding="utf-8") as f:
    for line_number, line in enumerate(f, start=1):
      line = line.strip()
      if not line:
        # Blank lines (typically a trailing newline) hold no record.
        continue
      fields = [field.strip('"') for field in line.split('","')]
      if len(fields) != 2:
        raise ValueError(
            f"{filepath}:{line_number}: expected 2 fields, found {len(fields)}")
      sentence, label = fields
      sentences.append(sentence)
      labels.append(label)
  return sentences, labels

def read_bi_sentences(filepath):
  """Read bi-encoder sentence pairs and their labels from a quoted CSV file.

  Every non-blank line is expected to look like
  ``"<source sentence>","<target sentence>","<label>"``. The line is split on
  the literal ``","`` sequence and the surrounding double quotes are stripped
  from each field; no CSV escaping is interpreted.

  Args:
    filepath: Path of the file to read. It is decoded as UTF-8.

  Returns:
    A ``(source_sentences, target_sentences, labels)`` tuple of three equally
    long lists of strings.

  Raises:
    ValueError: If a non-blank line does not split into exactly three fields.
  """
  source_sentences, target_sentences, labels = [], [], []
  with open(filepath, encoding="utf-8") as f:
    for line_number, line in enumerate(f, start=1):
      line = line.strip()
      if not line:
        # Blank lines (typically a trailing newline) hold no record.
        continue
      fields = [field.strip('"') for field in line.split('","')]
      if len(fields) != 3:
        raise ValueError(
            f"{filepath}:{line_number}: expected 3 fields, found {len(fields)}")
      source_sentence, target_sentence, label = fields
      source_sentences.append(source_sentence)
      target_sentences.append(target_sentence)
      labels.append(label)
  return source_sentences, target_sentences, labels

def read_onto_sentences(filepath):
  """Load one ontology sentence per line of ``filepath``.

  Each line has its surrounding whitespace removed first and any leading or
  trailing double quotes removed afterwards. Blank lines are kept as empty
  strings, so the result has exactly one entry per line of the file.
  """
  with open(filepath) as handle:
    return [raw_line.strip().strip('"') for raw_line in handle]

In [None]:
def write_bi_encoder_tokenized(tokenized_source, tokenized_target, labels, filepath):
  """Write tokenized source/target pairs and their labels, one row per line.

  A row holds, comma separated and in this order: the source ``input_ids``,
  ``token_type_ids`` and ``attention_mask``, the same three entries for the
  target, and finally the label. Each value is written with its default string
  formatting. Rows are paired positionally and writing stops at the shortest
  of the seven sequences.
  """
  keys = ("input_ids", "token_type_ids", "attention_mask")
  columns = [tokenized_source[key] for key in keys]
  columns += [tokenized_target[key] for key in keys]
  columns.append(labels)

  with open(filepath, "w") as out:
    for row in zip(*columns):
      out.write("{},{},{},{},{},{},{}\n".format(*row))

def write_cross_encoder_tokenized(tokenized, labels, filepath):
  """Write tokenized cross-encoder inputs and their labels, one row per line.

  A row holds, comma separated: ``input_ids``, ``token_type_ids``,
  ``attention_mask`` and the label, each written with its default string
  formatting. Rows are paired positionally and writing stops at the shortest
  of the four sequences.
  """
  columns = [tokenized[key] for key in ("input_ids", "token_type_ids", "attention_mask")]

  with open(filepath, "w") as out:
    out.writelines("{},{},{},{}\n".format(*row) for row in zip(*columns, labels))

def write_ontology_tokenized(tokenized, filepath):
  """Write tokenized ontology sentences, one row per line.

  A row holds, comma separated: ``input_ids``, ``token_type_ids`` and
  ``attention_mask``, each written with its default string formatting.
  """
  columns = [tokenized[key] for key in ("input_ids", "token_type_ids", "attention_mask")]

  with open(filepath, "w") as out:
    out.writelines("{},{},{}\n".format(*row) for row in zip(*columns))

# Tokenize

In [None]:
def full_determinism(seed):
  """Seed the Python, NumPy and PyTorch RNGs and request deterministic kernels.

  Args:
    seed: Integer seed applied to ``random``, ``numpy.random`` and to the
      PyTorch CPU and CUDA generators.

  Side effects:
    Changes process-wide PyTorch settings: deterministic algorithms are
    enforced, cuDNN is put in deterministic mode and cuDNN autotuning is
    switched off.
  """
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)
  torch.use_deterministic_algorithms(True)
  torch.backends.cudnn.deterministic = True
  # cuDNN autotuning benchmarks several kernels and may settle on a different
  # one from run to run, so it has to be off for reproducible results.
  torch.backends.cudnn.benchmark = False
  # NOTE(review): the PyTorch reproducibility notes state that some cuBLAS
  # operations also need the CUBLAS_WORKSPACE_CONFIG environment variable set
  # before CUDA work starts; it is not set here - confirm whether GPU runs
  # downstream rely on it.

In [None]:
def tokenize_sentences_for_experimental_setting(feature, negatives, direction="ncit2doid"):
  """Tokenize the bi-encoder, cross-encoder and ontology sentences of one setting.

  Reads ``bi_sentences_*``, ``cross_sentences_*`` and the target ontology's
  ``*_sentences_*`` CSV files from the working directory, tokenizes all of
  them in a single call to the module-level ``tokenizer`` and writes the
  matching ``bi_tokenized_*``, ``cross_tokenized_*`` and ``*_tokenized_*``
  files.

  Args:
    feature: Feature name used in the input and output file names.
    negatives: Negative-sampling name used in the input and output file names.
    direction: Mapping direction written as ``"<source>2<target>"``; the part
      after the first ``"2"`` selects the ontology sentence file.

  Returns:
    None. The results are written to disk.
  """
  gc.collect()
  full_determinism(seed=3)

  target_ontology = direction.split('2')[1]

  # Each file keeps its own labels so the bi-encoder output can never end up
  # paired with labels that were read from the cross-encoder file.
  sources, targets, bi_labels = read_bi_sentences(f"bi_sentences_{feature}_{negatives}_{direction}.csv")
  combined, cross_labels = read_cross_sentences(f"cross_sentences_{feature}_{negatives}_{direction}.csv")
  onto = read_onto_sentences(f"{target_ontology}_sentences_{feature}.csv")

  # All groups go through one call, so "longest" padding is computed over the
  # sentences of every group together.
  tokenized = tokenizer(sources + targets + combined + onto, padding="longest", truncation=True, max_length=512)

  # Slice boundaries follow the real size of each group rather than assuming
  # that the cross file has exactly as many rows as the bi file.
  targets_start = len(sources)
  cross_start = targets_start + len(targets)
  onto_start = cross_start + len(combined)

  tokenized_source = {k: v[:targets_start] for (k, v) in tokenized.items()}
  tokenized_target = {k: v[targets_start:cross_start] for (k, v) in tokenized.items()}
  tokenized_cross = {k: v[cross_start:onto_start] for (k, v) in tokenized.items()}
  tokenized_onto = {k: v[onto_start:] for (k, v) in tokenized.items()}

  write_bi_encoder_tokenized(tokenized_source, tokenized_target, bi_labels, f"bi_tokenized_{feature}_{negatives}_{direction}.csv")
  write_cross_encoder_tokenized(tokenized_cross, cross_labels, f"cross_tokenized_{feature}_{negatives}_{direction}.csv")
  write_ontology_tokenized(tokenized_onto, f"{target_ontology}_tokenized_{feature}_{negatives}.csv")

  # Drop the large tokenization results before the next setting is processed.
  del tokenized_source
  del tokenized_target
  del tokenized_cross
  del tokenized_onto
  del tokenized
  gc.collect()

In [None]:
# Tokenizer shared by tokenize_sentences_for_experimental_setting (read there
# as a module-level name).
pretrained = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract"
tokenizer = AutoTokenizer.from_pretrained(pretrained)
tokenizer.add_special_tokens({"pad_token": "[PAD]"})

direction = "ncit2doid"
features = ["term", "int", "ext"]
negative_sampling = ["random", "multi", "neighbour"]

# Visit every (feature, negative sampling) combination, feature-major.
for feature, negatives in ((f, n) for f in features for n in negative_sampling):
  print(feature, negatives)
  tokenize_sentences_for_experimental_setting(feature, negatives, direction)