<a href="https://colab.research.google.com/github/stellaevat/ontology-mapping/blob/main/colabs/preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pronto transformers datasets evaluate \
&& wget -O doid.obo https://gla-my.sharepoint.com/:u:/g/personal/2526934t_student_gla_ac_uk/EfUC_RdrfZdOsOrtmNATjuoBPDaIkSTUMyxJXyO2KKC6yw?download=1 \
&& wget -O ncit.obo https://gla-my.sharepoint.com/:u:/g/personal/2526934t_student_gla_ac_uk/ETmaJIC0fAlItdsp8WQxS_wBzKN_6x08EZrtsOxVnbzvSg?download=1

In [23]:
import pronto
from transformers import AutoTokenizer, AutoModel, BioGptTokenizer, BioGptModel

In [3]:
ncit = pronto.Ontology("ncit.obo")
doid = pronto.Ontology("doid.obo")

In [4]:
# Get subsumptions from CSV file to a dictionary

def get_mappings_from_file(filename):
  mappings = {}
  with open(filename) as f:
    for line in f:
      source_id, target_id = line.strip().split(',')
      mappings[source_id] = target_id
  return mappings

In [5]:
doid2ncit_equiv = get_mappings_from_file("doid2ncit_equiv.csv")
ncit2doid_equiv = get_mappings_from_file("ncit2doid_equiv.csv")

# Convert relations to sentences

> Currently considering parents, children & siblings for conceptual reasons, but could also take 'n-hop' appraoch, e.g. 1-hop only with parents and children, or 2-hop to include grandparents, grandchildren and siblings.

> How do I incorporate the desired mapping for training? Should I incorporate both all this AND target info, or too much? Could be SELF + desired relatives instead, or SELF + PARENT + DESIRED PARENT, etc.

In [15]:
# Create sentence from the given entity, containing its direct parents, children & siblings

def get_sentence(source_id, target_id, source_onto, target_onto):
  markers = ["[SUB]", "[/SUB]", "[SUP]", "[/SUP]", "[SIBL]", "[/SIBL]"]
  sub_in, sub_out, sup_in, sup_out, sibl_in, sibl_out = markers

  if source_id not in source_onto.terms() or target_id not in target_onto.terms():
    return

  subsumer = source_onto.get_term(source_id)
  equivalent = target_onto.get_term(target_id)
  parents = list(equivalent.superclasses(distance=1, with_self=False))

  if len(parents) != 1:
    return

  supersumer = parents[0]
  siblings = set(supersumer.subclasses(distance=1, with_self=False))
  siblings.remove(equivalent)

  sentence = [sub_in, subsumer.name, sub_out, sup_in, supersumer.name, sup_out]
  for sibling in siblings:
    sentence.extend([sibl_in, sibling.name, sibl_out])

  return " ".join(sentence)

In [19]:
source_id = "DOID:0014667"
sentence = get_sentence(source_id, doid2ncit_equiv[source_id], doid, ncit)
print(sentence)

[SUB] disease of metabolism [/SUB] [SUP] Non-Neoplastic Disorder by Special Category [/SUP] [SIBL] Endemic Disorder [/SIBL] [SIBL] Epidemic Disorder [/SIBL] [SIBL] Degenerative Disorder [/SIBL] [SIBL] Radiation-Related Non-Neoplastic Disorder [/SIBL] [SIBL] Rare Non-Neoplastic Disorder [/SIBL] [SIBL] Hamartoma [/SIBL] [SIBL] Hyperplasia [/SIBL] [SIBL] Inflammatory Disorder [/SIBL] [SIBL] Congenital or Acquired Anatomic Abnormality [/SIBL] [SIBL] Non-Communicable Disorder [/SIBL]


In [16]:
samples = []
for (doid_id, ncit_id) in doid2ncit_equiv.items():
  if (sentence := get_sentence(doid_id, ncit_id, doid, ncit)):
    samples.append(sentence)

In [29]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract")
tokenizer.add_tokens(["[SUB]", "[/SUB]", "[SUP]", "[/SUP]", "[SIBL]", "[/SIBL]"])

6

In [20]:
tokenized = tokenizer(sentence)
tokenizer.convert_ids_to_tokens(tokenized['input_ids'])

## Should we separate at dashes?

['[CLS]',
 '[SUB]',
 'disease',
 'of',
 'metabolism',
 '[/SUB]',
 '[SUP]',
 'non',
 '-',
 'neoplastic',
 'disorder',
 'by',
 'special',
 'category',
 '[/SUP]',
 '[SIBL]',
 'endemic',
 'disorder',
 '[/SIBL]',
 '[SIBL]',
 'epidemic',
 'disorder',
 '[/SIBL]',
 '[SIBL]',
 'degenerative',
 'disorder',
 '[/SIBL]',
 '[SIBL]',
 'radiation',
 '-',
 'related',
 'non',
 '-',
 'neoplastic',
 'disorder',
 '[/SIBL]',
 '[SIBL]',
 'rare',
 'non',
 '-',
 'neoplastic',
 'disorder',
 '[/SIBL]',
 '[SIBL]',
 'hamart',
 '##oma',
 '[/SIBL]',
 '[SIBL]',
 'hyperplasia',
 '[/SIBL]',
 '[SIBL]',
 'inflammatory',
 'disorder',
 '[/SIBL]',
 '[SIBL]',
 'congenital',
 'or',
 'acquired',
 'anatomic',
 'abnormality',
 '[/SIBL]',
 '[SIBL]',
 'non',
 '-',
 'communicable',
 'disorder',
 '[/SIBL]',
 '[SEP]']

In [22]:
model = AutoModel.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract")
model.resize_token_embeddings(len(tokenizer))

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Embedding(28901, 768)