<a href="https://colab.research.google.com/github/stellaevat/ontology-mapping/blob/main/colabs/load_and_extract.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the pronto and owlready2 libraries, then download the three ontology files
# used by this notebook: doid.obo, ncit.obo and doid.owl (each step runs only if the previous one succeeded).
!pip install pronto owlready2 \
&& wget -O doid.obo https://gla-my.sharepoint.com/:u:/g/personal/2526934t_student_gla_ac_uk/EfUC_RdrfZdOsOrtmNATjuoBPDaIkSTUMyxJXyO2KKC6yw?download=1 \
&& wget -O ncit.obo https://gla-my.sharepoint.com/:u:/g/personal/2526934t_student_gla_ac_uk/ETmaJIC0fAlItdsp8WQxS_wBzKN_6x08EZrtsOxVnbzvSg?download=1 \
&& wget -O doid.owl https://gla-my.sharepoint.com/:u:/g/personal/2526934t_student_gla_ac_uk/EVjp-NoZlZtDq42LqxZH08ABz2h7IWqdw2gxetzvA-u3hQ?download=1

In [2]:
import pronto
import owlready2
import numpy as np
from tqdm import tqdm
from collections import Counter

In [3]:
# Parse the downloaded OBO files with pronto; these objects are used later for the class hierarchy.
ncit = pronto.Ontology("ncit.obo")
doid = pronto.Ontology("doid.obo")
# Load DOID a second time from its OWL file with owlready2; this copy is the one queried for cross-references.
doid_owl = owlready2.get_ontology("doid.owl").load()

# Extract equivalences

In [4]:
# Get Property object with the given name, from the given owlready2 Ontology

def get_owl_property(onto, target_prop_name):
  """Look up a property of an ontology by its Python name.

  Args:
    onto: Ontology object exposing a ``properties()`` iterable.
    target_prop_name: Value compared against each property's ``python_name``.

  Returns:
    The first property whose ``python_name`` equals ``target_prop_name``,
    or ``None`` when no property matches.
  """
  matching = (
    candidate
    for candidate in onto.properties()
    if candidate.python_name == target_prop_name
  )
  # next() with a default mirrors "first hit or None"
  return next(matching, None)

# Sanity check: confirm that the DOID OWL ontology exposes a property named "hasDbXref".
print("Found: ", get_owl_property(doid_owl, "hasDbXref"))

Found:  oboInOwl.hasDbXref


In [5]:
# Correct the ontology code in the given entity id

def correct_code(entity_id, corrections={"NCI" : "NCIT"}, delimiter=":"):
  """Replace the ontology prefix of an entity id when a correction is known.

  The id is split at the FIRST occurrence of ``delimiter`` only, so ids whose
  local part itself contains the delimiter are handled, and ids without any
  delimiter are returned unchanged instead of raising ``ValueError``.

  Args:
    entity_id: Identifier such as ``"NCI:C3262"``.
    corrections: Mapping from a prefix to its replacement. The default dict is
      shared between calls but is only read here, never modified.
    delimiter: Separator between the prefix and the rest of the id.

  Returns:
    The id with its prefix replaced if the prefix is a key of ``corrections``,
    otherwise ``entity_id`` unchanged.
  """
  code, found_delimiter, remainder = entity_id.partition(delimiter)
  # Without a delimiter there is no prefix to correct
  if found_delimiter and code in corrections:
    return corrections[code] + delimiter + remainder
  return entity_id

# Get entity equivalences encoded by the given Property name to the given target ontology, in the given owlready2 Ontology

def get_owl_entity_equivalences(source_onto, target_onto_code="NCI:", xref_prop_name="hasDbXref"):
  """Collect cross-reference based equivalences from the classes of an ontology.

  For every class of ``source_onto`` carrying the property named
  ``xref_prop_name``, the property values containing ``target_onto_code`` are
  gathered. A class contributes an equivalence only when exactly one such
  value exists.

  Args:
    source_onto: Ontology exposing ``classes()`` and ``properties()``.
    target_onto_code: Substring a cross-reference must contain to be considered.
    xref_prop_name: Python name of the cross-reference property.

  Returns:
    Dict mapping the prefix-corrected first id of each class to its single
    prefix-corrected cross-reference. Empty if the property is not found.
  """
  equivalences = {}
  prop = get_owl_property(source_onto, xref_prop_name)
  if not prop:
    return equivalences

  for cls in list(source_onto.classes()):
    if prop not in cls.get_properties(cls):
      continue
    matches = [correct_code(value) for value in prop[cls] if target_onto_code in value]
    # Ambiguous (several targets) or absent references are dropped
    if len(matches) != 1:
      continue
    equivalences[correct_code(cls.id[0])] = matches[0]
  return equivalences

In [6]:
def write_mappings(mappings, filepath):
  """Write a mapping to a text file, one "source,target" line per entry.

  Args:
    mappings: Dict-like object whose ``items()`` yields (source_id, target_id) pairs.
    filepath: Path of the file to create or overwrite.
  """
  with open(filepath, "w") as out_file:
    out_file.writelines(
      f"{source_id},{target_id}\n" for source_id, target_id in mappings.items()
    )

In [7]:
# Extract the DOID -> NCIT equivalences from the cross-references of the DOID OWL ontology
# (default arguments: references containing "NCI:" found via the "hasDbXref" property).
all_equivalences = get_owl_entity_equivalences(doid_owl)
# print("Equivalences (DOID to NCIT): ", all_equivalences)
print("Total equivalences: ", len(all_equivalences))

Total equivalences:  4331


In [8]:
# Persist the equivalences in both directions.
write_mappings(all_equivalences, "equiv_doid2ncit.csv")
# NOTE(review): inverting the dict keeps a single DOID id per NCIT id; if several DOID ids
# share the same NCIT id, only the last one iterated is written. Confirm this is intended.
write_mappings({v:k for (k,v) in all_equivalences.items()}, "equiv_ncit2doid.csv")

# Derive subsumptions

In [15]:
def get_subsumptions_from_equivalences(equivalences, source_onto, target_onto, negative=False, reverse=False):
  """Derive cross-ontology subsumption pairs from equivalence pairs.

  For each equivalence (source_id, target_id) whose two entities exist in their
  ontologies and each have exactly one direct parent:

  * ``reverse=False``: the pair is keyed by ``source_id`` and points into the
    target ontology (positive value: the target entity's parent).
  * ``reverse=True``: the pair is keyed by ``target_id`` and points into the
    source ontology (positive value: the source entity's parent). Several
    sources equivalent to the same target overwrite each other in this mode.

  With ``negative=True`` the value is instead a randomly drawn id from the
  pointed-into ontology. The draw rejects the equivalent entity itself and ALL
  of its superclasses (not only the direct parent), because any of those would
  be a true subsumption rather than a negative example. If no valid id exists
  the pair is skipped instead of looping forever.

  Side effects: reseeds the global numpy RNG with 3 so draws are repeatable,
  and prints the distribution of direct-parent counts on both sides.

  Args:
    equivalences: Dict of source id -> target id.
    source_onto: Ontology providing ``terms()`` and ``get_term()`` for source ids.
    target_onto: Ontology providing ``terms()`` and ``get_term()`` for target ids.
    negative: Produce random negative examples instead of true parents.
    reverse: Key the result by target id and point into the source ontology.

  Returns:
    Dict mapping a child id to the id of its (positive or negative) subsumer.
  """
  np.random.seed(3)

  # Negatives are drawn from the ontology the subsumer belongs to
  pool_onto = source_onto if reverse else target_onto
  pool_ids = [term.id for term in pool_onto.terms()] if negative else []
  pool_id_set = set(pool_ids)

  subsumptions = {}
  source_parent_counts = []
  target_parent_counts = []

  for source_id, target_id in tqdm(equivalences.items()):
    if source_id not in source_onto.terms():
      continue
    source_entity = source_onto.get_term(source_id)
    source_parents = set(source_entity.superclasses(distance=1, with_self=False))
    source_parent_counts.append(len(source_parents))

    if target_id not in target_onto.terms():
      continue
    target_entity = target_onto.get_term(target_id)
    target_parents = set(target_entity.superclasses(distance=1, with_self=False))
    target_parent_counts.append(len(target_parents))

    # Only keep mappings with single parents
    if len(source_parents) != 1 or len(target_parents) != 1:
      continue

    # child_id keys the relation; anchor is the equivalent entity in the pointed-into ontology
    if reverse:
      child_id, anchor, anchor_parents = target_id, source_entity, source_parents
    else:
      child_id, anchor, anchor_parents = source_id, target_entity, target_parents

    if not negative:
      subsumptions[child_id] = next(iter(anchor_parents)).id
      continue

    # Everything the child is genuinely subsumed by: the anchor itself and all its ancestors
    true_subsumer_ids = {term.id for term in anchor.superclasses(with_self=True)}
    if pool_id_set <= true_subsumer_ids:
      # No id can serve as a negative (also covers an empty pool)
      continue
    neg_sub = pool_ids[np.random.randint(0, len(pool_ids))]
    while neg_sub in true_subsumer_ids:
      neg_sub = pool_ids[np.random.randint(0, len(pool_ids))]
    subsumptions[child_id] = neg_sub

  print()
  print("Source parent counts: ", Counter(source_parent_counts))
  print("Target parent counts: ", Counter(target_parent_counts))
  return subsumptions

In [10]:
# Positive subsumptions keyed by DOID id, each pointing to the parent of the equivalent NCIT term.
doid_to_ncit_subs = get_subsumptions_from_equivalences(all_equivalences, doid, ncit)
# print("Subsumption relations (DOID to NCIT): ", doid_to_ncit_subs)
print("Total subsumptions (DOID to NCIT): ", len(doid_to_ncit_subs))

100%|██████████| 4331/4331 [00:00<00:00, 5199.66it/s]


Source parent counts:  Counter({1: 3242, 2: 983, 3: 81, 0: 10, 4: 10, 5: 2, 6: 2, 7: 1})
Target parent counts:  Counter({1: 2209, 2: 1701, 3: 356, 4: 50, 5: 13, 6: 1})
Total subsumptions (DOID to NCIT):  1883





In [16]:
# Negative examples keyed by DOID id, each pointing to a randomly drawn NCIT term id.
neg_doid_to_ncit_subs = get_subsumptions_from_equivalences(all_equivalences, doid, ncit, negative=True)
# print("Negative subsumption relations (DOID to NCIT): ", neg_doid_to_ncit_subs)
print("Total negative subsumptions (DOID to NCIT): ", len(neg_doid_to_ncit_subs))

100%|██████████| 4331/4331 [00:00<00:00, 5732.28it/s]


Source parent counts:  Counter({1: 3242, 2: 983, 3: 81, 0: 10, 4: 10, 5: 2, 6: 2, 7: 1})
Target parent counts:  Counter({1: 2209, 2: 1701, 3: 356, 4: 50, 5: 13, 6: 1})
Total negative subsumptions (DOID to NCIT):  1883





In [12]:
# Positive subsumptions keyed by NCIT id, each pointing to the parent of the equivalent DOID term.
# The ontologies are passed in the same order as before; reverse=True flips the direction of the result.
ncit_to_doid_subs = get_subsumptions_from_equivalences(all_equivalences, doid, ncit, reverse=True)
# print("Subsumption relations (NCIT to DOID): ", ncit_to_doid_subs)
print("Total subsumptions (NCIT to DOID): ", len(ncit_to_doid_subs))

100%|██████████| 4331/4331 [00:00<00:00, 9071.40it/s]


Source parent counts:  Counter({1: 3242, 2: 983, 3: 81, 0: 10, 4: 10, 5: 2, 6: 2, 7: 1})
Target parent counts:  Counter({1: 2209, 2: 1701, 3: 356, 4: 50, 5: 13, 6: 1})
Total subsumptions (NCIT to DOID):  1873





In [17]:
# Negative examples keyed by NCIT id, each pointing to a randomly drawn DOID term id.
neg_ncit_to_doid_subs = get_subsumptions_from_equivalences(all_equivalences, doid, ncit, negative=True, reverse=True)
# print("Negative subsumption relations (NCIT to DOID): ", neg_ncit_to_doid_subs)
print("Total negative subsumptions (NCIT to DOID): ", len(neg_ncit_to_doid_subs))

100%|██████████| 4331/4331 [00:00<00:00, 8990.94it/s]


Source parent counts:  Counter({1: 3242, 2: 983, 3: 81, 0: 10, 4: 10, 5: 2, 6: 2, 7: 1})
Target parent counts:  Counter({1: 2209, 2: 1701, 3: 356, 4: 50, 5: 13, 6: 1})
Total negative subsumptions (NCIT to DOID):  1873





In [19]:
# Persist the four subsumption sets (positive and negative, in both directions) as "source,target" files.
write_mappings(doid_to_ncit_subs, "subs_doid2ncit.csv")
write_mappings(neg_doid_to_ncit_subs, "neg_subs_doid2ncit.csv")
write_mappings(ncit_to_doid_subs, "subs_ncit2doid.csv")
write_mappings(neg_ncit_to_doid_subs, "neg_subs_ncit2doid.csv")