<a href="https://colab.research.google.com/github/stellaevat/ontology-mapping/blob/main/colabs/load_and_extract.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pronto owlready2 \
&& wget -O doid.obo https://gla-my.sharepoint.com/:u:/g/personal/2526934t_student_gla_ac_uk/EfUC_RdrfZdOsOrtmNATjuoBPDaIkSTUMyxJXyO2KKC6yw?download=1 \
&& wget -O ncit.obo https://gla-my.sharepoint.com/:u:/g/personal/2526934t_student_gla_ac_uk/ETmaJIC0fAlItdsp8WQxS_wBzKN_6x08EZrtsOxVnbzvSg?download=1 \
&& wget -O doid.owl https://gla-my.sharepoint.com/:u:/g/personal/2526934t_student_gla_ac_uk/EVjp-NoZlZtDq42LqxZH08ABz2h7IWqdw2gxetzvA-u3hQ?download=1

In [2]:
import pronto
import owlready2
from collections import Counter

In [3]:
# Parse the OBO files with pronto; these two ontologies are used later for
# the term / superclass lookups when deriving subsumptions.
ncit = pronto.Ontology("ncit.obo")
doid = pronto.Ontology("doid.obo")
# Load the OWL version of DOID with owlready2; this is the ontology the
# cross-reference (xref) equivalences are extracted from.
doid_owl = owlready2.get_ontology("doid.owl").load()

# Extract equivalences

In [4]:
# Look up a Property object by name in the given owlready2 Ontology

def get_owl_property(onto, target_prop_name):
  """Return the first property of `onto` whose `python_name` matches.

  Args:
    onto: object exposing `properties()`, an iterable of property objects
      that each carry a `python_name` attribute.
    target_prop_name: the `python_name` to search for.

  Returns:
    The first matching property object, or None when nothing matches.
  """
  # The property collection is materialised up front, then scanned lazily
  # so the search stops at the first hit.
  candidates = (
    prop for prop in list(onto.properties())
    if prop.python_name == target_prop_name
  )
  return next(candidates, None)

# Sanity check: confirm that the DOID OWL ontology exposes the "hasDbXref"
# property, which is the default xref property used by the extraction below.
print("Found: ", get_owl_property(doid_owl, "hasDbXref"))

Found:  oboInOwl.hasDbXref


In [5]:
# Correct the ontology code in the given entity id

def correct_code(entity_id, corrections={"NCI" : "NCIT"}, delimiter=":"):
  """Rewrite the ontology prefix of `entity_id` according to `corrections`.

  The id is read as `<code><delimiter><local id>`. If `<code>` is a key of
  `corrections`, it is replaced by the mapped value; otherwise the id is
  returned unchanged.

  Args:
    entity_id: identifier string such as "NCI:C3262".
    corrections: mapping from wrong prefix to corrected prefix. The default
      dict is only read, never mutated.
    delimiter: separator between the prefix and the local id.

  Returns:
    The corrected identifier, or `entity_id` itself when no correction
    applies (including ids that do not contain the delimiter at all).
  """
  # partition() splits on the FIRST delimiter only, so ids without a
  # delimiter, or with further delimiters inside the local id, no longer
  # raise ValueError the way a two-name unpacking of split() did.
  code, found_delimiter, local_id = entity_id.partition(delimiter)
  if found_delimiter and code in corrections:
    return corrections[code] + delimiter + local_id
  return entity_id

# Get entity equivalences encoded by the given Property name to the given target ontology, in the given owlready2 Ontology

def get_owl_entity_equivalences(source_onto, target_onto_code="NCI:", xref_prop_name="hasDbXref"):
  """Collect unambiguous xref equivalences from `source_onto` to a target ontology.

  For every class of `source_onto` that carries the xref property, the xref
  values that start with `target_onto_code` are collected. A class is kept
  only when exactly one such xref exists. Both the class id and the xref are
  passed through `correct_code` before being stored.

  Args:
    source_onto: owlready2 ontology to read classes and xrefs from.
    target_onto_code: prefix (including delimiter) identifying xrefs that
      point into the target ontology.
    xref_prop_name: `python_name` of the property holding the xrefs.

  Returns:
    Dict mapping source class id -> target id. Empty when the xref property
    cannot be found in `source_onto`.
  """
  all_equivalences = {}
  xref_prop = get_owl_property(source_onto, xref_prop_name)
  if not xref_prop:
    return all_equivalences

  # The class list is materialised before any per-class property lookups.
  for entity in list(source_onto.classes()):
    # NOTE(review): get_properties is invoked on the class with the class
    # itself as argument; kept exactly as-is - confirm against the
    # owlready2 API.
    if xref_prop not in entity.get_properties(entity):
      continue
    # Match the target code as a prefix: a plain substring test would also
    # accept xrefs of other vocabularies that merely contain the code.
    xrefs = [correct_code(ref) for ref in xref_prop[entity] if ref.startswith(target_onto_code)]
    # Only keep equivalences with single target
    if len(xrefs) == 1:
      # entity.id[0]: the first value of the class's `id` attribute is used as the key.
      all_equivalences[correct_code(entity.id[0])] = xrefs[0]
  return all_equivalences

In [6]:
def write_mappings(mappings, filepath):
  """Write `mappings` to `filepath`, one "source_id,target_id" line per entry.

  Args:
    mappings: dict mapping source id -> target id.
    filepath: path of the text file to create or overwrite.
  """
  with open(filepath, "w") as out_file:
    out_file.writelines(f"{src},{tgt}\n" for src, tgt in mappings.items())

In [21]:
# Extract DOID -> NCIT equivalences from the xrefs of the DOID OWL ontology
# (default target code and xref property of get_owl_entity_equivalences).
all_equivalences = get_owl_entity_equivalences(doid_owl)
# Debug aid (prints the full mapping; left disabled):
# print("Equivalences (DOID to NCIT): ", all_equivalences)
print("Total equivalences: ", len(all_equivalences))

Total equivalences:  4331


In [8]:
# Export the equivalences in both directions as "source,target" lines.
write_mappings(all_equivalences, "doid2ncit_equiv.csv")
# The reverse file is produced by inverting the dict.
# NOTE(review): if several DOID ids share the same NCIT target, the inversion
# keeps only the last one - confirm that this is acceptable.
write_mappings({v:k for (k,v) in all_equivalences.items()}, "ncit2doid_equiv.csv")

# Derive subsumptions

In [14]:
def get_subsumptions_from_equivalences(equivalences, source_onto, target_onto, reverse=False):
  """Derive cross-ontology subsumptions (child -> parent) from equivalences.

  For each equivalent pair (source_id, target_id) whose terms both exist in
  their ontology and both have exactly one direct superclass:
    * reverse=False: source_id is mapped to the target term's parent id;
    * reverse=True:  target_id is mapped to the source term's parent id.

  As a side effect, prints a Counter of the number of direct parents found
  for the source terms and for the target terms.

  Args:
    equivalences: dict mapping source term id -> target term id.
    source_onto: ontology exposing `terms()` and `get_term(id)`; terms expose
      `id` and `superclasses(distance=..., with_self=...)`.
    target_onto: same interface, for the target ids.
    reverse: selects the direction of the emitted subsumptions (see above).

  Returns:
    Dict mapping child id -> parent id. In reverse mode, several source ids
    that share one target id collapse into a single entry (the last pair
    processed wins).
  """
  # Index the term ids once. Testing `id in onto.terms()` inside the loop
  # would walk the whole ontology again for every single equivalence.
  source_term_ids = {term.id for term in source_onto.terms()}
  target_term_ids = {term.id for term in target_onto.terms()}

  subsumptions = {}
  source_parent_counts = []
  target_parent_counts = []
  for source_id, target_id in equivalences.items():
    if source_id not in source_term_ids:
      continue
    source_entity = source_onto.get_term(source_id)
    # distance=1 / with_self=False: direct superclasses only.
    source_parent_ids = [p.id for p in source_entity.superclasses(distance=1, with_self=False)]
    source_parent_counts.append(len(source_parent_ids))

    # Target statistics are only recorded for pairs whose source term exists.
    if target_id not in target_term_ids:
      continue
    target_entity = target_onto.get_term(target_id)
    target_parent_ids = [p.id for p in target_entity.superclasses(distance=1, with_self=False)]
    target_parent_counts.append(len(target_parent_ids))

    # Only keep mappings with single parent
    if len(source_parent_ids) == 1 and len(target_parent_ids) == 1:
      if reverse:
        subsumptions[target_id] = source_parent_ids[0]
      else:
        subsumptions[source_id] = target_parent_ids[0]

  print("Source parents:", Counter(source_parent_counts))
  print("Target parents:", Counter(target_parent_counts))
  return subsumptions

In [19]:
# Derive DOID -> NCIT subsumptions: each DOID id is mapped to the single
# direct parent of its equivalent NCIT term.
doid_to_ncit_subs = get_subsumptions_from_equivalences(all_equivalences, doid, ncit)
# Debug aid (prints the full mapping; left disabled):
# print("Subsumption relations (DOID to NCIT): ", doid_to_ncit_subs)
print("Total subsumptions: ", len(doid_to_ncit_subs))

Source parents: Counter({1: 3242, 2: 983, 3: 81, 0: 10, 4: 10, 5: 2, 6: 2, 7: 1})
Target parents: Counter({1: 2209, 2: 1701, 3: 356, 4: 50, 5: 13, 6: 1})
Total subsumptions:  1883


In [20]:
# Derive NCIT -> DOID subsumptions: with reverse=True each NCIT id is mapped
# to the single direct parent of its equivalent DOID term. The result is keyed
# by NCIT id, so DOID ids sharing one NCIT target produce a single entry.
ncit_to_doid_subs = get_subsumptions_from_equivalences(all_equivalences, doid, ncit, reverse=True)
# Debug aid (prints the full mapping; left disabled):
# print("Subsumption relations (NCIT to DOID): ", ncit_to_doid_subs)
print("Total subsumptions: ", len(ncit_to_doid_subs))

Source parents: Counter({1: 3242, 2: 983, 3: 81, 0: 10, 4: 10, 5: 2, 6: 2, 7: 1})
Target parents: Counter({1: 2209, 2: 1701, 3: 356, 4: 50, 5: 13, 6: 1})
Total subsumptions:  1873


In [12]:
# Export the derived subsumptions in both directions as "child,parent" lines.
write_mappings(doid_to_ncit_subs, "doid2ncit_subs.csv")
write_mappings(ncit_to_doid_subs, "ncit2doid_subs.csv")