<a href="https://colab.research.google.com/github/stellaevat/ontology-mapping/blob/main/colabs/load_and_extract.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install pronto and owlready2, then download the DOID (OBO + OWL) and NCIT (OBO) files into the working directory.
# The steps are chained with `&&`, so a failed step stops the ones after it.
!pip install pronto owlready2 \
&& wget -O doid.obo https://gla-my.sharepoint.com/:u:/g/personal/2526934t_student_gla_ac_uk/EfUC_RdrfZdOsOrtmNATjuoBPDaIkSTUMyxJXyO2KKC6yw?download=1 \
&& wget -O ncit.obo https://gla-my.sharepoint.com/:u:/g/personal/2526934t_student_gla_ac_uk/ETmaJIC0fAlItdsp8WQxS_wBzKN_6x08EZrtsOxVnbzvSg?download=1 \
&& wget -O doid.owl https://gla-my.sharepoint.com/:u:/g/personal/2526934t_student_gla_ac_uk/EVjp-NoZlZtDq42LqxZH08ABz2h7IWqdw2gxetzvA-u3hQ?download=1

Collecting pronto
  Downloading pronto-2.5.5-py2.py3-none-any.whl (61 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.5/61.5 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting owlready2
  Downloading owlready2-0.44.tar.gz (27.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.3/27.3 MB[0m [31m56.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting fastobo~=0.12.2 (from pronto)
  Downloading fastobo-0.12.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m80.2 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: owlready2
  Building whe

In [2]:
import pronto
import owlready2

In [3]:
# Load the OBO files downloaded by the first cell with pronto; these objects
# are used further down to look up terms and their direct parents/children.
ncit = pronto.Ontology("ncit.obo")
doid = pronto.Ontology("doid.obo")
# Also load the OWL file of DOID with owlready2; the equivalence extraction
# below reads the cross-reference property from this object.
doid_owl = owlready2.get_ontology("doid.owl").load()

# Extract equivalences

In [4]:
# Look up a Property object by name in the given owlready2 Ontology

def get_owl_property(onto, target_prop_name):
  """Return the first property of `onto` whose `python_name` matches.

  Args:
    onto: object exposing `properties()`, an iterable of property objects
      that each carry a `python_name` attribute.
    target_prop_name: the `python_name` to search for.

  Returns:
    The first matching property, or None when nothing matches.
  """
  candidates = list(onto.properties())
  matching = (candidate for candidate in candidates
              if candidate.python_name == target_prop_name)
  # next() with a default yields the first hit, or None if there is none.
  return next(matching, None)

# Sanity check: confirm the loaded DOID OWL ontology exposes a property named "hasDbXref".
print("Found: ", get_owl_property(doid_owl, "hasDbXref"))

Found:  oboInOwl.hasDbXref


In [5]:
# Correct the ontology code in the given entity id

def correct_code(entity_id, corrections={"NCI" : "NCIT"}, delimiter=":"):
  """Replace the ontology code (prefix) of `entity_id` using `corrections`.

  Args:
    entity_id: identifier of the form "<code><delimiter><rest>".
    corrections: mapping from a code to its replacement. The default dict is
      only read, never mutated.
    delimiter: separator between the code and the rest of the identifier.

  Returns:
    The identifier with its code replaced when the code is a key of
    `corrections`; otherwise `entity_id` unchanged. Identifiers that do not
    contain the delimiter are returned unchanged as well.
  """
  # Split on the FIRST delimiter only: unpacking the result of a plain
  # split() raised ValueError for ids with no delimiter or more than one.
  code, found, rest = entity_id.partition(delimiter)
  if found and code in corrections:
    return corrections[code] + delimiter + rest
  return entity_id

# Collect the entity equivalences that the given Property encodes towards the target ontology, in the given owlready2 Ontology

def get_owl_entity_equivalences(source_onto, target_onto_code="NCI:", xref_prop_name="hasDbXref"):
  """Map corrected source entity ids to their single cross-reference.

  Args:
    source_onto: ontology object exposing `classes()` and `properties()`.
    target_onto_code: substring a cross-reference must contain to be kept.
    xref_prop_name: python name of the property holding the cross-references.

  Returns:
    A dict from `correct_code(entity.id[0])` to the corrected cross-reference,
    for every class that has exactly one cross-reference containing
    `target_onto_code`. Classes with zero or several matches are left out.
    The dict is empty when the property cannot be found.
  """
  equivalences = {}
  xref_prop = get_owl_property(source_onto, xref_prop_name)
  if not xref_prop:
    return equivalences

  for owl_class in list(source_onto.classes()):
    if xref_prop not in owl_class.get_properties(owl_class):
      continue

    matching_refs = []
    for ref in xref_prop[owl_class]:
      if target_onto_code in ref:
        matching_refs.append(correct_code(ref))

    # Only unambiguous (one-to-one) cross-references are recorded.
    if len(matching_refs) == 1:
      equivalences[correct_code(owl_class.id[0])] = matching_refs[0]

  return equivalences

In [6]:
# Extract DOID -> NCIT equivalences from the cross-references of the DOID OWL ontology
# (defaults: references containing "NCI:", read from the "hasDbXref" property).
all_equivalences = get_owl_entity_equivalences(doid_owl)
print("External equivalences: ", all_equivalences)
print("Total: ", len(all_equivalences))

External equivalences:  {'DOID:0014667': 'NCIT:C3235', 'DOID:0040093': 'NCIT:C114354', 'DOID:0050120': 'NCIT:C34792', 'DOID:0050127': 'NCIT:C128411', 'DOID:0050144': 'NCIT:C84797', 'DOID:0050156': 'NCIT:C35716', 'DOID:0050157': 'NCIT:C62586', 'DOID:0050158': 'NCIT:C35288', 'DOID:0050214': 'NCIT:C3155', 'DOID:0050269': 'NCIT:C35083', 'DOID:0050424': 'NCIT:C3339', 'DOID:0050425': 'NCIT:C84501', 'DOID:0050426': 'NCIT:C79484', 'DOID:0050427': 'NCIT:C3452', 'DOID:0050428': 'NCIT:C3147', 'DOID:0050429': 'NCIT:C82865', 'DOID:0050430': 'NCIT:C3226', 'DOID:0050431': 'NCIT:C84571', 'DOID:0050432': 'NCIT:C97159', 'DOID:0050433': 'NCIT:C84711', 'DOID:0050434': 'NCIT:C84559', 'DOID:0050436': 'NCIT:C84906', 'DOID:0050437': 'NCIT:C84735', 'DOID:0050438': 'NCIT:C122805', 'DOID:0050439': 'NCIT:C85217', 'DOID:0050441': 'NCIT:C84908', 'DOID:0050444': 'NCIT:C84789', 'DOID:0050445': 'NCIT:C85234', 'DOID:0050448': 'NCIT:C84760', 'DOID:0050449': 'NCIT:C84986', 'DOID:0050450': 'NCIT:C84730', 'DOID:0050451': '

# Derive subsumptions

> Maybe manipulate somehow so that when multiple hasDbXrefs exist, if any of them are parent/child of the other, keep only the one that's closest (e.g. lexically) to the source entity?

In [7]:
def get_subsumptions_from_equivalences(equivalences, source_onto, target_onto, reverse=False):
  """Derive cross-ontology subsumptions from entity equivalences.

  For every (source_id, target_id) pair whose two ids are both found in their
  respective ontology:
    * reverse=False: source_id -> ids of the direct parents of the target entity
    * reverse=True:  target_id -> ids of the direct parents of the source entity
  Pairs whose relevant parent list is empty are skipped. If the same key is
  produced more than once, the later pair overwrites the earlier one.

  Args:
    equivalences: dict mapping source entity ids to target entity ids.
    source_onto: ontology exposing `terms()` and `get_term(id)`.
    target_onto: ontology exposing `terms()` and `get_term(id)`.
    reverse: selects the direction of the derived subsumptions (see above).

  Returns:
    A dict mapping an entity id to a list of parent ids from the other ontology.
  """
  def direct_parent_ids(entity):
    # distance=1 / with_self=False: only the immediate superclasses.
    return [parent.id for parent in entity.superclasses(distance=1, with_self=False)]

  derived = {}
  for src_id, tgt_id in equivalences.items():
    if src_id not in source_onto.terms():
      continue
    src_parent_ids = direct_parent_ids(source_onto.get_term(src_id))

    if tgt_id not in target_onto.terms():
      continue
    tgt_parent_ids = direct_parent_ids(target_onto.get_term(tgt_id))

    if reverse:
      key, parent_ids = tgt_id, src_parent_ids
    else:
      key, parent_ids = src_id, tgt_parent_ids

    if parent_ids:
      derived[key] = parent_ids

  return derived

In [8]:
def write_subsumptions(subsumptions, filepath):
  """Write subsumption pairs to `filepath`, one "source,target" row per pair.

  Args:
    subsumptions: dict mapping a source id either to a single target id
      (string) or to an iterable of target ids.
    filepath: path of the file to (over)write.

  The file is opened in "w" mode, so any existing content is replaced.
  """
  with open(filepath, "w") as f:
    for source_id, target_ids in subsumptions.items():
      # A bare string is one target, not an iterable of characters.
      if isinstance(target_ids, str):
        target_ids = [target_ids]
      # One row per (source, target) pair: formatting the whole list into the
      # row wrote its Python repr ("A,['B', 'C']"), which is not valid CSV.
      for target_id in target_ids:
        f.write(f"{source_id},{target_id}\n")

In [9]:
# DOID -> NCIT: each DOID entity is mapped to the direct NCIT parents of its equivalent NCIT entity.
subs_doid_to_ncit = get_subsumptions_from_equivalences(all_equivalences, doid, ncit)
print("Subsumption relations (DOID to NCIT): ", subs_doid_to_ncit)
print("Total: ", len(subs_doid_to_ncit))

Subsumption relations (DOID to NCIT):  {'DOID:0014667': ['NCIT:C53547'], 'DOID:0040093': ['NCIT:C27153'], 'DOID:0050120': ['NCIT:C35439', 'NCIT:C53543'], 'DOID:0050127': ['NCIT:C35024'], 'DOID:0050144': ['NCIT:C28193', 'NCIT:C53543'], 'DOID:0050156': ['NCIT:C26869'], 'DOID:0050157': ['NCIT:C113159'], 'DOID:0050158': ['NCIT:C35714'], 'DOID:0050214': ['NCIT:C189005', 'NCIT:C3311'], 'DOID:0050269': ['NCIT:C35720', 'NCIT:C84353'], 'DOID:0050424': ['NCIT:C165470'], 'DOID:0050425': ['NCIT:C28193'], 'DOID:0050426': ['NCIT:C27555', 'NCIT:C28193'], 'DOID:0050427': ['NCIT:C156032', 'NCIT:C7757', 'NCIT:C8957'], 'DOID:0050428': ['NCIT:C34748', 'NCIT:C8957'], 'DOID:0050429': ['NCIT:C34909'], 'DOID:0050430': ['NCIT:C123329'], 'DOID:0050431': ['NCIT:C53543'], 'DOID:0050432': ['NCIT:C88412'], 'DOID:0050433': ['NCIT:C28286', 'NCIT:C53543'], 'DOID:0050434': ['NCIT:C34786', 'NCIT:C53543'], 'DOID:0050436': ['NCIT:C28193', 'NCIT:C53543'], 'DOID:0050437': ['NCIT:C84734'], 'DOID:0050438': ['NCIT:C28193'], 'D

In [10]:
# NCIT -> DOID: same equivalences, but each NCIT entity is mapped to the direct DOID parents
# of its equivalent DOID entity (reverse=True flips the direction, not the argument order).
subs_ncit_to_doid = get_subsumptions_from_equivalences(all_equivalences, doid, ncit, reverse=True)
print("Subsumption relations (NCIT to DOID): ", subs_ncit_to_doid)
print("Total: ", len(subs_ncit_to_doid))

Subsumption relations (NCIT to DOID):  {'NCIT:C3235': ['DOID:4'], 'NCIT:C114354': ['DOID:8857'], 'NCIT:C34792': ['DOID:75'], 'NCIT:C128411': ['DOID:1352'], 'NCIT:C84797': ['DOID:9562'], 'NCIT:C35716': ['DOID:3770'], 'NCIT:C62586': ['DOID:2797'], 'NCIT:C35288': ['DOID:2797'], 'NCIT:C3155': ['DOID:0060032', 'DOID:0060033', 'DOID:439'], 'NCIT:C35083': ['DOID:121', 'DOID:1947', 'DOID:2059', 'DOID:2253', 'DOID:732'], 'NCIT:C3339': ['DOID:0050739', 'DOID:5295'], 'NCIT:C84501': ['DOID:331'], 'NCIT:C79484': ['DOID:37'], 'NCIT:C3452': ['DOID:0050737', 'DOID:225'], 'NCIT:C3147': ['DOID:3390'], 'NCIT:C82865': ['DOID:0050736', 'DOID:9182'], 'NCIT:C3226': ['DOID:0050736', 'DOID:3125'], 'NCIT:C84571': ['DOID:0060036'], 'NCIT:C97159': ['DOID:0060041'], 'NCIT:C84711': ['DOID:649'], 'NCIT:C84559': ['DOID:0050736', 'DOID:2843'], 'NCIT:C84906': ['DOID:0050737', 'DOID:225'], 'NCIT:C84735': ['DOID:0080009', 'DOID:3211'], 'NCIT:C122805': ['DOID:0050736', 'DOID:225'], 'NCIT:C85217': ['DOID:0050737', 'DOID:22

In [11]:
# Save both subsumption mappings as CSV files (paths are relative to the working directory).
write_subsumptions(subs_doid_to_ncit, "doid_to_ncit.csv")
write_subsumptions(subs_ncit_to_doid, "ncit_to_doid.csv")

# Convert relations to sentences

> Currently considering parents, children & siblings for conceptual reasons, but could also take an 'n-hop' approach, e.g. 1-hop with only parents and children, or 2-hop to also include grandparents, grandchildren and siblings.

> How do I incorporate the desired mapping for training? Should I include all of this AND the target info, or is that too much? It could be SELF + desired relatives instead, or SELF + PARENT + DESIRED PARENT, etc.

In [12]:
# Create sentence from the given entity, containing its direct parents, children & siblings

def get_term_plus_relatives(entity_id, onto=None):
  """Build a marker-delimited sentence for a term and its direct relatives.

  The sentence has the form
    [SELF] name [/SELF] [SUP] parent [/SUP] ... [SUB] child [/SUB] ... [SIBL] sibling [/SIBL] ...
  where siblings are the other direct subclasses of the term's direct parents.

  Args:
    entity_id: id of the term to describe.
    onto: ontology exposing `get_term(id)`, whose terms expose `name`, `id`,
      `superclasses(...)` and `subclasses(...)`. Defaults to the
      notebook-level `doid` ontology.

  Returns:
    The sentence as a single space-separated string.
  """
  if onto is None:
    onto = doid  # notebook-level DOID ontology

  term = onto.get_term(entity_id)
  # Materialise once: the parents are traversed twice below.
  parents = list(term.superclasses(distance=1, with_self=False))
  children = list(term.subclasses(distance=1, with_self=False))

  siblings = set()
  for parent in parents:
    siblings.update(parent.subclasses(distance=1, with_self=False))
  # discard() instead of remove(): a term without parents has no sibling set
  # containing itself, and remove() raised KeyError in that case.
  siblings.discard(term)
  # Set iteration order is not stable between runs; sort by id so the
  # generated sentence is reproducible.
  ordered_siblings = sorted(siblings, key=lambda sibling: sibling.id)

  pieces = ["[SELF]", term.name, "[/SELF]"]
  for tag, relatives in (("SUP", parents), ("SUB", children), ("SIBL", ordered_siblings)):
    for relative in relatives:
      pieces.extend([f"[{tag}]", relative.name, f"[/{tag}]"])

  return " ".join(pieces)

In [13]:
# Example: the sentence generated for a single DOID term.
print(get_term_plus_relatives("DOID:0050159"))

[SELF] lymphoid interstitial pneumonia [/SELF] [SUP] idiopathic interstitial pneumonia [/SUP] [SIBL] nonspecific interstitial pneumonia [/SIBL] [SIBL] cryptogenic organizing pneumonia [/SIBL] [SIBL] acute interstitial pneumonia [/SIBL] [SIBL] desquamative interstitial pneumonia [/SIBL]
