https://medium.com/@linuskohl/extracting-and-linking-ontology-terms-from-text-7806ae8d8189

In [2]:
#!pip install pronto

Print all direct child terms of the term “disease by infectious agent” in the DOID ontology.

In [5]:
from pronto import Ontology
# load the DOID ontology
#doid = Ontology("http://purl.obolibrary.org/obo/doid.obo")
# # select a node by ID
# root_node = doid['DOID:0050117'] # disease by infectious agent
# # print all children (distance=1) without the node itself
# for term in root_node.subclasses(distance=1, with_self=False).to_set():
#     print(term.name)

https://github.com/DiseaseOntology/HumanDiseaseOntology

In [17]:
# Location of the DOID ontology file in OBO format (presumably a local checkout
# of the HumanDiseaseOntology repository linked above).
# NOTE: this is a machine-specific absolute path; adjust it before running the
# notebook on another machine.
DOID_OBO_PATH = "/Users/patsnap/Desktop/Neo4J_and_other_codes/Bioinformatics/HumanDiseaseOntology/src/ontology/doid.obo"

# load the DOID ontology
doid = Ontology(DOID_OBO_PATH)

In [19]:
# Look up the term "disease by infectious agent" by its DOID identifier.
root_node = doid['DOID:0050117']

# Gather the direct subclasses only (distance=1), leaving out the node itself,
# and print the name of each one.
direct_children = root_node.subclasses(distance=1, with_self=False).to_set()
for term in direct_children:
    print(term.name)

fungal infectious disease
pelvic inflammatory disease
ornithine translocase deficiency
infective endocarditis
osteomyelitis
parasitic infectious disease
bacterial infectious disease
viral infectious disease


In [21]:
!pip install progressbar

Collecting progressbar
  Downloading progressbar-2.5.tar.gz (10 kB)
Building wheels for collected packages: progressbar
  Building wheel for progressbar (setup.py) ... [?25ldone
[?25h  Created wheel for progressbar: filename=progressbar-2.5-py3-none-any.whl size=12074 sha256=a13b0c10863a3882d87bccbe4cbcd7ec6e99289e3adfbb19929478e262aacec2
  Stored in directory: /Users/patsnap/Library/Caches/pip/wheels/f0/fd/1f/3e35ed57e94cd8ced38dd46771f1f0f94f65fec548659ed855
Successfully built progressbar
Installing collected packages: progressbar
Successfully installed progressbar-2.5


In [22]:
import progressbar
from pronto import Ontology
from spacy.tokens import Doc, Span, Token
from spacy.matcher import PhraseMatcher
from spacy.util import filter_spans

In [27]:
class DOIDExtractorComponent(object):
    """spaCy pipeline component that marks Disease Ontology (DOID) terms in a Doc.

    On construction the ontology is loaded with pronto and every named term is
    turned into a case-insensitive ``PhraseMatcher`` pattern. When the component
    is called on a ``Doc`` it

    * finds all phrase matches and keeps the non-overlapping ones selected by
      ``filter_spans``,
    * flags every token of a kept match (``token._.is_doid_term``) and stores
      the matching ontology ID on it (``token._.doid_id``),
    * merges each kept match into a single token (``token._.merged_concept``),
    * appends the kept spans to ``doc._.doids``.
    """

    # name of the component
    name = "doid_extractor"

    # Ontology file that is loaded when ``ontology_path`` is not given.
    # NOTE: machine-specific absolute path; pass ``ontology_path`` elsewhere.
    DEFAULT_ONTOLOGY_PATH = "/Users/patsnap/Desktop/Neo4J_and_other_codes/Bioinformatics/HumanDiseaseOntology/src/ontology/doid.obo"

    def __init__(self, nlp, label="DOID", ontology_path=None):
        """Load the ontology, build the phrase matcher and register extensions.

        Args:
            nlp: spaCy language object; its tokenizer and vocab are used to
                build the match patterns.
            label: label applied to the matched spans.
            ontology_path: path (or any handle accepted by ``pronto.Ontology``)
                of the DOID ontology. Defaults to ``DEFAULT_ONTOLOGY_PATH``.
        """
        # label that is applied to the matches
        self.label = label

        if ontology_path is None:
            ontology_path = self.DEFAULT_ONTOLOGY_PATH

        # load ontology
        print("Loading DOID ontology")
        doid = Ontology(ontology_path)

        # init terms and patterns
        # self.terms: lowercased term name -> {'id': <ontology id>}
        self.terms = {}
        # lowercased token sequence of a pattern -> ontology id; this mirrors
        # exactly what the LOWER-based matcher compares, so every match can be
        # resolved even if the matched text is spaced differently from the name
        self._ids_by_tokens = {}
        patterns = []

        nr_terms = len(doid.terms())
        # init progress bar as loading terms takes long
        print("Importing terms")
        bar = progressbar.ProgressBar(maxval=nr_terms,
                                      widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
        bar.start()

        # iterate over terms in ontology
        for i, term in enumerate(doid.terms(), start=1):
            # if term has a name
            if term.name is not None:
                self.terms[term.name.lower()] = {'id': term.id}
                # Only tokenization is needed for a LOWER matcher, so skip the
                # rest of the pipeline (tagger, parser, ner, ...).
                pattern = nlp.make_doc(term.name)
                self._ids_by_tokens[self._token_key(pattern)] = term.id
                patterns.append(pattern)
            bar.update(i)

        bar.finish()

        # initialize matcher and add patterns
        self.matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
        self.matcher.add(label, None, *patterns)

        # set extensions to tokens, spans and docs
        Token.set_extension("is_doid_term", default=False, force=True)
        Token.set_extension("doid_id", default=False, force=True)
        Token.set_extension("merged_concept", default=False, force=True)
        Doc.set_extension("has_doids", getter=self.has_doids, force=True)
        Doc.set_extension("doids", default=[], force=True)
        Span.set_extension("has_doids", getter=self.has_doids, force=True)

    @staticmethod
    def _token_key(tokens):
        """Return the lookup key for a token sequence: its lowercased token texts."""
        return tuple(token.lower_ for token in tokens)

    def __call__(self, doc):
        """Annotate ``doc`` with DOID matches and return it.

        Args:
            doc: the spaCy ``Doc`` to process.

        Returns:
            The same ``Doc`` with matched terms flagged, merged into single
            tokens and collected in ``doc._.doids``.
        """
        matches = self.matcher(doc)
        candidates = [Span(doc, start, end, label=self.label) for _, start, end in matches]

        # Resolve overlaps first so that tokens are only annotated with the ID
        # of the match that is actually kept and merged.
        kept = filter_spans(candidates)

        for span in kept:
            doid_id = self._ids_by_tokens[self._token_key(span)]
            for token in span:
                token._.set("is_doid_term", True)
                token._.set("doid_id", doid_id)

        if kept:
            # build a new list instead of mutating the stored one
            doc._.doids = list(doc._.doids) + kept

        with doc.retokenize() as retokenizer:
            for span in kept:
                retokenizer.merge(span, attrs={"_": {"merged_concept": True}})

        return doc

    # getter function for the "has_doids" extension on Doc and Span
    def has_doids(self, tokens):
        """Return True if any token in ``tokens`` is flagged as a DOID term."""
        return any(token._.get("is_doid_term") for token in tokens)

In [24]:
from pronto import Ontology
import progressbar
import spacy
from spacy import displacy
from spacy.tokens import Doc, Span, Token
from spacy.lang.en import English
from spacy.matcher import PhraseMatcher
from spacy.util import filter_spans

In [25]:
# Load the spaCy model "en_core_web_sm". If it is not installed yet, it can be
# downloaded from a shell with:
#   python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

In [28]:
# Make this cell safe to re-run: drop a previously registered extractor before
# building and adding a new one, otherwise add_pipe fails on the duplicate name.
if DOIDExtractorComponent.name in nlp.pipe_names:
    nlp.remove_pipe(DOIDExtractorComponent.name)

# Build the component (loads the ontology and compiles the match patterns) and
# insert it right after the named-entity recognizer.
doid_extractor = DOIDExtractorComponent(nlp)
nlp.add_pipe(doid_extractor, after="ner")

Loading DOID ontology


[                                                                        ]   0%

Importing terms




In [29]:
# Sample input: random sentences taken from the publication
# "Pulmonary and Cardiac Pathology in Covid-19: The First Autopsy Series from New Orleans", Sharon E. Fox et al.
test = """
Whether this may represent an early manifestation of a viral myocarditis is not certain,
but there was no significant brisk lymphocytic inflammatory infiltrate consistent with the
typical pattern of viral myocarditis...
There is prior evidence of viral infection causing activation of both maladaptive cytokine pathways,
and platelet response, and our findings suggest that these immune functions may be related to
severe forms of Covid-19. In response to systemic and pulmonary viral infections of H1N1
influenza and dengue, megakaryocytes have been known to respond by overexpressing IFITM3,
and producing platelets with the same over-expression.
"""
# Run the sample text through the `nlp` pipeline and keep the resulting Doc.
doc = nlp(test)

In [34]:
# For every token flagged as a DOID term, print the token followed by a line
# holding its OBO URL (the ":" in the ID becomes "_") and the token text.
for token in doc:
    if not token._.is_doid_term:
        continue
    print(token)
    obo_id = token._.doid_id.replace(":", "_")
    print("http://purl.obolibrary.org/obo/{}\t\t{}".format(obo_id, token.text))

myocarditis
http://purl.obolibrary.org/obo/DOID_820		myocarditis
myocarditis
http://purl.obolibrary.org/obo/DOID_820		myocarditis
Covid-19
http://purl.obolibrary.org/obo/DOID_0080600		Covid-19
influenza
http://purl.obolibrary.org/obo/DOID_8469		influenza
