In [1]:
import time
import rdflib


class NciThesaurusTools(object):
    """A class to provide a toolkit for working with OWL files from NCI Thesaurus."""

    @staticmethod
    def _parse_owl(inputfile: str) -> rdflib.Graph:
        """
        Summary:
        --------
        A semantically related method to the class, to be used as a reproducible means
        to parse an OWL to XML.

        Parameters:
        -----------
        inputfile : str.
            inputfile NCI Thesaurus OWL file that is to be parsed.

        Returns:
        --------
        graph : rdflib.Graph.
            A live RDF graph instance of the NCI Thesaurus's release.

        """
        try:
            start_time = time.time()
            graph = rdflib.Graph()
            print("Begin to parse OWL inputfile...")
            # Need to bind prefix --> if not will result in blank prefix
            graph.namespace_manager.bind(
                "ncit", "http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#"
            )
            graph.parse(inputfile, format="xml")  # format='xml' is default
            end_time = time.time()
            print(
                "Successfully completed parsing OWL in: {} ".format(
                    end_time - start_time
                )
            )
        except Exception as e:
            raise Exception(e)

        return graph

In [2]:
# Parse OWL file (NCI Thesaurus latest release -> Thesaurus.owl) to RDF/XML
parser = NciThesaurusTools()
g = parser._parse_owl(inputfile="./input_data/Thesaurus.owl")
len(g)

Begin to parse OWL inputfile...
Successfully completed parsing OWL in: 1138.9695100784302 


8575158

In [None]:
# %%time
# # Output a turtle serialization of the live RDF graph instance to same directory as input OWL
# # This can take ~10 minutes
# g.serialize(
#     destination="./output_data/ncit_serialized.ttl",
#     format="ttl",
#     encoding='utf-8'
# )

In [None]:
# %%time
# # Serialization to JSON-LD is supported & can be done as follows:
# g.serialize(destination='./output_data/ncit_serialized_jsonld.jsonld',
#             format='json-ld',
#             indent=2)

In [250]:
from pprint import pprint
import pandas as pd

pd.set_option("display.max_rows", 999)
pd.set_option("display.max_colwidth", None)

In [251]:
# Namespaces & Prefixes
df = pd.DataFrame(
    data=g.namespace_manager.namespaces(), columns=["prefix", "namespace"]
).sort_values(by=["prefix"])
df

# for namespace in g.namespaces():
# print(namespace)

Unnamed: 0,prefix,namespace
0,brick,https://brickschema.org/schema/Brick#
1,csvw,http://www.w3.org/ns/csvw#
2,dc,http://purl.org/dc/elements/1.1/
6,dcam,http://purl.org/dc/dcam/
3,dcat,http://www.w3.org/ns/dcat#
4,dcmitype,http://purl.org/dc/dcmitype/
5,dcterms,http://purl.org/dc/terms/
7,doap,http://usefulinc.com/ns/doap#
8,foaf,http://xmlns.com/foaf/0.1/
27,ncit,http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#


In [13]:
from pprint import pprint

test_sparql = """SELECT DISTINCT ?s ?p ?o
                 WHERE { ?s rdf:type owl:Class .
                         ?s ?p ?o .
                     } LIMIT 20"""
result = g.query(test_sparql)
for x in result:
    pprint(x)
    df = pd.DataFrame(data=x, columns=["s", "p", "o"]).drop_duplicates()

df

(rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),)
(rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#subClassOf'),)
(rdflib.term.URIRef('http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#A8'),)
(rdflib.term.URIRef('http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#NHC0'),)
(rdflib.term.URIRef('http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#P106'),)
(rdflib.term.URIRef('http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#P108'),)
(rdflib.term.URIRef('http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#P207'),)
(rdflib.term.URIRef('http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#P319'),)
(rdflib.term.URIRef('http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#P322'),)
(rdflib.term.URIRef('http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#P366'),)
(rdflib.term.URIRef('http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#P90'),)
(rdflib.term.URIRef('http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#P97'),)
(rdflib.term.URIRef('http://www.w3.org/2000/01/r

In [26]:
# Build a function to print x amount of triples (limit(x) = 5)
def printtriples(graph, limit):
    n = 0
    for trip in graph:
        pprint(trip)
        print("\r")
        n = n + 1
        if n >= limit:
            break
    return trip


printtriples(graph=g, limit=5)

(rdflib.term.BNode('Nc560406bafd2479386840c749f6b2dda'),
 rdflib.term.URIRef('http://www.w3.org/2002/07/owl#annotatedTarget'),
 rdflib.term.Literal('EC 2.7.11.-'))

(rdflib.term.BNode('N18d6ae2054494ca79f66b6634ea43342'),
 rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
 rdflib.term.URIRef('http://www.w3.org/2002/07/owl#Axiom'))

(rdflib.term.URIRef('http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C103358'),
 rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#subClassOf'),
 rdflib.term.URIRef('http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C81971'))

(rdflib.term.URIRef('http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C34062'),
 rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#subClassOf'),
 rdflib.term.URIRef('http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C13236'))

(rdflib.term.BNode('N22ad5791fd1e42a98f3f8aa5c67330ff'),
 rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
 rdflib.term.URIRef('http://www.w3.org/20

(rdflib.term.BNode('N22ad5791fd1e42a98f3f8aa5c67330ff'),
 rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
 rdflib.term.URIRef('http://www.w3.org/2002/07/owl#Axiom'))

In [42]:
def printtriples(graph, limit):
    n = 0
    for subj, pred, obj in graph:
        pprint(subj)
        pprint(pred)
        pprint(obj)
        print("\n\n")
        if limit > 0:
            n = n + 1
            if n == limit:
                break
    return subj, pred, obj


printtriples(graph=g, limit=3)

rdflib.term.BNode('Nc560406bafd2479386840c749f6b2dda')
rdflib.term.URIRef('http://www.w3.org/2002/07/owl#annotatedTarget')
rdflib.term.Literal('EC 2.7.11.-')



rdflib.term.BNode('N18d6ae2054494ca79f66b6634ea43342')
rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type')
rdflib.term.URIRef('http://www.w3.org/2002/07/owl#Axiom')



rdflib.term.URIRef('http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C103358')
rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#subClassOf')
rdflib.term.URIRef('http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C81971')





(rdflib.term.URIRef('http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C103358'),
 rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#subClassOf'),
 rdflib.term.URIRef('http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C81971'))

In [178]:
# SPARQL query to collect all predicates within graph
predicates = """SELECT DISTINCT ?p
                WHERE {
                    ?s rdf:type owl:Class .
                    ?s ?p ?o .
                        } LIMIT 100"""

# Run the query predicates, and save the results in variable result
result = g.query(predicates)

# Print all results
for row in result:
    # pprint(row[0])
    df = pd.DataFrame(data=result, columns=["p"]).drop_duplicates()
df.sort_values("p")

Unnamed: 0,p
56,http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#A1
64,http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#A12
45,http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#A13
92,http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#A15
98,http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#A16
33,http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#A17
34,http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#A18
94,http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#A19
57,http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#A2
95,http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#A20


In [176]:
# Find p & o where conceptUri = 'C62554' -> Parp Inhibitor

conceptUri = "ncit:C62554"

parp = f"""SELECT DISTINCT ?p ?o
          WHERE {{
              {conceptUri} rdf:type owl:Class .
              {conceptUri} ?p ?o .
        }} LIMIT 150"""

# Run the query q1, and save the results in variable r1
result = g.query(parp)

# Print the results
for x in result:
    # print(x[0], "\n")
    df = pd.DataFrame(data=result, columns=["p", "o"]).drop_duplicates()
df.head(20)

Unnamed: 0,p,o
0,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://www.w3.org/2002/07/owl#Class
1,http://www.w3.org/2000/01/rdf-schema#subClassOf,http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C163758
2,http://www.w3.org/2000/01/rdf-schema#subClassOf,http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C471
3,http://www.w3.org/2000/01/rdf-schema#subClassOf,N55060f8dafe947aabfb0b0a8ed3dbdc4
4,http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#A8,http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C116977
5,http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#A8,http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C116978
6,http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#NHC0,C62554
7,http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#P106,Chemical Viewed Functionally
8,http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#P107,PARP Inhibitor
9,http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#P108,Poly (ADP-Ribose) Polymerase Inhibitor
