In [3]:
import time
import rdflib


class NciThesaurusTools(object):
    """A class to provide a toolkit for working with OWL, OWL2 & OBO files"""

    @staticmethod
    def _parse_owl(inputfile: str) -> "rdflib.Graph":
        """
        Summary:
        --------
        Parse an RDF/XML-serialized OWL file into an in-memory rdflib graph.
        The 'ncit' prefix is bound on the graph before parsing so that later
        serializations use 'ncit:' instead of the bare default prefix ':'.
        Parameters:
        -----------
        inputfile : str.
            Path to the NCI Thesaurus OWL file that is to be parsed.
        Returns:
        --------
        graph : rdflib.Graph.
            A live RDF graph instance of the NCI Thesaurus's release.
        Raises:
        -------
        Any exception raised while creating or parsing the graph (for example
        FileNotFoundError for a missing file) propagates unchanged, keeping
        its original type and traceback.
        """
        # Parsing the full Thesaurus is slow (roughly 700-1000 seconds), so
        # the elapsed time is measured with a monotonic clock and reported.
        start_time = time.perf_counter()
        graph = rdflib.Graph()
        print("Begin to parse OWL inputfile...")
        # Need to bind prefix --> if not, result serialization will result in a prefix of just ':'
        graph.namespace_manager.bind(
            'ncit', 'http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#')
        # parse() fills the graph in place; its return value is not needed.
        graph.parse(inputfile, format='xml')
        elapsed = time.perf_counter() - start_time
        print("Successfully completed parsing OWL in: {} ".format(elapsed))

        return graph


In [4]:
# Parse the RDF/XML OWL file (Thesaurus.owl) into an in-memory rdflib graph.
# This is slow: the run recorded below took ~970 seconds (about 16 minutes).
# Store live RDF graph instance as 'g'; the cells that follow all query or serialize it.
parser = NciThesaurusTools()
g = parser._parse_owl(inputfile='./input_data/Thesaurus.owl')
# Last expression of the cell, so the size of the graph (its triple count) is displayed
len(g)

Begin to parse OWL inputfile...
Successfully completed parsing OWL in: 968.8714940547943 


8325062

In [None]:
%%time
# Output a turtle serialization of the live RDF graph instance to ./output_data/
# (note: this is a different directory from the input OWL, which is read from ./input_data/)
# NOTE(review): presumably ./output_data/ must already exist -- confirm before running
# This can take ~10 minutes
g.serialize(destination='./output_data/ncit_serialized.ttl', format='turtle')

In [None]:
# # Serialization to JSON-LD is supported & can be done as follows:
# g.serialize(destination='./output_data/ncit_serialized_jsonld.jsonld', 
#             format='json-ld', 
#             indent=2)

In [2]:
# # What are the prefixes & namespaces in the graph?
# for namespace in g.namespaces():
#     print(namespace)

In [None]:
# Smoke-test query: ask for distinct predicates matched by any triple pattern,
# capped at 15 rows by the LIMIT clause.
test_sparql = '''SELECT DISTINCT ?p
                 WHERE {
                         ?s ?p ?o .
                     } LIMIT 15'''
result = g.query(test_sparql)
# Each x is one result row; the extra '\n' argument leaves blank space between rows.
for x in result:
    print(x, '\n')

In [None]:
# Build a function to print out the first several triples
def printtriples(graph, limit):
    """Print triples from ``graph``, each followed by a blank line.

    Parameters
    ----------
    graph : iterable
        Any iterable of triples (an rdflib graph in this notebook).
    limit : int
        Maximum number of triples to print. A non-positive (or None) limit
        disables the cap and prints every triple, which is the same
        convention used by the other ``printtriples`` definition in this
        notebook.

    Returns
    -------
    None
    """
    # Decide once whether a cap applies; previously a limit of 0 still
    # printed one triple because the check only ran after printing.
    unlimited = not limit or limit <= 0
    for count, trip in enumerate(graph, start=1):
        print(trip)
        print('')
        if not unlimited and count >= limit:
            break


# Preview the parsed graph: print its first 50 triples
printtriples(g, 50)

In [None]:
def printtriples(graph, limit):
    """Print every triple as three lines (subject, predicate, object) plus a blank line.

    Parameters
    ----------
    graph : iterable
        Iterable yielding 3-item triples (an rdflib graph in this notebook).
    limit : int
        When positive, printing stops once exactly ``limit`` triples have been
        shown. When zero or negative, no cap applies and all triples print.

    Note: this definition reuses the name of an earlier ``printtriples`` in the
    notebook and replaces it once this cell runs.
    """
    for shown, (subj, pred, obj) in enumerate(graph, start=1):
        # One term per line, then an empty string to emit the blank separator line.
        print(subj, pred, obj, '', sep='\n')
        if limit > 0 and shown == limit:
            break


# Print the first 10 triples of the graph, one term per line
printtriples(g, 10)

In [None]:
# SPARQL query to collect all distinct predicates within graph
# (the pattern has no constraints, so it matches every triple)
predicates = '''SELECT DISTINCT ?predicate
                WHERE { 
                        ?subject
                        ?predicate
                        ?object
                        }'''

# Run the query predicates, and save the results in variable result
result = g.query(predicates)

# Print all results; row[0] is the ?predicate binding, the only projected variable
for row in result:
    print(row[0])
    print()

In [None]:
# Find all predicates associated with 'C62554' -> Parp Inhibitor.
# (Replace the subject with ncit:C71721 to run the same lookup for Olaparib.)
# The prefix is declared inside the query so it does not depend on what is
# bound on the graph. The concept is addressed in the Thesaurus.owl# namespace,
# the same one the parser binds to 'ncit'; the 'obo:NCIT_...' form uses a
# prefix that is not bound anywhere in this notebook.
parp = '''PREFIX ncit: <http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#>
          SELECT DISTINCT ?predicate
          WHERE {
            ncit:C62554 ?predicate ?object .
          }'''

# Run the query parp, and save the results in variable result
result = g.query(parp)

# Print the results; x[0] is the ?predicate binding of each row
for x in result:
    print(x[0], '\n')
    print()
