## NaPDI machine reading knowledge graph instance closure

Closure run for SemRep and REACH predications in merged machine reading graph.

In [1]:
## Set up the CLIPS environment
import clips
env = clips.Environment()

MIN_PREDICATION_BELIEF = 0
MIN_TRANSITIVE_BELIEF = 0.65  # chosen because it retains most depth 1 transitive inferences over semmed  

## NOTE: BE SURE TO CLEAR test-inference.ntriples EACH TIME BEFORE RUNNING!!
## This accomplishes that
f = open("closure_output/test-inference-2.ntriples",'w')
f.close()

In [2]:
import pandas as pd
import numpy as np

In [3]:
#load merged machine reading graph - ntriples/gpickle - already mapped and processed - LOAD TSVs instead
##load TSVs after mapping for all machine reading output
mr_reach_gt = 'predication_files/greentea_pmid_all_predicates_umls_processed.tsv'
mr_reach_kt = 'predication_files/kratom_pmid_all_predicates_umls_processed.tsv'
mr_semrep = 'semrep_data/semrep_predications_mapped_only_20220406.csv'

In [4]:
dfgt = pd.read_csv(mr_reach_gt, sep='\t')
dfgt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7678 entries, 0 to 7677
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   subject_cui           7678 non-null   object 
 1   subject_name          7678 non-null   object 
 2   subject_source        7678 non-null   object 
 3   predicate             7678 non-null   object 
 4   object_source         7678 non-null   object 
 5   object_cui            7678 non-null   object 
 6   object_name           7678 non-null   object 
 7   subj_reach_grounding  7678 non-null   object 
 8   obj_reach_grounding   7678 non-null   object 
 9   pmid                  7678 non-null   int64  
 10  pub_year              7678 non-null   object 
 11  belief                7678 non-null   float64
 12  sentence              7678 non-null   object 
 13  predicate_obo         7678 non-null   object 
 14  subject_obo           7678 non-null   object 
 15  object_obo           

In [5]:
dfkt = pd.read_csv(mr_reach_kt, sep='\t')
dfkt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 368 entries, 0 to 367
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   subject_cui           356 non-null    object 
 1   subject_name          356 non-null    object 
 2   subject_source        368 non-null    object 
 3   predicate             368 non-null    object 
 4   object_source         368 non-null    object 
 5   object_cui            359 non-null    object 
 6   object_name           359 non-null    object 
 7   subj_reach_grounding  368 non-null    object 
 8   obj_reach_grounding   368 non-null    object 
 9   pmid                  368 non-null    int64  
 10  year                  368 non-null    object 
 11  belief                368 non-null    float64
 12  sentence              368 non-null    object 
 13  predicate_obo         368 non-null    object 
 14  subject_obo           368 non-null    object 
 15  object_obo            3

In [6]:
dfsem = pd.read_csv(mr_semrep)
dfsem.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13676 entries, 0 to 13675
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   pmid           13676 non-null  int64  
 1   subject_cui    13676 non-null  object 
 2   subject_type   13676 non-null  object 
 3   object_cui     13676 non-null  object 
 4   object_type    13676 non-null  object 
 5   year           13602 non-null  float64
 6   subject_name   13676 non-null  object 
 7   object_name    13676 non-null  object 
 8   source_text    13676 non-null  object 
 9   predicate      13676 non-null  object 
 10  predicate_obo  13676 non-null  object 
 11  subject_obo    13676 non-null  object 
 12  object_obo     13676 non-null  object 
dtypes: float64(1), int64(1), object(11)
memory usage: 1.4+ MB


In [7]:
dfgt = dfgt[['pmid', 'subject_cui', 'subject_name', 'object_cui', 'object_name', 'pub_year', 'predicate', 'sentence', 'predicate_obo', 'subject_obo', 'object_obo', 'belief']]

In [8]:
dfkt = dfkt[['pmid', 'subject_cui', 'subject_name', 'object_cui', 'object_name', 'year', 'predicate', 'sentence', 'predicate_obo', 'subject_obo', 'object_obo', 'belief']]

In [9]:
dfsem = dfsem[['pmid', 'subject_cui', 'subject_name', 'object_cui',  'object_name', 'year', 'predicate', 'source_text', 'predicate_obo', 'subject_obo', 'object_obo']]

In [10]:
##all dataframes should have the same columns
#add belief = 0.8 to semrep predications
dfsem['belief'] = 0.8

In [11]:
dfgt = dfgt.rename({'pub_year':'year'}, axis=1)
dfsem = dfsem.rename({'source_text':'sentence'}, axis=1)

In [12]:
df = pd.concat([dfgt, dfkt, dfsem])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21722 entries, 0 to 13675
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   pmid           21722 non-null  int64  
 1   subject_cui    21710 non-null  object 
 2   subject_name   21710 non-null  object 
 3   object_cui     21713 non-null  object 
 4   object_name    21713 non-null  object 
 5   year           21648 non-null  object 
 6   predicate      21722 non-null  object 
 7   sentence       21722 non-null  object 
 8   predicate_obo  21722 non-null  object 
 9   subject_obo    21722 non-null  object 
 10  object_obo     21722 non-null  object 
 11  belief         21722 non-null  float64
dtypes: float64(1), int64(1), object(10)
memory usage: 2.2+ MB


In [13]:
df = df.drop_duplicates()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21707 entries, 0 to 13675
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   pmid           21707 non-null  int64  
 1   subject_cui    21695 non-null  object 
 2   subject_name   21695 non-null  object 
 3   object_cui     21698 non-null  object 
 4   object_name    21698 non-null  object 
 5   year           21633 non-null  object 
 6   predicate      21707 non-null  object 
 7   sentence       21707 non-null  object 
 8   predicate_obo  21707 non-null  object 
 9   subject_obo    21707 non-null  object 
 10  object_obo     21707 non-null  object 
 11  belief         21707 non-null  float64
dtypes: float64(1), int64(1), object(10)
memory usage: 2.2+ MB


In [14]:
df = df.reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21707 entries, 0 to 21706
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   pmid           21707 non-null  int64  
 1   subject_cui    21695 non-null  object 
 2   subject_name   21695 non-null  object 
 3   object_cui     21698 non-null  object 
 4   object_name    21698 non-null  object 
 5   year           21633 non-null  object 
 6   predicate      21707 non-null  object 
 7   sentence       21707 non-null  object 
 8   predicate_obo  21707 non-null  object 
 9   subject_obo    21707 non-null  object 
 10  object_obo     21707 non-null  object 
 11  belief         21707 non-null  float64
dtypes: float64(1), int64(1), object(10)
memory usage: 2.0+ MB


In [15]:
df['predicate'].value_counts()

Activation           3624
interacts_with       3287
Inhibition           2892
inhibits             2014
coexists_with        1945
stimulates           1902
part_of              1846
treats                927
IncreaseAmount        723
DecreaseAmount        530
affects               447
augments              334
produces              314
disrupts              242
causes                190
Phosphorylation       158
associated_with       135
prevents               59
Dephosphorylation      47
predisposes            31
Methylation            16
Hydroxylation          12
Acetylation             8
Glycosylation           6
Dehydroxylation         6
Demethylation           5
Deacetylation           2
complicates             2
precedes                1
Deubiquitination        1
Ubiquitination          1
Name: predicate, dtype: int64

In [16]:
df['predicate_obo'].value_counts()

http://purl.obolibrary.org/obo/RO_0002449     4906
http://purl.obolibrary.org/obo/RO_0002448     3624
http://purl.obolibrary.org/obo/RO_0002434     3287
http://purl.obolibrary.org/obo/RO_0002490     1945
http://purl.obolibrary.org/obo/RO_0002213     1902
http://purl.obolibrary.org/obo/BFO_0000050    1846
http://purl.obolibrary.org/obo/RO_0002606      927
http://purl.obolibrary.org/obo/RO_0011009      723
http://purl.obolibrary.org/obo/RO_0011010      530
http://purl.obolibrary.org/obo/RO_0002596      447
http://purl.obolibrary.org/obo/RO_0002598      334
http://purl.obolibrary.org/obo/RO_0003000      314
http://purl.obolibrary.org/obo/RO_0002212      242
http://purl.obolibrary.org/obo/RO_0002566      190
http://purl.obolibrary.org/obo/RO_0002447      158
http://purl.obolibrary.org/obo/RO_0002610      135
http://purl.obolibrary.org/obo/RO_0002599       59
http://purl.obolibrary.org/obo/GO_0006470       47
http://purl.obolibrary.org/obo/RO_0003302       31
http://purl.obolibrary.org/obo/

### Rules

In [17]:
## Example
##Transitive predicates - part of, precedes, stimulates/positively regulates
##Symmetric predicates - interacts_with, moleculary_interacts_with
env.clear()
env.reset()

env.eval('(open "closure_output/test-inference-2.ntriples" writeFile "a")')

env.build("""
(deftemplate oav
 (slot object)
 (slot attribute)
 (slot value)
 (slot predNS)
 (slot inferred (default No))
 (slot belief (default 0.0)))
""")

## Transitive rule 
env.build("""
(defrule transitive
  "a simple transitivity rule"
  (oav (object ?o)
       (attribute ?pred&:(member$ ?pred (create$ http://purl.obolibrary.org/obo/BFO_0000063 http://purl.obolibrary.org/obo/BFO_0000050 http://purl.obolibrary.org/obo/RO_0002213)))
       (value ?s)
       (predNS RO)
       (inferred No)
       (belief ?b1))
  (oav (object ?s)
       (attribute ?pred)
       (value ?q)
       (predNS RO)
       (inferred No)
       (belief ?b2))
   (test (>= (* ?b1 ?b2) {}))
  =>
  (assert (oav (object ?o)
               (attribute ?pred)
               (value ?q)
               (inferred Yes)
               (predNS RO)
               (belief (* ?b1 ?b2))))
  
  (printout writeFile (format nil "<%s><%s><%s>.%n" ?o ?pred ?q))   
)
""".format(MIN_TRANSITIVE_BELIEF))
# NOTE: add this line to RHS to see the belief scores:
# (printout writeFile (format nil "b1: %f, b2: %f, belief: %f>.%n" ?b1 ?b2 (* ?b1 ?b2))) 

## simplerule for symmetric relationships 
env.build("""
(defrule symmetric
  "a simple symmetry rule"
  (oav (object ?o)
       (attribute ?pred&:(member$ ?pred (create$ http://purl.obolibrary.org/obo/RO_0002434 http://purl.obolibrary.org/obo/RO_0002436)))
       (value ?s)
       (predNS RO)
       (inferred No)
       (belief ?b))
  =>
  (assert (oav (object ?s)
               (attribute ?pred)
               (value ?o)
               (inferred Yes)
               (predNS RO)
               (belief ?b)))
  
  (printout writeFile (format nil "<%s><%s><%s>.%n" ?o ?pred ?s))  
)
""")


In [18]:
predMapD = {
    'regulateactivity':'RO_0011002',
    'regulateamount':'RO_0011003',
    'phosphorylation':'RO_0002447',
    'dephosphorylation':'GO_0006470',
    'ubiquitination':'RO_0002480',
    'deubiquitination':'GO_0016579',
    'sumoylation':'RO_0002436',
    'desumoylation':'RO_0002436',
    'hydroxylation':'GO_0018126',
    'dehydroxylation':'RO_0002436',
    'acetylation':'GO_0006473',
    'deacetylation':'GO_0006476',
    'glycosylation':'GO_0006486',
    'deglycosylation':'GO_0006517',
    'farnesylation':'RO_0002436',
    'defarnesylation':'RO_0002436',
    'geranylgeranylation':'RO_0002436',
    'degeranylgeranylation':'RO_0002436',
    'palmitoylation':'RO_0002436',
    'depalmitoylation':'RO_0002436',
    'myristoylation':'RO_0002436',
    'demyristoylation':'RO_0002436',
    'ribosylation':'RO_0002436',
    'deribosylation':'RO_0002436',
    'methylation':'GO_0006479',
    'demethylation':'GO_0006482',
    'activation':'RO_0002448',
    'inhibition':'RO_0002449',
    'increaseamount':'RO_0011009',
    'decreaseamount':'RO_0011010',
    'affects': 'RO_0002596',
    'associated_with': 'RO_0002610',
    'augments': 'RO_0002598',
    'causes': 'RO_0002566',
    'coexists_with': 'RO_0002490',
    'complicates': 'RO_0003309',
    'disrupts': 'RO_0002212',
    'inhibits': 'RO_0002449',
    'interacts_with': 'RO_0002434',
    'part_of': 'BFO_0000050',
    'precedes': 'BFO_0000063',
    'predisposes': 'RO_0003302',
    'prevents': 'RO_0002599',
    'produces': 'RO_0003000',
    'stimulates': 'RO_0002213',
    'treats': 'RO_0002606'
}


### Facts 

In [19]:
originalTriplesF = open('closure_output/original-triples-2.ntriples','w')

resourceD = {}
resourceDinv = {}
rcnt = 0
fctStrD = {}
#semTypeD = {}
labelsD = {}

for i in range(len(df.index)): 
   
    belief = df.at[i, 'belief']
    if belief < MIN_PREDICATION_BELIEF:
        continue
    
    (subj_obo, pred_obo, obj_obo) = (df.at[i, 'subject_obo'],
                                    df.at[i, 'predicate_obo'],
                                    df.at[i, 'object_obo'])
    (subj, pred, obj) = (df.at[i,'subject_name'],
                        df.at[i,'predicate'].lower().strip(),
                        df.at[i, 'object_name'])

    # only write out and/or do inference over some predicates
    if pred not in predMapD:        
        continue
        
    # write the original triple to file, regardless of the predicate
    originalTriplesF.write("<{}><{}><{}>.\n".format(subj_obo,pred_obo,obj_obo))
        
    # Track the subject and object names
    subjName = subj
    objName = obj
    
    
    '''# Track the semantic types
    semTypesStr = stmtSplt[SUBJECT_TYPE]
    if semTypesStr.find('[') == -1:
        if not semTypeD.get(subj):
            semTypeD[subj] = [semTypesStr.strip()]
        else:
            semTypeD[subj] = semTypeD[subj].append(semTypesStr.strip())
    else:
        semTypesStr = semTypesStr.replace("'",'').replace('[','').replace(']','')
        semTypesL = [x.strip() for x in semTypesStr.split(',')]
        if not semTypeD.get(subj):
            semTypeD[subj] = semTypesL
        else:
            semTypeD[subj] = semTypeD[subj] + semTypesL
            
    semTypesStr = stmtSplt[OBJECT_TYPE]
    if semTypesStr.find('[') == -1:
        if not semTypeD.get(obj):
            semTypeD[obj] = [semTypesStr.strip()]
        else:
            semTypeD[obj] = semTypeD[obj].append(semTypesStr.strip())
    else:
        semTypesStr = semTypesStr.replace("'",'').replace('[','').replace(']','')
        semTypesL = [x.strip() for x in semTypesStr.split(',')]
        if not semTypeD.get(obj):
            semTypeD[obj] = semTypesL
        else:
            semTypeD[obj] = semTypeD[obj] + semTypesL'''
        
    if not resourceD.get(subj_obo):
        resourceD[subj_obo] = 'r{}'.format(rcnt)
        resourceDinv['r{}'.format(rcnt)] = subj_obo
        rcnt += 1
    
    if not resourceD.get(obj_obo):
        resourceD[obj_obo] = 'r{}'.format(rcnt)
        resourceDinv['r{}'.format(rcnt)] = obj_obo
        rcnt += 1    
    
        
    fctStr = """
(oav (object {})
     (attribute {})
     (value {})
     (predNS {})
     (belief {})
)""".format(resourceD[subj_obo], pred_obo, resourceD[obj_obo], 'RO', belief)
    
    if not fctStrD.get(fctStr): 
        env.assert_string(fctStr)
        fctStrD[fctStr] = 1
    
    
    # write the human readable labels as triples
    if isinstance(subj,str):
        originalTriplesF.write('<{}><http://www.w3.org/2000/01/rdf-schema#label> "{}".\n'.format(subj_obo,subj.replace('"','')))
    if isinstance(obj,str):
        originalTriplesF.write('<{}><http://www.w3.org/2000/01/rdf-schema#label> "{}".\n'.format(obj_obo,obj.replace('"','')))

'''# Write the semantic types as triples
for (e,st) in semTypeD.items():
    if st:
        stSet = set(st)
        for elt in stSet:
            originalTriplesF.write("<http://dikb.org/ad#{}><http://www.w3.org/1999/02/22-rdf-syntax-ns#type><http://umls.org/st/#{}>.\n".format(e,elt.replace('"','')))
'''
originalTriplesF.close()

In [20]:
i = 0
for fact in env.facts():
    print(fact)
    if i == 20:
        break
    i += 1

(oav (object r0) (attribute http://purl.obolibrary.org/obo/GO_0006473) (value r1) (predNS RO) (inferred No) (belief 0.65))
(oav (object r2) (attribute http://purl.obolibrary.org/obo/GO_0006473) (value r3) (predNS RO) (inferred No) (belief 0.65))
(oav (object r4) (attribute http://purl.obolibrary.org/obo/GO_0006473) (value r1) (predNS RO) (inferred No) (belief 0.65))
(oav (object r5) (attribute http://purl.obolibrary.org/obo/GO_0006473) (value r6) (predNS RO) (inferred No) (belief 0.65))
(oav (object r7) (attribute http://purl.obolibrary.org/obo/GO_0006473) (value r8) (predNS RO) (inferred No) (belief 0.86))
(oav (object r6) (attribute http://purl.obolibrary.org/obo/GO_0006473) (value r8) (predNS RO) (inferred No) (belief 0.65))
(oav (object r9) (attribute http://purl.obolibrary.org/obo/GO_0006473) (value r8) (predNS RO) (inferred No) (belief 0.86))
(oav (object r10) (attribute http://purl.obolibrary.org/obo/GO_0006473) (value r1) (predNS RO) (inferred No) (belief 0.65))
(oav (object r1

In [21]:
env.run()
# count of inferences by cutoff for transitive belief score: 0.65 = 2241, 0 = 14539

2421

The output tells use how many RHS made changes to working memory

In [38]:
for k in resourceD:
    print(k,resourceD[k])
    break

http://purl.obolibrary.org/obo/SO_0000704 r0


In [39]:
for k in resourceDinv:
    print(k,resourceDinv[k])
    break

r0 http://purl.obolibrary.org/obo/SO_0000704


In [22]:
import re

f = open('closure_output/test-inference-2.ntriples','r')
buf = f.read()
f.close()
rsL = buf.split('\n')

'''rgx = re.compile('(http://purl.obolibrary.org/obo/)([A-Z_0-9]+)')
for i in range(0,len(rsL)):
    if rsL[i] == "":
        continue
        
    ml = rgx.findall(rsL[i])
    if len(ml) != 1:
        print('ERROR: could not match on predicate regex: {}'.format(rsL[i]))
        continue
        
    (uri,predicate) = ml[0]
    rsL[i] = rsL[i].replace(predicate, predMapD[predicate])'''
    

f = open('closure_output/inferred-transitive-and-symmetric-2.ntriples','w')
rgx = re.compile('(r[0-9]+)')
for it in rsL:
    keyL = rgx.findall(it)
    newTr = it
    for k in keyL:
        if resourceDinv.get(k):
            newTr = newTr.replace(k, resourceDinv[k])            
        else:
            print('ERROR: key not found in resourceDinv: {}'.format(k))
    f.write(newTr + '\n')
f.close()


### Save as gpickle file with metadata

In [23]:
import pandas as pd
import numpy as np
import pickle
from rdflib.namespace import OWL, RDF, RDFS
import os
import pickle, json

#Create networkx graph from triples
import glob
import hashlib
import json
import networkx as nx  # type: ignore
import os
import os.path

from collections import Counter  # type: ignore
from more_itertools import unique_everseen  # type: ignore
from rdflib import BNode, Graph, Literal, Namespace, URIRef  # type: ignore
from rdflib.namespace import OWL, RDF, RDFS  # type: ignore
from rdflib.plugins.serializers.nt import _quoteLiteral  # type: ignore
import subprocess

from tqdm import tqdm  # type: ignore
from typing import Dict, List, Optional, Set, Tuple, Union

In [24]:
with open('closure_output/inferred-transitive-and-symmetric-2.ntriples','r') as file1:
    g = file1.read()
graph1 = g.split('\n')

In [25]:
obo = Namespace('http://purl.obolibrary.org/obo/')
oboinowl = Namespace('http://www.geneontology.org/formats/oboInOwl#')
schema = Namespace('http://www.w3.org/2001/XMLSchema#')
napdi = Namespace('http://napdi.org/napdi-srs-imports:')

In [26]:
def n3(node: Union[URIRef, BNode, Literal]) -> str:
    """Method takes an RDFLib node of type BNode, URIRef, or Literal and serializes it to meet the RDF 1.1 NTriples
    format.
    Src: https://github.com/RDFLib/rdflib/blob/c11f7b503b50b7c3cdeec0f36261fa09b0615380/rdflib/plugins/serializers/nt.py
    Args:
        node: An RDFLib
    Returns:
        serialized_node: A string containing the serialized
    """
    if isinstance(node, Literal): serialized_node = "%s" % _quoteLiteral(node)
    else: serialized_node = "%s" % node.n3()
    return serialized_node

In [27]:
graph1[0]

'<http://purl.obolibrary.org/obo/NCBITaxon_1224><http://purl.obolibrary.org/obo/RO_0002434><http://purl.obolibrary.org/obo/NCBITaxon_2>.'

In [28]:
import re

In [25]:
x = '<http://napdi.org/napdi_srs_imports:epigallocatechin_gallate><http://purl.obolibrary.org/obo/RO_0002213><http://purl.obolibrary.org/obo/CHEBI_27843>.'
rgx = re.compile('(http://[a-zA-Z0-9/._:]+)')
rgx.findall(x)

['http://napdi.org/napdi_srs_imports:epigallocatechin_gallate',
 'http://purl.obolibrary.org/obo/RO_0002213',
 'http://purl.obolibrary.org/obo/CHEBI_27843']

In [29]:
#convert rdflib graph to multidigraph - code borrowed from PheKnowLator: kg_utils.py
#use the pred key to also create a dictionary with metadata about the edge - add INF tag for inferred triples
errors = []
nx_mdg = nx.MultiDiGraph()
for triple in graph1:
    #do not save label predicate to gpickle

    rgx = re.compile('(http://[a-zA-Z0-9/._:]+)')
    match = rgx.findall(triple)
    if len(match)!=3:
        errors.append(triple)
        continue
    subj = match[0]
    obj = match[2]
    pred = match[1]
    s = URIRef(subj)
    p = URIRef(pred)
    o = URIRef(obj)

    pred_key = hashlib.md5('{}{}{}'.format(n3(s), n3(p), n3(o)).encode()).hexdigest()
    nx_mdg.add_node(s, key=n3(s))
    nx_mdg.add_node(o, key=n3(o))
    nx_mdg.add_edge(s, o, **{'key': p, 'predicate_key': pred_key, 'weight':0.0,
                             'source_graph': 'machine_read_INF'})

nx.write_gpickle(nx_mdg, "closure_output/machineread_inferred_symmetric_transitive-2.gpickle")

In [30]:
#this should have less edges than rdflib graph after removing 'labels'
nodes = nx.number_of_nodes(nx_mdg)
edges = nx.number_of_edges(nx_mdg)
density = nx.density(nx_mdg)
avg_deg = float(edges)/nodes
print(nodes, edges, density, avg_deg)

1247 2400 0.0015446381105986631 1.9246190858059342


In [31]:
len(errors)

2

In [32]:
errors

['<http://purl.obolibrary.org/obo/CHEBI_166890><http://purl.obolibrary.org/obo/RO_',
 '']

In [36]:
##get stats of inferred triples
with open('closure_output/inferred-transitive-and-symmetric.ntriples') as filei:
    g = filei.read()
graph1 = g.split('\n')
len(graph1)

14513

In [37]:
graph1[0]

'<http://purl.obolibrary.org/obo/NCBITaxon_1224><http://purl.obolibrary.org/obo/RO_0002434><http://purl.obolibrary.org/obo/NCBITaxon_2>.'

In [38]:
s = []
p = []
o = []
for triple in graph1:
    rgx = re.compile('(http://[a-zA-Z0-9/._:]+)')
    match = rgx.findall(triple)
    if len(match)!=3:
        errors.append(triple)
        continue
    subj = match[0]
    obj = match[2]
    pred = match[1]
    s.append(subj)
    p.append(pred)
    o.append(obj)

In [39]:
dfinf = pd.DataFrame()
dfinf['subject_obo'] = s
dfinf['pred_obo'] = p
dfinf['object_obo'] = o
dfinf.head()

Unnamed: 0,subject_obo,pred_obo,object_obo
0,http://purl.obolibrary.org/obo/NCBITaxon_1224,http://purl.obolibrary.org/obo/RO_0002434,http://purl.obolibrary.org/obo/NCBITaxon_2
1,http://purl.obolibrary.org/obo/UBERON_0013756,http://purl.obolibrary.org/obo/BFO_0000050,http://purl.obolibrary.org/obo/UBERON_0007023
2,http://purl.obolibrary.org/obo/UBERON_0013756,http://purl.obolibrary.org/obo/BFO_0000050,http://purl.obolibrary.org/obo/NCBITaxon_9606
3,http://purl.obolibrary.org/obo/UBERON_0013756,http://purl.obolibrary.org/obo/BFO_0000050,http://purl.obolibrary.org/obo/CLO_0037339
4,http://purl.obolibrary.org/obo/UBERON_0013756,http://purl.obolibrary.org/obo/BFO_0000050,http://purl.obolibrary.org/obo/NCBITaxon_10088


In [40]:
dfinf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14511 entries, 0 to 14510
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   subject_obo  14511 non-null  object
 1   pred_obo     14511 non-null  object
 2   object_obo   14511 non-null  object
dtypes: object(3)
memory usage: 340.2+ KB


In [41]:
dfinf['pred_obo'].value_counts()

http://purl.obolibrary.org/obo/RO_0002213     10627
http://purl.obolibrary.org/obo/RO_0002434      2398
http://purl.obolibrary.org/obo/BFO_0000050     1486
Name: pred_obo, dtype: int64

In [None]:
10627 - positively regulates
2398 - interacts with
1486 - part of