## NaPDI machine reading knowledge graph instance closure

Closure run for SemRep and REACH predications in merged machine reading graph.

In [None]:
## Set up the CLIPS environment
import os

import clips

env = clips.Environment()

# Belief cutoffs. A predication whose belief is below MIN_PREDICATION_BELIEF is
# not asserted as a fact; a transitive inference is kept only when the product
# of its two premise beliefs is at least MIN_TRANSITIVE_BELIEF.
MIN_PREDICATION_BELIEF = 0
MIN_TRANSITIVE_BELIEF = 0  # chosen because it retains most depth 1 transitive inferences over semmed

## NOTE: test-inference.ntriples MUST be empty before each run, because the
## CLIPS rules open it in append mode. Opening it with 'w' truncates it.
## The output directory is created first so a fresh checkout does not fail here.
os.makedirs("closure_output", exist_ok=True)
with open("closure_output/test-inference.ntriples", 'w'):
    pass

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Inputs: machine reading output that has already been mapped and processed.
# The TSV/CSV exports are loaded here instead of the merged ntriples/gpickle graph.
PREDICATION_DIR = '../resources/predication_files'

# REACH output (INDRA-processed): green tea and kratom
mr_reach_gt = PREDICATION_DIR + '/greentea_all_predicates_INDRA_processed.tsv'
mr_reach_kt = PREDICATION_DIR + '/kratom_all_predicates_INDRA_processed.tsv'
# SemRep output (mapped predications only)
mr_semrep = PREDICATION_DIR + '/semrep/semrep_predications_mapped_only.csv'

In [None]:
# Green tea REACH predications (tab-separated file)
dfgt = pd.read_csv(filepath_or_buffer=mr_reach_gt, delimiter='\t')
dfgt.info()

In [None]:
# Kratom REACH predications (tab-separated file)
dfkt = pd.read_csv(filepath_or_buffer=mr_reach_kt, delimiter='\t')
dfkt.info()

In [None]:
# SemRep predications (comma-separated file, the read_csv default)
dfsem = pd.read_csv(filepath_or_buffer=mr_semrep, sep=',')
dfsem.info()

In [None]:
# Columns kept from the green tea frame; here the publication year column is
# still named 'pub_year'.
reach_gt_columns = [
    'pmid',
    'subject_cui', 'subject_name',
    'object_cui', 'object_name',
    'pub_year',
    'predicate', 'sentence',
    'predicate_obo', 'subject_obo', 'object_obo',
    'belief',
]
dfgt = dfgt[reach_gt_columns]

In [None]:
# Columns kept from the kratom frame.
reach_kt_columns = [
    'pmid',
    'subject_cui', 'subject_name',
    'object_cui', 'object_name',
    'year',
    'predicate', 'sentence',
    'predicate_obo', 'subject_obo', 'object_obo',
    'belief',
]
dfkt = dfkt[reach_kt_columns]

In [None]:
# Columns kept from the SemRep frame; the sentence text is in 'source_text'
# and no 'belief' column is selected.
semrep_columns = [
    'pmid',
    'subject_cui', 'subject_name',
    'object_cui', 'object_name',
    'year',
    'predicate', 'source_text',
    'predicate_obo', 'subject_obo', 'object_obo',
]
dfsem = dfsem[semrep_columns]

In [None]:
##all dataframes should have the same columns
# Give every SemRep predication a fixed belief of 0.8.
# .assign builds a new frame; setting the column in place would write into a
# column subset of the loaded frame and can raise SettingWithCopyWarning.
dfsem = dfsem.assign(belief=0.8)

In [None]:
# Align the column names of the three frames before concatenating them.
dfgt = dfgt.rename(columns={'pub_year': 'year'})
dfsem = dfsem.rename(columns={'source_text': 'sentence'})

In [None]:
# Stack green tea, kratom and SemRep predications into one frame (row-wise).
df = pd.concat(objs=[dfgt, dfkt, dfsem], axis=0)
df.info()

In [None]:
# Keep the first occurrence of each fully identical row.
is_repeat = df.duplicated(keep='first')
df = df[~is_repeat]
df.info()

In [None]:
# Replace the index with a fresh 0..n-1 range and discard the old labels; the
# fact-building loop further down addresses rows as df.at[i, ...] for i in range(len(df.index)).
df = df.reset_index(drop=True)
df.info()

### Rules

In [None]:
## Example
##Transitive predicates - part of, precedes, stimulates/positively regulates
##Symmetric predicates - interacts_with, moleculary_interacts_with

# Start from an empty CLIPS environment so constructs and facts are not carried over.
env.clear()
env.reset()

# Open the inference output in append mode under the CLIPS logical name
# writeFile; the rules print each inferred triple to that name.
env.eval('(open "closure_output/test-inference.ntriples" writeFile "a")')

# Object-attribute-value fact template. `inferred` defaults to No and `belief`
# to 0.0 when a fact does not set them.
OAV_TEMPLATE = """
(deftemplate oav
 (slot object)
 (slot attribute)
 (slot value)
 (slot predNS)
 (slot inferred (default No))
 (slot belief (default 0.0)))
"""
env.build(OAV_TEMPLATE)

## Transitive rule
# Matches two non-inferred RO facts A -pred-> B and B -pred-> C that share one of
# the three listed predicate URIs, and fires only when the product of their
# beliefs is >= the threshold substituted for the {} placeholder. It asserts
# A -pred-> C as an inferred fact carrying that product as belief and prints
# the triple to writeFile.
TRANSITIVE_RULE_TEMPLATE = """
(defrule transitive
  "a simple transitivity rule"
  (oav (object ?o)
       (attribute ?pred&:(member$ ?pred (create$ http://purl.obolibrary.org/obo/BFO_0000063 http://purl.obolibrary.org/obo/BFO_0000050 http://purl.obolibrary.org/obo/RO_0002213)))
       (value ?s)
       (predNS RO)
       (inferred No)
       (belief ?b1))
  (oav (object ?s)
       (attribute ?pred)
       (value ?q)
       (predNS RO)
       (inferred No)
       (belief ?b2))
   (test (>= (* ?b1 ?b2) {}))
  =>
  (assert (oav (object ?o)
               (attribute ?pred)
               (value ?q)
               (inferred Yes)
               (predNS RO)
               (belief (* ?b1 ?b2))))
  
  (printout writeFile (format nil "<%s><%s><%s>.%n" ?o ?pred ?q))   
)
"""
env.build(TRANSITIVE_RULE_TEMPLATE.format(MIN_TRANSITIVE_BELIEF))
# NOTE: add this line to RHS to see the belief scores:
# (printout writeFile (format nil "b1: %f, b2: %f, belief: %f>.%n" ?b1 ?b2 (* ?b1 ?b2))) 

## simple rule for symmetric relationships
# For every non-inferred RO fact O -pred-> S whose predicate is one of the two
# listed URIs, assert the reversed fact S -pred-> O as inferred with the same
# belief and print that reversed triple to writeFile.
# The printed triple must be the inferred one (?s ?pred ?o); printing
# ?o ?pred ?s would only repeat the original statement in the inference file.
env.build("""
(defrule symmetric
  "a simple symmetry rule"
  (oav (object ?o)
       (attribute ?pred&:(member$ ?pred (create$ http://purl.obolibrary.org/obo/RO_0002434 http://purl.obolibrary.org/obo/RO_0002436)))
       (value ?s)
       (predNS RO)
       (inferred No)
       (belief ?b))
  =>
  (assert (oav (object ?s)
               (attribute ?pred)
               (value ?o)
               (inferred Yes)
               (predNS RO)
               (belief ?b)))

  (printout writeFile (format nil "<%s><%s><%s>.%n" ?s ?pred ?o))
)
""")


In [None]:
# Lower-cased extraction predicate -> OBO identifier. The fact-building loop
# only keeps predications whose predicate is a key of this table.

# Modification / regulation statement types
# (NOTE(review): these look like the REACH/INDRA statement names - confirm).
_statement_type_ids = {
    'regulateactivity': 'RO_0011002',
    'regulateamount': 'RO_0011003',
    'phosphorylation': 'RO_0002447',
    'dephosphorylation': 'GO_0006470',
    'ubiquitination': 'RO_0002480',
    'deubiquitination': 'GO_0016579',
    'sumoylation': 'RO_0002436',
    'desumoylation': 'RO_0002436',
    'hydroxylation': 'GO_0018126',
    'dehydroxylation': 'RO_0002436',
    'acetylation': 'GO_0006473',
    'deacetylation': 'GO_0006476',
    'glycosylation': 'GO_0006486',
    'deglycosylation': 'GO_0006517',
    'farnesylation': 'RO_0002436',
    'defarnesylation': 'RO_0002436',
    'geranylgeranylation': 'RO_0002436',
    'degeranylgeranylation': 'RO_0002436',
    'palmitoylation': 'RO_0002436',
    'depalmitoylation': 'RO_0002436',
    'myristoylation': 'RO_0002436',
    'demyristoylation': 'RO_0002436',
    'ribosylation': 'RO_0002436',
    'deribosylation': 'RO_0002436',
    'methylation': 'GO_0006479',
    'demethylation': 'GO_0006482',
    'activation': 'RO_0002448',
    'inhibition': 'RO_0002449',
    'increaseamount': 'RO_0011009',
    'decreaseamount': 'RO_0011010',
}

# Relation-style predicates
# (NOTE(review): these look like the SemRep predicate names - confirm).
_relation_ids = {
    'affects': 'RO_0002596',
    'associated_with': 'RO_0002610',
    'augments': 'RO_0002598',
    'causes': 'RO_0002566',
    'coexists_with': 'RO_0002490',
    'complicates': 'RO_0003309',
    'disrupts': 'RO_0002212',
    'inhibits': 'RO_0002449',
    'interacts_with': 'RO_0002434',
    'part_of': 'BFO_0000050',
    'precedes': 'BFO_0000063',
    'predisposes': 'RO_0003302',
    'prevents': 'RO_0002599',
    'produces': 'RO_0003000',
    'stimulates': 'RO_0002213',
    'treats': 'RO_0002606',
}

# Merge in the original key order (statement types first, then relations).
predMapD = {**_statement_type_ids, **_relation_ids}


### Facts 

In [None]:
# Build the CLIPS fact base from the merged predications and, in the same pass,
# write the original (non-inferred) triples plus entity labels to an ntriples file.
resourceD = {}      # OBO URI -> short id ('r0', 'r1', ...) used as object/value in CLIPS facts
resourceDinv = {}   # short id -> OBO URI, used after inference to expand the ids again
rcnt = 0            # number of ids handed out so far
fctStrD = {}        # fact strings already asserted, so each distinct fact is asserted once
#semTypeD = {}
labelsD = {}
skipped_missing = 0  # rows dropped because a required value is missing

# The context manager closes the file even if a row raises part-way through.
with open('closure_output/original-triples.ntriples', 'w') as originalTriplesF:
    for i in range(len(df.index)):

        belief = df.at[i, 'belief']
        (subj_obo, pred_obo, obj_obo) = (df.at[i, 'subject_obo'],
                                        df.at[i, 'predicate_obo'],
                                        df.at[i, 'object_obo'])
        (subj, pred, obj) = (df.at[i, 'subject_name'],
                            df.at[i, 'predicate'],
                            df.at[i, 'object_name'])

        # A missing predicate, URI or belief arrives as NaN. It would either
        # crash on .lower() or end up as '<nan>' triples and 'nan' CLIPS
        # symbols, so such rows are skipped and counted instead.
        if pd.isna(belief) or not all(isinstance(v, str)
                                      for v in (pred, subj_obo, pred_obo, obj_obo)):
            skipped_missing += 1
            continue

        if belief < MIN_PREDICATION_BELIEF:
            continue

        # only write out and/or do inference over some predicates
        pred = pred.lower().strip()
        if pred not in predMapD:
            continue

        # write the original triple to file (only predicates in predMapD get this far)
        originalTriplesF.write("<{}><{}><{}>.\n".format(subj_obo, pred_obo, obj_obo))

        # Give subject and object a short id the first time each URI is seen
        # (subject first, so numbering follows row order).
        for uri in (subj_obo, obj_obo):
            if uri not in resourceD:
                rid = 'r{}'.format(rcnt)
                resourceD[uri] = rid
                resourceDinv[rid] = uri
                rcnt += 1

        fctStr = """
(oav (object {})
     (attribute {})
     (value {})
     (predNS {})
     (belief {})
)""".format(resourceD[subj_obo], pred_obo, resourceD[obj_obo], 'RO', belief)

        if fctStr not in fctStrD:
            env.assert_string(fctStr)
            fctStrD[fctStr] = 1

        # write the human readable labels as triples (double quotes are stripped from the label)
        if isinstance(subj, str):
            originalTriplesF.write('<{}><http://www.w3.org/2000/01/rdf-schema#label> "{}".\n'.format(subj_obo, subj.replace('"', '')))
        if isinstance(obj, str):
            originalTriplesF.write('<{}><http://www.w3.org/2000/01/rdf-schema#label> "{}".\n'.format(obj_obo, obj.replace('"', '')))

if skipped_missing:
    print('Skipped {} rows with a missing predicate, OBO URI or belief'.format(skipped_missing))

In [None]:
# Peek at the beginning of the fact list: print facts up to and including
# position 20, then stop.
for position, fact in enumerate(env.facts()):
    print(fact)
    if position >= 20:
        break

In [None]:
# Fire the rules; keep the return value so it is still shown as the cell output.
rules_fired = env.run()
# count of inferences by cutoff for transitive belief score: 0.65 = 2241, 0 = 14539

# Close the CLIPS output stream that the rules print to. Until it is closed,
# the last inferred triples may still sit in the stream buffer and be missing
# from test-inference.ntriples when the next cell reads it.
env.eval('(close writeFile)')

rules_fired

The output tells us how many rule right-hand sides (RHS) fired and made changes to working memory.

In [None]:
import re

# Inferred triples as printed by the CLIPS rules, one per line; subject and
# object are still the short 'r<N>' ids handed out while building the facts.
with open('closure_output/test-inference.ntriples', 'r') as f:
    rsL = f.read().split('\n')

# Match a complete bracketed id. A plain str.replace('r1', ...) would also
# rewrite the front of '<r10>', '<r11>', ... and corrupt those URIs.
rgx = re.compile('<(r[0-9]+)>')


def _expand_resource_id(match):
    """Return '<URI>' for a matched '<rN>' token.

    An id that is not in resourceDinv is reported and the token is left unchanged.
    """
    k = match.group(1)
    uri = resourceDinv.get(k)
    if uri is None:
        print('ERROR: key not found in resourceDinv: {}'.format(k))
        return match.group(0)
    return '<{}>'.format(uri)


# Write every line back out with its ids expanded (the predicate is already a URI).
with open('closure_output/inferred-transitive-and-symmetric.ntriples', 'w') as f:
    for it in rsL:
        f.write(rgx.sub(_expand_resource_id, it) + '\n')


### Save as gpickle file with metadata

In [None]:
import pandas as pd
import numpy as np
import pickle
from rdflib.namespace import OWL, RDF, RDFS
import os
import pickle, json

#Create networkx graph from triples
import glob
import hashlib
import json
import networkx as nx  # type: ignore
import os
import os.path

from collections import Counter  # type: ignore
from more_itertools import unique_everseen  # type: ignore
from rdflib import BNode, Graph, Literal, Namespace, URIRef  # type: ignore
from rdflib.namespace import OWL, RDF, RDFS  # type: ignore
from rdflib.plugins.serializers.nt import _quoteLiteral  # type: ignore
import subprocess

from tqdm import tqdm  # type: ignore
from typing import Dict, List, Optional, Set, Tuple, Union

In [None]:
# Load the expanded inferred triples as a list of lines; the text after the
# last newline is kept as a final (empty) entry.
with open('closure_output/inferred-transitive-and-symmetric.ntriples', 'r') as file1:
    g = file1.read()
    graph1 = g.split('\n')

In [None]:
# RDF namespaces: OBO, oboInOwl, XML Schema and the NaPDI import namespace.
obo, oboinowl, schema, napdi = (
    Namespace(base_uri)
    for base_uri in (
        'http://purl.obolibrary.org/obo/',
        'http://www.geneontology.org/formats/oboInOwl#',
        'http://www.w3.org/2001/XMLSchema#',
        'http://napdi.org/napdi_srs_imports:',
    )
)

In [None]:
def n3(node: Union[URIRef, BNode, Literal]) -> str:
    """Serialize an RDFLib node to its RDF 1.1 N-Triples text form.

    Src: https://github.com/RDFLib/rdflib/blob/c11f7b503b50b7c3cdeec0f36261fa09b0615380/rdflib/plugins/serializers/nt.py

    Args:
        node: An RDFLib BNode, URIRef, or Literal.

    Returns:
        The serialized node as a string. A Literal is rendered with rdflib's
        ``_quoteLiteral``; any other node through its own ``n3()`` method.
    """
    if not isinstance(node, Literal):
        return "%s" % node.n3()
    return "%s" % _quoteLiteral(node)

In [None]:
#convert rdflib graph to multidigraph - code borrowed from PheKnowLator: kg_utils.py
#use the pred key to also create a dictionary with metadata about the edge - add INF tag for inferred triples

# Each term of an inferred triple is written as <URI>. Capture everything
# between the brackets so that https:// URIs and URIs containing characters
# such as '-', '#' or '%' are read whole instead of being missed or truncated.
# Compiled once, outside the loop.
rgx = re.compile(r'<(https?://[^<>\s]+)>')

errors = []  # lines that did not yield exactly three URIs (includes blank lines)
nx_mdg = nx.MultiDiGraph()
for triple in graph1:
    match = rgx.findall(triple)
    if len(match) != 3:
        errors.append(triple)
        continue
    subj, pred, obj = match
    s = URIRef(subj)
    p = URIRef(pred)
    o = URIRef(obj)

    # md5 over the serialized subject, predicate and object identifies the edge
    pred_key = hashlib.md5('{}{}{}'.format(n3(s), n3(p), n3(o)).encode()).hexdigest()
    nx_mdg.add_node(s, key=n3(s))
    nx_mdg.add_node(o, key=n3(o))
    # The predicate is the multigraph edge key; every edge is tagged as inferred.
    nx_mdg.add_edge(s, o, key=p, predicate_key=pred_key, weight=0.0,
                    source_graph='machine_read_INF')

# nx.write_gpickle was removed in NetworkX 3.0; a gpickle file is a plain pickle
# of the graph written with the highest protocol, so write it directly.
with open("closure_output/machineread_inferred_symmetric_transitive.gpickle", 'wb') as gpickle_file:
    pickle.dump(nx_mdg, gpickle_file, pickle.HIGHEST_PROTOCOL)

In [None]:
#this should have less edges than rdflib graph after removing 'labels'
nodes = nx.number_of_nodes(nx_mdg)
edges = nx.number_of_edges(nx_mdg)
density = nx.density(nx_mdg)
# Edges per node. An empty graph (no inferred triples parsed) reports 0.0
# instead of raising ZeroDivisionError.
avg_deg = float(edges) / nodes if nodes else 0.0
print(nodes, edges, density, avg_deg)

In [None]:
# Number of lines that did not yield exactly three URIs and were left out of the graph
len(errors)

In [None]:
##get stats of inferred triples
# Re-read the inferred triples file as a list of lines and show how many
# entries the split produced.
with open('closure_output/inferred-transitive-and-symmetric.ntriples') as filei:
    g = filei.read()
    graph1 = g.split('\n')
len(graph1)

In [None]:
# Split every inferred triple into subject / predicate / object URI lists.
# Terms are written as <URI>; capturing everything between the brackets keeps
# https:// URIs and URIs with characters such as '-', '#' or '%' intact instead
# of missing or truncating them. The pattern is compiled once, outside the loop.
rgx = re.compile(r'<(https?://[^<>\s]+)>')

s = []
p = []
o = []
for triple in graph1:
    match = rgx.findall(triple)
    if len(match) != 3:
        # lines without exactly three URIs (including blank lines) are recorded, not parsed
        errors.append(triple)
        continue
    subj, pred, obj = match
    s.append(subj)
    p.append(pred)
    o.append(obj)

In [None]:
# One row per inferred triple, columns in subject / predicate / object order.
dfinf = pd.DataFrame({
    'subject_obo': s,
    'pred_obo': p,
    'object_obo': o,
})
dfinf.head()

In [None]:
# Summary of the inferred-triple frame
dfinf.info()

In [None]:
# Number of inferred triples per predicate URI
dfinf['pred_obo'].value_counts()