## Notebook to implement path searches in graph with -
1. Networkx algorithms
2. Discovery patterns
3. SPARQL queries

* Author: Sanya B Taneja
* Created: 2021-09-24
* Last edited: 2021-09-24

We combine the PheKnowLator and machine reading graphs (currently separate for each NP) using Networkx and search paths in the combined graph.

In [1]:
import os
import os.path
import networkx as nx
import json
import urllib
import traceback
from itertools import islice
from rdflib import Graph, URIRef, BNode, Namespace, Literal
from rdflib.namespace import RDF, OWL
from tqdm import tqdm
import json

In [2]:
import hashlib

In [3]:
import pickle
import pandas as pd
import numpy as np

In [4]:
#import pheknowlator kg_utils 
import sys
sys.path.append('../')
from pkt_kg.utils import *

In [5]:
KG_PATH = '/home/sanya/PheKnowLatorv2/resources/knowledge_graphs/'
MR_PATH = '/home/sanya/PheKnowLatorv2/machine_read/output_graphs/'
KG_NAME = 'PheKnowLator_v3.0.0_full_instance_inverseRelations_OWLNETS_NetworkxMultiDiGraph.gpickle'
#MR_GRAPH_NAME = 'machineread_greentea_version1.gpickle'
MR_GRAPH_NAME_GT = 'machineread_greentea_version2.gpickle'
MR_GRAPH_NAME_KT = 'machineread_kratom_version1.gpickle'
NodeLabelsFilePL = 'PheKnowLator_v3.0.0_full_instance_inverseRelations_OWLNETS_NodeLabels.txt'
NodeLabelsFileMR_gt = 'machineread_greentea_version2_NodeLabels.tsv'
NodeLabelsFileMR_kt = 'machineread_kratom_version1_NodeLabels.tsv'

In [5]:
#create dictionary for node labels from node labels files
df1 = pd.read_csv(KG_PATH+NodeLabelsFilePL, sep='\t')
df2 = pd.read_csv(MR_PATH+NodeLabelsFileMR_gt, sep='\t')
df3 = pd.read_csv(MR_PATH+NodeLabelsFileMR_kt, sep='\t')

In [21]:
df1.head()

Unnamed: 0,entity_type,integer_id,entity_uri,label,description/definition,synonym
0,NODES,344975,<https://uswest.ensembl.org/Homo_sapiens/Trans...,IPO4-205,Transcript IPO4-205 is classified as type 'pro...,
1,NODES,331306,<https://www.ncbi.nlm.nih.gov/snp/rs777826971>,NM_002397.5(MEF2C):c.860C>T (p.Ser287Leu),This variant is a de novo/germline single nucl...,
2,NODES,467156,<https://www.ncbi.nlm.nih.gov/snp/rs781825074>,NM_000489.5(ATRX):c.3218G>C (p.Ser1073Thr),This variant is a germline single nucleotide v...,
3,NODES,548489,<http://purl.obolibrary.org/obo/CHEBI_136208>,phosphatidylethanolamine (P-18:0/18:3),A phosphatidylethanolamine P-36:3 in which the...,
4,NODES,462657,<http://purl.obolibrary.org/obo/PR_O15079-1>,syntaphilin isoform h1 (human),A syntaphilin (human) that is a translation pr...,hSNPH/iso:h1


In [22]:
df2.head()

Unnamed: 0,entity_uri,entity_type,label,cui
0,http://purl.obolibrary.org/obo/SO_0000704,NODES,Genes,C0017337
1,http://purl.obolibrary.org/obo/PR_000041244,NODES,Histones,C0019652
2,http://purl.obolibrary.org/obo/GO_0006473,RELATIONS,Acetylation,
3,http://purl.obolibrary.org/obo/CHEBI_46024,NODES,PRDX2_gene,C1418880
4,http://purl.obolibrary.org/obo/PR_000028799,NODES,Tubulin,C0041348


In [23]:
df3.head()

Unnamed: 0,entity_uri,entity_type,label,cui
0,http://purl.obolibrary.org/obo/CHEBI_36080,NODES,Proteins,C0033684
1,http://purl.obolibrary.org/obo/CHEBI_15422,NODES,"ATP8A2_protein,_human",C2744579
2,http://purl.obolibrary.org/obo/RO_0002436,RELATIONS,Activation,
3,http://purl.obolibrary.org/obo/GO_0055085,NODES,Membrane_Transport_Proteins,C0596902
4,http://purl.obolibrary.org/obo/CHEBI_16335,NODES,Adenosine_Triphosphatases,C0001473


In [24]:
nodeLabels = {}
for i in range(len(df1.index)):
    uri = df1.at[i, 'entity_uri']
    if isinstance(uri, str):
        uri = uri.replace('<', '')
        uri = uri.replace('>', '')
        if uri not in nodeLabels:
            nodeLabels[uri] = df1.at[i, 'label']
len(nodeLabels)

752863

In [25]:
#N(nodeLabels) = 753217 (both PL and MR nodes combined)
for i in range(len(df2.index)):
    uri = df2.at[i, 'entity_uri']
    if isinstance(uri, str):
        uri = uri.replace('<', '')
        uri = uri.replace('>', '')
        if uri not in nodeLabels:
            nodeLabels[uri] = df2.at[i, 'label']
len(nodeLabels)

753204

In [26]:
for i in range(len(df3.index)):
    uri = df3.at[i, 'entity_uri']
    if isinstance(uri, str):
        uri = uri.replace('<', '')
        uri = uri.replace('>', '')
        if uri not in nodeLabels:
            nodeLabels[uri] = df3.at[i, 'label']
len(nodeLabels)

753217

In [64]:
#save to pickle file
#with open(KG_PATH+'nodeLabels_20211005.pickle', 'wb') as filep:
    #pickle.dump(nodeLabels, filep)
with open(KG_PATH+'nodeLabels_20211014.pickle', 'rb') as filep:
    nodeLabels = pickle.load(filep)

In [7]:
pl_kg = nx.read_gpickle(KG_PATH+KG_NAME)

In [14]:
# get the number of nodes, edges, and self-loops
nodes = nx.number_of_nodes(pl_kg)
edges = nx.number_of_edges(pl_kg)
self_loops = nx.number_of_selfloops(pl_kg)

print('There are {} nodes, {} edges, and {} self-loop(s)'.format(nodes, edges, self_loops))
# get degree information
avg_degree = float(edges) / nodes

print('The Average Degree is {}'.format(avg_degree))

There are 757112 nodes, 7243418 edges, and 408 self-loop(s)
The Average Degree is 9.567168397806402


In [15]:
# get 5 nodes with the highest degress
n_deg = sorted([(str(x[0]), x[1]) for x in  pl_kg.degree], key=lambda x: x[1], reverse=1)[:6]

for x in n_deg:
    print('Label: {}'.format(nodeLabels[x[0]]))
    print('{} (degree={})'.format(x[0], x[1]))
# get network density
density = nx.density(pl_kg)

print('The density of the graph is: {}'.format(density))

Label: transcript
http://purl.obolibrary.org/obo/SO_0000673 (degree=190850)
Label: SNV
http://purl.obolibrary.org/obo/SO_0001483 (degree=121020)
Label: Homo sapiens
http://purl.obolibrary.org/obo/NCBITaxon_9606 (degree=116478)
Label: protein_coding_gene
http://purl.obolibrary.org/obo/SO_0001217 (degree=105046)
Label: testis
http://purl.obolibrary.org/obo/UBERON_0000473 (degree=43795)
Label: lncRNA_with_retained_intron
http://purl.obolibrary.org/obo/SO_0002113 (degree=29340)
The density of the graph is: 1.2636414472655133e-05


In [8]:
mr_kg = nx.read_gpickle(MR_PATH+MR_GRAPH_NAME_GT)
mr_kg2 = nx.read_gpickle(MR_PATH+MR_GRAPH_NAME_KT)

In [29]:
# get the number of nodes, edges, and self-loops
print('Green Tea Machine Read: ')
nodes = nx.number_of_nodes(mr_kg)
edges = nx.number_of_edges(mr_kg)
self_loops = nx.number_of_selfloops(mr_kg)

print('There are {} nodes, {} edges, and {} self-loop(s)'.format(nodes, edges, self_loops))
# get degree information
avg_degree = float(edges) / nodes

print('The Average Degree is {}'.format(avg_degree))

print('Kratom Machine Read: ')
nodes = nx.number_of_nodes(mr_kg2)
edges = nx.number_of_edges(mr_kg2)
self_loops = nx.number_of_selfloops(mr_kg2)

print('There are {} nodes, {} edges, and {} self-loop(s)'.format(nodes, edges, self_loops))
# get degree information
avg_degree = float(edges) / nodes

print('The Average Degree is {}'.format(avg_degree))

Green Tea Machine Read: 
There are 2009 nodes, 7365 edges, and 103 self-loop(s)
The Average Degree is 3.666002986560478
Kratom Machine Read: 
There are 272 nodes, 362 edges, and 5 self-loop(s)
The Average Degree is 1.3308823529411764


In [30]:
# get 5 nodes with the highest degress
print('Green Tea')
n_deg = sorted([(str(x[0]), x[1]) for x in  mr_kg.degree], key=lambda x: x[1], reverse=1)[:6]

for x in n_deg:
    print('Label: {}'.format(nodeLabels[x[0]]))
    print('{} (degree={})'.format(x[0], x[1]))
# get network density
density = nx.density(mr_kg)

print('The density of the graph is: {}'.format(density))
print('Kratom')
# get 5 nodes with the highest degress
n_deg = sorted([(str(x[0]), x[1]) for x in  mr_kg2.degree], key=lambda x: x[1], reverse=1)[:6]

for x in n_deg:
    print('Label: {}'.format(nodeLabels[x[0]]))
    print('{} (degree={})'.format(x[0], x[1]))
# get network density
density = nx.density(mr_kg2)

print('The density of the graph is: {}'.format(density))

Green Tea
Label: (-)-epigallocatechin 3-gallate
http://purl.obolibrary.org/obo/CHEBI_4806 (degree=939)
Label: Tea
http://napdi.org/napdi-srs-imports:camellia_sinensis_leaf (degree=369)
Label: catechin
http://purl.obolibrary.org/obo/CHEBI_23053 (degree=206)
Label: 凋亡过程
http://purl.obolibrary.org/obo/GO_0006915 (degree=161)
Label: glucose
http://purl.obolibrary.org/obo/CHEBI_17234 (degree=153)
Label: Mus <genus>
http://purl.obolibrary.org/obo/NCBITaxon_10088 (degree=144)
The density of the graph is: 0.0018256986984862937
Kratom
Label: Mitragynine
http://purl.obolibrary.org/obo/CHEBI_6956 (degree=52)
Label: carbon monoxide
http://purl.obolibrary.org/obo/CHEBI_17245 (degree=19)
Label: high-density lipoprotein particle
http://purl.obolibrary.org/obo/GO_0034364 (degree=16)
Label: phosphatidic acid
http://purl.obolibrary.org/obo/CHEBI_16337 (degree=13)
Label: potassium voltage-gated channel subfamily H member 2 (human)
http://purl.obolibrary.org/obo/PR_Q12809 (degree=13)
Label: NPC1-like intr

In [9]:
#combine graphs - PL and MR
nx_graph = nx.compose_all([pl_kg, mr_kg, mr_kg2])
print(type(nx_graph))

<class 'networkx.classes.multidigraph.MultiDiGraph'>


In [32]:
# get the number of nodes, edges, and self-loops
nodes = nx.number_of_nodes(nx_graph)
edges = nx.number_of_edges(nx_graph)
self_loops = nx.number_of_selfloops(nx_graph)

print('There are {} nodes, {} edges, and {} self-loop(s)'.format(nodes, edges, self_loops))
# get degree information
avg_degree = float(edges) / nodes

print('The Average Degree is {}'.format(avg_degree))

There are 757457 nodes, 7251036 edges, and 514 self-loop(s)
The Average Degree is 9.572868162813203


In [19]:
# get 5 nodes with the highest degress
n_deg = sorted([(str(x[0]), x[1]) for x in  nx_graph.degree], key=lambda x: x[1], reverse=1)[:6]

for x in n_deg:
    print('{} (degree={})'.format(x[0], x[1]))
# get network density
density = nx.density(nx_graph)

print('The density of the graph is: {}'.format(density))

http://purl.obolibrary.org/obo/SO_0001217 (degree=184732)
http://purl.obolibrary.org/obo/SO_0000673 (degree=180194)
http://purl.obolibrary.org/obo/NCBITaxon_9606 (degree=85578)
http://purl.obolibrary.org/obo/PR_000000001 (degree=57850)
http://purl.obolibrary.org/obo/NCBITaxon_10090 (degree=52725)
http://purl.obolibrary.org/obo/NCBITaxon_3702 (degree=37565)
The density of the graph is: 8.613704937549329e-06


In [33]:
mr_kg_comb = nx.compose(mr_kg, mr_kg2)
# get the number of nodes, edges, and self-loops
nodes = nx.number_of_nodes(mr_kg_comb)
edges = nx.number_of_edges(mr_kg_comb)
self_loops = nx.number_of_selfloops(mr_kg_comb)

print('There are {} nodes, {} edges, and {} self-loop(s)'.format(nodes, edges, self_loops))
# get degree information
avg_degree = float(edges) / nodes

print('The Average Degree is {}'.format(avg_degree))

There are 2169 nodes, 7719 edges, and 106 self-loop(s)
The Average Degree is 3.5587828492392806


In [34]:
# get 5 nodes with the highest degress
n_deg = sorted([(str(x[0]), x[1]) for x in  mr_kg_comb.degree], key=lambda x: x[1], reverse=1)[:6]

for x in n_deg:
    print('{} (degree={})'.format(x[0], x[1]))
# get network density
density = nx.density(mr_kg_comb)

print('The density of the graph is: {}'.format(density))

http://purl.obolibrary.org/obo/CHEBI_4806 (degree=939)
http://napdi.org/napdi-srs-imports:camellia_sinensis_leaf (degree=369)
http://purl.obolibrary.org/obo/CHEBI_23053 (degree=206)
http://purl.obolibrary.org/obo/GO_0006915 (degree=167)
http://purl.obolibrary.org/obo/CHEBI_17234 (degree=158)
http://purl.obolibrary.org/obo/NCBITaxon_10088 (degree=146)
The density of the graph is: 0.0016415050042616608


In [33]:
#nodes and edges examples
nodes = list(nx_graph.nodes(data=True))
for x in nodes:
    print(x)
    break

(rdflib.term.URIRef('http://purl.obolibrary.org/obo/CLO_0005160'), {'key': '<http://purl.obolibrary.org/obo/CLO_0005160>'})


In [34]:
i = 0
for u, v, keys in nx_graph.edges(keys=True):
    i = i+1
    print('Edge', i)
    print(u, nodeLabels[str(u)])
    print(keys, nodeLabels[str(keys)])
    print(v, nodeLabels[str(v)])
    if i==10:
        break

Edge 1
http://purl.obolibrary.org/obo/CLO_0005160 IGF037/87 cell
http://purl.obolibrary.org/obo/BFO_0000050 part of
http://purl.obolibrary.org/obo/NCBITaxon_9606 Homo sapiens
Edge 2
http://purl.obolibrary.org/obo/CLO_0005160 IGF037/87 cell
http://purl.obolibrary.org/obo/RO_0001000 derives from
http://purl.obolibrary.org/obo/CL_0000000 cell
Edge 3
http://purl.obolibrary.org/obo/CLO_0005160 IGF037/87 cell
http://www.w3.org/2000/01/rdf-schema#subClassOf subClassOf
http://purl.obolibrary.org/obo/CLO_0000020 immortal human cell line cell
Edge 4
http://purl.obolibrary.org/obo/CLO_0005160 IGF037/87 cell
http://purl.obolibrary.org/obo/BFO_0000050 part of
http://purl.obolibrary.org/obo/UBERON_0000014 zone of skin
Edge 5
http://purl.obolibrary.org/obo/CLO_0005160 IGF037/87 cell
http://www.w3.org/2000/01/rdf-schema#subClassOf subClassOf
http://purl.obolibrary.org/obo/CLO_0000438 immortal human skin-derived cell line cell
Edge 6
http://purl.obolibrary.org/obo/NCBITaxon_9606 Homo sapiens
http://www

In [116]:
#Useful functions
#nx_graph.get_edge_data(u, v, key=None, default=None]) 
#Returns the attribute dictionary associated with edge (u, v).
#key = hashable identifier, optional (default=None), Return data only for the edge with specified key.
node1 = URIRef('http://purl.obolibrary.org/obo/CHEBI_47495')
node2 = URIRef('http://purl.obolibrary.org/obo/GO_0031325')
edge_keys = list(nx_graph.get_edge_data(node1, node2).keys())
for item in edge_keys:
    print(str(item))
    print(nodeLabels[str(item)])

http://purl.obolibrary.org/obo/RO_0002436
molecularly interacts with


In [57]:
node1

rdflib.term.URIRef('http://purl.obolibrary.org/obo/CHEBI_47495')

In [58]:
node2

rdflib.term.URIRef('http://purl.obolibrary.org/obo/GO_0031325')

In [52]:
nx_graph[node1][node2]

AtlasView({rdflib.term.URIRef('http://purl.obolibrary.org/obo/RO_0002436'): {'predicate_key': '97f9f8732bc99bed95691c65240cf1b7', 'weight': 0.0}})

In [107]:
nx_graph[node1][node2]

AtlasView({rdflib.term.URIRef('http://purl.obolibrary.org/obo/RO_0002449'): {'predicate_key': '6dae54e8b8bbe9cdb6c615e404a5d7ce', 'weight': 0.0, 'pmid': '29356593', 'timestamp': '2018 Feb', 'source_graph': 'machine_read'}})

In [54]:
ewt = [e['weight'] for e in nx_graph[node1][node2].values()]
ewt

[0.0]

In [15]:
nx_graph.get_edge_data(node1, node2, default=0)

{rdflib.term.URIRef('http://purl.obolibrary.org/obo/RO_0002436'): {'predicate_key': '97f9f8732bc99bed95691c65240cf1b7',
  'weight': 0.0}}

In [108]:
#nx_graph.edges[node1, node2, 'key'] = key is hashable of triple -- how do we decipher/unhash?
nx_graph.edges[node1, node2]

ValueError: not enough values to unpack (expected 3, got 2)

## Path Searches
1. Single source shortest path (saved)
2. k-simple paths (saved for cyp3a4, midazolam)
3. Bidirectional shortest paths (in nb)
4. Shortest paths - do

In [65]:
DIR_OUT = '/home/sanya/PheKnowLatorv2/output_files/'

In [66]:
obo = Namespace('http://purl.obolibrary.org/obo/')
napdi = Namespace('http://napdi.org/napdi_srs_imports:')

Functions. Create function for -
1. Get path narrative given path or list of paths
2. Get path URIs given path or list of paths
3. Get path with machine reading output from 2017 and prior
4. Save path with labels to file


In [67]:
def get_path_labels(path):
    path_labels = []
    if len(path) < 1:
        print('Path length 1, skipping')
        return
    for edge in zip(path, path[1:]):
        data = nx_graph.get_edge_data(*edge)
        pred = list(data.keys())[0]
        node1_lab = str(edge[0])
        node2_lab = str(edge[1])
        if node1_lab in nodeLabels:
            node1_lab = nodeLabels[node1_lab]
        if node2_lab in nodeLabels:
            node2_lab = nodeLabels[node2_lab]
        pred_lab = nodeLabels[str(pred)]
        if list(data.values())[0]:
            if 'source_graph' in list(data.values())[0]:
                source_graph = 'machine_read'
            else:
                source_graph = ''
        else:
            source_graph = ''
        labels = [node1_lab, pred_lab, node2_lab, source_graph]
        path_labels.append(labels)
    return path_labels

In [68]:
def get_path_uri(path):
    path_uri = []
    if len(path) < 1:
        print('Path length 1, skipping')
        return
    for edge in zip(path, path[1:]):
        data = nx_graph.get_edge_data(*edge)
        pred = list(data.keys())[0]
        attribute = list(data.values())
        uri = [str(edge[0]), pred, str(edge[1]), attribute]
        path_uri.append(uri)
    return path_uri

In [69]:
#get shortest path from green tea leaf
greentea_path = nx.single_source_shortest_path(nx_graph, napdi.camellia_sinensis_leaf)

In [70]:
type(greentea_path)

dict

In [71]:
save1 = 'greentea_single_source_shortest_path_50.txt'

In [72]:
#get 20 paths from green tea single source shortest path
#if returned paths are dictionary
count = 0
for target, node_list in greentea_path.items():
    count += 1
    if target != napdi.camellia_sinensis_leaf:
        if str(target) not in nodeLabels:
            target_label = str(target).split('/')[-1]
        else:
            target_label = nodeLabels[str(target)]
        print('\n{} - {} Path:'.format(str(napdi.camellia_sinensis_leaf).split('/')[-1], target_label))
        path_labels = get_path_labels(node_list)
        print(path_labels)
    if count == 20:
        break


napdi_srs_imports:camellia_sinensis_leaf - (-)-epicatechin Path:
[['Camellia_sinensis_leaf', 'has component', '(-)-epicatechin', '']]

napdi_srs_imports:camellia_sinensis_leaf - gallocatechin Path:
[['Camellia_sinensis_leaf', 'has component', 'gallocatechin', '']]

napdi_srs_imports:camellia_sinensis_leaf - Camellia_sinensis_whole Path:
[['Camellia_sinensis_leaf', 'part of', 'Camellia_sinensis_whole', '']]

napdi_srs_imports:camellia_sinensis_leaf - plant anatomical entity Path:
[['Camellia_sinensis_leaf', 'subClassOf', 'plant anatomical entity', '']]

napdi_srs_imports:camellia_sinensis_leaf - (-)-epigallocatechin gallate Path:
[['Camellia_sinensis_leaf', 'has component', '(-)-epigallocatechin gallate', '']]

napdi_srs_imports:camellia_sinensis_leaf - (-)-epigallocatechin Path:
[['Camellia_sinensis_leaf', 'has component', '(-)-epigallocatechin', '']]

napdi_srs_imports:camellia_sinensis_leaf - (-)-epicatechin-3-O-gallate Path:
[['Camellia_sinensis_leaf', 'has component', '(-)-epicate

In [73]:
#save 100 paths from green tea single source shortest path to file
#if returned paths are dictionary
count = 0
file_save = open(DIR_OUT+save1, 'w')
for target, node_list in greentea_path.items():
    count += 1
    if target != napdi.camellia_sinensis_leaf:
        if str(target) not in nodeLabels:
            target_label = str(target).split('/')[-1]
        else:
            target_label = nodeLabels[str(target)]
        file_save.write('\n{} - {} Path:\n'.format(str(napdi.camellia_sinensis_leaf).split('/')[-1], target_label))
        path_labels = get_path_labels(node_list)
        for triples in path_labels:
            for item in triples:
                file_save.write(str(item)+' ')
            file_save.write('\n')
    if count == 100:
        break
file_save.close()

In [None]:
#obo.CHEBI_83161 - St. Johns Wort extract (to test graph)

In [74]:
#green tea and warfarin
pathx = nx.bidirectional_shortest_path(nx_graph, napdi.camellia_sinensis_leaf, obo.CHEBI_10033)

In [75]:
pathx

[rdflib.term.URIRef('http://napdi.org/napdi_srs_imports:camellia_sinensis_leaf'),
 rdflib.term.URIRef('http://purl.obolibrary.org/obo/CHEBI_68330'),
 rdflib.term.URIRef('http://napdi.org/napdi-srs-imports:camellia_sinensis_leaf'),
 rdflib.term.URIRef('http://purl.obolibrary.org/obo/CHEBI_10033')]

In [35]:
#scratch try
for edge in zip(pathx,pathx[1:]):                                                 
    data = nx_graph.get_edge_data(*edge)    
    print('Edge info: ')
    print(data.values())
    print('source_graph' in list(data.values())[0])

Edge info: 
dict_values([{'predicate_key': '4bbadcc28097247fd55f77cbeb77ab74', 'weight': 0.0}])
False
Edge info: 
dict_values([{'predicate_key': 'cf627db0a97bc9798bf6f089a06581c9', 'weight': 0.0, 'pmid': '30286210', 'timestamp': '2018', 'source_graph': 'machine_read', 'belief': 0.65}])
True
Edge info: 
dict_values([{'predicate_key': '6dae54e8b8bbe9cdb6c615e404a5d7ce', 'weight': 0.0, 'pmid': '30286210', 'timestamp': '2018', 'source_graph': 'machine_read', 'belief': 0.65}])
True


In [21]:
path_labels

[['Camellia_sinensis_leaf',
  'has component',
  '(-)-epicatechin-3-O-gallate',
  ''],
 ['(-)-epicatechin-3-O-gallate',
  'molecularly interacts with',
  'UDP-glucuronosyltransferase 1A1 (human)',
  'machine_read']]

In [76]:
epicatechin = obo.CHEBI_90
catechin = obo.CHEBI_23053
egcg = obo.CHEBI_4806
greentea = napdi.camellia_sinensis_leaf

In [60]:
edge_path_test = nx.all_simple_edge_paths(nx_graph, catechin, obo.PR_P08684, 10)

In [61]:
i = 0
for x in edge_path_test:
    print(x)
    i += 1
    if i==3:
        break

[(rdflib.term.URIRef('http://purl.obolibrary.org/obo/CHEBI_23053'), rdflib.term.URIRef('http://purl.obolibrary.org/obo/GO_0018130'), rdflib.term.URIRef('http://purl.obolibrary.org/obo/RO_0002436')), (rdflib.term.URIRef('http://purl.obolibrary.org/obo/GO_0018130'), rdflib.term.URIRef('http://purl.obolibrary.org/obo/GO_0044249'), rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#subClassOf')), (rdflib.term.URIRef('http://purl.obolibrary.org/obo/GO_0044249'), rdflib.term.URIRef('http://purl.obolibrary.org/obo/GO_0009058'), rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#subClassOf')), (rdflib.term.URIRef('http://purl.obolibrary.org/obo/GO_0009058'), rdflib.term.URIRef('http://purl.obolibrary.org/obo/PR_P17735'), rdflib.term.URIRef('http://purl.obolibrary.org/obo/RO_0000057')), (rdflib.term.URIRef('http://purl.obolibrary.org/obo/PR_P17735'), rdflib.term.URIRef('https://uswest.ensembl.org/Homo_sapiens/Transcript/Summary?t=ENST00000355962'), rdflib.term.URIRef('http://purl.ob

In [77]:
#returns simple paths nodes and edges, 
def k_simple_edge_paths(G, source, target, k, shortestLen):
    paths = nx.all_simple_edge_paths(G, source, target, cutoff=shortestLen+20)
    path_l = []
    path_n = []
    i = 0
    while i<k:
        try:
            print('[info] applying next operator to search for a simple path of max length {}'.format(shortestLen+20))
            path = next(paths)
        except StopIteration:
            break
        print('[info] Simple path found of length {}'.format(len(path))) 
        if len(path) > shortestLen:
            print('[info] Simple path length greater than shortest path length ({}) so adding to results'.format(shortestLen))
            path_l.append(path)
        i += 1
    for path in path_l:
        triple_list = []
        for triple in path:
            subj_lab = ''
            pred_lab = ''
            obj_lab = ''
            subj = str(triple[0])
            pred = str(triple[2])
            obj = str(triple[1])
            if subj in nodeLabels:
                subj_lab = nodeLabels[subj]
            if obj in nodeLabels:
                obj_lab = nodeLabels[obj]
            if pred in nodeLabels:
                pred_lab = nodeLabels[pred]
            triple_labels = (subj_lab, pred_lab, obj_lab)
            triple_list.append(triple_labels)
        path_n.append(triple_list)
    return path_l, path_n

In [79]:
cyp3a4_edge_paths, cyp3a4_edge_path_labs = k_simple_edge_paths(nx_graph, greentea, obo.PR_P08684, 10, 0)

[info] applying next operator to search for a simple path of max length 20
[info] Simple path found of length 20
[info] Simple path length greater than shortest path length (0) so adding to results
[info] applying next operator to search for a simple path of max length 20
[info] Simple path found of length 20
[info] Simple path length greater than shortest path length (0) so adding to results
[info] applying next operator to search for a simple path of max length 20
[info] Simple path found of length 20
[info] Simple path length greater than shortest path length (0) so adding to results
[info] applying next operator to search for a simple path of max length 20
[info] Simple path found of length 20
[info] Simple path length greater than shortest path length (0) so adding to results
[info] applying next operator to search for a simple path of max length 20
[info] Simple path found of length 20
[info] Simple path length greater than shortest path length (0) so adding to results
[info] app

In [80]:
source = str(greentea)
target = str(obo.PR_P08684)
save2 = 'greentea_cyp3a4_simple_paths_20.txt'
file_save = open(DIR_OUT+save2, 'w')
if source in nodeLabels:
    source_label = nodeLabels[source]
if target in nodeLabels:
    target_label = nodeLabels[target]
file_save.write('\n{} - {} Simple Path (cutoff=20):\n'.format(source_label, target_label))
i = 0
for path_list in cyp3a4_edge_path_labs:
    file_save.write('\nPATH: '+str(i)+'\n')
    for triples in path_list:
        for item in triples:
            file_save.write(str(item)+' ')
        file_save.write('\n')
    i += 1
file_save.close()

In [81]:
pathx = nx.bidirectional_shortest_path(nx_graph, obo.CHEBI_23053, obo.PR_P08684)
path_labels = get_path_labels(pathx)
for triples in path_labels:
    print(triples)

['catechin', 'molecularly interacts with', 'protein binding', '']
['protein binding', 'function of', 'cytochrome P450 3A4 (human)', '']


In [82]:
source = str(obo.CHEBI_23053)
target = str(obo.PR_P08684)
save2 = 'catechin_cyp3a4_simple_paths_20.txt'
file_save = open(DIR_OUT+save2, 'w')
cyp3a4_edge_paths, cyp3a4_edge_path_labs = k_simple_edge_paths(nx_graph, obo.CHEBI_23053, obo.PR_P08684, 20, 2)
if source in nodeLabels:
    source_label = nodeLabels[source]
if target in nodeLabels:
    target_label = nodeLabels[target]
file_save.write('\n{} - {} Simple Path (cutoff=20):\n'.format(source_label, target_label))
i = 0
for path_list in cyp3a4_edge_path_labs:
    file_save.write('\nPATH: '+str(i)+'\n')
    for triples in path_list:
        for item in triples:
            file_save.write(str(item)+' ')
        file_save.write('\n')
    i += 1
file_save.close()

[info] applying next operator to search for a simple path of max length 22
[info] Simple path found of length 21
[info] Simple path length greater than shortest path length (2) so adding to results
[info] applying next operator to search for a simple path of max length 22
[info] Simple path found of length 22
[info] Simple path length greater than shortest path length (2) so adding to results
[info] applying next operator to search for a simple path of max length 22
[info] Simple path found of length 22
[info] Simple path length greater than shortest path length (2) so adding to results
[info] applying next operator to search for a simple path of max length 22
[info] Simple path found of length 22
[info] Simple path length greater than shortest path length (2) so adding to results
[info] applying next operator to search for a simple path of max length 22
[info] Simple path found of length 22
[info] Simple path length greater than shortest path length (2) so adding to results
[info] app

In [83]:
source = str(obo.CHEBI_4806)
target = str(obo.CHEBI_41879)
save2 = 'EGCG_dexamethasone_simple_paths_20.txt'
file_save = open(DIR_OUT+save2, 'w')
cyp3a4_edge_paths, cyp3a4_edge_path_labs = k_simple_edge_paths(nx_graph, obo.CHEBI_4806, obo.CHEBI_41879, 20, 0)
if source in nodeLabels:
    source_label = nodeLabels[source]
if target in nodeLabels:
    target_label = nodeLabels[target]
file_save.write('\n{} - {} Simple Path (cutoff=20):\n'.format(source_label, target_label))
i = 0
for path_list in cyp3a4_edge_path_labs:
    file_save.write('\nPATH: '+str(i)+'\n')
    for triples in path_list:
        for item in triples:
            file_save.write(str(item)+' ')
        file_save.write('\n')
    i += 1
file_save.close()

[info] applying next operator to search for a simple path of max length 20
[info] Simple path found of length 20
[info] Simple path length greater than shortest path length (0) so adding to results
[info] applying next operator to search for a simple path of max length 20
[info] Simple path found of length 20
[info] Simple path length greater than shortest path length (0) so adding to results
[info] applying next operator to search for a simple path of max length 20
[info] Simple path found of length 20
[info] Simple path length greater than shortest path length (0) so adding to results
[info] applying next operator to search for a simple path of max length 20
[info] Simple path found of length 20
[info] Simple path length greater than shortest path length (0) so adding to results
[info] applying next operator to search for a simple path of max length 20
[info] Simple path found of length 20
[info] Simple path length greater than shortest path length (0) so adding to results
[info] app

In [84]:
source = str(obo.CHEBI_4806)
target = str(obo.UBERON_0000468)
save2 = 'EGCG_bodyweight_simple_paths_20.txt'
file_save = open(DIR_OUT+save2, 'w')
cyp3a4_edge_paths, cyp3a4_edge_path_labs = k_simple_edge_paths(nx_graph, obo.CHEBI_4806, obo.UBERON_0000468, 20, 0)
if source in nodeLabels:
    source_label = nodeLabels[source]
if target in nodeLabels:
    target_label = nodeLabels[target]
file_save.write('\n{} - {} Simple Path (cutoff=20):\n'.format(source_label, target_label))
i = 0
for path_list in cyp3a4_edge_path_labs:
    file_save.write('\nPATH: '+str(i)+'\n')
    for triples in path_list:
        for item in triples:
            file_save.write(str(item)+' ')
        file_save.write('\n')
    i += 1
file_save.close()

[info] applying next operator to search for a simple path of max length 20
[info] Simple path found of length 20
[info] Simple path length greater than shortest path length (0) so adding to results
[info] applying next operator to search for a simple path of max length 20
[info] Simple path found of length 20
[info] Simple path length greater than shortest path length (0) so adding to results
[info] applying next operator to search for a simple path of max length 20
[info] Simple path found of length 20
[info] Simple path length greater than shortest path length (0) so adding to results
[info] applying next operator to search for a simple path of max length 20
[info] Simple path found of length 20
[info] Simple path length greater than shortest path length (0) so adding to results
[info] applying next operator to search for a simple path of max length 20
[info] Simple path found of length 20
[info] Simple path length greater than shortest path length (0) so adding to results
[info] app

In [85]:
for item in zip(cyp3a4_edge_path_labs[0], cyp3a4_edge_paths[0]):
    print(item)

(('(-)-epigallocatechin 3-gallate', 'subClassOf', 'flavans'), (rdflib.term.URIRef('http://purl.obolibrary.org/obo/CHEBI_4806'), rdflib.term.URIRef('http://purl.obolibrary.org/obo/CHEBI_38672'), rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#subClassOf')))
(('flavans', 'subClassOf', '1-benzopyran'), (rdflib.term.URIRef('http://purl.obolibrary.org/obo/CHEBI_38672'), rdflib.term.URIRef('http://purl.obolibrary.org/obo/CHEBI_38443'), rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#subClassOf')))
(('1-benzopyran', 'subClassOf', 'benzopyran'), (rdflib.term.URIRef('http://purl.obolibrary.org/obo/CHEBI_38443'), rdflib.term.URIRef('http://purl.obolibrary.org/obo/CHEBI_22727'), rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#subClassOf')))
(('benzopyran', 'molecularly interacts with', 'secretory granule lumen'), (rdflib.term.URIRef('http://purl.obolibrary.org/obo/CHEBI_22727'), rdflib.term.URIRef('http://purl.obolibrary.org/obo/GO_0034774'), rdflib.term.URIRef('http://

In [87]:
source = str(obo.CHEBI_23053)
target = str(obo.PR_P08684)
save2 = 'catechin_cyp3a4_simple_paths_20.txt'
file_save = open(DIR_OUT+save2, 'w')
cyp3a4_edge_paths, cyp3a4_edge_path_labs = k_simple_edge_paths(nx_graph, obo.CHEBI_23053, obo.PR_P08684, 20, 0)
if source in nodeLabels:
    source_label = nodeLabels[source]
if target in nodeLabels:
    target_label = nodeLabels[target]
file_save.write('\n{} - {} Simple Path (cutoff=20):\n'.format(source_label, target_label))
i = 0
for path_list in cyp3a4_edge_path_labs:
    file_save.write('\nPATH: '+str(i)+'\n')
    for triples in path_list:
        for item in triples:
            file_save.write(str(item)+' ')
        file_save.write('\n')
    i += 1
file_save.close()

[info] applying next operator to search for a simple path of max length 20
[info] Simple path found of length 20
[info] Simple path length greater than shortest path length (0) so adding to results
[info] applying next operator to search for a simple path of max length 20
[info] Simple path found of length 20
[info] Simple path length greater than shortest path length (0) so adding to results
[info] applying next operator to search for a simple path of max length 20
[info] Simple path found of length 20
[info] Simple path length greater than shortest path length (0) so adding to results
[info] applying next operator to search for a simple path of max length 20
[info] Simple path found of length 20
[info] Simple path length greater than shortest path length (0) so adding to results
[info] applying next operator to search for a simple path of max length 20
[info] Simple path found of length 20
[info] Simple path length greater than shortest path length (0) so adding to results
[info] app

In [88]:
source = str(obo.CHEBI_23053)
target = str(obo.HP_0003074)
save2 = 'catechin_hyperglycemia_simple_paths_20.txt'
file_save = open(DIR_OUT+save2, 'w')
cyp3a4_edge_paths, cyp3a4_edge_path_labs = k_simple_edge_paths(nx_graph, obo.CHEBI_23053, obo.HP_0003074, 20, 0)
if source in nodeLabels:
    source_label = nodeLabels[source]
if target in nodeLabels:
    target_label = nodeLabels[target]
file_save.write('\n{} - {} Simple Path (cutoff=20):\n'.format(source_label, target_label))
i = 0
for path_list in cyp3a4_edge_path_labs:
    file_save.write('\nPATH: '+str(i)+'\n')
    for triples in path_list:
        for item in triples:
            file_save.write(str(item)+' ')
        file_save.write('\n')
    i += 1
file_save.close()

[info] applying next operator to search for a simple path of max length 20
[info] Simple path found of length 20
[info] Simple path length greater than shortest path length (0) so adding to results
[info] applying next operator to search for a simple path of max length 20
[info] Simple path found of length 20
[info] Simple path length greater than shortest path length (0) so adding to results
[info] applying next operator to search for a simple path of max length 20
[info] Simple path found of length 20
[info] Simple path length greater than shortest path length (0) so adding to results
[info] applying next operator to search for a simple path of max length 20
[info] Simple path found of length 20
[info] Simple path length greater than shortest path length (0) so adding to results
[info] applying next operator to search for a simple path of max length 20
[info] Simple path found of length 20
[info] Simple path length greater than shortest path length (0) so adding to results
[info] app

In [89]:
source = str(napdi.camellia_sinensis_leaf)
target = str(obo.PR_O08684)
save2 = 'greentea_cyp3a4_simple_paths_20.txt'
file_save = open(DIR_OUT+save2, 'w')
cyp3a4_edge_paths, cyp3a4_edge_path_labs = k_simple_edge_paths(nx_graph, napdi.camellia_sinensis_leaf, obo.PR_P08684, 20, 0)
if source in nodeLabels:
    source_label = nodeLabels[source]
if target in nodeLabels:
    target_label = nodeLabels[target]
file_save.write('\n{} - {} Simple Path (cutoff=20):\n'.format(source_label, target_label))
i = 0
for path_list in cyp3a4_edge_path_labs:
    file_save.write('\nPATH: '+str(i)+'\n')
    for triples in path_list:
        for item in triples:
            file_save.write(str(item)+' ')
        file_save.write('\n')
    i += 1
file_save.close()

[info] applying next operator to search for a simple path of max length 20
[info] Simple path found of length 20
[info] Simple path length greater than shortest path length (0) so adding to results
[info] applying next operator to search for a simple path of max length 20
[info] Simple path found of length 20
[info] Simple path length greater than shortest path length (0) so adding to results
[info] applying next operator to search for a simple path of max length 20
[info] Simple path found of length 20
[info] Simple path length greater than shortest path length (0) so adding to results
[info] applying next operator to search for a simple path of max length 20
[info] Simple path found of length 20
[info] Simple path length greater than shortest path length (0) so adding to results
[info] applying next operator to search for a simple path of max length 20
[info] Simple path found of length 20
[info] Simple path length greater than shortest path length (0) so adding to results
[info] app

In [90]:
cyp3a4_edge_path_labs[0]

[('Camellia_sinensis_leaf', 'has component', '(-)-epicatechin'),
 ('(-)-epicatechin', 'is enantiomer of', '(+)-epicatechin'),
 ('(+)-epicatechin', 'subClassOf', 'polyphenol'),
 ('polyphenol',
  'molecularly interacts with',
  'regulation of immune system process'),
 ('regulation of immune system process',
  '',
  'alpha-1-acid glycoprotein 1 (human)'),
 ('alpha-1-acid glycoprotein 1 (human)',
  'participates_in',
  'Innate Immune System'),
 ('Innate Immune System', '', 'CD3G (human)'),
 ('CD3G (human)', 'participates_in', 'TCR signaling'),
 ('TCR signaling', '', 'proteasome subunit beta type-8 (human)'),
 ('proteasome subunit beta type-8 (human)',
  'participates_in',
  'Beta-catenin independent WNT signaling'),
 ('Beta-catenin independent WNT signaling', '', 'GNGT2 (human)'),
 ('GNGT2 (human)', 'participates_in', 'Ca2+ pathway'),
 ('Ca2+ pathway', '', 'inositol 1,4,5-trisphosphate receptor type 2 (human)'),
 ('inositol 1,4,5-trisphosphate receptor type 2 (human)',
  'molecularly inter

In [41]:
cyp3a4_edge_paths[0]

[(rdflib.term.URIRef('http://napdi.org/napdi_srs_imports:camellia_sinensis_leaf'),
  rdflib.term.URIRef('http://purl.obolibrary.org/obo/CHEBI_90'),
  rdflib.term.URIRef('http://purl.obolibrary.org/obo/RO_0002180')),
 (rdflib.term.URIRef('http://purl.obolibrary.org/obo/CHEBI_90'),
  rdflib.term.URIRef('http://purl.obolibrary.org/obo/CHEBI_76125'),
  rdflib.term.URIRef('http://purl.obolibrary.org/obo/chebi#is_enantiomer_of')),
 (rdflib.term.URIRef('http://purl.obolibrary.org/obo/CHEBI_76125'),
  rdflib.term.URIRef('http://purl.obolibrary.org/obo/CHEBI_26195'),
  rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#subClassOf')),
 (rdflib.term.URIRef('http://purl.obolibrary.org/obo/CHEBI_26195'),
  rdflib.term.URIRef('http://purl.obolibrary.org/obo/GO_0002682'),
  rdflib.term.URIRef('http://purl.obolibrary.org/obo/RO_0002436')),
 (rdflib.term.URIRef('http://purl.obolibrary.org/obo/GO_0002682'),
  rdflib.term.URIRef('http://purl.obolibrary.org/obo/PR_P02763'),
  rdflib.term.URIRef('htt

In [91]:
def k_shortest_paths(G, source, target, k, weight='weight'):
    return list(islice(nx.all_shortest_paths(G, source, target, weight=weight), k))

In [92]:
#only returns node list, use get_path_labels to generate edges and get labels
def k_simple_paths(G, source, target, k, shortestLen):
    paths = nx.all_simple_paths(G, source, target, cutoff=shortestLen+20)
    path_l = []
    i = 0
    while i < k:
        try:
            print('[info] applying next operator to search for a simple path of max length {}'.format(shortestLen+20))
            path = next(paths)
        except StopIteration:
            break
        print('[info] Simple path found of length {}'.format(len(path))) 
        if len(path) > shortestLen:
            print('[info] Simple path length greater than shortest path length ({}) so adding to results'.format(shortestLen))
            path_l.append(path)
        i += 1
    return path_l

In [25]:
cyp3a4_paths = k_simple_paths(nx_graph, napdi.camellia_sinensis_leaf, obo.PR_P08684, 10, 4)

[info] applying next operator to search for a simple path of max length 24
[info] Simple path found of length 25
[info] Simple path length greater than shortest path length (4) so adding to results
[info] applying next operator to search for a simple path of max length 24
[info] Simple path found of length 25
[info] Simple path length greater than shortest path length (4) so adding to results
[info] applying next operator to search for a simple path of max length 24
[info] Simple path found of length 25
[info] Simple path length greater than shortest path length (4) so adding to results
[info] applying next operator to search for a simple path of max length 24
[info] Simple path found of length 25
[info] Simple path length greater than shortest path length (4) so adding to results
[info] applying next operator to search for a simple path of max length 24
[info] Simple path found of length 25
[info] Simple path length greater than shortest path length (4) so adding to results
[info] app

In [179]:
str(obo.PR_P08684).split('/')[-1]

'PR_P08684'

In [43]:
#if returned paths are list
#simple paths with max length 25
save2 = 'greentea_cyp3a4_simple_paths_10.txt'
file_save = open(DIR_OUT+save2, 'w')
source = str(napdi.camellia_sinensis_leaf)
target = str(obo.PR_P08684)
source_label = source
target_label = target
if source in nodeLabels:
    source_label = nodeLabels[source]
if target in nodeLabels:
    target_label = nodeLabels[target]
file_save.write('\n{} - {} Simple Path (cutoff=24):\n'.format(source_label, target_label))
i = 0
for node_list in cyp3a4_paths:
    file_save.write('\nPATH: '+str(i)+'\n')
    path_labels = get_path_labels(node_list)
    for triples in path_labels:
        for item in triples:
            file_save.write(str(item)+' ')
        file_save.write('\n')
    i += 1
file_save.close()

In [75]:
source = str(napdi.camellia_sinensis_leaf)
target = str(obo.CHEBI_6931)
save2 = 'greentea_midazolam_simple_paths_20.txt'
file_save = open(DIR_OUT+save2, 'w')
cyp3a4_edge_paths, cyp3a4_edge_path_labs = k_simple_edge_paths(nx_graph, napdi.camellia_sinensis_leaf, obo.CHEBI_6931, 20, 0)
if source in nodeLabels:
    source_label = nodeLabels[source]
if target in nodeLabels:
    target_label = nodeLabels[target]
file_save.write('\n{} - {} Simple Path (cutoff=20):\n'.format(source_label, target_label))
i = 0
for path_list in cyp3a4_edge_path_labs:
    file_save.write('\nPATH: '+str(i)+'\n')
    for triples in path_list:
        for item in triples:
            file_save.write(str(item)+' ')
        file_save.write('\n')
    i += 1
file_save.close()

[info] applying next operator to search for a simple path of max length 20
[info] Simple path found of length 20
[info] Simple path length greater than shortest path length (0) so adding to results
[info] applying next operator to search for a simple path of max length 20
[info] Simple path found of length 19
[info] Simple path length greater than shortest path length (0) so adding to results
[info] applying next operator to search for a simple path of max length 20
[info] Simple path found of length 20
[info] Simple path length greater than shortest path length (0) so adding to results
[info] applying next operator to search for a simple path of max length 20
[info] Simple path found of length 20
[info] Simple path length greater than shortest path length (0) so adding to results
[info] applying next operator to search for a simple path of max length 20
[info] Simple path found of length 20
[info] Simple path length greater than shortest path length (0) so adding to results
[info] app

In [93]:
#if returned paths are list
#simple paths with max length 25
save3 = 'greentea_midazolam_simple_paths_10.txt'
file_save = open(DIR_OUT+save3, 'w')
source = str(napdi.camellia_sinensis_leaf)
target = str(obo.CHEBI_6931)
source_label = source
target_label = target
if source in nodeLabels:
    source_label = nodeLabels[source]
if target in nodeLabels:
    target_label = nodeLabels[target]
file_save.write('\n{} - {} Simple Path (cutoff=20):\n'.format(source_label, target_label))
i = 0
for node_list in midazolam_paths:
    file_save.write('\nPATH: '+str(i)+'\n')
    path_labels = get_path_labels(node_list)
    for triples in path_labels:
        for item in triples:
            file_save.write(str(item)+' ')
        file_save.write('\n')
    i += 1
file_save.close()

NameError: name 'midazolam_paths' is not defined

In [94]:
##Bidirectional shortest paths
pathx = nx.bidirectional_shortest_path(nx_graph, napdi.camellia_sinensis_leaf, obo.CHEBI_10033)
path_labels = get_path_labels(pathx)
for triples in path_labels:
    print(triples)

['Camellia_sinensis_leaf', 'has component', 'gallocatechin', '']
['gallocatechin', 'directly negatively regulates activity of', 'Tea', 'machine_read']
['Tea', 'directly negatively regulates activity of', 'warfarin', 'machine_read']


In [95]:
pathx = nx.bidirectional_shortest_path(nx_graph, napdi.camellia_sinensis_leaf, obo.PR_P08684)
path_labels = get_path_labels(pathx)
for triples in path_labels:
    print(triples)

['Camellia_sinensis_leaf', 'has component', '(-)-epicatechin', '']
['(-)-epicatechin', 'negatively regulates', 'etoposide', 'machine_read']
['etoposide', 'interacts with', 'cytochrome P450 3A4 (human)', '']


In [96]:
pathx = nx.bidirectional_shortest_path(nx_graph, napdi.camellia_sinensis_leaf, obo.CHEBI_6931)
path_labels = get_path_labels(pathx)
for triples in path_labels:
    print(triples)

['Camellia_sinensis_leaf', 'has component', '(-)-epicatechin', '']
['(-)-epicatechin', 'IncreaseAmount', 'taurochenodeoxycholate 6alpha-hydroxylase activity', 'machine_read']
['taurochenodeoxycholate 6alpha-hydroxylase activity', 'molecularly interacts with', 'midazolam', 'machine_read']


In [97]:
pathx = nx.bidirectional_shortest_path(nx_graph, napdi.camellia_sinensis_leaf, obo.HP_0003418)
path_labels = get_path_labels(pathx)
for triples in path_labels:
    print(triples)

['Camellia_sinensis_leaf', 'has component', '(-)-epicatechin', '']
['(-)-epicatechin', 'subClassOf', 'catechin', '']
['catechin', 'molecularly interacts with', 'daunorubicin', 'machine_read']
['daunorubicin', 'is substance that treats', 'Back pain', '']


In [98]:
pathx = nx.bidirectional_shortest_path(nx_graph, napdi.camellia_sinensis_leaf, obo.CHEBI_9150)
path_labels = get_path_labels(pathx)
for triples in path_labels:
    print(triples)

['Camellia_sinensis_leaf', 'has component', '(-)-epicatechin', '']
['(-)-epicatechin', 'subClassOf', 'catechin', '']
['catechin', 'molecularly interacts with', 'simvastatin', 'machine_read']


In [99]:
pathx = nx.bidirectional_shortest_path(nx_graph, napdi.camellia_sinensis_leaf, obo.CHEBI_7444)
path_labels = get_path_labels(pathx)
for triples in path_labels:
    print(triples)

['Camellia_sinensis_leaf', 'has component', 'gallocatechin', '']
['gallocatechin', 'directly negatively regulates activity of', 'Tea', 'machine_read']
['Tea', 'directly negatively regulates activity of', 'nadolol', 'machine_read']


KRATOM

In [100]:
kratom = napdi.mitragyna_speciosa
mitragynine = obo.CHEBI_6956
hydroxy_mitragynine = napdi['7_hydroxy_mitragynine']
hydroxy_mitragynine

rdflib.term.URIRef('http://napdi.org/napdi_srs_imports:7_hydroxy_mitragynine')

In [101]:
pathx = nx.bidirectional_shortest_path(nx_graph, kratom, obo.PR_P08684)
path_labels = get_path_labels(pathx)
for triples in path_labels:
    print(triples)

['Mitragyna_speciosa', 'has component', 'Mitragynine', '']
['Mitragynine', 'interacts with', 'cytochrome P450 3A4 (human)', '']


In [22]:
for key in nodeLabels:
    print(key)
    print(nodeLabels[key])
    break

https://uswest.ensembl.org/Homo_sapiens/Transcript/Summary?t=ENST00000558233
IPO4-205


### Path searches with MR nodes as end points - predications with highest belief scores
1. Get MR predications with belief scores > 0.65
2. Use subject and object nodes as start and end points for simple path searches (shortest path would just be direct link between the nodes)

In [67]:
df = pd.read_csv('../machine_read/greentea_pmid_all_predicates_umls_processed.tsv', sep='\t')
df.head()

Unnamed: 0,subject_cui,subject_name,subject_source,predicate,object_source,object_cui,object_name,subj_reach_grounding,obj_reach_grounding,pmid,pub_year,belief,predicate_obo,subject_obo,object_obo
0,C0017337,Genes,gene encoding SERCA2a,Acetylation,Histone_H3,C0019652,Histones,"(None, None)","('FPLX', 'Histone_H3')",30286210,2018,0.65,http://purl.obolibrary.org/obo/GO_0006473,http://purl.obolibrary.org/obo/SO_0000704,http://purl.obolibrary.org/obo/PR_000041244
1,C1418880,PRDX2_gene,trichostatin A,Acetylation,Tubulin,C0041348,Tubulin,"('CHEBI', 'CHEBI:46024')","('FPLX', 'Tubulin')",25680958,2015 Apr,0.65,http://purl.obolibrary.org/obo/GO_0006473,http://purl.obolibrary.org/obo/CHEBI_46024,http://purl.obolibrary.org/obo/PR_000028799
2,C0059438,epigallocatechin_gallate,--epigallocatechin 3-gallate,Acetylation,Histone,C0019652,Histones,"('CHEBI', 'CHEBI:4806')","('FPLX', 'Histone')",23210776,2013,0.65,http://purl.obolibrary.org/obo/GO_0006473,http://purl.obolibrary.org/obo/CHEBI_4806,http://purl.obolibrary.org/obo/PR_000041244
3,C0073591,rosoxacin,ROS1,Acetylation,TMPRSS11D,C0444765,Hat_-_Headwear,"('HGNC', '10261')","('HGNC', '24059')",25847253,2015,0.65,http://purl.obolibrary.org/obo/GO_0006473,http://purl.obolibrary.org/obo/CHEBI_131715,http://purl.obolibrary.org/obo/PR_Q9M2N5
4,C3539643,EIF4E_wt_Allele,CREBBP,Acetylation,RELA,C1453853,"WNK1_protein,_human","('HGNC', '2348')","('HGNC', '9955')",25847253,2015,0.86,http://purl.obolibrary.org/obo/GO_0006473,http://purl.obolibrary.org/obo/PR_P63074,http://purl.obolibrary.org/obo/PR_000017431


In [68]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7677 entries, 0 to 7676
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   subject_cui           7677 non-null   object 
 1   subject_name          7677 non-null   object 
 2   subject_source        7677 non-null   object 
 3   predicate             7677 non-null   object 
 4   object_source         7677 non-null   object 
 5   object_cui            7677 non-null   object 
 6   object_name           7677 non-null   object 
 7   subj_reach_grounding  7677 non-null   object 
 8   obj_reach_grounding   7677 non-null   object 
 9   pmid                  7677 non-null   int64  
 10  pub_year              7677 non-null   object 
 11  belief                7677 non-null   float64
 12  predicate_obo         7677 non-null   object 
 13  subject_obo           7677 non-null   object 
 14  object_obo            7677 non-null   object 
dtypes: float64(1), int64(

In [70]:
df = df.loc[df['belief'] > 0.8]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2012 entries, 4 to 7675
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   subject_cui           2012 non-null   object 
 1   subject_name          2012 non-null   object 
 2   subject_source        2012 non-null   object 
 3   predicate             2012 non-null   object 
 4   object_source         2012 non-null   object 
 5   object_cui            2012 non-null   object 
 6   object_name           2012 non-null   object 
 7   subj_reach_grounding  2012 non-null   object 
 8   obj_reach_grounding   2012 non-null   object 
 9   pmid                  2012 non-null   int64  
 10  pub_year              2012 non-null   object 
 11  belief                2012 non-null   float64
 12  predicate_obo         2012 non-null   object 
 13  subject_obo           2012 non-null   object 
 14  object_obo            2012 non-null   object 
dtypes: float64(1), int64(

In [71]:
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,subject_cui,subject_name,subject_source,predicate,object_source,object_cui,object_name,subj_reach_grounding,obj_reach_grounding,pmid,pub_year,belief,predicate_obo,subject_obo,object_obo
0,C3539643,EIF4E_wt_Allele,CREBBP,Acetylation,RELA,C1453853,"WNK1_protein,_human","('HGNC', '2348')","('HGNC', '9955')",25847253,2015,0.86,http://purl.obolibrary.org/obo/GO_0006473,http://purl.obolibrary.org/obo/PR_P63074,http://purl.obolibrary.org/obo/PR_000017431
1,C1530358,"EP300_protein,_human",EP300,Acetylation,RELA,C1453853,"WNK1_protein,_human","('HGNC', '3373')","('HGNC', '9955')",25847253,2015,0.86,http://purl.obolibrary.org/obo/GO_0006473,http://purl.obolibrary.org/obo/PR_000007102,http://purl.obolibrary.org/obo/PR_000017431
2,C1843013,"Alzheimer_disease,_familial,_type_3",AD,Activation,long-term synaptic potentiation,C0206249,Long-Term_Potentiation,"(None, None)","('GO', 'GO:0060291')",29944861,2018 Aug,0.923,http://purl.obolibrary.org/obo/RO_0002436,http://purl.obolibrary.org/obo/MONDO_0100087,http://purl.obolibrary.org/obo/GO_0060291
3,C0025918,"Mice,_Inbred_AKR",AKR,Activation,secondary alcohol,C0001962,Ethanol,"(None, None)","('CHEBI', 'CHEBI:35681')",28283780,2017 Jun,0.86,http://purl.obolibrary.org/obo/RO_0002436,http://purl.obolibrary.org/obo/NCBITaxon_10088,http://purl.obolibrary.org/obo/CHEBI_16236
4,C3814396,CYP3A_Gene_Locus,CYP3A,Activation,metabolic process,C0025520,metabolic_aspects,"(None, None)","('GO', 'GO:0008152')",29368187,2018 May,0.949271,http://purl.obolibrary.org/obo/RO_0002436,http://purl.obolibrary.org/obo/CHEBI_38559,http://purl.obolibrary.org/obo/GO_0008152


In [72]:
df = df.sort_values(by=['belief'], ascending=False)
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,subject_cui,subject_name,subject_source,predicate,object_source,object_cui,object_name,subj_reach_grounding,obj_reach_grounding,pmid,pub_year,belief,predicate_obo,subject_obo,object_obo
0,C0596577,Flavonoids,flavonoids,Activation,apoptotic process,C0162638,Apoptosis,"('CHEBI', 'CHEBI:72544')","('GO', 'GO:0006915')",22830339,2012,0.95,http://purl.obolibrary.org/obo/RO_0002436,http://purl.obolibrary.org/obo/CHEBI_72544,http://purl.obolibrary.org/obo/GO_0006915
1,C0059438,epigallocatechin_gallate,--epigallocatechin 3-gallate,Inhibition,Neoplasms,C0027651,Neoplasms,"('CHEBI', 'CHEBI:4806')","('MESH', 'D009369')",29137307,2017 Oct 10,0.95,http://purl.obolibrary.org/obo/RO_0002449,http://purl.obolibrary.org/obo/CHEBI_4806,http://purl.obolibrary.org/obo/HP_0002664
2,C0016979,Gallic_acid,gallic acid,Activation,Mice,C0026809,Mus,"('CHEBI', 'CHEBI:30778')","('MESH', 'D051379')",24722818,2014 Jul,0.95,http://purl.obolibrary.org/obo/RO_0002436,http://purl.obolibrary.org/obo/CHEBI_30778,http://purl.obolibrary.org/obo/NCBITaxon_10088
3,C0016993,Gambia,gallic acid,Activation,apoptotic process,C0162638,Apoptosis,"('CHEBI', 'CHEBI:30778')","('GO', 'GO:0006915')",26251571,2015,0.95,http://purl.obolibrary.org/obo/RO_0002436,http://purl.obolibrary.org/obo/CHEBI_30778,http://purl.obolibrary.org/obo/GO_0006915
4,C0071649,polyphenols,polyphenol,Inhibition,glucose transmembrane transport,C0178666,glucose_transport,"('CHEBI', 'CHEBI:26195')","('GO', 'GO:1904659')",30667442,2019 Feb 20,0.95,http://purl.obolibrary.org/obo/RO_0002449,http://purl.obolibrary.org/obo/CHEBI_26195,http://purl.obolibrary.org/obo/GO_1904659


In [58]:
df.to_csv('../machine_read/MR_triples_searchpath.tsv', sep='\t', index=False)

In [None]:
#Subject-Object pairs (testing paths) for triples from machine reading with belief scores > 0.8
'''
1. catechin (CHEBI_90) -> ABCB1 (), biosynthetic process (), transport (), apoptotic process (), coronary disease (),
cholesterol (), myocardial eschemia, cisplatin, heart disease, glucose, glucose import, glucose metabolic process,
hyperglycemia, intestinal absorption
2. epigallocatechin gallate (CHEBI_4806) -> quinone, paracetamol, Endoplasmic Reticulum Stress, ATP, ATPase, autophagy, bile acid,
transport, cell death, cholesterol, cisplatin, dexamethasone, diclofenac, digoxin, dopamine, drug metabolic process, 
erythromycin, glutathione, heart failure, hemolysis, angiotensin-2, cortisol, insulin secretion, insulin resistance,
liver failure, nadolol, obesity, quercetin, tamoxifen, verapamil
3. greentea -> atorvastatin, rosuvastatin, benzo[a]pyrene, cardiovascular disease, stroke, cholesterol,
Myocardial Ischemia, Coronary Disease, Diabetes Mellitus, diclofenac, digoxin, doxorubicin, hypertension, liver disease,
nadolol, obesity, warfarin, glucose import, glutathione
EXTENDED LISTS BELOW
'''

In [95]:
catechin_list = ['ABCB1_gene', 'Anabolism', 'Biological_Transport', 'Apoptosis', 'Cell_Proliferation', 
                 'Coronary_Arteriosclerosis', 'Cholesterol', 'Cytochrome_P-450_CYP1A1', 'Cytochrome_P-450_CYP1A2',
                 'Cytochrome_P-450_CYP3A4', 'Insulin_Secretion', 'Cisplatin', 'Heart_Diseases', 'Glucose', 
                 'glucose_uptake', 'glucose_transport', 'Hyperglycemia', 'Obesity', 'P-Glycoprotein',
                'UGT1A1_gene', 'Weight_decreased']
egcg_list = ['1,4-benzoquinone', 'ABCA1_gene', 'Acetaminophen', 'Adenosine_Triphosphatases', 'Autophagy',
            'Bile_Acids', 'Bilirubin', 'Biological_Transport', 'Body_Weight', 'BRCA1_protein,_human',
             'Cell_Death', 'Cell_Proliferation', 'Cholesterol', 'Cisplatin', 'Collagen', 'Coronary_Arteriosclerosis',
             'Cytochrome_P-450_CYP1A1', 'Cytochrome_P-450_CYP1A2', 'Cytochrome_P-450_CYP3A4', 'Cytochrome_P-450_CYP2D6',
             'Cytochrome_P-450_CYP2C19', 'drug_metabolism', 'Dexamethasone', 'Diclofenac', 'Digoxin', 'Dopamine', 
             'GA-Binding_Protein_Transcription_Factor', 'Gluconeogenesis', 'Glucose_Transporter', 
             'glucose_transport', 'glucose_uptake', 'Glutathione', 'Glycogen', 'Erythromycin',  'Heart_failure', 
             'Hemolysis_(disorder)', 'Inflammation', 'Hydrocortisone',
             'Interleukin-1', 'Interleukin-6', 'Intestinal_Absorption', 'rosoxacin', 'UGT1A1_gene',
              'Insulin_Secretion', 'Insulin_Resistance', 'Liver_Failure',
             'Nadolol', 'Obesity', 'Quercetin', 'Tamoxifen', 'Verapamil']
greentea_list = ['ABCB1_gene', 'ABCG2_gene', 'Acetaminophen', 'Biological_Transport', 'Cardiovascular_Diseases',
            'Cerebrovascular_accident', 'Coronary_Arteriosclerosis', 'atorvastatin',  'Benzopyrenes', 'Cholesterol',
            'Cytochrome_P450', 'Cytochromes', 'Cytochrome_P-450_CYP1A1', 'Cytochrome_P-450_CYP1A2',
            'Cytochrome_P-450_CYP3A4', 'Diabetes_Mellitus', 'Diclofenac', 'Digoxin', 'Doxorubicin', 'glucose_transport',
            'Hypertensive_disease', 'Hay_fever', 'Interleukin-10', 'Lipid_Metabolism', 'Liver_diseases', 
            'Low-Density_Lipoproteins', 'Nadolol', 'Obesity', 'glucose_uptake', 'Glutathione',
            'SLC2A1_protein,_human', 'SLC5A1_gene', 'SLCO1A2_gene', 'SLCO2B1_gene', 'Warfarin',
            'rosuvastatin', 'rosoxacin', 'TNFSF11_protein,_human', 'TRPA1_gene', 'TRPV1_gene']


In [96]:
#get OBO identifiers from dataframe
node_dict = {}
for item in catechin_list:
    if item not in node_dict:
        print(item)
        obo_id = df.loc[df['object_name'] == item]['object_obo'].values[0]
        node_dict[item] = obo_id.split('/')[-1]
for item in egcg_list:
    if item not in node_dict:
        print(item)
        obo_id = df.loc[df['object_name'] == item]['object_obo'].values[0]
        node_dict[item] = obo_id.split('/')[-1]
for item in tea_list:
    if item not in node_dict:
        print(item)
        obo_id = df.loc[df['object_name'] == item]['object_obo'].values[0]
        node_dict[item] = obo_id.split('/')[-1]
len(node_dict)


ABCB1_gene
Anabolism
Biological_Transport
Apoptosis
Cell_Proliferation
Coronary_Arteriosclerosis
Cholesterol
Cytochrome_P-450_CYP1A1
Cytochrome_P-450_CYP1A2
Cytochrome_P-450_CYP3A4
Insulin_Secretion
Cisplatin
Heart_Diseases
Glucose
glucose_uptake
glucose_transport
Hyperglycemia
Obesity
P-Glycoprotein
UGT1A1_gene
Weight_decreased
1,4-benzoquinone
ABCA1_gene
Acetaminophen
Adenosine_Triphosphatases
Autophagy
Bile_Acids
Bilirubin
Body_Weight
BRCA1_protein,_human
Cell_Death
Collagen
Cytochrome_P-450_CYP2D6
Cytochrome_P-450_CYP2C19
drug_metabolism
Dexamethasone
Diclofenac
Digoxin
Dopamine
GA-Binding_Protein_Transcription_Factor
Gluconeogenesis
Glucose_Transporter
Glutathione
Glycogen
Erythromycin
Heart_failure
Hemolysis_(disorder)
Inflammation
Hydrocortisone
Interleukin-1
Interleukin-6
Intestinal_Absorption
rosoxacin
Insulin_Resistance
Liver_Failure
Nadolol
Quercetin
Tamoxifen
Verapamil
ABCG2_gene
Cardiovascular_Diseases
Cerebrovascular_accident
atorvastatin
Benzopyrenes
Cytochrome_P450
Cyto

83

In [122]:
node_dict['Hyperglycemia']

'HP_0003074'

In [35]:
x = zip(cyp3a4_edge_paths, cyp3a4_edge_path_labs)

In [38]:
for item in x:
    print(item)
    print(type(item))
    break

([(rdflib.term.URIRef('http://purl.obolibrary.org/obo/CHEBI_23053'), rdflib.term.URIRef('http://purl.obolibrary.org/obo/GO_0018130'), rdflib.term.URIRef('http://purl.obolibrary.org/obo/RO_0002436')), (rdflib.term.URIRef('http://purl.obolibrary.org/obo/GO_0018130'), rdflib.term.URIRef('http://purl.obolibrary.org/obo/GO_0044249'), rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#subClassOf')), (rdflib.term.URIRef('http://purl.obolibrary.org/obo/GO_0044249'), rdflib.term.URIRef('http://purl.obolibrary.org/obo/GO_0009058'), rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#subClassOf')), (rdflib.term.URIRef('http://purl.obolibrary.org/obo/GO_0009058'), rdflib.term.URIRef('http://purl.obolibrary.org/obo/PR_P17735'), rdflib.term.URIRef('http://purl.obolibrary.org/obo/RO_0000057')), (rdflib.term.URIRef('http://purl.obolibrary.org/obo/PR_P17735'), rdflib.term.URIRef('https://uswest.ensembl.org/Homo_sapiens/Transcript/Summary?t=ENST00000355962'), rdflib.term.URIRef('http://purl.o

In [61]:
from typing import Dict, List, Optional, Set, Tuple, Union

def n3(node: Union[URIRef, BNode, Literal]) -> str:
    """Method takes an RDFLib node of type BNode, URIRef, or Literal and serializes it to meet the RDF 1.1 NTriples
    format.
    Src: https://github.com/RDFLib/rdflib/blob/c11f7b503b50b7c3cdeec0f36261fa09b0615380/rdflib/plugins/serializers/nt.py
    Args:
        node: An RDFLib
    Returns:
        serialized_node: A string containing the serialized
    """
    if isinstance(node, Literal): serialized_node = "%s" % _quoteLiteral(node)
    else: serialized_node = "%s" % node.n3()
    return serialized_node

In [59]:
s = URIRef('http://napdi.org/napdi_srs_imports:camellia_sinensis_leaf')
p = URIRef('http://purl.obolibrary.org/obo/RO_0002180')
o = URIRef('http://purl.obolibrary.org/obo/CHEBI_90')

In [62]:
pred_key = hashlib.md5('{}{}{}'.format(n3(s), n3(p), n3(o)).encode()).hexdigest()

In [63]:
pred_key

'65161e94646ef7334785bf7ac25257be'

In [71]:
nx_graph[s][o][p]

{'predicate_key': '65161e94646ef7334785bf7ac25257be', 'weight': 0.0}

### Fix nodeLabels in file from build 3.0.0

In [48]:
import json
import pickle

In [49]:
import re

In [50]:
len(nodeLabels)

753217

In [108]:
file1 = open(KG_PATH + 'nodeLabels_20211014.pickle', 'rb')
nods = pickle.load(file1)

In [109]:
nods['http://purl.obolibrary.org/obo/PR_Q9H9S0']

'homeobox protein NANOG (human)'

In [110]:
nods['http://purl.obolibrary.org/obo/PR_Q9H9S0']

'homeobox protein NANOG (human)'

In [111]:
nods['http://purl.obolibrary.org/obo/RO_0000057'] = 'has participant'

In [112]:
fileo = open(KG_PATH + 'nodeLabels_20211021.pickle', 'wb')


In [52]:
for key in correctLabels:
    print(key)
    print(correctLabels[key])
    break

<http://purl.obolibrary.org/obo/VO_0002752>
{'label': 'E2 from Western equine encephalomyelitis virus', 'description/definition': 'N/A'}


In [53]:
for key in correctLabels:
    node = key.strip('<')
    node = node.strip('>')
    newLabel = correctLabels[key]['label']
    if node in nods:
        if newLabel != 'N/A':
            nods[node] = newLabel
len(nods)

753217

In [54]:
nods['http://purl.obolibrary.org/obo/SO_0000704']

'gene'

In [56]:
nods['http://purl.obolibrary.org/obo/PR_Q9H9S0']

'homeobox protein NANOG (human)'

In [63]:
nas[:20]

['<http://purl.obolibrary.org/obo/PR_Q6V1P9>',
 '<http://purl.obolibrary.org/obo/PR_Q9NZV6-1>',
 '<http://purl.obolibrary.org/obo/PR_Q9GZL7>',
 '<http://purl.obolibrary.org/obo/PR_O76083-5>',
 '<http://purl.obolibrary.org/obo/PR_A0A087WT02>',
 '<http://purl.obolibrary.org/obo/PR_A0AVF1-1>',
 '<http://purl.obolibrary.org/obo/PR_Q12968-4>',
 '<http://purl.obolibrary.org/obo/PR_Q9HBI1-2>',
 '<http://purl.obolibrary.org/obo/PR_Q86SZ2>',
 '<http://purl.obolibrary.org/obo/PR_Q5HYK9>',
 '<http://purl.obolibrary.org/obo/PR_Q9H2P9-4>',
 '<http://purl.obolibrary.org/obo/PR_O60381-1>',
 '<http://purl.obolibrary.org/obo/PR_Q16667>',
 '<http://purl.obolibrary.org/obo/PR_A2NJV5>',
 '<http://purl.obolibrary.org/obo/PR_Q6P461-3>',
 '<http://purl.obolibrary.org/obo/PR_Q8TDC0>',
 '<http://purl.obolibrary.org/obo/PR_000054919>',
 '<http://purl.obolibrary.org/obo/PR_P53667-4>',
 '<http://purl.obolibrary.org/obo/PR_O14647-1>',
 '<http://purl.obolibrary.org/obo/PR_Q6ZSI9>']

In [58]:
nas = []
for key in correctLabels:
    label = correctLabels[key]['label']
    if label == 'N/A':
        nas.append(key)
len(nas)

54903

In [62]:
count = 1
for item in nas:
    if 'PR' in item:
        count += 1
count

54844

In [113]:
pickle.dump(nods, fileo)

## Reweighting the KG

1. Fix subclassof chemical entity
2. maybe downweight subclassof
3. use belief scores of MR to weight?
4. centrality measures - node degree centrality
5. Fix mapping to TEA in REACH/SemRep (maps to triethylamine CHEBI_35026)

In [None]:
#INDRA pathfinding module also searches in nx multidigraph 
#https://indra.readthedocs.io/en/latest/modules/explanation/pathfinding.html
#uses belief in metadata (I think)