## Merge ontology-grounded and literature-based graphs

In [1]:
import pandas as pd
import numpy as np
import pickle
from rdflib.namespace import OWL, RDF, RDFS
import os
import pickle, json

#Create networkx graph from triples
import glob
import hashlib
import json
import networkx as nx  # type: ignore

from rdflib import BNode, Graph, Literal, Namespace, URIRef  # type: ignore
from rdflib.namespace import OWL, RDF, RDFS  # type: ignore
from rdflib.plugins.serializers.nt import _quoteLiteral  # type: ignore

from tqdm import tqdm  # type: ignore

In [19]:
def get_stats(g):
    """Print basic size statistics for a NetworkX graph.

    Prints the node count, the edge count, the density reported by
    ``nx.density`` and the edges-per-node ratio, which this notebook labels
    "Average degree".

    Args:
        g: A NetworkX graph.

    Returns:
        None. The statistics are only printed.
    """
    nodes = nx.number_of_nodes(g)
    edges = nx.number_of_edges(g)
    density = nx.density(g)
    # An empty graph has zero nodes; report 0.0 rather than raising
    # ZeroDivisionError.
    avg_deg = float(edges) / nodes if nodes else 0.0
    print('Number of nodes: ', nodes)
    print('Number of edges: ', edges)
    print('Density of graph: ', density)
    print('Average degree: ', avg_deg)

In [30]:
# Directory used for the PheKnowLator input files and for the merged outputs
# written by this notebook.
KG_PATH = '../resources/knowledge_graphs/'
# Directory of the machine-read (literature) graph files; used for the .nt
# input in the "Merge serialized graphs" section.
MR_KG_PATH = '../literature-graphs/output_graphs/'

### Merge gpickle graphs

Load the merged machine-read graph, the PheKnowLator KG, and the node-label files.

In [12]:
# KG_PATH repeats the value assigned in the previous cell; MR_PATH holds the
# same directory string as MR_KG_PATH.
KG_PATH = '../resources/knowledge_graphs/'
MR_PATH = '../literature-graphs/output_graphs/'
# File names of the two pickled NetworkX graphs that are merged below.
KG_NAME = 'PheKnowLator_v3.1.0_full_instance_inverseRelations_OWLNETS_NetworkxMultiDiGraph.gpickle'
MR_KG_NAME = 'machine_read_merged_v2.0.0.gpickle'

In [5]:
# Node-label files: a tab-separated text file for PheKnowLator (read from
# KG_PATH) and a pickle for the machine-read graph (read from MR_PATH).
NodeLabelsFilePL = 'PheKnowLator_v3.1.0_full_instance_inverseRelations_OWLNETS_NodeLabels.txt'
NodeLabelsFileMR = 'machine_read_merged_v2.0.0_NodeLabels.pickle'

In [6]:
# Load the tab-separated PheKnowLator node-label table and preview the first rows.
nodespl = pd.read_csv(KG_PATH+NodeLabelsFilePL, sep='\t')
nodespl.head()

Unnamed: 0,entity_type,integer_id,entity_uri,label,description/definition,synonym
0,NODES,1000933,<https://uswest.ensembl.org/Homo_sapiens/Trans...,THEMIS2-207,Transcript THEMIS2-207 is classified as type '...,
1,NODES,908489,<http://purl.obolibrary.org/obo/CHEBI_177842>,nostocyclamide,A keratan 6'-sulfate that has formula C20H22N6...,"(4S,18R)-4,7-dimethyl-18-propan-2-yl-6-oxa-13,..."
2,NODES,511717,<http://purl.obolibrary.org/obo/PR_Q93V61>,phospholipase A(1) LCAT3 (Arabidopsis thaliana),A protein that is a translation product of the...,At-LCAT3|LCAT3|T21P5.27|lecithin-cholesterol a...
3,NODES,459007,<https://www.ncbi.nlm.nih.gov/snp/rs771399009>,NM_003995.4(NPR2):c.2351G>A (p.Gly784Asp),This variant is a germline single nucleotide v...,
4,NODES,1039541,<http://purl.obolibrary.org/obo/PR_P97792-1>,coxsackievirus and adenovirus receptor homolog...,A coxsackievirus and adenovirus receptor isofo...,mCXADR/iso:1


In [7]:
# Show rows whose entity_uri is missing (the output below is empty).
nodespl.loc[nodespl['entity_uri'].isna()]

Unnamed: 0,entity_type,integer_id,entity_uri,label,description/definition,synonym


In [8]:
# (rows, columns) of the PheKnowLator node-label table.
nodespl.shape

(1087706, 6)

In [9]:
# Build a mapping from entity URI (angle brackets removed) to label.
# Only the label is stored; entity_type is not kept.
nodespldict = {}
# Iterate over the two columns directly. Positional `.at[i, ...]` lookups over
# range(len(index)) only work when the frame has a default 0..n-1 index and
# are slow for a table of this size.
for entity_uri, nodelabel in zip(nodespl['entity_uri'], nodespl['label']):
    # Skip non-string values (e.g. NaN for a missing URI).
    if not isinstance(entity_uri, str):
        continue
    entity_uri = entity_uri.replace('<', '').replace('>', '')
    # The first label seen for a URI wins; later duplicates are ignored.
    nodespldict.setdefault(entity_uri, nodelabel)
len(nodespldict)

1087706

In [10]:
# Print a single URI/label pair as a quick sanity check of the mapping.
for uri, label in nodespldict.items():
    print(uri, label)
    break

https://uswest.ensembl.org/Homo_sapiens/Transcript/Summary?t=ENST00000456990 THEMIS2-207


In [10]:
##fix foreign characters with bad_node_patch.json - not needed in v3.1.0
#with open(KG_PATH+'bad_node_patch.json') as f:
#    bad_node_patch = json.load(f)
#len(bad_node_patch)
# for key in bad_node_patch:
#    newkey = key.replace('<','').replace('>','')
#    if newkey in nodespldict:
#        print(newkey)
#        print(bad_node_patch[key])
#        print(nodespldict[newkey])
#        break
##fix foreign characters in nodespldict with bad_node_patch.json
#count = 0
#for key in bad_node_patch:
#    newkey = key.replace('<','').replace('>','')
#    if newkey in nodespldict:
#        nodespldict[newkey] = bad_node_patch[key]['label']
#        count += 1
#print(count)

387263

In [13]:
import pickle
# Load the machine-read node labels.
# NOTE: unpickling can execute arbitrary code; only load files produced by
# this project.
with open(MR_PATH + NodeLabelsFileMR, 'rb') as labels_file:
    nodesmr = pickle.load(labels_file)
len(nodesmr)

8758

In [14]:
# Print a single entry to show the structure of the machine-read label records.
for uri, attrs in nodesmr.items():
    print(uri, attrs)
    break

http://purl.obolibrary.org/obo/CHEBI_38559 {'entity_type': 'NODES', 'label': 'Cytochrome P450', 'cui': 'C0010762'}


In [15]:
##combine node labels from both KGs
# Add machine-read labels only for URIs that do not already have a
# PheKnowLator label; existing entries are left untouched.
nodespldict.update({
    uri: attrs['label']
    for uri, attrs in nodesmr.items()
    if uri not in nodespldict
})
len(nodespldict)

1088248

In [16]:
# Persist the combined URI -> label mapping next to the knowledge-graph files.
with open(KG_PATH + 'nodeLabels_v2.0.0.pickle', 'wb') as labels_out:
    pickle.dump(nodespldict, labels_out)

In [17]:
# Alias for the combined label mapping (same dict object, not a copy).
nodeLabels = nodespldict

In [18]:
##read in PL graph
# nx.read_gpickle was removed in NetworkX 3.0. A .gpickle file is a plain
# pickle of the graph object, so load it with the pickle module directly;
# this works with both old and new NetworkX releases.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open(KG_PATH+KG_NAME, 'rb') as graph_file:
    pkl = pickle.load(graph_file)

In [20]:
# Reference values recorded for v1.0.1 (presumably in get_stats order: nodes,
# edges, density, average degree - confirm):
#v1.0.1: 980240, 7765868, 8.082125806864853e-06, 7.922414918795397
get_stats(pkl)

Number of nodes:  1089614
Number of edges:  7632027
Density of graph:  6.428282398477236e-06
Average degree:  7.004340069051977


In [21]:
# Load the machine-read graph. nx.read_gpickle was removed in NetworkX 3.0;
# a .gpickle file is a plain pickle, so use the pickle module directly.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open(MR_PATH+MR_KG_NAME, 'rb') as graph_file:
    mrkg = pickle.load(graph_file)

In [22]:
# Size statistics of the machine-read graph before merging.
get_stats(mrkg)

Number of nodes:  8782
Number of edges:  84569
Density of graph:  0.0010966645002845241
Average degree:  9.629810976998407


In [23]:
##combine graphs
# Compose the PheKnowLator and machine-read graphs into one graph. The merged
# counts printed in the next cell are smaller than the sum of the two inputs,
# i.e. nodes and edges present in both graphs are not duplicated.
nxgraph = nx.compose_all([pkl, mrkg])

In [24]:
# Size statistics of the merged graph.
get_stats(nxgraph)

Number of nodes:  1090173
Number of edges:  7716284
Density of graph:  6.492586709514752e-06
Average degree:  7.078036238285116


In [25]:
##save graph
OUTFILENAME = 'NP-KG_v2.0.0.gpickle'
# nx.write_gpickle was removed in NetworkX 3.0. It pickled the graph with the
# highest protocol, so do the same with the pickle module directly.
with open(KG_PATH+OUTFILENAME, 'wb') as graph_out:
    pickle.dump(nxgraph, graph_out, pickle.HIGHEST_PROTOCOL)

### Merge serialized graphs

In [32]:
# Input .nt files for the two graphs and the path of the merged .nt output.
# mr_file is built from MR_KG_PATH (the same directory string as MR_PATH).
pl_file = KG_PATH+'PheKnowLator_v3.1.0_full_instance_inverseRelations_OWLNETS.nt'
mr_file = MR_KG_PATH+'machine_read_merged_with_closure_v2.0.0.nt'
outfile_merged = KG_PATH+'NP-KG_v2.0.0.nt'

In [27]:
# Read the PheKnowLator triples into a list, one element per line.
# N-Triples is UTF-8; give the encoding explicitly so the read does not depend
# on the platform's default locale encoding.
with open(pl_file, 'r', encoding='utf-8') as filep:
    g = filep.read()
pl = g.split('\n')

In [28]:
# Number of elements after splitting on newlines (one more than the number of
# newline characters in the file).
len(pl)

7632028

In [33]:
# Read the machine-read triples into a list, one element per line.
# N-Triples is UTF-8; give the encoding explicitly so the read does not depend
# on the platform's default locale encoding.
with open(mr_file, 'r', encoding='utf-8') as filem:
    g = filem.read()
mr = g.split('\n')

In [34]:
# Number of elements after splitting the machine-read file on newlines.
len(mr)

107056

In [37]:
# Second-to-last element of the split; presumably the final element is the
# empty string left by a trailing newline - confirm.
mr[-2]

'<http://purl.obolibrary.org/obo/CHEBI_47519> <http://purl.obolibrary.org/obo/RO_0002436> <http://purl.obolibrary.org/obo/CHEBI_6931> .'

In [38]:
##not writing labels to file
# Write every non-empty PheKnowLator line, followed by every non-empty
# machine-read line that does not contain 'label', one per line and without a
# trailing newline. UTF-8 is given explicitly so the output does not depend on
# the platform's default locale encoding.
with open(outfile_merged, 'w', encoding='utf-8') as fileout:
    # Separator written before each line; empty until the first line is out,
    # so the file never starts with a blank line.
    newline = ''
    # Iterate over every element. Blank entries are skipped below, so there is
    # no need to drop the last element, which silently lost the final triple
    # whenever an input file did not end with a newline.
    for triple in pl:
        if triple == '':
            continue
        fileout.write(newline + triple)
        newline = '\n'
    for triple in mr:
        # NOTE(review): this is a substring test, so any line containing the
        # text 'label' anywhere is dropped, not only label predicates -
        # confirm this is the intended filter.
        if triple == '' or 'label' in triple:
            continue
        fileout.write(newline + triple)
        newline = '\n'
# The `with` block closes the file; no explicit close() is needed.

In [None]:
#merged == 21725642 ()
#no labels = 21720940

In [39]:
##check file
# Re-read the merged output to verify its line count. The encoding is given
# explicitly (N-Triples is UTF-8) so the check does not depend on the
# platform's default locale encoding.
with open(outfile_merged, 'r', encoding='utf-8') as filein:
    g = filein.read()
merged = g.split('\n')
len(merged)

7727497

In [40]:
# Last line of the merged file; the output matches mr[-2] shown earlier.
merged[-1]

'<http://purl.obolibrary.org/obo/CHEBI_47519> <http://purl.obolibrary.org/obo/RO_0002436> <http://purl.obolibrary.org/obo/CHEBI_6931> .'

In [41]:
# First line of the merged file.
merged[0]

'<http://purl.obolibrary.org/obo/MONDO_0010777> <http://www.w3.org/2000/01/rdf-schema#subClassOf> <http://purl.obolibrary.org/obo/MONDO_0003847> .'