## Merge ontology-grounded and literature-based graphs

In [1]:
import pandas as pd
import numpy as np
import pickle
from rdflib.namespace import OWL, RDF, RDFS
import os
import pickle, json

#Create networkx graph from triples
import glob
import hashlib
import json
import networkx as nx  # type: ignore

from rdflib import BNode, Graph, Literal, Namespace, URIRef  # type: ignore
from rdflib.namespace import OWL, RDF, RDFS  # type: ignore
from rdflib.plugins.serializers.nt import _quoteLiteral  # type: ignore

from tqdm import tqdm  # type: ignore

In [2]:
def get_stats(g):
    """Print summary statistics for a NetworkX graph.

    Prints the number of nodes, the number of edges, the density reported by
    ``nx.density`` and the edges-per-node ratio (labelled 'Average degree').

    Args:
        g: the NetworkX graph to summarise.

    Returns:
        None. The statistics are written to standard output only.
    """
    nodes = nx.number_of_nodes(g)
    edges = nx.number_of_edges(g)
    density = nx.density(g)
    # An empty graph has no nodes to divide by; report 0.0 instead of raising
    # ZeroDivisionError so the helper can be called on any graph.
    avg_deg = float(edges) / nodes if nodes else 0.0
    print('Number of nodes: ', nodes)
    print('Number of edges: ', edges)
    print('Density of graph: ', density)
    print('Average degree: ', avg_deg)

In [3]:
# Directory prefixes (relative to the notebook). Each ends with '/' because
# file names are appended to them by plain string concatenation.
KG_PATH = "../resources/knowledge_graphs/"            # PheKnowLator KG files and merged outputs
MR_KG_PATH = "../literature-graphs/output_graphs/"    # machine-read graph files

### Merge gpickle graphs

Load the merged machine-read graph, the PheKnowLator KG, and the node-label files.

In [4]:
# Directory prefixes; the trailing '/' is required because file names are
# joined to them with '+'.
KG_PATH = "../resources/knowledge_graphs/"
MR_PATH = "../literature-graphs/output_graphs/"

# Pickled NetworkX graphs to merge: the PheKnowLator KG and the merged machine-read graph.
KG_NAME = "PheKnowLator_v3.1.0_full_instance_inverseRelations_OWLNETS_NetworkxMultiDiGraph.gpickle"
MR_KG_NAME = "machine_read_merged_v2.0.0.gpickle"

In [5]:
# Node-label files that accompany the two graphs:
#   - PheKnowLator labels: tab-separated text file
#   - machine-read labels: pickled dictionary
NodeLabelsFilePL = "PheKnowLator_v3.1.0_full_instance_inverseRelations_OWLNETS_NodeLabels.txt"
NodeLabelsFileMR = "machine_read_merged_v2.0.0_NodeLabels.pickle"

In [6]:
# The PheKnowLator label file is read as raw text lines. Parsing it with
# pd.read_csv(KG_PATH+NodeLabelsFilePL, sep='\t') was tried first but led to
# missing nodes, so the rows are split manually in a later cell instead.
with open(KG_PATH+NodeLabelsFilePL, 'r') as labels_file:
    nodespl = list(labels_file)  # one string per line, newline characters included
len(nodespl)

1089950

In [8]:
# Discard the first line and trim surrounding whitespace (including the
# trailing newline) from every remaining line.
# NOTE(review): the first line is presumably the column header -- confirm against the file.
# NOTE: `nodespl` is rebound to a slice of itself, so this cell is not idempotent:
# every re-run drops one more line. Re-run the file-reading cell first.
nodespl = list(map(str.strip, nodespl[1:]))
len(nodespl)

1089949

In [9]:
# Peek at the first remaining row to check the tab-separated layout.
nodespl[0]

"NODES\t143063\t<https://uswest.ensembl.org/Homo_sapiens/Transcript/Summary?t=ENST00000566435>\tNPW-201\tTranscript NPW-201 is classified as type 'protein_coding'.\tNone"

In [14]:
# Build the URI -> label lookup from the tab-separated rows:
#   key   = third field with every '<' and '>' removed (the entity URI)
#   value = fourth field (the label)
# Rows are processed in order, so a repeated URI keeps the label of its last row.
nodespl_dict = {
    fields[2].replace('>', '').replace('<', ''): fields[3]
    for fields in (row.split('\t') for row in nodespl)
}
len(nodespl_dict)

1089949

In [15]:
# Spot check: print the first (URI, label) pair only.
for uri, label in nodespl_dict.items():
    print(uri, label)
    break

https://uswest.ensembl.org/Homo_sapiens/Transcript/Summary?t=ENST00000566435 NPW-201


In [10]:
##fix foreign characters with bad_node_patch.json - fixed in v3.1.0
#with open(KG_PATH+'bad_node_patch.json') as f:
#    bad_node_patch = json.load(f)
#len(bad_node_patch)
# for key in bad_node_patch:
#    newkey = key.replace('<','').replace('>','')
#    if newkey in nodespldict:
#        print(newkey)
#        print(bad_node_patch[key])
#        print(nodespldict[newkey])
#        break
##fix foreign characters in nodespldict with bad_node_patch.json
#count = 0
#for key in bad_node_patch:
#    newkey = key.replace('<','').replace('>','')
#    if newkey in nodespldict:
#        nodespldict[newkey] = bad_node_patch[key]['label']
#        count += 1
#print(count)

387263

In [16]:
import pickle
# Load the machine-read node labels (a pickled dictionary keyed by node URI).
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(MR_PATH+NodeLabelsFileMR, 'rb') as labels_pickle:
    nodesmr = pickle.load(labels_pickle)
len(nodesmr)

8758

In [17]:
# Spot check: print the first machine-read entry (URI and its metadata) only.
for uri, info in nodesmr.items():
    print(uri, info)
    break

http://purl.obolibrary.org/obo/CHEBI_38559 {'entity_type': 'NODES', 'label': 'Cytochrome P450', 'cui': 'C0010762'}


In [19]:
##combine node labels from both KGs
# Add machine-read labels to the PheKnowLator lookup in place. URIs already
# present keep their PheKnowLator label; only new URIs are added.
for uri, info in nodesmr.items():
    if uri in nodespl_dict:
        continue
    nodespl_dict[uri] = info['label']
len(nodespl_dict)

1090469

In [20]:
# Persist the combined URI -> label dictionary next to the knowledge graphs.
labels_pickle_path = KG_PATH+'nodeLabels_v2.0.0.pickle'
with open(labels_pickle_path, 'wb') as labels_out:
    pickle.dump(nodespl_dict, labels_out)

In [22]:
# Bind the combined label dictionary to the name `nodeLabels`.
# This is an alias, not a copy: both names refer to the same dict object.
nodeLabels = nodespl_dict

In [17]:
##read in PL graph
# nx.read_gpickle was deprecated in NetworkX 2.6 and removed in 3.0. A .gpickle
# file is an ordinary pickle, so loading it with the pickle module works on
# every NetworkX version.
# NOTE: pickle.load can execute arbitrary code; only load graph files from a trusted source.
with open(KG_PATH+KG_NAME, 'rb') as graph_file:
    pkl = pickle.load(graph_file)

In [18]:
# Statistics of the PheKnowLator graph. For comparison, the v1.0.1 values in the
# order printed by get_stats (nodes, edges, density, average degree) were:
#v1.0.1: 980240, 7765868, 8.082125806864853e-06, 7.922414918795397
get_stats(pkl)

Number of nodes:  1089614
Number of edges:  7850306
Density of graph:  6.612133825320617e-06
Average degree:  7.204666973809074


In [19]:
# Load the merged machine-read graph. nx.read_gpickle was deprecated in
# NetworkX 2.6 and removed in 3.0; a .gpickle file is an ordinary pickle, so
# the pickle module is used directly.
# NOTE: pickle.load can execute arbitrary code; only load graph files from a trusted source.
with open(MR_PATH+MR_KG_NAME, 'rb') as graph_file:
    mrkg = pickle.load(graph_file)

In [20]:
# Statistics of the machine-read graph before merging.
get_stats(mrkg)

Number of nodes:  8782
Number of edges:  84569
Density of graph:  0.0010966645002845241
Average degree:  9.629810976998407


In [21]:
##combine graphs
# Compose the PheKnowLator graph (pkl) and the machine-read graph (mrkg) into a
# single graph containing the nodes and edges of both.
# NOTE(review): per the NetworkX documentation, attributes of the later graph
# take precedence for nodes/edges present in both -- confirm this is intended.
nxgraph = nx.compose_all([pkl, mrkg])

In [22]:
# Statistics of the merged graph; compare with the two input graphs above.
get_stats(nxgraph)

Number of nodes:  1090173
Number of edges:  7934518
Density of graph:  6.676211776705675e-06
Average degree:  7.278219145034779


In [23]:
##save graph
OUTFILENAME = 'NP-KG_v2.0.0.gpickle'
# nx.write_gpickle was deprecated in NetworkX 2.6 and removed in 3.0. It wrote a
# plain pickle using pickle.HIGHEST_PROTOCOL by default, so the same file is
# produced here with the pickle module, which works on every NetworkX version.
with open(KG_PATH+OUTFILENAME, 'wb') as graph_file:
    pickle.dump(nxgraph, graph_file, pickle.HIGHEST_PROTOCOL)

### Save node labels as TSV

In [23]:
# Reload the combined URI -> label dictionary saved earlier in this notebook.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(KG_PATH+'nodeLabels_v2.0.0.pickle', 'rb') as labels_pickle:
    nodeLabels = pickle.load(labels_pickle)
len(nodeLabels)

1090469

In [24]:
# Convert the dictionary to a DataFrame with a single 'label' column, indexed by
# the dictionary keys (the entity URIs). The index is turned into a regular
# 'entity_uri' column by the reset_index/rename cells that follow.
nodelabelsdf = pd.DataFrame.from_dict(nodeLabels, orient='index', columns=['label'])
nodelabelsdf.head()

Unnamed: 0,label
https://uswest.ensembl.org/Homo_sapiens/Transcript/Summary?t=ENST00000566435,NPW-201
https://reactome.org/content/detail/R-HSA-381412,IGFBP2 binds IGF forming IGF:IGFBP2
http://purl.obolibrary.org/obo/PR_000034889,uncharacterized HTH-type transcriptional regul...
http://purl.obolibrary.org/obo/PR_000039150,probable ATP-dependent RNA helicase DDX6 isofo...
https://www.ncbi.nlm.nih.gov/snp/rs2048521407,NM_000419.5(ITGA2B):c.2975_2979del (p.Glu992fs)


In [25]:
# Move the URI index into a regular column (named 'index' by reset_index).
# NOTE: this rebinds `nodelabelsdf` to a transformed version of itself, so the
# cell is not idempotent; rebuild the frame from `nodeLabels` before re-running.
nodelabelsdf = nodelabelsdf.reset_index()
nodelabelsdf.head()

Unnamed: 0,index,label
0,https://uswest.ensembl.org/Homo_sapiens/Transc...,NPW-201
1,https://reactome.org/content/detail/R-HSA-381412,IGFBP2 binds IGF forming IGF:IGFBP2
2,http://purl.obolibrary.org/obo/PR_000034889,uncharacterized HTH-type transcriptional regul...
3,http://purl.obolibrary.org/obo/PR_000039150,probable ATP-dependent RNA helicase DDX6 isofo...
4,https://www.ncbi.nlm.nih.gov/snp/rs2048521407,NM_000419.5(ITGA2B):c.2975_2979del (p.Glu992fs)


In [26]:
# Rename the 'index' column produced by reset_index to 'entity_uri', then show
# the column summary to confirm both columns are fully populated.
nodelabelsdf = nodelabelsdf.rename(columns={'index':'entity_uri'})
nodelabelsdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1090469 entries, 0 to 1090468
Data columns (total 2 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   entity_uri  1090469 non-null  object
 1   label       1090469 non-null  object
dtypes: object(2)
memory usage: 16.6+ MB


In [27]:
# Save the labels as a tab-separated file (header row included, no index column).
nodelabelsdf.to_csv(KG_PATH+'nodeLabels_v2.0.0.tsv', sep='\t', index=False)

### Merge serialized graphs
There is an unresolved issue with the counts in the serialized PheKnowLator KG file, so this section has not been run for v2.0.0.

In [24]:
# Paths of the serialized (.nt) graphs: the two inputs and the merged output.
# The machine-read input is the 'with_closure' serialization.
pl_file = KG_PATH+'PheKnowLator_v3.1.0_full_instance_inverseRelations_OWLNETS.nt'
mr_file = MR_KG_PATH+'machine_read_merged_with_closure_v2.0.0.nt'
outfile_merged = KG_PATH+'NP-KG_v2.0.0.nt'

In [25]:
# Read the serialized PheKnowLator graph into a list of lines.
# The text is split on '\n' only, so a file that ends with a newline yields a
# final empty string as the last list element.
with open(pl_file,'r') as pl_handle:
    pl = pl_handle.read().split('\n')

In [26]:
# Number of list elements after splitting on '\n' (includes any empty strings).
len(pl)

15482334

In [27]:
# Read the serialized machine-read graph into a list of lines.
# The text is split on '\n' only, so a file that ends with a newline yields a
# final empty string as the last list element.
with open(mr_file,'r') as mr_handle:
    mr = mr_handle.read().split('\n')

In [28]:
# Number of list elements after splitting on '\n' (includes any empty strings).
len(mr)

107056

In [29]:
# Inspect the second-to-last element.
# NOTE(review): mr[-1] is presumably the empty string left by a trailing
# newline, which would make this the last triple in the file -- confirm.
mr[-2]

'<http://purl.obolibrary.org/obo/CHEBI_47519> <http://purl.obolibrary.org/obo/RO_0002436> <http://purl.obolibrary.org/obo/CHEBI_6931> .'

In [30]:
##not writing labels to file
# Concatenate the PheKnowLator triples and the machine-read triples into one
# file. Lines are separated by '\n' and no newline is written after the last one.
with open(outfile_merged, 'w') as fileout:
    # Empty until the first line has been written, so the file never starts
    # with a blank line, even if the first source contributes nothing.
    separator = ''
    # (lines, drop_labels): only the machine-read triples are filtered for labels.
    for source_lines, drop_labels in ((pl, False), (mr, True)):
        # Iterate over every element. Empty entries (such as the empty string
        # str.split leaves after a trailing newline) are skipped explicitly, so
        # the final triple is kept even when the input has no trailing newline.
        for line in source_lines:
            if line == '':
                continue
            # NOTE(review): substring match -- this drops any triple whose text
            # contains 'label' anywhere (subject, predicate or object), not only
            # triples with a label predicate. Confirm that this is intended.
            if drop_labels and 'label' in line:
                continue
            fileout.write(separator + line)
            separator = '\n'
# The with-statement closes the file; no explicit close() call is needed.

In [31]:
#merged == 21725642 ()
#no labels = 21720940

In [32]:
##check file
# Read the merged file back and count its lines as a sanity check.
with open(outfile_merged, 'r') as merged_handle:
    merged = merged_handle.read().split('\n')
len(merged)

15577803

In [33]:
# Last element of the merged file. The writer puts the separator before each
# line rather than after it, so the file does not end with a newline.
merged[-1]

'<http://purl.obolibrary.org/obo/CHEBI_47519> <http://purl.obolibrary.org/obo/RO_0002436> <http://purl.obolibrary.org/obo/CHEBI_6931> .'

In [34]:
# First element of the merged file.
merged[0]

'<http://purl.obolibrary.org/obo/MONDO_0010777> <http://www.w3.org/2000/01/rdf-schema#subClassOf> <http://purl.obolibrary.org/obo/MONDO_0003847> .'