## Merge ontology-grounded and literature-based graphs

In [1]:
import pandas as pd
import numpy as np
import pickle
from rdflib.namespace import OWL, RDF, RDFS
import os
import pickle, json

#Create networkx graph from triples
import glob
import hashlib
import json
import networkx as nx  # type: ignore

from rdflib import BNode, Graph, Literal, Namespace, URIRef  # type: ignore
from rdflib.namespace import OWL, RDF, RDFS  # type: ignore
from rdflib.plugins.serializers.nt import _quoteLiteral  # type: ignore

from tqdm import tqdm  # type: ignore

In [2]:
def get_stats(g):
    nodes = nx.number_of_nodes(g)
    edges = nx.number_of_edges(g)
    density = nx.density(g)
    avg_deg = float(edges)/nodes
    print('Number of nodes: ', nodes)
    print('Number of edges: ', edges)
    print('Density of graph: ', density)
    print('Average degree: ', avg_deg)

In [3]:
KG_PATH = '../resources/knowledge_graphs/'
MR_KG_PATH = '../literature-graphs/output_graphs/'

### Merge gpickle graphs

Load merged machine read graph, PheKnowLator KG and node labels files

In [4]:
KG_PATH = '../resources/knowledge_graphs/'
MR_PATH = '../literature-graphs/output_graphs/'
KG_NAME = 'PheKnowLator_v3.1.2_full_instance_inverseRelations_OWLNETS_NetworkxMultiDiGraph.gpickle'
MR_KG_NAME = 'machine_read_merged_version2.gpickle'

In [5]:
NodeLabelsFilePL = 'PheKnowLator_v3.1.2_full_instance_inverseRelations_OWLNETS_NodeLabels.txt'
NodeLabelsFileMR = 'machine_read_merged_NodeLabels_version2.pickle'

In [6]:
#pandas function leads to missing nodes
#nodespl = pd.read_csv(KG_PATH+NodeLabelsFilePL, sep='\t')
#nodespl.head()
with open(KG_PATH+NodeLabelsFilePL, 'r') as filep:
    nodespl = filep.readlines()
len(nodespl)

1088871

In [7]:
nodespl = [x.strip() for x in nodespl[1:]]
len(nodespl)

1088870

In [8]:
nodespl[0]

'NODES\t1011189\t<http://purl.obolibrary.org/obo/CLO_0035452>\tND10852 cell\tNone\tNone'

In [9]:
#split nodespl by \t and save to dictionary
nodespl_dict = {}
for node in nodespl:
    node = node.split('\t')
    nodespl_dict[node[2].replace('>', '').replace('<', '')] = node[3]
len(nodespl_dict)

1088870

In [10]:
for key in nodespl_dict:
    print(key, nodespl_dict[key])
    break

http://purl.obolibrary.org/obo/CLO_0035452 ND10852 cell


In [10]:
##fix foreign characters with bad_node_patch.json - fixed in v3.1.0
#with open(KG_PATH+'bad_node_patch.json') as f:
#    bad_node_patch = json.load(f)
#len(bad_node_patch)
# for key in bad_node_patch:
#    newkey = key.replace('<','').replace('>','')
#    if newkey in nodespldict:
#        print(newkey)
#        print(bad_node_patch[key])
#        print(nodespldict[newkey])
#        break
##fix foreign characters in nodespldict with bad_node_patch.json
#count = 0
#for key in bad_node_patch:
#    newkey = key.replace('<','').replace('>','')
#    if newkey in nodespldict:
#        nodespldict[newkey] = bad_node_patch[key]['label']
#        count += 1
#print(count)

387263

In [11]:
import pickle
with open(MR_PATH+NodeLabelsFileMR, 'rb') as filep:
    nodesmr = pickle.load(filep)
len(nodesmr)

12201

In [12]:
for key in nodesmr:
    print(key, nodesmr[key])
    break

http://purl.obolibrary.org/obo/CHEBI_38559 {'entity_type': 'NODES', 'label': 'Cytochrome P450', 'cui': 'C0010762'}


In [13]:
##combine node labels from both KGs
for key in nodesmr:
    if key not in nodespl_dict:
        nodespl_dict[key] = nodesmr[key]['label']
len(nodespl_dict)

1089472

In [14]:
with open(KG_PATH+'nodeLabels_v3.0.0.pickle', 'wb') as filep:
    pickle.dump(nodespl_dict, filep)

In [15]:
nodeLabels = nodespl_dict

In [16]:
##read in PL graph - 3.1.2
pkl = nx.read_gpickle(KG_PATH+KG_NAME)

In [17]:
#v1.0.1: 980240, 7765868, 8.082125806864853e-06, 7.922414918795397
#v3.1.0: 1089614, 7850306, 6.612133825320617e-06, 7.204666973809074
#v3.1.1: 1089613, 7836662, 6.600653910666818e-06, 7.192151708909494
get_stats(pkl)

Number of nodes:  1088531
Number of edges:  7716032
Density of graph:  6.511976364992758e-06
Average degree:  7.088481632585567


In [18]:
mrkg = nx.read_gpickle(MR_PATH+MR_KG_NAME)

In [19]:
get_stats(mrkg)

Number of nodes:  12190
Number of edges:  120371
Density of graph:  0.0008101213650926268
Average degree:  9.874569319114029


In [20]:
##combine graphs
nxgraph = nx.compose_all([pkl, mrkg])

In [21]:
get_stats(nxgraph)

Number of nodes:  1089139
Number of edges:  7836115
Density of graph:  6.605939326371026e-06
Average degree:  7.194779546045087


In [22]:
#save graph
OUTFILENAME = 'NP-KG_v3.0.0.gpickle'
with open(KG_PATH+OUTFILENAME, 'wb') as filenx:
    pickle.dump(nxgraph, filenx)

### Save node labels as TSV

In [23]:
with open(KG_PATH+'nodeLabels_v3.0.0.pickle', 'rb') as filep:
    nodeLabels = pickle.load(filep)
len(nodeLabels)

1089472

In [24]:
##convert dictionary to dataframe with columns 'entity_uri' and 'label'
nodelabelsdf = pd.DataFrame.from_dict(nodeLabels, orient='index', columns=['label'])
nodelabelsdf.head()

Unnamed: 0,label
http://purl.obolibrary.org/obo/CLO_0035452,ND10852 cell
https://uswest.ensembl.org/Homo_sapiens/Transcript/Summary?t=ENST00000256362,VRTN-201
https://www.ncbi.nlm.nih.gov/snp/rs1057517106,NM_003060.4(SLC22A5):c.394-1G>T
https://uswest.ensembl.org/Homo_sapiens/Transcript/Summary?t=ENST00000555563,SHMT2-219
http://purl.obolibrary.org/obo/CHEBI_129885,"(2S)-2-[(4R,5S)-8-[3-(dimethylamino)prop-1-yny..."


In [25]:
nodelabelsdf = nodelabelsdf.reset_index()
nodelabelsdf.head()

Unnamed: 0,index,label
0,http://purl.obolibrary.org/obo/CLO_0035452,ND10852 cell
1,https://uswest.ensembl.org/Homo_sapiens/Transc...,VRTN-201
2,https://www.ncbi.nlm.nih.gov/snp/rs1057517106,NM_003060.4(SLC22A5):c.394-1G>T
3,https://uswest.ensembl.org/Homo_sapiens/Transc...,SHMT2-219
4,http://purl.obolibrary.org/obo/CHEBI_129885,"(2S)-2-[(4R,5S)-8-[3-(dimethylamino)prop-1-yny..."


In [26]:
##rename index as 'entity_uri'
nodelabelsdf = nodelabelsdf.rename(columns={'index':'entity_uri'})
nodelabelsdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1089472 entries, 0 to 1089471
Data columns (total 2 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   entity_uri  1089472 non-null  object
 1   label       1089472 non-null  object
dtypes: object(2)
memory usage: 16.6+ MB


In [27]:
##save as tsv
nodelabelsdf.to_csv(KG_PATH+'nodeLabels_v3.0.0.tsv', sep='\t', index=False)

### Merge serialized graphs

In [28]:
pl_file = KG_PATH+'PheKnowLator_v3.1.2_full_instance_inverseRelations_OWLNETS.nt'
mr_file = MR_KG_PATH+'machine_read_merged_with_closure_version2.nt'
outfile_merged = KG_PATH+'NP-KG_v3.0.0.nt'

In [29]:
with open(pl_file,'r') as filep:
    g = filep.read()
pl = g.split('\n')

In [30]:
len(pl)

7716033

In [31]:
with open(mr_file,'r') as filem:
    g = filem.read()
mr = g.split('\n')

In [32]:
len(mr)

155742

In [36]:
##not writing labels to file
with open(outfile_merged, 'w') as fileout:
    newline = ''
    for i in range(0, len(pl)-1):
        if pl[i] == '':
            continue
        else:
            fileout.write(newline+pl[i])
            newline = '\n'
    newline = '\n'
    for i in range(0, len(mr)-1):
        if 'label' in mr[i] or mr[i] == '':
            continue
        else:
            fileout.write(newline+mr[i])
            newline = '\n'
fileout.close()

In [37]:
##check file
with open(outfile_merged, 'r') as filein:
    g = filein.read()
merged = g.split('\n')
len(merged)

7855336

In [38]:
merged[-1]

'<http://purl.obolibrary.org/obo/CHEBI_17347> <http://purl.obolibrary.org/obo/RO_0002436> <http://purl.obolibrary.org/obo/GO_0006412'

In [39]:
merged[0]

'<http://www.informatics.jax.org/marker/MGI:1913508> <http://www.w3.org/2000/01/rdf-schema#subClassOf> <http://purl.obolibrary.org/obo/SO_0001217> .'

In [40]:
with open(KG_PATH+'nodeLabels_v3.0.0.pickle', 'rb') as filep:
    nodeLabels = pickle.load(filep)
len(nodeLabels)

1089472

In [41]:
for key in nodeLabels:
    print(key, nodeLabels[key])
    break

http://purl.obolibrary.org/obo/CLO_0035452 ND10852 cell


In [44]:
nodeLabels['http://purl.obolibrary.org/obo/CLO_0035452']

'ND10852 cell'

In [48]:
##save file with labels -- NOT DONE FOR VERSION 3 due to error
outfile_merged_new = KG_PATH+'NP-KG_with_labels_v3.0.0.nt'
with open(outfile_merged_new, 'w') as fileout:
    newline = ''
    for triple in merged:
        if triple == '':
            continue
        else:
            fileout.write(newline+triple)
            newline = '\n'
    newline = '\n'
    for key in nodeLabels:
        try:
            newkey = '<'+key+'>'
            fileout.write(newline+newkey+' <http://www.w3.org/2000/01/rdf-schema#label> '+_quoteLiteral(nodeLabels[key].replace(' ', '_')))
        except Exception as e:
            print(e)
            print(key, nodeLabels[key])
            break

'str' object has no attribute 'language'
http://purl.obolibrary.org/obo/CLO_0035452 ND10852 cell


### Check for 'None' relations

In [49]:
##checking if None relations exist in PKL v3.1.1 (bug found and fixed in PKL v3.1.0)
for source, target, key, edge_data in pkl.edges(keys=True, data=True):
    if key == URIRef('None'):
        print(source, target, key, edge_data)
        break