## Code to generate TSV from NP-KG gpickle graph

In [None]:
# # uncomment and run to install any required modules from np-kg/requirements.txt
# import sys
# !{sys.executable} -m pip install -r requirements.txt

In [1]:
import os
import os.path
import networkx as nx
import json
import urllib
import traceback
from itertools import islice
from rdflib import Graph, URIRef, BNode, Namespace, Literal
from rdflib.namespace import RDF, OWL
from tqdm import tqdm
import json

In [2]:
import hashlib

In [3]:
import pickle
import pandas as pd
import numpy as np

In [4]:
KG_PATH = '../resources/knowledge_graphs/'
NodeLabelsFile = KG_PATH + 'nodeLabels_v1.0.1.pickle'
KG_NAME_MERGED = 'PheKnowLator_machine_read_merged_instance_based_OWLNETS_v1.0.1.gpickle'

In [7]:
with open(NodeLabelsFile, 'rb') as filep:
    nodeLabels = pickle.load(filep)

In [8]:
##labels for nodes = 753,369, edges = 7,249,576
len(nodeLabels)

757826

In [9]:
def get_graph_stats(kg):
    nodes = nx.number_of_nodes(kg)
    edges = nx.number_of_edges(kg)
    self_loops = nx.number_of_selfloops(kg)

    print('There are {} nodes, {} edges, and {} self-loop(s)'.format(nodes, edges, self_loops))
    # get degree information
    avg_degree = float(edges) / nodes
    print('The Average Degree is {}'.format(avg_degree))
    
    print('Nodes with highest degree:')
    n_deg = sorted([(str(x[0]), x[1]) for x in  kg.degree], key=lambda x: x[1], reverse=1)[:6]

    for x in n_deg:
        print('Label: {}'.format(nodeLabels[x[0]]))
        print('{} (degree={})'.format(x[0], x[1]))
    # get network density
    density = nx.density(kg)

    print('The density of the graph is: {}'.format(density))

In [10]:
##READ MERGED GRAPH
nx_graph = nx.read_gpickle(KG_PATH+KG_NAME_MERGED)

In [11]:
get_graph_stats(nx_graph)

There are 745512 nodes, 7249576 edges, and 521 self-loop(s)
The Average Degree is 9.724291493631222
Nodes with highest degree:
Label: transcript
http://purl.obolibrary.org/obo/SO_0000673 (degree=190860)
Label: SNV
http://purl.obolibrary.org/obo/SO_0001483 (degree=121020)
Label: Homo sapiens
http://purl.obolibrary.org/obo/NCBITaxon_9606 (degree=116704)
Label: protein_coding_gene
http://purl.obolibrary.org/obo/SO_0001217 (degree=105046)
Label: testis
http://purl.obolibrary.org/obo/UBERON_0000473 (degree=43818)
Label: lncRNA_with_retained_intron
http://purl.obolibrary.org/obo/SO_0002113 (degree=29340)
The density of the graph is: 1.3043793443196979e-05


In [12]:
npkgdict = {
    'subject': [],
    'predicate': [],
    'object': []
}
nodelist = []
missing_nodes = []
relation_list = []

In [13]:
OUTFILE = KG_PATH + 'PheKnowLator_machine_read_merged_instance_based_OWLNETS_v1.0.1.tsv'

In [None]:
#nx.write_edgelist(nx_graph, KG_PATH+'nx_edgelist_test.gz', data=True, delimiter='\t')

In [14]:
i = 0
for edge in nx_graph.edges():
    edgelist = list(nx_graph.get_edge_data(edge[0], edge[1]))
    if edgelist:
        subj = str(edge[0])
        obj = str(edge[1])
        if subj not in nodelist:
            nodelist.append(subj)
            if subj not in nodeLabels:
                missing_nodes.append(subj)
        if obj not in nodelist:
            nodelist.append(obj)
            if obj not in nodeLabels:
                missing_nodes.append(obj)
        for item in edgelist:
            npkgdict['subject'].append(subj)
            npkgdict['object'].append(obj)
            npkgdict['predicate'].append(str(item))
            if str(item) not in relation_list:
                relation_list.append(str(item))
                if str(item) not in nodeLabels:
                    missing_nodes.append(str(item))
    i = i+1
    if i%100000 == 0:
        print('Completed edges: ', i)
        df = pd.DataFrame.from_dict(npkgdict)
        df = df.drop_duplicates(ignore_index=True)
        df.to_csv(OUTFILE, sep='\t', mode='a', index=False, header=False)
        npkgdict['subject'] = []
        npkgdict['predicate'] = []
        npkgdict['object'] = []
        print('Saved edges: ', i)
print('Completed edges: ', i)
df = pd.DataFrame.from_dict(npkgdict)
df = df.drop_duplicates(ignore_index=True)
df.to_csv(OUTFILE, sep='\t', mode='a', index=False, header=False)
print('Saved edges: ', i)

Completed edges:  100000
Saved edges:  100000
Completed edges:  200000
Saved edges:  200000
Completed edges:  300000
Saved edges:  300000
Completed edges:  400000
Saved edges:  400000
Completed edges:  500000
Saved edges:  500000
Completed edges:  600000
Saved edges:  600000
Completed edges:  700000
Saved edges:  700000
Completed edges:  800000
Saved edges:  800000
Completed edges:  900000
Saved edges:  900000
Completed edges:  1000000
Saved edges:  1000000
Completed edges:  1100000
Saved edges:  1100000
Completed edges:  1200000
Saved edges:  1200000
Completed edges:  1300000
Saved edges:  1300000
Completed edges:  1400000
Saved edges:  1400000
Completed edges:  1500000
Saved edges:  1500000
Completed edges:  1600000
Saved edges:  1600000
Completed edges:  1700000
Saved edges:  1700000
Completed edges:  1800000
Saved edges:  1800000
Completed edges:  1900000
Saved edges:  1900000
Completed edges:  2000000
Saved edges:  2000000
Completed edges:  2100000
Saved edges:  2100000
Completed 

In [15]:
#start time: 7.08pm approx (20220820)
len(nodelist)

745512

In [16]:
len(relation_list)

299

In [17]:
len(missing_nodes)

0

In [18]:
print(len(npkgdict['subject']), len(npkgdict['predicate']), len(npkgdict['object']))

49604 49604 49604


In [19]:
with open('../resources/NPKG_nodelist.txt', 'w') as fileo:
    for item in nodelist:
        fileo.write(item+'\n')

In [20]:
with open('../resources/NPKG_relationlist.txt', 'w') as fileo2:
    for item in relation_list:
        fileo2.write(item+'\n')

In [21]:
with open('../resources/NPKG_missing_nodelabels.txt', 'w') as fileo3:
    for item in missing_nodes:
        fileo3.write(item+'\n')

### Save TSV with only CURIES (solve issue #5)

In [7]:
INFILE = KG_PATH + 'PheKnowLator_machine_read_merged_instance_based_OWLNETS_v1.0.1.tsv'
OUTFILE = KG_PATH + 'NP-KG-merged-instance-based-OWLNETS-CURIE-only-v1.0.1.tsv'

In [6]:
import csv

In [8]:
##find all node prefixes
##see Notion (NP-KG TSV Processing)
with open('../resources/NPKG_nodelist.txt', 'r') as filei:
    nodes = filei.readlines()
len(nodes)

745512

In [13]:
prefixlist = []
curielist = []
for node in nodes:
    if 'http://purl.obolibrary.org/obo/' in node:
        onto = node.strip().replace('http://purl.obolibrary.org/obo/', '')
        curie = onto.split('_')[0]
        if curie not in curielist:
            curielist.append(curie)
    else:
        prefixlist.append(node.strip())
print(len(prefixlist))
print(len(curielist))

365987
46


In [14]:
curielist

['CHEBI',
 'PR',
 'GO',
 'HP',
 'MONDO',
 'NCBITaxon',
 'CL',
 'UBERON',
 'CLO',
 'SO',
 'SLC47A1SYNONYM',
 'DOID',
 'PO',
 'FOODON',
 'PW',
 'PATO',
 'BFO',
 'MPATH',
 'SLC47A1[SYNONYM',
 'SLC47A1',
 'CARO',
 'http',
 'NBO',
 'OBO',
 'ECTO',
 'OGG',
 'MOD',
 'MF',
 'GNO',
 'ENVO',
 'ExO',
 'MFOMD',
 'NCIT',
 'OAE',
 'HsapDv',
 'MFOEM',
 'OGMS',
 'VO',
 'FBbt',
 'MAXO',
 'PCO',
 'CP',
 'OBA',
 'UMLS',
 'UPHENO',
 'FMA']

In [27]:
prefixlist2 = []
for prefix in prefixlist:
    if 'napdi' in prefix or 'https://reactome.org/content/detail/' in prefix \
    or 'http://www.ncbi.nlm.nih.gov/gene/' in prefix or 'https://uswest.ensembl.org/Homo_sapiens/Transcript/Summary?' \
    in prefix or 'http://www.ebi.ac.uk/cellline#' in prefix or 'http://www.ebi.ac.uk/efo/' in prefix or \
    'https://www.ncbi.nlm.nih.gov/snp/' in prefix or 'http://ihtsdo.org/snomedct/' in prefix:
        continue
    else:
        prefixlist2.append(prefix)
len(prefixlist2)

22

In [26]:
prefixlist3

['http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#Amniotic_Fluid',
 'http://www.w3.org/2002/07/owl#Nothing',
 'http://sig.uw.edu/fma#Calcaneal_tendon',
 'http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#Amniotic_Sac',
 'http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#Embryonic_Fluid',
 'http://sig.uw.edu/fma#Umbilicus',
 'http://sig.uw.edu/fma#Common_iliac_artery',
 'http://www.genenames.org/cgi-bin/gene_symbol_report?hgnc_id=33204',
 'http://sig.uw.edu/fma#Trachealis',
 'http://sig.uw.edu/fma#Iliac_crest',
 'http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#Gonad',
 'http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#Iliac_Vein',
 'http://sig.uw.edu/fma#External_genitalia',
 'http://sig.uw.edu/fma#Tunica_albuginea_of_testis',
 'http://sig.uw.edu/fma#Amnion',
 'http://sig.uw.edu/fma#Ischium',
 'http://purl.bioontology.org/ontology/SNOMEDCT/277441005',
 'http://www.genenames.org/cgi-bin/gene_symbol_report?hgnc_id=53650',
 'http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#Chor

In [37]:
def relation_curie(rel):
    newrel = rel
    if 'uberon' in rel:
        newrel = 'uberon:'+rel.split('#')[1]
    elif 'rdf-schema' in rel:
        newrel = 'rdfs:'+rel.split('#')[1]
    elif 'rdf-syntax' in rel:
        newrel = 'rdf:'+rel.split('#')[1]
    elif 'ro.owl' in rel:
        newrel = 'ro:'+rel.split('#')[1]
    else:
        temp = rel.split('/')[-1]
        newrel = temp.replace('_', ':').lower()
    return newrel

In [38]:
def node_curie(node):
    nodecurie = node
    if 'SLC47A1SYNONYM' in node or 'SLC47A1[SYNONYM' in node or 'SLC47A1' in node:
        return node
    elif 'http://purl.obolibrary.org/obo/http' in node:
        return node
    elif 'http://purl.obolibrary.org/obo/OBO_' in node:
        tempnode = node.replace('http://purl.obolibrary.org/obo/OBO_', '')
        nodecurie = tempnode.replace('_',':').lower()
    elif 'napdi' in node:
        nodecurie = node.split('/')[-1]
    elif 'reactome' in node:
        nodecurie = 'reactome:'+node.split('/')[-1]
    elif 'http://www.ncbi.nlm.nih.gov/gene/' in node:
        nodecurie = 'ncbigene:'+node.split('/')[-1]
    elif 'https://uswest.ensembl.org/Homo_sapiens/Transcript/Summary?' in node:
        nodecurie = node.replace('https://uswest.ensembl.org/Homo_sapiens/Transcript/Summary?t=', 'ensembl:')
    elif 'http://www.ebi.ac.uk/efo/' in node:
        tempnode = node.split('/')[-1]
        nodecurie = tempnode.replace('_',':').lower()
    elif 'http://ihtsdo.org/snomedct/' in node or 'http://purl.bioontology.org/ontology/SNOMEDCT/' in node:
        nodecurie = 'snomedct:'+node.split('/')[-1]
    elif 'https://www.ncbi.nlm.nih.gov/snp/' in node:
        nodecurie = 'dbsnp:'+node.split('/')[-1]
    elif 'http://www.w3.org/2002/07/' in node:
        tempnode = node.split('/')[-1]
        nodecurie = 'owl:'+tempnode.split('#')[-1]
    elif 'hgnc_id' in node:
        nodecurie = 'hgnc:'+node.split('=')[-1]
    elif 'http://sig.uw.edu/fma' in node:
        tempnode = node.split('/')[-1]
        nodecurie = 'fma:'+tempnode.split('#')[-1]
    elif 'http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl' in node:
        tempnode = node.replace('http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#', '')
        nodecurie = 'ncit:'+tempnode
    else:
        tempnode = node.replace('http://purl.obolibrary.org/obo/', '')
        nodecurie = tempnode.replace('_',':').lower()
    return nodecurie

In [None]:
##also create new nodelabels with curies

In [59]:
with open(INFILE, 'r') as fin, open(OUTFILE, 'w') as fout:
    freader = csv.reader(fin, delimiter='\t')
    fwriter = csv.writer(fout, delimiter='\t')
    fwriter.writerow(['source', 'relation', 'target'])
    idx = 1
    for row in freader:
        try:
            npkg_subject = row[0]
            npkg_relation = row[1]
            npkg_object = row[2]
            rel_curie = relation_curie(npkg_relation)
            subject_curie = node_curie(npkg_subject)
            object_curie = node_curie(npkg_object)
            fwriter.writerow([subject_curie, rel_curie, object_curie])
        except Exception as e:
            print('Error: ', e)
            print(idx)
            print(row)
        idx+=1
        if idx%100000 == 0:
            print('Completed edges: ', idx)

Completed edges:  100000
Completed edges:  200000
Completed edges:  300000
Completed edges:  400000
Completed edges:  500000
Completed edges:  600000
Completed edges:  700000
Completed edges:  800000
Completed edges:  900000
Completed edges:  1000000
Completed edges:  1100000
Completed edges:  1200000
Completed edges:  1300000
Completed edges:  1400000
Completed edges:  1500000
Completed edges:  1600000
Completed edges:  1700000
Completed edges:  1800000
Completed edges:  1900000
Completed edges:  2000000
Completed edges:  2100000
Completed edges:  2200000
Completed edges:  2300000
Completed edges:  2400000
Completed edges:  2500000
Completed edges:  2600000
Completed edges:  2700000
Completed edges:  2800000
Completed edges:  2900000
Completed edges:  3000000
Completed edges:  3100000
Completed edges:  3200000
Completed edges:  3300000
Completed edges:  3400000
Completed edges:  3500000
Completed edges:  3600000
Completed edges:  3700000
Completed edges:  3800000
Completed edges:  390

In [60]:
NODEINFILE = KG_PATH + 'nodeLabels_v1.0.1.tsv'
NODEOUTFILE = KG_PATH + 'nodeLabels_CURIE_v1.0.1.tsv'

In [61]:
##node labels with CURIEs
with open(NODEINFILE, 'r') as nodein, open(NODEOUTFILE, 'w') as nodeout:
    freader = csv.reader(nodein, delimiter='\t')
    fwriter = csv.writer(nodeout, delimiter='\t')
    fwriter.writerow(['source', 'entity_label'])
    next(freader)
    idx = 1
    for row in freader:
        try:
            npkg_node = row[0]
            npkg_label = row[1]
            nodecurie = node_curie(npkg_node)
            fwriter.writerow([nodecurie, npkg_label])
        except Exception as e:
            print('Error: ', e)
            print(idx)
            print(row)
        idx+=1
        if idx%10000 == 0:
            print('Completed nodes: ', idx)

Completed nodes:  10000
Completed nodes:  20000
Completed nodes:  30000
Completed nodes:  40000
Completed nodes:  50000
Completed nodes:  60000
Completed nodes:  70000
Completed nodes:  80000
Completed nodes:  90000
Completed nodes:  100000
Completed nodes:  110000
Completed nodes:  120000
Completed nodes:  130000
Completed nodes:  140000
Completed nodes:  150000
Completed nodes:  160000
Completed nodes:  170000
Completed nodes:  180000
Completed nodes:  190000
Completed nodes:  200000
Completed nodes:  210000
Completed nodes:  220000
Completed nodes:  230000
Completed nodes:  240000
Completed nodes:  250000
Completed nodes:  260000
Completed nodes:  270000
Completed nodes:  280000
Completed nodes:  290000
Completed nodes:  300000
Completed nodes:  310000
Completed nodes:  320000
Completed nodes:  330000
Completed nodes:  340000
Completed nodes:  350000
Completed nodes:  360000
Completed nodes:  370000
Completed nodes:  380000
Completed nodes:  390000
Completed nodes:  400000
Completed