## Code to generate TSV from NP-KG gpickle graph

In [None]:
# # uncomment and run to install any required modules from np-kg/requirements.txt
# import sys
# !{sys.executable} -m pip install -r requirements.txt

In [1]:
import os
import os.path
import networkx as nx
import json
import urllib
import traceback
from itertools import islice
from rdflib import Graph, URIRef, BNode, Namespace, Literal
from rdflib.namespace import RDF, OWL
from tqdm import tqdm
import json

In [2]:
import hashlib

In [3]:
import pickle
import pandas as pd
import numpy as np

In [6]:
KG_PATH = '../resources/knowledge_graphs/'
NodeLabelsFile = KG_PATH + 'nodeLabels_v1.0.1.pickle'
KG_NAME_MERGED = 'PheKnowLator_machine_read_merged_instance_based_OWLNETS_v1.0.1.gpickle'

In [7]:
with open(NodeLabelsFile, 'rb') as filep:
    nodeLabels = pickle.load(filep)

In [8]:
##labels for nodes = 753,369, edges = 7,249,576
len(nodeLabels)

757826

In [9]:
def get_graph_stats(kg):
    nodes = nx.number_of_nodes(kg)
    edges = nx.number_of_edges(kg)
    self_loops = nx.number_of_selfloops(kg)

    print('There are {} nodes, {} edges, and {} self-loop(s)'.format(nodes, edges, self_loops))
    # get degree information
    avg_degree = float(edges) / nodes
    print('The Average Degree is {}'.format(avg_degree))
    
    print('Nodes with highest degree:')
    n_deg = sorted([(str(x[0]), x[1]) for x in  kg.degree], key=lambda x: x[1], reverse=1)[:6]

    for x in n_deg:
        print('Label: {}'.format(nodeLabels[x[0]]))
        print('{} (degree={})'.format(x[0], x[1]))
    # get network density
    density = nx.density(kg)

    print('The density of the graph is: {}'.format(density))

In [10]:
##READ MERGED GRAPH
nx_graph = nx.read_gpickle(KG_PATH+KG_NAME_MERGED)

In [11]:
get_graph_stats(nx_graph)

There are 745512 nodes, 7249576 edges, and 521 self-loop(s)
The Average Degree is 9.724291493631222
Nodes with highest degree:
Label: transcript
http://purl.obolibrary.org/obo/SO_0000673 (degree=190860)
Label: SNV
http://purl.obolibrary.org/obo/SO_0001483 (degree=121020)
Label: Homo sapiens
http://purl.obolibrary.org/obo/NCBITaxon_9606 (degree=116704)
Label: protein_coding_gene
http://purl.obolibrary.org/obo/SO_0001217 (degree=105046)
Label: testis
http://purl.obolibrary.org/obo/UBERON_0000473 (degree=43818)
Label: lncRNA_with_retained_intron
http://purl.obolibrary.org/obo/SO_0002113 (degree=29340)
The density of the graph is: 1.3043793443196979e-05


In [12]:
npkgdict = {
    'subject': [],
    'predicate': [],
    'object': []
}
nodelist = []
missing_nodes = []
relation_list = []

In [13]:
OUTFILE = KG_PATH + 'PheKnowLator_machine_read_merged_instance_based_OWLNETS_v1.0.1.tsv'

In [None]:
#nx.write_edgelist(nx_graph, KG_PATH+'nx_edgelist_test.gz', data=True, delimiter='\t')

In [14]:
i = 0
for edge in nx_graph.edges():
    edgelist = list(nx_graph.get_edge_data(edge[0], edge[1]))
    if edgelist:
        subj = str(edge[0])
        obj = str(edge[1])
        if subj not in nodelist:
            nodelist.append(subj)
            if subj not in nodeLabels:
                missing_nodes.append(subj)
        if obj not in nodelist:
            nodelist.append(obj)
            if obj not in nodeLabels:
                missing_nodes.append(obj)
        for item in edgelist:
            npkgdict['subject'].append(subj)
            npkgdict['object'].append(obj)
            npkgdict['predicate'].append(str(item))
            if str(item) not in relation_list:
                relation_list.append(str(item))
                if str(item) not in nodeLabels:
                    missing_nodes.append(str(item))
    i = i+1
    if i%100000 == 0:
        print('Completed edges: ', i)
        df = pd.DataFrame.from_dict(npkgdict)
        df = df.drop_duplicates(ignore_index=True)
        df.to_csv(OUTFILE, sep='\t', mode='a', index=False, header=False)
        npkgdict['subject'] = []
        npkgdict['predicate'] = []
        npkgdict['object'] = []
        print('Saved edges: ', i)
print('Completed edges: ', i)
df = pd.DataFrame.from_dict(npkgdict)
df = df.drop_duplicates(ignore_index=True)
df.to_csv(OUTFILE, sep='\t', mode='a', index=False, header=False)
print('Saved edges: ', i)

Completed edges:  100000
Saved edges:  100000
Completed edges:  200000
Saved edges:  200000
Completed edges:  300000
Saved edges:  300000
Completed edges:  400000
Saved edges:  400000
Completed edges:  500000
Saved edges:  500000
Completed edges:  600000
Saved edges:  600000
Completed edges:  700000
Saved edges:  700000
Completed edges:  800000
Saved edges:  800000
Completed edges:  900000
Saved edges:  900000
Completed edges:  1000000
Saved edges:  1000000
Completed edges:  1100000
Saved edges:  1100000
Completed edges:  1200000
Saved edges:  1200000
Completed edges:  1300000
Saved edges:  1300000
Completed edges:  1400000
Saved edges:  1400000
Completed edges:  1500000
Saved edges:  1500000
Completed edges:  1600000
Saved edges:  1600000
Completed edges:  1700000
Saved edges:  1700000
Completed edges:  1800000
Saved edges:  1800000
Completed edges:  1900000
Saved edges:  1900000
Completed edges:  2000000
Saved edges:  2000000
Completed edges:  2100000
Saved edges:  2100000
Completed 

In [15]:
#start time: 7.08pm approx (20220820)
len(nodelist)

745512

In [16]:
len(relation_list)

299

In [17]:
len(missing_nodes)

0

In [18]:
print(len(npkgdict['subject']), len(npkgdict['predicate']), len(npkgdict['object']))

49604 49604 49604


In [19]:
with open('../resources/NPKG_nodelist.txt', 'w') as fileo:
    for item in nodelist:
        fileo.write(item+'\n')

In [20]:
with open('../resources/NPKG_relationlist.txt', 'w') as fileo2:
    for item in relation_list:
        fileo2.write(item+'\n')

In [21]:
with open('../resources/NPKG_missing_nodelabels.txt', 'w') as fileo3:
    for item in missing_nodes:
        fileo3.write(item+'\n')