## Code to generate TSV from NP-KG gpickle graph

1. Load merged pickle graph.
2. Create TSV with URIs.
3. Create TSV with CURIEs.
4. Create node labels with CURIEs.

See weighting experiments for TSV files with weights.

In [None]:
# # uncomment and run to install any required modules from np-kg/requirements.txt
# import sys
# !{sys.executable} -m pip install -r requirements.txt

In [1]:
import os
import os.path
import networkx as nx
import json
import urllib
import traceback
from itertools import islice
from rdflib import Graph, URIRef, BNode, Namespace, Literal
from rdflib.namespace import RDF, OWL
from tqdm import tqdm
import json

In [2]:
import hashlib

In [3]:
import pickle
import pandas as pd
import numpy as np

In [4]:
# Directory holding the knowledge-graph artefacts, relative to this notebook.
KG_PATH = '../resources/knowledge_graphs/'
# Pickled node-label mapping (loaded below as `nodeLabels`).
NodeLabelsFile = KG_PATH + 'nodeLabels_v3.0.0.pickle'
# File name of the merged graph pickle; joined with KG_PATH when it is opened.
KG_NAME_MERGED = 'NP-KG_v3.0.0.gpickle'

In [5]:
# Load the node-label lookup used throughout the notebook.
# NOTE: pickle.load can execute arbitrary code; only load pickles from a trusted source.
with open(NodeLabelsFile, 'rb') as filep:
    nodeLabels = pickle.load(filep)

In [6]:
##N (v1.0.1) = 757826
# Number of entries in the node-label mapping loaded for this version.
len(nodeLabels)

1089472

In [7]:
def get_graph_stats(kg):
    """Print summary statistics for a networkx graph.

    Reports the node, edge and self-loop counts, the edges-per-node ratio
    (printed as "Average Degree"), the six highest-degree nodes together with
    their labels from the notebook-level ``nodeLabels`` mapping, and the
    graph density.

    Args:
        kg: The networkx graph to summarise.

    Returns:
        None. Everything is printed.
    """
    nodes = nx.number_of_nodes(kg)
    edges = nx.number_of_edges(kg)
    self_loops = nx.number_of_selfloops(kg)

    print('There are {} nodes, {} edges, and {} self-loop(s)'.format(nodes, edges, self_loops))
    # get degree information; an empty graph has no ratio, so report 0.0
    # instead of raising ZeroDivisionError
    avg_degree = float(edges) / nodes if nodes else 0.0
    print('The Average Degree is {}'.format(avg_degree))

    print('Nodes with highest degree:')
    n_deg = sorted(
        ((str(node), degree) for node, degree in kg.degree),
        key=lambda pair: pair[1],
        reverse=True,
    )[:6]

    for node_id, degree in n_deg:
        # Not every node is guaranteed to have a label; fall back to a
        # placeholder rather than aborting the report with a KeyError.
        label = nodeLabels[node_id] if node_id in nodeLabels else '<no label>'
        print('Label: {}'.format(label))
        print('{} (degree={})'.format(node_id, degree))
    # get network density
    density = nx.density(kg)

    print('The density of the graph is: {}'.format(density))

In [8]:
##READ MERGED GRAPH
# Unpickle the merged graph from KG_PATH + KG_NAME_MERGED into nx_graph.
# NOTE: pickle.load can execute arbitrary code; only load pickles from a trusted source.
with open(KG_PATH+KG_NAME_MERGED, 'rb') as filep:
    nx_graph = pickle.load(filep)

In [9]:
# Print node/edge counts, top-degree nodes and density for the merged graph.
get_graph_stats(nx_graph)

There are 1089139 nodes, 7836115 edges, and 658 self-loop(s)
The Average Degree is 7.194779546045087
Nodes with highest degree:
Label: SNV
http://purl.obolibrary.org/obo/SO_0001483 (degree=214261)
Label: protein_coding_gene
http://purl.obolibrary.org/obo/SO_0001217 (degree=189350)
Label: transcript
http://purl.obolibrary.org/obo/SO_0000673 (degree=178310)
Label: Homo sapiens
http://purl.obolibrary.org/obo/NCBITaxon_9606 (degree=91924)
Label: protein
http://purl.obolibrary.org/obo/PR_000000001 (degree=56361)
Label: Mus musculus
http://purl.obolibrary.org/obo/NCBITaxon_10090 (degree=53597)
The density of the graph is: 6.605939326371026e-06


In [10]:
# Column buffers for the edge TSV: the three lists grow in lockstep,
# one entry per exported edge row.
npkgdict = {
    'subject': [],
    'predicate': [],
    'object': []
}
# Distinct node identifiers, in order of first appearance.
nodelist = []
# Node and relation identifiers that have no entry in nodeLabels.
missing_nodes = []
# Distinct relation identifiers, in order of first appearance.
relation_list = []

In [11]:
# Destination of the URI edge list (tab-separated, written in chunks, no header row).
OUTFILE = KG_PATH + 'NP-KG_v3.0.0.tsv'

In [12]:
# Export every edge of nx_graph as a (subject, predicate, object) row of URIs.
# Rows are buffered in npkgdict and flushed to OUTFILE every 100000 edges so the
# buffers stay small; nodelist, relation_list and missing_nodes are filled as a
# side effect.

# Chunks are appended below, so start from an empty file: re-running this cell
# on top of an existing OUTFILE would otherwise duplicate every row.
open(OUTFILE, 'w').close()

# Set mirrors of the ordered lists. Testing membership on the lists themselves
# is a linear scan per edge, which is quadratic over a graph of this size.
seen_nodes = set(nodelist)
seen_relations = set(relation_list)

i = 0
for edge in nx_graph.edges():
    # NOTE(review): if nx_graph is a multigraph, edges() presumably yields the
    # same (u, v) pair once per parallel edge while get_edge_data returns all
    # keys each time; such repeats are only de-duplicated within one chunk.
    edgelist = list(nx_graph.get_edge_data(edge[0], edge[1]))
    if edgelist:
        subj = str(edge[0])
        obj = str(edge[1])
        # Register subject first, then object, to keep first-appearance order.
        for node in (subj, obj):
            if node not in seen_nodes:
                seen_nodes.add(node)
                nodelist.append(node)
                if node not in nodeLabels:
                    missing_nodes.append(node)
        for item in edgelist:
            predicate = str(item)
            npkgdict['subject'].append(subj)
            npkgdict['object'].append(obj)
            npkgdict['predicate'].append(predicate)
            if predicate not in seen_relations:
                seen_relations.add(predicate)
                relation_list.append(predicate)
                if predicate not in nodeLabels:
                    missing_nodes.append(predicate)
    i = i+1
    if i%100000 == 0:
        print('Completed edges: ', i)
        df = pd.DataFrame.from_dict(npkgdict)
        df = df.drop_duplicates(ignore_index=True)
        df.to_csv(OUTFILE, sep='\t', mode='a', index=False, header=False)
        # Reset the buffers once the chunk is on disk.
        npkgdict['subject'] = []
        npkgdict['predicate'] = []
        npkgdict['object'] = []
        print('Saved edges: ', i)
# Flush the final partial chunk; the buffers are intentionally left populated.
print('Completed edges: ', i)
df = pd.DataFrame.from_dict(npkgdict)
df = df.drop_duplicates(ignore_index=True)
df.to_csv(OUTFILE, sep='\t', mode='a', index=False, header=False)
print('Saved edges: ', i)

Completed edges:  100000
Saved edges:  100000
Completed edges:  200000
Saved edges:  200000
Completed edges:  300000
Saved edges:  300000
Completed edges:  400000
Saved edges:  400000
Completed edges:  500000
Saved edges:  500000
Completed edges:  600000
Saved edges:  600000
Completed edges:  700000
Saved edges:  700000
Completed edges:  800000
Saved edges:  800000
Completed edges:  900000
Saved edges:  900000
Completed edges:  1000000
Saved edges:  1000000
Completed edges:  1100000
Saved edges:  1100000
Completed edges:  1200000
Saved edges:  1200000
Completed edges:  1300000
Saved edges:  1300000
Completed edges:  1400000
Saved edges:  1400000
Completed edges:  1500000
Saved edges:  1500000
Completed edges:  1600000
Saved edges:  1600000
Completed edges:  1700000
Saved edges:  1700000
Completed edges:  1800000
Saved edges:  1800000
Completed edges:  1900000
Saved edges:  1900000
Completed edges:  2000000
Saved edges:  2000000
Completed edges:  2100000
Saved edges:  2100000
Completed 

In [13]:
# Number of distinct nodes collected during the edge export.
len(nodelist)

1089139

In [14]:
# Number of distinct relations collected during the edge export.
len(relation_list)

356

In [15]:
# Number of nodes/relations that had no entry in nodeLabels.
len(missing_nodes)

14

In [16]:
# Sanity check: the three column buffers must have equal length. They still hold
# the last partial chunk because the final flush does not reset them.
print(len(npkgdict['subject']), len(npkgdict['predicate']), len(npkgdict['object']))

37077 37077 37077


In [17]:
# Persist the distinct node identifiers, one per line.
with open('../resources/NPKG_nodelist.txt', 'w') as fileo:
    fileo.writelines(item+'\n' for item in nodelist)

In [18]:
# Persist the distinct relation identifiers, one per line.
with open('../resources/NPKG_relationlist.txt', 'w') as fileo2:
    fileo2.writelines(item+'\n' for item in relation_list)

In [19]:
# Persist the identifiers that had no entry in nodeLabels, one per line.
with open('../resources/NPKG_missing_nodelabels.txt', 'w') as fileo3:
    fileo3.writelines(item+'\n' for item in missing_nodes)

### Save TSV with only CURIEs (resolves issue #5)

In [20]:
# Same value as the KG_PATH defined at the top of the notebook.
KG_PATH = '../resources/knowledge_graphs/'

In [21]:
# Input: URI edge list (no header row). Output: the same edges expressed as CURIEs.
INFILE = KG_PATH + 'NP-KG_v3.0.0.tsv'
OUTFILE = KG_PATH + 'NP-KG-CURIE-only-v3.0.0.tsv'

In [22]:
import csv

In [23]:
##find all node prefixes
# Read the saved node identifiers back; each entry keeps its trailing newline.
with open('../resources/NPKG_nodelist.txt', 'r') as filei:
    nodes = list(filei)
len(nodes)

1089139

In [24]:
# Split the node identifiers into two groups:
#   curielist  - distinct ontology prefixes of OBO PURLs (the text before the first '_')
#   prefixlist - every identifier that is not an OBO PURL, kept whole
prefixlist = []
curielist = []
for line in nodes:
    uri = line.strip()
    if 'http://purl.obolibrary.org/obo/' not in line:
        prefixlist.append(uri)
        continue
    ontology_prefix = uri.replace('http://purl.obolibrary.org/obo/', '').split('_')[0]
    if ontology_prefix not in curielist:
        curielist.append(ontology_prefix)
print(len(prefixlist))
print(len(curielist))

534358
51


In [25]:
# Distinct OBO ontology prefixes, in order of first appearance.
curielist

['SO',
 'NCBITaxon',
 'GO',
 'HP',
 'MONDO',
 'CHEBI',
 'PR',
 'CL',
 'CLO',
 'UBERON',
 'CARO',
 'MOD',
 'GNO',
 'PW',
 'BFO',
 'PATO',
 'CHR',
 'NBO',
 'FOODON',
 'DOID',
 'OAE',
 'ENVO',
 'ECTO',
 'MPATH',
 'NCIT',
 'DIDEO',
 'GAZ',
 'CHMO',
 'DRON',
 'IDO',
 'ERO',
 'FMA',
 'APOLLO',
 'UO',
 'PDRO/PDRO.owl#PDRO',
 'INO',
 'EnsemblBacteria#',
 'MFOMD',
 'VO',
 'HsapDv',
 'OGMS',
 'PO',
 'MF',
 'ExO',
 'OGG',
 'MAXO',
 'MOP',
 'Ensembl#',
 'PCO',
 'UPHENO',
 'STATO']

In [26]:
# First pass: keep only the non-OBO identifiers that match none of the source
# markers recognised so far, to see which URI patterns still need handling.
recognised_markers = (
    'napdi',
    'https://reactome.org/content/detail/',
    'http://www.ncbi.nlm.nih.gov/gene/',
    'https://uswest.ensembl.org/Homo_sapiens/Transcript/Summary?',
    'http://www.ebi.ac.uk/cellline#',
    'http://www.ebi.ac.uk/efo/',
    'https://www.ncbi.nlm.nih.gov/snp/',
    'http://ihtsdo.org/snomedct/',
    'https://bar.utoronto.ca/',
    'http://flybase.org/',
    'http://dictybase.org/gene/',
    'http://rgd.mcw.edu/rgdweb/report/gene/',
    'http://zfin.org/action/marker/view/',
    'http://birdgenenames.org/cgnc/',
)
prefixlist2 = [
    prefix for prefix in prefixlist
    if not any(marker in prefix for marker in recognised_markers)
]
len(prefixlist2)

37301

In [27]:
# Second pass with the extended marker list: whatever remains in prefixlist2
# is an identifier that matches no recognised source pattern.
recognised_markers = (
    'napdi',
    'https://reactome.org/content/detail/',
    'http://www.ncbi.nlm.nih.gov/gene/',
    'https://uswest.ensembl.org/Homo_sapiens/Transcript/Summary?',
    'http://www.ebi.ac.uk/cellline#',
    'http://www.ebi.ac.uk/efo/',
    'https://www.ncbi.nlm.nih.gov/snp/',
    'http://ihtsdo.org/snomedct/',
    'http://dictybase.org/gene/',
    'https://bar.utoronto.ca/',
    'http://rgd.mcw.edu/rgdweb/report/gene/',
    'http://flybase.org/',
    'http://zfin.org/action/marker/view/',
    'http://birdgenenames.org/cgnc/',
    'wormbase',
    'informatics',
    'yeastgenome',
    'ecogene',
    'pombase',
    'ensembl',
)
prefixlist2 = [
    prefix for prefix in prefixlist
    if not any(marker in prefix for marker in recognised_markers)
]
len(prefixlist2)

8

In [28]:
# Inspect (up to ten of) the identifiers that matched none of the markers above.
prefixlist2[:10]

['http://www.geneontology.org/formats/oboInOwl#Subset',
 'http://www.geneontology.org/formats/oboInOwl#SynonymType',
 'http://www.geneontology.org/formats/oboInOwl#DbXref',
 'http://www.geneontology.org/formats/oboInOwl#Synonym',
 'http://www.geneontology.org/formats/oboInOwl#Definition',
 'https://ghr.nlm.nih.gov/condition/saddan',
 'http://www.genenames.org/cgi-bin/gene_symbol_report?hgnc_id=55921',
 'https://rarediseases.info.nih.gov/diseases/9644/multicentric-castleman-disease']

In [29]:
def relation_curie(rel):
    """Convert a relation URI into a CURIE-style string.

    URIs containing one of the known namespace markers ('uberon',
    'rdf-schema', 'rdf-syntax', 'ro.owl', checked in that order) become
    ``<prefix>:<text after the first '#'>``. Any other URI is reduced to its
    last path segment: ``name#fragment`` becomes ``name:fragment`` (case
    kept), otherwise underscores become colons and the result is lower-cased.

    Args:
        rel: Relation URI as a string.

    Returns:
        The CURIE string.

    Raises:
        IndexError: If a namespace marker matches but ``rel`` has no '#'.
    """
    namespace_rules = (
        ('uberon', 'uberon:'),
        ('rdf-schema', 'rdfs:'),
        ('rdf-syntax', 'rdf:'),
        ('ro.owl', 'ro:'),
    )
    for marker, curie_prefix in namespace_rules:
        if marker in rel:
            return curie_prefix+rel.split('#')[1]

    tail = rel.split('/')[-1]
    if '#' not in tail:
        return tail.replace('_', ':').lower()
    pieces = tail.split('#')
    return pieces[0]+':'+pieces[1]

In [30]:
def get_node_curie(node):
    """Convert a node URI into a CURIE-style identifier.

    The URI is matched against a fixed sequence of source-specific substrings;
    the first match decides how the identifier is built (usually a fixed
    prefix plus the last path segment, query value or fragment). The order of
    the checks matters because several markers are plain substrings (for
    example the Ensembl transcript URL must be tested before 'ensembl').
    URIs that match nothing are treated as OBO PURLs: the OBO base is removed,
    underscores become colons and the result is lower-cased.

    Args:
        node: Node URI as a string.

    Returns:
        The CURIE string for ``node``.
    """
    if 'http://purl.obolibrary.org/obo/OBO_' in node:
        stripped = node.replace('http://purl.obolibrary.org/obo/OBO_', '')
        return stripped.replace('_',':').lower()
    if 'napdi' in node:
        # NaPDI identifiers are used as-is (last path segment, no added prefix).
        return node.split('/')[-1]
    if 'reactome' in node:
        return 'reactome:'+node.split('/')[-1]
    if 'http://www.ncbi.nlm.nih.gov/gene/' in node:
        return 'ncbigene:'+node.split('/')[-1]
    if 'https://uswest.ensembl.org/Homo_sapiens/Transcript/Summary?' in node:
        return node.replace('https://uswest.ensembl.org/Homo_sapiens/Transcript/Summary?t=', 'ensembl:')
    if 'ensembl' in node:
        return 'ensembl:'+node.split('/')[-1]
    if 'http://www.ebi.ac.uk/efo/' in node:
        return node.split('/')[-1].replace('_',':').lower()
    if 'http://ihtsdo.org/snomedct/' in node or 'http://purl.bioontology.org/ontology/SNOMEDCT/' in node:
        return 'snomedct:'+node.split('/')[-1]
    if 'https://www.ncbi.nlm.nih.gov/snp/' in node:
        return 'dbsnp:'+node.split('/')[-1]
    if 'http://www.w3.org/2002/07/' in node:
        return 'owl:'+node.split('/')[-1].split('#')[-1]
    if 'hgnc_id' in node:
        return 'hgnc:'+node.split('=')[-1]
    if 'http://sig.uw.edu/fma' in node:
        return 'fma:'+node.split('/')[-1].split('#')[-1]
    if 'http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl' in node:
        return 'ncit:'+node.replace('http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#', '')
    if 'https://bar.utoronto.ca/' in node:
        return 'bar:'+node.split('=')[-1]
    if 'http://flybase.org/' in node:
        return 'flybase:'+node.split('/')[-1]
    if 'http://dictybase.org/gene/' in node:
        return 'dictyBase:'+node.split('/')[-1]
    if 'http://rgd.mcw.edu/rgdweb/report/gene/' in node:
        return 'rgd:'+node.split('/')[-1]
    if 'http://zfin.org/action/marker/view/' in node:
        return 'zfin:'+node.split('/')[-1]
    if 'http://birdgenenames.org/cgnc/' in node:
        return 'birdgenenames:'+node.split('=')[-1]
    if 'informatics' in node:
        # Keep only the part after the last ':' of the final path segment.
        return 'mgi:'+node.split('/')[-1].split(':')[-1]
    if 'wormbase' in node:
        return 'wormbase:'+node.split('/')[-1]
    if 'yeastgenome' in node:
        return 'yeastgenome:'+node.split('/')[-1]
    if 'ecogene' in node:
        return 'ecogene:'+node.split('/')[-1]
    if 'pombase' in node:
        return 'pombase:'+node.split('/')[-1]
    if 'https://ghr.nlm.nih.gov/condition/' in node:
        return 'ghr:'+node.split('/')[-1]
    if 'https://rarediseases.info.nih.gov/diseases/' in node:
        return 'rare:'+node.split('/')[-1]
    if 'PDRO/PDRO.owl#PDRO' in node:
        fragment = node.split('#')[-1]
        return 'pdro:'+fragment.split('_')[-1]
    # Default: treat as an OBO PURL (a no-op replace for anything else).
    remainder = node.replace('http://purl.obolibrary.org/obo/', '')
    return remainder.replace('_',':').lower()

### Save to TSV

In [31]:
# Rewrite the URI edge list (INFILE, no header row) as a CURIE edge list
# (OUTFILE, with a header row). Rows that fail to convert are reported and
# skipped so that one malformed line does not abort the whole export.
#
# Both files are opened with newline='' as the csv module requires; without it
# the writer's line endings can be doubled on platforms that translate newlines.
with open(INFILE, 'r', newline='') as fin, open(OUTFILE, 'w', newline='') as fout:
    freader = csv.reader(fin, delimiter='\t')
    fwriter = csv.writer(fout, delimiter='\t')
    fwriter.writerow(['source', 'relation', 'target'])
    # idx is the 1-based row number, so the progress message reflects the
    # number of rows actually processed.
    for idx, row in enumerate(freader, start=1):
        try:
            npkg_subject = row[0]
            npkg_relation = row[1]
            npkg_object = row[2]
            rel_curie = relation_curie(npkg_relation)
            subject_curie = get_node_curie(npkg_subject)
            object_curie = get_node_curie(npkg_object)
            fwriter.writerow([subject_curie, rel_curie, object_curie])
        except Exception as e:
            print('Error: ', e)
            print(idx)
            print(row)
        if idx%100000 == 0:
            print('Completed edges: ', idx)

Completed edges:  100000
Completed edges:  200000
Completed edges:  300000
Completed edges:  400000
Completed edges:  500000
Completed edges:  600000
Completed edges:  700000
Completed edges:  800000
Completed edges:  900000
Completed edges:  1000000
Completed edges:  1100000
Completed edges:  1200000
Completed edges:  1300000
Completed edges:  1400000
Completed edges:  1500000
Completed edges:  1600000
Completed edges:  1700000
Completed edges:  1800000
Completed edges:  1900000
Completed edges:  2000000
Completed edges:  2100000
Completed edges:  2200000
Completed edges:  2300000
Completed edges:  2400000
Completed edges:  2500000
Completed edges:  2600000
Completed edges:  2700000
Completed edges:  2800000
Completed edges:  2900000
Completed edges:  3000000
Completed edges:  3100000
Completed edges:  3200000
Completed edges:  3300000
Completed edges:  3400000
Completed edges:  3500000
Completed edges:  3600000
Completed edges:  3700000
Completed edges:  3800000
Completed edges:  390

## Create new node labels with CURIEs

#### Also create URI to CURIE map and vice versa

In [32]:
# Input: node labels keyed by URI. Output: the same labels keyed by CURIE.
NODEINFILE = KG_PATH + 'nodeLabels_v3.0.0.tsv'
NODEOUTFILE = KG_PATH + 'nodeLabels_CURIE_v3.0.0.tsv'

In [33]:
##node labels with CURIEs
# Rewrite the node-label table with CURIE identifiers and, along the way, build
# the URI -> CURIE and CURIE -> URI lookups. If several URIs map to the same
# CURIE, the last one read wins in curie_to_uri_dict.

uri_to_curie_dict = {}
curie_to_uri_dict = {}

# newline='' is what the csv module requires for files it reads or writes.
with open(NODEINFILE, 'r', newline='') as nodein, open(NODEOUTFILE, 'w', newline='') as nodeout:
    freader = csv.reader(nodein, delimiter='\t')
    fwriter = csv.writer(nodeout, delimiter='\t')
    fwriter.writerow(['source', 'entity_label'])
    # Skip the input header; the default keeps an empty file from raising StopIteration.
    next(freader, None)
    # idx is the 1-based data-row number, so progress reflects rows processed.
    for idx, row in enumerate(freader, start=1):
        try:
            npkg_node = row[0]
            npkg_label = row[1]
            nodecurie = get_node_curie(npkg_node)
            fwriter.writerow([nodecurie, npkg_label])
            uri_to_curie_dict[npkg_node] = nodecurie
            curie_to_uri_dict[nodecurie] = npkg_node
        except Exception as e:
            # Report and skip malformed rows instead of aborting the export.
            print('Error: ', e)
            print(idx)
            print(row)
        if idx%10000 == 0:
            print('Completed nodes: ', idx)

Completed nodes:  10000
Completed nodes:  20000
Completed nodes:  30000
Completed nodes:  40000
Completed nodes:  50000
Completed nodes:  60000
Completed nodes:  70000
Completed nodes:  80000
Completed nodes:  90000
Completed nodes:  100000
Completed nodes:  110000
Completed nodes:  120000
Completed nodes:  130000
Completed nodes:  140000
Completed nodes:  150000
Completed nodes:  160000
Completed nodes:  170000
Completed nodes:  180000
Completed nodes:  190000
Completed nodes:  200000
Completed nodes:  210000
Completed nodes:  220000
Completed nodes:  230000
Completed nodes:  240000
Completed nodes:  250000
Completed nodes:  260000
Completed nodes:  270000
Completed nodes:  280000
Completed nodes:  290000
Completed nodes:  300000
Completed nodes:  310000
Completed nodes:  320000
Completed nodes:  330000
Completed nodes:  340000
Completed nodes:  350000
Completed nodes:  360000
Completed nodes:  370000
Completed nodes:  380000
Completed nodes:  390000
Completed nodes:  400000
Completed

In [34]:
##save dictionaries as TSV files
# Each mapping is written as "<key>\t<value>" lines without a header row.
with open(KG_PATH+'uri_to_curie_map.tsv', 'w') as fileo:
    fileo.writelines('\t'.join((key, value))+'\n' for key, value in uri_to_curie_dict.items())

with open(KG_PATH+'curie_to_uri_map.tsv', 'w') as fileo:
    fileo.writelines('\t'.join((key, value))+'\n' for key, value in curie_to_uri_dict.items())

### Convert relation CURIEs to labels


In [33]:
KG_PATH = '../resources/knowledge_graphs/'
# Input: CURIE edge list. Output: same edges with relation CURIEs replaced by labels.
INFILE = KG_PATH + 'NP-KG-CURIE-only-v3.0.0.tsv'
OUTFILE = KG_PATH + 'NP-KG-CURIE-with-relations-v3.0.0.tsv'
# NOTE: distinct from NodeLabelsFile (capital N) defined at the top, which points to the pickle.
nodeLabelsFile = KG_PATH + 'nodeLabels_CURIE_v3.0.0.tsv'

In [34]:
import pandas as pd

In [35]:
# Load the CURIE node-label table; it is used below to look up relation labels.
nodedf = pd.read_csv(nodeLabelsFile, sep='\t')
nodedf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1090470 entries, 0 to 1090469
Data columns (total 2 columns):
 #   Column        Non-Null Count    Dtype 
---  ------        --------------    ----- 
 0   source        1090470 non-null  object
 1   entity_label  1089225 non-null  object
dtypes: object(2)
memory usage: 16.6+ MB


In [36]:
# Preview the first rows of the CURIE node-label table.
nodedf.head()

Unnamed: 0,source,entity_label
0,cl:0000594,skeletal muscle satellite cell
1,ensembl:ENST00000456565,DARS1-206
2,dbsnp:rs750861887,NM_000256.3(MYBPC3):c.1944C>T (p.His648=)
3,pr:q8wu43,uncharacterized protein C2orf15 (human)
4,pr:q9m9h3,embryogenesis-like protein (Arabidopsis thaliana)


In [39]:
##need to fix none issue in new version (N=17k approx errors in relations propagated from PheKnowLator)
# Seed mapping from relation CURIE to the label written to the output file.
# The conversion loop adds further entries as it resolves labels from nodedf.
relations_dict = {
    'rdfs:subClassOf': 'rdfs:subClassOf',
    'rdf:type': 'rdf:type',
    'sio:000420': 'has expression'
}

In [40]:
import ast
import csv

In [41]:
# Replace each relation CURIE in the edge list with its human-readable label.
# Labels are resolved once per relation from nodedf and cached in
# relations_dict; relations with no row in nodedf keep their CURIE and are
# cached as well, so they no longer trigger a full scan of nodedf on every edge.
#
# Rows whose label cannot be processed (e.g. a non-string label) are reported
# and skipped here; the cell that follows re-adds the failing rows by hand, so
# this behaviour is deliberately kept.
#
# newline='' is what the csv module requires for files it reads or writes.
with open(INFILE, 'r', newline='') as fin, open(OUTFILE, 'w', newline='') as fout:
    freader = csv.reader(fin, delimiter='\t')
    fwriter = csv.writer(fout, delimiter='\t')
    fwriter.writerow(['source', 'relation', 'target'])
    # Skip the input header; the default keeps an empty file from raising StopIteration.
    next(freader, None)
    # idx is the 1-based data-row number, so progress reflects rows processed.
    for idx, row in enumerate(freader, start=1):
        try:
            npkg_subject = row[0]
            npkg_relation = row[1]
            npkg_object = row[2]
            if npkg_relation not in relations_dict:
                rel_label = npkg_relation
                rel_row = nodedf.loc[nodedf['source'] == npkg_relation]
                if not rel_row.empty:
                    rel_label = rel_row['entity_label'].values[0]
                    # Some labels are stringified dicts; pull out their 'label' field.
                    if 'entity_type' in rel_label:
                        rel_dict = ast.literal_eval(rel_label)
                        rel_label = rel_dict['label']
                # Cache hits and misses alike (a failure above is not cached).
                relations_dict[npkg_relation] = rel_label
            fwriter.writerow([npkg_subject, relations_dict[npkg_relation], npkg_object])
        except Exception as e:
            print('Error: ', e)
            print(idx)
            print(row)
        if idx%100000 == 0:
            print('Completed edges: ', idx)

Completed edges:  100000
Completed edges:  200000
Completed edges:  300000
Completed edges:  400000
Completed edges:  500000
Completed edges:  600000
Completed edges:  700000
Completed edges:  800000
Completed edges:  900000
Completed edges:  1000000
Completed edges:  1100000
Completed edges:  1200000
Completed edges:  1300000
Completed edges:  1400000
Completed edges:  1500000
Completed edges:  1600000
Completed edges:  1700000
Completed edges:  1800000
Completed edges:  1900000
Completed edges:  2000000
Completed edges:  2100000
Completed edges:  2200000
Completed edges:  2300000
Completed edges:  2400000
Completed edges:  2500000
Completed edges:  2600000
Completed edges:  2700000
Completed edges:  2800000
Completed edges:  2900000
Completed edges:  3000000
Completed edges:  3100000
Completed edges:  3200000
Completed edges:  3300000
Completed edges:  3400000
Completed edges:  3500000
Completed edges:  3600000
Completed edges:  3700000
Completed edges:  3800000
Completed edges:  390

In [42]:
##write out rows with errors to file
# Edges that the relation-label conversion reported as errors and skipped;
# they are appended here with the relation CURIE kept in place of a label.
# NOTE: running this cell more than once appends the rows again.
error_rows = [
    ['envo:00002203', 'envo:has_increased_levels_of', 'chebi:24835'],
    ['envo:00002202', 'envo:has_increased_levels_of', 'chebi:50860'],
    ['envo:00002186', 'envo:has_increased_levels_of', 'chebi:24431'],
    ['envo:01000676', 'envo:has_increased_levels_of', 'chebi:24431'],
    ['envo:01001040', 'envo:has_increased_levels_of', 'chebi:26710'],
    ['envo:00002114', 'envo:has_increased_levels_of', 'chebi:24431'],
    ['envo:00002010', 'envo:has_increased_levels_of', 'chebi:26710'],
    ['mfomd:0000024', 'mf:manifestationof', 'mfomd:0000004'],
]
# newline='' is what the csv module requires for files it writes.
with open(OUTFILE, 'a', newline='') as f:
    fwriter = csv.writer(f, delimiter='\t')
    fwriter.writerows(error_rows)

In [12]:
import csv

## Create node type dictionary

In [35]:
# Maps a CURIE prefix (the text before the first ':') to a coarse node category.
# Prefixes that are not listed are categorised as 'other' by the lookups that
# use this table. Keys are matched exactly and case-sensitively, so they must
# equal the prefixes produced by get_node_curie (note the mixed-case 'dictyBase').
curie_to_node_type_dict = {
'ensembl': 'sequence',
'uberon': 'anatomy',
'pr':'protein',
'reactome':'pathway',
'so':'sequence',
'chebi':'chemical',
'go':'process',
'mondo':'disease',
'dbsnp': 'variant',
'hp':'phenotype',
'ncbigene':'gene',
'bar':'sequence',
'ncbitaxon':'organism',
'clo':'cell_line',
'mgi':'other',
'cl':'cell',
'dideo':'drug_drug_interaction',
'rgd':'rat_genome',
'flybase':'fly',
'pw':'pathway',
'oae':'adverse_event',
'pato':'trait',
'ecogene':'ecoli_gene',
'pombase':'yeast_genome',
'yeastgenome':'yeast_genome',
'mod':'other',
'po':'plant',
'caro':'other',
'dictyBase':'dictyo_genome',
'envo':'env',
'zfin':'zebrafish',
'wormbase':'worm',
'efo':'other',
'gno':'other',
'napdi_srs_imports':'napdi',
'birdgenenames':'bird_gene',
'mpath':'mouse_pathology',
'ensemblbacteria#':'bacteria',
'doid':'disease',
'ecto':'env',
'exo':'other',
'foodon':'food',
'apollo':'other',
'http':'other',
'chr':'other',
'uo':'other',
'nbo':'behavior',
'ncit':'ncit',
'pdro':'other',
'ensembl#':'sequence',
'hsapdv':'dev_stage',
'mfomd':'other',
'gaz':'other',
'dron':'drug',
'ogg':'other',
'ino':'other',
'ido':'other',
'fma':'other',
'vo':'vaccine',
'mf':'other',
'ero':'other',
'ghr':'other',
'pco':'other',
'ogms':'other',
'chmo':'other',
'hgnc':'gene',
'maxo':'other',
'stato':'other',
'mop':'other',
'upheno':'other',
'rare':'other'
}

In [36]:
# Input: CURIE node labels. Output: one (curie, category) row per node.
INFILE = KG_PATH + 'nodeLabels_CURIE_v3.0.0.tsv'
OUTFILE = KG_PATH + 'nodeTypes_CURIE_v3.0.0.tsv'

In [37]:
# Write a (curie, category) table: the category is looked up from the CURIE
# prefix (text before the first ':'), defaulting to 'other' for unknown prefixes.
# newline='' is what the csv module requires for files it reads or writes.
with open(INFILE, 'r', newline='') as fin, open(OUTFILE, 'w', newline='') as fout:
    freader = csv.reader(fin, delimiter='\t')
    fwriter = csv.writer(fout, delimiter='\t')
    fwriter.writerow(['curie', 'category'])
    # Skip the input header; the default keeps an empty file from raising StopIteration.
    next(freader, None)
    # idx is the 1-based data-row number, so progress reflects rows processed.
    for idx, row in enumerate(freader, start=1):
        try:
            node_curie = row[0]
            node_type = curie_to_node_type_dict.get(node_curie.split(':')[0], 'other')
            fwriter.writerow([node_curie, node_type])
        except Exception as e:
            # Report and skip malformed rows instead of aborting the export.
            print('Error: ', e)
            print(idx)
            print(row)
        if idx%100000 == 0:
            print('Completed nodes: ', idx)

Completed nodes:  100000
Completed nodes:  200000
Completed nodes:  300000
Completed nodes:  400000
Completed nodes:  500000
Completed nodes:  600000
Completed nodes:  700000
Completed nodes:  800000
Completed nodes:  900000
Completed nodes:  1000000


In [38]:
# Input: URI node labels. Output: one (uri, category) row per node.
NODEINFILE = KG_PATH + 'nodeLabels_v3.0.0.tsv'
NODEOUTFILE = KG_PATH + 'nodeTypes_v3.0.0.tsv'

In [39]:
#load mapping dicts
# Each file holds "<key>\t<value>" lines; the value is stripped of surrounding
# whitespace (including the trailing newline).
uri_to_curie_dict = {}
with open(KG_PATH+'uri_to_curie_map.tsv', 'r') as file1:
    for line in file1:
        fields = line.split('\t')
        uri_to_curie_dict[fields[0]] = fields[1].strip()

curie_to_uri_dict = {}
with open(KG_PATH+'curie_to_uri_map.tsv', 'r') as file2:
    for line in file2:
        fields = line.split('\t')
        curie_to_uri_dict[fields[0]] = fields[1].strip()

In [40]:
# Spot-check: print the first CURIE -> URI pair only (prints nothing if the map is empty).
for key in curie_to_uri_dict:
    print(key, curie_to_uri_dict[key])
    break

clo:0035452 http://purl.obolibrary.org/obo/CLO_0035452


In [41]:
# Write a (uri, category) table. Each URI is translated to its CURIE (falling
# back to the URI itself when unmapped) and the category is looked up from the
# text before the first ':', defaulting to 'other' for unknown prefixes.
# newline='' is what the csv module requires for files it reads or writes.
with open(NODEINFILE, 'r', newline='') as fin, open(NODEOUTFILE, 'w', newline='') as fout:
    freader = csv.reader(fin, delimiter='\t')
    fwriter = csv.writer(fout, delimiter='\t')
    fwriter.writerow(['uri', 'category'])
    # Skip the input header; the default keeps an empty file from raising StopIteration.
    next(freader, None)
    # idx is the 1-based data-row number, so progress reflects rows processed.
    for idx, row in enumerate(freader, start=1):
        try:
            node_uri = row[0]
            node_curie = uri_to_curie_dict.get(node_uri, node_uri)
            node_type = curie_to_node_type_dict.get(node_curie.split(':')[0], 'other')
            fwriter.writerow([node_uri, node_type])
        except Exception as e:
            # Report and skip malformed rows instead of aborting the export.
            print('Error: ', e)
            print(idx)
            print(row)
        if idx%100000 == 0:
            print('Completed nodes: ', idx)

Completed nodes:  100000
Completed nodes:  200000
Completed nodes:  300000
Completed nodes:  400000
Completed nodes:  500000
Completed nodes:  600000
Completed nodes:  700000
Completed nodes:  800000
Completed nodes:  900000
Completed nodes:  1000000


### Load in GRAPE to test

In [1]:
from grape import Graph

In [3]:
# Same value as the KG_PATH defined at the top of the notebook.
KG_PATH = '../resources/knowledge_graphs/'

In [35]:
#load without node and edge labels (time=~11 secs)
# Build a directed GRAPE graph from the CURIE node-type table (column 0 = node,
# column 1 = node type) and the CURIE edge list (columns 0/1/2 =
# source/edge type/destination); both files are tab-separated with a header row.
# NOTE: the edge file 'NP-KG-CURIE_v3.0.0.tsv' is written by the node-removal
# cell that appears after this one in the notebook (it was executed first, as
# In [34]); run that cell before this one.
npkg = Graph.from_csv(
        node_path=KG_PATH+'nodeTypes_CURIE_v3.0.0.tsv',
        node_list_node_types_column_number=1,
        nodes_column_number=0,
        node_list_separator='\t',
        node_list_header=True,
        edge_path=KG_PATH+'NP-KG-CURIE_v3.0.0.tsv',
        edge_list_separator='\t',
        edge_list_header=True,
        edge_list_edge_types_column_number=1,
        sources_column_number=0,
        destinations_column_number=2,
        directed=True,
        verbose=True
    )

In [5]:
import csv

In [34]:
##remove nodes with unknown or non-existent identifiers from TSV file
# Copy the CURIE edge list, dropping every edge whose subject or object is one
# of the identifiers listed below.
nodes_to_remove = ['pr:p0110014', 'chebi:1723062', 'pr:p3535483', 'napdi_srs_imports:trigonella_foenum',
                   'chebi:56627491', 'chebi:2652316', 'pr:q9y6l68', 'chebi:272265', 'chebi:343852',
                   'go:00085599', 'chebi:381555', 'chebi:633175', 'pr:00004538209', 'pr:00004538214']
# Set copy for constant-time membership tests inside the per-edge loop.
removal_set = set(nodes_to_remove)
INFILE = KG_PATH + 'NP-KG-CURIE-only-v3.0.0.tsv'
OUTFILE = KG_PATH + 'NP-KG-CURIE_v3.0.0.tsv'
# newline='' is what the csv module requires for files it reads or writes.
with open(INFILE, 'r', newline='') as fin, open(OUTFILE, 'w', newline='') as fout:
    freader = csv.reader(fin, delimiter='\t')
    fwriter = csv.writer(fout, delimiter='\t')
    fwriter.writerow(['source', 'relation', 'target'])
    #skip header; the default keeps an empty file from raising StopIteration
    next(freader, None)
    # idx is the 1-based data-row number; it now advances for removed rows too,
    # so the progress message counts rows read rather than rows kept.
    for idx, row in enumerate(freader, start=1):
        try:
            npkg_subject = row[0]
            npkg_relation = row[1]
            npkg_object = row[2]
            if npkg_subject not in removal_set and npkg_object not in removal_set:
                fwriter.writerow([npkg_subject, npkg_relation, npkg_object])
        except Exception as e:
            # Report and skip malformed rows instead of aborting the export.
            print('Error: ', e)
            print(idx)
            print(row)
        if idx%100000 == 0:
            print('Completed edges: ', idx)

Completed edges:  100000
Completed edges:  200000
Completed edges:  300000
Completed edges:  400000
Completed edges:  500000
Completed edges:  600000
Completed edges:  700000
Completed edges:  800000
Completed edges:  900000
Completed edges:  1000000
Completed edges:  1100000
Completed edges:  1200000
Completed edges:  1300000
Completed edges:  1400000
Completed edges:  1500000
Completed edges:  1600000
Completed edges:  1700000
Completed edges:  1800000
Completed edges:  1900000
Completed edges:  2000000
Completed edges:  2100000
Completed edges:  2200000
Completed edges:  2300000
Completed edges:  2400000
Completed edges:  2500000
Completed edges:  2600000
Completed edges:  2700000
Completed edges:  2800000
Completed edges:  2900000
Completed edges:  3000000
Completed edges:  3100000
Completed edges:  3200000
Completed edges:  3300000
Completed edges:  3400000
Completed edges:  3500000
Completed edges:  3600000
Completed edges:  3700000
Completed edges:  3800000
Completed edges:  390

In [36]:
# Display the loaded GRAPE graph object.
npkg