In [1]:
import pandas as pd
import os
from rdflib import URIRef, Namespace, Literal, Graph
import kglab

In [79]:
# Read the seven input tables (genes, drugs, the three GO annotation sets and
# the two pathway sets) from CSV files in the current working directory.
# Files are read in the order listed and bound to the names on the left.
genes, drugs, go_cel, go_bp, go_mf, react_path, wiki_path = (
    pd.read_csv(csv_name)
    for csv_name in (
        "genes_NTDs.csv",
        "drugs_NTDs.csv",
        "GO_cellularcomp_NTDs_genes.csv",
        "GO_biologicalprcoess_NTDs_genes.csv",
        "GO_molfunction_NTDs_genes.csv",
        "Reactomepaths_genes_NTDs.csv",
        "Wikipaths_genes_NTDs.csv",
    )
)

In [80]:
# Define namespaces and KG
# Prefix -> base IRI map handed to kglab; every node and predicate IRI in the
# cells below is built by looking one of these prefixes up with kg.get_ns().
namespaces = {
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    # Canonical RDF Schema vocabulary namespace. The former value
    # ("https://www.w3.org/TR/rdf-schema/#") is the URL of the specification
    # document, so any rdfs: term written with it would not be recognised.
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    "xsd": "http://www.w3.org/2001/XMLSchema#",
    "schema": "https://schema.org/",
    "sio": "http://semanticscience.org/resource/",
    "ncit": "http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#",
    "obo": "http://purl.obolibrary.org/obo/",
    "mesh": "http://purl.bioontology.org/ontology/MESH/",
    "ncbigene": "http://bio2rdf.org/ncbigene:",
    "drugbank": "http://bio2rdf.org/drugbank:",
    "reactome": "http://identifiers.org/reactome/",
    "wikipath": "http://identifiers.org/wikipathways/"
}

# Knowledge graph that the following cells populate and finally export.
kg = kglab.KnowledgeGraph(
    name="NTDs_kg",
    namespaces=namespaces,
)

In [81]:
# Add the genes associated with the three neglected tropical diseases (NTDs).
# Each row of `genes` contributes a gene node and a disease node, a class
# assertion for both, a gene -> disease link, and a name literal for each.
rdf, ncit, sio, xsd = (kg.get_ns(prefix) for prefix in ("rdf", "ncit", "sio", "xsd"))
gene_ns, mesh_ns = kg.get_ns("ncbigene"), kg.get_ns("mesh")

for _, record in genes.iterrows():
    # Node IRIs are the namespace base followed by the identifier from the row
    gene_node = URIRef(f"{gene_ns}{record['entrezgene_id']}")
    disease_node = URIRef(f"{mesh_ns}{record['MESH ID']}")

    # Class assertions for the gene and the disease
    kg.add(gene_node, rdf.type, ncit.C16612)
    kg.add(disease_node, rdf.type, ncit.C7057)

    # Gene -> disease association
    kg.add(gene_node, ncit.C25281, disease_node)

    # Gene symbol and disease name, stored as xsd:string literals
    kg.add(gene_node, sio.SIO_000300, Literal(record["hgnc_symbol"], datatype=xsd.string))
    kg.add(disease_node, sio.SIO_000300, Literal(record["Disease"], datatype=xsd.string))

In [5]:
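# Add information about other diseases associated with the genes (disabled: every line below is commented out, and no `diseases` table is loaded in the cells above)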
#for index, row in diseases.iterrows():
 #   gene_id = row["entrezgene_id"]
  #  disease_ass = row["disgenetID"]
   # node1 = URIRef("http://bio2rdf.org/ncbigene:{}".format(gene_id))
    #node2 = URIRef("http://linkedlifedata.com/resource/umls/id/{}".format(disease_ass))
    
    #kg.add(node2, kg.get_ns("rdf").type, kg.get_ns("ncit").C7057)
    
    #kg.add(node1, kg.get_ns("ncit").C25281, node2)
    
    #disease_name = row["description"]
    #kg.add(node2, kg.get_ns("sio").SIO_000300, Literal(disease_name, datatype=kg.get_ns("xsd").string))

In [82]:
# Add the drugs used to treat the NTDs together with their binding target genes.
# Each row of `drugs` contributes a typed drug node, a drug -> target gene link,
# a drug -> disease link, and the drug name as an xsd:string literal.
rdf, ncit, sio, xsd = (kg.get_ns(prefix) for prefix in ("rdf", "ncit", "sio", "xsd"))
drugbank_ns, gene_ns, mesh_ns = (
    kg.get_ns(prefix) for prefix in ("drugbank", "ncbigene", "mesh")
)

for _, record in drugs.iterrows():
    drug_node = URIRef(f"{drugbank_ns}{record['Drugbank ID']}")
    kg.add(drug_node, rdf.type, ncit.C1909)

    # Target gene of the drug
    target_node = URIRef(f"{gene_ns}{record['Target_entrezgene_id']}")
    kg.add(drug_node, ncit.C82888, target_node)

    # Disease the drug is listed against
    treated_node = URIRef(f"{mesh_ns}{record['MESH ID']}")
    kg.add(drug_node, ncit.C70742, treated_node)

    # Drug name
    kg.add(drug_node, sio.SIO_000300, Literal(record["Drug"], datatype=xsd.string))

In [83]:
# Add information about cellular component gene ontologies from the genes.
# Each row of `go_cel` contributes a GO term node typed as a cellular
# component, a gene -> term link (obo:BFO_0000050), and the term name.
# NOTE(review): if the go_id column holds CURIEs such as "GO:0005737", the term
# IRIs built here contain a colon rather than the underscore form used for the
# class IRI below - confirm against the CSV.
for index, row in go_cel.iterrows():
    gene_id = row["entrezgene_id"]
    go_id = row["go_id"]
    node1 = URIRef(str(kg.get_ns("ncbigene")) + "{}".format(gene_id))
    node2 = URIRef(str(kg.get_ns("obo")) + "{}".format(go_id))

    # GO_0005575 is the root term "cellular_component". The value used before,
    # GO_0016043, is "cellular component organization", a biological-process
    # term, so it mis-typed every cellular component node. The sibling cells
    # already use the root terms GO_0008150 and GO_0003674.
    kg.add(node2, kg.get_ns("rdf").type, kg.get_ns("obo").GO_0005575)

    kg.add(node1, kg.get_ns("obo").BFO_0000050, node2)

    # Term name as an xsd:string literal
    go_name = row["name_1006"]
    kg.add(node2, kg.get_ns("sio").SIO_000300, Literal(go_name, datatype=kg.get_ns("xsd").string))

In [84]:
# Add the biological process GO annotations of the genes.
# Each row of `go_bp` contributes a GO term node typed with obo:GO_0008150,
# a gene -> term link, and the term name as an xsd:string literal.
# NOTE(review): BFO_0000066 is labelled "occurs in" in BFO - confirm it is the
# intended gene -> process relation.
rdf, obo, sio, xsd = (kg.get_ns(prefix) for prefix in ("rdf", "obo", "sio", "xsd"))
gene_ns = kg.get_ns("ncbigene")

for _, record in go_bp.iterrows():
    gene_node = URIRef(f"{gene_ns}{record['entrezgene_id']}")
    process_node = URIRef(f"{obo}{record['go_id']}")

    kg.add(process_node, rdf.type, obo.GO_0008150)
    kg.add(gene_node, obo.BFO_0000066, process_node)

    # Term name
    kg.add(process_node, sio.SIO_000300, Literal(record["name_1006"], datatype=xsd.string))

In [85]:
# Add the molecular function GO annotations of the genes.
# Each row of `go_mf` contributes a GO term node typed with obo:GO_0003674,
# a gene -> term link, and the term name as an xsd:string literal.
# NOTE(review): RO_0002211 is labelled "regulates" in the Relation Ontology -
# confirm it is the intended gene -> function relation.
rdf, obo, sio, xsd = (kg.get_ns(prefix) for prefix in ("rdf", "obo", "sio", "xsd"))
gene_ns = kg.get_ns("ncbigene")

for _, record in go_mf.iterrows():
    gene_node = URIRef(f"{gene_ns}{record['entrezgene_id']}")
    function_node = URIRef(f"{obo}{record['go_id']}")

    kg.add(function_node, rdf.type, obo.GO_0003674)
    kg.add(gene_node, obo.RO_0002211, function_node)

    # Term name
    kg.add(function_node, sio.SIO_000300, Literal(record["name_1006"], datatype=xsd.string))

In [86]:
# Add the Reactome pathways associated with the genes.
# Each row of `react_path` contributes a pathway node typed with ncit:C54214,
# a gene -> pathway link (obo:BFO_0000050), and the pathway description.
rdf, ncit, obo, sio, xsd = (
    kg.get_ns(prefix) for prefix in ("rdf", "ncit", "obo", "sio", "xsd")
)
gene_ns, reactome_ns = kg.get_ns("ncbigene"), kg.get_ns("reactome")

for _, record in react_path.iterrows():
    gene_node = URIRef(f"{gene_ns}{record['entrezgene_id']}")
    pathway_node = URIRef(f"{reactome_ns}{record['reactpathID']}")

    kg.add(pathway_node, rdf.type, ncit.C54214)
    kg.add(gene_node, obo.BFO_0000050, pathway_node)

    # Pathway description as an xsd:string literal
    kg.add(pathway_node, sio.SIO_000300, Literal(record["description"], datatype=xsd.string))

In [87]:
# Add information about pathways from WikiPathways associated with the genes.
# Each row of `wiki_path` contributes a pathway node typed with ncit:C54214,
# a gene -> pathway link (obo:BFO_0000050), and the pathway description.
for index, row in wiki_path.iterrows():
    gene_id = row["entrezgene_id"]
    path_id = row["wikipathID"]
    # Build the gene IRI from the registered "ncbigene" namespace, as every
    # other cell does, instead of a hard-coded copy of its base IRI. A change
    # to the namespace table can then no longer split gene nodes in two.
    node1 = URIRef(str(kg.get_ns("ncbigene")) + "{}".format(gene_id))
    node2 = URIRef(str(kg.get_ns("wikipath")) + "{}".format(path_id))

    kg.add(node2, kg.get_ns("rdf").type, kg.get_ns("ncit").C54214)

    kg.add(node1, kg.get_ns("obo").BFO_0000050, node2)

    # Pathway description as an xsd:string literal
    path_name = row["description"]
    kg.add(node2, kg.get_ns("sio").SIO_000300, Literal(path_name, datatype=kg.get_ns("xsd").string))

In [88]:
# Export kg as Turtle and N-Triples.
# Create the output folder first so the export does not depend on it already
# existing (e.g. on a fresh checkout); exist_ok makes re-running the cell safe.
os.makedirs("./Data/RDF_graphs", exist_ok=True)
kg.save_rdf("./Data/RDF_graphs/NTDs_kg.ttl", format="ttl", base=None, encoding="utf-8")
kg.save_rdf("./Data/RDF_graphs/NTDs_kg.nt", format="nt", base=None, encoding="utf-8")

In [89]:
# Read the exported Turtle file back into a fresh rdflib graph.
# The parse call is the last expression of the cell, so its return value
# (the graph) is what the notebook displays as the cell output.
g = Graph()
g.parse(source="Data/RDF_graphs/NTDs_kg.ttl")

<Graph identifier=N204d794cc6e544ebb2c35eede8274e4b (<class 'rdflib.graph.Graph'>)>

In [90]:
# Print the length of the graph (number of triples it holds)
n_facts = len(g)
print(f'graph has {n_facts} facts')

graph has 22085 facts


In [226]:
# Parse additional information from Bio2RDF.
# One remote document is fetched per gene and merged into `g`. The ids come
# from `genes`, where the same gene may appear on several rows, so they are
# de-duplicated (keeping first-seen order) to fetch each document only once.
# Any error raised by g.parse still propagates, as before.
bio2rdf_gene_base = 'http://bio2rdf.org/' + 'ncbigene:'
for entrez_id in dict.fromkeys(genes["entrezgene_id"]):
    g.parse(bio2rdf_gene_base + str(entrez_id))

In [227]:
# Print the length of the graph (number of triples it holds)
n_facts = len(g)
print(f'graph has {n_facts} facts')

graph has 222253 facts


In [228]:
# Write the final graph to a Turtle file in the current working directory.
# The call is the last expression of the cell, so its return value is what
# the notebook displays as the cell output.
g.serialize(format='turtle', destination='NTDs_kg2.ttl')

<Graph identifier=N7c752e50b0884761b0e840b694a89d66 (<class 'rdflib.graph.Graph'>)>