In [81]:
import pandas as pd
import os
from rdflib import URIRef, Namespace, Literal, Graph
import kglab

In [82]:
# Load datasets: the ten input tables all live in the same CSV directory.
CSV_DIR = "Data/CSV_files"


def _load_table(filename):
    """Read one CSV file from CSV_DIR into a DataFrame with pandas' defaults."""
    return pd.read_csv(f"{CSV_DIR}/{filename}")


# Gene / drug tables for the NTDs
genes = _load_table("genes_NTDs.csv")
drugs = _load_table("drugs_NTDs.csv")

# Gene Ontology annotations, one table per GO aspect.
# The "biologicalprcoess" spelling is the file name used by the original path; keep it as is.
go_cel = _load_table("GO_cellularcomp_NTDs_genes.csv")
go_bp = _load_table("GO_biologicalprcoess_NTDs_genes.csv")
go_mf = _load_table("GO_molfunction_NTDs_genes.csv")

# Pathway tables
react_path = _load_table("Reactomepaths_genes_NTDs.csv")
wiki_path = _load_table("Wikipaths_genes_NTDs.csv")

# Protein, protein-domain and variant tables
proteins = _load_table("NTDs_proteins.csv")
domains = _load_table("Domains_NTDs_proteins.csv")
variants = _load_table("Variants_NTDs.csv")

In [83]:
# Define namespaces and KG
# Maps prefix -> base IRI. The other cells build node IRIs by plain string
# concatenation (base + identifier), so every base has to end with its own
# separator ("#", "/" or ":").
namespaces = {
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    # RDF Schema namespace IRI (the previous value was the URL of the spec page).
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    "xsd": "http://www.w3.org/2001/XMLSchema#",
    "schema": "https://schema.org/",
    "sio": "http://semanticscience.org/resource/",
    "ncit": "http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#",
    "obo": "http://purl.obolibrary.org/obo/",
    "mesh": "http://purl.bioontology.org/ontology/MESH/",
    "ncbigene": "http://bio2rdf.org/ncbigene:",
    "drugbank": "http://bio2rdf.org/drugbank:",
    "reactome": "http://identifiers.org/reactome/",
    "wikipath": "http://identifiers.org/wikipathways/",
    # All Bio2RDF bases use the same "http://bio2rdf.org/<db>:" form. An IRI is
    # compared character by character, so mixing http/https would create nodes
    # that never match the ncbigene/drugbank ones or the data fetched later
    # from http://bio2rdf.org/.
    "uniprot": "http://bio2rdf.org/uniprot:",
    "interpro": "http://bio2rdf.org/interpro:",
    "pharmgkb": "http://bio2rdf.org/pharmgkb:",
    # Trailing ":" added: without it variant IRIs came out as ".../dbsnprs1799752".
    "dbsnp": "http://bio2rdf.org/dbsnp:",
}

# Knowledge graph that all following cells add triples to.
kg = kglab.KnowledgeGraph(
    name="NTDs_kg",
    namespaces=namespaces,
)

In [85]:
# Genes associated to the three neglected tropical diseases (NTDs):
# every row of `genes` yields one gene node, one disease node and the edge between them.
rdf_type = kg.get_ns("rdf").type
ncit = kg.get_ns("ncit")
name_predicate = kg.get_ns("sio").SIO_000300
xsd_string = kg.get_ns("xsd").string
gene_base = str(kg.get_ns("ncbigene"))
mesh_base = str(kg.get_ns("mesh"))

for _, record in genes.iterrows():
    entrez_id = record["entrezgene_id"]
    mesh_id = record["MESH ID"]
    gene_node = URIRef(f"{gene_base}{entrez_id}")
    disease_node = URIRef(f"{mesh_base}{mesh_id}")

    # Type both nodes, then connect gene -> disease.
    kg.add(gene_node, rdf_type, ncit.C16612)
    kg.add(disease_node, rdf_type, ncit.C7057)
    kg.add(gene_node, ncit.C25281, disease_node)

    # Attach the gene symbol and the disease name as xsd:string literals.
    kg.add(gene_node, name_predicate, Literal(record["hgnc_symbol"], datatype=xsd_string))
    kg.add(disease_node, name_predicate, Literal(record["Disease"], datatype=xsd_string))

In [86]:
# Proteins encoded by genes: one gene -> protein edge per row of `proteins`.
rdf_type = kg.get_ns("rdf").type
protein_class = kg.get_ns("ncit").C17021
sio = kg.get_ns("sio")
xsd_string = kg.get_ns("xsd").string
gene_base = str(kg.get_ns("ncbigene"))
uniprot_base = str(kg.get_ns("uniprot"))

for _, record in proteins.iterrows():
    entrez_id = record["entrezgene_id"]
    accession = record["uniprotswissprot"]
    gene_node = URIRef(f"{gene_base}{entrez_id}")
    protein_node = URIRef(f"{uniprot_base}{accession}")

    kg.add(protein_node, rdf_type, protein_class)
    kg.add(gene_node, sio.SIO_010078, protein_node)

    # The name stored on the protein node is read from the "hgnc_symbol" column.
    kg.add(protein_node, sio.SIO_000300, Literal(record["hgnc_symbol"], datatype=xsd_string))

In [87]:
# Protein domains: one protein -> domain edge per row of `domains`.
rdf_type = kg.get_ns("rdf").type
domain_class = kg.get_ns("ncit").C13379
sio = kg.get_ns("sio")
xsd_string = kg.get_ns("xsd").string
uniprot_base = str(kg.get_ns("uniprot"))
interpro_base = str(kg.get_ns("interpro"))

for _, record in domains.iterrows():
    accession = record["uniprotswissprot"]
    interpro_id = record["interpro"]
    protein_node = URIRef(f"{uniprot_base}{accession}")
    domain_node = URIRef(f"{interpro_base}{interpro_id}")

    kg.add(domain_node, rdf_type, domain_class)
    kg.add(protein_node, sio.SIO_000202, domain_node)

    # Domain description as an xsd:string literal on the domain node.
    description = Literal(record["interpro_description"], datatype=xsd_string)
    kg.add(domain_node, sio.SIO_000300, description)

In [88]:
# Drugs to treat the NTDs: each row links a drug to its target protein and to a disease.
rdf_type = kg.get_ns("rdf").type
ncit = kg.get_ns("ncit")
name_predicate = kg.get_ns("sio").SIO_000300
xsd_string = kg.get_ns("xsd").string
drugbank_base = str(kg.get_ns("drugbank"))
uniprot_base = str(kg.get_ns("uniprot"))
mesh_base = str(kg.get_ns("mesh"))

for _, record in drugs.iterrows():
    drugbank_id = record["Drugbank.ID"]
    drug_node = URIRef(f"{drugbank_base}{drugbank_id}")
    kg.add(drug_node, rdf_type, ncit.C1909)

    # drug -> binding target protein
    target_accession = record["uniprotswissprot"]
    target_node = URIRef(f"{uniprot_base}{target_accession}")
    kg.add(drug_node, ncit.C82888, target_node)

    # drug -> disease (this table names the column "MESH.ID", with a dot)
    mesh_id = record["MESH.ID"]
    disease_node = URIRef(f"{mesh_base}{mesh_id}")
    kg.add(drug_node, ncit.C70742, disease_node)

    # drug name as an xsd:string literal
    kg.add(drug_node, name_predicate, Literal(record["Drug"], datatype=xsd_string))

In [89]:
# Add information about cellular component gene ontologies from the genes
rdf_type = kg.get_ns("rdf").type
obo = kg.get_ns("obo")
name_predicate = kg.get_ns("sio").SIO_000300
xsd_string = kg.get_ns("xsd").string
gene_base = str(kg.get_ns("ncbigene"))
obo_base = str(obo)

# Root class of the cellular-component aspect. The cell used GO_0016043 here,
# which is "cellular component organization" (a biological process term); the
# biological-process and molecular-function cells type their terms with the
# aspect roots GO_0008150 / GO_0003674, so this one uses the matching root.
cellular_component_class = obo.GO_0005575

for _, record in go_cel.iterrows():
    entrez_id = record["entrezgene_id"]
    go_id = record["go_id"]
    gene_node = URIRef(f"{gene_base}{entrez_id}")
    # NOTE(review): the IRI is the obo base + the raw "go_id" value. If that
    # column holds "GO:0005737"-style IDs the result is not the usual
    # ".../obo/GO_0005737" PURL - confirm against the CSV.
    term_node = URIRef(f"{obo_base}{go_id}")

    kg.add(term_node, rdf_type, cellular_component_class)
    kg.add(gene_node, obo.BFO_0000050, term_node)

    # GO term name as an xsd:string literal
    kg.add(term_node, name_predicate, Literal(record["name_1006"], datatype=xsd_string))

In [90]:
# Biological process gene ontologies: one gene -> GO term edge per row of `go_bp`.
rdf_type = kg.get_ns("rdf").type
obo = kg.get_ns("obo")
name_predicate = kg.get_ns("sio").SIO_000300
xsd_string = kg.get_ns("xsd").string
gene_base = str(kg.get_ns("ncbigene"))
obo_base = str(obo)

for _, record in go_bp.iterrows():
    entrez_id = record["entrezgene_id"]
    go_id = record["go_id"]
    gene_node = URIRef(f"{gene_base}{entrez_id}")
    # NOTE(review): the IRI is the obo base + the raw "go_id" value; if the
    # column holds "GO:..."-style IDs this is not the usual ".../obo/GO_..." PURL - confirm.
    term_node = URIRef(f"{obo_base}{go_id}")

    kg.add(term_node, rdf_type, obo.GO_0008150)

    # NOTE(review): BFO_0000066 is, as far as I know, labelled "occurs in";
    # confirm it is the intended gene -> biological-process relation.
    kg.add(gene_node, obo.BFO_0000066, term_node)

    # GO term name as an xsd:string literal
    kg.add(term_node, name_predicate, Literal(record["name_1006"], datatype=xsd_string))

In [91]:
# Molecular function gene ontologies: one gene -> GO term edge per row of `go_mf`.
rdf_type = kg.get_ns("rdf").type
obo = kg.get_ns("obo")
name_predicate = kg.get_ns("sio").SIO_000300
xsd_string = kg.get_ns("xsd").string
gene_base = str(kg.get_ns("ncbigene"))
obo_base = str(obo)

for _, record in go_mf.iterrows():
    entrez_id = record["entrezgene_id"]
    go_id = record["go_id"]
    gene_node = URIRef(f"{gene_base}{entrez_id}")
    # NOTE(review): the IRI is the obo base + the raw "go_id" value; if the
    # column holds "GO:..."-style IDs this is not the usual ".../obo/GO_..." PURL - confirm.
    term_node = URIRef(f"{obo_base}{go_id}")

    kg.add(term_node, rdf_type, obo.GO_0003674)

    # NOTE(review): RO_0002211 is, as far as I know, labelled "regulates";
    # confirm it is the intended gene -> molecular-function relation.
    kg.add(gene_node, obo.RO_0002211, term_node)

    # GO term name as an xsd:string literal
    kg.add(term_node, name_predicate, Literal(record["name_1006"], datatype=xsd_string))

In [92]:
# Reactome pathways associated to the genes: one gene -> pathway edge per row.
rdf_type = kg.get_ns("rdf").type
pathway_class = kg.get_ns("ncit").C54214
gene_in_pathway = kg.get_ns("obo").BFO_0000050
name_predicate = kg.get_ns("sio").SIO_000300
xsd_string = kg.get_ns("xsd").string
gene_base = str(kg.get_ns("ncbigene"))
reactome_base = str(kg.get_ns("reactome"))

for _, record in react_path.iterrows():
    entrez_id = record["entrezgene_id"]
    pathway_id = record["reactpathID"]
    gene_node = URIRef(f"{gene_base}{entrez_id}")
    pathway_node = URIRef(f"{reactome_base}{pathway_id}")

    kg.add(pathway_node, rdf_type, pathway_class)
    kg.add(gene_node, gene_in_pathway, pathway_node)

    # Pathway description as an xsd:string literal on the pathway node.
    pathway_description = Literal(record["description"], datatype=xsd_string)
    kg.add(pathway_node, name_predicate, pathway_description)

In [93]:
# Pathways from WikiPathways associated to the genes: one gene -> pathway edge per row.
rdf_type = kg.get_ns("rdf").type
pathway_class = kg.get_ns("ncit").C54214
gene_in_pathway = kg.get_ns("obo").BFO_0000050
name_predicate = kg.get_ns("sio").SIO_000300
xsd_string = kg.get_ns("xsd").string
gene_base = str(kg.get_ns("ncbigene"))
wikipath_base = str(kg.get_ns("wikipath"))

for _, record in wiki_path.iterrows():
    entrez_id = record["entrezgene_id"]
    pathway_id = record["wikipathID"]
    gene_node = URIRef(f"{gene_base}{entrez_id}")
    pathway_node = URIRef(f"{wikipath_base}{pathway_id}")

    kg.add(pathway_node, rdf_type, pathway_class)
    kg.add(gene_node, gene_in_pathway, pathway_node)

    # Pathway description as an xsd:string literal on the pathway node.
    pathway_description = Literal(record["description"], datatype=xsd_string)
    kg.add(pathway_node, name_predicate, pathway_description)

In [77]:
# Preview the first five rows of the variants table (five is head()'s default).
variants.head(n=5)

Unnamed: 0,chemicals,gene,variant,type,level.of.evidence,phenotypes,chr_name,allele,entrezgene_id,PharmGKB_ID
0,calcium channel blockers,ACE,rs1799752,Other,3,Headache Disorders,17,ATACAGTCACTTTT/ATACAGTCACTTTTTTTTTTTTTTTGAGACG...,1636,PA10407
1,calcium channel blockers,ACE,rs1799752,Other,3,Migraine with Aura,17,ATACAGTCACTTTT/ATACAGTCACTTTTTTTTTTTTTTTGAGACG...,1636,PA10407
2,calcium channel blockers,ACE,rs1799752,Other,3,Migraine without Aura,17,ATACAGTCACTTTT/ATACAGTCACTTTTTTTTTTTTTTTGAGACG...,1636,PA10407
3,capecitabine,ABCB1,rs1045642,Toxicity,3,Neoplasms,7,A/C/G/T,5243,PA448771
4,capecitabine,ABCG2,rs2231142,Efficacy,4,Colorectal Neoplasms,4,G/C/T,9429,PA448771


In [94]:
# Add information about variants associated to genes
# Each row links gene -> variant -> PharmGKB entry and records the variant's
# chromosome, allele string and phenotype.
rdf_type = kg.get_ns("rdf").type
association = kg.get_ns("ncit").C25281
variant_class = kg.get_ns("obo").SO_0000694
sio = kg.get_ns("sio")
xsd = kg.get_ns("xsd")
gene_base = str(kg.get_ns("ncbigene"))
dbsnp_base = str(kg.get_ns("dbsnp"))
pharmgkb_base = str(kg.get_ns("pharmgkb"))


def _chromosome_literal(chr_name):
    """Build the literal for a chromosome name, or None when it is missing.

    Whole-number names (17, "17", 17.0) become xsd:integer literals with the
    same lexical form as before ("17"). Any other name (for example "X", "Y"
    or "MT") is not a valid xsd:integer, so it is emitted as xsd:string
    instead of producing an ill-typed integer literal.
    """
    if pd.isna(chr_name):
        return None
    try:
        numeric = float(chr_name)
    except (TypeError, ValueError):
        numeric = None
    if numeric is not None and numeric.is_integer():
        # int() also normalises values pandas had to store as floats (17.0 -> 17).
        return Literal(int(numeric), datatype=xsd.integer)
    return Literal(str(chr_name), datatype=xsd.string)


for _, record in variants.iterrows():
    gene_node = URIRef(f"{gene_base}{record['entrezgene_id']}")
    variant_node = URIRef(f"{dbsnp_base}{record['variant']}")
    pharmgkb_node = URIRef(f"{pharmgkb_base}{record['PharmGKB_ID']}")

    kg.add(variant_node, rdf_type, variant_class)

    # gene -> variant and variant -> PharmGKB entry use the same predicate.
    kg.add(gene_node, association, variant_node)
    kg.add(variant_node, association, pharmgkb_node)

    # Chromosome: skipped when the value is missing rather than writing "nan".
    chromosome = _chromosome_literal(record["chr_name"])
    if chromosome is not None:
        kg.add(variant_node, sio.SIO_000061, chromosome)

    # NOTE(review): allele and phenotype are both attached with SIO_000223, so
    # the two kinds of value cannot be told apart by predicate - confirm intended.
    kg.add(variant_node, sio.SIO_000223, Literal(record["allele"], datatype=xsd.string))
    kg.add(variant_node, sio.SIO_000223, Literal(record["phenotypes"], datatype=xsd.string))
    

In [53]:
# Add information about other diseases associated to the genes
# NOTE: this cell is disabled - every statement below is commented out. It
# iterates over a `diseases` DataFrame, and no such table is loaded in the
# cells shown in this notebook, so one has to be loaded before re-enabling it.
#for index, row in diseases.iterrows():
#    gene_id = row["entrezgene_id"]
#    disease_ass = row["disgenetID"]
#    node1 = URIRef("http://bio2rdf.org/ncbigene:{}".format(gene_id))
#    node2 = URIRef("http://linkedlifedata.com/resource/umls/id/{}".format(disease_ass))

#    kg.add(node2, kg.get_ns("rdf").type, kg.get_ns("ncit").C7057)

#    kg.add(node1, kg.get_ns("ncit").C25281, node2)

#    disease_name = row["description"]
#    kg.add(node2, kg.get_ns("sio").SIO_000300, Literal(disease_name, datatype=kg.get_ns("xsd").string))

In [95]:
# Export kg as Turtle and as N-Triples.
# Create the output folder first so the export also works on a fresh checkout
# where ./Data/RDF_graphs does not exist yet; exist_ok makes re-runs harmless.
os.makedirs("./Data/RDF_graphs", exist_ok=True)
kg.save_rdf("./Data/RDF_graphs/NTDs_kg.ttl", format="ttl", base=None, encoding="utf-8")
kg.save_rdf("./Data/RDF_graphs/NTDs_kg.nt", format="nt", base=None, encoding="utf-8")

In [98]:
# Read the exported Turtle file back into a plain rdflib graph named `g`.
exported_ttl = "Data/RDF_graphs/NTDs_kg.ttl"
g = Graph()
g.parse(exported_ttl)

<Graph identifier=Nd0e5d8629bf545199b573dda659878d9 (<class 'rdflib.graph.Graph'>)>

In [99]:
# Print length of the graph (len(g)) after loading the exported file
fact_count = len(g)
print(f'graph has {fact_count} facts')

graph has 27705 facts


In [226]:
# Parse additional information from Bio2RDF: one remote document per gene.
# Iterate over distinct Entrez IDs only. The loop used to run over every row of
# `genes`, so an ID that appears on several rows was downloaded again each
# time, and re-parsing a document adds any blank-node structures it contains a
# second time.
for entrez_id in genes["entrezgene_id"].drop_duplicates():
    g.parse(f"http://bio2rdf.org/ncbigene:{entrez_id}")

In [227]:
# Print length of the graph (len(g)) after parsing the Bio2RDF documents
fact_count = len(g)
print(f'graph has {fact_count} facts')

graph has 222253 facts


In [228]:
# Export final kg: write graph `g` as Turtle to a file in the working directory.
final_ttl = 'NTDs_kg2.ttl'
g.serialize(destination=final_ttl, format='turtle')

<Graph identifier=N7c752e50b0884761b0e840b694a89d66 (<class 'rdflib.graph.Graph'>)>