In [23]:
import pandas as pd
from rdflib import URIRef, Namespace, Literal, Graph
import kglab

In [24]:
# Load datasets
# Every table is read from Data/CSV_files; file names are kept exactly as they
# are spelled on disk.

# Gene-centred tables (MONDO ids are read as text)
genes = pd.read_csv("Data/CSV_files/genes_NTDs.csv", dtype={"MONDO_id": str})
proteins = pd.read_csv("Data/CSV_files/NTDs_proteins.csv")
domains = pd.read_csv("Data/CSV_files/Domains_NTDs_proteins.csv")
variants = pd.read_csv("Data/CSV_files/Variants_NTDs.csv")

# Drugs used against the diseases
drugs = pd.read_csv("Data/CSV_files/drugs_NTDs.csv")

# Gene Ontology annotations, one table per GO aspect (GO ids are read as text)
go_cel = pd.read_csv(
    "Data/CSV_files/GO_cellularcomp_NTDs_genes.csv",
    dtype={"go_id": str},
)
go_bp = pd.read_csv(
    "Data/CSV_files/GO_biologicalprcoess_NTDs_genes.csv",
    dtype={"go_id": str},
)
go_mf = pd.read_csv(
    "Data/CSV_files/GO_molfunction_NTDs_genes.csv",
    dtype={"go_id": str},
)

# Pathway memberships
react_path = pd.read_csv("Data/CSV_files/Reactomepaths_genes_NTDs.csv")
wiki_path = pd.read_csv("Data/CSV_files/Wikipaths_genes_NTDs.csv")

In [25]:
# Define namespaces
# Prefix -> base IRI map handed to the knowledge graph. Node and predicate URIs
# are built by appending a local identifier to these base IRIs.
namespaces = {
    # Core W3C vocabularies.
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    # The RDFS namespace IRI is http://www.w3.org/2000/01/rdf-schema# ; the
    # previous value was the address of the specification page, which made
    # every rdfs:label triple use a non-standard predicate.
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    "xsd": "http://www.w3.org/2001/XMLSchema#",
    # Biolink model: node classes and predicates.
    "biolink": "https://w3id.org/biolink/vocab/",
    # Identifier namespaces, all under bioregistry.io.
    "go": "https://bioregistry.io/go:",
    "mesh": "https://bioregistry.io/mesh:",
    "ncbigene": "https://bioregistry.io/ncbigene:",
    "ensembl": "https://bioregistry.io/ensembl:",
    "hgnc": "https://bioregistry.io/hgnc:",
    "mondo": "https://bioregistry.io/mondo:",
    "medgen": "https://bioregistry.io/medgen:",
    "orphanet": "https://bioregistry.io/orphanet:",
    "drugbank": "https://bioregistry.io/drugbank:",
    "pubchem": "https://bioregistry.io/pubchem.compound:",
    "chembl": "https://bioregistry.io/chembl:",
    "reactome": "https://bioregistry.io/reactome:",
    "wikipath": "https://bioregistry.io/wikipathways:",
    "uniprot": "https://bioregistry.io/uniprot:",
    "refseq": "https://bioregistry.io/refseq:",
    "interpro": "https://bioregistry.io/interpro:",
    "pfam": "https://bioregistry.io/pfam:",
    "pharmgkb_drug": "https://bioregistry.io/pharmgkb.drug:",
    "dbsnp": "https://bioregistry.io/dbsnp:",
}

# Create the knowledge graph and register the prefix map on it
kg = kglab.KnowledgeGraph(namespaces=namespaces, name="NTDs_kg")

In [26]:
# Add information about genes associated to the three neglected tropical diseases (NTDs)
# Each row of `genes` yields one gene node (NCBI gene id) and one disease node
# (MeSH id). Both get a type, a label, a provenance literal and cross-references
# (xrefs) to alternate identifiers.

# Namespace lookups and recurring terms do not change between rows: resolve once.
biolink_ns = kg.get_ns("biolink")
ncbigene_ns = kg.get_ns("ncbigene")
mesh_ns = kg.get_ns("mesh")
rdf_type = kg.get_ns("rdf").type
rdfs_label = kg.get_ns("rdfs").label
xsd_string = kg.get_ns("xsd").string
kegg_source = Literal("KEGG", datatype=xsd_string)

# Cross-references as (CSV column, namespace prefix, node kind), in emission order.
gene_xref_columns = (
    ("Ensembl_id", "ensembl", "gene"),
    ("hgnc_id", "hgnc", "gene"),
    ("MONDO_id", "mondo", "disease"),
    ("MedGen_id", "medgen", "disease"),
    ("Orphanet_id", "orphanet", "disease"),
)

for index, row in genes.iterrows():
    # Define the classes
    gene_node = URIRef(f"{ncbigene_ns}{row['entrezgene_id']}")
    disease_node = URIRef(f"{mesh_ns}{row['MESH_id']}")

    # Add node types
    kg.add(gene_node, rdf_type, biolink_ns.Gene)
    kg.add(disease_node, rdf_type, biolink_ns.Disease)

    # Add predicate between classes
    kg.add(gene_node, biolink_ns.gene_associated_with_condition, disease_node)

    # Labels
    kg.add(gene_node, rdfs_label, Literal(row["hgnc_symbol"], datatype=xsd_string))
    kg.add(disease_node, rdfs_label, Literal(row["Disease"], datatype=xsd_string))

    # Provenance
    kg.add(gene_node, biolink_ns.provided_by, kegg_source)
    kg.add(disease_node, biolink_ns.provided_by, kegg_source)

    # Alternate URIs
    xref_subjects = {"gene": gene_node, "disease": disease_node}
    for column, prefix, kind in gene_xref_columns:
        xref_id = row[column]
        # A missing identifier would otherwise be formatted into a bogus
        # "<prefix>:nan" URI; skip it, as the protein/domain/drug cells do.
        if pd.isna(xref_id):
            continue
        kg.add(
            xref_subjects[kind],
            biolink_ns.xref,
            URIRef(f"{kg.get_ns(prefix)}{xref_id}"),
        )

In [27]:
# Add information about proteins encoded by genes
# One gene -> protein edge per row; the protein node also gets a type, a label,
# a provenance literal and, when present, Ensembl / RefSeq cross-references.
biolink_ns = kg.get_ns("biolink")
rdf_type = kg.get_ns("rdf").type
rdfs_label = kg.get_ns("rdfs").label
xsd_string = kg.get_ns("xsd").string

# Optional cross-references as (CSV column, namespace prefix), in emission order.
protein_xref_columns = (
    ("Ensembl_protein_id", "ensembl"),
    ("RefSeq_protein_id", "refseq"),
)

for _, protein_row in proteins.iterrows():
    # Subject gene and object protein
    gene_node = URIRef(str(kg.get_ns("ncbigene")) + "{}".format(protein_row["entrezgene_id"]))
    protein_node = URIRef(str(kg.get_ns("uniprot")) + "{}".format(protein_row["UniProt_id"]))

    # Type of the protein node
    kg.add(protein_node, rdf_type, biolink_ns.Protein)

    # Gene -> protein relation
    kg.add(gene_node, biolink_ns.translates_to, protein_node)

    # Label (the HGNC symbol column is used as the protein name)
    kg.add(
        protein_node,
        rdfs_label,
        Literal(protein_row["hgnc_symbol"], datatype=xsd_string),
    )

    # Provenance
    kg.add(protein_node, biolink_ns.provided_by, Literal("Uniprot", datatype=xsd_string))

    # Alternate URIs, only for identifiers that are actually present
    for column, prefix in protein_xref_columns:
        alt_id = protein_row[column]
        if pd.isna(alt_id):
            continue
        kg.add(
            protein_node,
            biolink_ns.xref,
            URIRef(str(kg.get_ns(prefix)) + "{}".format(alt_id)),
        )

In [28]:
# Add information about protein domains
# Each row attaches one InterPro domain to the UniProt protein it belongs to.
biolink_ns = kg.get_ns("biolink")
rdf_type = kg.get_ns("rdf").type
rdfs_label = kg.get_ns("rdfs").label
xsd_string = kg.get_ns("xsd").string

for _, domain_row in domains.iterrows():
    # Protein (object) and domain (subject) nodes
    protein_node = URIRef(f"{kg.get_ns('uniprot')}{domain_row['UniProt_id']}")
    domain_node = URIRef(f"{kg.get_ns('interpro')}{domain_row['InterPro_id']}")

    # Type of the domain node
    kg.add(domain_node, rdf_type, biolink_ns.ProteinDomain)

    # Domain -> protein relation
    kg.add(domain_node, biolink_ns.part_of, protein_node)

    # Label
    kg.add(
        domain_node,
        rdfs_label,
        Literal(domain_row["interpro_description"], datatype=xsd_string),
    )

    # Provenance
    kg.add(domain_node, biolink_ns.provided_by, Literal("Interpro", datatype=xsd_string))

    # Alternate URI: Pfam accession, when the row has one
    pfam_accession = domain_row["Pfam_id"]
    if pd.isna(pfam_accession):
        continue
    kg.add(
        domain_node,
        biolink_ns.xref,
        URIRef(f"{kg.get_ns('pfam')}{pfam_accession}"),
    )

In [29]:
# Add information about drugs to treat the NTDs and their binding target proteins
# Each row describes one DrugBank drug, the UniProt protein it interacts with and
# the MeSH disease it treats.
biolink_ns = kg.get_ns("biolink")
rdf_type = kg.get_ns("rdf").type
rdfs_label = kg.get_ns("rdfs").label
xsd_string = kg.get_ns("xsd").string

for _, drug_row in drugs.iterrows():
    # Drug node
    drug_node = URIRef(f"{kg.get_ns('drugbank')}{drug_row['Drugbank.ID']}")

    # Node type
    kg.add(drug_node, rdf_type, biolink_ns.Drug)

    # Drug -> target protein
    target_node = URIRef(f"{kg.get_ns('uniprot')}{drug_row['UniProt_id']}")
    kg.add(drug_node, biolink_ns.physically_interacts_with, target_node)

    # Drug -> treated disease
    disease_node = URIRef(f"{kg.get_ns('mesh')}{drug_row['MESH.ID']}")
    kg.add(drug_node, biolink_ns.treats, disease_node)

    # Label
    kg.add(drug_node, rdfs_label, Literal(drug_row["Drug"], datatype=xsd_string))

    # Provenance
    kg.add(drug_node, biolink_ns.provided_by, Literal("Drugbank", datatype=xsd_string))

    # Alternate URIs
    # NOTE(review): PubChem.ID is emitted unconditionally while ChEMBL.ID is
    # guarded below; confirm the PubChem column never has missing values.
    pubchem_node = URIRef(f"{kg.get_ns('pubchem')}{drug_row['PubChem.ID']}")
    kg.add(drug_node, biolink_ns.xref, pubchem_node)

    chembl_id = drug_row["ChEMBL.ID"]
    if pd.isna(chembl_id):
        continue
    kg.add(drug_node, biolink_ns.xref, URIRef(f"{kg.get_ns('chembl')}{chembl_id}"))

In [30]:
# Add information about cellular component gene ontologies from the genes
# Each row links a gene to one GO term, typed here as a CellularComponent.
biolink_ns = kg.get_ns("biolink")
xsd_string = kg.get_ns("xsd").string
go_source = Literal("Gene Ontology", datatype=xsd_string)

for _, annotation in go_cel.iterrows():
    # Gene and GO term nodes
    gene_node = URIRef(f"{kg.get_ns('ncbigene')}{annotation['entrezgene_id']}")
    term_node = URIRef(f"{kg.get_ns('go')}{annotation['go_id']}")

    # Node type
    kg.add(term_node, kg.get_ns("rdf").type, biolink_ns.CellularComponent)

    # Gene -> GO term relation
    kg.add(gene_node, biolink_ns.active_in, term_node)

    # Label
    term_label = Literal(annotation["name_1006"], datatype=xsd_string)
    kg.add(term_node, kg.get_ns("rdfs").label, term_label)

    # Provenance
    kg.add(term_node, biolink_ns.provided_by, go_source)

In [31]:
# Add information about biological processes gene ontologies from the genes
# Each row links a gene to one GO term, typed here as a BiologicalProcess.
biolink_ns = kg.get_ns("biolink")
xsd_string = kg.get_ns("xsd").string
go_source = Literal("Gene Ontology", datatype=xsd_string)

for _, annotation in go_bp.iterrows():
    # Gene and GO term nodes
    gene_node = URIRef(f"{kg.get_ns('ncbigene')}{annotation['entrezgene_id']}")
    term_node = URIRef(f"{kg.get_ns('go')}{annotation['go_id']}")

    # Node type
    kg.add(term_node, kg.get_ns("rdf").type, biolink_ns.BiologicalProcess)

    # Gene -> GO term relation
    kg.add(gene_node, biolink_ns.acts_upstream_of, term_node)

    # Label
    term_label = Literal(annotation["name_1006"], datatype=xsd_string)
    kg.add(term_node, kg.get_ns("rdfs").label, term_label)

    # Provenance
    kg.add(term_node, biolink_ns.provided_by, go_source)

In [32]:
# Add information about molecular functions gene ontologies from the genes
# Each row links a gene to one GO term, typed here as a MolecularActivity.
biolink_ns = kg.get_ns("biolink")
xsd_string = kg.get_ns("xsd").string
go_source = Literal("Gene Ontology", datatype=xsd_string)

for _, annotation in go_mf.iterrows():
    # Gene and GO term nodes
    gene_node = URIRef(f"{kg.get_ns('ncbigene')}{annotation['entrezgene_id']}")
    term_node = URIRef(f"{kg.get_ns('go')}{annotation['go_id']}")

    # Node type
    kg.add(term_node, kg.get_ns("rdf").type, biolink_ns.MolecularActivity)

    # Gene -> GO term relation
    kg.add(gene_node, biolink_ns.participates_in, term_node)

    # Label
    term_label = Literal(annotation["name_1006"], datatype=xsd_string)
    kg.add(term_node, kg.get_ns("rdfs").label, term_label)

    # Provenance
    kg.add(term_node, biolink_ns.provided_by, go_source)

In [33]:
# Add information about reactome pathways associated to the genes
# Each row links a gene to one Reactome pathway.
biolink_ns = kg.get_ns("biolink")
xsd_string = kg.get_ns("xsd").string
reactome_source = Literal("Reactome", datatype=xsd_string)

for _, membership in react_path.iterrows():
    # Gene and pathway nodes
    gene_node = URIRef(f"{kg.get_ns('ncbigene')}{membership['entrezgene_id']}")
    pathway_node = URIRef(f"{kg.get_ns('reactome')}{membership['reactpathID']}")

    # Node type
    kg.add(pathway_node, kg.get_ns("rdf").type, biolink_ns.Pathway)

    # Gene -> pathway relation
    kg.add(gene_node, biolink_ns.participates_in, pathway_node)

    # Label (pathway description)
    pathway_label = Literal(membership["description"], datatype=xsd_string)
    kg.add(pathway_node, kg.get_ns("rdfs").label, pathway_label)

    # Provenance
    kg.add(pathway_node, biolink_ns.provided_by, reactome_source)

In [34]:
# Add information about pathways from wikipathways associated to the genes
# Each row links a gene to one WikiPathways pathway.
biolink_ns = kg.get_ns("biolink")
xsd_string = kg.get_ns("xsd").string
wikipathways_source = Literal("WikiPathways", datatype=xsd_string)

for _, membership in wiki_path.iterrows():
    # Gene and pathway nodes
    gene_node = URIRef(f"{kg.get_ns('ncbigene')}{membership['entrezgene_id']}")
    pathway_node = URIRef(f"{kg.get_ns('wikipath')}{membership['wikipathID']}")

    # Node type
    kg.add(pathway_node, kg.get_ns("rdf").type, biolink_ns.Pathway)

    # Gene -> pathway relation
    kg.add(gene_node, biolink_ns.participates_in, pathway_node)

    # Label (pathway description)
    pathway_label = Literal(membership["description"], datatype=xsd_string)
    kg.add(pathway_node, kg.get_ns("rdfs").label, pathway_label)

    # Provenance
    kg.add(pathway_node, biolink_ns.provided_by, wikipathways_source)

In [35]:
# Add information about variants associated to genes
# Each row describes one dbSNP variant of a gene and, when a PharmGKB drug id is
# present, the drug that variant is associated with.

# Namespace lookups and recurring terms do not change between rows: resolve once.
biolink_ns = kg.get_ns("biolink")
xsd_ns = kg.get_ns("xsd")
rdf_type = kg.get_ns("rdf").type
rdfs_label = kg.get_ns("rdfs").label
pharmgkb_source = Literal("PharmGKB", datatype=xsd_ns.string)

for index, row in variants.iterrows():
    # Define the classes
    gene_node = URIRef(f"{kg.get_ns('ncbigene')}{row['entrezgene_id']}")
    snp_node = URIRef(f"{kg.get_ns('dbsnp')}{row['variant']}")

    # A missing PharmGKB id would otherwise become a bogus "pharmgkb.drug:nan"
    # Drug node, so all drug triples are skipped for such rows.
    drug_id = row["PharmGKB_ID"]
    drug_node = None
    if not pd.isna(drug_id):
        drug_node = URIRef(f"{kg.get_ns('pharmgkb_drug')}{drug_id}")

    # Add node types
    kg.add(snp_node, rdf_type, biolink_ns.Snv)
    if drug_node is not None:
        kg.add(drug_node, rdf_type, biolink_ns.Drug)

    # Add predicate between classes
    kg.add(snp_node, biolink_ns.is_sequence_variant_of, gene_node)
    if drug_node is not None:
        kg.add(snp_node, biolink_ns.associated_with, drug_node)

    # Chromosome: keep xsd:integer for purely numeric names, but fall back to
    # xsd:string for names such as "X", "Y" or "MT", which are not valid
    # xsd:integer lexical forms and would give ill-typed literals.
    chr_name = row["chr_name"]
    if not pd.isna(chr_name):
        chr_datatype = xsd_ns.integer if str(chr_name).isdecimal() else xsd_ns.string
        kg.add(
            snp_node,
            biolink_ns.has_sequence_location,
            Literal(chr_name, datatype=chr_datatype),
        )

    # Allele
    kg.add(snp_node, biolink_ns.has_attribute, Literal(row["allele"], datatype=xsd_ns.string))

    # Phenotypes are optional
    phen = row["phenotypes"]
    if not pd.isna(phen):
        kg.add(snp_node, biolink_ns.has_phenotype, Literal(phen, datatype=xsd_ns.string))

    # Drug label
    if drug_node is not None:
        kg.add(drug_node, rdfs_label, Literal(row["chemicals"], datatype=xsd_ns.string))

    # Provenance
    kg.add(snp_node, biolink_ns.provided_by, pharmgkb_source)
    if drug_node is not None:
        kg.add(drug_node, biolink_ns.provided_by, pharmgkb_source)

In [36]:
# Export kg
# Write the same graph twice: once as Turtle, once as N-Triples.
rdf_exports = (
    ("./Data/RDF_graphs/NTDs_kg.ttl", "ttl"),
    ("./Data/RDF_graphs/NTDs_kg.nt", "nt"),
)
for export_path, rdf_format in rdf_exports:
    kg.save_rdf(export_path, format=rdf_format, base=None, encoding="utf-8")

In [37]:
# Load NTDs kg with rdflib
# Read the exported Turtle file back into a plain rdflib Graph; parse() is left
# as the last expression so the cell still displays its return value.
g = Graph()
g.parse(source="Data/RDF_graphs/NTDs_kg.ttl")

<Graph identifier=N9b348ad66ce04b1292ee45b601aecffc (<class 'rdflib.graph.Graph'>)>

In [38]:
# Print the number of "triples" in the Graph
statement_count = len(g)
print(f"Graph g has {statement_count} statements.")

Graph g has 33892 statements.
