In [1]:
import os
import json
import pandas as pd
import yaml
import sys  # Import sys to change the field size limit
import csv

# Set the working directory so the relative file names used below resolve against it.
# NOTE(review): '/your/working/directory/path' looks like a placeholder - replace it
# with a real directory before running.
os.chdir('/your/working/directory/path')

In [None]:
# Open DrugMechDB YAML file
def load_yaml_file(file_path):
    """Load and parse a YAML file with PyYAML's safe loader.

    This is a best-effort loader: any error while opening or parsing the
    file is printed rather than raised.

    Args:
        file_path: Path of the YAML file to read.

    Returns:
        The parsed YAML content, or ``None`` if the file could not be
        opened or parsed.
    """
    try:
        # Decode explicitly as UTF-8; relying on the platform default encoding
        # (e.g. cp1252 on Windows) can mis-decode or reject non-ASCII text.
        with open(file_path, 'r', encoding='utf-8') as file:
            return yaml.safe_load(file)
    except Exception as e:
        print(f"An error occurred: {e}")
        # Make the failure result explicit so callers can check for it.
        return None

# Load the DrugMechDB paths; change this to the path where your YAML file is located.
# load_yaml_file prints the error instead of raising, so drug_db is None if loading failed.
drug_db = load_yaml_file('indication_paths.yaml')

# Display YAML file
#drug_db

In [3]:
### Parse the YAML file, extracting each graph's node IDs and their associated names

def extract_graph_node_data(drug_db):
    """Collect the id and name of every node, grouped by graph.

    Args:
        drug_db: Iterable of graph records. Each record is read through
            ``record['graph']['_id']`` and ``record['nodes']``; each node
            through ``node['id']`` and ``node['name']``.

    Returns:
        A list with one ``{'graph': {'_id': ..., 'nodes': [...]}}`` dict per
        record, where ``nodes`` is a list of ``{'id': ..., 'name': ...}``
        dicts in the original node order.
    """
    def _summarize(record):
        # Keep only the graph identifier and the id/name pair of each node.
        identifier = record['graph']['_id']
        members = [
            {'id': entry['id'], 'name': entry['name']}
            for entry in record['nodes']
        ]
        return {'graph': {'_id': identifier, 'nodes': members}}

    return [_summarize(record) for record in drug_db]

# Reduce each loaded graph to its graph ID plus the id/name of every node.
graphs = extract_graph_node_data(drug_db)

# Display DrugMechDB Node ID & Names for each graph
#graphs


In [6]:
# Screen for outdated node IDs

def extract_unique_prefixes(graphs):
    """Gather the distinct node id prefixes found across all graphs.

    The prefix is the text before the first colon of a node id; an id
    without a colon contributes the whole id.

    Args:
        graphs: Iterable of ``{'graph': {'nodes': [{'id': ...}, ...]}}`` dicts.

    Returns:
        A set of prefix strings.
    """
    return {
        entry['id'].partition(':')[0]
        for record in graphs
        for entry in record['graph']['nodes']
    }

unique_prefixes = extract_unique_prefixes(graphs)

# Display the set of unique node ID prefixes, i.e. every node ID prefix that occurs in DrugMechDB
print("DrugMechDB Node ID prefixes:", unique_prefixes)


DrugMechDB Node ID prefixes: {'HP', 'PR', 'reactome', 'taxonomy', 'TIGR', 'InterPro', 'GO', 'MESH', 'CHEBI', 'DB', 'CL', 'UniProt', 'Pfam', 'UBERON'}


In [7]:
# Prefix lookup table: update_node_ids() matches a node ID's prefix against the
# KEYS (case-insensitively) and substitutes the corresponding VALUE.
curie_prefixes = {
    "ATC": "ATC",
    "BIOLINK": "biolink",
    "BIOLINK_SOURCE": "biolink_download_source",
    "CHEBI": "CHEBI",
    "CHEMBL_COMPOUND": "CHEMBL.COMPOUND",
    "CHEMBL_MECHANISM": "CHEMBL.MECHANISM",
    "CHEMBL_TARGET": "CHEMBL.TARGET",
    "CHV": "CHV",
    "CLINICALTRIALS": "clinicaltrials",
    "DCTERMS": "dcterms",
    "DGIDB": "DGIdb",
    "DOID": "DOID",
    "DRUGBANK": "DRUGBANK",
    "DRUGCENTRAL": "DrugCentral",
    "ENSEMBL": "ENSEMBL",
    "ENSEMBL_GENOMES": "EnsemblGenomes",
    "FMA": "FMA",
    "GO": "GO",
    "GTPI": "GTPI",
    "GTPI_SOURCE": "GTPI_source",
    "HCPCS": "HCPCS",
    "HGNC": "HGNC",
    "HMDB": "HMDB",
    "HP": "HP",
    "IAO": "IAO",
    "ICD10PCS": "ICD10PCS",
    "ICD9": "ICD9",
    "IDENTIFIERS_ORG_REGISTRY": "identifiers_org_registry",
    "ISBN": "ISBN",
    "KEGG": "KEGG",
    "KEGG_COMPOUND": "KEGG.COMPOUND",
    "KEGG_DRUG": "KEGG.DRUG",
    "KEGG_ENZYME": "KEGG.ENZYME",
    "KEGG_GLYCAN": "KEGG.GLYCAN",
    "KEGG_REACTION": "KEGG.REACTION",
    "KEGG_SOURCE": "KEGG_source",
    "MEDDRA": "MEDDRA",
    "MESH": "MESH",
    "MIRBASE": "miRBase",
    "MONDO": "MONDO",
    "NCBI_GENE": "NCBIGene",
    "NCBI_TAXON": "NCBITaxon",
    "NCIT": "NCIT",
    "NDDF": "NDDF",
    "OBO": "OBO",
    "OBO_FORMAT": "oboFormat",
    "OIO": "OIO",
    "OMIM": "OMIM",
    "OWL": "owl",
    "PATHWHIZ": "PathWhiz",
    "PATHWHIZ_COMPOUND": "PathWhiz.Compound",
    "PATHWHIZ_NUCLEIC_ACID": "PathWhiz.NucleicAcid",
    "PATHWHIZ_ELEMENT_COLLECTION": "PathWhiz.ElementCollection",
    "PATHWHIZ_REACTION": "PathWhiz.Reaction",
    "PATHWHIZ_BOUND": "PathWhiz.Bound",
    "PATHWHIZ_PROTEIN_COMPLEX": "PathWhiz.ProteinComplex",
    "PDQ": "PDQ",
    "PMID": "PMID",
    "PSY": "PSY",
    "RDF": "rdf",
    "RDFS": "rdfs",
    "REACTOME": "REACT",
    "REPODB": "REPODB",
    "RHEA": "RHEA",
    "RHEA_COMP": "RHEA.COMP",
    "RO": "RO",
    "RTX": "RTX",
    "RXNORM": "RXNORM",
    "SEMMEDDB": "SEMMEDDB",
    "SKOS": "skos",
    "SMPDB": "SMPDB",
    "SNOMED": "SNOMED",
    "TTD_DRUG": "ttd.drug",
    "TTD_TARGET": "ttd.target",
    "UMLS": "UMLS",
    "UMLS_STY": "STY",
    "UMLS_SOURCE": "umls_source",
    "UNICHEM_SOURCE": "UNICHEM_source",
    "UNIPROT": "UniProtKB",
    "VANDF": "VANDF"
}  # curie_prefixes holds the prefixes from RTX-KG2
    
def update_node_ids(graphs, curie_prefixes):
    """Rewrite node id prefixes in place using a prefix mapping.

    Each node id of the form ``prefix:suffix`` is split at its first colon.
    The prefix is matched case-insensitively against the keys of
    ``curie_prefixes``; on a match the id becomes ``<mapped prefix>:<suffix>``.
    Ids whose prefix has no match are left unchanged and the prefix is
    reported. Ids that are not strings or contain no colon are also left
    unchanged and reported, instead of aborting the whole conversion.

    Because ids are rewritten in place, calling this again on the same
    graphs looks up the already-replaced prefixes, not the original ones.

    Args:
        graphs: List of ``{'graph': {'nodes': [{'id': ...}, ...]}}`` dicts.
            The node dicts are modified in place.
        curie_prefixes: Mapping from source prefix name to replacement prefix.

    Returns:
        The same ``graphs`` object that was passed in (not a copy).
    """
    # Convert all dictionary keys to uppercase for case-insensitive matching
    curie_prefixes = {k.upper(): v for k, v in curie_prefixes.items()}
    unmatched_prefixes = set()
    malformed_ids = []  # ids that cannot be split into prefix and suffix

    for graph in graphs:
        for node in graph['graph']['nodes']:
            original_id = node['id']

            # Without a colon the two-way unpacking below would raise
            # ValueError; skip such ids and report them after the loop.
            if not isinstance(original_id, str) or ':' not in original_id:
                malformed_ids.append(original_id)
                continue

            prefix, id_suffix = original_id.split(':', 1)
            upper_prefix = prefix.upper()

            if upper_prefix in curie_prefixes:
                # Use the uppercased prefix to fetch the correct replacement
                node['id'] = f"{curie_prefixes[upper_prefix]}:{id_suffix}"
            else:
                unmatched_prefixes.add(prefix)  # Collect unmatched prefixes

    if unmatched_prefixes:
        print("Unmatched prefixes:", unmatched_prefixes)
        print()
    if malformed_ids:
        print("Node IDs without a prefix (left unchanged):", malformed_ids)
        print()
    return graphs

# Update the node ID prefixes of every graph; prefixes without a match are printed.
# The unmatched node IDs will be processed in a later script using an equivalent-query match.
# Note: update_node_ids modifies `graphs` in place and returns it, so
# `graphs_converted` and `graphs` refer to the same object.
graphs_converted = update_node_ids(graphs, curie_prefixes)

# Display the updated DrugMechDB graph node IDs (prefix update)
graphs_converted

Unmatched prefixes: {'PR', 'taxonomy', 'TIGR', 'UBERON', 'DB', 'CL', 'Pfam', 'InterPro'}



[{'graph': {'_id': 'DB00619_MESH_D015464_1',
   'nodes': [{'id': 'MESH:D000068877', 'name': 'imatinib'},
    {'id': 'UniProtKB:P00519', 'name': 'BCR/ABL'},
    {'id': 'MESH:D015464', 'name': 'CML (ph+)'}]}},
 {'graph': {'_id': 'DB00619_MESH_D034721_1',
   'nodes': [{'id': 'MESH:D000068877', 'name': 'imatinib'},
    {'id': 'UniProtKB:P10721', 'name': 'c-Kit'},
    {'id': 'UniProtKB:P16234', 'name': 'Pdgf'},
    {'id': 'GO:0008283', 'name': 'Cellular proliferation'},
    {'id': 'MESH:D034721', 'name': 'Systemic mast cell disease'}]}},
 {'graph': {'_id': 'DB00316_MESH_D010146_1',
   'nodes': [{'id': 'MESH:D000082', 'name': 'Acetaminophen'},
    {'id': 'UniProtKB:P23219', 'name': 'Prostaglandin G/H synthase 1'},
    {'id': 'UniProtKB:P35354', 'name': 'Prostaglandin G/H synthase 2'},
    {'id': 'UniProtKB:Q15185', 'name': 'Prostaglandin E synthase 3'},
    {'id': 'GO:0001516', 'name': 'prostaglandin biosynthetic process'},
    {'id': 'MESH:D011453', 'name': 'Prostaglandins'},
    {'id': 'ME

In [8]:
# Save updated graph (node ID updated)
def save_graphs_to_yaml(graphs, filename):
    """Serialize ``graphs`` as YAML into ``filename``.

    The file is opened in write mode, so existing content is replaced.

    Args:
        graphs: The object to dump with ``yaml.dump``.
        filename: Path of the output file.
    """
    with open(filename, mode='w') as out_handle:
        yaml.dump(graphs, stream=out_handle)

# Write the prefix-updated graphs to 'NodeID_updated_graphs.yaml' (a relative path,
# so it is resolved against the current working directory).
save_graphs_to_yaml(graphs_converted, 'NodeID_updated_graphs.yaml')