In [1]:
import networkx as nx
import os
import pybel
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from se_kge.get_url_requests import cid_to_synonyms, get_gene_names
from se_kge.graph_preprocessing import get_drugbank_graph, get_sider_graph, combine_pubchem_drugbank, create_graph_mapping

In [2]:
# Directory for the base graph artifacts that the cells below write and read.
# The path is relative: os.pardir resolves against the current working directory.
basic_resources = os.path.join(os.pardir, "resources", "basic_graphs")

# Building SIDER and DrugBank graphs

In [3]:
# Build the SIDER graph with the project helper. In the recorded run this showed
# two progress phases ("side effects", "indications") taking roughly 18 and 3 minutes.
sider_graph = get_sider_graph()

side effects: 100%|████████████████████████████████████████████████████████| 1853688/1853688 [18:20<00:00, 1684.36it/s]
indications: 100%|███████████████████████████████████████████████████████████| 184764/184764 [02:44<00:00, 1126.19it/s]


In [4]:
# Build the DrugBank graph with the project helper. In the recorded run the
# "Mapping drug-protein interactions to BEL" phase took roughly 9 minutes.
drugbank_graph = get_drugbank_graph()

Mapping drug-protein interactions to BEL: 100%|███████████████████████████████| 116911/116911 [08:51<00:00, 219.99it/s]


In [5]:
# Tally the SIDER nodes: anything in the 'umls' namespace counts as a side
# effect, and every remaining node counts as a drug.
sider_total = len(sider_graph.nodes())
sider_side_effects = sum(
    1 for sider_node in sider_graph.nodes() if sider_node.namespace == 'umls'
)
sider_drugs = sider_total - sider_side_effects

print('SIDER graph has a total of %d nodes' % sider_total)
print('The number of Side Effects nodes is %d' % sider_side_effects)
print('The number of Drugs nodes is %d' % sider_drugs)

SIDER graph has a total of 8497 nodes
The number of Side Effects nodes is 6990
The number of Drugs nodes is 1507


In [6]:
# Tally the DrugBank nodes: anything in the 'uniprot' namespace counts as a
# protein, and every remaining node counts as a drug.
drugbank_total = len(drugbank_graph.nodes())
drugbank_proteins = sum(
    1 for drugbank_node in drugbank_graph.nodes() if drugbank_node.namespace == 'uniprot'
)
drugbank_drugs = drugbank_total - drugbank_proteins

print('DrugBank Graph has a total of %d nodes' % drugbank_total)
print('The number of Proteins nodes is %d' % drugbank_proteins)
print('The number of drugs nodes is %d' % drugbank_drugs)

DrugBank Graph has a total of 8059 nodes
The number of Proteins nodes is 2781
The number of drugs nodes is 5278


In [None]:
# Persist the SIDER graph under the base resources directory so it can be
# loaded back from disk by path.
pybel.to_pickle(
    sider_graph,
    os.path.join(basic_resources, "sider_graph.pickle"),
)

In [None]:
# Persist the DrugBank graph under the base resources directory so it can be
# loaded back from disk by path.
pybel.to_pickle(
    drugbank_graph,
    os.path.join(basic_resources, "drugbank_graph.pickle"),
)

# Combining both graphs into a complete graph

In [7]:
# Path of the pickled SIDER graph; passed to combine_pubchem_drugbank below as sider_graph_path.
sider_graph_path = os.path.join(basic_resources, "sider_graph.pickle")

In [8]:
# Path of the pickled DrugBank graph; passed to combine_pubchem_drugbank below as drugbank_graph_path.
drugbank_graph_path = os.path.join(basic_resources, "drugbank_graph.pickle")

In [9]:
# Path of the DrugBank/PubChem mapping file; passed to combine_pubchem_drugbank
# below as mapping_path.
drugbank_pubchem_mapping = os.path.join(
    os.pardir,
    "resources",
    "mapping",
    "drugbank_pubchem_mapping.tsv",
)

In [10]:
# Combine the two pickled graphs into one, using the mapping file to reconcile
# drug identifiers. The recorded run logged a "Removing nodes that were not
# relabeled" phase over 770 nodes.
full_graph = combine_pubchem_drugbank(
    sider_graph_path=sider_graph_path,
    drugbank_graph_path=drugbank_graph_path,
    mapping_path=drugbank_pubchem_mapping,
)

create pubchem-drugbank mapping dictionary: 8540it [00:05, 1576.08it/s]
Removing nodes that were not relabeled: 100%|██████████████████████████████████████| 770/770 [00:00<00:00, 4106.70it/s]


In [11]:
# Tally the combined graph by namespace: 'uniprot' nodes are proteins, 'umls'
# nodes are side effects, and every remaining node is counted as a drug.
full_namespaces = [full_node.namespace for full_node in full_graph.nodes()]
full_total = len(full_graph.nodes())

proteins = full_namespaces.count('uniprot')
side_effects = full_namespaces.count('umls')
drugs = full_total - proteins - side_effects

print('The combined graph has a total of %d nodes' % full_total)
print('The number of Proteins nodes is %d' % proteins)
print('The number of Side Effects nodes is %d' % side_effects)
print('The number of Drugs nodes is %d' % drugs)

The combined graph has a total of 15024 nodes
The number of Proteins nodes is 2596
The number of Side Effects nodes is 6990
The number of Drugs nodes is 5438


In [None]:
# Persist the combined graph (without chemical-similarity edges); the mapping
# step reads it back from this path.
pybel.to_pickle(
    full_graph,
    os.path.join(basic_resources, "fullgraph_without_sim.pickle"),
)

# Mapping the nodes and relabeling them

In [None]:
# Run the project's mapping helper on the pickled combined graph. It returns a
# pair: a graph object (written out as an edge list further down) and a table
# of node information (previewed with .head() and saved with .to_csv below).
fullgraph_id, node_mapping_df = create_graph_mapping(
    graph_path=os.path.join(basic_resources, "fullgraph_without_sim.pickle"),
)

In [None]:
# Preview the first rows of the node mapping table as the cell's output.
node_mapping_df.head()

In [None]:
# Save the node mapping table as a tab-separated file, without the index column.
node_mapping_df.to_csv(
    os.path.join(os.pardir, "resources", "mapping", "fullgraph_nodes_mapping.tsv"),
    sep='\t',
    index=False,
)

In [None]:
# Write the relabeled combined graph as a plain edge list; data=False omits
# the edge attributes from the output.
nx.write_edgelist(
    fullgraph_id,
    os.path.join(basic_resources, "fullgraph_without_sim.edgelist"),
    data=False,
)

# Combining the full graph with the chemical similarity graph
Note: The chemical similarities are calculated and the similarity graph is built in a separate notebook, because that step requires the RDKit package and its own environment.

In [None]:
# Directory from which the chemical-similarity graph is loaded and into which
# the combined graphs and edge lists below are written.
chemsim_resources = os.path.join(os.pardir, "resources", "chemsim_50_graphs")

In [44]:
# Load the precomputed chemical-similarity graph from its pickle.
# NOTE(review): unpickling can execute arbitrary code; confirm this file is
# always produced by the project's own chemical-similarity notebook.
chem_sim_graph = pybel.from_pickle(
    os.path.join(chemsim_resources, "chem_sim_graph_50.pickle"),
)

In [45]:
# Merge the combined graph with the chemical-similarity graph via the `+` operator.
# NOTE(review): the merge semantics of `+` are defined by the graph class, not by
# this notebook — confirm against the pybel documentation.
fullgraph_with_chemsim = full_graph + chem_sim_graph

In [46]:
# Persist the combined graph that includes the chemical-similarity edges.
pybel.to_pickle(
    fullgraph_with_chemsim,
    os.path.join(chemsim_resources, "fullgraph_with_chemsim_50.pickle"),
)

In [47]:
# Relabel the chemical-similarity graph's nodes using the `relabel_fullgraph` mapping.
# FIXME(review): `relabel_fullgraph` is not assigned in any cell visible in this
# notebook, so this cell raises NameError after Restart Kernel -> Run All. It
# presumably maps original nodes to the ids used in `fullgraph_id`; define it
# explicitly (or have create_graph_mapping return it) before this cell — TODO confirm.
chem_sim_relabeled = nx.relabel_nodes(chem_sim_graph, relabel_fullgraph)

In [48]:
# Write the relabeled chemical-similarity graph as a plain edge list; data=False
# omits the edge attributes from the output.
nx.write_edgelist(
    chem_sim_relabeled,
    os.path.join(chemsim_resources, "chem_sim_graph_50.edgelist"),
    data=False,
)

In [49]:
# Merge the relabeled combined graph with the relabeled chemical-similarity graph via `+`.
# NOTE(review): `fullgraph_id` comes from the create_graph_mapping cell, which has no
# recorded execution count in this notebook — make sure that cell runs before this one.
fullgraph_with_chemsim_relabeled = fullgraph_id + chem_sim_relabeled

In [50]:
# Write the relabeled combined graph (including chemical-similarity edges) as a
# plain edge list; data=False omits the edge attributes from the output.
nx.write_edgelist(
    fullgraph_with_chemsim_relabeled,
    os.path.join(chemsim_resources, "fullgraph_with_chemsim_50.edgelist"),
    data=False,
)