In [1]:
import networkx as nx
import os
import pybel
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from se_kge.get_url_requests import cid_to_synonyms, get_gene_names
from se_kge.graph_preprocessing import get_drugbank_graph, get_sider_graph, combine_pubchem_drugbank

# Building SIDER and DrugBank graphs

In [5]:
# Build the SIDER graph with the helper imported from se_kge.graph_preprocessing.
# The recorded progress bars show it processing "side effects" and "indications" entries.
sider_graph = get_sider_graph()

side effects: 100%|██████████| 308948/308948 [01:14<00:00, 4132.80it/s]
indications: 100%|██████████| 30794/30794 [00:27<00:00, 1100.30it/s]


In [3]:
# Build the DrugBank graph with the helper imported from se_kge.graph_preprocessing.
# The recorded progress bar describes this step as mapping drug-protein interactions to BEL.
drugbank_graph = get_drugbank_graph()

Mapping drug-protein interactions to BEL: 100%|██████████| 25199/25199 [02:55<00:00, 143.21it/s]


# Combining both graphs into a complete graph

In [6]:
# Path to the DrugBank <-> PubChem mapping file, resolved relative to the parent of the
# current working directory (../resources/).
drugbank_pubchem_mapping = os.path.join(os.pardir, "resources", "drugbank_pubchem_mapping.tsv")

In [7]:
# Combine the DrugBank and SIDER graphs into a single graph, using the mapping file above.
# Per the recorded output, nodes that could not be relabeled are removed during this step.
full_graph = combine_pubchem_drugbank(drugbank_pubchem_mapping, drugbank_graph, sider_graph)

create pubchem-drugbank mapping dictionary: 6194it [00:00, 8523.39it/s]
Removing nodes that were not relabeled: 100%|██████████| 1513/1513 [00:00<00:00, 19877.97it/s]

Number of nodes that were not relabeled 1513





The number of nodes in the combined graph is 14463


In [10]:
# Tally SIDER nodes by namespace: nodes in the 'umls' namespace are counted as
# side effects, every other node is counted as a drug.
sider_namespaces = [node.namespace for node in sider_graph.nodes()]
sider_side_effects = sum(namespace == 'umls' for namespace in sider_namespaces)
sider_drugs = len(sider_namespaces) - sider_side_effects
print('SIDER graph has a total of %d nodes' % sider_graph.number_of_nodes())
print('The number of Side Effects nodes is %d' % sider_side_effects)
print('The number of Drugs nodes is %d' % sider_drugs)

SIDER graph has a total of 8497 nodes
The number of Side Effects nodes is 6990
The number of Drugs nodes is 1507


In [11]:
# Tally DrugBank nodes by namespace: nodes in the 'uniprot' namespace are counted as
# proteins, every other node is counted as a drug.
drugbank_namespaces = [node.namespace for node in drugbank_graph.nodes()]
drugbank_proteins = sum(namespace == 'uniprot' for namespace in drugbank_namespaces)
drugbank_drugs = len(drugbank_namespaces) - drugbank_proteins
print('DrugBank Graph has a total of %d nodes' % drugbank_graph.number_of_nodes())
print('The number of Proteins nodes is %d' % drugbank_proteins)
print('The number of drugs nodes is %d' % drugbank_drugs)

DrugBank Graph has a total of 7988 nodes
The number of Proteins nodes is 2778
The number of drugs nodes is 5210


In [12]:
# Tally the combined graph's nodes by namespace: 'uniprot' -> proteins,
# 'umls' -> side effects, anything else -> drugs.
full_graph_namespaces = [node.namespace for node in full_graph.nodes()]
proteins = sum(namespace == 'uniprot' for namespace in full_graph_namespaces)
side_effects = sum(namespace == 'umls' for namespace in full_graph_namespaces)
drugs = len(full_graph_namespaces) - proteins - side_effects
print('The combined graph has a total of %d nodes' % full_graph.number_of_nodes())
print('The number of Proteins nodes is %d' % proteins)
print('The number of Side Effects nodes is %d' % side_effects)
print('The number of Drugs nodes is %d' % drugs)

The combined graph has a total of 14463 nodes
The number of Proteins nodes is 2778
The number of Side Effects nodes is 6990
The number of Drugs nodes is 4695


In [13]:
# Persist the SIDER graph to ../resources/sider_graph.pickle.
pybel.to_pickle(sider_graph, os.path.join(os.pardir, "resources", "sider_graph.pickle"))

In [14]:
# Persist the DrugBank graph to ../resources/drugbank_graph.pickle.
pybel.to_pickle(drugbank_graph, os.path.join(os.pardir, "resources", "drugbank_graph.pickle"))

In [15]:
# Persist the combined graph (no chemical similarity edges yet) to
# ../resources/fullgraph_without_sim.pickle; later cells reload it from this path.
pybel.to_pickle(full_graph, os.path.join(os.pardir, "resources", "fullgraph_without_sim.pickle"))

# Mapping the nodes and relabeling them

In [2]:
# Reload the combined graph from the pickle written earlier in this notebook.
# NOTE(review): the execution counter restarts at this cell, which suggests it was run in a
# fresh kernel; on a fresh kernel the import cell must be run first.
# Unpickling can execute arbitrary code, so only load pickles produced by this pipeline.
full_graph = pybel.from_pickle(os.path.join(os.pardir, "resources", "fullgraph_without_sim.pickle"))

In [3]:
# Map every node of the combined graph to a 1-based integer id, assigned in the graph's
# node iteration order. enumerate() replaces the hand-maintained counter, so no stray
# counter variable is left behind in the notebook namespace.
relabel_fullgraph = {
    node: node_id
    for node_id, node in enumerate(full_graph.nodes(), start=1)
}

In [6]:
# Build one (node_id, namespace, identifier, name) record per node of the combined graph.
#  - 'pubchem' nodes: the name comes from cid_to_synonyms(), whose bytes result is decoded as UTF-8.
#  - all other nodes: the name is the node's own `name` attribute.
#  - 'uniprot' identifiers are additionally collected in protein_list for the gene-name lookup.
node_mapping_list = []
protein_list = []
for node, node_id in tqdm(relabel_fullgraph.items()):
    namespace = node.namespace
    identifier = node.identifier
    if namespace == 'pubchem':
        name = cid_to_synonyms(identifier).decode("utf-8")
    else:
        name = node.name
        if namespace == 'uniprot':
            protein_list.append(identifier)
    node_mapping_list.append((node_id, namespace, identifier, name))
node_mapping_df = pd.DataFrame(
    node_mapping_list,
    columns=['node_id', 'namespace', 'identifier', 'name'],
)

HBox(children=(IntProgress(value=0, max=14463), HTML(value='')))

In [7]:
# Look up gene names for the collected protein identifiers.
# The result is iterated with .items() below as protein identifier -> gene name pairs.
protein_to_gene = get_gene_names(protein_list)

In [9]:
# Replace the 'name' of every row whose identifier matches a looked-up protein with the
# corresponding gene name. Only the 'name' column is written, so the identifier column
# can be fetched once up front.
identifier_column = node_mapping_df['identifier']
for protein, gene in protein_to_gene.items():
    matches_protein = identifier_column == protein
    node_mapping_df.loc[matches_protein, 'name'] = gene

In [10]:
# Preview the first rows of the node mapping table.
node_mapping_df.head()

Unnamed: 0,node_id,namespace,identifier,name
0,1,pubchem,85,(3-carboxy-2-hydroxypropyl)-trimethylazanium
1,2,umls,C0000729,Abdominal cramps
2,3,umls,C0000737,Abdominal pain
3,4,umls,C0687713,Gastrointestinal pain
4,5,umls,C0002418,Amblyopia


In [12]:
# Persist the node mapping table. The target file has a .tsv extension, so it is written
# tab-separated; the pandas default separator is a comma, which would produce a CSV file
# under a .tsv name. The row index is not written.
# NOTE(review): readers of this file must use sep='\t' — confirm downstream loaders.
node_mapping_df.to_csv(
    os.path.join(os.pardir, "resources", "fullgraph_nodes_mapping.tsv"),
    sep='\t',
    index=False,
)

In [13]:
# Relabel the combined graph's nodes with the integer ids from relabel_fullgraph.
# The result is bound to a new name; full_graph itself is not rebound here.
fullgraph_id = nx.relabel_nodes(full_graph, relabel_fullgraph)

In [14]:
# Write the integer-labelled combined graph as an edge list; data=False omits edge attributes.
nx.write_edgelist(fullgraph_id, os.path.join(os.pardir, "resources", "fullgraph_without_sim.edgelist"), data=False)

# Combining the full graph with chemical similarity graph
Note: The chemical similarities are calculated, and the chemical similarity graph is created, in a separate notebook because that step requires the RDKit package and its own environment.

In [15]:
# Load the chemical similarity graph from ../resources/chem_sim_graph.pickle.
# This notebook does not create that file; it must already exist.
# Unpickling can execute arbitrary code, so only load pickles produced by this pipeline.
chem_sim_graph = pybel.from_pickle(os.path.join(os.pardir, "resources", "chem_sim_graph.pickle"))

In [30]:
# Relabel the chemical similarity graph with the same node -> integer id mapping that was
# built from the combined graph, so both graphs share one id space.
# NOTE(review): a similarity-graph node that is not a key of relabel_fullgraph would
# presumably keep its original label — confirm every node here also exists in full_graph.
chem_sim_relabeled = nx.relabel_nodes(chem_sim_graph, relabel_fullgraph)

In [32]:
# Write the relabeled chemical similarity graph as an edge list; data=False omits edge attributes.
nx.write_edgelist(chem_sim_relabeled, os.path.join(os.pardir, "resources", "chem_sim_graph.edgelist"), data=False)

In [33]:
# Write chem_sim_graph back to ../resources/chem_sim_graph.pickle.
# NOTE(review): chem_sim_graph was loaded from this same path earlier in the notebook and is
# not reassigned in between, so this write looks redundant — confirm whether a different
# object or target path was intended.
pybel.to_pickle(chem_sim_graph, os.path.join(os.pardir, "resources", "chem_sim_graph.pickle"))

In [34]:
# Combine the integer-labelled combined graph with the integer-labelled chemical similarity
# graph using the graph objects' `+` operator.
fullgraph_with_chemsim = fullgraph_id + chem_sim_relabeled

In [37]:
# Write the integer-labelled graph including chemical similarity edges as an edge list;
# data=False omits edge attributes.
nx.write_edgelist(fullgraph_with_chemsim, os.path.join(os.pardir, "resources", "fullgraph_with_chemsim.edgelist"), data=False)

In [18]:
# Load the combined graph (original node labels, no similarity edges) from the pickle
# written earlier in this notebook.
# NOTE(review): the execution counter of this cell is lower than that of the preceding cell,
# so the cells were not run in top-to-bottom order — confirm the notebook survives Run All.
fullgraph_without_chemsim = pybel.from_pickle(os.path.join(os.pardir, "resources", "fullgraph_without_sim.pickle"))

In [19]:
# Combine the two graphs again, this time without integer relabeling, so the result can be
# pickled with its original node labels.
# Note: this rebinds fullgraph_with_chemsim, which earlier in the notebook held the
# integer-labelled combination.
fullgraph_with_chemsim = fullgraph_without_chemsim + chem_sim_graph

In [20]:
# Persist the combined graph including chemical similarity edges to
# ../resources/fullgraph_with_chemsim.pickle.
pybel.to_pickle(fullgraph_with_chemsim, os.path.join(os.pardir, "resources", "fullgraph_with_chemsim.pickle"))