In [1]:
import networkx as nx
import os
import pybel
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from se_kge.get_url_requests import cid_to_synonyms, get_gene_names
from se_kge.graph_preprocessing import get_drugbank_graph, get_sider_graph, combine_pubchem_drugbank

# Building SIDER and DrugBank graphs

In [2]:
# Build the SIDER graph with the project helper. The recorded run below shows
# two progress bars ("side effects" and "indications") and takes a few minutes.
sider_graph = get_sider_graph()

side effects: 100%|██████████| 308948/308948 [01:45<00:00, 2928.78it/s]
indications: 100%|██████████| 30794/30794 [00:43<00:00, 704.49it/s] 


In [3]:
# Build the DrugBank graph with the project helper. The recorded run below shows
# a "Mapping drug-protein interactions to BEL" pass of roughly five minutes.
drugbank_graph = get_drugbank_graph()

Mapping drug-protein interactions to BEL: 100%|██████████| 25199/25199 [05:12<00:00, 80.75it/s] 


# Combining both graphs into a complete graph

In [4]:
# Location of the DrugBank/PubChem mapping table, relative to this notebook's
# parent directory.
resources_dir = os.path.join(os.pardir, "resources")
drugbank_pubchem_mapping = os.path.join(resources_dir, "drugbank_pubchem_mapping.tsv")

In [5]:
# Merge the DrugBank and SIDER graphs using the DrugBank/PubChem mapping file.
# The recorded run below reports 1513 nodes that were not relabeled (and were
# removed), leaving 13669 nodes in the combined graph.
full_graph = combine_pubchem_drugbank(drugbank_pubchem_mapping, drugbank_graph, sider_graph)

create pubchem-drugbank mapping dictionary: 6194it [00:00, 9506.94it/s]
Removing nodes that were not relabeled: 100%|██████████| 1513/1513 [00:00<00:00, 21825.42it/s]

Number of nodes that were not relabeled 1513





The number of nodes in the combined graph is 13669


In [6]:
# Tally SIDER nodes by namespace: 'umls' nodes are counted as side effects and
# every other node is counted as a drug.
sider_total = len(sider_graph.nodes())
sider_side_effects = sum(1 for node in sider_graph.nodes() if node.namespace == 'umls')
sider_drugs = sider_total - sider_side_effects
print('SIDER graph has a total of %d nodes' % sider_total)
print('The number of Side Effects nodes is %d' % sider_side_effects)
print('The number of Drugs nodes is %d' % sider_drugs)

SIDER graph has a total of 8497 nodes
The number of Side Effects nodes is 6990
The number of Drugs nodes is 1507


In [7]:
# Tally DrugBank nodes by namespace: 'uniprot' nodes are counted as proteins and
# every other node is counted as a drug.
drugbank_namespaces = [node.namespace for node in drugbank_graph.nodes()]
drugbank_proteins = drugbank_namespaces.count('uniprot')
drugbank_drugs = len(drugbank_namespaces) - drugbank_proteins
print('DrugBank Graph has a total of %d nodes' % len(drugbank_namespaces))
print('The number of Proteins nodes is %d' % drugbank_proteins)
print('The number of drugs nodes is %d' % drugbank_drugs)

DrugBank Graph has a total of 7988 nodes
The number of Proteins nodes is 2778
The number of drugs nodes is 5210


In [8]:
# Tally combined-graph nodes by namespace: 'uniprot' -> proteins,
# 'umls' -> side effects, anything else -> drugs.
full_graph_namespaces = [node.namespace for node in full_graph.nodes()]
proteins = full_graph_namespaces.count('uniprot')
side_effects = full_graph_namespaces.count('umls')
drugs = len(full_graph_namespaces) - proteins - side_effects
print('The combined graph has a total of %d nodes' % len(full_graph_namespaces))
print('The number of Proteins nodes is %d' % proteins)
print('The number of Side Effects nodes is %d' % side_effects)
print('The number of Drugs nodes is %d' % drugs)

The combined graph has a total of 13669 nodes
The number of Proteins nodes is 1984
The number of Side Effects nodes is 6990
The number of Drugs nodes is 4695


In [None]:
# Persist the SIDER graph under the resources directory.
sider_pickle_path = os.path.join(os.pardir, "resources", "sider_graph.pickle")
pybel.to_pickle(sider_graph, sider_pickle_path)

In [None]:
# Persist the DrugBank graph under the resources directory.
drugbank_pickle_path = os.path.join(os.pardir, "resources", "drugbank_graph.pickle")
pybel.to_pickle(drugbank_graph, drugbank_pickle_path)

In [None]:
# Persist the combined graph (without chemical-similarity edges) under the
# resources directory.
full_graph_pickle_path = os.path.join(os.pardir, "resources", "fullgraph_without_sim.pickle")
pybel.to_pickle(full_graph, full_graph_pickle_path)

# Mapping the nodes and relabeling them

In [None]:
# Reload the combined graph from the pickle written earlier in this notebook.
# This rebinds `full_graph`; presumably it lets this section start from the
# saved file without rebuilding the graphs.
full_graph = pybel.from_pickle(os.path.join(os.pardir, "resources", "fullgraph_without_sim.pickle"))

In [None]:
# Assign each node a consecutive integer id, starting at 1, in the graph's
# node-iteration order.
relabel_fullgraph = {
    node: node_number
    for node_number, node in enumerate(full_graph.nodes(), start=1)
}

In [None]:
# Build one (node_id, namespace, identifier, name) record per node.
# PubChem nodes take their name from cid_to_synonyms (decoded from UTF-8 bytes);
# UniProt identifiers are also collected in protein_list for the gene-name lookup.
node_mapping_list = []
protein_list = []
for bel_node, numeric_id in tqdm(relabel_fullgraph.items()):
    label = bel_node.name
    namespace = bel_node.namespace
    identifier = bel_node.identifier
    if namespace == 'pubchem':
        label = cid_to_synonyms(identifier).decode("utf-8")
    elif namespace == 'uniprot':
        protein_list.append(identifier)
    node_mapping_list.append((numeric_id, namespace, identifier, label))
node_mapping_columns = ['node_id', 'namespace', 'identifier', 'name']
node_mapping_df = pd.DataFrame(node_mapping_list, columns=node_mapping_columns)

HBox(children=(IntProgress(value=0, max=13669), HTML(value='')))

In [None]:
# Look up gene names for the collected UniProt identifiers. The result is
# consumed below as a mapping of protein identifier -> gene name.
protein_to_gene = get_gene_names(protein_list)

In [None]:
# Overwrite the 'name' of every row whose identifier is a key of protein_to_gene
# with the mapped gene name. One vectorised lookup replaces the per-protein loop,
# which rebuilt a boolean mask over the whole table for every protein.
has_gene_name = node_mapping_df['identifier'].isin(list(protein_to_gene))
node_mapping_df.loc[has_gene_name, 'name'] = (
    node_mapping_df.loc[has_gene_name, 'identifier'].map(protein_to_gene)
)

In [None]:
# Preview the first rows of the node mapping table after the gene-name update.
node_mapping_df.head()

In [None]:
# Write the node mapping table as tab-separated text so the contents match the
# ".tsv" extension (pandas defaults to a comma separator).
# NOTE(review): any reader of this file must parse it with sep="\t".
node_mapping_df.to_csv(
    os.path.join(os.pardir, "resources", "fullgraph_nodes_mapping.tsv"),
    sep="\t",
    index=False,
)

In [None]:
# Relabel every node of the combined graph with its integer id from
# relabel_fullgraph.
fullgraph_id = nx.relabel_nodes(full_graph, relabel_fullgraph)

In [None]:
# Export the integer-labelled combined graph as a plain edge list (no edge data).
fullgraph_edgelist_path = os.path.join(os.pardir, "resources", "fullgraph_without_sim.edgelist")
nx.write_edgelist(fullgraph_id, fullgraph_edgelist_path, data=False)

# Combining the full graph with chemical similarity graph
Note: The chemical similarities are calculated, and the chemical similarity graph is created, in a separate notebook because that step requires the RDKit package and its own environment.

In [None]:
# Load the pre-computed chemical similarity graph from the resources directory.
# NOTE(review): the "70" in the file name presumably denotes the similarity
# threshold used when the graph was built -- confirm against the notebook that
# creates it.
chem_sim_graph = pybel.from_pickle(os.path.join(os.pardir, "resources", "chem_sim_graph_70.pickle"))

In [None]:
# Relabel the similarity graph with the same integer ids as the combined graph.
# NOTE(review): networkx leaves nodes that are missing from the mapping
# unchanged, so any similarity node absent from relabel_fullgraph would keep its
# original label -- confirm every node of chem_sim_graph is in the combined graph.
chem_sim_relabeled = nx.relabel_nodes(chem_sim_graph, relabel_fullgraph)

In [None]:
# Export the relabelled chemical similarity graph as a plain edge list (no edge data).
chem_sim_edgelist_path = os.path.join(os.pardir, "resources", "chem_sim_graph_70.edgelist")
nx.write_edgelist(chem_sim_relabeled, chem_sim_edgelist_path, data=False)

In [None]:
# Combine the integer-labelled combined graph with the integer-labelled
# similarity graph.
# NOTE(review): this relies on the graph class defining `+`; presumably it is
# pybel's graph union -- confirm.
fullgraph_with_chemsim = fullgraph_id + chem_sim_relabeled

In [None]:
# Export the combined graph including chemical-similarity edges as a plain edge
# list (no edge data).
fullgraph_chemsim_edgelist_path = os.path.join(os.pardir, "resources", "fullgraph_with_chemsim.edgelist")
nx.write_edgelist(fullgraph_with_chemsim, fullgraph_chemsim_edgelist_path, data=False)

In [None]:
# Load the combined graph (original node labels, no similarity edges) from disk.
# NOTE(review): this reads the same pickle that `full_graph` was loaded from
# earlier in the notebook; the second load looks redundant.
fullgraph_without_chemsim = pybel.from_pickle(os.path.join(os.pardir, "resources", "fullgraph_without_sim.pickle"))

In [None]:
# Combine the originally-labelled combined graph with the originally-labelled
# similarity graph.
# NOTE(review): this rebinds `fullgraph_with_chemsim`, which earlier in the
# notebook held the integer-labelled combination; a distinct name would avoid
# confusion when cells are re-run out of order.
fullgraph_with_chemsim = fullgraph_without_chemsim + chem_sim_graph

In [None]:
# Persist the combined graph including chemical-similarity edges under the
# resources directory.
fullgraph_chemsim_pickle_path = os.path.join(os.pardir, "resources", "fullgraph_with_chemsim_70.pickle")
pybel.to_pickle(fullgraph_with_chemsim, fullgraph_chemsim_pickle_path)