In [108]:
from defusedxml import ElementTree
from seffnet.get_url_requests import cid_to_inchikey, cid_to_smiles, cid_to_synonyms, get_gene_names, inchikey_to_cid
from tqdm import tqdm_notebook as tqdm
import pandas as pd
import numpy as np
import joblib
from seffnet.constants import DEFAULT_EMBEDDINGS_PATH, DEFAULT_PREDICTIVE_MODEL_PATH

In [10]:
# Parse the local DrugBank XML export and collect every non-biotech drug that
# is listed in the "approved" group and whose first listed patent approval
# date falls in 2015 or later. The result is one row per drug in `mapping_df`.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
tree = ElementTree.parse("C:\\Users\\aldis\\Downloads\\full database.xml")
root = tree.getroot()
ns = '{http://www.drugbank.ca}'
drug_group_template = "{ns}groups/{ns}group"
approval_template = "{ns}patents/{ns}patent/{ns}approved"
# The namespaced paths are the same for every drug, so build them once.
drug_group_path = drug_group_template.format(ns=ns)
approval_path = approval_template.format(ns=ns)
mapping_list = []
# Iterating over `root` directly (no enumerate) lets tqdm read len(root) and
# show a real total; the loop index was never used.
for drug in tqdm(root, desc="Getting DrugBank info"):
    assert drug.tag == ns + 'drug'
    if drug.attrib['type'] == "biotech":
        continue
    # findtext() would only return the text of the FIRST <group>; check all of
    # them so a drug is kept whenever "approved" appears anywhere in its groups.
    if not any(group.text == 'approved' for group in drug.findall(drug_group_path)):
        continue
    # First <approved> date found under <patents>/<patent>.
    drug_approval = drug.findtext(approval_path)
    # None  -> no such element; '' -> element present but empty, which would
    # otherwise make int('') below raise ValueError.
    if not drug_approval:
        continue
    # The first four characters are the year (dates look like "2017-06-27").
    if int(drug_approval[:4]) < 2015:
        continue
    name = drug.findtext(ns + "name")
    drugbank_id = drug.findtext(ns + "drugbank-id")
    indication = drug.findtext(ns + "indication")
    mapping_list.append((drugbank_id, name, drug_approval, indication))
# NOTE: the 'approval year' column holds the full date string, not just the year.
mapping_df = pd.DataFrame(
    mapping_list,
    columns=['drugbank_id', 'name', 'approval year','indication']
)

HBox(children=(IntProgress(value=1, bar_style='info', description='Getting DrugBank info', max=1, style=Progre…




In [43]:
# Display the DrugBank table (drugbank_id, name, approval year, indication).
mapping_df

Unnamed: 0,drugbank_id,name,approval year,indication
0,DB00067,Vasopressin,2017-06-27,"For the treatment of enuresis, polyuria, diabe..."
1,DB00264,Metoprolol,2017-07-11,Metoprolol is indicated for the treatment of a...
2,DB00310,Chlorthalidone,2015-10-27,Chlorthalidone is indicated in the management ...
3,DB00328,Indometacin,2015-07-28,Oral indometacin is indicated for symptomatic ...
4,DB00377,Palonosetron,2015-09-08,For the prevention of acute and delayed nausea...
...,...,...,...,...
62,DB14048,Sodium zirconium cyclosilicate,2017-03-14,Sodium zirconium cyclosilicate is a potassium ...
63,DB14185,Aripiprazole lauroxil,2015-11-24,Aripiprazole lauroxil is indicated for the tre...
64,DB14554,Dotatate,2016-06-28,
65,DB14568,Ivosidenib,2016-10-25,Ivosidenib is approved for use in the treatmen...


In [52]:
# Load the tab-separated node mapping of the full graph.
# NOTE(review): absolute, machine-specific path — will not resolve on another machine.
fullgraph_df = pd.read_csv('C:\\Users\\aldis\\Documents\\GitHub\\seffnet\\resources\\mapping\\fullgraph_nodes_mapping.tsv', sep='\t')

In [53]:
# Collect (name, identifier) for every node whose type is 'phenotype'.
# Each row of the mapping is unpacked positionally into its five fields.
phenotypes = [
    (label, ident)
    for _node, _space, ident, label, kind in fullgraph_df.values
    if kind == 'phenotype'
]

In [54]:
# Link each drug to every phenotype whose name occurs (case-insensitively, as a
# plain substring) in the drug's indication text, and collect the matches in
# `edgelist_df`.
indications_list = []
# Lower-case each phenotype name once instead of once per (drug, phenotype)
# pair. Non-string names (e.g. NaN from the TSV) cannot be matched and are skipped.
phenotype_terms = [
    (pheno[0].lower(), pheno)
    for pheno in phenotypes
    if isinstance(pheno[0], str)
]
for iden, name, approval, indication in mapping_df.values:
    # The indication is None when the drug has no <indication> element (and may
    # be NaN after a round-trip through pandas); `.lower()` would raise on either.
    if not isinstance(indication, str):
        continue
    indication_lower = indication.lower()
    for term, pheno in phenotype_terms:
        if term in indication_lower:
            indications_list.append((iden, name, pheno[1], pheno[0], indication))
edgelist_df = pd.DataFrame(
    indications_list,
    columns=['drugbank_id', 'drug_name', 'phenotype_id', 'phenotype_name','indication']
)

In [57]:
# Write the candidate drug–phenotype pairs to a TSV file, without the index column.
# NOTE(review): absolute, machine-specific path.
edgelist_df.to_csv('C:\\Users\\aldis\\Documents\\GitHub\\seffnet\\resources\\evaluate_new_drug_indications.tsv', sep='\t', index=False)

In [62]:
# Load the "*_checked.tsv" file. This is a different file from the one written
# by the to_csv cell; presumably a manually reviewed copy of it — TODO confirm.
# The edge-building cell unpacks six columns per row from this frame.
testing_df = pd.read_csv('C:\\Users\\aldis\\Documents\\GitHub\\seffnet\\resources\\evaluate_new_drug_indications_checked.tsv', sep='\t')

In [80]:
# Load the node mapping of the full graph (same file that is read into
# `fullgraph_df` earlier in this notebook).
graph_df = pd.read_csv('C:\\Users\\aldis\\Documents\\GitHub\\seffnet\\resources\\mapping\\fullgraph_nodes_mapping.tsv', sep='\t')

In [81]:
# Load the chemicals mapping table.
# NOTE(review): `chemicals_df` is not referenced by any other cell visible here.
chemicals_df = pd.read_csv('C:\\Users\\aldis\\Documents\\GitHub\\seffnet\\resources\\mapping\\chemicals_mapping.tsv', sep='\t')

In [101]:
# Translate each checked (drug name, phenotype identifier) pair into a
# (source node_id, target node_id) edge using the graph node mapping.
# Pairs whose drug or phenotype has no node in the graph are skipped.
l = []
for _, name, pheno_id, _, _, _ in testing_df.values:
    # Look each side up once; the first matching node is used.
    source_ids = graph_df.loc[graph_df['name'] == name, 'node_id']
    target_ids = graph_df.loc[graph_df['identifier'] == pheno_id, 'node_id']
    # Test for "no match" with .empty: .any() checks whether the ids are truthy
    # (a node_id of 0 would look missing), and an unguarded .iloc[0] raises
    # IndexError when the phenotype is absent.
    if source_ids.empty or target_ids.empty:
        continue
    l.append((source_ids.iloc[0], target_ids.iloc[0]))

In [93]:
# Display the node mapping table (node_id, namespace, identifier, name, type).
graph_df

Unnamed: 0,node_id,namespace,identifier,name,type
0,1,pubchem.compound,10918,(3-Carboxy-2-(R)-Hydroxy-Propyl)-Trimethyl-Amm...,experimental drug
1,2,umls,C0000729,Abdominal cramps,phenotype
2,3,umls,C0000737,Abdominal pain,phenotype
3,4,umls,C0687713,Gastrointestinal pain,phenotype
4,5,umls,C0002418,Amblyopia,phenotype
...,...,...,...,...,...
17715,17716,pubchem.compound,6433164,Ferrous fumarate,approved drug
17716,17717,pubchem.compound,167159,Ferrous glycine sulfate,approved drug
17717,17718,pubchem.compound,636398,Hydrocortisone probutate,approved drug
17718,17719,pubchem.compound,8759,Edetate disodium anhydrous,approved drug


Next:
- load model and embeddings
- create the testing set from the embeddings
- get the true positive rate

In [107]:
from bionev.utils import load_embedding

In [109]:
# Load the node embeddings from the path in seffnet's DEFAULT_EMBEDDINGS_PATH.
# The feature-building cell indexes the result by str(node_id).
embeddings = load_embedding(DEFAULT_EMBEDDINGS_PATH)

In [110]:
# Load the trained predictive model from seffnet's DEFAULT_PREDICTIVE_MODEL_PATH.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load model files from a trusted source.
model = joblib.load(DEFAULT_PREDICTIVE_MODEL_PATH)

In [115]:
# Build one feature vector per edge: the element-wise product of the two
# endpoint embeddings, looked up by the string form of each node id.
x = [
    (np.array(embeddings[str(pair[0])]) * np.array(embeddings[str(pair[1])])).tolist()
    for pair in l
]
# Every collected edge is labelled 1.
y = [1] * len(x)

In [118]:
from sklearn.metrics import recall_score

In [124]:
# Predict a label for every edge feature vector in `x` with the loaded model.
y_pred = model.predict(x)

In [126]:
# `y` contains only 1s, so this recall is the fraction of the evaluated edges
# that the model predicts as positive (the true positive rate on this set).
recall_score(y, y_pred)

0.7419354838709677