In [1]:
# save the AUROC and AUPRC results for plotting
list_dict = []

In [2]:
# code block to track CPU time and Memory usage
import psutil
import time

start_memory = psutil.virtual_memory().available
start_time = time.time()

# 1.COVID-19 graph embedding alone

## 1.1. ranking model

In [3]:
import numpy as np
import pandas as pd
import time
import re
import math
import random
import pickle

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics
from sklearn.metrics import roc_auc_score, roc_curve, average_precision_score

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.nn.modules import Module
from torch.utils.data import Dataset, DataLoader

In [4]:
data_path='data/'
exp_id='v0'
device_id='cpu' #'cpu' if CPU, device number if GPU

In [5]:
device=torch.device(device_id)

In [6]:
le=pickle.load(open(data_path+'LabelEncoder_'+exp_id+'.pkl', 'rb'))
edge_index=pickle.load(open(data_path+'edge_index_'+exp_id+'.pkl','rb'))

In [7]:
types=np.array([item.split('_')[0] for item in le.classes_ ])

In [8]:
#label
trials=pd.read_excel(data_path+'literature-mining/All_trails_5_24.xlsx',header=1,index_col=0)
trials_drug=set([drug.strip().upper() for lst in trials.loc[trials['study_category'].apply(lambda x: 'drug' in x.lower()),'intervention'].apply(lambda x: re.split(r'[+|/|,]',x.replace(' vs. ', '/').replace(' vs ', '/').replace(' or ', '/').replace(' with and without ', '/').replace(' /wo ', '/').replace(' /w ', '/').replace(' and ', '/').replace(' - ', '/').replace(' (', '/').replace(') ', '/'))).values for drug in lst])
drug_labels=[1 if drug.split('_')[1] in trials_drug else 0 for drug in le.classes_[types=='drug'] ]

In [9]:
class Classifier(nn.Module):
    def __init__(self,embedding_dim):
        super(Classifier, self).__init__() 
        self.fc1=nn.Linear(embedding_dim,embedding_dim)
        self.fc2=nn.Linear(embedding_dim,1)
        self.bn=nn.BatchNorm1d(embedding_dim)
    def forward(self, x):
        residual1 = x
        x = F.dropout(x, training=self.training)
        x= self.bn(F.dropout(F.relu(self.fc1(x)),training=self.training))
        x += residual1  
        return self.fc2(x)

In [10]:
from torch.utils.data import BatchSampler, WeightedRandomSampler
class BPRLoss(nn.Module):
    def __init__(self, num_neg_samples):
        super(BPRLoss, self).__init__()
        self.num_neg_samples=num_neg_samples
    
    def forward(self, output, label):
        positive_output=output[label==1]
        negative_output=output[label!=1]
        
        #negative sample proportional to the high values
        negative_sampler=WeightedRandomSampler(negative_output-min(negative_output), num_samples=self.num_neg_samples*len(positive_output),replacement=True)
        negative_sample_output=negative_output[torch.tensor(list(BatchSampler(negative_sampler, batch_size=len(positive_output),drop_last=True)),dtype=torch.long).t()]
        return -(positive_output.view(-1,1)-negative_sample_output).sigmoid().log().mean()

In [11]:
z_np = pickle.load(open(data_path+'COVID_embedding_'+exp_id+'.pkl', 'rb'))

In [12]:
seed=70
indices = np.arange(len(drug_labels))
X_train, X_test, y_train, y_test,indices_train,indices_test=train_test_split(z_np[types=='drug'],drug_labels,indices, test_size=0.5,random_state=seed,)

In [13]:
#Variable wrapping for torch.tensor
_X_train, _y_train=Variable(torch.tensor(X_train,dtype=torch.float).to(device)), Variable(torch.tensor(y_train,dtype=torch.float).to(device))
_X_test, _y_test=Variable(torch.tensor(X_test,dtype=torch.float).to(device)), Variable(torch.tensor(y_test,dtype=torch.float).to(device))

In [14]:
embed_dim = 128
clf=Classifier(embedding_dim=embed_dim).to(device)
optimizer=torch.optim.Adam(clf.parameters())
criterion=BPRLoss(num_neg_samples=15)

In [15]:
best_auprc=0
for epoch in range(30):
    clf.train()
    optimizer.zero_grad()
    out = clf(_X_train)
    loss=criterion(out.squeeze(), _y_train)
    loss.backward()
    optimizer.step()   
    print('training loss',loss.item())

    clf.eval()
    print('test loss', criterion(clf(_X_test).squeeze(), _y_test).item())
    prob=torch.sigmoid(clf(_X_test)).cpu().detach().numpy().squeeze()
    auprc=metrics.average_precision_score(y_test,prob)
    if auprc>best_auprc:
        best_auproc=auprc
        torch.save(clf, data_path+'nn_clf_covid_embedding.pt')

training loss 0.8103823661804199
test loss 0.6932399272918701
training loss 0.8063710927963257
test loss 0.6932365298271179
training loss 0.8113760948181152
test loss 0.6932251453399658
training loss 0.8406867980957031
test loss 0.6931976079940796
training loss 0.7960447669029236
test loss 0.6932141184806824
training loss 0.7339330315589905
test loss 0.6932159662246704
training loss 0.7969534397125244
test loss 0.6932098269462585
training loss 0.7890408635139465
test loss 0.693217933177948
training loss 0.9003066420555115
test loss 0.6931884288787842
training loss 0.822611391544342
test loss 0.6931594610214233
training loss 0.8152236342430115
test loss 0.693177342414856
training loss 0.8225076198577881
test loss 0.6930989623069763
training loss 0.728827714920044
test loss 0.69306880235672
training loss 0.7472310066223145
test loss 0.6930496692657471
training loss 0.7640727162361145
test loss 0.6930521726608276
training loss 0.7679376006126404
test loss 0.6929921507835388
training loss 

In [16]:
clf.load_state_dict(torch.load(data_path+'nn_clf_covid_embedding.pt').state_dict())

<All keys matched successfully>

In [17]:
#Compute AUC
clf.eval()

prob=torch.sigmoid(clf(_X_test)).cpu().detach().numpy().squeeze()
auroc = metrics.roc_auc_score(y_test,prob)
auprc = metrics.average_precision_score(y_test,prob)
print("AUROC", auroc)
print("AUPRC", auprc)

list_dict.append({
    'embed_method': 'COVID-19 alone',
    'embed_dim': embed_dim,
    'pred_model': 'neural network ranking',
    'AUROC': auroc,
    'AUPRC': auprc
})

AUROC 0.6552501214943016
AUPRC 0.08606489606331079


In [18]:
# code block to track CPU / Memory usage
cpu_sec = time.time() - start_time
start_time = time.time()
end_memory = psutil.virtual_memory().available
mem_use = (start_memory - end_memory) / (1024.0 ** 2)
start_memory = end_memory
print(f'CPU time: {cpu_sec:.2f} sec')
print(f'Memory usage: {mem_use:.2f} MB')

CPU time: 3.91 sec
Memory usage: 44.25 MB


## 1.2. baseline models

In [19]:
clf=LogisticRegression().fit(X_train,y_train)
auroc = roc_auc_score(y_test,clf.predict_proba(X_test)[:,1])
auprc = average_precision_score(y_test,clf.predict_proba(X_test)[:,1])
print("Logit AUROC", auroc)
print("Logit AUPRC", auprc)
list_dict.append({
    'embed_method': 'COVID-19 alone',
    'embed_dim': embed_dim,
    'pred_model': 'logistic regression',
    'AUROC': auroc,
    'AUPRC': auprc
})

clf=GradientBoostingClassifier().fit(X_train,y_train)
auroc = roc_auc_score(y_test,clf.predict_proba(X_test)[:,1])
auprc = average_precision_score(y_test,clf.predict_proba(X_test)[:,1])
print("XGBoost AUROC", auroc)
print("XGBoost AUPRC", auprc)
list_dict.append({
    'embed_method': 'COVID-19 alone',
    'embed_dim': embed_dim,
    'pred_model': 'XGBoost',
    'AUROC': auroc,
    'AUPRC': auprc
})

clf=RandomForestClassifier().fit(X_train,y_train)
auroc = roc_auc_score(y_test,clf.predict_proba(X_test)[:,1])
auprc = average_precision_score(y_test,clf.predict_proba(X_test)[:,1])
print("rf AUROC", auroc)
print("rf AUPRC", auprc)
list_dict.append({
    'embed_method': 'COVID-19 alone',
    'embed_dim': embed_dim,
    'pred_model': 'random forest',
    'AUROC': auroc,
    'AUPRC': auprc
})

clf=make_pipeline(StandardScaler(), SVC(gamma='auto',probability=True)).fit(X_train,y_train)
auroc = roc_auc_score(y_test,clf.predict_proba(X_test)[:,1])
auprc = average_precision_score(y_test,clf.predict_proba(X_test)[:,1])
print("svm AUROC", auroc)
print("svm AUPRC", auprc)
list_dict.append({
    'embed_method': 'COVID-19 alone',
    'embed_dim': embed_dim,
    'pred_model': 'support vector machines',
    'AUROC': auroc,
    'AUPRC': auprc
})

Logit AUROC 0.6931524243991911
Logit AUPRC 0.10168666277003269
XGBoost AUROC 0.6519384219849818
XGBoost AUPRC 0.08424654473848554
rf AUROC 0.5822438037906222
rf AUPRC 0.0665767657030504
svm AUROC 0.6586402044239603
svm AUPRC 0.1319144480743561


In [20]:
# code block to track CPU / Memory usage
cpu_sec = time.time() - start_time
start_time = time.time()
end_memory = psutil.virtual_memory().available
mem_use = (start_memory - end_memory) / (1024.0 ** 2)
start_memory = end_memory
print(f'CPU time: {cpu_sec:.2f} sec')
print(f'Memory usage: {mem_use:.2f} MB')

CPU time: 7.05 sec
Memory usage: -10.67 MB


# 2. DRKG embedding alone

## 2.1. ranking model

In [21]:
z_np = pickle.load(open(data_path+'node_feature_'+exp_id+'.pkl','rb'))

In [22]:
seed=70
indices = np.arange(len(drug_labels))
X_train, X_test, y_train, y_test,indices_train,indices_test=train_test_split(z_np[types=='drug'],drug_labels,indices, test_size=0.5,random_state=seed,)

In [23]:
#Variable wrapping for torch.tensor
_X_train, _y_train=Variable(torch.tensor(X_train,dtype=torch.float).to(device)), Variable(torch.tensor(y_train,dtype=torch.float).to(device))
_X_test, _y_test=Variable(torch.tensor(X_test,dtype=torch.float).to(device)), Variable(torch.tensor(y_test,dtype=torch.float).to(device))

In [24]:
embed_dim = 400
clf=Classifier(embedding_dim=embed_dim).to(device)
optimizer=torch.optim.Adam(clf.parameters())
criterion=BPRLoss(num_neg_samples=15)

In [25]:
best_auprc=0
for epoch in range(30):
    clf.train()
    optimizer.zero_grad()
    out = clf(_X_train)
    loss=criterion(out.squeeze(), _y_train)
    loss.backward()
    optimizer.step()   
    print('training loss',loss.item())

    clf.eval()
    print('test loss', criterion(clf(_X_test).squeeze(), _y_test).item())
    prob=torch.sigmoid(clf(_X_test)).cpu().detach().numpy().squeeze()
    auprc=metrics.average_precision_score(y_test,prob)
    if auprc>best_auprc:
        best_auproc=auprc
        torch.save(clf, data_path+'nn_clf_DRKG_embedding.pt')

training loss 0.8182557225227356
test loss 0.5654714703559875
training loss 0.48486700654029846
test loss 0.504222571849823
training loss 0.29547011852264404
test loss 0.48464715480804443
training loss 0.3150358498096466
test loss 0.498331755399704
training loss 0.2940555810928345
test loss 0.497141033411026
training loss 0.28010106086730957
test loss 0.49028658866882324
training loss 0.2777594327926636
test loss 0.4876006245613098
training loss 0.21368248760700226
test loss 0.5224721431732178
training loss 0.28686225414276123
test loss 0.519257664680481
training loss 0.2793280780315399
test loss 0.5361437797546387
training loss 0.21720975637435913
test loss 0.5233345031738281
training loss 0.25004202127456665
test loss 0.5403933525085449
training loss 0.1824733018875122
test loss 0.5294878482818604
training loss 0.19845442473888397
test loss 0.5399629473686218
training loss 0.27991464734077454
test loss 0.5361773371696472
training loss 0.2212509959936142
test loss 0.521087646484375
tr

In [26]:
clf.load_state_dict(torch.load(data_path+'nn_clf_DRKG_embedding.pt').state_dict())

<All keys matched successfully>

In [27]:
#Compute AUC
clf.eval()

prob=torch.sigmoid(clf(_X_test)).cpu().detach().numpy().squeeze()
auroc = metrics.roc_auc_score(y_test,prob)
auprc = metrics.average_precision_score(y_test,prob)
print("AUROC", auroc)
print("AUPRC", auprc)

list_dict.append({
    'embed_method': 'DRKG alone',
    'embed_dim': embed_dim,
    'pred_model': 'neural network ranking',
    'AUROC': auroc,
    'AUPRC': auprc
})

AUROC 0.8657762310116166
AUPRC 0.17444598694313124


In [28]:
# code block to track CPU / Memory usage
cpu_sec = time.time() - start_time
start_time = time.time()
end_memory = psutil.virtual_memory().available
mem_use = (start_memory - end_memory) / (1024.0 ** 2)
start_memory = end_memory
print(f'CPU time: {cpu_sec:.2f} sec')
print(f'Memory usage: {mem_use:.2f} MB')

CPU time: 2.63 sec
Memory usage: 12.00 MB


## 2.2. baseline models

In [29]:
clf=LogisticRegression().fit(X_train,y_train)
auroc = roc_auc_score(y_test,clf.predict_proba(X_test)[:,1])
auprc = average_precision_score(y_test,clf.predict_proba(X_test)[:,1])
print("Logit AUROC", auroc)
print("Logit AUPRC", auprc)
list_dict.append({
    'embed_method': 'DRKG alone',
    'embed_dim': embed_dim,
    'pred_model': 'logistic regression',
    'AUROC': auroc,
    'AUPRC': auprc
})

clf=GradientBoostingClassifier().fit(X_train,y_train)
auroc = roc_auc_score(y_test,clf.predict_proba(X_test)[:,1])
auprc = average_precision_score(y_test,clf.predict_proba(X_test)[:,1])
print("XGBoost AUROC", auroc)
print("XGBoost AUPRC", auprc)
list_dict.append({
    'embed_method': 'DRKG alone',
    'embed_dim': embed_dim,
    'pred_model': 'XGBoost',
    'AUROC': auroc,
    'AUPRC': auprc
})

clf=RandomForestClassifier().fit(X_train,y_train)
auroc = roc_auc_score(y_test,clf.predict_proba(X_test)[:,1])
auprc = average_precision_score(y_test,clf.predict_proba(X_test)[:,1])
print("rf AUROC", auroc)
print("rf AUPRC", auprc)
list_dict.append({
    'embed_method': 'DRKG alone',
    'embed_dim': embed_dim,
    'pred_model': 'random forest',
    'AUROC': auroc,
    'AUPRC': auprc
})

clf=make_pipeline(StandardScaler(), SVC(gamma='auto',probability=True)).fit(X_train,y_train)
auroc = roc_auc_score(y_test,clf.predict_proba(X_test)[:,1])
auprc = average_precision_score(y_test,clf.predict_proba(X_test)[:,1])
print("svm AUROC", auroc)
print("svm AUPRC", auprc)
list_dict.append({
    'embed_method': 'DRKG alone',
    'embed_dim': embed_dim,
    'pred_model': 'support vector machines',
    'AUROC': auroc,
    'AUPRC': auprc
})

Logit AUROC 0.8436015613977331
Logit AUPRC 0.18394788718419244
XGBoost AUROC 0.8578908589255201
XGBoost AUPRC 0.14256704270151688
rf AUROC 0.8432174826380725
rf AUPRC 0.13306874486326117
svm AUROC 0.8842276881594006
svm AUPRC 0.2578582442037748


In [30]:
# code block to track CPU / Memory usage
cpu_sec = time.time() - start_time
start_time = time.time()
end_memory = psutil.virtual_memory().available
mem_use = (start_memory - end_memory) / (1024.0 ** 2)
start_memory = end_memory
print(f'CPU time: {cpu_sec:.2f} sec')
print(f'Memory usage: {mem_use:.2f} MB')

CPU time: 5.87 sec
Memory usage: -0.17 MB


# 3. hybrid embedding

## 3.1. embedding dimension: 128 (default)

### 3.1.1. ranking model

In [31]:
embed_dim = 128
z_np = pickle.load(open(data_path+f'hybrid_embedding_{embed_dim}_'+exp_id+'.pkl','rb'))

In [32]:
seed=70
indices = np.arange(len(drug_labels))
X_train, X_test, y_train, y_test,indices_train,indices_test=train_test_split(z_np[types=='drug'],drug_labels,indices, test_size=0.5,random_state=seed,)

In [33]:
#Variable wrapping for torch.tensor
_X_train, _y_train=Variable(torch.tensor(X_train,dtype=torch.float).to(device)), Variable(torch.tensor(y_train,dtype=torch.float).to(device))
_X_test, _y_test=Variable(torch.tensor(X_test,dtype=torch.float).to(device)), Variable(torch.tensor(y_test,dtype=torch.float).to(device))

In [34]:
clf=Classifier(embedding_dim=embed_dim).to(device)
optimizer=torch.optim.Adam(clf.parameters())
criterion=BPRLoss(num_neg_samples=15)

In [35]:
best_auprc=0
for epoch in range(30):
    clf.train()
    optimizer.zero_grad()
    out = clf(_X_train)
    loss=criterion(out.squeeze(), _y_train)
    loss.backward()
    optimizer.step()   
    print('training loss',loss.item())

    clf.eval()
    print('test loss', criterion(clf(_X_test).squeeze(), _y_test).item())
    prob=torch.sigmoid(clf(_X_test)).cpu().detach().numpy().squeeze()
    auprc=metrics.average_precision_score(y_test,prob)
    if auprc>best_auprc:
        best_auproc=auprc
        torch.save(clf, data_path+f'nn_clf_hybrid_embedding_{embed_dim}.pt')

training loss 1.0323024988174438
test loss 0.7024515271186829
training loss 0.8107777833938599
test loss 0.6513453125953674
training loss 0.6297714114189148
test loss 0.6260812878608704
training loss 0.5238948464393616
test loss 0.6047687530517578
training loss 0.4022414982318878
test loss 0.5926344990730286
training loss 0.39312437176704407
test loss 0.6003910899162292
training loss 0.3976527750492096
test loss 0.6000576615333557
training loss 0.32615718245506287
test loss 0.6163175106048584
training loss 0.39616668224334717
test loss 0.6290398240089417
training loss 0.3855721652507782
test loss 0.6002631187438965
training loss 0.3514969050884247
test loss 0.6139815449714661
training loss 0.38395676016807556
test loss 0.6186772584915161
training loss 0.4060250222682953
test loss 0.6627027988433838
training loss 0.3827492892742157
test loss 0.6590971350669861
training loss 0.38731563091278076
test loss 0.6719714999198914
training loss 0.4262847304344177
test loss 0.6633058786392212
tra

In [36]:
clf.load_state_dict(torch.load(data_path+f'nn_clf_hybrid_embedding_{embed_dim}.pt').state_dict())

<All keys matched successfully>

In [37]:
#Compute AUC
clf.eval()

prob=torch.sigmoid(clf(_X_test)).cpu().detach().numpy().squeeze()
auroc = metrics.roc_auc_score(y_test,prob)
auprc = metrics.average_precision_score(y_test,prob)
print("AUROC", auroc)
print("AUPRC", auprc)

list_dict.append({
    'embed_method': 'hybrid',
    'embed_dim': embed_dim,
    'pred_model': 'neural network ranking',
    'AUROC': auroc,
    'AUPRC': auprc
})

AUROC 0.8915291037639719
AUPRC 0.24000344741010085


In [38]:
# code block to track CPU / Memory usage
cpu_sec = time.time() - start_time
start_time = time.time()
end_memory = psutil.virtual_memory().available
mem_use = (start_memory - end_memory) / (1024.0 ** 2)
start_memory = end_memory
print(f'CPU time: {cpu_sec:.2f} sec')
print(f'Memory usage: {mem_use:.2f} MB')

CPU time: 0.84 sec
Memory usage: 3.10 MB


### 3.1.2. baseline models

In [39]:
clf=LogisticRegression().fit(X_train,y_train)
auroc = roc_auc_score(y_test,clf.predict_proba(X_test)[:,1])
auprc = average_precision_score(y_test,clf.predict_proba(X_test)[:,1])
print("Logit AUROC", auroc)
print("Logit AUPRC", auprc)
list_dict.append({
    'embed_method': 'hybrid',
    'embed_dim': embed_dim,
    'pred_model': 'logistic regression',
    'AUROC': auroc,
    'AUPRC': auprc
})

clf=GradientBoostingClassifier().fit(X_train,y_train)
auroc = roc_auc_score(y_test,clf.predict_proba(X_test)[:,1])
auprc = average_precision_score(y_test,clf.predict_proba(X_test)[:,1])
print("XGBoost AUROC", auroc)
print("XGBoost AUPRC", auprc)
list_dict.append({
    'embed_method': 'hybrid',
    'embed_dim': embed_dim,
    'pred_model': 'XGBoost',
    'AUROC': auroc,
    'AUPRC': auprc
})

clf=RandomForestClassifier().fit(X_train,y_train)
auroc = roc_auc_score(y_test,clf.predict_proba(X_test)[:,1])
auprc = average_precision_score(y_test,clf.predict_proba(X_test)[:,1])
print("rf AUROC", auroc)
print("rf AUPRC", auprc)
list_dict.append({
    'embed_method': 'hybrid',
    'embed_dim': embed_dim,
    'pred_model': 'random forest',
    'AUROC': auroc,
    'AUPRC': auprc
})

clf=make_pipeline(StandardScaler(), SVC(gamma='auto',probability=True)).fit(X_train,y_train)
auroc = roc_auc_score(y_test,clf.predict_proba(X_test)[:,1])
auprc = average_precision_score(y_test,clf.predict_proba(X_test)[:,1])
print("svm AUROC", auroc)
print("svm AUPRC", auprc)
list_dict.append({
    'embed_method': 'hybrid',
    'embed_dim': embed_dim,
    'pred_model': 'support vector machines',
    'AUROC': auroc,
    'AUPRC': auprc
})

Logit AUROC 0.9004412986565081
Logit AUPRC 0.2655952893593598
XGBoost AUROC 0.8746217999968646
XGBoost AUPRC 0.20499697201646291
rf AUROC 0.8697502704228002
rf AUPRC 0.21197584325983626
svm AUROC 0.6670233112292088
svm AUPRC 0.2636707578465237


In [40]:
# code block to track CPU / Memory usage
cpu_sec = time.time() - start_time
start_time = time.time()
end_memory = psutil.virtual_memory().available
mem_use = (start_memory - end_memory) / (1024.0 ** 2)
start_memory = end_memory
print(f'CPU time: {cpu_sec:.2f} sec')
print(f'Memory usage: {mem_use:.2f} MB')

CPU time: 3.84 sec
Memory usage: -1.46 MB


## 3.2. embedding dimension: 64

### 3.2.1. ranking model

In [41]:
embed_dim = 64
z_np = pickle.load(open(data_path+f'hybrid_embedding_{embed_dim}_'+exp_id+'.pkl','rb'))

In [42]:
z_np.shape

(15444, 64)

In [43]:
seed=70
indices = np.arange(len(drug_labels))
X_train, X_test, y_train, y_test,indices_train,indices_test=train_test_split(z_np[types=='drug'],drug_labels,indices, test_size=0.5,random_state=seed,)

In [44]:
#Variable wrapping for torch.tensor
_X_train, _y_train=Variable(torch.tensor(X_train,dtype=torch.float).to(device)), Variable(torch.tensor(y_train,dtype=torch.float).to(device))
_X_test, _y_test=Variable(torch.tensor(X_test,dtype=torch.float).to(device)), Variable(torch.tensor(y_test,dtype=torch.float).to(device))

In [45]:
clf=Classifier(embedding_dim=embed_dim).to(device)
optimizer=torch.optim.Adam(clf.parameters())
criterion=BPRLoss(num_neg_samples=15)

In [46]:
best_auprc=0
for epoch in range(30):
    clf.train()
    optimizer.zero_grad()
    out = clf(_X_train)
    loss=criterion(out.squeeze(), _y_train)
    loss.backward()
    optimizer.step()   
    print('training loss',loss.item())

    clf.eval()
    print('test loss', criterion(clf(_X_test).squeeze(), _y_test).item())
    prob=torch.sigmoid(clf(_X_test)).cpu().detach().numpy().squeeze()
    auprc=metrics.average_precision_score(y_test,prob)
    if auprc>best_auprc:
        best_auproc=auprc
        torch.save(clf, data_path+f'nn_clf_hybrid_embedding_{embed_dim}.pt')

training loss 0.8822551369667053
test loss 0.706145703792572
training loss 0.8575397729873657
test loss 0.7001444697380066
training loss 0.7320197224617004
test loss 0.6952470541000366
training loss 0.7483329176902771
test loss 0.6929148435592651
training loss 0.8247588276863098
test loss 0.6889188289642334
training loss 0.7423900961875916
test loss 0.6838169693946838
training loss 0.6707960963249207
test loss 0.6793045401573181
training loss 0.7847961187362671
test loss 0.6754078269004822
training loss 0.7033270001411438
test loss 0.6717008948326111
training loss 0.7160780429840088
test loss 0.6680765748023987
training loss 0.7272428870201111
test loss 0.6631828546524048
training loss 0.67169189453125
test loss 0.661223292350769
training loss 0.6506121754646301
test loss 0.660032331943512
training loss 0.6574820280075073
test loss 0.6522682309150696
training loss 0.5831413269042969
test loss 0.649164080619812
training loss 0.5174868106842041
test loss 0.6433387398719788
training loss 

In [47]:
clf.load_state_dict(torch.load(data_path+f'nn_clf_hybrid_embedding_{embed_dim}.pt').state_dict())

<All keys matched successfully>

In [48]:
#Compute AUC
clf.eval()

prob=torch.sigmoid(clf(_X_test)).cpu().detach().numpy().squeeze()
auroc = metrics.roc_auc_score(y_test,prob)
auprc = metrics.average_precision_score(y_test,prob)
print("AUROC", auroc)
print("AUPRC", auprc)

list_dict.append({
    'embed_method': 'hybrid',
    'embed_dim': embed_dim,
    'pred_model': 'neural network ranking',
    'AUROC': auroc,
    'AUPRC': auprc
})

AUROC 0.8840121337534685
AUPRC 0.1732987509968906


In [49]:
# code block to track CPU / Memory usage
cpu_sec = time.time() - start_time
start_time = time.time()
end_memory = psutil.virtual_memory().available
mem_use = (start_memory - end_memory) / (1024.0 ** 2)
start_memory = end_memory
print(f'CPU time: {cpu_sec:.2f} sec')
print(f'Memory usage: {mem_use:.2f} MB')

CPU time: 0.62 sec
Memory usage: -1.12 MB


### 3.2.2. baseline models

In [50]:
clf=LogisticRegression().fit(X_train,y_train)
auroc = roc_auc_score(y_test,clf.predict_proba(X_test)[:,1])
auprc = average_precision_score(y_test,clf.predict_proba(X_test)[:,1])
print("Logit AUROC", auroc)
print("Logit AUPRC", auprc)
list_dict.append({
    'embed_method': 'hybrid',
    'embed_dim': embed_dim,
    'pred_model': 'logistic regression',
    'AUROC': auroc,
    'AUPRC': auprc
})

clf=GradientBoostingClassifier().fit(X_train,y_train)
auroc = roc_auc_score(y_test,clf.predict_proba(X_test)[:,1])
auprc = average_precision_score(y_test,clf.predict_proba(X_test)[:,1])
print("XGBoost AUROC", auroc)
print("XGBoost AUPRC", auprc)
list_dict.append({
    'embed_method': 'hybrid',
    'embed_dim': embed_dim,
    'pred_model': 'XGBoost',
    'AUROC': auroc,
    'AUPRC': auprc
})

clf=RandomForestClassifier().fit(X_train,y_train)
auroc = roc_auc_score(y_test,clf.predict_proba(X_test)[:,1])
auprc = average_precision_score(y_test,clf.predict_proba(X_test)[:,1])
print("rf AUROC", auroc)
print("rf AUPRC", auprc)
list_dict.append({
    'embed_method': 'hybrid',
    'embed_dim': embed_dim,
    'pred_model': 'random forest',
    'AUROC': auroc,
    'AUPRC': auprc
})

clf=make_pipeline(StandardScaler(), SVC(gamma='auto',probability=True)).fit(X_train,y_train)
auroc = roc_auc_score(y_test,clf.predict_proba(X_test)[:,1])
auprc = average_precision_score(y_test,clf.predict_proba(X_test)[:,1])
print("svm AUROC", auroc)
print("svm AUPRC", auprc)
list_dict.append({
    'embed_method': 'hybrid',
    'embed_dim': embed_dim,
    'pred_model': 'support vector machines',
    'AUROC': auroc,
    'AUPRC': auprc
})

Logit AUROC 0.8848194829829595
Logit AUPRC 0.2584255158135413
XGBoost AUROC 0.86647776262365
XGBoost AUPRC 0.2136404370421257
rf AUROC 0.8607087428867046
rf AUPRC 0.18629615451725506
svm AUROC 0.7374821677718729
svm AUPRC 0.22620910728141796


In [51]:
# code block to track CPU / Memory usage
cpu_sec = time.time() - start_time
start_time = time.time()
end_memory = psutil.virtual_memory().available
mem_use = (start_memory - end_memory) / (1024.0 ** 2)
start_memory = end_memory
print(f'CPU time: {cpu_sec:.2f} sec')
print(f'Memory usage: {mem_use:.2f} MB')

CPU time: 2.41 sec
Memory usage: -0.53 MB


## 3.3. embedding dimension: 256

### 3.3.1. ranking model

In [52]:
embed_dim = 256
z_np = pickle.load(open(data_path+f'hybrid_embedding_{embed_dim}_'+exp_id+'.pkl','rb'))
z_np.shape

(15444, 256)

In [53]:
seed=70
indices = np.arange(len(drug_labels))
X_train, X_test, y_train, y_test,indices_train,indices_test=train_test_split(z_np[types=='drug'],drug_labels,indices, test_size=0.5,random_state=seed,)

In [54]:
#Variable wrapping for torch.tensor
_X_train, _y_train=Variable(torch.tensor(X_train,dtype=torch.float).to(device)), Variable(torch.tensor(y_train,dtype=torch.float).to(device))
_X_test, _y_test=Variable(torch.tensor(X_test,dtype=torch.float).to(device)), Variable(torch.tensor(y_test,dtype=torch.float).to(device))

In [55]:
clf=Classifier(embedding_dim=embed_dim).to(device)
optimizer=torch.optim.Adam(clf.parameters())
criterion=BPRLoss(num_neg_samples=15)

best_auprc=0
for epoch in range(30):
    clf.train()
    optimizer.zero_grad()
    out = clf(_X_train)
    loss=criterion(out.squeeze(), _y_train)
    loss.backward()
    optimizer.step()   
    print('training loss',loss.item())

    clf.eval()
    print('test loss', criterion(clf(_X_test).squeeze(), _y_test).item())
    prob=torch.sigmoid(clf(_X_test)).cpu().detach().numpy().squeeze()
    auprc=metrics.average_precision_score(y_test,prob)
    if auprc>best_auprc:
        best_auproc=auprc
        torch.save(clf, data_path+f'nn_clf_hybrid_embedding_{embed_dim}.pt')

training loss 1.1971021890640259
test loss 0.6194038391113281
training loss 0.6007117629051208
test loss 0.5793237686157227
training loss 0.4675365686416626
test loss 0.6229389309883118
training loss 0.37748339772224426
test loss 0.6490309238433838
training loss 0.43516141176223755
test loss 0.6979953050613403
training loss 0.5512062907218933
test loss 0.6725337505340576
training loss 0.48158687353134155
test loss 0.7063509225845337
training loss 0.5235801339149475
test loss 0.7712727189064026
training loss 0.4072955250740051
test loss 0.7376261353492737
training loss 0.6094513535499573
test loss 0.8028955459594727
training loss 0.5353372097015381
test loss 0.789703905582428
training loss 0.45833268761634827
test loss 0.8465150594711304
training loss 0.4590623378753662
test loss 0.8092597126960754
training loss 0.5528014898300171
test loss 0.8178311586380005
training loss 0.49342602491378784
test loss 0.8162261247634888
training loss 0.49464279413223267
test loss 0.8329033255577087
tra

In [56]:
clf.load_state_dict(torch.load(data_path+f'nn_clf_hybrid_embedding_{embed_dim}.pt').state_dict())

#Compute AUC
clf.eval()

prob=torch.sigmoid(clf(_X_test)).cpu().detach().numpy().squeeze()
auroc = metrics.roc_auc_score(y_test,prob)
auprc = metrics.average_precision_score(y_test,prob)
print("AUROC", auroc)
print("AUPRC", auprc)

list_dict.append({
    'embed_method': 'hybrid',
    'embed_dim': embed_dim,
    'pred_model': 'neural network ranking',
    'AUROC': auroc,
    'AUPRC': auprc
})

AUROC 0.8934338208782079
AUPRC 0.24169934575766414


In [57]:
# code block to track CPU / Memory usage
cpu_sec = time.time() - start_time
start_time = time.time()
end_memory = psutil.virtual_memory().available
mem_use = (start_memory - end_memory) / (1024.0 ** 2)
start_memory = end_memory
print(f'CPU time: {cpu_sec:.2f} sec')
print(f'Memory usage: {mem_use:.2f} MB')

CPU time: 1.44 sec
Memory usage: 2.74 MB


### 3.3.2. baseline models

In [58]:
clf=LogisticRegression().fit(X_train,y_train)
auroc = roc_auc_score(y_test,clf.predict_proba(X_test)[:,1])
auprc = average_precision_score(y_test,clf.predict_proba(X_test)[:,1])
print("Logit AUROC", auroc)
print("Logit AUPRC", auprc)
list_dict.append({
    'embed_method': 'hybrid',
    'embed_dim': embed_dim,
    'pred_model': 'logistic regression',
    'AUROC': auroc,
    'AUPRC': auprc
})

clf=GradientBoostingClassifier().fit(X_train,y_train)
auroc = roc_auc_score(y_test,clf.predict_proba(X_test)[:,1])
auprc = average_precision_score(y_test,clf.predict_proba(X_test)[:,1])
print("XGBoost AUROC", auroc)
print("XGBoost AUPRC", auprc)
list_dict.append({
    'embed_method': 'hybrid',
    'embed_dim': embed_dim,
    'pred_model': 'XGBoost',
    'AUROC': auroc,
    'AUPRC': auprc
})

clf=RandomForestClassifier().fit(X_train,y_train)
auroc = roc_auc_score(y_test,clf.predict_proba(X_test)[:,1])
auprc = average_precision_score(y_test,clf.predict_proba(X_test)[:,1])
print("rf AUROC", auroc)
print("rf AUPRC", auprc)
list_dict.append({
    'embed_method': 'hybrid',
    'embed_dim': embed_dim,
    'pred_model': 'random forest',
    'AUROC': auroc,
    'AUPRC': auprc
})

clf=make_pipeline(StandardScaler(), SVC(gamma='auto',probability=True)).fit(X_train,y_train)
auroc = roc_auc_score(y_test,clf.predict_proba(X_test)[:,1])
auprc = average_precision_score(y_test,clf.predict_proba(X_test)[:,1])
print("svm AUROC", auroc)
print("svm AUPRC", auprc)
list_dict.append({
    'embed_method': 'hybrid',
    'embed_dim': embed_dim,
    'pred_model': 'support vector machines',
    'AUROC': auroc,
    'AUPRC': auprc
})

Logit AUROC 0.9112111806110771
Logit AUPRC 0.3330538576819279


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


XGBoost AUROC 0.8722781357287308
XGBoost AUPRC 0.1619359600285969
rf AUROC 0.8797206414899121
rf AUPRC 0.22246521356421856
svm AUROC 0.6692572387088682
svm AUPRC 0.25591625669053947


In [59]:
# code block to track CPU / Memory usage
cpu_sec = time.time() - start_time
start_time = time.time()
end_memory = psutil.virtual_memory().available
mem_use = (start_memory - end_memory) / (1024.0 ** 2)
start_memory = end_memory
print(f'CPU time: {cpu_sec:.2f} sec')
print(f'Memory usage: {mem_use:.2f} MB')

CPU time: 7.18 sec
Memory usage: 6.47 MB


## 3.4. without bait-prey edges

### 3.4.1. ranking model

In [60]:
embed_dim = 128
z_np = pickle.load(open(data_path+f'hybrid_embedding_{embed_dim}_no_bp_'+exp_id+'.pkl','rb'))
z_np.shape

(15444, 128)

In [61]:
seed=70
indices = np.arange(len(drug_labels))
X_train, X_test, y_train, y_test,indices_train,indices_test=train_test_split(z_np[types=='drug'],drug_labels,indices, test_size=0.5,random_state=seed,)

#Variable wrapping for torch.tensor
_X_train, _y_train=Variable(torch.tensor(X_train,dtype=torch.float).to(device)), Variable(torch.tensor(y_train,dtype=torch.float).to(device))
_X_test, _y_test=Variable(torch.tensor(X_test,dtype=torch.float).to(device)), Variable(torch.tensor(y_test,dtype=torch.float).to(device))

In [62]:
clf=Classifier(embedding_dim=embed_dim).to(device)
optimizer=torch.optim.Adam(clf.parameters())
criterion=BPRLoss(num_neg_samples=15)

best_auprc=0
for epoch in range(30):
    clf.train()
    optimizer.zero_grad()
    out = clf(_X_train)
    loss=criterion(out.squeeze(), _y_train)
    loss.backward()
    optimizer.step()   
    print('training loss',loss.item())

    clf.eval()
    print('test loss', criterion(clf(_X_test).squeeze(), _y_test).item())
    prob=torch.sigmoid(clf(_X_test)).cpu().detach().numpy().squeeze()
    auprc=metrics.average_precision_score(y_test,prob)
    if auprc>best_auprc:
        best_auproc=auprc
        torch.save(clf, data_path+f'nn_clf_hybrid_embedding_{embed_dim}_no_bp_.pt')

training loss 0.91648268699646
test loss 0.7551467418670654
training loss 0.7774485349655151
test loss 0.728404700756073
training loss 0.7347243428230286
test loss 0.7092740535736084
training loss 0.7062397003173828
test loss 0.6878621578216553
training loss 0.6790356040000916
test loss 0.6643539071083069
training loss 0.6182913184165955
test loss 0.6468978524208069
training loss 0.5792075991630554
test loss 0.631205677986145
training loss 0.5830502510070801
test loss 0.6173486709594727
training loss 0.5432453155517578
test loss 0.6000494360923767
training loss 0.458672434091568
test loss 0.5892173051834106
training loss 0.5596255660057068
test loss 0.5636516809463501
training loss 0.45072367787361145
test loss 0.5616510510444641
training loss 0.5626056790351868
test loss 0.5386508703231812
training loss 0.5114240646362305
test loss 0.536547064781189
training loss 0.45689332485198975
test loss 0.5256733298301697
training loss 0.47503483295440674
test loss 0.5071656107902527
training lo

In [63]:
clf.load_state_dict(torch.load(data_path+f'nn_clf_hybrid_embedding_{embed_dim}.pt').state_dict())

#Compute AUC
clf.eval()

prob=torch.sigmoid(clf(_X_test)).cpu().detach().numpy().squeeze()
auroc = metrics.roc_auc_score(y_test,prob)
auprc = metrics.average_precision_score(y_test,prob)
print("AUROC", auroc)
print("AUPRC", auprc)

list_dict.append({
    'embed_method': 'hybrid without bait-prey',
    'embed_dim': embed_dim,
    'pred_model': 'neural network ranking',
    'AUROC': auroc,
    'AUPRC': auprc
})

AUROC 0.7666721535060904
AUPRC 0.08683337002903749


In [64]:
# code block to track CPU / Memory usage
cpu_sec = time.time() - start_time
start_time = time.time()
end_memory = psutil.virtual_memory().available
mem_use = (start_memory - end_memory) / (1024.0 ** 2)
start_memory = end_memory
print(f'CPU time: {cpu_sec:.2f} sec')
print(f'Memory usage: {mem_use:.2f} MB')

CPU time: 0.80 sec
Memory usage: 1.33 MB


### 3.4.2. baseline models

In [65]:
clf=LogisticRegression().fit(X_train,y_train)
auroc = roc_auc_score(y_test,clf.predict_proba(X_test)[:,1])
auprc = average_precision_score(y_test,clf.predict_proba(X_test)[:,1])
print("Logit AUROC", auroc)
print("Logit AUPRC", auprc)
list_dict.append({
    'embed_method': 'hybrid without bait-prey',
    'embed_dim': embed_dim,
    'pred_model': 'logistic regression',
    'AUROC': auroc,
    'AUPRC': auprc
})

clf=GradientBoostingClassifier().fit(X_train,y_train)
auroc = roc_auc_score(y_test,clf.predict_proba(X_test)[:,1])
auprc = average_precision_score(y_test,clf.predict_proba(X_test)[:,1])
print("XGBoost AUROC", auroc)
print("XGBoost AUPRC", auprc)
list_dict.append({
    'embed_method': 'hybrid without bait-prey',
    'embed_dim': embed_dim,
    'pred_model': 'XGBoost',
    'AUROC': auroc,
    'AUPRC': auprc
})

clf=RandomForestClassifier().fit(X_train,y_train)
auroc = roc_auc_score(y_test,clf.predict_proba(X_test)[:,1])
auprc = average_precision_score(y_test,clf.predict_proba(X_test)[:,1])
print("rf AUROC", auroc)
print("rf AUPRC", auprc)
list_dict.append({
    'embed_method': 'hybrid without bait-prey',
    'embed_dim': embed_dim,
    'pred_model': 'random forest',
    'AUROC': auroc,
    'AUPRC': auprc
})

clf=make_pipeline(StandardScaler(), SVC(gamma='auto',probability=True)).fit(X_train,y_train)
auroc = roc_auc_score(y_test,clf.predict_proba(X_test)[:,1])
auprc = average_precision_score(y_test,clf.predict_proba(X_test)[:,1])
print("svm AUROC", auroc)
print("svm AUPRC", auprc)
list_dict.append({
    'embed_method': 'hybrid without bait-prey',
    'embed_dim': embed_dim,
    'pred_model': 'support vector machines',
    'AUROC': auroc,
    'AUPRC': auprc
})

Logit AUROC 0.8839415886751634
Logit AUPRC 0.22722804899906635
XGBoost AUROC 0.8784939409616078
XGBoost AUPRC 0.17386730653022386
rf AUROC 0.8432841085453605
rf AUPRC 0.1272555533653837
svm AUROC 0.7566625907288091
svm AUPRC 0.17287456132624332


In [66]:
# code block to track CPU / Memory usage
cpu_sec = time.time() - start_time
start_time = time.time()
end_memory = psutil.virtual_memory().available
mem_use = (start_memory - end_memory) / (1024.0 ** 2)
start_memory = end_memory
print(f'CPU time: {cpu_sec:.2f} sec')
print(f'Memory usage: {mem_use:.2f} MB')

CPU time: 4.23 sec
Memory usage: 5.62 MB


# 4. save all AUROC and AUPRC results for plotting

In [67]:
import pandas as pd

df_sum = pd.DataFrame(list_dict)
df_sum.to_csv(f'df_sum_{exp_id}.csv', index=False)