In [None]:
# Collects one result row (embedding, model, AUROC, AUPRC) per experiment;
# the rows are turned into a summary table at the end of the notebook.
list_dict = list()

In [None]:
# code block to track CPU time and Memory usage
import psutil
import time

# Baselines for the usage checkpoints further down: available system memory
# in bytes and the wall-clock start time in seconds.
start_memory = psutil.virtual_memory().available
start_time = time.time()

# 1. COVID-19 graph embedding alone

## 1.1. ranking model

In [None]:
import numpy as np
import pandas as pd
import time
import re
import math
import random
import pickle

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics
from sklearn.metrics import roc_auc_score, roc_curve, average_precision_score

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.nn.modules import Module
from torch.utils.data import Dataset, DataLoader

In [None]:
# Experiment configuration
data_path = 'data/'    # directory with the pickled inputs and saved checkpoints
exp_id = 'v0'          # suffix shared by this experiment's artifact files
device_id = 'cpu'      # 'cpu' if CPU, device number if GPU

In [None]:
# Torch device that the models and tensors below are placed on.
device = torch.device(device_id)

In [None]:
# Load the label encoder (its classes_ are the node names) and the edge index.
# The context managers close the file handles, which the bare open() calls leaked.
# NOTE: pickle can execute arbitrary code on load; only use trusted artifacts.
with open(data_path+'LabelEncoder_'+exp_id+'.pkl', 'rb') as f:
    le=pickle.load(f)
with open(data_path+'edge_index_'+exp_id+'.pkl','rb') as f:
    edge_index=pickle.load(f)

In [None]:
# Node type of every encoded class: the text before the first '_' in its name.
node_prefixes = [name.split('_')[0] for name in le.classes_]
types = np.array(node_prefixes)

In [None]:
# Labels: a drug node is positive (1) when its name appears among the
# interventions listed in the trial spreadsheet, otherwise negative (0).
trials=pd.read_excel(data_path+'literature-mining/All_trails_5_24.xlsx',header=1,index_col=0)
# Keep rows whose study_category mentions 'drug'; rewrite the textual separators in
# `intervention` (' vs ', ' or ', ' and ', ' - ', parentheses, ...) to '/', split on
# '+', '|', '/' or ',' (inside [...] the '|' is a literal character), then strip and
# upper-case every piece into a set.
# NOTE(review): .lower() / .replace() assume both columns hold strings with no
# missing values -- confirm against the spreadsheet.
trials_drug=set([drug.strip().upper() for lst in trials.loc[trials['study_category'].apply(lambda x: 'drug' in x.lower()),'intervention'].apply(lambda x: re.split(r'[+|/|,]',x.replace(' vs. ', '/').replace(' vs ', '/').replace(' or ', '/').replace(' with and without ', '/').replace(' /wo ', '/').replace(' /w ', '/').replace(' and ', '/').replace(' - ', '/').replace(' (', '/').replace(') ', '/'))).values for drug in lst])
# Only the second '_'-separated token of each drug node name is matched.
# NOTE(review): presumably drug names contain no further '_' and are already
# upper-case, otherwise matches are missed -- verify against le.classes_.
drug_labels=[1 if drug.split('_')[1] in trials_drug else 0 for drug in le.classes_[types=='drug'] ]

In [None]:
class Classifier(nn.Module):
    """Residual one-hidden-layer scorer that maps an embedding to a single logit."""

    def __init__(self,embedding_dim):
        super().__init__()
        self.fc1=nn.Linear(embedding_dim,embedding_dim)
        self.fc2=nn.Linear(embedding_dim,1)
        self.bn=nn.BatchNorm1d(embedding_dim)

    def forward(self, x):
        """Return one logit per row of ``x`` (shape ``(batch, 1)``).

        Dropout is applied to the input and to the hidden activation (only in
        training mode); the batch-normalised hidden layer is added to the
        untouched input before the output layer.
        """
        skip = x
        dropped_in = F.dropout(x, training=self.training)
        hidden = F.relu(self.fc1(dropped_in))
        hidden = F.dropout(hidden, training=self.training)
        normed = self.bn(hidden)
        return self.fc2(normed + skip)

In [None]:
from torch.utils.data import BatchSampler, WeightedRandomSampler
class BPRLoss(nn.Module):
    """Bayesian personalised ranking loss with score-weighted negative sampling.

    For every positive example, ``num_neg_samples`` negatives are drawn with
    replacement, with probability proportional to how far each negative's score
    lies above the lowest negative score. The loss is the mean of
    ``-log(sigmoid(positive_score - negative_score))`` over all sampled pairs.
    """
    def __init__(self, num_neg_samples):
        super(BPRLoss, self).__init__()
        self.num_neg_samples=num_neg_samples

    def forward(self, output, label):
        """Return the scalar BPR loss.

        Args:
            output: 1-D tensor of scores, one per example.
            label: tensor of the same length; entries equal to 1 are positives,
                every other entry is treated as a negative.

        Raises:
            ValueError: if the batch has no positive or no negative example.
        """
        positive_output=output[label==1]
        negative_output=output[label!=1]
        if positive_output.numel()==0 or negative_output.numel()==0:
            raise ValueError('BPRLoss needs at least one positive and one negative example')

        # Sampling weights: shift so the lowest-scored negative gets weight 0.
        # Detached because sampling is not differentiable; tensor.min() replaces the
        # Python min() that iterated the tensor element by element. Double precision
        # matches what WeightedRandomSampler used internally.
        scores=negative_output.detach()
        weights=(scores-scores.min()).double()
        if not bool(weights.sum()>0):
            # All negatives share one score, so the shifted weights are all zero and
            # multinomial would raise; sample uniformly instead.
            weights=torch.ones_like(weights)

        num_pos=positive_output.numel()
        sampled=torch.multinomial(weights, self.num_neg_samples*num_pos, replacement=True)
        # (num_neg_samples, num_pos) -> (num_pos, num_neg_samples): one row per positive
        negative_sample_output=negative_output[sampled.view(self.num_neg_samples, num_pos).t()]
        # logsigmoid is the numerically stable form of sigmoid().log()
        return -F.logsigmoid(positive_output.view(-1,1)-negative_sample_output).mean()

In [None]:
# Load the COVID-19 graph embedding; the context manager closes the file handle
# that the bare open() call left open.
with open(data_path+'COVID_embedding_'+exp_id+'.pkl', 'rb') as f:
    z_np = pickle.load(f)

In [None]:
# Hold out half of the drug nodes for testing; `indices` records which drug
# ends up in which half.
seed = 70
indices = np.arange(len(drug_labels))
(X_train, X_test,
 y_train, y_test,
 indices_train, indices_test) = train_test_split(
    z_np[types == 'drug'],
    drug_labels,
    indices,
    test_size=0.5,
    random_state=seed,
)

In [None]:
# Move the split onto the target device as float tensors.
# torch.autograd.Variable is a deprecated no-op wrapper, so plain tensors are used.
_X_train = torch.tensor(X_train, dtype=torch.float, device=device)
_y_train = torch.tensor(y_train, dtype=torch.float, device=device)
_X_test = torch.tensor(X_test, dtype=torch.float, device=device)
_y_test = torch.tensor(y_test, dtype=torch.float, device=device)

In [None]:
# Ranking model, optimiser and loss for the 128-d COVID-19 graph embedding.
embed_dim = 128
# Seed torch so weight initialisation, dropout and negative sampling repeat across
# runs; previously only the train/test split was seeded.
torch.manual_seed(seed)
clf=Classifier(embedding_dim=embed_dim).to(device)
optimizer=torch.optim.Adam(clf.parameters())
criterion=BPRLoss(num_neg_samples=15)

In [None]:
# Train full-batch for 30 epochs and checkpoint the epoch with the best AUPRC.
# NOTE(review): the checkpoint is selected on the same test split that is reported
# afterwards, so the reported scores are optimistic; a validation split would fix that.
best_auprc=0
for epoch in range(30):
    clf.train()
    optimizer.zero_grad()
    out = clf(_X_train)
    loss=criterion(out.squeeze(), _y_train)
    loss.backward()
    optimizer.step()
    print('training loss',loss.item())

    clf.eval()
    # Evaluation needs no gradients; one forward pass serves both the test loss
    # and the probabilities.
    with torch.no_grad():
        test_out=clf(_X_test)
        print('test loss', criterion(test_out.squeeze(), _y_test).item())
        prob=torch.sigmoid(test_out).cpu().numpy().squeeze()
    auprc=metrics.average_precision_score(y_test,prob)
    if auprc>best_auprc:
        # Fix: this used to assign to a misspelled `best_auproc`, so the threshold
        # stayed at 0 and any epoch with a non-zero AUPRC overwrote the checkpoint.
        best_auprc=auprc
        torch.save(clf, data_path+'nn_clf_covid_embedding.pt')

In [None]:
# Copy the weights of the checkpointed model back into `clf`.
checkpoint_path = data_path + 'nn_clf_covid_embedding.pt'
clf.load_state_dict(torch.load(checkpoint_path).state_dict())

In [None]:
# Score the held-out drugs with the current `clf` and record AUROC / AUPRC.
clf.eval()
with torch.no_grad():
    prob = clf(_X_test).sigmoid().squeeze().cpu().numpy()

auroc = metrics.roc_auc_score(y_test, prob)
auprc = metrics.average_precision_score(y_test, prob)
print("AUROC", auroc)
print("AUPRC", auprc)

list_dict.append(dict(
    embed_method='COVID-19 alone',
    embed_dim=embed_dim,
    pred_model='neural network ranking',
    AUROC=auroc,
    AUPRC=auprc,
))

In [None]:
# Checkpoint: report elapsed time and memory change since the previous checkpoint,
# then reset both baselines for the next section.
# NOTE: time.time() is wall-clock, so 'CPU time' below is elapsed seconds, and
# virtual_memory().available is machine-wide, so the figure also reflects other processes.
cpu_sec = time.time() - start_time
start_time = time.time()
end_memory = psutil.virtual_memory().available
mem_use = (start_memory - end_memory) / (1024.0 ** 2)  # bytes -> MiB
start_memory = end_memory
print(f'CPU time: {cpu_sec:.2f} sec')
print(f'Memory usage: {mem_use:.2f} MB')

## 1.2. baseline models

In [None]:
# Fit four scikit-learn baselines on the same split and record their test scores.
# Estimators that use randomness are seeded so reruns give the same numbers.
# NOTE: 'XGBoost' labels scikit-learn's GradientBoostingClassifier; the label is
# kept unchanged because it is written to the results table.
baseline_models = [
    ('Logit', 'logistic regression', LogisticRegression()),
    ('XGBoost', 'XGBoost', GradientBoostingClassifier(random_state=seed)),
    ('rf', 'random forest', RandomForestClassifier(random_state=seed)),
    ('svm', 'support vector machines',
     make_pipeline(StandardScaler(), SVC(gamma='auto', probability=True, random_state=seed))),
]
for print_label, model_name, estimator in baseline_models:
    clf = estimator.fit(X_train, y_train)
    # Score the test set once and reuse it for both metrics.
    test_prob = clf.predict_proba(X_test)[:, 1]
    auroc = roc_auc_score(y_test, test_prob)
    auprc = average_precision_score(y_test, test_prob)
    print(f"{print_label} AUROC", auroc)
    print(f"{print_label} AUPRC", auprc)
    list_dict.append({
        'embed_method': 'COVID-19 alone',
        'embed_dim': embed_dim,
        'pred_model': model_name,
        'AUROC': auroc,
        'AUPRC': auprc
    })

In [None]:
# Checkpoint: report elapsed time and memory change since the previous checkpoint,
# then reset both baselines for the next section.
# NOTE: time.time() is wall-clock, so 'CPU time' below is elapsed seconds, and
# virtual_memory().available is machine-wide, so the figure also reflects other processes.
cpu_sec = time.time() - start_time
start_time = time.time()
end_memory = psutil.virtual_memory().available
mem_use = (start_memory - end_memory) / (1024.0 ** 2)  # bytes -> MiB
start_memory = end_memory
print(f'CPU time: {cpu_sec:.2f} sec')
print(f'Memory usage: {mem_use:.2f} MB')

# 2. DRKG embedding alone

## 2.1. ranking model

In [None]:
# Load the node-feature matrix used in this section; the context manager closes
# the file handle that the bare open() call left open.
with open(data_path+'node_feature_'+exp_id+'.pkl','rb') as f:
    z_np = pickle.load(f)

In [None]:
# Hold out half of the drug nodes for testing; `indices` records which drug
# ends up in which half.
seed = 70
indices = np.arange(len(drug_labels))
(X_train, X_test,
 y_train, y_test,
 indices_train, indices_test) = train_test_split(
    z_np[types == 'drug'],
    drug_labels,
    indices,
    test_size=0.5,
    random_state=seed,
)

In [None]:
# Move the split onto the target device as float tensors.
# torch.autograd.Variable is a deprecated no-op wrapper, so plain tensors are used.
_X_train = torch.tensor(X_train, dtype=torch.float, device=device)
_y_train = torch.tensor(y_train, dtype=torch.float, device=device)
_X_test = torch.tensor(X_test, dtype=torch.float, device=device)
_y_test = torch.tensor(y_test, dtype=torch.float, device=device)

In [None]:
# Ranking model, optimiser and loss for the 400-d node features.
embed_dim = 400
# Seed torch so weight initialisation, dropout and negative sampling repeat across
# runs; previously only the train/test split was seeded.
torch.manual_seed(seed)
clf=Classifier(embedding_dim=embed_dim).to(device)
optimizer=torch.optim.Adam(clf.parameters())
criterion=BPRLoss(num_neg_samples=15)

In [None]:
# Train full-batch for 30 epochs and checkpoint the epoch with the best AUPRC.
# NOTE(review): the checkpoint is selected on the same test split that is reported
# afterwards, so the reported scores are optimistic; a validation split would fix that.
best_auprc=0
for epoch in range(30):
    clf.train()
    optimizer.zero_grad()
    out = clf(_X_train)
    loss=criterion(out.squeeze(), _y_train)
    loss.backward()
    optimizer.step()
    print('training loss',loss.item())

    clf.eval()
    # Evaluation needs no gradients; one forward pass serves both the test loss
    # and the probabilities.
    with torch.no_grad():
        test_out=clf(_X_test)
        print('test loss', criterion(test_out.squeeze(), _y_test).item())
        prob=torch.sigmoid(test_out).cpu().numpy().squeeze()
    auprc=metrics.average_precision_score(y_test,prob)
    if auprc>best_auprc:
        # Fix: this used to assign to a misspelled `best_auproc`, so the threshold
        # stayed at 0 and any epoch with a non-zero AUPRC overwrote the checkpoint.
        best_auprc=auprc
        torch.save(clf, data_path+'nn_clf_DRKG_embedding.pt')

In [None]:
# Copy the weights of the checkpointed model back into `clf`.
checkpoint_path = data_path + 'nn_clf_DRKG_embedding.pt'
clf.load_state_dict(torch.load(checkpoint_path).state_dict())

In [None]:
# Score the held-out drugs with the current `clf` and record AUROC / AUPRC.
clf.eval()
with torch.no_grad():
    prob = clf(_X_test).sigmoid().squeeze().cpu().numpy()

auroc = metrics.roc_auc_score(y_test, prob)
auprc = metrics.average_precision_score(y_test, prob)
print("AUROC", auroc)
print("AUPRC", auprc)

list_dict.append(dict(
    embed_method='DRKG alone',
    embed_dim=embed_dim,
    pred_model='neural network ranking',
    AUROC=auroc,
    AUPRC=auprc,
))

In [None]:
# Checkpoint: report elapsed time and memory change since the previous checkpoint,
# then reset both baselines for the next section.
# NOTE: time.time() is wall-clock, so 'CPU time' below is elapsed seconds, and
# virtual_memory().available is machine-wide, so the figure also reflects other processes.
cpu_sec = time.time() - start_time
start_time = time.time()
end_memory = psutil.virtual_memory().available
mem_use = (start_memory - end_memory) / (1024.0 ** 2)  # bytes -> MiB
start_memory = end_memory
print(f'CPU time: {cpu_sec:.2f} sec')
print(f'Memory usage: {mem_use:.2f} MB')

## 2.2. baseline models

In [None]:
# Fit four scikit-learn baselines on the same split and record their test scores.
# Estimators that use randomness are seeded so reruns give the same numbers.
# NOTE: 'XGBoost' labels scikit-learn's GradientBoostingClassifier; the label is
# kept unchanged because it is written to the results table.
baseline_models = [
    ('Logit', 'logistic regression', LogisticRegression()),
    ('XGBoost', 'XGBoost', GradientBoostingClassifier(random_state=seed)),
    ('rf', 'random forest', RandomForestClassifier(random_state=seed)),
    ('svm', 'support vector machines',
     make_pipeline(StandardScaler(), SVC(gamma='auto', probability=True, random_state=seed))),
]
for print_label, model_name, estimator in baseline_models:
    clf = estimator.fit(X_train, y_train)
    # Score the test set once and reuse it for both metrics.
    test_prob = clf.predict_proba(X_test)[:, 1]
    auroc = roc_auc_score(y_test, test_prob)
    auprc = average_precision_score(y_test, test_prob)
    print(f"{print_label} AUROC", auroc)
    print(f"{print_label} AUPRC", auprc)
    list_dict.append({
        'embed_method': 'DRKG alone',
        'embed_dim': embed_dim,
        'pred_model': model_name,
        'AUROC': auroc,
        'AUPRC': auprc
    })

In [None]:
# Checkpoint: report elapsed time and memory change since the previous checkpoint,
# then reset both baselines for the next section.
# NOTE: time.time() is wall-clock, so 'CPU time' below is elapsed seconds, and
# virtual_memory().available is machine-wide, so the figure also reflects other processes.
cpu_sec = time.time() - start_time
start_time = time.time()
end_memory = psutil.virtual_memory().available
mem_use = (start_memory - end_memory) / (1024.0 ** 2)  # bytes -> MiB
start_memory = end_memory
print(f'CPU time: {cpu_sec:.2f} sec')
print(f'Memory usage: {mem_use:.2f} MB')

# 3. hybrid embedding

## 3.1. embedding dimension: 128 (default)

### 3.1.1. ranking model

In [None]:
# Load the 128-d hybrid embedding; the context manager closes the file handle
# that the bare open() call left open.
embed_dim = 128
with open(data_path+f'hybrid_embedding_{embed_dim}_'+exp_id+'.pkl','rb') as f:
    z_np = pickle.load(f)

In [None]:
# Hold out half of the drug nodes for testing; `indices` records which drug
# ends up in which half.
seed = 70
indices = np.arange(len(drug_labels))
(X_train, X_test,
 y_train, y_test,
 indices_train, indices_test) = train_test_split(
    z_np[types == 'drug'],
    drug_labels,
    indices,
    test_size=0.5,
    random_state=seed,
)

In [None]:
# Move the split onto the target device as float tensors.
# torch.autograd.Variable is a deprecated no-op wrapper, so plain tensors are used.
_X_train = torch.tensor(X_train, dtype=torch.float, device=device)
_y_train = torch.tensor(y_train, dtype=torch.float, device=device)
_X_test = torch.tensor(X_test, dtype=torch.float, device=device)
_y_test = torch.tensor(y_test, dtype=torch.float, device=device)

In [None]:
# Ranking model, optimiser and loss sized to the current `embed_dim`.
# Seed torch so weight initialisation, dropout and negative sampling repeat across
# runs; previously only the train/test split was seeded.
torch.manual_seed(seed)
clf=Classifier(embedding_dim=embed_dim).to(device)
optimizer=torch.optim.Adam(clf.parameters())
criterion=BPRLoss(num_neg_samples=15)

In [None]:
# Train full-batch for 30 epochs and checkpoint the epoch with the best AUPRC.
# NOTE(review): the checkpoint is selected on the same test split that is reported
# afterwards, so the reported scores are optimistic; a validation split would fix that.
best_auprc=0
for epoch in range(30):
    clf.train()
    optimizer.zero_grad()
    out = clf(_X_train)
    loss=criterion(out.squeeze(), _y_train)
    loss.backward()
    optimizer.step()
    print('training loss',loss.item())

    clf.eval()
    # Evaluation needs no gradients; one forward pass serves both the test loss
    # and the probabilities.
    with torch.no_grad():
        test_out=clf(_X_test)
        print('test loss', criterion(test_out.squeeze(), _y_test).item())
        prob=torch.sigmoid(test_out).cpu().numpy().squeeze()
    auprc=metrics.average_precision_score(y_test,prob)
    if auprc>best_auprc:
        # Fix: this used to assign to a misspelled `best_auproc`, so the threshold
        # stayed at 0 and any epoch with a non-zero AUPRC overwrote the checkpoint.
        best_auprc=auprc
        torch.save(clf, data_path+f'nn_clf_hybrid_embedding_{embed_dim}.pt')

In [None]:
# Copy the weights of the checkpointed model back into `clf`.
checkpoint_path = data_path + f'nn_clf_hybrid_embedding_{embed_dim}.pt'
clf.load_state_dict(torch.load(checkpoint_path).state_dict())

In [None]:
# Score the held-out drugs with the current `clf` and record AUROC / AUPRC.
clf.eval()
with torch.no_grad():
    prob = clf(_X_test).sigmoid().squeeze().cpu().numpy()

auroc = metrics.roc_auc_score(y_test, prob)
auprc = metrics.average_precision_score(y_test, prob)
print("AUROC", auroc)
print("AUPRC", auprc)

list_dict.append(dict(
    embed_method='hybrid',
    embed_dim=embed_dim,
    pred_model='neural network ranking',
    AUROC=auroc,
    AUPRC=auprc,
))

In [None]:
# Checkpoint: report elapsed time and memory change since the previous checkpoint,
# then reset both baselines for the next section.
# NOTE: time.time() is wall-clock, so 'CPU time' below is elapsed seconds, and
# virtual_memory().available is machine-wide, so the figure also reflects other processes.
cpu_sec = time.time() - start_time
start_time = time.time()
end_memory = psutil.virtual_memory().available
mem_use = (start_memory - end_memory) / (1024.0 ** 2)  # bytes -> MiB
start_memory = end_memory
print(f'CPU time: {cpu_sec:.2f} sec')
print(f'Memory usage: {mem_use:.2f} MB')

### 3.1.2. baseline models

In [None]:
# Fit four scikit-learn baselines on the same split and record their test scores.
# Estimators that use randomness are seeded so reruns give the same numbers.
# NOTE: 'XGBoost' labels scikit-learn's GradientBoostingClassifier; the label is
# kept unchanged because it is written to the results table.
baseline_models = [
    ('Logit', 'logistic regression', LogisticRegression()),
    ('XGBoost', 'XGBoost', GradientBoostingClassifier(random_state=seed)),
    ('rf', 'random forest', RandomForestClassifier(random_state=seed)),
    ('svm', 'support vector machines',
     make_pipeline(StandardScaler(), SVC(gamma='auto', probability=True, random_state=seed))),
]
for print_label, model_name, estimator in baseline_models:
    clf = estimator.fit(X_train, y_train)
    # Score the test set once and reuse it for both metrics.
    test_prob = clf.predict_proba(X_test)[:, 1]
    auroc = roc_auc_score(y_test, test_prob)
    auprc = average_precision_score(y_test, test_prob)
    print(f"{print_label} AUROC", auroc)
    print(f"{print_label} AUPRC", auprc)
    list_dict.append({
        'embed_method': 'hybrid',
        'embed_dim': embed_dim,
        'pred_model': model_name,
        'AUROC': auroc,
        'AUPRC': auprc
    })

In [None]:
# Checkpoint: report elapsed time and memory change since the previous checkpoint,
# then reset both baselines for the next section.
# NOTE: time.time() is wall-clock, so 'CPU time' below is elapsed seconds, and
# virtual_memory().available is machine-wide, so the figure also reflects other processes.
cpu_sec = time.time() - start_time
start_time = time.time()
end_memory = psutil.virtual_memory().available
mem_use = (start_memory - end_memory) / (1024.0 ** 2)  # bytes -> MiB
start_memory = end_memory
print(f'CPU time: {cpu_sec:.2f} sec')
print(f'Memory usage: {mem_use:.2f} MB')

## 3.2. embedding dimension: 64

### 3.2.1. ranking model

In [None]:
# Load the 64-d hybrid embedding; the context manager closes the file handle
# that the bare open() call left open.
embed_dim = 64
with open(data_path+f'hybrid_embedding_{embed_dim}_'+exp_id+'.pkl','rb') as f:
    z_np = pickle.load(f)

In [None]:
z_np.shape

In [None]:
# Hold out half of the drug nodes for testing; `indices` records which drug
# ends up in which half.
seed = 70
indices = np.arange(len(drug_labels))
(X_train, X_test,
 y_train, y_test,
 indices_train, indices_test) = train_test_split(
    z_np[types == 'drug'],
    drug_labels,
    indices,
    test_size=0.5,
    random_state=seed,
)

In [None]:
# Move the split onto the target device as float tensors.
# torch.autograd.Variable is a deprecated no-op wrapper, so plain tensors are used.
_X_train = torch.tensor(X_train, dtype=torch.float, device=device)
_y_train = torch.tensor(y_train, dtype=torch.float, device=device)
_X_test = torch.tensor(X_test, dtype=torch.float, device=device)
_y_test = torch.tensor(y_test, dtype=torch.float, device=device)

In [None]:
# Ranking model, optimiser and loss sized to the current `embed_dim`.
# Seed torch so weight initialisation, dropout and negative sampling repeat across
# runs; previously only the train/test split was seeded.
torch.manual_seed(seed)
clf=Classifier(embedding_dim=embed_dim).to(device)
optimizer=torch.optim.Adam(clf.parameters())
criterion=BPRLoss(num_neg_samples=15)

In [None]:
# Train full-batch for 30 epochs and checkpoint the epoch with the best AUPRC.
# NOTE(review): the checkpoint is selected on the same test split that is reported
# afterwards, so the reported scores are optimistic; a validation split would fix that.
best_auprc=0
for epoch in range(30):
    clf.train()
    optimizer.zero_grad()
    out = clf(_X_train)
    loss=criterion(out.squeeze(), _y_train)
    loss.backward()
    optimizer.step()
    print('training loss',loss.item())

    clf.eval()
    # Evaluation needs no gradients; one forward pass serves both the test loss
    # and the probabilities.
    with torch.no_grad():
        test_out=clf(_X_test)
        print('test loss', criterion(test_out.squeeze(), _y_test).item())
        prob=torch.sigmoid(test_out).cpu().numpy().squeeze()
    auprc=metrics.average_precision_score(y_test,prob)
    if auprc>best_auprc:
        # Fix: this used to assign to a misspelled `best_auproc`, so the threshold
        # stayed at 0 and any epoch with a non-zero AUPRC overwrote the checkpoint.
        best_auprc=auprc
        torch.save(clf, data_path+f'nn_clf_hybrid_embedding_{embed_dim}.pt')

In [None]:
# Copy the weights of the checkpointed model back into `clf`.
checkpoint_path = data_path + f'nn_clf_hybrid_embedding_{embed_dim}.pt'
clf.load_state_dict(torch.load(checkpoint_path).state_dict())

In [None]:
# Score the held-out drugs with the current `clf` and record AUROC / AUPRC.
clf.eval()
with torch.no_grad():
    prob = clf(_X_test).sigmoid().squeeze().cpu().numpy()

auroc = metrics.roc_auc_score(y_test, prob)
auprc = metrics.average_precision_score(y_test, prob)
print("AUROC", auroc)
print("AUPRC", auprc)

list_dict.append(dict(
    embed_method='hybrid',
    embed_dim=embed_dim,
    pred_model='neural network ranking',
    AUROC=auroc,
    AUPRC=auprc,
))

In [None]:
# Checkpoint: report elapsed time and memory change since the previous checkpoint,
# then reset both baselines for the next section.
# NOTE: time.time() is wall-clock, so 'CPU time' below is elapsed seconds, and
# virtual_memory().available is machine-wide, so the figure also reflects other processes.
cpu_sec = time.time() - start_time
start_time = time.time()
end_memory = psutil.virtual_memory().available
mem_use = (start_memory - end_memory) / (1024.0 ** 2)  # bytes -> MiB
start_memory = end_memory
print(f'CPU time: {cpu_sec:.2f} sec')
print(f'Memory usage: {mem_use:.2f} MB')

### 3.2.2. baseline models

In [None]:
# Fit four scikit-learn baselines on the same split and record their test scores.
# Estimators that use randomness are seeded so reruns give the same numbers.
# NOTE: 'XGBoost' labels scikit-learn's GradientBoostingClassifier; the label is
# kept unchanged because it is written to the results table.
baseline_models = [
    ('Logit', 'logistic regression', LogisticRegression()),
    ('XGBoost', 'XGBoost', GradientBoostingClassifier(random_state=seed)),
    ('rf', 'random forest', RandomForestClassifier(random_state=seed)),
    ('svm', 'support vector machines',
     make_pipeline(StandardScaler(), SVC(gamma='auto', probability=True, random_state=seed))),
]
for print_label, model_name, estimator in baseline_models:
    clf = estimator.fit(X_train, y_train)
    # Score the test set once and reuse it for both metrics.
    test_prob = clf.predict_proba(X_test)[:, 1]
    auroc = roc_auc_score(y_test, test_prob)
    auprc = average_precision_score(y_test, test_prob)
    print(f"{print_label} AUROC", auroc)
    print(f"{print_label} AUPRC", auprc)
    list_dict.append({
        'embed_method': 'hybrid',
        'embed_dim': embed_dim,
        'pred_model': model_name,
        'AUROC': auroc,
        'AUPRC': auprc
    })

In [None]:
# Checkpoint: report elapsed time and memory change since the previous checkpoint,
# then reset both baselines for the next section.
# NOTE: time.time() is wall-clock, so 'CPU time' below is elapsed seconds, and
# virtual_memory().available is machine-wide, so the figure also reflects other processes.
cpu_sec = time.time() - start_time
start_time = time.time()
end_memory = psutil.virtual_memory().available
mem_use = (start_memory - end_memory) / (1024.0 ** 2)  # bytes -> MiB
start_memory = end_memory
print(f'CPU time: {cpu_sec:.2f} sec')
print(f'Memory usage: {mem_use:.2f} MB')

## 3.3. embedding dimension: 256

### 3.3.1. ranking model

In [None]:
# Load the 256-d hybrid embedding and display its shape; the context manager
# closes the file handle that the bare open() call left open.
embed_dim = 256
with open(data_path+f'hybrid_embedding_{embed_dim}_'+exp_id+'.pkl','rb') as f:
    z_np = pickle.load(f)
z_np.shape

In [None]:
# Hold out half of the drug nodes for testing; `indices` records which drug
# ends up in which half.
seed = 70
indices = np.arange(len(drug_labels))
(X_train, X_test,
 y_train, y_test,
 indices_train, indices_test) = train_test_split(
    z_np[types == 'drug'],
    drug_labels,
    indices,
    test_size=0.5,
    random_state=seed,
)

In [None]:
# Move the split onto the target device as float tensors.
# torch.autograd.Variable is a deprecated no-op wrapper, so plain tensors are used.
_X_train = torch.tensor(X_train, dtype=torch.float, device=device)
_y_train = torch.tensor(y_train, dtype=torch.float, device=device)
_X_test = torch.tensor(X_test, dtype=torch.float, device=device)
_y_test = torch.tensor(y_test, dtype=torch.float, device=device)

In [None]:
# Build the ranking model, then train full-batch for 30 epochs and checkpoint the
# epoch with the best AUPRC.
# Seed torch so weight initialisation, dropout and negative sampling repeat across
# runs; previously only the train/test split was seeded.
torch.manual_seed(seed)
clf=Classifier(embedding_dim=embed_dim).to(device)
optimizer=torch.optim.Adam(clf.parameters())
criterion=BPRLoss(num_neg_samples=15)

# NOTE(review): the checkpoint is selected on the same test split that is reported
# afterwards, so the reported scores are optimistic; a validation split would fix that.
best_auprc=0
for epoch in range(30):
    clf.train()
    optimizer.zero_grad()
    out = clf(_X_train)
    loss=criterion(out.squeeze(), _y_train)
    loss.backward()
    optimizer.step()
    print('training loss',loss.item())

    clf.eval()
    # Evaluation needs no gradients; one forward pass serves both the test loss
    # and the probabilities.
    with torch.no_grad():
        test_out=clf(_X_test)
        print('test loss', criterion(test_out.squeeze(), _y_test).item())
        prob=torch.sigmoid(test_out).cpu().numpy().squeeze()
    auprc=metrics.average_precision_score(y_test,prob)
    if auprc>best_auprc:
        # Fix: this used to assign to a misspelled `best_auproc`, so the threshold
        # stayed at 0 and any epoch with a non-zero AUPRC overwrote the checkpoint.
        best_auprc=auprc
        torch.save(clf, data_path+f'nn_clf_hybrid_embedding_{embed_dim}.pt')

In [None]:
# Copy the weights of the checkpointed model back into `clf`.
checkpoint_path = data_path + f'nn_clf_hybrid_embedding_{embed_dim}.pt'
clf.load_state_dict(torch.load(checkpoint_path).state_dict())

# Score the held-out drugs and record AUROC / AUPRC.
clf.eval()
with torch.no_grad():
    prob = clf(_X_test).sigmoid().squeeze().cpu().numpy()

auroc = metrics.roc_auc_score(y_test, prob)
auprc = metrics.average_precision_score(y_test, prob)
print("AUROC", auroc)
print("AUPRC", auprc)

list_dict.append(dict(
    embed_method='hybrid',
    embed_dim=embed_dim,
    pred_model='neural network ranking',
    AUROC=auroc,
    AUPRC=auprc,
))

In [None]:
# Checkpoint: report elapsed time and memory change since the previous checkpoint,
# then reset both baselines for the next section.
# NOTE: time.time() is wall-clock, so 'CPU time' below is elapsed seconds, and
# virtual_memory().available is machine-wide, so the figure also reflects other processes.
cpu_sec = time.time() - start_time
start_time = time.time()
end_memory = psutil.virtual_memory().available
mem_use = (start_memory - end_memory) / (1024.0 ** 2)  # bytes -> MiB
start_memory = end_memory
print(f'CPU time: {cpu_sec:.2f} sec')
print(f'Memory usage: {mem_use:.2f} MB')

### 3.3.2. baseline models

In [None]:
# Fit four scikit-learn baselines on the same split and record their test scores.
# Estimators that use randomness are seeded so reruns give the same numbers.
# NOTE: 'XGBoost' labels scikit-learn's GradientBoostingClassifier; the label is
# kept unchanged because it is written to the results table.
baseline_models = [
    ('Logit', 'logistic regression', LogisticRegression()),
    ('XGBoost', 'XGBoost', GradientBoostingClassifier(random_state=seed)),
    ('rf', 'random forest', RandomForestClassifier(random_state=seed)),
    ('svm', 'support vector machines',
     make_pipeline(StandardScaler(), SVC(gamma='auto', probability=True, random_state=seed))),
]
for print_label, model_name, estimator in baseline_models:
    clf = estimator.fit(X_train, y_train)
    # Score the test set once and reuse it for both metrics.
    test_prob = clf.predict_proba(X_test)[:, 1]
    auroc = roc_auc_score(y_test, test_prob)
    auprc = average_precision_score(y_test, test_prob)
    print(f"{print_label} AUROC", auroc)
    print(f"{print_label} AUPRC", auprc)
    list_dict.append({
        'embed_method': 'hybrid',
        'embed_dim': embed_dim,
        'pred_model': model_name,
        'AUROC': auroc,
        'AUPRC': auprc
    })

In [None]:
# Checkpoint: report elapsed time and memory change since the previous checkpoint,
# then reset both baselines for the next section.
# NOTE: time.time() is wall-clock, so 'CPU time' below is elapsed seconds, and
# virtual_memory().available is machine-wide, so the figure also reflects other processes.
cpu_sec = time.time() - start_time
start_time = time.time()
end_memory = psutil.virtual_memory().available
mem_use = (start_memory - end_memory) / (1024.0 ** 2)  # bytes -> MiB
start_memory = end_memory
print(f'CPU time: {cpu_sec:.2f} sec')
print(f'Memory usage: {mem_use:.2f} MB')

## 3.4. without bait-prey edges

### 3.4.1. ranking model

In [None]:
# Load the 128-d hybrid embedding built without bait-prey edges and display its
# shape; the context manager closes the file handle the bare open() call left open.
embed_dim = 128
with open(data_path+f'hybrid_embedding_{embed_dim}_no_bp_'+exp_id+'.pkl','rb') as f:
    z_np = pickle.load(f)
z_np.shape

In [None]:
# Hold out half of the drug nodes for testing; `indices` records which drug
# ends up in which half.
seed=70
indices = np.arange(len(drug_labels))
X_train, X_test, y_train, y_test,indices_train,indices_test=train_test_split(z_np[types=='drug'],drug_labels,indices, test_size=0.5,random_state=seed,)

# Move the split onto the target device as float tensors.
# torch.autograd.Variable is a deprecated no-op wrapper, so plain tensors are used.
_X_train = torch.tensor(X_train, dtype=torch.float, device=device)
_y_train = torch.tensor(y_train, dtype=torch.float, device=device)
_X_test = torch.tensor(X_test, dtype=torch.float, device=device)
_y_test = torch.tensor(y_test, dtype=torch.float, device=device)

In [None]:
# Build the ranking model, then train full-batch for 30 epochs and checkpoint the
# epoch with the best AUPRC.
# Seed torch so weight initialisation, dropout and negative sampling repeat across
# runs; previously only the train/test split was seeded.
torch.manual_seed(seed)
clf=Classifier(embedding_dim=embed_dim).to(device)
optimizer=torch.optim.Adam(clf.parameters())
criterion=BPRLoss(num_neg_samples=15)

# NOTE(review): the checkpoint is selected on the same test split that is reported
# afterwards, so the reported scores are optimistic; a validation split would fix that.
best_auprc=0
for epoch in range(30):
    clf.train()
    optimizer.zero_grad()
    out = clf(_X_train)
    loss=criterion(out.squeeze(), _y_train)
    loss.backward()
    optimizer.step()
    print('training loss',loss.item())

    clf.eval()
    # Evaluation needs no gradients; one forward pass serves both the test loss
    # and the probabilities.
    with torch.no_grad():
        test_out=clf(_X_test)
        print('test loss', criterion(test_out.squeeze(), _y_test).item())
        prob=torch.sigmoid(test_out).cpu().numpy().squeeze()
    auprc=metrics.average_precision_score(y_test,prob)
    if auprc>best_auprc:
        # Fix: this used to assign to a misspelled `best_auproc`, so the threshold
        # stayed at 0 and any epoch with a non-zero AUPRC overwrote the checkpoint.
        best_auprc=auprc
        torch.save(clf, data_path+f'nn_clf_hybrid_embedding_{embed_dim}_no_bp_.pt')

In [None]:
# Restore the checkpoint written by the training loop of this section.
# Fix: this used to load 'nn_clf_hybrid_embedding_{embed_dim}.pt', which is the model
# saved in section 3.1 for the full hybrid embedding, so the 'hybrid without
# bait-prey' neural-network row was computed with the wrong weights.
clf.load_state_dict(torch.load(data_path+f'nn_clf_hybrid_embedding_{embed_dim}_no_bp_.pt').state_dict())

# Score the held-out drugs and record AUROC / AUPRC.
clf.eval()

with torch.no_grad():
    prob=torch.sigmoid(clf(_X_test)).cpu().numpy().squeeze()
auroc = metrics.roc_auc_score(y_test,prob)
auprc = metrics.average_precision_score(y_test,prob)
print("AUROC", auroc)
print("AUPRC", auprc)

list_dict.append({
    'embed_method': 'hybrid without bait-prey',
    'embed_dim': embed_dim,
    'pred_model': 'neural network ranking',
    'AUROC': auroc,
    'AUPRC': auprc
})

In [None]:
# Checkpoint: report elapsed time and memory change since the previous checkpoint,
# then reset both baselines for the next section.
# NOTE: time.time() is wall-clock, so 'CPU time' below is elapsed seconds, and
# virtual_memory().available is machine-wide, so the figure also reflects other processes.
cpu_sec = time.time() - start_time
start_time = time.time()
end_memory = psutil.virtual_memory().available
mem_use = (start_memory - end_memory) / (1024.0 ** 2)  # bytes -> MiB
start_memory = end_memory
print(f'CPU time: {cpu_sec:.2f} sec')
print(f'Memory usage: {mem_use:.2f} MB')

### 3.4.2. baseline models

In [None]:
# Fit four scikit-learn baselines on the same split and record their test scores.
# Estimators that use randomness are seeded so reruns give the same numbers.
# NOTE: 'XGBoost' labels scikit-learn's GradientBoostingClassifier; the label is
# kept unchanged because it is written to the results table.
baseline_models = [
    ('Logit', 'logistic regression', LogisticRegression()),
    ('XGBoost', 'XGBoost', GradientBoostingClassifier(random_state=seed)),
    ('rf', 'random forest', RandomForestClassifier(random_state=seed)),
    ('svm', 'support vector machines',
     make_pipeline(StandardScaler(), SVC(gamma='auto', probability=True, random_state=seed))),
]
for print_label, model_name, estimator in baseline_models:
    clf = estimator.fit(X_train, y_train)
    # Score the test set once and reuse it for both metrics.
    test_prob = clf.predict_proba(X_test)[:, 1]
    auroc = roc_auc_score(y_test, test_prob)
    auprc = average_precision_score(y_test, test_prob)
    print(f"{print_label} AUROC", auroc)
    print(f"{print_label} AUPRC", auprc)
    list_dict.append({
        'embed_method': 'hybrid without bait-prey',
        'embed_dim': embed_dim,
        'pred_model': model_name,
        'AUROC': auroc,
        'AUPRC': auprc
    })

In [None]:
# Checkpoint: report elapsed time and memory change since the previous checkpoint,
# then reset both baselines for the next section.
# NOTE: time.time() is wall-clock, so 'CPU time' below is elapsed seconds, and
# virtual_memory().available is machine-wide, so the figure also reflects other processes.
cpu_sec = time.time() - start_time
start_time = time.time()
end_memory = psutil.virtual_memory().available
mem_use = (start_memory - end_memory) / (1024.0 ** 2)  # bytes -> MiB
start_memory = end_memory
print(f'CPU time: {cpu_sec:.2f} sec')
print(f'Memory usage: {mem_use:.2f} MB')

# 4. save all AUROC and AUPRC results for plotting

In [None]:
import pandas as pd

# Turn the collected result rows into one table and write it, without the index
# column, to a CSV named after the experiment id.
summary_path = f'df_sum_{exp_id}.csv'
df_sum = pd.DataFrame(list_dict)
df_sum.to_csv(summary_path, index=False)