# 1.COVID-19 graph embedding alone

## 1.1. ranking model

In [1]:
import numpy as np
import pandas as pd
import time
import re
import math
import random
import pickle

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics
from sklearn.metrics import roc_auc_score, roc_curve, average_precision_score

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.nn.modules import Module
from torch.utils.data import Dataset, DataLoader

In [2]:
data_path='data/'
exp_id='v0'
device_id='cpu' #'cpu' if CPU, device number if GPU

In [3]:
device=torch.device(device_id)

In [4]:
le=pickle.load(open(data_path+'LabelEncoder_'+exp_id+'.pkl', 'rb'))
edge_index=pickle.load(open(data_path+'edge_index_'+exp_id+'.pkl','rb'))

In [5]:
types=np.array([item.split('_')[0] for item in le.classes_ ])

In [6]:
#label
trials=pd.read_excel(data_path+'literature-mining/All_trails_5_24.xlsx',header=1,index_col=0)
trials_drug=set([drug.strip().upper() for lst in trials.loc[trials['study_category'].apply(lambda x: 'drug' in x.lower()),'intervention'].apply(lambda x: re.split(r'[+|/|,]',x.replace(' vs. ', '/').replace(' vs ', '/').replace(' or ', '/').replace(' with and without ', '/').replace(' /wo ', '/').replace(' /w ', '/').replace(' and ', '/').replace(' - ', '/').replace(' (', '/').replace(') ', '/'))).values for drug in lst])
drug_labels=[1 if drug.split('_')[1] in trials_drug else 0 for drug in le.classes_[types=='drug'] ]

In [7]:
z_np = pickle.load(open(data_path+'COVID_embedding_'+exp_id+'.pkl', 'rb'))

In [8]:
seed=70
indices = np.arange(len(drug_labels))
X_train, X_test, y_train, y_test,indices_train,indices_test=train_test_split(z_np[types=='drug'],drug_labels,indices, test_size=0.5,random_state=seed,)

In [9]:
#Variable wrapping for torch.tensor
_X_train, _y_train=Variable(torch.tensor(X_train,dtype=torch.float).to(device)), Variable(torch.tensor(y_train,dtype=torch.float).to(device))
_X_test, _y_test=Variable(torch.tensor(X_test,dtype=torch.float).to(device)), Variable(torch.tensor(y_test,dtype=torch.float).to(device))

In [10]:
class Classifier(nn.Module):
    def __init__(self,embedding_dim):
        super(Classifier, self).__init__() 
        self.fc1=nn.Linear(embedding_dim,embedding_dim)
        self.fc2=nn.Linear(embedding_dim,1)
        self.bn=nn.BatchNorm1d(embedding_dim)
    def forward(self, x):
        residual1 = x
        x = F.dropout(x, training=self.training)
        x= self.bn(F.dropout(F.relu(self.fc1(x)),training=self.training))
        x += residual1  
        return self.fc2(x)

In [11]:
from torch.utils.data import BatchSampler, WeightedRandomSampler
class BPRLoss(nn.Module):
    def __init__(self, num_neg_samples):
        super(BPRLoss, self).__init__()
        self.num_neg_samples=num_neg_samples
    
    def forward(self, output, label):
        positive_output=output[label==1]
        negative_output=output[label!=1]
        
        #negative sample proportional to the high values
        negative_sampler=WeightedRandomSampler(negative_output-min(negative_output), num_samples=self.num_neg_samples*len(positive_output),replacement=True)
        negative_sample_output=negative_output[torch.tensor(list(BatchSampler(negative_sampler, batch_size=len(positive_output),drop_last=True)),dtype=torch.long).t()]
        return -(positive_output.view(-1,1)-negative_sample_output).sigmoid().log().mean()

In [12]:
clf=Classifier(embedding_dim=128).to(device)
optimizer=torch.optim.Adam(clf.parameters())
criterion=BPRLoss(num_neg_samples=15)

In [13]:
best_auprc=0
for epoch in range(30):
    clf.train()
    optimizer.zero_grad()
    out = clf(_X_train)
    loss=criterion(out.squeeze(), _y_train)
    loss.backward()
    optimizer.step()   
    print('training loss',loss.item())

    clf.eval()
    print('test loss', criterion(clf(_X_test).squeeze(), _y_test).item())
    prob=torch.sigmoid(clf(_X_test)).cpu().detach().numpy().squeeze()
    auprc=metrics.average_precision_score(y_test,prob)
    if auprc>best_auprc:
        best_auproc=auprc
        torch.save(clf, data_path+'nn_clf_covid_embedding.pt')

training loss 0.8716539144515991
test loss 0.6930662393569946
training loss 0.8322080373764038
test loss 0.6930374503135681
training loss 0.7600246667861938
test loss 0.6930240988731384
training loss 0.7626522183418274
test loss 0.6929814219474792
training loss 0.8592816591262817
test loss 0.692952573299408
training loss 0.8270158171653748
test loss 0.6930034160614014
training loss 0.8540498614311218
test loss 0.6929587125778198
training loss 0.76790851354599
test loss 0.6929677724838257
training loss 0.8421787619590759
test loss 0.6929559111595154
training loss 0.8273248076438904
test loss 0.6929405927658081
training loss 0.842562198638916
test loss 0.6929025650024414
training loss 0.7815515398979187
test loss 0.692903995513916
training loss 0.8356725573539734
test loss 0.6928220987319946
training loss 0.8115183115005493
test loss 0.6929302215576172
training loss 0.7723425626754761
test loss 0.6928080320358276
training loss 0.8501881957054138
test loss 0.6928654909133911
training loss

In [14]:
clf.load_state_dict(torch.load(data_path+'nn_clf_covid_embedding.pt').state_dict())

<All keys matched successfully>

In [15]:
#Compute AUC
clf.eval()

prob=torch.sigmoid(clf(_X_test)).cpu().detach().numpy().squeeze()
print("AUROC", metrics.roc_auc_score(y_test,prob))
print("AUPRC", metrics.average_precision_score(y_test,prob))

AUROC 0.6287290912226247
AUPRC 0.09106576446619787


## 1.2. baseline models

In [16]:
clf=LogisticRegression().fit(X_train,y_train)
print("Logit AUROC", roc_auc_score(y_test,clf.predict_proba(X_test)[:,1]))
print("Logit AUPRC", average_precision_score(y_test,clf.predict_proba(X_test)[:,1]))

Logit AUROC 0.6513818997005754
Logit AUPRC 0.09660661565548181


In [17]:
clf=GradientBoostingClassifier().fit(X_train,y_train)
print("XGBoost AUROC", roc_auc_score(y_test,clf.predict_proba(X_test)[:,1]))
print("XGBoost AUPRC", average_precision_score(y_test,clf.predict_proba(X_test)[:,1]))

XGBoost AUROC 0.6683754252300552
XGBoost AUPRC 0.09523363019699255


In [18]:
clf=RandomForestClassifier().fit(X_train,y_train)
print("rf AUROC", roc_auc_score(y_test,clf.predict_proba(X_test)[:,1]))
print("rf AUPRC", average_precision_score(y_test,clf.predict_proba(X_test)[:,1]))

rf AUROC 0.5370048127420087
rf AUPRC 0.07398467441322117


In [19]:
clf=make_pipeline(StandardScaler(), SVC(gamma='auto',probability=True)).fit(X_train,y_train)
print("svm AUROC", roc_auc_score(y_test,clf.predict_proba(X_test)[:,1]))
print("svm AUPRC", average_precision_score(y_test,clf.predict_proba(X_test)[:,1]))

svm AUROC 0.6655457837558199
svm AUPRC 0.14372443386651776


# 2. DRKG embedding alone

## 2.1. ranking model

In [20]:
z_np = pickle.load(open(data_path+'node_feature_'+exp_id+'.pkl','rb'))

In [21]:
seed=70
indices = np.arange(len(drug_labels))
X_train, X_test, y_train, y_test,indices_train,indices_test=train_test_split(z_np[types=='drug'],drug_labels,indices, test_size=0.5,random_state=seed,)

In [22]:
#Variable wrapping for torch.tensor
_X_train, _y_train=Variable(torch.tensor(X_train,dtype=torch.float).to(device)), Variable(torch.tensor(y_train,dtype=torch.float).to(device))
_X_test, _y_test=Variable(torch.tensor(X_test,dtype=torch.float).to(device)), Variable(torch.tensor(y_test,dtype=torch.float).to(device))

In [23]:
clf=Classifier(embedding_dim=400).to(device)
optimizer=torch.optim.Adam(clf.parameters())
criterion=BPRLoss(num_neg_samples=15)

In [24]:
best_auprc=0
for epoch in range(30):
    clf.train()
    optimizer.zero_grad()
    out = clf(_X_train)
    loss=criterion(out.squeeze(), _y_train)
    loss.backward()
    optimizer.step()   
    print('training loss',loss.item())

    clf.eval()
    print('test loss', criterion(clf(_X_test).squeeze(), _y_test).item())
    prob=torch.sigmoid(clf(_X_test)).cpu().detach().numpy().squeeze()
    auprc=metrics.average_precision_score(y_test,prob)
    if auprc>best_auprc:
        best_auproc=auprc
        torch.save(clf, data_path+'nn_clf_DRKG_embedding.pt')

training loss 0.7944270372390747
test loss 0.6170163154602051
training loss 0.48377883434295654
test loss 0.5800155997276306
training loss 0.39509230852127075
test loss 0.5775877237319946
training loss 0.3760899305343628
test loss 0.6104579567909241
training loss 0.27303117513656616
test loss 0.6286222338676453
training loss 0.3109939992427826
test loss 0.6282638311386108
training loss 0.2763216197490692
test loss 0.63301020860672
training loss 0.25853246450424194
test loss 0.6356980800628662
training loss 0.26446986198425293
test loss 0.674058198928833
training loss 0.2692054808139801
test loss 0.6779187321662903
training loss 0.229512557387352
test loss 0.6826764345169067
training loss 0.24521872401237488
test loss 0.6548711061477661
training loss 0.19644127786159515
test loss 0.6817864775657654
training loss 0.20006510615348816
test loss 0.6720173358917236
training loss 0.27677255868911743
test loss 0.6485769748687744
training loss 0.23483483493328094
test loss 0.6717806458473206
tr

In [25]:
clf.load_state_dict(torch.load(data_path+'nn_clf_DRKG_embedding.pt').state_dict())

<All keys matched successfully>

In [26]:
#Compute AUC
clf.eval()

prob=torch.sigmoid(clf(_X_test)).cpu().detach().numpy().squeeze()
print("AUROC", metrics.roc_auc_score(y_test,prob))
print("AUPRC", metrics.average_precision_score(y_test,prob))

AUROC 0.8668500838702599
AUPRC 0.17730709970489889


## 2.2. baseline models

In [27]:
clf=LogisticRegression().fit(X_train,y_train)
print("Logit AUROC", roc_auc_score(y_test,clf.predict_proba(X_test)[:,1]))
print("Logit AUPRC", average_precision_score(y_test,clf.predict_proba(X_test)[:,1]))

Logit AUROC 0.8436015613977331
Logit AUPRC 0.18394788718419244


In [28]:
clf=GradientBoostingClassifier().fit(X_train,y_train)
print("XGBoost AUROC", roc_auc_score(y_test,clf.predict_proba(X_test)[:,1]))
print("XGBoost AUPRC", average_precision_score(y_test,clf.predict_proba(X_test)[:,1]))

XGBoost AUROC 0.8583611594475536
XGBoost AUPRC 0.15191194989385046


In [29]:
clf=RandomForestClassifier().fit(X_train,y_train)
print("rf AUROC", roc_auc_score(y_test,clf.predict_proba(X_test)[:,1]))
print("rf AUPRC", average_precision_score(y_test,clf.predict_proba(X_test)[:,1]))

rf AUROC 0.8511420464343382
rf AUPRC 0.13026709035014253


In [30]:
clf=make_pipeline(StandardScaler(), SVC(gamma='auto',probability=True)).fit(X_train,y_train)
print("svm AUROC", roc_auc_score(y_test,clf.predict_proba(X_test)[:,1]))
print("svm AUPRC", average_precision_score(y_test,clf.predict_proba(X_test)[:,1]))

svm AUROC 0.8842276881594006
svm AUPRC 0.2578582442037748


# 3. hybrid embedding

## 3.1. embedding dimension: 128 (default)

### 3.1.1. ranking model

In [38]:
embed_dim = 128
z_np = pickle.load(open(data_path+f'hybrid_embedding_{embed_dim}_'+exp_id+'.pkl','rb'))

In [40]:
seed=70
indices = np.arange(len(drug_labels))
X_train, X_test, y_train, y_test,indices_train,indices_test=train_test_split(z_np[types=='drug'],drug_labels,indices, test_size=0.5,random_state=seed,)

In [41]:
#Variable wrapping for torch.tensor
_X_train, _y_train=Variable(torch.tensor(X_train,dtype=torch.float).to(device)), Variable(torch.tensor(y_train,dtype=torch.float).to(device))
_X_test, _y_test=Variable(torch.tensor(X_test,dtype=torch.float).to(device)), Variable(torch.tensor(y_test,dtype=torch.float).to(device))

In [45]:
clf=Classifier(embedding_dim=embed_dim).to(device)
optimizer=torch.optim.Adam(clf.parameters())
criterion=BPRLoss(num_neg_samples=15)

In [46]:
best_auprc=0
for epoch in range(30):
    clf.train()
    optimizer.zero_grad()
    out = clf(_X_train)
    loss=criterion(out.squeeze(), _y_train)
    loss.backward()
    optimizer.step()   
    print('training loss',loss.item())

    clf.eval()
    print('test loss', criterion(clf(_X_test).squeeze(), _y_test).item())
    prob=torch.sigmoid(clf(_X_test)).cpu().detach().numpy().squeeze()
    auprc=metrics.average_precision_score(y_test,prob)
    if auprc>best_auprc:
        best_auproc=auprc
        torch.save(clf, data_path+f'nn_clf_hybrid_embedding_{embed_dim}.pt')

training loss 0.7538624405860901
test loss 0.6290020942687988
training loss 0.5243105292320251
test loss 0.6215382814407349
training loss 0.5414445400238037
test loss 0.6241465210914612
training loss 0.4407672882080078
test loss 0.6238628625869751
training loss 0.45946377515792847
test loss 0.6108922362327576
training loss 0.4036145806312561
test loss 0.6018462181091309
training loss 0.38138726353645325
test loss 0.6161376237869263
training loss 0.3851678967475891
test loss 0.6237032413482666
training loss 0.3674241900444031
test loss 0.6209856867790222
training loss 0.449250191450119
test loss 0.6209054589271545
training loss 0.4135022461414337
test loss 0.6220009326934814
training loss 0.33527398109436035
test loss 0.6344513297080994
training loss 0.4725155532360077
test loss 0.6190335750579834
training loss 0.39669251441955566
test loss 0.6208900809288025
training loss 0.37401318550109863
test loss 0.6573752760887146
training loss 0.4095124304294586
test loss 0.6529629230499268
trai

In [47]:
clf.load_state_dict(torch.load(data_path+f'nn_clf_hybrid_embedding_{embed_dim}.pt').state_dict())

<All keys matched successfully>

In [48]:
#Compute AUC
clf.eval()

prob=torch.sigmoid(clf(_X_test)).cpu().detach().numpy().squeeze()
print("AUROC", metrics.roc_auc_score(y_test,prob))
print("AUPRC", metrics.average_precision_score(y_test,prob))

AUROC 0.8998063929517629
AUPRC 0.2583046193010424


### 3.1.2. baseline models

In [49]:
clf=LogisticRegression().fit(X_train,y_train)
print("Logit AUROC", roc_auc_score(y_test,clf.predict_proba(X_test)[:,1]))
print("Logit AUPRC", average_precision_score(y_test,clf.predict_proba(X_test)[:,1]))

Logit AUROC 0.905152142218878
Logit AUPRC 0.3305250793741263


In [50]:
clf=GradientBoostingClassifier().fit(X_train,y_train)
print("XGBoost AUROC", roc_auc_score(y_test,clf.predict_proba(X_test)[:,1]))
print("XGBoost AUPRC", average_precision_score(y_test,clf.predict_proba(X_test)[:,1]))

XGBoost AUROC 0.8752292715044914
XGBoost AUPRC 0.21914690333250755


In [51]:
clf=RandomForestClassifier().fit(X_train,y_train)
print("rf AUROC", roc_auc_score(y_test,clf.predict_proba(X_test)[:,1]))
print("rf AUPRC", average_precision_score(y_test,clf.predict_proba(X_test)[:,1]))

rf AUROC 0.8852662684788913
rf AUPRC 0.28619761285067663


In [52]:
clf=make_pipeline(StandardScaler(), SVC(gamma='auto',probability=True)).fit(X_train,y_train)
print("svm AUROC", roc_auc_score(y_test,clf.predict_proba(X_test)[:,1]))
print("svm AUPRC", average_precision_score(y_test,clf.predict_proba(X_test)[:,1]))

svm AUROC 0.7705129411026981
svm AUPRC 0.29489019949865203


## 3.2. embedding dimension: 64

### 3.2.1. ranking model

In [54]:
embed_dim = 64
z_np = pickle.load(open(data_path+f'hybrid_embedding_{embed_dim}_'+exp_id+'.pkl','rb'))

In [55]:
z_np.shape

(15444, 64)

In [56]:
seed=70
indices = np.arange(len(drug_labels))
X_train, X_test, y_train, y_test,indices_train,indices_test=train_test_split(z_np[types=='drug'],drug_labels,indices, test_size=0.5,random_state=seed,)

In [57]:
#Variable wrapping for torch.tensor
_X_train, _y_train=Variable(torch.tensor(X_train,dtype=torch.float).to(device)), Variable(torch.tensor(y_train,dtype=torch.float).to(device))
_X_test, _y_test=Variable(torch.tensor(X_test,dtype=torch.float).to(device)), Variable(torch.tensor(y_test,dtype=torch.float).to(device))

In [58]:
clf=Classifier(embedding_dim=embed_dim).to(device)
optimizer=torch.optim.Adam(clf.parameters())
criterion=BPRLoss(num_neg_samples=15)

In [59]:
best_auprc=0
for epoch in range(30):
    clf.train()
    optimizer.zero_grad()
    out = clf(_X_train)
    loss=criterion(out.squeeze(), _y_train)
    loss.backward()
    optimizer.step()   
    print('training loss',loss.item())

    clf.eval()
    print('test loss', criterion(clf(_X_test).squeeze(), _y_test).item())
    prob=torch.sigmoid(clf(_X_test)).cpu().detach().numpy().squeeze()
    auprc=metrics.average_precision_score(y_test,prob)
    if auprc>best_auprc:
        best_auproc=auprc
        torch.save(clf, data_path+f'nn_clf_hybrid_embedding_{embed_dim}.pt')

training loss 0.7560538053512573
test loss 0.7096765041351318
training loss 0.7549910545349121
test loss 0.7040911912918091
training loss 0.7865292429924011
test loss 0.6966719627380371
training loss 0.680122435092926
test loss 0.6897680163383484
training loss 0.650279700756073
test loss 0.6816246509552002
training loss 0.6703377962112427
test loss 0.6766055226325989
training loss 0.6379246711730957
test loss 0.6697896122932434
training loss 0.6141386032104492
test loss 0.6634857654571533
training loss 0.6358440518379211
test loss 0.660664439201355
training loss 0.5682786703109741
test loss 0.655004620552063
training loss 0.5320441126823425
test loss 0.6551922559738159
training loss 0.5863359570503235
test loss 0.6535322070121765
training loss 0.6095439791679382
test loss 0.6523875594139099
training loss 0.5119084119796753
test loss 0.6472150683403015
training loss 0.5629633665084839
test loss 0.6450152397155762
training loss 0.467242956161499
test loss 0.6385010480880737
training loss

In [60]:
clf.load_state_dict(torch.load(data_path+f'nn_clf_hybrid_embedding_{embed_dim}.pt').state_dict())

<All keys matched successfully>

In [61]:
#Compute AUC
clf.eval()

prob=torch.sigmoid(clf(_X_test)).cpu().detach().numpy().squeeze()
print("AUROC", metrics.roc_auc_score(y_test,prob))
print("AUPRC", metrics.average_precision_score(y_test,prob))

AUROC 0.8747158601012714
AUPRC 0.21495240817586903


### 3.2.2. baseline models

In [62]:
clf=LogisticRegression().fit(X_train,y_train)
print("Logit AUROC", roc_auc_score(y_test,clf.predict_proba(X_test)[:,1]))
print("Logit AUPRC", average_precision_score(y_test,clf.predict_proba(X_test)[:,1]))

Logit AUROC 0.8988814685917634
Logit AUPRC 0.27234791247919043


In [63]:
clf=GradientBoostingClassifier().fit(X_train,y_train)
print("XGBoost AUROC", roc_auc_score(y_test,clf.predict_proba(X_test)[:,1]))
print("XGBoost AUPRC", average_precision_score(y_test,clf.predict_proba(X_test)[:,1]))

XGBoost AUROC 0.8616650206148395
XGBoost AUPRC 0.17929335139129227


In [64]:
clf=RandomForestClassifier().fit(X_train,y_train)
print("rf AUROC", roc_auc_score(y_test,clf.predict_proba(X_test)[:,1]))
print("rf AUPRC", average_precision_score(y_test,clf.predict_proba(X_test)[:,1]))

rf AUROC 0.849249086833153
rf AUPRC 0.1703973637462179


In [65]:
clf=make_pipeline(StandardScaler(), SVC(gamma='auto',probability=True)).fit(X_train,y_train)
print("svm AUROC", roc_auc_score(y_test,clf.predict_proba(X_test)[:,1]))
print("svm AUPRC", average_precision_score(y_test,clf.predict_proba(X_test)[:,1]))

svm AUROC 0.7916921412782769
svm AUPRC 0.303447323202348


## 3.3. embedding dimension: 256

### 3.3.1. ranking model

In [66]:
embed_dim = 256
z_np = pickle.load(open(data_path+f'hybrid_embedding_{embed_dim}_'+exp_id+'.pkl','rb'))
z_np.shape

(15444, 256)

In [67]:
seed=70
indices = np.arange(len(drug_labels))
X_train, X_test, y_train, y_test,indices_train,indices_test=train_test_split(z_np[types=='drug'],drug_labels,indices, test_size=0.5,random_state=seed,)

In [68]:
#Variable wrapping for torch.tensor
_X_train, _y_train=Variable(torch.tensor(X_train,dtype=torch.float).to(device)), Variable(torch.tensor(y_train,dtype=torch.float).to(device))
_X_test, _y_test=Variable(torch.tensor(X_test,dtype=torch.float).to(device)), Variable(torch.tensor(y_test,dtype=torch.float).to(device))

In [69]:
clf=Classifier(embedding_dim=embed_dim).to(device)
optimizer=torch.optim.Adam(clf.parameters())
criterion=BPRLoss(num_neg_samples=15)

best_auprc=0
for epoch in range(30):
    clf.train()
    optimizer.zero_grad()
    out = clf(_X_train)
    loss=criterion(out.squeeze(), _y_train)
    loss.backward()
    optimizer.step()   
    print('training loss',loss.item())

    clf.eval()
    print('test loss', criterion(clf(_X_test).squeeze(), _y_test).item())
    prob=torch.sigmoid(clf(_X_test)).cpu().detach().numpy().squeeze()
    auprc=metrics.average_precision_score(y_test,prob)
    if auprc>best_auprc:
        best_auproc=auprc
        torch.save(clf, data_path+f'nn_clf_hybrid_embedding_{embed_dim}.pt')

training loss 0.8388015031814575
test loss 0.6239639520645142
training loss 0.4660673439502716
test loss 0.6188392639160156
training loss 0.4223840534687042
test loss 0.6687822937965393
training loss 0.431878924369812
test loss 0.6985674500465393
training loss 0.4525814950466156
test loss 0.744674801826477
training loss 0.5594502687454224
test loss 0.7303826808929443
training loss 0.5496017336845398
test loss 0.7981721758842468
training loss 0.4734538197517395
test loss 0.8239403367042542
training loss 0.49881720542907715
test loss 0.8360521793365479
training loss 0.6051755547523499
test loss 0.8378788232803345
training loss 0.4763484299182892
test loss 0.8756752014160156
training loss 0.49863147735595703
test loss 0.8731870651245117
training loss 0.5167421698570251
test loss 0.8868876695632935
training loss 0.45275208353996277
test loss 0.9023710489273071
training loss 0.5487418174743652
test loss 0.8447073101997375
training loss 0.46862584352493286
test loss 0.870251476764679
trainin

In [70]:
clf.load_state_dict(torch.load(data_path+f'nn_clf_hybrid_embedding_{embed_dim}.pt').state_dict())

#Compute AUC
clf.eval()

prob=torch.sigmoid(clf(_X_test)).cpu().detach().numpy().squeeze()
print("AUROC", metrics.roc_auc_score(y_test,prob))
print("AUPRC", metrics.average_precision_score(y_test,prob))

AUROC 0.8960988571697314
AUPRC 0.2096530817404918


## 3.4. without bait-prey edges

### 3.4.1. ranking model

In [71]:
embed_dim = 128
z_np = pickle.load(open(data_path+f'hybrid_embedding_{embed_dim}_no_bp_'+exp_id+'.pkl','rb'))
z_np.shape

(15444, 128)

In [73]:
seed=70
indices = np.arange(len(drug_labels))
X_train, X_test, y_train, y_test,indices_train,indices_test=train_test_split(z_np[types=='drug'],drug_labels,indices, test_size=0.5,random_state=seed,)

#Variable wrapping for torch.tensor
_X_train, _y_train=Variable(torch.tensor(X_train,dtype=torch.float).to(device)), Variable(torch.tensor(y_train,dtype=torch.float).to(device))
_X_test, _y_test=Variable(torch.tensor(X_test,dtype=torch.float).to(device)), Variable(torch.tensor(y_test,dtype=torch.float).to(device))

In [74]:
clf=Classifier(embedding_dim=embed_dim).to(device)
optimizer=torch.optim.Adam(clf.parameters())
criterion=BPRLoss(num_neg_samples=15)

best_auprc=0
for epoch in range(30):
    clf.train()
    optimizer.zero_grad()
    out = clf(_X_train)
    loss=criterion(out.squeeze(), _y_train)
    loss.backward()
    optimizer.step()   
    print('training loss',loss.item())

    clf.eval()
    print('test loss', criterion(clf(_X_test).squeeze(), _y_test).item())
    prob=torch.sigmoid(clf(_X_test)).cpu().detach().numpy().squeeze()
    auprc=metrics.average_precision_score(y_test,prob)
    if auprc>best_auprc:
        best_auproc=auprc
        torch.save(clf, data_path+f'nn_clf_hybrid_embedding_{embed_dim}_no_bp_.pt')

training loss 1.0453006029129028
test loss 0.7818886041641235
training loss 0.9210878610610962
test loss 0.7580038905143738
training loss 0.8879008293151855
test loss 0.7397895455360413
training loss 0.8280649185180664
test loss 0.7168262600898743
training loss 0.7898090481758118
test loss 0.6982943415641785
training loss 0.6602843999862671
test loss 0.6799466013908386
training loss 0.587087094783783
test loss 0.6667906045913696
training loss 0.5509982109069824
test loss 0.6455351114273071
training loss 0.5651601552963257
test loss 0.6387615203857422
training loss 0.5862936973571777
test loss 0.6144457459449768
training loss 0.5239415764808655
test loss 0.6008878350257874
training loss 0.5182040929794312
test loss 0.5885345339775085
training loss 0.4950651228427887
test loss 0.5700702667236328
training loss 0.47883421182632446
test loss 0.5527593493461609
training loss 0.4223332405090332
test loss 0.5429217219352722
training loss 0.4268721342086792
test loss 0.5372671484947205
training

In [75]:
clf.load_state_dict(torch.load(data_path+f'nn_clf_hybrid_embedding_{embed_dim}_no_bp_.pt').state_dict())

#Compute AUC
clf.eval()

prob=torch.sigmoid(clf(_X_test)).cpu().detach().numpy().squeeze()
print("AUROC", metrics.roc_auc_score(y_test,prob))
print("AUPRC", metrics.average_precision_score(y_test,prob))

AUROC 0.8636481211494145
AUPRC 0.1413109092759659
