In [35]:
from sklearn.metrics import roc_auc_score, precision_recall_curve
from sklearn.metrics import auc as calculate_auc
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from scipy.stats.stats import pearsonr
import os
import subprocess
import pandas as pd
import numpy as np

# Seeds naming the per-run sub-folders that every evaluation cell iterates over.
# All are powers of two: 2**1 and 2**4 .. 2**12.
random_seeds = [2 ** exponent for exponent in (1, 4, 5, 6, 7, 8, 9, 10, 11, 12)]

def r2(x,y):
    """Return the squared Pearson correlation coefficient between ``x`` and ``y``.

    The p-value that ``pearsonr`` also reports is discarded.
    """
    return pearsonr(x, y)[0] ** 2


def rmse(x, y):
    """Return the root-mean-square error between ``x`` and ``y``.

    Computed as the square root of ``mean_squared_error(x, y)``.
    """
    return np.sqrt(mean_squared_error(x, y))

def prc_auc(y_true, y_score):
    """Return the area under the precision-recall curve (PRC-AUC).

    The curve comes from ``precision_recall_curve``; the area is integrated
    with recall on the x-axis and precision on the y-axis. The thresholds
    returned alongside the curve are not used.
    """
    curve = precision_recall_curve(y_true, y_score)
    precision_values, recall_values = curve[0], curve[1]
    return calculate_auc(recall_values, precision_values)

def roc_auc(y_true, y_score):
    """Return the ROC-AUC of ``y_score`` against ``y_true`` via ``roc_auc_score``."""
    area = roc_auc_score(y_true, y_score)
    return area

# 1) classification scaffold-split

In [36]:
# Score the DMPNN predictions for the scaffold-split classification tasks with ROC-AUC.
task_names = ["BACE", "BBBP", "HIV"]  # "Tox21", "SIDER", "ToxCast" are not scored in this cell
task_types = ["classification", "classification", "classification"]

# Root folder; every file is read from <file_path>/<task_name>/<seed>/.
file_path = "/raid/shenwanxiang/08_Robustness/dataset_induces/split"

res1 = []
for task_name in task_names:
    for seed in random_seeds:
        seed_dir = os.path.join(file_path, task_name, "%s" % seed)

        # Test-set labels and predictions.
        df_true = pd.read_csv(os.path.join(seed_dir, "test.csv"))
        df_pred = pd.read_csv(os.path.join(seed_dir, "DMPNN_pred_test.csv"))

        # Validation-set labels and predictions.
        df_true_val = pd.read_csv(os.path.join(seed_dir, "val.csv"))
        df_pred_val = pd.read_csv(os.path.join(seed_dir, "DMPNN_pred_valid.csv"))

        test_rocs = []
        valid_rocs = []
        # The first column is skipped -- presumably the molecule identifier; TODO confirm.
        for i in df_true.columns[1:]:
            # Pair labels with predictions on the row index and drop rows with a missing value.
            dfi = df_true[i].to_frame(name='true').join(df_pred[i].to_frame(name='pred'))
            dfi = dfi.dropna()
            try:
                test_ = roc_auc(dfi.true.tolist(), dfi.pred.tolist())
            except ValueError:
                # The metric is undefined for this column (e.g. a single class
                # among the remaining labels); record NaN so nanmean skips it.
                test_ = np.nan

            dfi = df_true_val[i].to_frame(name='true').join(df_pred_val[i].to_frame(name='pred'))
            dfi = dfi.dropna()
            try:
                val_ = roc_auc(dfi.true.tolist(), dfi.pred.tolist())
            except ValueError:
                val_ = np.nan

            test_rocs.append(test_)
            valid_rocs.append(val_)

        # Average over the label columns, ignoring those whose score is NaN.
        test_roc = np.nanmean(test_rocs)
        valid_roc = np.nanmean(valid_rocs)

        final_res = {"task_name": task_name, "seed": seed, "valid_roc": valid_roc, "test_roc": test_roc}
        res1.append(final_res)


In [37]:
# One row per (task, seed) with the averaged validation / test ROC-AUC.
df1 = pd.DataFrame(data=res1)
df1

Unnamed: 0,task_name,seed,valid_roc,test_roc
0,BACE,2,0.842576,0.846471
1,BACE,16,0.862281,0.698891
2,BACE,32,0.885549,0.793031
3,BACE,64,0.916226,0.897489
4,BACE,128,0.886898,0.811435
5,BACE,256,0.842401,0.851436
6,BACE,512,0.858974,0.923407
7,BACE,1024,0.924648,0.811621
8,BACE,2048,0.929501,0.892334
9,BACE,4096,0.835732,0.850423


In [38]:
# Per-task mean over all seeds. ``numeric_only=True`` keeps the aggregation to
# the numeric columns; averaging the string ``task_name`` column through
# ``apply(lambda x: x.mean())`` raises a TypeError on pandas >= 2.0.
pd.DataFrame(res1).groupby('task_name').mean(numeric_only=True)

Unnamed: 0_level_0,seed,valid_roc,test_roc
task_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BACE,817.8,0.878479,0.837654
BBBP,817.8,0.90708,0.906006
HIV,817.8,0.814689,0.786939


# 2) classification random-split

In [39]:
# Score the DMPNN predictions for the random-split classification tasks with ROC-AUC.
task_names = ["Tox21", "SIDER", "ToxCast"]
task_types = ["classification", "classification", "classification"]

# Root folder; every file is read from <file_path>/<task_name>/<seed>/.
file_path = "/raid/shenwanxiang/08_Robustness/dataset_induces/split"

res2 = []
for task_name in task_names:
    for seed in random_seeds:
        seed_dir = os.path.join(file_path, task_name, "%s" % seed)

        # Test-set labels and predictions.
        df_true = pd.read_csv(os.path.join(seed_dir, "test.csv"))
        df_pred = pd.read_csv(os.path.join(seed_dir, "DMPNN_pred_test.csv"))

        # Validation-set labels and predictions.
        df_true_val = pd.read_csv(os.path.join(seed_dir, "val.csv"))
        df_pred_val = pd.read_csv(os.path.join(seed_dir, "DMPNN_pred_valid.csv"))

        test_rocs = []
        valid_rocs = []
        # The first column is skipped -- presumably the molecule identifier; TODO confirm.
        for i in df_true.columns[1:]:
            # Pair labels with predictions on the row index and drop rows with a missing value.
            dfi = df_true[i].to_frame(name='true').join(df_pred[i].to_frame(name='pred'))
            dfi = dfi.dropna()
            try:
                test_ = roc_auc(dfi.true.tolist(), dfi.pred.tolist())
            except ValueError:
                # The metric is undefined for this column (e.g. a single class
                # among the remaining labels); record NaN so nanmean skips it.
                test_ = np.nan

            dfi = df_true_val[i].to_frame(name='true').join(df_pred_val[i].to_frame(name='pred'))
            dfi = dfi.dropna()
            try:
                val_ = roc_auc(dfi.true.tolist(), dfi.pred.tolist())
            except ValueError:
                val_ = np.nan

            test_rocs.append(test_)
            valid_rocs.append(val_)

        # Average over the label columns, ignoring those whose score is NaN.
        test_roc = np.nanmean(test_rocs)
        valid_roc = np.nanmean(valid_rocs)

        final_res = {"task_name": task_name, "seed": seed, "valid_roc": valid_roc, "test_roc": test_roc}
        res2.append(final_res)


In [40]:
# One row per (task, seed) with the averaged validation / test ROC-AUC.
df2 = pd.DataFrame(data=res2)
df2

Unnamed: 0,task_name,seed,valid_roc,test_roc
0,Tox21,2,0.81895,0.84419
1,Tox21,16,0.836713,0.844242
2,Tox21,32,0.852619,0.830838
3,Tox21,64,0.851685,0.847115
4,Tox21,128,0.862153,0.836655
5,Tox21,256,0.846997,0.846851
6,Tox21,512,0.837875,0.829655
7,Tox21,1024,0.85575,0.860605
8,Tox21,2048,0.85196,0.839562
9,Tox21,4096,0.826527,0.823884


# 3) regression

In [41]:
# Score the DMPNN predictions for the regression tasks with RMSE.
task_names = ["FreeSolv", "ESOL", "Malaria"]
task_types = ["regression", "regression", "regression"]

# Root folder; every file is read from <file_path>/<task_name>/<seed>/.
file_path = "/raid/shenwanxiang/08_Robustness/dataset_induces/split"

res3 = []
for task_name in task_names:
    for seed in random_seeds:
        seed_dir = os.path.join(file_path, task_name, "%s" % seed)

        # Test-set targets and predictions.
        df_true = pd.read_csv(os.path.join(seed_dir, "test.csv"))
        df_pred = pd.read_csv(os.path.join(seed_dir, "DMPNN_pred_test.csv"))

        # Validation-set targets and predictions.
        df_true_val = pd.read_csv(os.path.join(seed_dir, "val.csv"))
        df_pred_val = pd.read_csv(os.path.join(seed_dir, "DMPNN_pred_valid.csv"))

        test_rmses = []
        valid_rmses = []
        # The first column is skipped -- presumably the molecule identifier; TODO confirm.
        for i in df_true.columns[1:]:
            # Pair targets with predictions on the row index and drop rows with a missing value.
            dfi = df_true[i].to_frame(name='true').join(df_pred[i].to_frame(name='pred'))
            dfi = dfi.dropna()
            try:
                test_ = rmse(dfi.true.tolist(), dfi.pred.tolist())
            except ValueError:
                # The metric cannot be computed for this column (e.g. no rows
                # left after dropna); record NaN so nanmean skips it.
                test_ = np.nan

            dfi = df_true_val[i].to_frame(name='true').join(df_pred_val[i].to_frame(name='pred'))
            dfi = dfi.dropna()
            try:
                val_ = rmse(dfi.true.tolist(), dfi.pred.tolist())
            except ValueError:
                val_ = np.nan

            test_rmses.append(test_)
            valid_rmses.append(val_)

        # Average over the target columns, ignoring those whose score is NaN.
        test_rmse = np.nanmean(test_rmses)
        valid_rmse = np.nanmean(valid_rmses)

        final_res = {"task_name": task_name, "seed": seed, "valid_rmse": valid_rmse, "test_rmse": test_rmse}
        res3.append(final_res)

In [42]:
# One row per (task, seed) with the averaged validation / test RMSE.
df3 = pd.DataFrame(data=res3)
df3

Unnamed: 0,task_name,seed,valid_rmse,test_rmse
0,FreeSolv,2,1.131206,1.396408
1,FreeSolv,16,0.99498,1.147864
2,FreeSolv,32,1.288333,0.962947
3,FreeSolv,64,0.997069,1.179571
4,FreeSolv,128,0.819142,1.029147
5,FreeSolv,256,0.899403,1.274465
6,FreeSolv,512,0.799836,1.173136
7,FreeSolv,1024,1.245129,0.783695
8,FreeSolv,2048,1.014519,1.35731
9,FreeSolv,4096,1.133444,0.914262


In [43]:
# Label written to the "model" column of the exported result tables.
model_name = "DMPNN"

In [44]:
# Reshape the scaffold-split results into the shared reporting schema.
# ``assign`` returns a new frame, so the frame bound to ``df1`` before this
# cell ran is not mutated in place.
df1 = df1.assign(
    test_metric='ROC_AUC',
    test_performance=df1['test_roc'],
    model=model_name,
    split='scaffold',
)[["task_name", "seed", "split", "test_metric", "test_performance", "model"]]

In [45]:
# Reshape the random-split classification results into the shared reporting schema.
# ``assign`` returns a new frame, so the frame bound to ``df2`` before this
# cell ran is not mutated in place.
df2 = df2.assign(
    test_metric='ROC_AUC',
    test_performance=df2['test_roc'],
    model=model_name,
    split='random',
)[["task_name", "seed", "split", "test_metric", "test_performance", "model"]]

In [46]:
# Reshape the regression results into the shared reporting schema.
# ``assign`` returns a new frame, so the frame bound to ``df3`` before this
# cell ran is not mutated in place.
df3 = df3.assign(
    test_metric='RMSE',
    test_performance=df3['test_rmse'],
    model=model_name,
    split='random',
)[["task_name", "seed", "split", "test_metric", "test_performance", "model"]]

In [47]:
# Stack the three result tables (original row indices are kept), round to three
# decimals and write them out. ``pd.concat`` replaces ``DataFrame.append``,
# which was removed in pandas 2.0.
pd.concat([df1, df2, df3]).round(3).to_csv('./results_chemprop.csv')

# 1) DMPNN classification: MUV, PCBA, ChEMBL

In [68]:
# Score the DMPNN predictions for MUV, PCBA and ChEMBL with ROC-AUC and PRC-AUC.
task_names = ["MUV", "PCBA", "ChEMBL"]
task_types = ["classification", "classification", "classification"]

# Root folder; every file is read from <file_path>/<task_name>/<seed>/.
file_path = "/raid/shenwanxiang/08_Robustness/dataset_induces/split"

res4 = []
for task_name in task_names:
    for seed in random_seeds:
        seed_dir = os.path.join(file_path, task_name, "%s" % seed)

        # Test-set labels and predictions.
        df_true = pd.read_csv(os.path.join(seed_dir, "test.csv"))
        df_pred = pd.read_csv(os.path.join(seed_dir, "DMPNN_pred_test.csv"))

        # Validation-set labels and predictions.
        df_true_val = pd.read_csv(os.path.join(seed_dir, "val.csv"))
        df_pred_val = pd.read_csv(os.path.join(seed_dir, "DMPNN_pred_valid.csv"))

        test_rocs = []
        valid_rocs = []

        test_prcs = []
        valid_prcs = []

        # The first column is skipped -- presumably the molecule identifier; TODO confirm.
        for i in df_true.columns[1:]:
            # Pair labels with predictions on the row index and drop rows with a missing value.
            dfi = df_true[i].to_frame(name='true').join(df_pred[i].to_frame(name='pred'))
            dfi = dfi.dropna()
            try:
                test_1 = roc_auc(dfi.true.tolist(), dfi.pred.tolist())
                test_2 = prc_auc(dfi.true.tolist(), dfi.pred.tolist())
            except ValueError:
                # A metric is undefined for this column (e.g. a single class
                # among the remaining labels); both scores are recorded as NaN
                # so nanmean skips the column.
                test_1 = np.nan
                test_2 = np.nan

            dfi = df_true_val[i].to_frame(name='true').join(df_pred_val[i].to_frame(name='pred'))
            dfi = dfi.dropna()
            try:
                val_1 = roc_auc(dfi.true.tolist(), dfi.pred.tolist())
                val_2 = prc_auc(dfi.true.tolist(), dfi.pred.tolist())
            except ValueError:
                val_1 = np.nan
                val_2 = np.nan

            test_rocs.append(test_1)
            valid_rocs.append(val_1)
            test_prcs.append(test_2)
            valid_prcs.append(val_2)

        # Average over the label columns, ignoring those whose score is NaN.
        test_roc = np.nanmean(test_rocs)
        valid_roc = np.nanmean(valid_rocs)

        test_prc = np.nanmean(test_prcs)
        valid_prc = np.nanmean(valid_prcs)

        final_res = {"task_name": task_name, "seed": seed,
                     "valid_roc": valid_roc, "test_roc": test_roc,
                     "valid_prc": valid_prc, "test_prc": test_prc,
                     }

        res4.append(final_res)

# Persist the per-seed metrics to the working directory.
pd.DataFrame(res4).to_csv('./chemprop_muv_pcba_chembl.csv')

# 1) MolMap classification: MUV, PCBA, ChEMBL

In [75]:
# Score the MolMap predictions for MUV, PCBA and ChEMBL with ROC-AUC and PRC-AUC.
task_names = ["MUV", "PCBA", "ChEMBL"]
task_types = ["classification", "classification", "classification"]

# Root folder; every file is read from <file_path>/<task_name>/<seed>/.
file_path = "/raid/shenwanxiang/08_Robustness/dataset_induces/split"

res5 = []
for task_name in task_names:
    for seed in random_seeds:
        seed_dir = os.path.join(file_path, task_name, "%s" % seed)

        # Test-set labels and predictions.
        df_true = pd.read_csv(os.path.join(seed_dir, "test.csv"))
        df_pred = pd.read_csv(os.path.join(seed_dir, "MolMAP_pred_test.csv"))

        # Validation-set labels and predictions.
        df_true_val = pd.read_csv(os.path.join(seed_dir, "val.csv"))
        df_pred_val = pd.read_csv(os.path.join(seed_dir, "MolMAP_pred_val.csv"))

        test_rocs = []
        valid_rocs = []

        test_prcs = []
        valid_prcs = []

        # The first column is skipped -- presumably the molecule identifier; TODO confirm.
        for i in df_true.columns[1:]:
            # Pair labels with predictions on the row index and drop rows with a missing value.
            dfi = df_true[i].to_frame(name='true').join(df_pred[i].to_frame(name='pred'))
            dfi = dfi.dropna()
            try:
                test_1 = roc_auc(dfi.true.tolist(), dfi.pred.tolist())
                test_2 = prc_auc(dfi.true.tolist(), dfi.pred.tolist())
            except ValueError:
                # A metric is undefined for this column (e.g. a single class
                # among the remaining labels); both scores are recorded as NaN
                # so nanmean skips the column.
                test_1 = np.nan
                test_2 = np.nan

            dfi = df_true_val[i].to_frame(name='true').join(df_pred_val[i].to_frame(name='pred'))
            dfi = dfi.dropna()
            try:
                val_1 = roc_auc(dfi.true.tolist(), dfi.pred.tolist())
                val_2 = prc_auc(dfi.true.tolist(), dfi.pred.tolist())
            except ValueError:
                val_1 = np.nan
                val_2 = np.nan

            test_rocs.append(test_1)
            valid_rocs.append(val_1)
            test_prcs.append(test_2)
            valid_prcs.append(val_2)

        # Average over the label columns, ignoring those whose score is NaN.
        test_roc = np.nanmean(test_rocs)
        valid_roc = np.nanmean(valid_rocs)

        test_prc = np.nanmean(test_prcs)
        valid_prc = np.nanmean(valid_prcs)

        final_res = {"task_name": task_name, "seed": seed,
                     "valid_roc": valid_roc, "test_roc": test_roc,
                     "valid_prc": valid_prc, "test_prc": test_prc,
                     }

        res5.append(final_res)


In [None]:
# Persist the per-seed MolMap metrics to the working directory.
pd.DataFrame(data=res5).to_csv(path_or_buf='./molmap_muv_pcba_chembl.csv')