In [1]:
from sklearn.metrics import roc_auc_score, precision_recall_curve
from sklearn.metrics import auc as calculate_auc
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from scipy.stats.stats import pearsonr
import os
import subprocess
import pandas as pd
import numpy as np

# Seeds of the evaluated runs; each seed names one result sub-directory per task.
random_seeds = [
    2, 16, 32, 64, 128,
    256, 512, 1024, 2048, 4096,
]

def r2(x, y):
    """Return the squared Pearson correlation coefficient between x and y.

    Only the correlation statistic returned by ``pearsonr`` is used; the
    accompanying p-value is discarded.
    """
    correlation, _pvalue = pearsonr(x, y)
    return correlation ** 2


def rmse(x, y):
    """Return the root mean squared error between x and y.

    The mean squared error is computed by scikit-learn's
    ``mean_squared_error`` and its square root is taken with NumPy.
    """
    return np.sqrt(mean_squared_error(x, y))

def prc_auc(y_true, y_score):
    """Return the area under the precision-recall curve.

    The curve points come from ``precision_recall_curve``; the area is
    integrated with recall on the x-axis and precision on the y-axis.
    """
    # The thresholds returned alongside the curve are not needed for the area.
    precision, recall, _ = precision_recall_curve(y_true, y_score)
    return calculate_auc(recall, precision)

def roc_auc(y_true, y_score):
    """Return the ROC-AUC of y_score against y_true via ``roc_auc_score``."""
    score = roc_auc_score(y_true, y_score)
    return score

# 1) classification-scaffold split

In [2]:
# Classification benchmarks scored in this cell ("Tox21", "SIDER" and "ToxCast"
# are scored in a separate cell).
task_names = ["BACE", "BBBP", "HIV"]
# One type per task, so the zip below never silently truncates either list.
task_types = ["classification"] * len(task_names)

# Root directory; results are read from <file_path>/<task_name>/<seed>/.
file_path = "/raid/shenwanxiang/08_Robustness/dataset_induces/split"

res1 = []
for task_name, task_type in zip(task_names, task_types):

    for seed in random_seeds:
        run_dir = os.path.join(file_path, task_name, "%s" % seed)

        vl_path = os.path.join(run_dir, "attfp_saved_val.csv")
        vl_pred_path = os.path.join(run_dir, "attfp_pred_val.csv")

        ts_path = os.path.join(run_dir, "attfp_saved_test.csv")
        ts_pred_path = os.path.join(run_dir, "attfp_pred_test.csv")

        df_true = pd.read_csv(ts_path)
        df_pred = pd.read_csv(ts_pred_path)

        df_true_val = pd.read_csv(vl_path)
        df_pred_val = pd.read_csv(vl_pred_path)

        test_rocs = []
        valid_rocs = []
        # The first column is skipped (presumably a molecule identifier -- TODO
        # confirm); every remaining column is scored as one label.
        for i in df_true.columns[1:]:

            # Pair labels with predictions on the row index and drop rows where
            # either value is missing.
            dfi = df_true[i].to_frame(name='true').join(df_pred[i].to_frame(name='pred'))
            dfi = dfi.dropna()
            try:
                test_ = roc_auc(dfi.true.tolist(), dfi.pred.tolist())
            except ValueError:
                # The score is undefined for this label (e.g. a single class is
                # left after dropna); NaN keeps it out of the nanmean below.
                test_ = np.nan

            dfi = df_true_val[i].to_frame(name='true').join(df_pred_val[i].to_frame(name='pred'))
            dfi = dfi.dropna()
            try:
                val_ = roc_auc(dfi.true.tolist(), dfi.pred.tolist())
            except ValueError:
                val_ = np.nan

            test_rocs.append(test_)
            valid_rocs.append(val_)

        # Average over labels, ignoring the labels whose score was undefined.
        test_roc = np.nanmean(test_rocs)
        valid_roc = np.nanmean(valid_rocs)

        final_res = {"task_name":task_name, "seed":seed, "valid_roc": valid_roc, "test_roc": test_roc}

        res1.append(final_res)


In [3]:
# Tabulate the per-task, per-seed ROC-AUC records and display the table.
df1 = pd.DataFrame(data=res1)
df1

Unnamed: 0,task_name,seed,valid_roc,test_roc
0,BACE,2,0.807476,0.848235
1,BACE,16,0.819298,0.729383
2,BACE,32,0.870781,0.78223
3,BACE,64,0.908466,0.895411
4,BACE,128,0.886723,0.737216
5,BACE,256,0.851,0.683431
6,BACE,512,0.848261,0.902136
7,BACE,1024,0.710211,0.841612
8,BACE,2048,0.852496,0.917247
9,BACE,4096,0.538786,0.685694


# 2) classification-random split

In [4]:
# Multi-label classification benchmarks scored in this cell.
task_names = ["Tox21", "SIDER", "ToxCast"]
# One type per task, so the zip below never silently truncates either list.
task_types = ["classification"] * len(task_names)

# Root directory; results are read from <file_path>/<task_name>/<seed>/.
file_path = "/raid/shenwanxiang/08_Robustness/dataset_induces/split"

res2 = []
for task_name, task_type in zip(task_names, task_types):

    for seed in random_seeds:
        run_dir = os.path.join(file_path, task_name, "%s" % seed)

        vl_path = os.path.join(run_dir, "attfp_saved_val.csv")
        vl_pred_path = os.path.join(run_dir, "attfp_pred_val.csv")

        ts_path = os.path.join(run_dir, "attfp_saved_test.csv")
        ts_pred_path = os.path.join(run_dir, "attfp_pred_test.csv")

        df_true = pd.read_csv(ts_path)
        df_pred = pd.read_csv(ts_pred_path)

        df_true_val = pd.read_csv(vl_path)
        df_pred_val = pd.read_csv(vl_pred_path)

        test_rocs = []
        valid_rocs = []
        # The first column is skipped (presumably a molecule identifier -- TODO
        # confirm); every remaining column is scored as one label.
        for i in df_true.columns[1:]:

            # Pair labels with predictions on the row index and drop rows where
            # either value is missing.
            dfi = df_true[i].to_frame(name='true').join(df_pred[i].to_frame(name='pred'))
            dfi = dfi.dropna()
            try:
                test_ = roc_auc(dfi.true.tolist(), dfi.pred.tolist())
            except ValueError:
                # The score is undefined for this label (e.g. a single class is
                # left after dropna); NaN keeps it out of the nanmean below.
                test_ = np.nan

            dfi = df_true_val[i].to_frame(name='true').join(df_pred_val[i].to_frame(name='pred'))
            dfi = dfi.dropna()
            try:
                val_ = roc_auc(dfi.true.tolist(), dfi.pred.tolist())
            except ValueError:
                val_ = np.nan

            test_rocs.append(test_)
            valid_rocs.append(val_)

        # Average over labels, ignoring the labels whose score was undefined.
        test_roc = np.nanmean(test_rocs)
        valid_roc = np.nanmean(valid_rocs)

        final_res = {"task_name":task_name, "seed":seed, "valid_roc": valid_roc, "test_roc": test_roc}

        res2.append(final_res)


In [5]:
# Tabulate the per-task, per-seed ROC-AUC records and display the table.
df2 = pd.DataFrame(data=res2)
df2

Unnamed: 0,task_name,seed,valid_roc,test_roc
0,Tox21,2,0.804297,0.846541
1,Tox21,16,0.831063,0.819801
2,Tox21,32,0.852699,0.852778
3,Tox21,64,0.848987,0.838396
4,Tox21,128,0.881308,0.834822
5,Tox21,256,0.840928,0.842675
6,Tox21,512,0.824006,0.843068
7,Tox21,1024,0.833666,0.8582
8,Tox21,2048,0.845955,0.847321
9,Tox21,4096,0.83164,0.833086


# 3) regression-random split

In [6]:
# Regression benchmarks scored in this cell.
task_names = ["FreeSolv", "ESOL", "Malaria"]
task_types = ["regression", "regression", "regression"]

# Root directory; results are read from <file_path>/<task_name>/<seed>/.
file_path = "/raid/shenwanxiang/08_Robustness/dataset_induces/split"

res3 = []
for task_name, task_type in zip(task_names, task_types):

    for seed in random_seeds:
        run_dir = os.path.join(file_path, task_name, "%s" % seed)

        vl_path = os.path.join(run_dir, "attfp_saved_val.csv")
        vl_pred_path = os.path.join(run_dir, "attfp_pred_val.csv")

        ts_path = os.path.join(run_dir, "attfp_saved_test.csv")
        ts_pred_path = os.path.join(run_dir, "attfp_pred_test.csv")

        df_true = pd.read_csv(ts_path)
        df_pred = pd.read_csv(ts_pred_path)

        df_true_val = pd.read_csv(vl_path)
        df_pred_val = pd.read_csv(vl_pred_path)

        test_rmses = []
        valid_rmses = []
        # The first column is skipped (presumably a molecule identifier -- TODO
        # confirm); every remaining column is scored as one target.
        for i in df_true.columns[1:]:

            # Pair targets with predictions on the row index and drop rows where
            # either value is missing.
            dfi = df_true[i].to_frame(name='true').join(df_pred[i].to_frame(name='pred'))
            dfi = dfi.dropna()
            try:
                test_ = rmse(dfi.true.tolist(), dfi.pred.tolist())
            except ValueError:
                # The error cannot be computed for this target (e.g. no rows are
                # left after dropna); NaN keeps it out of the nanmean below.
                test_ = np.nan

            dfi = df_true_val[i].to_frame(name='true').join(df_pred_val[i].to_frame(name='pred'))
            dfi = dfi.dropna()
            try:
                val_ = rmse(dfi.true.tolist(), dfi.pred.tolist())
            except ValueError:
                val_ = np.nan

            test_rmses.append(test_)
            valid_rmses.append(val_)

        # Average over targets, ignoring the targets whose error was undefined.
        test_rmse = np.nanmean(test_rmses)
        valid_rmse = np.nanmean(valid_rmses)
        final_res = {"task_name":task_name, "seed":seed, "valid_rmse": valid_rmse, "test_rmse": test_rmse}
        res3.append(final_res)

In [7]:
# Tabulate the per-task, per-seed RMSE records and display the table.
df3 = pd.DataFrame(data=res3)
df3

Unnamed: 0,task_name,seed,valid_rmse,test_rmse
0,FreeSolv,2,1.02199,1.114411
1,FreeSolv,16,0.889942,1.243484
2,FreeSolv,32,1.165425,1.187666
3,FreeSolv,64,0.991103,1.107081
4,FreeSolv,128,0.742528,1.075621
5,FreeSolv,256,0.852396,1.077621
6,FreeSolv,512,0.533894,0.873474
7,FreeSolv,1024,1.176605,0.868632
8,FreeSolv,2048,1.027391,0.809547
9,FreeSolv,4096,0.914487,0.850223


In [8]:
# Label written to the "model" column of every exported result row.
model_name = "AttentiveFP"

In [9]:
# Bring the scaffold-split ROC-AUC table into the shared report layout:
# annotate metric, model and split, then keep only the report columns.
df1 = df1.assign(
    test_metric='ROC_AUC',
    test_performance=df1.test_roc,
    model=model_name,
    split='scaffold',
)
df1 = df1[["task_name", "seed", "split", "test_metric", "test_performance", "model"]]

In [10]:
# Bring the random-split ROC-AUC table into the shared report layout:
# annotate metric, model and split, then keep only the report columns.
df2 = df2.assign(
    test_metric='ROC_AUC',
    test_performance=df2.test_roc,
    model=model_name,
    split='random',
)
df2 = df2[["task_name", "seed", "split", "test_metric", "test_performance", "model"]]

In [12]:
# Bring the regression RMSE table into the shared report layout:
# annotate metric, model and split, then keep only the report columns.
df3 = df3.assign(
    test_metric='RMSE',
    test_performance=df3.test_rmse,
    model=model_name,
    split='random',
)
df3 = df3[["task_name", "seed", "split", "test_metric", "test_performance", "model"]]

In [14]:
# Stack the three report tables, round to 3 decimals and write them to CSV.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; like
# append it keeps each table's own row index, so the CSV layout is unchanged.
pd.concat([df1, df2, df3]).round(3).to_csv('./results_attentivefp.csv')