In [1]:
import pandas as pd
from glob import glob
from sklearn.metrics import mean_squared_error as mse
from scipy.stats.stats import pearsonr as pcc
import os

In [2]:
csv_files = glob('./results/*.csv')
csv_files = [i for i in csv_files if 'results_0' in i]
csv_files

['./results/results_05_MolMapNet_fingerprint.csv',
 './results/results_02_FSOL_model.csv',
 './results/results_03_MPNN.csv',
 './results/results_05_MolMapNet_both.csv',
 './results/results_05_MolMapNet_descriptor.csv',
 './results/results_04_AttentiveFP.csv',
 './results/results_01_ESOL_model.csv']

In [3]:
df_train = pd.read_csv('./train.csv', index_col = 0)
df_valid = pd.read_csv('./valid.csv',  index_col = 0)
df_test = pd.read_csv('./test.csv',  index_col = 0)
df_etc = pd.read_csv('./etc.csv')
task = 'measured log solubility in mols per litre'
df_etc[task] = df_etc.Exp_LogS

train_true_y = df_train[task].tolist()
valid_true_y = df_valid[task].tolist()
test_true_y = df_test[task].tolist()
etc_true_y = df_etc[task].tolist()

In [4]:
def get_results(csv):
    print(csv)
    df = pd.read_csv(csv,index_col=0)
    def _format(x):
        return [float(i.replace('[', '').replace(']', '')) for i in x.split(',')]
    df.train_pred_y = df.train_pred_y.apply(_format)
    df.valid_pred_y = df.valid_pred_y.apply(_format)
    df.test_pred_y = df.test_pred_y.apply(_format)
    df.etc_pred_y = df.etc_pred_y.apply(_format)


    df['train_pcc'] = df.train_pred_y.apply(lambda x:pcc(x, train_true_y)[0])
    df['valid_pcc'] = df.valid_pred_y.apply(lambda x:pcc(x, valid_true_y)[0])    
    df['test_pcc']  = df.test_pred_y.apply(lambda x:pcc(x, test_true_y)[0])
    df['etc_pcc']   = df.etc_pred_y.apply(lambda x:pcc(x, etc_true_y)[0])

    df['train_p_value']  = df.train_pred_y.apply(lambda x:pcc(x, train_true_y)[1])
    df['valid_p_value']  = df.valid_pred_y.apply(lambda x:pcc(x, valid_true_y)[1])    
    df['test_p_value']   = df.test_pred_y.apply(lambda x:pcc(x, test_true_y)[1])
    df['etc_p_value']    = df.etc_pred_y.apply(lambda x:pcc(x, etc_true_y)[1])

    res = { 'model': os.path.basename(csv).split('results_')[-1].replace('.csv',''),
            'random_seed': df.random_seed.tolist(),
            'best_epoch': df.best_epoch.tolist(),
            'batch_size': df.batch_size.iloc[0],
            'lr': df.lr.iloc[0],
            '# trainable params': df['# trainable params'].iloc[0],
            'train_rmse': '%.3f ± %.3f' % (df.train_rmse.mean(), df.train_rmse.std()),  
            'valid_rmse': '%.3f ± %.3f' % (df.valid_rmse.mean(), df.valid_rmse.std()),            
            'test_rmse': '%.3f ± %.3f' % (df.test_rmse.mean(), df.test_rmse.std()),  
            'etc_rmse': '%.3f ± %.3f' % (df.etc_rmse.mean(), df.etc_rmse.std()),

            'train_pcc': '%.3f ± %.3f' % (df.train_pcc.mean(), df.train_pcc.std()), 
            'valid_pcc': '%.3f ± %.3f' % (df.valid_pcc.mean(), df.valid_pcc.std()),
            'test_pcc': '%.3f ± %.3f' % (df.test_pcc.mean(), df.test_pcc.std()),  
            'etc_pcc': '%.3f ± %.3f' % (df.etc_pcc.mean(), df.etc_pcc.std()),

            'train_p_value': '%.2e ± %.2e' % (df.train_p_value.mean(), df.train_p_value.std()),  
            'valid_p_value': '%.2e ± %.2e' % (df.valid_p_value.mean(), df.valid_p_value.std()),             
            'test_p_value': '%.2e ± %.2e' % (df.test_p_value.mean(), df.test_p_value.std()),  
            'etc_p_value': '%.2e ± %.2e' % (df.etc_p_value.mean(), df.etc_p_value.std()),}
    
    return res

In [5]:
results = []
for csv in csv_files:
    res = get_results(csv)
    results.append(res)

./results/results_05_MolMapNet_fingerprint.csv
./results/results_02_FSOL_model.csv
./results/results_03_MPNN.csv
./results/results_05_MolMapNet_both.csv
./results/results_05_MolMapNet_descriptor.csv
./results/results_04_AttentiveFP.csv
./results/results_01_ESOL_model.csv


In [6]:
df_res = pd.DataFrame(results).set_index('model')

In [7]:
df_res.index = df_res.index.map(lambda x:'_'.join(x.split('_')[1:]))
df_res.index = df_res.index.map(lambda x:x.replace('_model', ''))

In [8]:
df_res.to_csv('./results/results.csv')
df_res.to_pickle('./results/results.pkl')

In [9]:
df_res

Unnamed: 0_level_0,random_seed,best_epoch,batch_size,lr,# trainable params,train_rmse,valid_rmse,test_rmse,etc_rmse,train_pcc,valid_pcc,test_pcc,etc_pcc,train_p_value,valid_p_value,test_p_value,etc_p_value
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
MolMapNet_fingerprint,"[7, 77, 777]","[194, 241, 269]",128.0,0.0001,326497.0,0.175 ± 0.021,0.621 ± 0.008,0.729 ± 0.010,1.170 ± 0.073,0.997 ± 0.001,0.953 ± 0.001,0.923 ± 0.003,0.104 ± 0.030,0.00e+00 ± 0.00e+00,5.65e-59 ± 6.71e-59,2.00e-47 ± 2.93e-47,2.76e-01 ± 1.25e-01
FSOL,"[7, 77, 777]","[nan, nan, nan]",,,,1.091 ± 0.000,1.100 ± 0.000,0.968 ± 0.000,1.793 ± 0.000,0.866 ± 0.000,0.851 ± 0.000,0.861 ± 0.000,0.534 ± 0.000,1.40e-272 ± 0.00e+00,7.06e-33 ± 0.00e+00,2.55e-34 ± 0.00e+00,3.41e-10 ± 6.33e-26
MPNN,"[7, 77, 777]","[245, 266, 182]",64.0,1e-05,341201.0,0.531 ± 0.043,0.623 ± 0.047,0.971 ± 0.014,2.189 ± 0.024,0.973 ± 0.005,0.959 ± 0.005,0.945 ± 0.004,0.231 ± 0.183,0.00e+00 ± 0.00e+00,1.24e-59 ± 2.15e-59,3.26e-54 ± 5.61e-54,2.25e-01 ± 3.83e-01
MolMapNet_both,"[7, 77, 777]","[170, 194, 167]",128.0,0.0001,725665.0,0.184 ± 0.030,0.531 ± 0.011,0.547 ± 0.018,1.026 ± 0.133,0.997 ± 0.001,0.966 ± 0.001,0.958 ± 0.003,0.379 ± 0.047,0.00e+00 ± 0.00e+00,3.62e-66 ± 5.88e-66,1.02e-60 ± 1.76e-60,7.49e-05 ± 1.12e-04
MolMapNet_descriptor,"[7, 77, 777]","[288, 301, 400]",128.0,0.0001,407617.0,0.390 ± 0.035,0.562 ± 0.019,0.472 ± 0.018,0.864 ± 0.043,0.983 ± 0.003,0.962 ± 0.003,0.969 ± 0.002,0.691 ± 0.019,0.00e+00 ± 0.00e+00,2.42e-63 ± 3.89e-63,4.67e-68 ± 7.43e-68,2.38e-17 ± 4.06e-17
AttentiveFP,"[7, 77, 777]","[119, 78, 105]",200.0,0.003162,863604.0,0.329 ± 0.042,0.524 ± 0.019,0.483 ± 0.010,1.266 ± 0.085,0.988 ± 0.003,0.967 ± 0.002,0.968 ± 0.001,0.487 ± 0.102,0.00e+00 ± 0.00e+00,1.25e-66 ± 2.15e-66,6.64e-68 ± 6.55e-68,7.09e-06 ± 1.23e-05
ESOL,"[7, 77, 777]","[nan, nan, nan]",,,,1.019 ± 0.000,0.985 ± 0.000,0.961 ± 0.000,1.136 ± 0.000,0.877 ± 0.000,0.878 ± 0.000,0.859 ± 0.000,0.593 ± 0.000,4.77e-289 ± 0.00e+00,3.15e-37 ± 0.00e+00,4.36e-34 ± 0.00e+00,9.57e-13 ± 0.00e+00


In [10]:
df_etc = pd.read_csv('./etc.csv')
res = []
for csv in csv_files:
    col = os.path.basename(csv).split('results_')[-1].replace('.csv','')
    df = pd.read_csv(csv,index_col=0)
    def _format(x):
        return [float(i.replace('[', '').replace(']', '')) for i in x.split(',')]
    df.train_pred_y = df.train_pred_y.apply(_format)
    df.valid_pred_y = df.valid_pred_y.apply(_format)
    df.test_pred_y = df.test_pred_y.apply(_format)
    
    df.etc_pred_y = df.etc_pred_y.apply(_format)
    df1 = pd.DataFrame(df.etc_pred_y.tolist()).mean().T.to_frame(name = col)
    res.append(df1)

dfy = pd.concat(res, axis=1)
dfy.columns = dfy.columns.map(lambda x:'_'.join(x.split('_')[1:]))
dfy.columns = dfy.columns.map(lambda x:x.replace('_model', ''))


df_etc = df_etc.join(dfy)

# df_etc.columns = ['IDs', 'Exp_LogS', 'smiles', 'FSOL',
#                    'MolMapNet_descriptor',
#                    'MolMapNet_fingerprint', 'MolMapNet_both',
#                    'MPNN', 'AttentiveFP', 'ESOL']

df_etc = df_etc.set_index('IDs')
df_etc.to_csv('./etc_pred.csv')