In [1]:
import os

import pandas as pd
from sklearn.metrics import f1_score, roc_auc_score

from src.datasets import prepare_dataset
from src.train import train
from src.models import load_pretrained

In [2]:
# Names of the datasets every model is trained and evaluated on.
dataset_names = [
    'HIA_Hou',
    'BBB_Martins',
    'CYP3A4_Substrate_CarbonMangels',
]

In [3]:
# Model identifiers passed to prepare_dataset, train and load_pretrained.
model_names = [
    'tree',
    'DGL_GCN',
    'Transformer',
]

In [4]:
# Download and transform the datasets so that they are suitable for the respective models,
# then split each one into train, val, and test sets.
# Results are cached under datasets/<dataset_name>/<model_name>/ as pickles.
split_files = ('train.pickle', 'val.pickle', 'test.pickle')

for dataset_name in dataset_names:
    for model_name in model_names:
        dataset_folder = os.path.join('datasets', dataset_name, model_name)

        # Skip this combination when all three splits are already on disk.
        if all(os.path.exists(os.path.join(dataset_folder, split_file))
               for split_file in split_files):
            continue

        train_df, val_df, test_df = prepare_dataset(dataset_name, model_name)

        # exist_ok=True makes the call safe on re-runs without a separate existence check.
        os.makedirs(dataset_folder, exist_ok=True)

        # split_files is ordered train / val / test, matching the tuple below.
        for split_df, split_file in zip((train_df, val_df, test_df), split_files):
            split_df.to_pickle(os.path.join(dataset_folder, split_file))

In [5]:
# Set up hyperparameters (chosen ad hoc, not tuned).
# model_kwargs[model_name] is unpacked as keyword arguments into train().
model_kwargs = {}
model_kwargs['DGL_GCN'] = dict(LR=1e-3, train_epoch=100, batch_size=64)
# Start from the same values as DGL_GCN, but as an independent copy: sharing one
# dict object would make a later tweak to either model silently change the other.
model_kwargs['Transformer'] = dict(model_kwargs['DGL_GCN'])
model_kwargs['tree'] = {'max_depth': 5, 'random_state': 1234}

In [6]:
# ROC-AUC results, filled in as auc_scores[dataset_name][model_name].
auc_scores = dict()

In [7]:
# Train every model on every dataset (or reuse a saved model) and compute ROC-AUC
# on the held-out test split. Results land in auc_scores[dataset_name][model_name].
for dataset_name in dataset_names:
    # Reset the per-dataset entry so that re-running the cell starts clean.
    auc_scores[dataset_name] = {}
    for model_name in model_names:
        model_dir = os.path.join('models', dataset_name, model_name)
        # exist_ok=True makes the call safe on re-runs without a separate existence check.
        os.makedirs(model_dir, exist_ok=True)

        dataset_dir = os.path.join('datasets', dataset_name, model_name)

        # Prefer a previously saved model; train from scratch only when
        # load_pretrained raises FileNotFoundError.
        try:
            model = load_pretrained(model_name, model_dir)
        except FileNotFoundError:
            model = train(model_name,
                          train_pickle=os.path.join(dataset_dir, 'train.pickle'),
                          val_pickle=os.path.join(dataset_dir, 'val.pickle'),
                          model_dir=model_dir,
                          **model_kwargs[model_name])

        test_dataset = pd.read_pickle(os.path.join(dataset_dir, 'test.pickle'))
        y_pred = model.predict(test_dataset)

        auc_scores[dataset_name][model_name] = roc_auc_score(test_dataset['Label'], y_pred)

predicting...
predicting...
predicting...
predicting...
predicting...
predicting...


In [8]:
# Tabulate the nested results: one column per dataset, one row per model.
auc_scores_df = pd.DataFrame(data=auc_scores)

In [9]:
# Display the ROC-AUC table (rows: models, columns: datasets).
auc_scores_df

Unnamed: 0,HIA_Hou,BBB_Martins,CYP3A4_Substrate_CarbonMangels
tree,0.887459,0.806663,0.619095
DGL_GCN,0.937624,0.885884,0.650624
Transformer,0.79802,0.801809,0.532754


In [10]:
import numpy as np
from scipy import stats

In [11]:
# Summarize ROC-AUC across datasets with one number per model.
# The harmonic mean is used instead of the arithmetic mean because it is pulled
# towards the lowest score, so a model that does poorly on any one dataset is penalized.
# Exclude a 'mean_auc' column left over from an earlier run of this cell; otherwise
# re-running would average the previous mean in as if it were another dataset.
per_dataset_auc = auc_scores_df.drop(columns='mean_auc', errors='ignore')
auc_scores_df['mean_auc'] = stats.hmean(per_dataset_auc, axis=1)

In [12]:
# Display the ROC-AUC table again, now including the 'mean_auc' column.
auc_scores_df

Unnamed: 0,HIA_Hou,BBB_Martins,CYP3A4_Substrate_CarbonMangels,mean_auc
tree,0.887459,0.806663,0.619095,0.753438
DGL_GCN,0.937624,0.885884,0.650624,0.803788
Transformer,0.79802,0.801809,0.532754,0.685351


In [13]:
# Also compute F1 scores on the test split, reusing the models saved earlier.
# Predictions above this cutoff count as the positive class.
# NOTE(review): assumes model.predict returns scores comparable to 0.5 for every
# model type - confirm against src.models.
F1_THRESHOLD = 0.5

f1_scores = {}
for dataset_name in dataset_names:
    f1_scores[dataset_name] = {}
    for model_name in model_names:
        # This cell only evaluates, so it does not create model directories.
        # A missing model should surface as an error from load_pretrained.
        model_dir = os.path.join('models', dataset_name, model_name)
        dataset_dir = os.path.join('datasets', dataset_name, model_name)
        model = load_pretrained(model_name, model_dir)

        test_dataset = pd.read_pickle(os.path.join(dataset_dir, 'test.pickle'))
        y_pred = model.predict(test_dataset)
        # Binarize the predictions, since f1_score needs class labels.
        y_pred_bin = (np.array(y_pred) > F1_THRESHOLD).astype(int)

        f1_scores[dataset_name][model_name] = f1_score(test_dataset['Label'], y_pred_bin)

predicting...
predicting...
predicting...
predicting...
predicting...
predicting...


In [14]:
# Tabulate F1 per dataset (columns) and model (rows), then append the
# harmonic mean across datasets as 'mean_f1' and display the result.
f1_scores_df = pd.DataFrame(data=f1_scores)
harmonic_mean_f1 = stats.hmean(f1_scores_df, axis=1)
f1_scores_df['mean_f1'] = harmonic_mean_f1
f1_scores_df

Unnamed: 0,HIA_Hou,BBB_Martins,CYP3A4_Substrate_CarbonMangels,mean_f1
tree,0.975124,0.911353,0.639456,0.813756
DGL_GCN,0.970297,0.90301,0.643357,0.812487
Transformer,0.930876,0.856338,0.66,0.798473


In [15]:
# Put the two per-model summary columns side by side in one table.
summary_columns = [auc_scores_df['mean_auc'], f1_scores_df['mean_f1']]
mean_scores = pd.concat(summary_columns, axis='columns')

In [16]:
# Display the per-model summary: 'mean_auc' next to 'mean_f1'.
mean_scores

Unnamed: 0,mean_auc,mean_f1
tree,0.753438,0.813756
DGL_GCN,0.803788,0.812487
Transformer,0.685351,0.798473


In [17]:
# Write the summary table to a Markdown file in the working directory.
mean_scores.to_markdown(buf='mean_scores.md')