In [1]:
from molmap import model as molmodel
import molmap

import matplotlib.pyplot as plt

import pandas as pd
from tqdm import tqdm
from joblib import load, dump
tqdm.pandas(ascii=True)
import numpy as np

import tensorflow as tf
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1"
np.random.seed(123)
tf.compat.v1.set_random_seed(123)




In [2]:
tf.__version__

'2.0.0'

In [3]:
tmp_feature_dir = './tmpignore'
if not os.path.exists(tmp_feature_dir):
    os.makedirs(tmp_feature_dir)

In [4]:
#load dataset
from molmap import dataset
data = dataset.load_CYP450()
df = data.data
X_smiles = df.smiles.tolist()
task_name = 'CYP450'


total samples: 16896


In [5]:
MASK = -1
tasks = ['label_1a2', 'label_2c19', 'label_2c9', 'label_2d6', 'label_3a4']
Y = df[tasks].astype('float').fillna(MASK).values
if Y.shape[1] == 0:
    Y = Y.reshape(-1, 1)

# featurizer

In [6]:
mp1 = molmap.loadmap('../descriptor.mp')
mp2 = molmap.loadmap('../fingerprint.mp')

X1_name = os.path.join(tmp_feature_dir, 'X1_%s.data' % task_name)
X2_name = os.path.join(tmp_feature_dir, 'X2_%s.data' % task_name)
if not os.path.exists(X1_name):
    X1 = mp1.batch_transform(df.smiles, n_jobs = 8)
    dump(X1, X1_name)
else:
    X1 = load(X1_name)

if not os.path.exists(X2_name): 
    X2 = mp2.batch_transform(df.smiles, n_jobs = 8)
    dump(X2, X2_name)
else:
    X2 = load(X2_name)

molmap1_size = X1.shape[1:]
molmap2_size = X2.shape[1:]

# Perfor a 10 fold Cross-validation to find best epochs

In [7]:
from sklearn.model_selection import KFold
kf = KFold(n_splits = 10, shuffle=True, random_state=123)

train_valid_idx = df[df.group != 'test set'].index.tolist()
test_idx = df[df.group == 'test set'].index.tolist()
testX = (X1[test_idx], X2[test_idx])
testY = Y[test_idx]

def get_pos_weights(trainY):
    """pos_weights: neg_n / pos_n """
    dfY = pd.DataFrame(trainY)
    pos = dfY == 1
    pos_n = pos.sum(axis=0)
    neg = dfY == 0
    neg_n = neg.sum(axis=0)
    pos_weights = (neg_n / pos_n).values
    neg_weights = (pos_n / neg_n).values
    return pos_weights, neg_weights

pos_weights, neg_weights = get_pos_weights(Y[train_valid_idx])

epochs = 800
patience = 50 #early stopping

dense_layers = [256, 128, 32]
batch_size = 128
lr = 1e-4
weight_decay = 0
monitor = 'val_auc'
metric = 'ROC'
dense_avf = 'relu'
last_avf = None #sigmoid in loss

## 10 fold-cv

In [None]:
import tensorflow_addons as tfa
    
model_performance_10fcv = []
model_performance_history = []
i = 0
for train_idx, valid_idx in kf.split(train_valid_idx):
    
    trainX = (X1[train_idx], X2[train_idx])
    trainY = Y[train_idx]

    validX = (X1[valid_idx], X2[valid_idx])
    validY = Y[valid_idx]

    loss = lambda y_true, y_pred: molmodel.loss.weighted_cross_entropy(y_true,y_pred, pos_weights, MASK = -1)
    opt = tfa.optimizers.AdamW(weight_decay = 0.0,learning_rate=lr,beta_1=0.9,beta_2=0.999, epsilon=1e-08)

    model = molmodel.net.DoublePathNet(molmap1_size, molmap2_size, 
                                       n_outputs=Y.shape[-1], 
                                       dense_layers=dense_layers, 
                                       dense_avf = dense_avf, 
                                       last_avf=last_avf)
    model.compile(optimizer = opt, loss = loss)
    performance = molmodel.cbks.CLA_EarlyStoppingAndPerformance((trainX, trainY), 
                                                                (validX, validY), 
                                                                patience = patience,
                                                                metric = metric,
                                                                criteria = monitor)

    model.fit(trainX, trainY, batch_size=batch_size, 
              epochs=epochs, verbose= 0, shuffle = True, 
              validation_data = (validX, validY), 
              callbacks=[performance]) 
    model.save('model-fold-%s.h5' % str(i).zfill(2))

    best_epoch = performance.best_epoch
    train_pfs = performance.evaluate(trainX, trainY)            
    valid_pfs = performance.evaluate(validX, validY)            
    final_res = {    'fold':i,
                     'task_name':task_name,            
                     'train_pfs':train_pfs, 
                     'metric': metric,
                     'valid_pfs':valid_pfs,                      
                     'best_epoch': best_epoch,
                     'batch_size':batch_size,
                     'lr': lr,
                     'weight_decay':weight_decay
                    }
    model_performance_10fcv.append(final_res)
    model_performance_history.append(performance.history)
    i += 1

epoch: 0001, loss: 0.8410 - val_loss: 0.7799; auc: 0.7870 - val_auc: 0.7860                                                                                                    
epoch: 0002, loss: 0.7438 - val_loss: 0.7191; auc: 0.8362 - val_auc: 0.8326                                                                                                    
epoch: 0003, loss: 0.6937 - val_loss: 0.6767; auc: 0.8600 - val_auc: 0.8534                                                                                                    
epoch: 0004, loss: 0.6667 - val_loss: 0.6614; auc: 0.8727 - val_auc: 0.8638                                                                                                    
epoch: 0005, loss: 0.6490 - val_loss: 0.6519; auc: 0.8790 - val_auc: 0.8692                                                                                                    
epoch: 0006, loss: 0.6406 - val_loss: 0.6487; auc: 0.8835 - val_auc: 0.8720                                             

In [None]:
pd.DataFrame(pd.DataFrame(model_performance_history)['val_auc'].tolist()).T.iloc[:90].plot()

# training the final model, and test on the external dataset

In [None]:
avg_best_epoch = int(round(pd.DataFrame(model_performance_10fcv).best_epoch.mean()))
avg_best_epoch

In [None]:
trainX = (X1[train_valid_idx], X2[train_valid_idx])
trainY = Y[train_valid_idx]

loss = lambda y_true, y_pred: molmodel.loss.weighted_cross_entropy(y_true,y_pred, pos_weights, MASK = -1)
opt = tf.keras.optimizers.Adam(lr=lr, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0) #
performance = molmodel.cbks.CLA_EarlyStoppingAndPerformance((trainX, trainY), 
                                                            (testX, testY), 
                                                            patience = patience,
                                                            metric = 'ROC',
                                                            criteria = 'val_auc')


model.compile(optimizer = opt, loss = loss)

model.fit(trainX, trainY, batch_size=batch_size, 
          epochs=epochs, verbose= 0, shuffle = True, 
          validation_data = (testX, testY), 
          callbacks=[performance]) 

In [None]:
performance.evaluate(testX, testY) # roc_auc