In [1]:
from molmap import model as molmodel
import molmap
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
from joblib import load, dump
tqdm.pandas(ascii=True)
import numpy as np
import tensorflow as tf
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1"
np.random.seed(123)
tf.compat.v1.set_random_seed(123)



In [2]:
mp1 = molmap.loadmap('../../descriptor.mp')
mp2 = molmap.loadmap('../../fingerprint.mp')

In [3]:
task_name = 'ESOL'
from chembench import load_data
df, induces = load_data(task_name)

loading dataset: ESOL number of split times: 3


In [4]:
smiles_col = df.columns[0]
values_col = df.columns[1:]
Y = df[values_col].astype('float').values
Y = Y.reshape(-1, 1)

tmp_feature_dir = '../../02_OutofTheBox_benchmark_comparison_DMPNN/tmpignore/'

if not os.path.exists(tmp_feature_dir):
    os.makedirs(tmp_feature_dir)
    
X1_name = os.path.join(tmp_feature_dir, 'X1_%s.data' % task_name)
X2_name = os.path.join(tmp_feature_dir, 'X2_%s.data' % task_name)
if not os.path.exists(X1_name):
    X1 = mp1.batch_transform(df.smiles, n_jobs = 8)
    dump(X1, X1_name)
else:
    X1 = load(X1_name)

if not os.path.exists(X2_name): 
    X2 = mp2.batch_transform(df.smiles, n_jobs = 8)
    dump(X2, X2_name)
else:
    X2 = load(X2_name)
    
molmap1_size = X1.shape[1:]
molmap2_size = X2.shape[1:]

In [5]:
X1.shape, X2.shape

((1128, 37, 37, 13), (1128, 37, 36, 3))

# 5 fold-cv

In [6]:
from sklearn.model_selection import KFold
kf  = KFold(n_splits = 5, random_state=123)

In [None]:
epochs = 800
patience = 30 

batch_size = 128
lr = 1e-4
weight_decay = 0

loss = 'mse'
monitor = 'val_loss'
dense_avf = 'relu'
last_avf = 'linear'


stypes = ['descriptor_path', 'fingerprint_path', 'both_path']
idx = list(range(len(df)))
results = []
for stype in stypes:
    i = 0
    for train_idx, valid_idx in kf.split(idx):
        trainY = Y[train_idx]
        validY = Y[valid_idx]      

        if stype == 'descriptor_path':

            trainX = X1[train_idx]
            validX = X1[valid_idx]   

            model = molmodel.net.SinglePathNet(molmap1_size,  
                                               n_outputs=Y.shape[-1], 
                                               dense_layers=[128, 32], 
                                               dense_avf = dense_avf, 
                                               last_avf=last_avf)
        
        if stype == 'fingerprint_path':

            trainX = X2[train_idx]
            validX = X2[valid_idx]

            model = molmodel.net.SinglePathNet(molmap2_size,  
                                               n_outputs=Y.shape[-1], 
                                               dense_layers=[128, 32], 
                                               dense_avf = dense_avf, 
                                               last_avf=last_avf)        



        if stype == 'both_path':
            trainX = (X1[train_idx], X2[train_idx])
            validX = (X1[valid_idx], X2[valid_idx])
            model = molmodel.net.DoublePathNet(molmap1_size, molmap2_size, 
                                               n_outputs=Y.shape[-1], 
                                               dense_layers=[256, 128, 32], 
                                               dense_avf = dense_avf, 
                                               last_avf=last_avf)


        opt = tf.keras.optimizers.Adam(lr=lr, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0) #
        #import tensorflow_addons as tfa
        #opt = tfa.optimizers.AdamW(weight_decay = 0.1,learning_rate=0.001,beta1=0.9,beta2=0.999, epsilon=1e-08)
        model.compile(optimizer = opt, loss = loss)
        performance = molmodel.cbks.Reg_EarlyStoppingAndPerformance((trainX, trainY), 
                                                                   (validX, validY), 
                                                                   patience = patience, 
                                                                   criteria = monitor)
        model.fit(trainX, trainY, batch_size=batch_size, 
              epochs=epochs, verbose= 0, shuffle = True, 
              validation_data = (validX, validY), 
              callbacks=[performance]) 

        trainable_params = model.count_params()
        _, val_rmse = performance.evaluate(validX, validY)
        _, train_rmse = performance.evaluate(trainX, trainY)        
        train_rmse = np.nanmean(train_rmse)
        val_rmse = np.nanmean(val_rmse)
        
        score = (val_rmse/train_rmse) * val_rmse
    
        
        final_res = { 
                    'stype':stype,
                    'fold':i,
                    'train_rmse':train_rmse,
                    'valid_rmse':val_rmse,
                    'trainable params': trainable_params,
                    'best_epoch': performance.best_epoch,
                    }
        results.append(final_res)
        i += 1



epoch: 0001, loss: 11.2228 - val_loss: 9.6345; rmse: 3.0003 - rmse_val: 3.1177;  r2: 0.1972 - r2_val: 0.2144                                                                                                    
epoch: 0002, loss: 7.7324 - val_loss: 6.0543; rmse: 2.3401 - rmse_val: 2.4699;  r2: 0.1613 - r2_val: 0.1823                                                                                                    
epoch: 0003, loss: 4.8097 - val_loss: 4.8910; rmse: 2.0953 - rmse_val: 2.2097;  r2: 0.2514 - r2_val: 0.2915                                                                                                    
epoch: 0004, loss: 4.3989 - val_loss: 4.7430; rmse: 2.0577 - rmse_val: 2.1781;  r2: 0.3394 - r2_val: 0.3853                                                                                                    
epoch: 0005, loss: 4.2887 - val_loss: 4.8053; rmse: 2.0683 - rmse_val: 2.1962;  r2: 0.3512 - r2_val: 0.4067                                                            

In [None]:
dfres = pd.DataFrame(results)
dfres.to_csv('./esol_5fcv.csv')