In [1]:
#!/usr/bin/env python
# coding: utf-8

from molmap.model import RegressionEstimator, MultiClassEstimator, MultiLabelEstimator
from molmap import loadmap, dataset
from molmap.show import imshow_wrap

from sklearn.utils import shuffle 
from joblib import load, dump
import numpy as np
import pandas as pd
import os


def Rdsplit(df, random_state = 1, split_size = [0.8, 0.1, 0.1]):
    """Randomly partition the row positions of ``df`` into train/valid/test.

    Positions ``0 .. len(df)-1`` are shuffled with ``sklearn.utils.shuffle``
    (seeded by ``random_state``) and then cut into three consecutive slices:
    test first, validation second, and everything left over as train.

    Parameters
    ----------
    df : any object supporting ``len()``
        Only its length is used.
    random_state : int
        Seed forwarded to ``shuffle``.
    split_size : sequence of three floats
        ``split_size[1]`` and ``split_size[2]`` are the validation and test
        fractions. ``split_size[0]`` is never read: the training set simply
        receives whatever remains after the other two are taken.

    Returns
    -------
    (train_idx, valid_idx, test_idx)
        Three slices of the shuffled position array. Their lengths are also
        printed, in that order.
    """
    n_rows = len(df)
    order = shuffle(np.arange(n_rows), random_state = random_state)

    # Fractions are truncated toward zero, so rounding losses end up in train.
    n_test = int(n_rows * split_size[2])
    n_valid = int(n_rows * split_size[1])
    valid_end = n_test + n_valid

    test_idx = order[:n_test]
    valid_idx = order[n_test:valid_end]
    train_idx = order[valid_end:]

    print(len(train_idx), len(valid_idx), len(test_idx))

    return train_idx, valid_idx, test_idx



# Load the ESOL dataset through molmap's `dataset` module and keep the three
# attributes the rest of the script reads: the task name (used in cache and
# result file names), the model inputs `x` (presumably SMILES strings, given
# the variable name — confirm), and the underlying data table.
data = dataset.load_ESOL()
task_name, smiles, df = data.task_name, data.x, data.data


# In[5]:


# NOTE(review): this import sits mid-script rather than with the imports at the
# top of the file.
from chembench import load_data
# load_data returns a pair; the first element is discarded. The second,
# `induces` (i.e. split indices), is iterated by the training loop, which
# unpacks each entry into (train_idx, valid_idx, test_idx).
_, induces = load_data(task_name)


# In[6]:

# Load two saved map objects. The paths are relative, so they resolve against
# the current working directory. mp1 and mp2 are used further down through
# .batch_transform(...) to build the X1 and X2 feature arrays respectively.
mp1 = loadmap('../descriptor.mp')
mp2 = loadmap('../fingerprint.mp')


# In[7]:


# NOTE(review): machine-specific absolute path; point this at a local scratch
# directory when running on another machine.
tmp_feature_dir = '/raid/shenwanxiang/09_batchsize_effect/tempignore'
# exist_ok=True replaces the separate os.path.exists() test, which could race
# with another process creating the directory between the check and the call.
os.makedirs(tmp_feature_dir, exist_ok = True)


def _load_or_transform(mp, smiles_list, cache_path, n_jobs = 8):
    """Return the features for ``smiles_list``, cached on disk at ``cache_path``.

    If ``cache_path`` already exists it is read back with ``joblib.load`` and
    ``mp`` is not called. Otherwise ``mp.batch_transform(smiles_list, n_jobs=...)``
    is run, its result is written with ``joblib.dump``, and then returned.

    Parameters
    ----------
    mp : object exposing ``batch_transform(smiles_list, n_jobs=...)``
    smiles_list : the inputs handed to ``batch_transform``
    cache_path : str
        File used as the cache.
    n_jobs : int
        Forwarded to ``batch_transform``.

    Notes
    -----
    ``joblib.load`` unpickles the file, so the cache directory must only hold
    files written by trusted code.
    """
    if os.path.exists(cache_path):
        return load(cache_path)
    features = mp.batch_transform(smiles_list, n_jobs = n_jobs)
    dump(features, cache_path)
    return features


# The cache file names depend only on the task name, so a stale file is reused
# if the map objects change; delete the files to force recomputation.
X1_name = os.path.join(tmp_feature_dir, 'X1_%s.data' % task_name)
X2_name = os.path.join(tmp_feature_dir, 'X2_%s.data' % task_name)

X1 = _load_or_transform(mp1, smiles, X1_name)
X2 = _load_or_transform(mp2, smiles, X2_name)


# In[8]:


# Shape of a single sample's feature map: every axis after the leading one
# (the leading axis is the one indexed by the train/valid/test positions).
fmap_shape1, fmap_shape2 = X1.shape[1:], X2.shape[1:]


# In[9]:


# Regression targets taken from the dataset object; Y is indexed by row
# position in the training loop.
Y = data.y
# Size of Y's second axis, passed to the estimator as its number of outputs.
n_outputs = Y.shape[1]


# induces = []
# for random_state in range(10):
#     induce = Rdsplit(data.data, random_state)
#     induces.append(induce)

# from sklearn.model_selection import KFold
# induces = []
# for random_state in [2, 32, 128, 512, 1024]:
#     kf = KFold(n_splits=5, shuffle = True, random_state=random_state)
#     for tr, ts in kf.split(range(len(df))):
#         induces.append([tr, ts])
        
# Batch sizes to compare. For each one, a fresh estimator is trained on every
# split in `induces`.
batch_sizes  = [8, 64, 128]


# res[k] holds the list of per-split result dicts for batch_sizes[k].
res = []
for batch_size in batch_sizes:

    c1 = []
    for idx in induces:
        train_idx, valid_idx, test_idx = idx

        # Only the X1 features are used in this experiment.
        X, y = X1[train_idx], Y[train_idx]
        X_valid, y_valid = X1[valid_idx], Y[valid_idx]
        X_test, y_test = X1[test_idx], Y[test_idx]

        # New estimator for each (batch size, split) pair; fmap_shape2 is
        # explicitly None here, matching the X1-only inputs above.
        clf = RegressionEstimator(
            n_outputs = n_outputs,
            fmap_shape1 = fmap_shape1,
            fmap_shape2 = None,
            batch_size = batch_size,
            dense_layers = [128, 64],
            gpuid = 5,
            epochs = 800,
            patience = 50,
        )
        clf.fit(X, y, X_valid, y_valid)

        # Score the fitted model on all three partitions.
        train_rmses, train_r2s = clf._performance.evaluate(X, y)
        valid_rmses, valid_r2s = clf._performance.evaluate(X_valid, y_valid)
        test_rmses, test_r2s = clf._performance.evaluate(X_test, y_test)

        # Training history as a DataFrame indexed by its 'epoch' column.
        dfp = pd.DataFrame(clf._performance.history).set_index('epoch')

        # NaN-ignoring means collapse the per-output metrics to one scalar each.
        final_res = dict(
            batch_size = batch_size,
            process = dfp,
            train_rmse = np.nanmean(train_rmses),
            valid_rmse = np.nanmean(valid_rmses),
            test_rmse = np.nanmean(test_rmses),
            train_r2 = np.nanmean(train_r2s),
            valid_r2 = np.nanmean(valid_r2s),
            test_r2 = np.nanmean(test_r2s),
        )
        print(final_res)
        c1.append(final_res)

    res.append(c1)



# Persist the batch sizes together with all per-split results, then print the
# mean test RMSE across splits, one line per batch size (same order as
# batch_sizes).
dump((batch_sizes, res), './%s.x1.res' % task_name)
for runs in res:
    x = pd.DataFrame(runs)['test_rmse'].mean()
    print(x)

total samples: 1128
loading dataset: ESOL number of split times: 3
RegressionEstimator(batch_size=8, gpuid='5')
epoch: 0001, loss: 5.3997 - val_loss: 5.0694; rmse: 2.0863 - rmse_val: 2.2574;  r2: 0.4765 - r2_val: 0.4987                                                                                                    
epoch: 0002, loss: 3.9094 - val_loss: 4.2806; rmse: 1.9170 - rmse_val: 2.0727;  r2: 0.4904 - r2_val: 0.5223                                                                                                    
epoch: 0003, loss: 3.2864 - val_loss: 3.4209; rmse: 1.7275 - rmse_val: 1.8483;  r2: 0.4814 - r2_val: 0.5084                                                                                                    
epoch: 0004, loss: 2.5291 - val_loss: 2.1977; rmse: 1.4045 - rmse_val: 1.4819;  r2: 0.5957 - r2_val: 0.6377                                                                                                    
epoch: 0005, loss: 1.7255 - val_loss: 1.4170; rmse: 1.15