In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tqdm import tqdm
from joblib import load, dump


from molmap import dataset
from molmap import loadmap
from molmap import model as molmodel
import molmap

# GPU selection: CUDA_VISIBLE_DEVICES (set below) picks the GPU to use;
# per the original note, a negative value makes the run fall back to CPUs.
import tensorflow as tf
#import tensorflow_addons as tfa
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

## Fix the random seeds to get repeatable results.
seed = 123
# Enable tqdm's pandas integration, drawing progress bars with ASCII characters.
tqdm.pandas(ascii=True)
# Seed NumPy's global RNG and TensorFlow's (v1-compat) global seed with the same value.
np.random.seed(seed)
tf.compat.v1.set_random_seed(seed)



In [2]:
def get_attentiveFP_idx(df):
    """Map the stored AttentiveFP ESOL split onto the rows of ``df``.

    The split is read from '../ESOL_train_valid_test.data' as three tables
    (train, valid, test), each exposing a ``smiles`` column. A row of ``df``
    belongs to a split when its ``smiles`` value occurs in that split.

    Two summary lines are printed: the sizes of the stored splits, then the
    number of ``df`` rows matched to each split.

    Parameters
    ----------
    df : DataFrame-like object with a ``smiles`` column.

    Returns
    -------
    tuple
        ``(train_idx, valid_idx, test_idx)`` -- the index labels of the
        matching ``df`` rows for each split.
    """
    # NOTE(review): joblib's load unpickles the file -- only use trusted split files.
    splits = load('../ESOL_train_valid_test.data')
    train, valid, test = splits
    summary = 'training set: %s, valid set: %s, test set %s'
    print(summary % (len(train), len(valid), len(test)))

    def _matching_index(split):
        # Index labels of the df rows whose SMILES appear in this split.
        in_split = df.smiles.isin(split.smiles)
        return df[in_split].index

    train_idx = _matching_index(train)
    valid_idx = _matching_index(valid)
    test_idx = _matching_index(test)
    print(summary % (len(train_idx), len(valid_idx), len(test_idx)))
    return train_idx, valid_idx, test_idx 

# Load the ESOL dataset: `df` is the molecule table (it has a `smiles` column),
# `Y` holds the regression targets.
data = dataset.load_ESOL()
df = data.data
Y = data.y


task_name = 'ESOL'
# Scratch directory for cached feature maps. `exist_ok=True` creates it only if
# missing, without the check-then-create race of a separate os.path.exists test.
tmp_feature_dir = './tmpignore'
os.makedirs(tmp_feature_dir, exist_ok=True)

# Pre-built descriptor MolMap object used to featurise the molecules.
mp1 = loadmap('../../descriptor.mp')


# Featurising every molecule is the slow step, so the result is cached on disk
# and re-used on later runs.
# NOTE(review): the cache is keyed only by task name; delete the file if the
# dataset or the descriptor map changes, otherwise stale features are loaded.
X1_name = os.path.join(tmp_feature_dir, 'X1_%s.data' % task_name)
if os.path.exists(X1_name):
    X1 = load(X1_name)
else:
    X1 = mp1.batch_transform(df.smiles, n_jobs = 8)
    dump(X1, X1_name)


# Reuse the AttentiveFP train/valid/test split so results are comparable.
# NOTE(review): the recorded output reports 901 + 113 + 113 = 1127 matched rows
# out of 1128 samples, so one molecule is not assigned to any split -- confirm
# this is expected.
train_idx, valid_idx, test_idx = get_attentiveFP_idx(df)
# Assumes the index labels of `df` are also valid positions in `Y` -- TODO confirm.
trainY = Y[train_idx]
validY = Y[valid_idx]

total samples: 1128
training set: 901, valid set: 113, test set 113
training set: 901, valid set: 113, test set 113


In [3]:
import time
# Timestamp of this run with ':' and ' ' replaced so it can be used in a file name.
start_time = time.ctime().replace(':', '-').replace(' ', '_')
log_file = '_'.join([data.task_name, start_time]) + '.log'

# Write the CSV header once; one result row per hyper-parameter setting is
# appended to the same file later by Eva.
with open(log_file, 'a') as f:
    f.write('n_neighbors,min_dist,valid_best_rmse,train_best_rmse,best_epoch\n')

def Eva(n_neighbors,  min_dist):
    """Evaluate one UMAP hyper-parameter setting and log the result.

    A fresh descriptor map is loaded and re-fitted with UMAP using the given
    ``n_neighbors`` / ``min_dist``; the cached feature maps ``X1`` are
    re-arranged to that new layout, a ``SinglePathNet`` regressor is trained on
    the training rows, and the train/validation RMSE are computed through the
    performance callback.

    Relies on notebook-level state: ``mp1``, ``X1``, ``train_idx``,
    ``valid_idx``, ``trainY``, ``validY`` and ``log_file``.

    Parameters
    ----------
    n_neighbors : number
        UMAP ``n_neighbors``; truncated to ``int`` before use.
    min_dist : number
        UMAP ``min_dist``; rounded to one decimal place before use.

    Returns
    -------
    list
        ``[valid_best_rmse, train_best_rmse, best_epoch]``.

    Side effects
    ------------
    Prints the setting being evaluated and appends one CSV row
    ``n_neighbors,min_dist,valid_best_rmse,train_best_rmse,best_epoch`` to
    ``log_file``.
    """
    min_dist = round(min_dist, 1)
    n_neighbors = int(n_neighbors)

    print({'min_dist':min_dist, 'n_neighbors':n_neighbors})

    # Load a fresh map for every setting so each fit starts from the same
    # stored descriptor map rather than from a previously re-fitted one.
    mp_new =  loadmap('../../descriptor.mp')
    mp_new.fit(method = 'umap', min_dist = min_dist, n_neighbors = n_neighbors)
    # Re-map the cached features from mp1's layout onto the newly fitted layout
    # instead of re-featurising every molecule.
    X_new = mp1.rearrangement(X1, mp_new)

    trainX = X_new[train_idx]
    validX = X_new[valid_idx]

    # NOTE(review): `lr` and `decay` are legacy Keras argument names; newer
    # Keras releases expect `learning_rate` -- confirm against the pinned version.
    opt = tf.keras.optimizers.Adam(lr = 1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model = molmodel.net.SinglePathNet(trainX.shape[1:],
                                       n_outputs=1, dense_layers=[128, 64],
                                       dense_avf='tanh', last_avf='linear')

    model.compile(optimizer = opt, loss = 'mse')
    performance = molmodel.cbks.Reg_EarlyStoppingAndPerformance((trainX, trainY),
                                                               (validX, validY),
                                                               # presumably: stop once val_loss has not
                                                               # improved for 30 epochs -- TODO confirm
                                                               patience = 30,
                                                               criteria = 'val_loss')
    model.fit(trainX, trainY, batch_size = 128,
              epochs=800, verbose= 0, shuffle = True,
              validation_data = (validX, validY),
              callbacks=[performance])

    valid_rmse, valid_r2 = performance.evaluate(validX, validY)
    train_rmse, train_r2 = performance.evaluate(trainX, trainY)

    # nanmean collapses the RMSE result(s) to a single scalar, ignoring NaNs.
    valid_best_rmse = np.nanmean(valid_rmse)
    train_best_rmse = np.nanmean(train_rmse)
    best_epoch = performance.best_epoch

    # Fix: write n_neighbors before min_dist so the row matches the CSV header
    # ('n_neighbors,min_dist,...'); the two columns used to be swapped.
    row = [n_neighbors, min_dist, valid_best_rmse, train_best_rmse, best_epoch]
    with open(log_file, 'a') as f:
        f.write(','.join(str(value) for value in row) + '\n')

    return [valid_best_rmse, train_best_rmse, best_epoch]

In [None]:
if __name__ == '__main__':

    # Exhaustive grid search over the two UMAP hyper-parameters: every
    # n_neighbors value is combined with every min_dist value (10 x 10 runs).
    n_neighbors_list = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    min_dist_list = [0, 0.1, 0.2, 0.3, 0.4,0.5,0.6,0.7, 0.8, 0.9]

    # Each entry: [n_neighbors, min_dist, valid_best_rmse, train_best_rmse, best_epoch]
    res = []
    for n_neighbors in n_neighbors_list:
        for min_dist in min_dist_list:
            output = Eva(n_neighbors,  min_dist)
            res.append([n_neighbors, min_dist, *output])
{'min_dist': 0, 'n_neighbors': 10}
UMAP(a=None, angular_rp_forest=False, b=None, init='spectral',
   learning_rate=1.0, local_connectivity=1.0, metric='precomputed',
   metric_kwds=None, min_dist=0, n_components=2, n_epochs=None,
   n_neighbors=10, negative_sample_rate=5, random_state=1,
   repulsion_strength=1.0, set_op_mix_ratio=1.0, spread=1.0,
   target_metric='categorical', target_metric_kwds=None,
   target_n_neighbors=-1, target_weight=0.5, transform_queue_size=4.0,
   transform_seed=42, verbose=2)
Construct fuzzy simplicial set
Wed Jan 22 15:03:17 2020 Finding Nearest Neighbors
Wed Jan 22 15:03:17 2020 Finished Nearest Neighbor Search
Wed Jan 22 15:03:18 2020 Construct embedding


  n_components


	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Wed Jan 22 15:03:21 2020 Finished embedding
2020-01-22 15:03:21,135 - [32mINFO[0m - [bidd-molmap][0m - Applying grid feature map(assignment), this may take several minutes(1~30 min)[0m
2020-01-22 15:03:23,440 - [32mINFO[0m - [bidd-molmap][0m - Finished[0m


100%|##########| 1128/1128 [00:00<00:00, 3361.16it/s]


epoch: 0001, loss: 10.8377 - val_loss: 6.8618; rmse: 2.5690 - rmse_val: 2.6195;  r2: 0.3150 - r2_val: 0.2611                                                                                                    
epoch: 0002, loss: 5.3095 - val_loss: 4.1448; rmse: 2.1086 - rmse_val: 2.0359;  r2: 0.3810 - r2_val: 0.3454                                                                                                    
epoch: 0003, loss: 4.6133 - val_loss: 4.3220; rmse: 2.2076 - rmse_val: 2.0789;  r2: 0.4917 - r2_val: 0.4670                                                                                                    
epoch: 0004, loss: 4.7950 - val_loss: 4.1071; rmse: 2.1304 - rmse_val: 2.0266;  r2: 0.5571 - r2_val: 0.5425                                                                                                    
epoch: 0005, loss: 4.4861 - val_loss: 4.1398; rmse: 2.0973 - rmse_val: 2.0347;  r2: 0.5994 - r2_val: 0.5817                                                            

  n_components


	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Wed Jan 22 15:10:18 2020 Finished embedding
2020-01-22 15:10:18,716 - [32mINFO[0m - [bidd-molmap][0m - Applying grid feature map(assignment), this may take several minutes(1~30 min)[0m
2020-01-22 15:10:21,425 - [32mINFO[0m - [bidd-molmap][0m - Finished[0m


100%|##########| 1128/1128 [00:00<00:00, 3428.12it/s]


epoch: 0001, loss: 10.9136 - val_loss: 7.1743; rmse: 2.6197 - rmse_val: 2.6785;  r2: 0.1227 - r2_val: 0.0517                                                                                                    
epoch: 0002, loss: 5.5197 - val_loss: 4.2010; rmse: 2.0971 - rmse_val: 2.0496;  r2: 0.1324 - r2_val: 0.0657                                                                                                    
epoch: 0003, loss: 4.4476 - val_loss: 4.2251; rmse: 2.1675 - rmse_val: 2.0555;  r2: 0.1737 - r2_val: 0.1040                                                                                                    
epoch: 0004, loss: 4.6629 - val_loss: 4.0370; rmse: 2.1039 - rmse_val: 2.0092;  r2: 0.2419 - r2_val: 0.1698                                                                                                    
epoch: 0005, loss: 4.3547 - val_loss: 4.0204; rmse: 2.0595 - rmse_val: 2.0051;  r2: 0.3137 - r2_val: 0.2434                                                            

  n_components


 50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Wed Jan 22 15:18:17 2020 Finished embedding
2020-01-22 15:18:17,901 - [32mINFO[0m - [bidd-molmap][0m - Applying grid feature map(assignment), this may take several minutes(1~30 min)[0m
2020-01-22 15:18:20,301 - [32mINFO[0m - [bidd-molmap][0m - Finished[0m


100%|##########| 1128/1128 [00:00<00:00, 3463.66it/s]


epoch: 0001, loss: 12.2548 - val_loss: 8.6364; rmse: 2.8617 - rmse_val: 2.9388;  r2: 0.1676 - r2_val: 0.0954                                                                                                    
epoch: 0002, loss: 6.5415 - val_loss: 4.7038; rmse: 2.1818 - rmse_val: 2.1688;  r2: 0.2006 - r2_val: 0.1327                                                                                                    
epoch: 0003, loss: 4.4760 - val_loss: 4.1173; rmse: 2.1354 - rmse_val: 2.0291;  r2: 0.2644 - r2_val: 0.1957                                                                                                    
epoch: 0004, loss: 4.6726 - val_loss: 4.1375; rmse: 2.1456 - rmse_val: 2.0341;  r2: 0.3204 - r2_val: 0.2469                                                                                                    
epoch: 0005, loss: 4.5039 - val_loss: 4.0020; rmse: 2.0744 - rmse_val: 2.0005;  r2: 0.3784 - r2_val: 0.3025                                                            

  n_components


epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Wed Jan 22 15:23:16 2020 Finished embedding
2020-01-22 15:23:16,110 - [32mINFO[0m - [bidd-molmap][0m - Applying grid feature map(assignment), this may take several minutes(1~30 min)[0m
2020-01-22 15:23:18,857 - [32mINFO[0m - [bidd-molmap][0m - Finished[0m


100%|##########| 1128/1128 [00:00<00:00, 3819.79it/s]


epoch: 0001, loss: 11.3438 - val_loss: 6.9931; rmse: 2.5920 - rmse_val: 2.6444;  r2: 0.1903 - r2_val: 0.1414                                                                                                    
epoch: 0002, loss: 5.3634 - val_loss: 4.1454; rmse: 2.1046 - rmse_val: 2.0360;  r2: 0.2748 - r2_val: 0.2414                                                                                                    
epoch: 0003, loss: 4.5563 - val_loss: 4.2605; rmse: 2.1878 - rmse_val: 2.0641;  r2: 0.3621 - r2_val: 0.3302                                                                                                    
epoch: 0004, loss: 4.7145 - val_loss: 4.0481; rmse: 2.1127 - rmse_val: 2.0120;  r2: 0.4215 - r2_val: 0.3938                                                                                                    
epoch: 0005, loss: 4.4017 - val_loss: 4.0454; rmse: 2.0730 - rmse_val: 2.0113;  r2: 0.4513 - r2_val: 0.4244                                                            

  n_components


	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Wed Jan 22 15:28:28 2020 Finished embedding
2020-01-22 15:28:29,006 - [32mINFO[0m - [bidd-molmap][0m - Applying grid feature map(assignment), this may take several minutes(1~30 min)[0m
2020-01-22 15:28:30,883 - [32mINFO[0m - [bidd-molmap][0m - Finished[0m


100%|##########| 1128/1128 [00:00<00:00, 3838.80it/s]


epoch: 0001, loss: 10.4020 - val_loss: 6.8418; rmse: 2.5665 - rmse_val: 2.6157;  r2: 0.1616 - r2_val: 0.1299                                                                                                    
epoch: 0002, loss: 5.3129 - val_loss: 4.1258; rmse: 2.1023 - rmse_val: 2.0312;  r2: 0.2404 - r2_val: 0.2219                                                                                                    
epoch: 0003, loss: 4.5777 - val_loss: 4.2892; rmse: 2.1967 - rmse_val: 2.0710;  r2: 0.3241 - r2_val: 0.3134                                                                                                    
epoch: 0004, loss: 4.7335 - val_loss: 4.0468; rmse: 2.1116 - rmse_val: 2.0117;  r2: 0.3810 - r2_val: 0.3630                                                                                                    
epoch: 0005, loss: 4.4046 - val_loss: 4.0721; rmse: 2.0776 - rmse_val: 2.0179;  r2: 0.4217 - r2_val: 0.3965                                                            

  n_components


	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Wed Jan 22 15:33:27 2020 Finished embedding
2020-01-22 15:33:27,072 - [32mINFO[0m - [bidd-molmap][0m - Applying grid feature map(assignment), this may take several minutes(1~30 min)[0m
2020-01-22 15:33:29,380 - [32mINFO[0m - [bidd-molmap][0m - Finished[0m


100%|##########| 1128/1128 [00:00<00:00, 3419.66it/s]


epoch: 0001, loss: 10.3859 - val_loss: 6.3450; rmse: 2.4807 - rmse_val: 2.5189;  r2: 0.0809 - r2_val: 0.0665                                                                                                    
epoch: 0002, loss: 5.0267 - val_loss: 4.1245; rmse: 2.1237 - rmse_val: 2.0309;  r2: 0.2070 - r2_val: 0.2094                                                                                                    
epoch: 0003, loss: 4.7487 - val_loss: 4.3345; rmse: 2.2105 - rmse_val: 2.0819;  r2: 0.4036 - r2_val: 0.3954                                                                                                    
epoch: 0004, loss: 4.7145 - val_loss: 4.0610; rmse: 2.1043 - rmse_val: 2.0152;  r2: 0.5253 - r2_val: 0.5068                                                                                                    
epoch: 0005, loss: 4.4213 - val_loss: 4.1777; rmse: 2.0955 - rmse_val: 2.0440;  r2: 0.5887 - r2_val: 0.5624                                                            

  n_components


 50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Wed Jan 22 15:40:21 2020 Finished embedding
2020-01-22 15:40:21,039 - [32mINFO[0m - [bidd-molmap][0m - Applying grid feature map(assignment), this may take several minutes(1~30 min)[0m
2020-01-22 15:40:22,835 - [32mINFO[0m - [bidd-molmap][0m - Finished[0m


100%|##########| 1128/1128 [00:00<00:00, 3448.49it/s]


epoch: 0001, loss: 11.2932 - val_loss: 7.4420; rmse: 2.6677 - rmse_val: 2.7280;  r2: 0.1415 - r2_val: 0.0844                                                                                                    
epoch: 0002, loss: 5.6392 - val_loss: 4.2094; rmse: 2.1039 - rmse_val: 2.0517;  r2: 0.1617 - r2_val: 0.1081                                                                                                    
epoch: 0003, loss: 4.4883 - val_loss: 4.2550; rmse: 2.1806 - rmse_val: 2.0628;  r2: 0.2165 - r2_val: 0.1610                                                                                                    
epoch: 0004, loss: 4.7142 - val_loss: 4.0516; rmse: 2.1119 - rmse_val: 2.0129;  r2: 0.2951 - r2_val: 0.2347                                                                                                    
epoch: 0005, loss: 4.3754 - val_loss: 4.0074; rmse: 2.0590 - rmse_val: 2.0018;  r2: 0.3651 - r2_val: 0.3049                                                            

  n_components


	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Wed Jan 22 15:45:27 2020 Finished embedding
2020-01-22 15:45:27,790 - [32mINFO[0m - [bidd-molmap][0m - Applying grid feature map(assignment), this may take several minutes(1~30 min)[0m
2020-01-22 15:45:29,816 - [32mINFO[0m - [bidd-molmap][0m - Finished[0m


100%|##########| 1128/1128 [00:00<00:00, 3764.37it/s]


epoch: 0001, loss: 9.1635 - val_loss: 5.7287; rmse: 2.3678 - rmse_val: 2.3935;  r2: 0.2445 - r2_val: 0.1819                                                                                                    
epoch: 0002, loss: 4.7416 - val_loss: 4.0774; rmse: 2.1119 - rmse_val: 2.0193;  r2: 0.2367 - r2_val: 0.1816                                                                                                    
epoch: 0003, loss: 4.6618 - val_loss: 4.2293; rmse: 2.1742 - rmse_val: 2.0565;  r2: 0.3012 - r2_val: 0.2494                                                                                                    
epoch: 0004, loss: 4.5606 - val_loss: 3.9833; rmse: 2.0757 - rmse_val: 1.9958;  r2: 0.3746 - r2_val: 0.3200                                                                                                    
epoch: 0005, loss: 4.3018 - val_loss: 4.0642; rmse: 2.0645 - rmse_val: 2.0160;  r2: 0.4227 - r2_val: 0.3660                                                             

  n_components


	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Wed Jan 22 15:57:56 2020 Finished embedding
2020-01-22 15:57:56,736 - [32mINFO[0m - [bidd-molmap][0m - Applying grid feature map(assignment), this may take several minutes(1~30 min)[0m
2020-01-22 15:58:03,137 - [32mINFO[0m - [bidd-molmap][0m - Finished[0m


100%|##########| 1128/1128 [00:01<00:00, 864.47it/s]


epoch: 0001, loss: 8.5326 - val_loss: 5.1504; rmse: 2.2614 - rmse_val: 2.2694;  r2: 0.2758 - r2_val: 0.2074                                                                                                    
epoch: 0002, loss: 4.4948 - val_loss: 4.0798; rmse: 2.1237 - rmse_val: 2.0198;  r2: 0.2646 - r2_val: 0.1992                                                                                                    
epoch: 0003, loss: 4.6637 - val_loss: 4.0915; rmse: 2.1327 - rmse_val: 2.0228;  r2: 0.3264 - r2_val: 0.2613                                                                                                    
epoch: 0004, loss: 4.3478 - val_loss: 3.8747; rmse: 2.0331 - rmse_val: 1.9684;  r2: 0.3810 - r2_val: 0.3139                                                                                                    
epoch: 0005, loss: 4.1286 - val_loss: 3.8863; rmse: 2.0150 - rmse_val: 1.9714;  r2: 0.3971 - r2_val: 0.3334                                                             