In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tqdm import tqdm
from joblib import load, dump


from molmap import dataset
from molmap import loadmap
from molmap import model as molmodel
import molmap

# Select the GPU to use; if a negative value is given, CPUs will be used.
import tensorflow as tf
#import tensorflow_addons as tfa
import os
# Restrict CUDA to the device with id "1" for this process.
os.environ["CUDA_VISIBLE_DEVICES"]="1"

## Fix the random seeds to get repeatable results.
seed = 123
# Enable tqdm's pandas integration, drawing progress bars with ASCII characters.
tqdm.pandas(ascii=True)
# Seed NumPy's global RNG and TensorFlow (v1-compat API) with the same value.
np.random.seed(seed)
tf.compat.v1.set_random_seed(seed)


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:



def get_attentiveFP_idx(df, split_file='../ESOL_train_valid_test.data'):
    """Find the rows of ``df`` that belong to each AttentiveFP data split.

    The split file is read with joblib ``load`` and must unpack into three
    objects (train, valid, test), each exposing a ``smiles`` attribute.
    A row of ``df`` is assigned to a split when its ``smiles`` value occurs
    in that split's ``smiles``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``smiles`` column.
    split_file : str, optional
        Path of the joblib file holding the (train, valid, test) triple.
        Defaults to the ESOL split file, so existing one-argument calls
        behave exactly as before.

    Returns
    -------
    tuple
        ``(train_idx, valid_idx, test_idx)``: the index labels of the rows
        of ``df`` matched to each split.
    """
    # NOTE(review): joblib.load unpickles the file -- only point this at trusted split files.
    train, valid, test = load(split_file)
    # Sizes of the splits as stored in the file.
    print('training set: %s, valid set: %s, test set %s' % (len(train), len(valid), len(test)))
    train_idx = df[df.smiles.isin(train.smiles)].index
    valid_idx = df[df.smiles.isin(valid.smiles)].index
    test_idx = df[df.smiles.isin(test.smiles)].index
    # Number of rows of ``df`` actually matched; a mismatch with the line above
    # means some split SMILES are missing from (or duplicated in) ``df``.
    print('training set: %s, valid set: %s, test set %s' % (len(train_idx), len(valid_idx), len(test_idx)))
    return train_idx, valid_idx, test_idx

# Load the ESOL dataset: `df` is the data table (it has a `smiles` column), `Y` the targets.
data = dataset.load_ESOL()
df = data.data
Y = data.y


task_name = 'ESOL'
tmp_feature_dir = './tmpignore'
# exist_ok=True replaces the racy "check, then create" pattern.
os.makedirs(tmp_feature_dir, exist_ok=True)
mp1 = loadmap('../../descriptor.mp')


# Cache the transformed features on disk so they are computed only once per task name.
X1_name = os.path.join(tmp_feature_dir, 'X1_%s.data' % task_name)
if os.path.exists(X1_name):
    X1 = load(X1_name)
else:
    X1 = mp1.batch_transform(df.smiles, n_jobs = 8)
    dump(X1, X1_name)



# Row indices of each split; only the train/valid targets are sliced in this cell.
train_idx, valid_idx, test_idx = get_attentiveFP_idx(df)
trainY = Y[train_idx]
validY = Y[valid_idx]




total samples: 1128
training set: 901, valid set: 113, test set 113
training set: 901, valid set: 113, test set 113


In [4]:

# UMAP hyper-parameters forwarded to mp_new.fit below.
min_dist = 0.1
n_neighbors = 15

print({'min_dist':min_dist, 'n_neighbors':n_neighbors})
# Load a second map object from the same file as mp1 and fit it with the UMAP method.
mp_new =  loadmap('../../descriptor.mp')
mp_new.fit(method = 'umap', min_dist = min_dist, n_neighbors = n_neighbors)
# Rearrange the cached features X1 to match mp_new
# (presumably this avoids re-extracting the features -- TODO confirm in molmap).
X_new = mp1.rearrangement(X1, mp_new)

# Slice the rearranged features with the precomputed split indices.
trainX = X_new[train_idx]
validX = X_new[valid_idx]

# Adam optimizer with learning rate 1e-4 and no learning-rate decay.
opt = tf.keras.optimizers.Adam(lr = 1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
# Network input shape is the per-sample shape of trainX; single output with a linear activation.
model = molmodel.net.SinglePathNet(trainX.shape[1:], 
                                   n_outputs=1, dense_layers=[128, 32], 
                                   dense_avf='tanh', last_avf='linear')

model.compile(optimizer = opt, loss = 'mse')
performance = molmodel.cbks.Reg_EarlyStoppingAndPerformance((trainX, trainY), 
                                                           (validX, validY), 
                                                           patience = 1000000, # patience far exceeds epochs=500: find the best epoch within all 500 epochs
                                                           criteria = 'val_loss')
model.fit(trainX, trainY, batch_size = 128, 
      epochs=500, verbose= 0, shuffle = True, 
      validation_data = (validX, validY), 
      callbacks=[performance]) 

# evaluate() is unpacked as (rmse, r2) for the given data.
valid_rmse, valid_r2 = performance.evaluate(validX, validY)
train_rmse, train_r2 = performance.evaluate(trainX, trainY)

# NaN-ignoring mean of the RMSE values (presumably one value per output task -- TODO confirm).
valid_best_rmse = np.nanmean(valid_rmse)
# Best value and epoch tracked by the callback (presumably of the 'val_loss' criteria -- TODO confirm).
valid_best_loss = performance.best
train_best_rmse = np.nanmean(train_rmse)
best_epoch = performance.best_epoch

{'min_dist': 0.1, 'n_neighbors': 15}
UMAP(a=None, angular_rp_forest=False, b=None, init='spectral',
     learning_rate=1.0, local_connectivity=1.0, metric='precomputed',
     metric_kwds=None, min_dist=0.1, n_components=2, n_epochs=None,
     n_neighbors=15, negative_sample_rate=5, random_state=1,
     repulsion_strength=1.0, set_op_mix_ratio=1.0, spread=1.0,
     target_metric='categorical', target_metric_kwds=None,
     target_n_neighbors=-1, target_weight=0.5, transform_queue_size=4.0,
     transform_seed=42, verbose=2)
Construct fuzzy simplicial set
Wed Jan 29 10:36:48 2020 Finding Nearest Neighbors
Wed Jan 29 10:36:48 2020 Finished Nearest Neighbor Search
Wed Jan 29 10:36:50 2020 Construct embedding
	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed

100%|##########| 1128/1128 [00:00<00:00, 2602.89it/s]


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
epoch: 0001, loss: 8.7544 - val_loss: 5.3413; rmse: 2.2996 - rmse_val: 2.3111;  r2: 0.3313 - r2_val: 0.2890                                                                                                    
epoch: 0002, loss: 4.7726 - val_loss: 4.1018; rmse: 2.1098 - rmse_val: 2.0253;  r2: 0.4449 - r2_val: 0.4530                                                                                                    
epoch: 0003, loss: 4.5025 - val_loss: 4.0896; rmse: 2.1250 - rmse_val: 2.0223;  r2: 0.4916 - r2_val: 0.4888                                                                                                    
epoch: 0004, loss: 4.4563 - val_loss: 4.0453; rmse: 2.0877 - rmse_val: 2.0113;  r2: 0.5304 - r2_val: 0.5252                                                                                                    
epoch: 0005, loss: 4.3412 - val_loss: 4.0518; rmse

In [5]:
# Display the epoch the callback recorded as best (performance.best_epoch).
best_epoch

444

In [6]:
# Display the NaN-ignoring mean validation RMSE computed from performance.evaluate.
valid_best_rmse

0.5092212752798269