In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tqdm import tqdm
from joblib import load, dump
import time

from molmap import dataset
from molmap import loadmap
from molmap import model as molmodel
import molmap

# Select which GPU to use; if a negative value is given, CPUs will be used
import tensorflow as tf
#import tensorflow_addons as tfa
import os
# Expose only GPU 2 to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

# Seed NumPy and TensorFlow so that results are repeatable.
seed = 123
np.random.seed(seed)
tf.compat.v1.set_random_seed(seed)

# Register tqdm's progress bars with pandas (ASCII rendering).
tqdm.pandas(ascii=True)

def get_pos_weights(trainY):
    """Compute per-column class-balance weights from a label matrix.

    Only entries equal to 1 (positive) or 0 (negative) are counted; any other
    value (e.g. a mask sentinel) is ignored.

    Parameters
    ----------
    trainY : array-like
        Label matrix; it is wrapped in a ``pandas.DataFrame`` and counted
        column by column.

    Returns
    -------
    pos_weights : numpy.ndarray
        ``neg_n / pos_n`` for every column.
    neg_weights : numpy.ndarray
        ``pos_n / neg_n`` for every column.
    """
    labels = pd.DataFrame(trainY)
    n_pos = labels.eq(1).sum(axis=0)
    n_neg = labels.eq(0).sum(axis=0)
    return n_neg.div(n_pos).values, n_pos.div(n_neg).values

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Load the descriptor map (mp1) and the fingerprint map (mp2) saved on disk.
mp1, mp2 = map(molmap.loadmap, ('../descriptor.mp', '../fingerprint.mp'))

In [3]:
task_name = 'Tox21'
from chembench import load_data
df, induces = load_data(task_name)

# Sentinel written into Y wherever a label is missing.
MASK = -1
# First column is treated as the SMILES column, all remaining columns as labels.
smiles_col = df.columns[0]
values_col = df.columns[1:]
Y = df[values_col].astype('float').fillna(MASK).values
# NOTE(review): df[values_col].values is always 2-D, so this branch can only
# fire when there are no label columns at all - confirm the intended condition.
if Y.shape[1] == 0:
    Y = Y.reshape(-1, 1)

# The transformed features are cached on disk. The cache directory has to
# exist before dump() is called: joblib does not create missing parent
# directories, so without this a fresh checkout would fail only after the
# expensive batch_transform had already run.
tmp_feature_dir = './tmpignore'
os.makedirs(tmp_feature_dir, exist_ok=True)
X1_name = os.path.join(tmp_feature_dir, 'X1_%s.data' % task_name)
X2_name = os.path.join(tmp_feature_dir, 'X2_%s.data' % task_name)

# Features from the first map (mp1): reuse the cache if present, else compute and store.
if os.path.exists(X1_name):
    X1 = load(X1_name)
else:
    X1 = mp1.batch_transform(df.smiles, n_jobs = 8)
    dump(X1, X1_name)

# Features from the second map (mp2), cached the same way.
if os.path.exists(X2_name):
    X2 = load(X2_name)
else:
    X2 = mp2.batch_transform(df.smiles, n_jobs = 8)
    dump(X2, X2_name)

# Index arrays of the first predefined split.
train_idx, valid_idx, test_idx = induces[0]

# Per-sample input shapes (everything after the leading sample axis).
molmap1_size = X1.shape[1:]
molmap2_size = X2.shape[1:]
trainY = Y[train_idx]
validY = Y[valid_idx]
testY = Y[test_idx]

loading dataset: Tox21 number of split times: 3


In [13]:
# Optimisation settings.
lr = 1e-4
weight_decay = 0
batch_size = 128
epochs = 100
patience = 10  # early stopping; the best epoch is selected within the `epochs` budget

# Dense head of the network.
dense_layers = [256, 128]
dense_avf = 'relu'
last_avf = None  # no output activation here: the sigmoid is applied in the loss

# Evaluation metric and the quantity monitored for early stopping.
metric = 'ROC'
monitor = 'val_auc'

# Embedding settings used when re-fitting the feature maps.
method = 'umap'
n_neighbors = 10
min_dist = 0.9

In [14]:
# Load separate copies of both maps from disk for re-fitting.
mp1_opt = molmap.loadmap('../descriptor.mp')
mp2_opt = molmap.loadmap('../fingerprint.mp')

# Re-fit the descriptor map with the embedding settings chosen above and save it.
mp1_opt.fit(method=method, n_neighbors=n_neighbors, min_dist=min_dist)
mp1_opt.save('./optimized_des.mp')

# Same for the fingerprint map; the save call stays last so its return value
# remains the cell's displayed output.
mp2_opt.fit(method=method, n_neighbors=n_neighbors, min_dist=min_dist)
mp2_opt.save('./optimized_fp.mp')

UMAP(a=None, angular_rp_forest=False, b=None, init='spectral',
     learning_rate=1.0, local_connectivity=1.0, metric='precomputed',
     metric_kwds=None, min_dist=0.9, n_components=2, n_epochs=None,
     n_neighbors=10, negative_sample_rate=5, random_state=1,
     repulsion_strength=1.0, set_op_mix_ratio=1.0, spread=1.0,
     target_metric='categorical', target_metric_kwds=None,
     target_n_neighbors=-1, target_weight=0.5, transform_queue_size=4.0,
     transform_seed=42, verbose=2)
Construct fuzzy simplicial set
Mon Feb  3 22:31:55 2020 Finding Nearest Neighbors
Mon Feb  3 22:31:55 2020 Finished Nearest Neighbor Search
Mon Feb  3 22:31:55 2020 Construct embedding


  n_components


	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Mon Feb  3 22:31:58 2020 Finished embedding
2020-02-03 22:31:58,398 - [32mINFO[0m - [bidd-molmap][0m - Applying grid feature map(assignment), this may take several minutes(1~30 min)[0m
2020-02-03 22:32:00,518 - [32mINFO[0m - [bidd-molmap][0m - Finished[0m
UMAP(a=None, angular_rp_forest=False, b=None, init='spectral',
     learning_rate=1.0, local_connectivity=1.0, metric='precomputed',
     metric_kwds=None, min_dist=0.9, n_components=2, n_epochs=None,
     n_neighbors=10, negative_sample_rate=5, random_state=1,
     repulsion_strength=1.0, set_op_mix_ratio=1.0, spread=1.0,
     target_metric='categorical', target_metric_kwds=None,
     target_n_neighbors=-1, target_weight=0.5

['./optimized_fp.mp']

In [15]:
# Map the already-computed features onto the re-fitted maps rather than
# re-running batch_transform - presumably only the feature layout changes
# and the array shapes stay the same; TODO confirm.
X1_new = mp1.rearrangement(X1, mp1_opt)
X2_new = mp2.rearrangement(X2, mp2_opt)

100%|##########| 7831/7831 [00:02<00:00, 2657.56it/s]
100%|##########| 7831/7831 [00:01<00:00, 4461.12it/s]


In [16]:
results = []
# Train and evaluate one model per predefined (train, valid, test) split.
for i, split_idxs in enumerate(induces):

    train_idx, valid_idx, test_idx = split_idxs
    print(len(train_idx), len(valid_idx), len(test_idx))

    trainY = Y[train_idx]
    validY = Y[valid_idx]
    testY = Y[test_idx]

    # Every sample is a pair of inputs, one per feature map.
    trainX = (X1_new[train_idx], X2_new[train_idx])
    validX = (X1_new[valid_idx], X2_new[valid_idx])
    testX = (X1_new[test_idx], X2_new[test_idx])

    # Class-balance weights (neg_n / pos_n per label column), computed from
    # the training labels of this split only.
    pos_weights, neg_weights = get_pos_weights(trainY)
    # Pass the same MASK sentinel that was used to fill missing labels in Y,
    # instead of a hard-coded -1, so the two can never drift apart.
    loss = lambda y_true, y_pred: molmodel.loss.weighted_cross_entropy(y_true, y_pred, pos_weights, MASK = MASK)

    model = molmodel.net.DoublePathNet(molmap1_size, molmap2_size,
                                       n_outputs=Y.shape[-1],
                                       dense_layers=dense_layers,
                                       dense_avf = dense_avf,
                                       last_avf=last_avf)

    opt = tf.keras.optimizers.Adam(lr=lr, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0) #
    #import tensorflow_addons as tfa
    #opt = tfa.optimizers.AdamW(weight_decay = 0.1,learning_rate=0.001,beta1=0.9,beta2=0.999, epsilon=1e-08)
    model.compile(optimizer = opt, loss = loss)

    # Callback that receives the train and validation sets together with the
    # patience / criteria / metric settings; judging by its name and the
    # attributes used below it handles early stopping and tracks the best
    # epoch - TODO confirm against molmap.model.cbks.
    performance = molmodel.cbks.CLA_EarlyStoppingAndPerformance((trainX, trainY),
                                                                   (validX, validY),
                                                                   patience = patience,
                                                                   criteria = monitor,
                                                                   metric = metric,
                                                                  )
    model.fit(trainX, trainY, batch_size=batch_size,
          epochs=epochs, verbose= 0, shuffle = True,
          validation_data = (validX, validY),
          callbacks=[performance])


    best_epoch = performance.best_epoch
    trainable_params = model.count_params()

    # Scores on each subset; presumably one value per label column, which is
    # why they are averaged with nanmean below - TODO confirm.
    train_aucs = performance.evaluate(trainX, trainY)
    valid_aucs = performance.evaluate(validX, validY)
    test_aucs = performance.evaluate(testX, testY)


    final_res = {
                     'task_name':task_name,
                     'train_auc':np.nanmean(train_aucs),
                     'valid_auc':np.nanmean(valid_aucs),
                     'test_auc':np.nanmean(test_aucs),
                     'metric':metric,
                     '# trainable params': trainable_params,
                     'best_epoch': best_epoch,
                     'batch_size':batch_size,
                     'lr': lr,
                     'weight_decay':weight_decay
                    }

    results.append(final_res)
    print(final_res)

6264 783 784
epoch: 0001, loss: 1.1649 - val_loss: 1.1592; auc: 0.7119 - val_auc: 0.7251                                                                                                    
epoch: 0002, loss: 1.1072 - val_loss: 1.0913; auc: 0.7352 - val_auc: 0.7524                                                                                                    
epoch: 0003, loss: 1.0583 - val_loss: 1.0355; auc: 0.7456 - val_auc: 0.7584                                                                                                    
epoch: 0004, loss: 1.0305 - val_loss: 1.0142; auc: 0.7584 - val_auc: 0.7666                                                                                                    
epoch: 0005, loss: 1.0066 - val_loss: 1.0099; auc: 0.7735 - val_auc: 0.7753                                                                                                    
epoch: 0006, loss: 0.9923 - val_loss: 0.9919; auc: 0.7796 - val_auc: 0.7766                                

In [17]:
# Show the per-split results as a table (one row per split).
pd.DataFrame(results)

Unnamed: 0,task_name,train_auc,valid_auc,test_auc,metric,# trainable params,best_epoch,batch_size,lr,weight_decay
0,Tox21,0.953479,0.855298,0.842746,ROC,801068,53,128,0.0001,0
1,Tox21,0.929975,0.863619,0.85139,ROC,801068,40,128,0.0001,0
2,Tox21,0.936592,0.870506,0.84685,ROC,801068,42,128,0.0001,0


In [18]:
# Save the per-split results to a CSV file named after the task.
pd.DataFrame(results).to_csv('./%s_optimized.csv' % task_name)

In [19]:
# Mean test-set AUC over all splits.
pd.DataFrame(results)['test_auc'].mean()

0.8469954155611884

In [20]:
# Mean validation-set AUC over all splits.
pd.DataFrame(results)['valid_auc'].mean()

0.8631409047595492