In [1]:
# --- standard library -------------------------------------------------------
import os
import time

# Select the GPU *before* TensorFlow is imported (directly, or indirectly via
# molmap -- presumably molmap.model imports TensorFlow; verify). The CUDA
# runtime reads CUDA_VISIBLE_DEVICES when it initialises, so assigning it
# after the framework has already been loaded is not guaranteed to take effect.
# Use a negative value (e.g. "-1") to hide every GPU and run on the CPUs.
os.environ["CUDA_VISIBLE_DEVICES"] = "7"

# --- third-party ------------------------------------------------------------
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from joblib import load, dump
from tqdm import tqdm

# --- project ----------------------------------------------------------------
import molmap
from molmap import dataset
from molmap import loadmap
from molmap import model as molmodel

# Fix the random seeds to get repeatable results.
seed = 123
tqdm.pandas(ascii=True)  # registers DataFrame.progress_apply with an ASCII bar
np.random.seed(seed)
tf.compat.v1.set_random_seed(seed)

def get_pos_weights(trainY):
    """Compute per-task class-balancing weights from a label matrix.

    Labels equal to 1 are counted as positives and labels equal to 0 as
    negatives, column by column; any other value (for example the -1 used
    as a missing-label mask) is ignored by both counts.

    Parameters
    ----------
    trainY : array-like
        Label matrix; it is wrapped in a ``pandas.DataFrame`` and counted
        along axis 0 (one result per column).

    Returns
    -------
    pos_weights : numpy.ndarray
        ``neg_n / pos_n`` for every column.
    neg_weights : numpy.ndarray
        ``pos_n / neg_n`` for every column.

    Notes
    -----
    A column without positives (or without negatives) yields ``inf`` or
    ``nan`` in the corresponding entry, because the division is unguarded.
    """
    labels = pd.DataFrame(trainY)
    n_pos = labels.eq(1).sum(axis=0)
    n_neg = labels.eq(0).sum(axis=0)
    return n_neg.div(n_pos).values, n_pos.div(n_neg).values

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Load the two saved map objects from the parent directory: one built on
# molecular descriptors, one on fingerprints (per the file names). They are
# used below to transform SMILES strings into feature arrays.
mp1 = molmap.loadmap('../descriptor.mp')
mp2 = molmap.loadmap('../fingerprint.mp')

In [3]:
task_name = 'HIV'
from chembench import load_data
# `induces` is a sequence of (train_idx, valid_idx, test_idx) index triples,
# one per data split.
df, induces = load_data(task_name)


# Missing labels are replaced by MASK so the loss can skip them.
MASK = -1
smiles_col = df.columns[0]   # first column holds the SMILES strings
values_col = df.columns[1:]  # every remaining column is a task label
Y = df[values_col].astype('float').fillna(MASK).values
# Defensive: guarantee a 2-D (n_samples, n_tasks) label matrix. The previous
# test `Y.shape[1] == 0` could never detect a 1-D array (it would raise
# IndexError on one), so check the number of dimensions instead.
if Y.ndim == 1:
    Y = Y.reshape(-1, 1)


def _load_or_transform(mp, smiles, cache_path, n_jobs=8):
    """Return the feature array for `smiles`, cached on disk at `cache_path`.

    If `cache_path` exists it is loaded with joblib; otherwise the features
    are computed with ``mp.batch_transform`` and dumped to `cache_path`.

    NOTE: joblib files are pickles -- only load cache files you created.
    """
    if os.path.exists(cache_path):
        return load(cache_path)
    features = mp.batch_transform(smiles, n_jobs=n_jobs)
    dump(features, cache_path)
    return features


tmp_feature_dir = './tmpignore'
# joblib.dump does not create missing directories; make sure the cache dir exists.
os.makedirs(tmp_feature_dir, exist_ok=True)
X1_name = os.path.join(tmp_feature_dir, 'X1_%s.data' % task_name)
X2_name = os.path.join(tmp_feature_dir, 'X2_%s.data' % task_name)
X1 = _load_or_transform(mp1, df[smiles_col], X1_name)
X2 = _load_or_transform(mp2, df[smiles_col], X2_name)

# A stale cache from a different dataset version would silently misalign
# features and labels; fail loudly instead.
assert len(X1) == len(X2) == len(Y), (
    "cached features do not match the dataset size; delete %s and re-run"
    % tmp_feature_dir
)

# Default to the first split; the training loop re-derives these per split.
train_idx, valid_idx, test_idx = induces[0]


# Per-sample input shapes of the two feature maps (everything after the batch axis).
molmap1_size = X1.shape[1:]
molmap2_size = X2.shape[1:]
trainY = Y[train_idx]
validY = Y[valid_idx]
testY = Y[test_idx]

loading dataset: HIV number of split times: 3


In [4]:
# Training hyper-parameters consumed by the training loop below.
epochs = 800          # upper bound; early stopping normally ends training sooner
patience = 10 # passed as `patience` to the early-stopping callback
dense_layers = [64]
batch_size = 128
lr = 1e-4
# Only recorded in the results table; the Adam optimizer used below is not
# given a weight-decay term.
weight_decay = 0
metric = 'ROC'        # passed as `metric` to the performance callback
monitor = 'val_loss'  # passed as `criteria` to the early-stopping callback
dense_avf = 'relu'
last_avf = None # no output activation; the comment in the original says the sigmoid is applied in the loss


# Embedding settings for re-fitting the maps.
# NOTE(review): only `method` is read by the fit() calls in the next cell;
# those calls hard-code n_neighbors = 20 and min_dist = 0.20, so the two
# values below are currently unused -- confirm which pair is intended.
method = 'umap'
min_dist = 0.1
n_neighbors = 50


In [5]:
# Load fresh copies of both maps, re-fit their 2-D layout with the chosen
# embedding `method`, and save the re-fitted maps next to this notebook.
# NOTE(review): n_neighbors and min_dist are hard-coded here (20 / 0.20) and
# differ from the `n_neighbors` / `min_dist` variables defined in the
# hyper-parameter cell -- confirm which values are intended.
mp1_opt = molmap.loadmap('../descriptor.mp')
mp1_opt.fit(method = method, n_neighbors = 20, min_dist = 0.20)
mp1_opt.save('./optimized_des.mp')
mp2_opt = molmap.loadmap('../fingerprint.mp')
mp2_opt.fit(method = method, n_neighbors = 20, min_dist = 0.20)
mp2_opt.save('./optimized_fp.mp')

UMAP(a=None, angular_rp_forest=False, b=None, init='spectral',
     learning_rate=1.0, local_connectivity=1.0, metric='precomputed',
     metric_kwds=None, min_dist=0.2, n_components=2, n_epochs=None,
     n_neighbors=20, negative_sample_rate=5, random_state=1,
     repulsion_strength=1.0, set_op_mix_ratio=1.0, spread=1.0,
     target_metric='categorical', target_metric_kwds=None,
     target_n_neighbors=-1, target_weight=0.5, transform_queue_size=4.0,
     transform_seed=42, verbose=2)
Construct fuzzy simplicial set
Mon Feb 17 10:29:07 2020 Finding Nearest Neighbors
Mon Feb 17 10:29:07 2020 Finished Nearest Neighbor Search
Mon Feb 17 10:29:09 2020 Construct embedding
	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Mon Feb 17 10:29

['./optimized_fp.mp']

In [6]:
# Re-map the already computed feature arrays from the original layout (mp1 /
# mp2) to the layout of the re-fitted maps, so the SMILES do not have to be
# featurised again. (Presumably a pure pixel re-ordering -- verify in molmap.)
X1_new = mp1.rearrangement(X1, mp1_opt)
X2_new = mp2.rearrangement(X2, mp2_opt)

100%|##########| 41127/41127 [00:15<00:00, 2590.75it/s]
100%|##########| 41127/41127 [00:10<00:00, 4013.53it/s]


In [7]:
def _make_masked_loss(pos_weights, mask=MASK):
    """Build a Keras loss bound to one split's positive-class weights.

    The weights are captured as a closure argument (early binding), and the
    missing-label marker comes from the notebook-wide MASK constant instead
    of a second hard-coded -1.
    """
    def masked_weighted_cross_entropy(y_true, y_pred):
        return molmodel.loss.weighted_cross_entropy(y_true, y_pred, pos_weights, MASK = mask)
    return masked_weighted_cross_entropy


# Train and evaluate one model per data split; one summary dict per split.
results = []
for train_idx, valid_idx, test_idx in induces:

    print(len(train_idx), len(valid_idx), len(test_idx))

    trainY = Y[train_idx]
    validY = Y[valid_idx]
    testY = Y[test_idx]

    # Two-input model: (descriptor map, fingerprint map) in the re-fitted layout.
    trainX = (X1_new[train_idx], X2_new[train_idx])
    validX = (X1_new[valid_idx], X2_new[valid_idx])
    testX = (X1_new[test_idx], X2_new[test_idx])

    # Class weights are computed on the training labels of this split only.
    pos_weights, _ = get_pos_weights(trainY)
    loss = _make_masked_loss(pos_weights)

    model = molmodel.net.DoublePathNet(molmap1_size, molmap2_size,
                                       n_outputs=Y.shape[-1],
                                       dense_layers=dense_layers,
                                       dense_avf=dense_avf,
                                       last_avf=last_avf)

    # `learning_rate` replaces the deprecated `lr` alias; the legacy
    # `decay=0.0` argument was a no-op (0.0 is the default) and is dropped.
    opt = tf.keras.optimizers.Adam(learning_rate=lr, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
    model.compile(optimizer=opt, loss=loss)

    # Callback that performs early stopping on `monitor`, and exposes
    # `best_epoch` and `evaluate()` after training.
    performance = molmodel.cbks.CLA_EarlyStoppingAndPerformance((trainX, trainY),
                                                                (validX, validY),
                                                                patience=patience,
                                                                criteria=monitor,
                                                                metric=metric,
                                                                )
    model.fit(trainX, trainY, batch_size=batch_size,
              epochs=epochs, verbose=0, shuffle=True,
              validation_data=(validX, validY),
              callbacks=[performance])

    best_epoch = performance.best_epoch
    trainable_params = model.count_params()

    # Per-task scores; averaged with nanmean below so undefined tasks are skipped.
    train_aucs = performance.evaluate(trainX, trainY)
    valid_aucs = performance.evaluate(validX, validY)
    test_aucs = performance.evaluate(testX, testY)

    final_res = {
        'task_name': task_name,
        'train_auc': np.nanmean(train_aucs),
        'valid_auc': np.nanmean(valid_aucs),
        'test_auc': np.nanmean(test_aucs),
        'metric': metric,
        '# trainable params': trainable_params,
        'best_epoch': best_epoch,
        'batch_size': batch_size,
        'lr': lr,
        'weight_decay': weight_decay,
    }

    results.append(final_res)
    print(final_res)

32901 4113 4113
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
epoch: 0001, loss: 1.2444 - val_loss: 0.8228; auc: 0.7508 - val_auc: 0.7791                                                                                                    
epoch: 0002, loss: 1.1273 - val_loss: 0.7835; auc: 0.8001 - val_auc: 0.7986                                                                                                    
epoch: 0003, loss: 1.0536 - val_loss: 0.8802; auc: 0.8307 - val_auc: 0.8172                                                                                                    
epoch: 0004, loss: 0.9930 - val_loss: 0.7086; auc: 0.8590 - val_auc: 0.8265                                                                                                    
epoch: 0005, loss: 0.9402 - val_loss: 0.7092; auc: 0.8755 - val_auc: 0.8326                                                                                       

In [8]:
# One row per data split, built from the per-split summary dicts.
pd.DataFrame(results)

Unnamed: 0,task_name,train_auc,valid_auc,test_auc,metric,# trainable params,best_epoch,batch_size,lr,weight_decay
0,HIV,0.95065,0.84608,0.775878,ROC,692769,10,128,0.0001,0
1,HIV,0.966456,0.843112,0.767738,ROC,692769,11,128,0.0001,0
2,HIV,0.917441,0.836992,0.773597,ROC,692769,7,128,0.0001,0


In [9]:
# Persist the per-split results next to the notebook. The DataFrame index is
# written as an unnamed first column (pandas' default for to_csv).
pd.DataFrame(results).to_csv('./%s_optimized.csv' % task_name)

In [10]:
# Headline number: test-set score averaged over all data splits.
pd.DataFrame(results).test_auc.mean()

0.7724045140565351