#### Import Dependencies

In [1]:
import tensorflow as tf
import numpy as np
from datetime import datetime
from data_utils import *

  from ._conv import register_converters as _register_converters


#### Load Data

In [2]:
# Load the three arrays that the split cell consumes, then report their shapes.
train_valid_ligids = np.load('../data/PHARM_TRAIN_X.npy')
train_valid_smiles = np.load('../data/PHARM_TRAIN_SMILES.npy')
train_valid_scores = np.load('../data/Y_train.npy')

# One line per array; the format strings are kept verbatim so the cell output
# is unchanged.
for shape_template, loaded_array in (
        ('train_valid_ligids shape: {}', train_valid_ligids),
        ('train_valid_smiles shape: {}', train_valid_smiles),
        ('train_valid_scores shape: {}', train_valid_scores)):
    print(shape_template.format(loaded_array.shape))

train_valid_ligids shape: (13246, 9)
train_valid_smiles shape: (46005, 167)
train_valid_scores shape: (609382230,)


In [27]:
# Which ligid columns have a column sum above 10000, and what are those sums?
ligid_column_totals = np.sum(train_valid_ligids, 0)
frequent_ligid_columns = ligid_column_totals > 10000
print(frequent_ligid_columns)
print(np.sum(train_valid_ligids[:, frequent_ligid_columns], axis=0))

# Same question for the smiles columns, with a threshold of 1000.
frequent_smiles_columns = np.sum(train_valid_smiles, 0) > 1000
print(np.sum(train_valid_smiles[:, frequent_smiles_columns], axis=0))

[False  True  True False False False  True  True False]
[10463 20780 12156 19674]
[ 1677  2715  2003  3184  3425  6630  4831 10623  2098  7199  4300  1538
  5228 18664  3747  9782  6815  8306  3856 12138  3859  4785  3935  4013
 14435  4937 26484  4925  4184  4808  1319  2348  7465  3974  8041 24657
 18848 24703 23932 10672 21497 26935  9191 25842 21037 11452 11655  6227
 20427 23029 27226 16426 14635 23262 27921 21562 25521  1818 32340 15434
  6898  4958 23509 16294 14395 12837 15108 13597 30553 33591 10571 15434
  8484 12535 13601 32509 27795 31070 36723 30346  8939 15888 28080 14942
 18711 23632 25118  6834 20427 22252 21012 12857 17831 15115 39680 25328
  8422  7284 11334 38983 18711 26128 25611 17372 28807 32310 22599 27398
 36847 21646 37288 33464 34426 43503 26812 43446 31983 35565 44886 43395
 43068 42741 45587]


#### Train / Validation Split

In [3]:
# Split the loaded arrays into a training set and a validation set
# (no shuffling), holding out 3046 ligids and 10581 smiles for validation.
train_data, validation_data = train_validation_split(
    train_valid_ligids,
    train_valid_smiles,
    train_valid_scores,
    num_val_lig=3046,
    num_val_smi=10581,
    shuffle=False)

# The raw arrays are no longer needed once the split objects exist.
del train_valid_ligids, train_valid_smiles, train_valid_scores

# Balance Dataset
# Collapse the ten raw score values into five classes, in place.
# The remap is applied one source value at a time in ascending order; every
# target is <= its source, so a value that has already been rewritten is never
# picked up again by a later iteration.
map_from = [0,1,2,3,4,5,6,7,8,9]
map_to =   [0,0,0,0,1,2,3,4,4,4]

for dataset in (train_data, validation_data):
    for old_label, new_label in zip(map_from, map_to):
        dataset.scores[dataset.scores == old_label] = new_label

num validation ligids: 3046
num train ligids: 10200
num validation smiles: 10581
num train smiles: 35424
num validation labels: 32229726
num train labels: 361324800


#### Define Model

In [22]:
# Hyperparameters
LEARNING_RATE = .0005
LAMBDA = .000                 # L2 scale; 0 disables the regularizer
DROPOUT = .5                  # NOTE(review): defined but not used in the graph below
NUM_EPOCHS = 10
VALIDATIONS_PER_EPOCH = 5
TRAINING_BATCH_SIZE = 1
USE_PERCENT_DATA = .05
VALIDATION_BATCH_SIZE = 10000

# Model
L1_UNITS = 15
L2_UNITS = 15
NUM_OUTPUTS = 1

# Start from an empty graph so re-running this cell does not duplicate ops.
tf.reset_default_graph()

relu = tf.nn.relu
elu = tf.nn.elu
xavier_init = tf.contrib.layers.xavier_initializer()
zero_init = tf.zeros_initializer()
l2_reg = tf.contrib.layers.l2_regularizer(scale=LAMBDA)

with tf.name_scope('inputs'):
    smiles = tf.placeholder(shape=(None, 167), dtype=tf.float32, name='smiles')
    ligids = tf.placeholder(shape=(None, 9), dtype=tf.float32, name='ligids')
    # Shape is left unconstrained; the loss below reshapes the labels
    # explicitly so that they line up with the predictions.
    Y = tf.placeholder(shape=None, dtype=tf.float32, name='score')
    training = tf.placeholder_with_default(input=False, shape=(), name='training')

with tf.name_scope('hidden_layers'):
    # One dense projection per input; each layer is named after the input it
    # actually consumes. NOTE: the original cell had the two names swapped,
    # so variable names in checkpoints written by the old graph differ.
    ligid_layer1 = tf.layers.dense(inputs=ligids,
                                   units=L1_UNITS,
                                   activation=relu,
                                   kernel_initializer=xavier_init,
                                   bias_initializer=zero_init,
                                   kernel_regularizer=l2_reg,
                                   bias_regularizer=l2_reg,
                                   name='ligid_layer1')
    smiles_layer1 = tf.layers.dense(inputs=smiles,
                                    units=L1_UNITS,
                                    activation=relu,
                                    kernel_initializer=xavier_init,
                                    bias_initializer=zero_init,
                                    kernel_regularizer=l2_reg,
                                    bias_regularizer=l2_reg,
                                    name='smiles_layer1')
    # The two projections are merged by element-wise addition.
    layer1 = tf.add(ligid_layer1, smiles_layer1, name='layer1')
    layer2 = tf.layers.dense(inputs=layer1,
                             units=L2_UNITS,
                             activation=relu,
                             kernel_initializer=xavier_init,
                             bias_initializer=zero_init,
                             kernel_regularizer=l2_reg,
                             bias_regularizer=l2_reg,
                             name='layer2')

with tf.name_scope('predicted_score'):
    pred_score = tf.layers.dense(inputs=layer2,
                                 units=NUM_OUTPUTS)
    # Floor the output to an integer-valued score, but let gradients pass
    # through the floor op as if it were the identity.
    with tf.get_default_graph().gradient_override_map({"Floor": "Identity"}):
        pred_score = tf.floor(pred_score, name='pred_score')

with tf.name_scope('train'):
    # pred_score has shape (batch, NUM_OUTPUTS). Feeding 1-D labels of shape
    # (batch,) straight into the losses would broadcast the difference to a
    # (batch, batch) matrix and average over every prediction/label pair.
    # Reshape the labels so the difference is taken element-wise.
    labels = tf.reshape(Y, (-1, NUM_OUTPUTS), name='labels')
    mae = tf.losses.absolute_difference(labels, pred_score)
    reg_loss = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
    reg_loss = tf.reduce_sum(reg_loss)
    # MSE is tracked for reporting only; the optimised loss is MAE + L2.
    mse_loss = tf.losses.mean_squared_error(labels=labels, predictions=pred_score)
    loss = mae+reg_loss
    optimizer = tf.train.AdamOptimizer(LEARNING_RATE)
    train_op = optimizer.minimize(loss)

INFO:tensorflow:Scale of 0 disables regularizer.


#### Run Model

In [39]:
# Misc. constants
# NOTE: training below is a single-batch overfitting sanity check: the same
# three samples are fed STEPS_PER_EPOCH times in every epoch. The full-data
# batch count is kept (commented out) for when real training is restored.
num_training_batches = 1#int(train_data.num_scores/TRAINING_BATCH_SIZE*USE_PERCENT_DATA)
num_validation_batches = int(validation_data.num_scores/VALIDATION_BATCH_SIZE)
STEPS_PER_EPOCH = 100000
TB_PERIOD = 1000
TRAIN_PERIOD = 100
VALIDATION_PERIOD = 100

# Saver - Prediction Tensors
# Clear first so re-running this cell does not append duplicates.
tf.get_collection_ref('pred_ops').clear()
for pred_tensor in (ligids, smiles, Y, pred_score):
    tf.add_to_collection('pred_ops', pred_tensor)
# Saver - Training Tensors
tf.get_collection_ref('train_ops').clear()
for train_tensor in (ligids, smiles, Y, loss, train_op):
    tf.add_to_collection('train_ops', train_tensor)
saver = tf.train.Saver(max_to_keep=1000)

# Tensorboard - Graph
time_now = datetime.utcnow().strftime('%Y%m%d%H%M%S')
tensorboard_logdir = '../tf_log/run-{}'.format(time_now)
print('tensorboard log_dir: {}'.format(tensorboard_logdir))
writer = tf.summary.FileWriter(tensorboard_logdir)
writer.add_graph(tf.get_default_graph())
# Tensorboard - Summaries
summaries = [tf.summary.scalar('mean_abs_error', mae),
             tf.summary.scalar('mean_square_error', mse_loss),
             tf.summary.scalar('regularization', reg_loss),
             tf.summary.scalar('total_loss', loss),
             tf.summary.histogram('ligids',ligids),
             tf.summary.histogram('smiles',smiles),
             tf.summary.histogram('layer1',layer1),
             tf.summary.histogram('layer2',layer2),
             tf.summary.histogram('predictions',pred_score),
             tf.summary.histogram('ground_truth',Y)]
# Evaluate all summaries with a single session call instead of one per op.
merged_summary = tf.summary.merge(summaries)

# Start Session
try:
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(NUM_EPOCHS):
            # Train Model
            # Fetch four samples and keep rows 1..3 as the fixed training batch.
            train_data.reset()
            ligids_batch, smiles_batch, scores_batch = train_data.next_batch(4)
            ligids_batch = ligids_batch[1:4,:]
            smiles_batch = smiles_batch[1:4,:]
            scores_batch = scores_batch[1:4]
            print('ligids_batch: {}'.format(ligids_batch))
            print('smiles_batch: {}'.format(smiles_batch))
            print('scores_batch: {}'.format(scores_batch))
            train_feed = {ligids: ligids_batch,
                          smiles: smiles_batch,
                          Y: scores_batch}
            for step in range(STEPS_PER_EPOCH):
                _, train_loss = sess.run([train_op, loss], feed_dict=train_feed)
                if step%TRAIN_PERIOD == 0:
                    print('{}/{} train_loss_batch: {:.3f}'.format(step,
                                                                  STEPS_PER_EPOCH,
                                                                  train_loss), end='\r')
                # Tensorboard
                if step%TB_PERIOD == 0:
                    summary_proto = sess.run(merged_summary, feed_dict=train_feed)
                    # Offset by the real number of steps per epoch so the
                    # step axis keeps increasing across epochs.
                    global_step = step+epoch*STEPS_PER_EPOCH
                    writer.add_summary(summary_proto, global_step)

            # Validation
            mae_batches = []
            validation_data.reset()
            for step in range(num_validation_batches):
                ligids_batch, smiles_batch, scores_batch = validation_data.next_batch(VALIDATION_BATCH_SIZE)
                mae_batch = sess.run(mae, feed_dict={ligids:ligids_batch,
                                                     smiles:smiles_batch,
                                                     Y:scores_batch})
                mae_batches.append(mae_batch)
                if step%VALIDATION_PERIOD == 0:
                    print('{}/{} mae_batch: {}'.format(step,
                                                       num_validation_batches,
                                                       mae_batch), end='\r')
            # Mean of per-batch MAE; NaN when there was no full validation batch.
            if mae_batches:
                mae_value = sum(mae_batches)/len(mae_batches)
            else:
                mae_value = float('nan')
            print('EPOCH: {:<10} | mae: {:<20}'.format(epoch+1, mae_value))

            # Save Model w/ name: mae{validation mae}_e{epoch index, 0-based}
            saver_filename = 'mae{}_e{}'.format(mae_value, epoch)
            saver.save(sess, '../models/{}/{}'.format(time_now, saver_filename))
finally:
    # Flush and release the event file even when the run is interrupted.
    writer.close()

tensorboard log_dir: ../tf_log/run-20180523224526
ligids_batch: [[0 0 4 0 0 0 0 1 0]
 [0 0 4 0 0 0 0 1 0]
 [0 0 4 0 0 0 0 1 0]]
smiles_batch: [[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
  0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
  0 0 0 0 0 1 1 1 1 0 1 0 1 0 0 0 0 0 1 0 1 0 0 1 0 1 1 0 1 1 0 0 0 1 1 0
  0 1 1 1 0 1 0 0 0 1 0 1 1 1 0 0 0 0 0 1 0 0 0 1 0 1 0 1 0 1 1 1 0 0 1 1
  1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
  0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 1 0 0 1 1 0 1
  0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 1 0 1 0 0 0 0 0 0 1 1 1 1 0 1 0 0 0 0 1 0
  0 1 0 1 0 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 1 

KeyboardInterrupt: 