#### Import Dependencies

In [1]:
%matplotlib notebook

import tensorflow as tf
import numpy as np
from datetime import datetime
from data_utils import *
import matplotlib.pyplot as plt

  from ._conv import register_converters as _register_converters


#### Load Data

In [3]:
# Read the three training arrays from ../data. Per the output recorded below,
# there is one score per (ligid row, smiles row) pair:
# 13246 * 46005 == 609382230.
train_valid_ligids = np.load('../data/PHARM_TRAIN_X.npy')
train_valid_smiles = np.load('../data/PHARM_TRAIN_SMILES.npy')
train_valid_scores = np.load('../data/Y_train.npy')

# Report array shapes as a quick sanity check of what was loaded.
print('train_valid_ligids shape: {}'.format(np.shape(train_valid_ligids)))
print('train_valid_smiles shape: {}'.format(np.shape(train_valid_smiles)))
print('train_valid_scores shape: {}'.format(np.shape(train_valid_scores)))

train_valid_ligids shape: (13246, 9)
train_valid_smiles shape: (46005, 167)
train_valid_scores shape: (609382230,)


#### Train / Validation Split

In [4]:
# Hold out 3046 ligid rows and 10581 smiles rows for validation; the helper
# (from data_utils) returns one dataset object per split.
train_data, validation_data = train_validation_split(
    train_valid_ligids,
    train_valid_smiles,
    train_valid_scores,
    num_val_lig=3046,
    num_val_smi=10581,
)

# Drop the notebook-level names for the unsplit arrays; only the two split
# objects are used from here on.
del train_valid_ligids
del train_valid_smiles
del train_valid_scores

num validation ligids: 3046
num train ligids: 10200
num validation smiles: 10581
num train smiles: 35424
num validation labels: 32229726
num train labels: 361324800


In [5]:
# Fraction of labels per score value, train split first, then validation.
for _split in (train_data, validation_data):
    _label_counts = np.bincount(_split.scores)
    print(_label_counts / _split.num_scores)

[0.00122935 0.00457153 0.02121999 0.08647217 0.22505868 0.22536001
 0.1995506  0.15464038 0.02262394 0.05927335]
[0.00139948 0.00484937 0.0219202  0.08746879 0.22459654 0.22516052
 0.19882344 0.15418052 0.02242222 0.05917891]


In [6]:
# Balance Dataset
# Merge the rare score values into their neighbours so the class distribution
# is flatter: scores 0-3 -> class 0, 4 -> 1, 5 -> 2, 6 -> 3, 7-9 -> class 4.
map_from = [0,1,2,3,4,5,6,7,8,9]
map_to =   [0,0,0,0,1,2,3,4,4,4]

# Apply the mapping with a lookup table in a single pass per split.
# Remapping value-by-value in place is only correct while every target has
# already been processed (target <= source, ascending order); the lookup table
# always maps from the ORIGINAL value, so it is independent of ordering.
# Values not listed in map_from (but <= max(map_from)) are left unchanged;
# a score above max(map_from) raises IndexError instead of passing silently.
# NOTE: this rewrites the labels in place, so the cell is not idempotent --
# re-running it without re-creating train_data/validation_data remaps the
# already-remapped classes again.
for _dataset in (train_data, validation_data):
    _lookup = np.arange(max(map_from) + 1, dtype=_dataset.scores.dtype)
    _lookup[map_from] = map_to
    _dataset.scores[:] = _lookup[_dataset.scores]

In [7]:
# Per-class label fractions for each split (train, then validation), to check
# the distribution after the class remap.
for _split in (train_data, validation_data):
    _class_fractions = np.bincount(_split.scores) / _split.num_scores
    print(_class_fractions)

[0.11349304 0.22505868 0.22536001 0.1995506  0.23653767]
[0.11563784 0.22459654 0.22516052 0.19882344 0.23578165]


#### Define Model

In [10]:
# Hyper Parameters
LEARNING_RATE = .0001
LAMBDA = .001                   # scale of the L2 penalty on the hidden layers
DROPOUT = .5                    # NOTE(review): not used by the graph below
NUM_EPOCHS = 5
VALIDATIONS_PER_EPOCH = 5       # NOTE(review): not referenced in the visible cells
TRAINING_BATCH_SIZE = 1000
USE_PERCENT_DATA = .05          # a fraction (0.05 == 5%), despite the name
VALIDATION_BATCH_SIZE = 10000

# Model
L1_UNITS = 20
L2_UNITS = 20
NUM_OUTPUTS = 5                 # one logit per merged score class (0..4)

# Start from an empty graph so re-running this cell does not duplicate ops.
tf.reset_default_graph()

relu = tf.nn.relu               # NOTE(review): unused; both layers use elu
elu = tf.nn.elu
xavier_init = tf.contrib.layers.xavier_initializer()
zero_init = tf.zeros_initializer()
l2_reg = tf.contrib.layers.l2_regularizer(scale=LAMBDA)

with tf.name_scope('inputs') as scope:
    # 176 input columns: the run cell concatenates a ligids batch and a smiles
    # batch along axis 1 (9 + 167 columns per the shapes printed at load time).
    X = tf.placeholder(shape=(None, 176), dtype=tf.float32, name='ligids_smiles')
    # `(None)` is just `None`, i.e. the label placeholder has an unspecified shape.
    Y = tf.placeholder(shape=(None), dtype=tf.int32, name='score')
    # NOTE(review): `training` is declared but nothing in the graph consumes it
    # (no dropout layer is wired in) -- TODO confirm whether dropout was intended.
    training = tf.placeholder_with_default(input=False, shape=(), name='training')
    
with tf.name_scope('hidden_layers') as scope:
    layer1 = tf.layers.dense(inputs=X,
                             units=L1_UNITS, 
                             activation=elu,
                             kernel_initializer=xavier_init,
                             bias_initializer=zero_init,
                             kernel_regularizer=l2_reg,
                             bias_regularizer=l2_reg,
                             name='layer1')
    layer2 = tf.layers.dense(inputs=layer1, 
                             units=L2_UNITS, 
                             activation=elu,
                             kernel_initializer=xavier_init,
                             bias_initializer=zero_init,
                             kernel_regularizer=l2_reg,
                             bias_regularizer=l2_reg,
                             name='layer2')
    
with tf.name_scope('predicted_score') as scope:
    # Fix: feed the output layer from layer2. It was previously fed from layer1,
    # which left layer2 disconnected from the prediction while its weights were
    # still being penalised through reg_loss.
    logits = tf.layers.dense(inputs=layer2,
                             units=NUM_OUTPUTS,
                             name='logits')
    pred_score = tf.nn.softmax(logits=logits, axis=1)
    # Predicted class index = position of the largest probability.
    pred_score = tf.argmax(pred_score, axis=1, name='Y_hat')
        
with tf.name_scope('train') as scope:
    # Mean absolute difference between true and predicted class index;
    # reported as a metric only, it is not part of the optimised loss.
    mae = tf.losses.absolute_difference(Y,pred_score)
    # Sum of the L2 penalties registered by the regularised dense layers above.
    reg_loss = tf.reduce_sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
    # With its default reduction the cross-entropy loss is already a scalar,
    # so the surrounding reduce_sum leaves the value unchanged.
    xent_loss = tf.reduce_sum(tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=Y))
    loss = xent_loss+reg_loss
    optimizer = tf.train.AdamOptimizer(LEARNING_RATE)
    train_op = optimizer.minimize(loss)

#### Run Model

In [None]:
# Train for NUM_EPOCHS epochs; after each epoch evaluate mean absolute error on
# the validation split and save a checkpoint named after that error.

# Misc. constants
# USE_PERCENT_DATA is a fraction: only that share of the available training
# batches is consumed per epoch. Partial trailing batches are dropped by int().
num_training_batches = int(train_data.num_scores/TRAINING_BATCH_SIZE*USE_PERCENT_DATA)
num_validation_batches = int(validation_data.num_scores/VALIDATION_BATCH_SIZE)
TB_PERIOD = 1000          # write TensorBoard summaries every N training steps
TRAIN_PERIOD = 100        # print training loss every N training steps
VALIDATION_PERIOD = 100   # print validation progress every N validation steps

# Saver - Prediction Tensors
# Collections are cleared first so re-running this cell does not append
# duplicate entries to the same graph.
tf.get_collection_ref('pred_ops').clear()
tf.add_to_collection('pred_ops', X)
tf.add_to_collection('pred_ops', Y)
tf.add_to_collection('pred_ops', pred_score)
# Saver - Training Tensors
tf.get_collection_ref('train_ops').clear()
tf.add_to_collection('train_ops', X)
tf.add_to_collection('train_ops', Y)
tf.add_to_collection('train_ops', mae)
tf.add_to_collection('train_ops', loss)
tf.add_to_collection('train_ops', train_op)
saver = tf.train.Saver(max_to_keep=1000)

# Tensorboard - Graph
time_now = datetime.utcnow().strftime('%Y%m%d%H%M%S')
tensorboard_logdir = '../tf_log/run-{}'.format(time_now)
print('tensorboard log_dir: {}'.format(tensorboard_logdir))
writer = tf.summary.FileWriter(tensorboard_logdir)
writer.add_graph(tf.get_default_graph())
# Tensorboard - Summaries
summaries = [tf.summary.scalar('mean_abs_error', mae), 
             tf.summary.scalar('xent_error', xent_loss), 
             tf.summary.scalar('regularization', reg_loss), 
             tf.summary.scalar('total_loss', loss),
             tf.summary.histogram('input',X),
             tf.summary.histogram('layer1',layer1),
             tf.summary.histogram('layer2',layer2),
             tf.summary.histogram('predictions',pred_score),
             tf.summary.histogram('ground_truth',Y)]
# One merged op lets all summaries be evaluated in a single sess.run call
# instead of one forward pass per summary.
merged_summaries = tf.summary.merge(summaries)

# Start Session
# try/finally guarantees the event file is flushed and closed even when the
# run fails or is interrupted from the notebook.
try:
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(NUM_EPOCHS):       
            # Train Model
            train_data.reset()
            # NOTE(review): train_data.shuffle() was left disabled here, so
            # batch order is whatever reset()/next_batch() yield -- TODO confirm.
            for step in range(num_training_batches):
                ligids_batch, smiles_batch, scores_batch = train_data.next_batch(TRAINING_BATCH_SIZE)
                # Model input is the ligid features followed by the smiles features.
                lig_smi_batch = np.concatenate((ligids_batch,smiles_batch), axis=1)
                batch_feed = {X:lig_smi_batch, Y:scores_batch}
                _, train_loss = sess.run([train_op, loss], feed_dict=batch_feed)
                if step%TRAIN_PERIOD == 0:
                    print('{}/{} train_loss_batch: {:.3f}'.format(step, 
                                                                  num_training_batches, 
                                                                  train_loss), end='\r')
                # Tensorboard
                if step%TB_PERIOD == 0:
                    # Step index that keeps increasing across epochs.
                    global_step = step+epoch*num_training_batches
                    summary_proto = sess.run(merged_summaries, feed_dict=batch_feed)
                    writer.add_summary(summary_proto, global_step)
                    
            # Validation
            mae_value = []
            validation_data.reset()
            for step in range(num_validation_batches): 
                ligids_batch, smiles_batch, scores_batch = validation_data.next_batch(VALIDATION_BATCH_SIZE)
                lig_smi_batch = np.concatenate((ligids_batch, smiles_batch), axis=1)
                mae_batch = sess.run(mae, feed_dict={X:lig_smi_batch, Y:scores_batch})
                mae_value.append(mae_batch)
                if step%VALIDATION_PERIOD == 0:
                    print('{}/{} mae_batch: {}'.format(step, 
                                                       num_validation_batches, 
                                                       mae_batch), end='\r')
            # Unweighted mean of the per-batch MAE values (all batches are requested
            # with the same size).
            mae_value = sum(mae_value)/len(mae_value)
            print('EPOCH: {:<10} | mae: {:<20}'.format(epoch+1, mae_value))
            
            # Save Model w/ name: mae{mean_abs_err}_e{epoch number}
            saver_filename = 'mae{}_e{}'.format(mae_value, epoch+1)        
            saver.save(sess, '../models/{}/{}'.format(time_now, saver_filename))
finally:
    writer.close()

tensorboard log_dir: ../tf_log/run-20180523191807
7000/72264 train_loss_batch: 1.575