#### Import Dependencies

In [None]:
import tensorflow as tf
import numpy as np

#### data_utils.py

In [None]:
class Data:
    """Sequential mini-batch iterator over every (ligand, smile) pair.

    The dataset is the Cartesian product of the rows of ``ligids`` and the
    rows of ``smiles``, laid out ligand-major: flat sample ``i`` pairs
    ``ligids[i // num_smiles]`` with ``smiles[i % num_smiles]`` and is
    labelled by ``scores[i]``.

    Args:
        ligids: array with one row per ligand.
        smiles: array with one row per smile.
        scores: array whose first axis has ``num_ligids * num_smiles`` entries.
        autoreset: when True, ``next_batch`` rewinds to the first sample
            instead of raising once the dataset has been exhausted.

    Raises:
        ValueError: if the number of scores does not equal the number of
            ligands times the number of smiles.
    """

    def __init__(self, ligids, smiles, scores, autoreset=False):
        self.ligids = ligids
        self.smiles = smiles
        self.scores = scores
        self.autoreset = autoreset
        self.num_ligids = int(ligids.shape[0])
        self.num_smiles = int(smiles.shape[0])
        self.num_scores = int(scores.shape[0])
        self.batch_index = 0  # Flat index of the next sample to be served

        # Explicit raise rather than assert: asserts vanish under `python -O`.
        if self.num_ligids * self.num_smiles != self.num_scores:
            raise ValueError(
                'number of ligids times number of smiles must equal number of scores')

    def next_batch(self, batch_size):
        """Return the next ``batch_size`` consecutive samples.

        The batch may span any number of ligand boundaries. A batch that
        would run past the end of the dataset is truncated to the samples
        that remain.

        Args:
            batch_size: non-negative number of samples requested.

        Returns:
            Tuple ``(ligids_batch, smiles_batch, scores_batch)`` whose first
            axes all have the same length.

        Raises:
            ValueError: if ``batch_size`` is negative.
            IndexError: if the dataset is exhausted and ``autoreset`` is False.
        """
        if batch_size < 0:
            raise ValueError('batch size must be non-negative')

        if self.batch_index >= self.num_scores:
            if not self.autoreset:
                raise IndexError(
                    'batch index out of bound, try doing Data.reset() '
                    'after stepping through the entire dataset')
            self.reset()

        start = self.batch_index
        stop = min(start + batch_size, self.num_scores)

        # Decompose each flat sample index into its (ligand row, smile row).
        flat_idx = np.arange(start, stop)
        ligids_batch = self.ligids[flat_idx // self.num_smiles]
        smiles_batch = self.smiles[flat_idx % self.num_smiles]
        scores_batch = self.scores[start:stop]

        self.batch_index = stop
        return ligids_batch, smiles_batch, scores_batch

    def full_batch(self):
        """Not implemented; always raises NotImplementedError."""
        raise NotImplementedError('full_batch not implemented')

    def random_batch(self, batch_size):
        """Not implemented; always raises NotImplementedError."""
        raise NotImplementedError('random_batch not implemented')

    def shuffle(self):
        """Not implemented; always raises NotImplementedError."""
        raise NotImplementedError('shuffle not implemented')

    def reset(self, shuffle=False):
        """Rewind to the first sample; ``shuffle=True`` also calls ``shuffle()``."""
        self.batch_index = 0
        if shuffle:
            self.shuffle()
            
########################################################################################

def train_validation_split(ligids, smiles, labels, num_val_lig=3046, num_val_smi=10581):
    """Split a ligand x smile dataset into training and validation ``Data``.

    The last ``num_val_lig`` rows of ``ligids`` and the last ``num_val_smi``
    rows of ``smiles`` are held out. Training data pairs only training
    ligands with training smiles, and validation data pairs only held-out
    ligands with held-out smiles; labels of mixed pairs are dropped.

    ``labels`` is read ligand-major: entry ``i`` belongs to ligand
    ``i // smiles.shape[0]`` and smile ``i % smiles.shape[0]``.

    Example usage:
        train_data, validation_data = train_validation_split(train_valid_ligids,
                                                             train_valid_smiles,
                                                             train_valid_scores,
                                                             num_val_lig=3046, 
                                                             num_val_smi=10581)

    Returns:
        Tuple ``(train_data, validation_data)`` of ``Data`` objects.

    Raises:
        ValueError: if a hold-out count is outside ``[0, number of rows]`` or
            the number of labels is not ligands times smiles.
    """
    num_lig = ligids.shape[0]
    num_smi = smiles.shape[0]
    if not 0 <= num_val_lig <= num_lig:
        raise ValueError('num_val_lig must be between 0 and the number of ligids')
    if not 0 <= num_val_smi <= num_smi:
        raise ValueError('num_val_smi must be between 0 and the number of smiles')
    if labels.shape[0] != num_lig * num_smi:
        raise ValueError(
            'number of ligids times number of smiles must equal number of scores')

    # Train validation split - X data
    num_train_lig = num_lig - num_val_lig
    num_train_smi = num_smi - num_val_smi

    print('num validation ligids: {}'.format(num_val_lig))
    print('num train ligids: {}'.format(num_train_lig))
    print('num validation smiles: {}'.format(num_val_smi))
    print('num train smiles: {}'.format(num_train_smi))

    train_ligids = ligids[:num_train_lig, :]
    train_smiles = smiles[:num_train_smi, :]
    validation_ligids = ligids[num_train_lig:, :]
    validation_smiles = smiles[num_train_smi:, :]

    # Train validation split - Y data
    # View the flat labels as a (ligand, smile) grid, then cut out the
    # train/train and validation/validation corners and flatten them back.
    trailing_shape = tuple(labels.shape[1:])
    label_grid = labels.reshape((num_lig, num_smi) + trailing_shape)
    train_labels = label_grid[:num_train_lig, :num_train_smi].reshape(
        (num_train_lig * num_train_smi,) + trailing_shape)
    validation_labels = label_grid[num_train_lig:, num_train_smi:].reshape(
        (num_val_lig * num_val_smi,) + trailing_shape)
    print('num validation labels: {}'.format(validation_labels.shape[0]))
    print('num train labels: {}'.format(train_labels.shape[0]))

    # Return train and validation datasets
    train_data = Data(train_ligids, train_smiles, train_labels)
    validation_data = Data(validation_ligids, validation_smiles, validation_labels)
    return train_data, validation_data

#### Load Data

In [None]:
# Labelled arrays: these are later divided into training and validation sets.
train_valid_ligids = np.load('../data/PHARM_TRAIN_X.npy')
train_valid_smiles = np.load('../data/PHARM_TRAIN_SMILES.npy')
train_valid_scores = np.load('../data/Y_train.npy')

# Test feature arrays; no score file is loaded for them.
test_ligids = np.load('../data/PHARM_TEST_X.npy')
test_smiles = np.load('../data/PHARM_TEST_SMILES.npy')

# Report the shape of everything that was loaded, one line per array.
print(
    'train_valid_ligids shape: {}'.format(train_valid_ligids.shape),
    'train_valid_smiles shape: {}'.format(train_valid_smiles.shape),
    'train_valid_scores shape: {}'.format(train_valid_scores.shape),
    'test_ligids shape: {}'.format(test_ligids.shape),
    'test_smiles shape: {}'.format(test_smiles.shape),
    sep='\n',
)

#### Train Validation Split

In [None]:
# Hold out the last 3046 ligands and the last 10581 smiles for validation.
train_data, validation_data = train_validation_split(
    train_valid_ligids,
    train_valid_smiles,
    train_valid_scores,
    num_val_lig=3046,
    num_val_smi=10581,
)

# No test scores are loaded; an uninitialised int8 array of the right length
# is passed so that Data's length check passes.
test_data = Data(
    test_ligids,
    test_smiles,
    np.empty(test_ligids.shape[0] * test_smiles.shape[0], dtype=np.int8),
)

# Drop the module-level names for the un-split arrays.
del train_valid_ligids
del train_valid_smiles
del train_valid_scores

#### Define Hyper Parameters

In [None]:
LEARNING_RATE = 1e-3  # step size passed to the Adam optimizer
LAMBDA = 1e-3         # scale of the L2 regularizer on the hidden layers
DROPOUT = 0.5         # NOTE(review): not referenced by the model in this file

#### Define Model

In [None]:
# Width of each hidden layer and of the output layer.
L1_UNITS = 100
L2_UNITS = 100
L3_UNITS = 100
NUM_OUTPUTS = 1

# Start from an empty graph so re-running this cell does not stack models.
tf.reset_default_graph()

relu = tf.nn.relu
xavier_init = tf.contrib.layers.xavier_initializer()
zero_init = tf.zeros_initializer()
l2_reg = tf.contrib.layers.l2_regularizer(scale=LAMBDA)

# Settings shared by all three hidden layers.
hidden_layer_kwargs = dict(activation=relu,
                           kernel_initializer=xavier_init,
                           bias_initializer=zero_init,
                           kernel_regularizer=l2_reg,
                           bias_regularizer=l2_reg)


with tf.name_scope('inputs'):
    # X carries the ligand features concatenated with the smile features.
    X = tf.placeholder(shape=(None, 176), dtype=tf.float32, name='ligids_smiles')
    # `shape=(None)` is just None, i.e. Y has a completely unknown shape.
    Y = tf.placeholder(shape=(None), dtype=tf.float32, name='score')
    # NOTE(review): `training` is not consumed by any op defined in this cell.
    training = tf.placeholder_with_default(input=False, shape=(), name='training')

with tf.name_scope('hidden_layers'):
    layer1 = tf.layers.dense(inputs=X, units=L1_UNITS, name='layer1',
                             **hidden_layer_kwargs)
    layer2 = tf.layers.dense(inputs=layer1, units=L2_UNITS, name='layer2',
                             **hidden_layer_kwargs)
    layer3 = tf.layers.dense(inputs=layer2, units=L3_UNITS, name='layer3',
                             **hidden_layer_kwargs)

with tf.name_scope('predicted_score'):
    pred_score = tf.layers.dense(inputs=layer3,
                                 units=NUM_OUTPUTS)
    # Round predictions down, but back-propagate through the floor as if it
    # were the identity so the layers above it still receive gradients.
    with tf.get_default_graph().gradient_override_map({"Floor": "Identity"}):
        pred_score = tf.floor(pred_score)

with tf.name_scope('train'):
    reg_loss = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
    # pred_score is (batch, NUM_OUTPUTS). Because Y has unknown static shape,
    # feeding 1-D labels would broadcast the squared difference to
    # (batch, batch) and average over every prediction/label pair. Reshaping
    # the labels to a column keeps the comparison element-wise.
    labels_column = tf.reshape(Y, (-1, NUM_OUTPUTS))
    train_loss = tf.losses.mean_squared_error(labels=labels_column,
                                              predictions=pred_score)
    loss = train_loss + tf.reduce_sum(reg_loss)
    optimizer = tf.train.AdamOptimizer(LEARNING_RATE)
    train_op = optimizer.minimize(loss)

#### Run Model

In [None]:
# Misc. constants
NUM_EPOCHS = 5
NUM_SAVES_PER_EPOCH = 5  # NOTE(review): not used below; one save per epoch

# Saver
# The collections are stored with the meta graph so that a later session can
# recover these tensors by collection name after importing it.
tf.add_to_collection('pred_ops', X)
tf.add_to_collection('pred_ops', Y)
tf.add_to_collection('pred_ops', pred_score)
tf.add_to_collection('train_ops', X)
tf.add_to_collection('train_ops', Y)
tf.add_to_collection('train_ops', loss)
tf.add_to_collection('train_ops', train_op)
saver = tf.train.Saver(max_to_keep=1000)

# Metrics
# Mean absolute error; labels are reshaped to the prediction's shape so the
# subtraction is element-wise rather than broadcast across the batch.
with tf.name_scope('metrics'):
    mean_abs_err_op = tf.reduce_mean(
        tf.abs(pred_score - tf.reshape(Y, tf.shape(pred_score))))

# Batches
# Only full batches are used; samples beyond the last full batch are skipped.
TRAINING_BATCH_SIZE = 10000
num_training_batches = train_data.num_scores // TRAINING_BATCH_SIZE
VALIDATION_BATCH_SIZE = 10000
num_validation_batches = validation_data.num_scores // VALIDATION_BATCH_SIZE

# Tensorboard
tensorboard_logdir = '../tf_log/run-1'
print('tensorboard log_dir: {}\n'.format(tensorboard_logdir))
writer = tf.summary.FileWriter(tensorboard_logdir)
writer.add_graph(tf.get_default_graph())
writer.flush()  # nothing else writes to it here, so push the graph to disk now

# Start Session
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(NUM_EPOCHS):
        print('Beginning epoch {}'.format(epoch))

        # Train Model
        train_data.reset()
        for i in range(num_training_batches):
            ligids_batch, smiles_batch, scores_batch = train_data.next_batch(TRAINING_BATCH_SIZE)
            lig_smi_batch = np.concatenate((ligids_batch, smiles_batch), axis=1)
            # Stored under a new name so the `train_loss` graph tensor is not overwritten.
            _, train_loss_value = sess.run([train_op, loss],
                                           feed_dict={X: lig_smi_batch, Y: scores_batch})
            print('{}/{} train_loss: {}'.format(i, num_training_batches, train_loss_value), end='\r')

        # Validation
        validation_data.reset()
        validation_losses = []
        validation_maes = []
        for i in range(num_validation_batches):
            # Compute validation loss batch by batch and average
            ligids_batch, smiles_batch, scores_batch = validation_data.next_batch(VALIDATION_BATCH_SIZE)
            lig_smi_batch = np.concatenate((ligids_batch, smiles_batch), axis=1)
            validation_loss_batch, mean_abs_err_batch = sess.run(
                [loss, mean_abs_err_op],
                feed_dict={X: lig_smi_batch, Y: scores_batch})
            validation_losses.append(validation_loss_batch)
            validation_maes.append(mean_abs_err_batch)
            print('{}/{} validation_loss_batch: {}'.format(i, num_validation_batches, validation_loss_batch), end='\r')

        if validation_losses:
            # All batches have the same size, so a plain mean is unbiased.
            validation_loss = sum(validation_losses) / len(validation_losses)
            mae = sum(validation_maes) / len(validation_maes)
        else:
            # Fewer validation samples than one batch: nothing was evaluated.
            validation_loss = mae = float('nan')
        print('EPOCH: {} | validation loss: {} | mae: {}'.format(epoch, validation_loss, mae))

        # Save Model w/ name: e{epoch number}_l{loss}
        saver_filename = 'e{}_l{}'.format(epoch, validation_loss)
        saver.save(sess, '../models/{}'.format(saver_filename))

#### Make Submission

In [None]:
# Batches
TESTING_BATCH_SIZE = 10000
num_test_batches = test_data.num_scores // TESTING_BATCH_SIZE  # full batches
remaining_samples = test_data.num_scores - num_test_batches * TESTING_BATCH_SIZE
batch_sizes = [TESTING_BATCH_SIZE] * num_test_batches
if remaining_samples > 0:
    # Only request the final partial batch when there is one; asking the
    # exhausted dataset for a zero-sized batch raises.
    batch_sizes.append(remaining_samples)

# Loader
# Checkpoint to load, named e{epoch}_l{validation loss} as in the training cell.
saver_filename = 'e8_l2.8511100673734733'
# NOTE(review): the weights were originally restored from 'models/...' while
# the meta graph is read from (and checkpoints are written to) '../models/...';
# both now use the same prefix - confirm this matches the directory layout.
checkpoint_path = '../models/{}'.format(saver_filename)
tf.reset_default_graph()
loader = tf.train.import_meta_graph('{}.meta'.format(checkpoint_path))

with tf.Session() as sess:
    # Load model and tensors
    sess.run(tf.global_variables_initializer())
    loader.restore(sess, checkpoint_path)
    X, Y, pred_score = tf.get_collection('pred_ops')

    # Make predictions
    predictions = []
    test_data.reset()
    for batch_num, batch_size in enumerate(batch_sizes):
        ligids_batch, smiles_batch, empty_batch = test_data.next_batch(batch_size)
        lig_smi_batch = np.concatenate((ligids_batch, smiles_batch), axis=1)
        # empty_batch is only a placeholder for Y; the test set has no scores.
        prediction_batch = sess.run(pred_score, feed_dict={X: lig_smi_batch, Y: empty_batch})
        predictions.append(prediction_batch)
        print('{}/{}'.format(batch_num, num_test_batches), end='\r')

    # Flatten to one scalar per sample so each entry converts cleanly to int.
    if predictions:
        predictions = np.concatenate(predictions, axis=0).reshape(-1)
    else:
        predictions = np.empty(0)

# Submission file
with open('../submissions/submission_{}.csv'.format(saver_filename), 'w') as out_file:
    out_file.write('ligid_pharmid,score\n')
    # One row per flat sample index, in the order the batches were served.
    out_file.writelines('{},{}\n'.format(glob_index, int(score))
                        for glob_index, score in enumerate(predictions))