#### Import Dependencies

In [1]:
import tensorflow as tf
import numpy as np
from datetime import datetime
from data_utils import *
from tqdm import tnrange, tqdm_notebook
from sklearn.decomposition import PCA

  from ._conv import register_converters as _register_converters


#### Load Data

In [123]:
train_valid_ligids  = np.load('../data/PHARM_TRAIN_X.npy')
train_valid_smiles = np.load('../data/PHARM_TRAIN_SMILES.npy')
train_valid_scores = np.load('../data/Y_train.npy')

print('train_valid_ligids shape: {}'.format(train_valid_ligids.shape))
print('train_valid_smiles shape: {}'.format(train_valid_smiles.shape))
print('train_valid_scores shape: {}'.format(train_valid_scores.shape))

train_valid_ligids shape: (13246, 9)
train_valid_smiles shape: (46005, 167)
train_valid_scores shape: (609382230,)


#### Preprocess Data

In [124]:
# Remap Classes
map_from = [0,1,2,3,4,5,6,7,8,9]
map_to =   [0,0,0,0,1,2,3,4,4,4]
train_valid_scores = remap_scores(train_valid_scores, map_from, map_to)

# Train Validation Split
train_data, validation_data = train_validation_split(train_valid_ligids,
                                                     train_valid_smiles,
                                                     train_valid_scores,
                                                     num_val_lig=3046, 
                                                     num_val_smi=10581,
                                                     shuffle=False)
del train_valid_ligids, train_valid_smiles, train_valid_scores

Remapping 0 to 0
Remapping 1 to 0
Remapping 2 to 0
Remapping 3 to 0
Remapping 4 to 1
Remapping 5 to 2
Remapping 6 to 3
Remapping 7 to 4
Remapping 8 to 4
Remapping 9 to 4
New score distribution: [0.1134342  0.22478444 0.22529852 0.19962638 0.23685646]
num validation ligids: 3046
num train ligids: 10200
num validation smiles: 10581
num train smiles: 35424
num validation labels: 32229726
num train labels: 361324800


In [125]:
# Use only a part of full dataset, convert to X_data, Y_data format
train_data = reduce_dataset(train_data, samples=5000000)
validation_data = reduce_dataset(validation_data, samples=500000)

In [126]:
def balance_dataset(dataset, samples_per_class):
    num_clss = len(samples_per_class)
    cls_idxs = [np.equal(dataset.Y_data, i) for i in range(num_clss)]
    classes_X_data = [dataset.X_data[cls_idx] for cls_idx in cls_idxs]
    classes_Y_data = [dataset.Y_data[cls_idx] for cls_idx in cls_idxs]
    sampled_X_data = [class_X_data[:num_samples] for class_X_data, num_samples in zip(classes_X_data, samples_per_class)]
    sampled_Y_data = [class_Y_data[:num_samples] for class_Y_data, num_samples in zip(classes_Y_data, samples_per_class)]
    sampled_X_data = np.concatenate(sampled_X_data, axis=0)
    sampled_Y_data = np.concatenate(sampled_Y_data, axis=0)
    new_dataset = Data2(sampled_X_data, sampled_Y_data)
    return new_dataset

train_data = balance_dataset(train_data, samples_per_class=[500000]*5)
validation_data = balance_dataset(validation_data, samples_per_class=[50000]*5)

In [113]:
# PCA
def pca_dataset(dataset, n_components):
    pca_train = PCA(n_components=n_components)
    pca_train.fit(dataset.X_data)
    print(np.sum(pca_train.explained_variance_ratio_))  
    print(pca_train.singular_values_.shape)
    new_X_data = pca_train.transform(dataset.X_data)
    print(new_X_data.shape)
    new_dataset = Data2(new_X_data, dataset.Y_data)
    return new_dataset

train_data = pca_dataset(train_data, n_components=60)
validation_data = pca_dataset(validation_data, n_components=60)

0.9147496272952258
(60,)
(2500000, 60)
0.9123766786137454
(60,)
(250000, 60)


In [2]:
NUM_TRAINING_SAMPLES = 900000
NUM_TESTING_SAMPLE = 100000

def split_data(sequences, labels):
    """
    Split the data into a train and test dataset
    """
    seq_and_labels = Data2(sequences, labels)
    seq_and_labels.shuffle()
    train_seq, train_labels = seq_and_labels.next_batch(NUM_TRAINING_SAMPLES)
    test_seq, test_labels = seq_and_labels.next_batch(NUM_TESTING_SAMPLE)
    return train_seq, train_labels, test_seq, test_labels

X_train_valid = np.load('../data/X_train_new.npy')
Y_train_valid = np.load('../data/Y_train_new.npy')

X_train, Y_train, X_valid, Y_valid = split_data(X_train_valid, Y_train_valid)

print(X_train.shape)
print(Y_train.shape)
print(X_valid.shape)
print(Y_valid.shape)

train_data = Data2(X_train, Y_train)
validation_data = Data2(X_valid, Y_valid)

(900000, 176)
(900000,)
(100000, 176)
(100000,)


#### Define Model

In [None]:
# Other Parameters
use_regression = True
use_mae = True
train_summaries = []
validation_summaries = []

# Hyper Parameters
LEARNING_RATE = .0001
LAMBDA = .001
DROP_RATE = .5
NUM_EPOCHS = 10
TRAINING_BATCH_SIZE = 1000
USE_PERCENT_TRAIN_DATA = 1
USE_PERCENT_VALID_DATA = 1
VALIDATION_BATCH_SIZE = 10000

# Model
L1_UNITS = 20
# L2_UNITS = 20
# L3_UNITS = 20
if use_regression:
    NUM_OUTPUTS = 1
else:
    NUM_OUTPUTS = 5

tf.reset_default_graph()

lerelu = tf.nn.leaky_relu
relu = tf.nn.relu
elu = tf.nn.elu
xavier_init = tf.contrib.layers.xavier_initializer()
zero_init = tf.zeros_initializer()
l2_reg = tf.contrib.layers.l2_regularizer(scale=LAMBDA)

with tf.name_scope('inputs') as scope:
    X = tf.placeholder(shape=(None, train_data.X_data.shape[1]), dtype=tf.float32, name='ligids_smiles')
    if use_regression:
        Y = tf.placeholder(shape=(None), dtype=tf.float32, name='score')   
    else:
        Y = tf.placeholder(shape=(None), dtype=tf.int64, name='score') 
    training = tf.placeholder_with_default(False, shape=())
    
with tf.name_scope('hidden_layers') as scope:
    layer1 = tf.layers.dense(inputs=X,
                             units=L1_UNITS, 
                             activation=lerelu,
                             kernel_initializer=xavier_init,
                             bias_initializer=zero_init,
                             kernel_regularizer=l2_reg,
                             bias_regularizer=l2_reg,
                             name='layer1')
#     layer2 = tf.layers.dense(inputs=layer1,
#                              units=L2_UNITS, 
#                              activation=lerelu,
#                              kernel_initializer=xavier_init,
#                              bias_initializer=zero_init,
#                              kernel_regularizer=l2_reg,
#                              bias_regularizer=l2_reg,
#                              name='layer2')
#     layer3 = tf.layers.dense(inputs=layer2,
#                              units=L3_UNITS, 
#                              activation=lerelu,
#                              kernel_initializer=xavier_init,
#                              bias_initializer=zero_init,
#                              kernel_regularizer=l2_reg,
#                              bias_regularizer=l2_reg,
#                              name='layer3')

with tf.name_scope('predicted_score') as scope:
    logits = tf.layers.dense(inputs=layer1,
                             units=NUM_OUTPUTS,
                             name='logits')
    if use_regression: # Regression
        with tf.get_default_graph().gradient_override_map({"Floor": "Identity"}):
            pred_score = tf.floor(logits, name='pred_score')
    else: # Classification  
        pred_score = tf.nn.softmax(logits=logits, axis=1)
        pred_score = tf.argmax(pred_score, axis=1, name='pred_score')
        
with tf.name_scope('loss') as scope:
    if use_regression: # Regression
        if use_mae:
            model_loss = tf.losses.absolute_difference(Y,pred_score) # MAE
        else:
            mae_t = tf.losses.absolute_difference(Y,pred_score)
            model_loss = tf.losses.mean_squared_error(labels=Y, predictions=pred_score) # MSE
    else: # Classification
        model_loss = tf.reduce_sum(tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=Y))
    reg_loss = tf.reduce_sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
    loss = model_loss+reg_loss
    optimizer = tf.train.AdamOptimizer(LEARNING_RATE)
    train_op = optimizer.minimize(loss)

with tf.name_scope('train_summaries') as scope:
    train_summaries.append(tf.summary.histogram('ligids_smiles', X))
    train_summaries.append(tf.summary.histogram('score', Y))
    train_summaries.append(tf.summary.histogram('layer1', layer1))
#     train_summaries.append(tf.summary.histogram('layer2', layer2))
#     train_summaries.append(tf.summary.histogram('layer3', layer3))
    train_summaries.append(tf.summary.histogram('pred_score', pred_score))
    if use_regression and not use_mae:
        train_summaries.append(tf.summary.scalar('mae', mae_t))
    train_summaries.append(tf.summary.scalar('model_loss', model_loss))
    train_summaries.append(tf.summary.scalar('reg_loss', reg_loss))
    train_summaries.append(tf.summary.scalar('loss', loss))

with tf.name_scope('validation_summaries') as scope:
    validation_summaries.append(tf.summary.histogram('ligids_smiles', X))
    validation_summaries.append(tf.summary.histogram('score', Y))
    validation_summaries.append(tf.summary.histogram('layer1', layer1))
#     validation_summaries.append(tf.summary.histogram('layer2', layer2))
#     validation_summaries.append(tf.summary.histogram('layer3', layer3))
    validation_summaries.append(tf.summary.histogram('pred_score', pred_score))
    if use_regression and not use_mae:
        validation_summaries.append(tf.summary.scalar('mae', mae_t))
    validation_summaries.append(tf.summary.scalar('model_loss', model_loss))
    validation_summaries.append(tf.summary.scalar('reg_loss', reg_loss))
    validation_summaries.append(tf.summary.scalar('loss', loss))

#### Run Model

In [None]:
# Misc. constants
num_training_batches = int(train_data.num_scores/TRAINING_BATCH_SIZE*USE_PERCENT_TRAIN_DATA)
num_validation_batches = int(validation_data.num_scores/VALIDATION_BATCH_SIZE*USE_PERCENT_VALID_DATA)
TB_PERIOD_TRAIN = 100
TB_PERIOD_VALID = 1

# Saver - Prediction Tensors
tf.get_collection_ref('pred_ops').clear()
tf.get_collection('pred_ops')
tf.add_to_collection('pred_ops', X)
tf.add_to_collection('pred_ops', Y)
tf.add_to_collection('pred_ops', pred_score)
# Saver - Training Tensors
tf.get_collection_ref('train_ops').clear()
tf.get_collection('train_ops')
tf.add_to_collection('train_ops', X)
tf.add_to_collection('train_ops', Y)
tf.add_to_collection('train_ops', loss)
tf.add_to_collection('train_ops', train_op)
saver = tf.train.Saver(max_to_keep=1000)

# Tensorboard - Graph
time_now = datetime.utcnow().strftime('%Y%m%d%H%M%S')
tensorboard_logdir = '../tf_log/run-{}'.format(time_now)
print('tensorboard log_dir: {}'.format(tensorboard_logdir))
writer = tf.summary.FileWriter(tensorboard_logdir)
writer.add_graph(tf.get_default_graph())

# Start Session
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in tqdm_notebook(range(NUM_EPOCHS),desc='Epoch'):       
        # Train Model
        train_data.reset()
        train_data.shuffle()
        for step in tqdm_notebook(range(num_training_batches), desc='Train', leave=False):
            lig_smi_batch, scores_batch = train_data.next_batch(TRAINING_BATCH_SIZE)
            _, train_loss = sess.run([train_op, loss], feed_dict={X:lig_smi_batch, 
                                                                  Y:scores_batch, 
                                                                  training:True})
            # Tensorboard - Train
            if step%TB_PERIOD_TRAIN == 0:
                global_train_step = step+epoch*num_training_batches
                s = [sess.run(summary, feed_dict={X:lig_smi_batch, Y:scores_batch})\
                     for summary in train_summaries]
                [writer.add_summary(summary, global_train_step) for summary in s]
                
        # Validation
        mae_value = []
        validation_data.reset()
        for step in tqdm_notebook(range(num_validation_batches), desc='Validation', leave=False): 
            lig_smi_batch, scores_batch = validation_data.next_batch(VALIDATION_BATCH_SIZE)
            if use_mae:
                mae_batch = sess.run(model_loss, feed_dict={X:lig_smi_batch, Y:scores_batch})
            else: 
                mae_batch = sess.run(mae_t, feed_dict={X:lig_smi_batch, Y:scores_batch})
            mae_value.append(mae_batch)
            # Tensorboard - Validation
            if step%TB_PERIOD_VALID == 0:
                global_valid_step = step+epoch*num_validation_batches
                s = [sess.run(summary, feed_dict={X:lig_smi_batch, Y:scores_batch})\
                     for summary in validation_summaries]
                [writer.add_summary(summary, global_valid_step) for summary in s]
        mae_value = sum(mae_value)/len(mae_value)
        
        # Save Model w/ name: mse{mean_abs_err}_e{epoch number}
        saver_filename = 'mae{}_e{}'.format(mae_value, epoch+1)        
        saver.save(sess, '../models/{}/{}'.format(time_now, saver_filename))

tensorboard log_dir: ../tf_log/run-20180525011434


A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget