In [11]:
import tensorflow as tf
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

import dictionary

# Update python files
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
embedding_size = 64
hidden_cells = 32
lr = 1e-3
batch_size = 128
epochs = 1000

In [13]:
# Prepare dataset for prediction
protein_names, sequences, labels = [], [], []


'''
    Labels:
        "+" stands for "binding protein" => 1
        "-" stands for "non-binding" => 0
'''
def convert_label(label_string):
 
    if label_string == "+":
        return 1
    elif label_string == "-":
        return 0
    else:
        print("Should not enter here")
        return None

    
# Open file containing dataset    
with open('./ppi_data.fasta') as f:
    lines = f.read().splitlines()
    
    for i in range(len(lines)):
        
        if i % 3 == 0:
            protein_names.append(lines[i])
        elif i % 3 == 1:
            sequences.append(list(lines[i]))
        elif i % 3 == 2:
            labels.append(np.array([convert_label(letter) for letter in lines[i]]))
            
protein_names = np.array(protein_names)
sequences = np.array(sequences)
labels = np.array(labels)

assert(protein_names.shape[0] == sequences.shape[0] == labels.shape[0])

print(protein_names[0])
print("".join(sequences[0]))
print(labels[0].shape)

>P0A8Q6
MGKTNDWLDFDQLAEEKVRDALKPPSMYKVILVNDDYTPMEFVIDVLQKFFSYDVERATQLMLAVHYQGKAICGVFTAEVAETKVAMVNKYARENEHPLLCTLEKA
(106,)


In [14]:
#sequences = sequences[:60]
#labels = labels[:60]

In [15]:
# Split percentage of training and validation
split_percentage = 0.8

# Count how many samples into training dataset
total_dataset = len(sequences)
train_dataset = int(total_dataset * split_percentage)

# Shuffle
np.random.seed(142)
indices = list(range(total_dataset))
np.random.shuffle(indices)

# Train dataset
sequences_train = sequences[indices[:train_dataset]]
labels_train = labels[indices[:train_dataset]]

# Validation dataset
sequences_val = sequences[indices[train_dataset:]]
labels_val = labels[indices[train_dataset:]]

# Shapes
print("Training samples: " + str(sequences_train.shape[0]))
print("Validation samples: " + str(sequences_val.shape[0]))

# Reset seed for randomness
np.random.seed()

Training samples: 432
Validation samples: 108


In [16]:
def max_length_sentence(dataset):
    return max([len(line) for line in dataset])


def pad_sentence(tokenized_sentence, max_length_sentence, padding_value=0):
    
    pad_length = max_length_sentence - len(tokenized_sentence)
    sentence = list(tokenized_sentence)
    
    if pad_length > 0:
        return np.pad(tokenized_sentence, (0, pad_length), mode='constant', constant_values=int(padding_value))
    else: # Cut sequence if longer than max_length_sentence
        return sentence[:max_length_sentence]


def create_input(vocab, max_length_seq, sequences, labels):
    X, Y = [], []
    assert(len(sequences) == len(labels))
    
    for i in range(len(sequences)):
        X.append(pad_sentence(vocab.text_to_indices(sequences[i]), max_length_seq))
        Y.append(pad_sentence(labels[i], max_length_seq, padding_value=1))
        
    return np.array(X), np.array(Y)

In [17]:
# Create vocabulary of n-grams
vocab = dictionary.LanguageDictionary(sequences_train)
max_length_seq = max_length_sentence(sequences_train)

X_train, Y_train = create_input(vocab, max_length_seq, sequences_train, labels_train)
X_val, Y_val = create_input(vocab, max_length_seq, sequences_val, labels_val)


print(X_train.shape)
print(Y_train.shape)

print(X_train[0])
print(Y_train[0])

(432, 521)
(432, 521)
[ 1  2  3  4  5  6  7  8  8  9 10  4  3  6 11  4  4 12  8 13 13  4  3  2
 14  4  2 11 11  5 15  6 16 13 10 11  7  2  3  6  8 10  6 15  8  6 10  4
  6 17 15  8  6  3  4  8  6 13  1 15 17 16 10 18  6  2 15  6 18  6 13 10
 17 12  9 10  7  7  1  4  4 17  2  7  3 13 16 15 15  2  7 12  8  6 12  4
  2  2  6 13  3  4  6  8  2 16  3  7 17  2 10 13  2 11 16  9 12  6  6  2
 13  8 18  4 10  6 10  4  7  4  8  6 15  1 11  4 17  4  8  4  5  8 10 10
  9  4  6 14 11 11  2  6  9 18 14 13  3 17  6 13  2  4  6 16  8 13 13 12
  5  3 11  8  9  4  4 15  6  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  

In [21]:
def new_weights(shape, name=None):
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1), name=name)


def new_biases(length, name=None):
    return tf.Variable(tf.constant(0.1, shape=[length]), name=name)


def embedding_layer(input_x, vocabulary_size, embedding_size):
    init_embeds = tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0)
    embeddings = tf.Variable(init_embeds)
    layer = tf.nn.embedding_lookup(embeddings, input_x)
    return layer


def create_network(X, Y, vocabulary, embedding_size, verbose=1):
    
    # Calculate length without padding
    mask = tf.cast(tf.sign(X), tf.float32)
    sequence_length = tf.cast(tf.reduce_sum(mask, 1), tf.int32)
    
    # Embedding layer
    embedding = embedding_layer(X, len(vocabulary.index_to_word), embedding_size)
    
    # Bidirectional LSTM cell
    lstm_fw_cell = tf.contrib.rnn.LSTMCell(hidden_cells, forget_bias=1.0)
    lstm_bw_cell = tf.contrib.rnn.LSTMCell(hidden_cells, forget_bias=1.0)
    
    (outputs_fw, outputs_bw), _ = tf.nn.bidirectional_dynamic_rnn(lstm_fw_cell, 
                                                                  lstm_bw_cell, 
                                                                  embedding, 
                                                                  dtype=tf.float32,
                                                                  sequence_length=sequence_length)
    # Concat outputs
    outputs_concat = tf.concat([outputs_fw, outputs_bw], 2)
    
    # FC layer
    fc1 = tf.layers.dense(inputs=outputs_concat, units=32, activation=tf.nn.relu)
    logits = tf.layers.dense(inputs=fc1, units=2, activation=None)
    
    if verbose:
        print(embedding)
        print(outputs_concat)
        print(logits)
    
    return logits, mask, sequence_length

In [22]:
tf.reset_default_graph()

# Placeholders
tensor_X = tf.placeholder(tf.int32, (None, X_train.shape[1]), 'inputs')
tensor_Y = tf.placeholder(tf.int32, (None, Y_train.shape[1]), 'outputs')

# Dropout placeholders
#input_keep_prob = tf.placeholder(tf.float32, (None), 'dropout_input')
#output_keep_prob = tf.placeholder(tf.float32, (None), 'dropout_output')

# Create graph for the network
logits, mask, sequence_length = create_network(tensor_X, tensor_Y, vocab, embedding_size, verbose=1)

Tensor("embedding_lookup/Identity:0", shape=(?, 521, 64), dtype=float32)
Tensor("concat:0", shape=(?, 521, 64), dtype=float32)
Tensor("dense_1/BiasAdd:0", shape=(?, 521, 2), dtype=float32)


In [20]:
'''cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=tensor_Y)

# Mask padding inside cross entropy loss
cross_entropy *= mask

# Average over actual sequence lengths
cross_entropy = tf.reduce_sum(cross_entropy, 1)
cross_entropy /= tf.reduce_sum(mask, 1)
loss = tf.reduce_mean(cross_entropy)

# Adam optimizer
optimizer = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss)

# Useful tensors
scores = tf.nn.softmax(logits)
predictions = tf.to_int32(tf.argmax(scores, axis=2))
accuracy = tf.contrib.metrics.accuracy(tensor_Y, predictions, weights=mask)'''

In [None]:
# Cross entropy loss after softmax of logits
ce = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=target_labels) * mask
loss = tf.reduce_mean(ce)

# Using Adam optimizer for the update of the weights of the network with gradient clipping
optimizer = tf.train.AdamOptimizer(learning_rate=lr) #.minimize(loss)
gradients, variables = zip(*optimizer.compute_gradients(loss))
gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
optimize = optimizer.apply_gradients(zip(gradients, variables))

# Useful tensors
scores = tf.nn.softmax(logits)
predictions = tf.to_int32(tf.argmax(scores, axis=2))
correct_mask = tf.to_float(tf.equal(predictions, target_labels))
accuracy = tf.contrib.metrics.accuracy(predictions, target_labels, weights=mask)

In [108]:
# Training data variables
iterations_training = max((len(X_train) // batch_size), 1)
print("Training iterations per epoch: " + str(iterations_training))

# Validation data variables
max_val_acc = 0
iterations_validation = max((len(X_val) // batch_size), 1)

# Saver for the checkpoints
saver = tf.train.Saver()

# Perform each epoch, shuffle training dataset
indices = list(range(len(X_train)))

with tf.Session() as sess:
    
    # Initialize variables in the graph
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())
    
    # Iterate over epochs
    for i in range(epochs):
        
        # Shuffle data (with random seed for debug) to not train the network always with the same order
        np.random.seed(143)
        np.random.shuffle(indices)
        X_train = X_train[indices]
        Y_train = Y_train[indices]
        
        # Vector accumulating accuracy and loss during for one epoch
        total_accuracies, total_losses = [], []

        # Iterate over mini-batches
        for j in range(iterations_training):
            start_index = j * batch_size
            end_index = (j + 1) * batch_size 
            
            # If last batch, take also elements that are less than batch_size
            if j == (iterations_training - 1):
                end_index += (batch_size - 1)

            _, avg_accuracy, avg_loss = sess.run([optimizer, accuracy, loss], feed_dict={
                                            tensor_X: X_train[start_index:end_index],
                                            tensor_Y: Y_train[start_index:end_index] })
            
            # Add values for this mini-batch iterations
            total_losses.append(avg_loss) 
            total_accuracies.append(avg_accuracy)

            # Print loss and accuracy
            if (j+1) % 5 == 0:
                print("Loss: " + str(avg_loss) + ", Accuracy: " + str(avg_accuracy))
                    
            # Statistics on validation set
            if (j+1) % 5 == 0:
                avg_accuracy, avg_loss, pred = sess.run([accuracy, loss, predictions], feed_dict={ 
                                                                                tensor_X: X_val,
                                                                                tensor_Y: Y_val })
                
                #avg_accuracy = roc_auc_score(Y_val, pred)
                
                # Save model if validation accuracy better
                if avg_accuracy > max_val_acc:
                    consecutive_validation_without_saving = 0
                    max_val_acc = avg_accuracy
                    print("VALIDATION loss: " + str(avg_loss) + ", accuracy: " + str(avg_accuracy))
                    save_path = saver.save(sess, "./checkpoints/model.ckpt")
        
        # Epoch statistics
        #print("Training epoch: " + str(i+1) + ", AVG loss: " + str(np.mean(np.array(total_losses))) + 
        #      ", AVG accuracy: " + str(np.mean(np.array(total_accuracies))) + "\n")

Training iterations per epoch: 6
Loss: 0.6535127, Accuracy: 0.7059498
VALIDATION loss: 0.63084567, accuracy: 0.7297221
Loss: 0.64911693, Accuracy: 0.72099346
VALIDATION loss: 0.6164645, accuracy: 0.7307953
Loss: 0.64732057, Accuracy: 0.6960475
Loss: 0.633052, Accuracy: 0.7172973
Loss: 0.63683593, Accuracy: 0.71428573
Loss: 0.6382774, Accuracy: 0.7087112


KeyboardInterrupt: 

In [106]:
saver = tf.train.Saver()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())
    
    saver.restore(sess, "./checkpoints/model.ckpt") 
    
    avg_accuracy, avg_loss, pred = sess.run([accuracy, loss, predictions], feed_dict={
                                            tensor_X: X_val,
                                            tensor_Y: Y_val })

INFO:tensorflow:Restoring parameters from ./checkpoints/model.ckpt


In [107]:
print(Y_val[:1])
print(pred[:1])
print(avg_accuracy)

[[0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 1 0 0 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1
  1 1 0 1 0 1 0 0 1 0 1 0 1 1 1 0 0 0 0 0 0 0 0 1 0 1 1 1 0 1 1 1 1 1 1 1
  1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 0 1 1 0 0 0 0 0 0 1 0
  0 1 1 1 0 1 1 0 0 1 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 

In [80]:
print("Accuracy :" + str(accuracy_score(Y_val, pred)))
print("Precision :" + str(precision_score(Y_val, pred)))
print("Recall :" + str(recall_score(Y_val, pred)))
print("AUC :" + str(roc_auc_score(Y_val, pred)))

Accuracy :0.0


ValueError: Target is multilabel-indicator but average='binary'. Please choose another average setting.