In [1]:
import tensorflow as tf
import numpy as np

import dictionary

# Update python files
%load_ext autoreload
%autoreload 2

  from ._conv import register_converters as _register_converters


In [2]:
n_grams = 3
sliding_window_size = 9
embedding_size = 512
lr = 1e-4
batch_size = 64
epochs = 5000

In [3]:
# Prepare dataset for prediction
protein_names, sequences, labels = [], [], []


'''
    Labels:
        "+" stands for "binding protein" => 1
        "-" stands for "non-binding" => 0
'''
def convert_label(label_string):
 
    if label_string == "+":
        return 1
    elif label_string == "-":
        return 0
    else:
        print("Should not enter here")
        return None

    
# Open file containing dataset    
with open('./ppi_data.fasta') as f:
    lines = f.read().splitlines()
    
    for i in range(len(lines)):
        
        if i % 3 == 0:
            protein_names.append(lines[i])
        elif i % 3 == 1:
            sequences.append(lines[i])
        elif i % 3 == 2:
            labels.append([convert_label(letter) for letter in lines[i]])
            
protein_names = np.array(protein_names)
sequences = np.array(sequences)
labels = np.array(labels)

assert(protein_names.shape[0] == sequences.shape[0] == labels.shape[0])

print(protein_names[0])
print(sequences[0])
print(labels[0])

>P0A8Q6
MGKTNDWLDFDQLAEEKVRDALKPPSMYKVILVNDDYTPMEFVIDVLQKFFSYDVERATQLMLAVHYQGKAICGVFTAEVAETKVAMVNKYARENEHPLLCTLEKA
[0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]


In [4]:
amino_acids, bindings = [], []

# Iterates over all proteins in dataset
for i in range(len(sequences)):

    # Loop over sequence
    for j in range(0,len(sequences[i]) - sliding_window_size + 1):
        sub_sequence = sequences[i][j:j+sliding_window_size]

        tmp = []
        for k in range(0, sliding_window_size - n_grams + 1):
            tmp.append(sub_sequence[k:k+n_grams])
    
        amino_acids.append(tmp)
        bindings.append(labels[i][j+(sliding_window_size//2)]) 
        
amino_acids, bindings = np.array(amino_acids), np.array(bindings)
print(amino_acids.shape)
print(bindings.shape)

(88461, 7)
(88461,)


In [5]:
# Split percentage of training and validation
split_percentage = 0.8

# Count how many samples into training dataset
total_dataset = len(amino_acids)
train_dataset = int(total_dataset * split_percentage)

# Shuffle
np.random.seed(145)
indices = list(range(total_dataset))
np.random.shuffle(indices)

# Train dataset
X_train = amino_acids[indices[:train_dataset]]
y_train = bindings[indices[:train_dataset]]

# Validation dataset
X_val = amino_acids[indices[train_dataset:]]
y_val = bindings[indices[train_dataset:]]

# Shapes
print("Training samples: " + str(X_train.shape[0]))
print("Validation samples: " + str(X_val.shape[0]) + "\n")

# Reset seed for randomness
np.random.seed()

# Sample
print(sequences[0])
print(X_train[0])
print(y_train[0])

Training samples: 70768
Validation samples: 17693

MGKTNDWLDFDQLAEEKVRDALKPPSMYKVILVNDDYTPMEFVIDVLQKFFSYDVERATQLMLAVHYQGKAICGVFTAEVAETKVAMVNKYARENEHPLLCTLEKA
['DDG' 'DGS' 'GSF' 'SFE' 'FEI' 'EIE' 'IEV']
0


In [6]:
# Create vocabulary of n-grams
vocab = dictionary.LanguageDictionary(X_train)

# Map grams to indices for the embedding matrix
X_train_mapped = np.array([vocab.text_to_indices(tmp) for tmp in X_train])
X_val_mapped = np.array([vocab.text_to_indices(tmp) for tmp in X_val if not vocab.text_to_indices(tmp) is None])

# Shapes
print(X_train_mapped.shape)
print(y_train.shape)
print(X_val_mapped.shape)
print(y_val.shape)

(70768, 7)
(70768,)
(17693, 7)
(17693,)


In [7]:
X_final_val = []
y_final_val = []

for i in range(len(X_val_mapped)):

    is_none = False
    for j in range(len(X_val_mapped[i])):

        if X_val_mapped[i][j] is None:
            is_none = True
    
    if not is_none:
        X_final_val.append(X_val_mapped[i])
        y_final_val.append(y_val[i])    
    
X_final_val = np.array(X_final_val)
y_final_val = np.array(y_final_val)

print(X_final_val.shape)
print(y_final_val.shape)

(17693, 7)
(17693,)


In [8]:
def new_weights(shape, name=None):
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1), name=name)


def new_biases(length, name=None):
    return tf.Variable(tf.constant(0.1, shape=[length]), name=name)


def embedding_layer(input_x, vocabulary_size, embedding_size):
    init_embeds = tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0)
    embeddings = tf.Variable(init_embeds)
    layer = tf.nn.embedding_lookup(embeddings, input_x)
    
    return layer


def create_network(X, Y, vocabulary, embedding_size, verbose):
    
    embedding = embedding_layer(X, len(vocabulary.index_to_word), embedding_size)
    
    
    lstm_fw_cell = tf.contrib.rnn.LSTMCell(128, forget_bias=1.0)
    lstm_bw_cell = tf.contrib.rnn.LSTMCell(128, forget_bias=1.0)
    (outputs_fw, outputs_bw), last_states = tf.nn.bidirectional_dynamic_rnn(lstm_fw_cell, lstm_bw_cell, embedding, dtype=tf.float32)

    # Get last output of LSTM
    outputs_fw = tf.transpose(outputs_fw, [1, 0, 2])
    last_output_fw = tf.gather(outputs_fw, int(outputs_fw.get_shape()[0]) - 1)

    outputs_bw = tf.transpose(outputs_bw, [1, 0, 2])
    last_output_bw = tf.gather(outputs_bw, int(outputs_bw.get_shape()[0]) - 1)
    
    # Concat outputs
    outputs_concat = tf.concat([last_output_fw, last_output_bw], 1)

    # Fully connected
    fc1 = tf.layers.dense(inputs=outputs_concat, units=32)
    
    logits = tf.layers.dense(inputs=fc1, units=2, activation=None)

    
    if verbose:
        print(X)
        print(Y)
        print(embedding)
        print(outputs_concat)
        print(fc1)
        print(logits)
        
    return logits

In [9]:
tf.reset_default_graph()

# Placeholders
tensor_X = tf.placeholder(tf.int32, (None, X_train_mapped.shape[1]), 'inputs')
tensor_Y = tf.placeholder(tf.int32, (None), 'output')

#input_keep_prob = tf.placeholder(tf.float32, (None), 'dropout_input')
#output_keep_prob = tf.placeholder(tf.float32, (None), 'dropout_output')

# Create graph for the network
logits = create_network(tensor_X, tensor_Y, vocab, embedding_size, verbose=1)

Tensor("inputs:0", shape=(?, 7), dtype=int32)
Tensor("output:0", dtype=int32)
Tensor("embedding_lookup/Identity:0", shape=(?, 7, 512), dtype=float32)
Tensor("concat:0", shape=(?, 256), dtype=float32)
Tensor("dense/BiasAdd:0", shape=(?, 32), dtype=float32)
Tensor("dense_1/BiasAdd:0", shape=(?, 2), dtype=float32)


In [None]:
# Cross entropy loss after softmax of logits
ce = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=tensor_Y)
loss = tf.reduce_mean(ce)

# Using Adam (Adaptive learning rate + momentum) for the update of the weights of the network
optimizer = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss)

# Useful tensors
scores = tf.nn.softmax(logits)
predictions = tf.to_int32(tf.argmax(scores, axis=1))
correct_mask = tf.to_float(tf.equal(predictions, tensor_Y))
accuracy = tf.reduce_mean(tf.cast(correct_mask, tf.float32), axis=0)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [None]:
# Training data variables
iterations_training = max((len(X_train) // batch_size), 1)
print("Training iterations per epoch: " + str(iterations_training))
training_overfit = False
consecutive_validation_without_saving = 0

# Validation data variables
max_val_acc = 0
iterations_validation = max((len(X_final_val) // batch_size), 1)

# Initializer for variables in the graph
init = tf.global_variables_initializer()
saver = tf.train.Saver()

# Perform each epoch, shuffle training dataset
indices = list(range(len(X_train)))

with tf.Session() as sess:
    
    # Initialize variables in the graph
    sess.run(init)
    
    # Iterate over epochs
    for i in range(epochs):
        
        # Shuffle data (with random seed for debug) to not train the network always with the same order
        np.random.seed(42)
        np.random.shuffle(indices)
        X_train_mapped = X_train_mapped[indices]
        y_train = y_train[indices]
        
        # Vector accumulating accuracy and loss during for one epoch
        total_accuracies, total_losses = [], []

        # Iterate over mini-batches
        for j in range(iterations_training):
            start_index = j * batch_size
            end_index = (j + 1) * batch_size 
            
            # If last batch, take also elements that are less than batch_size
            if j == (iterations_training - 1):
                end_index += (batch_size - 1)

            _, avg_accuracy, avg_loss = sess.run([optimizer, accuracy, loss], feed_dict={
                                            tensor_X: X_train_mapped[start_index:end_index],
                                            tensor_Y: y_train[start_index:end_index] })
            
            # Add values for this mini-batch iterations
            total_losses.append(avg_loss) 
            total_accuracies.append(avg_accuracy)

            
            # Print loss and accuracy
            if (j) % 100 == 0:
                print("Loss: " + str(avg_loss) + ", Accuracy: " + str(avg_accuracy))
                
            # Statistics on validation set
            if (j) % 100 == 0:
                
                # Accumulate validation statistics
                val_accuracies, val_losses = [], []

                # Iterate over mini-batches
                for k in range(iterations_validation):
                    start_index = k * batch_size
                    end_index = (k + 1) * batch_size 
                    
                    if j == (iterations_validation - 1):
                        end_index += (batch_size - 1)
                        
                    avg_accuracy, avg_loss = sess.run([accuracy, loss], feed_dict={
                                            tensor_X: X_final_val[start_index:end_index],
                                            tensor_Y: y_final_val[start_index:end_index] })
                    
                    # Statistics over the mini-batch
                    val_losses.append(avg_loss) 
                    val_accuracies.append(avg_accuracy)
                    
                # Average validation accuracy over batches
                final_val_accuracy = np.mean(val_accuracies)
                
                # Save model if validation accuracy better
                if final_val_accuracy > max_val_acc:
                    consecutive_validation_without_saving = 0
                    max_val_acc = final_val_accuracy
                    print("VALIDATION loss: " + str(np.mean(val_losses)) + ", accuracy: " + str(final_val_accuracy))
                    save_path = saver.save(sess, "./checkpoints/model.ckpt")
                else:
                    # Count every time check validation accuracy
                    consecutive_validation_without_saving += 1
                
                # If checked validation time many consecutive times without having improvement in accuracy
                if consecutive_validation_without_saving >= 20:
                    training_overfit = True
                    break
                    
        if training_overfit:
            print("Early stopping training because it starts overfitting")
            break
            
        # Epoch statistics
        print("Training epoch: " + str(i+1) + ", AVG loss: " + str(np.mean(np.array(total_losses))) + 
              ", AVG accuracy: " + str(np.mean(np.array(total_accuracies))) + "\n")
        


Training iterations per epoch: 1105
Loss: 0.6624677, Accuracy: 0.65625
VALIDATION loss: 0.7041237, accuracy: 0.4990376
Loss: 0.58844924, Accuracy: 0.75
VALIDATION loss: 0.63115823, accuracy: 0.68761325
Loss: 0.57296, Accuracy: 0.734375
VALIDATION loss: 0.59675926, accuracy: 0.7189198
Loss: 0.63503325, Accuracy: 0.65625
VALIDATION loss: 0.5885399, accuracy: 0.720505
Loss: 0.61275923, Accuracy: 0.6875
VALIDATION loss: 0.585065, accuracy: 0.7210145
Loss: 0.55083835, Accuracy: 0.75
VALIDATION loss: 0.58160025, accuracy: 0.7211843
Loss: 0.509092, Accuracy: 0.796875
VALIDATION loss: 0.5783371, accuracy: 0.72129756
Loss: 0.5601565, Accuracy: 0.78125
VALIDATION loss: 0.5746887, accuracy: 0.7219769
Loss: 0.52747357, Accuracy: 0.78125
VALIDATION loss: 0.57209873, accuracy: 0.7225996
Loss: 0.52562404, Accuracy: 0.75
VALIDATION loss: 0.56810755, accuracy: 0.72356206
Loss: 0.583038, Accuracy: 0.71875
VALIDATION loss: 0.56391156, accuracy: 0.7244679
Loss: 0.5565937, Accuracy: 0.703125
VALIDATION los