In [1]:
import tensorflow as tf
import numpy as np

import dictionary

# Update python files
%load_ext autoreload
%autoreload 2

  from ._conv import register_converters as _register_converters


In [2]:
n_grams = 1
sliding_window_size = 9
embedding_size = 512
lr = 1e-4
batch_size = 32
epochs = 5000

In [3]:
# Prepare dataset for prediction
protein_names, sequences, labels = [], [], []


'''
    Labels:
        "+" stands for "binding protein" => 1
        "-" stands for "non-binding" => 0
'''
def convert_label(label_string):
 
    if label_string == "+":
        return 1
    elif label_string == "-":
        return 0
    else:
        print("Should not enter here")
        return None

    
# Open file containing dataset    
with open('./ppi_data.fasta') as f:
    lines = f.read().splitlines()
    
    for i in range(len(lines)):
        
        if i % 3 == 0:
            protein_names.append(lines[i])
        elif i % 3 == 1:
            sequences.append(lines[i])
        elif i % 3 == 2:
            labels.append([convert_label(letter) for letter in lines[i]])
            
protein_names = np.array(protein_names)
sequences = np.array(sequences)
labels = np.array(labels)

assert(protein_names.shape[0] == sequences.shape[0] == labels.shape[0])

print(protein_names[0])
print(sequences[0])
print(labels[0])

>P0A8Q6
MGKTNDWLDFDQLAEEKVRDALKPPSMYKVILVNDDYTPMEFVIDVLQKFFSYDVERATQLMLAVHYQGKAICGVFTAEVAETKVAMVNKYARENEHPLLCTLEKA
[0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]


In [4]:
#sequences = sequences[:15]
#labels = labels[:15]

In [5]:
# Split percentage of training and validation
split_percentage = 0.8

# Count how many samples into training dataset
total_dataset = len(sequences)
train_dataset = int(total_dataset * split_percentage)

# Shuffle
np.random.seed(42)
indices = list(range(total_dataset))
np.random.shuffle(indices)

# Train dataset
sequences_train = sequences[indices[:train_dataset]]
labels_train = labels[indices[:train_dataset]]

# Validation dataset
sequences_val = sequences[indices[train_dataset:]]
labels_val = labels[indices[train_dataset:]]

# Shapes
print("Training samples: " + str(sequences_train.shape[0]))
print("Validation samples: " + str(sequences_val.shape[0]) + "\n")

# Reset seed for randomness
np.random.seed()

# Sample
print(sequences_train[0])
print(labels_train[0])

Training samples: 432
Validation samples: 108

MTDLFSSPDHTLDALGLRCPEPVMMVRKTVRNMQPGETLLIIADDPATTRDIPGFCTFMEHELVAKETDGLPYRYLIRKGG
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [6]:
def create_input(sequences, labels, sliding_window_size, n_grams):

    X, Y = [], []

    # Iterates over all proteins in dataset
    for i in range(len(sequences)):

        # Loop over sequence
        for j in range(0,len(sequences[i]) - sliding_window_size + 1):
            sub_sequence = sequences[i][j:j+sliding_window_size]

            tmp = []
            for k in range(0, sliding_window_size - n_grams + 1):
                tmp.append(sub_sequence[k:k+n_grams])

            X.append(tmp)
            Y.append(labels[i][j+(sliding_window_size//2)]) 

    return np.array(X), np.array(Y)
    
X_train_tmp, Y_train_tmp = create_input(sequences_train, labels_train, sliding_window_size, n_grams)
X_val, Y_val = create_input(sequences_val, labels_val, sliding_window_size, n_grams)

print(X_train_tmp.shape)
print(X_val.shape)

print(X_train_tmp[0])
print(Y_train_tmp[0])

(70799, 9)
(17662, 9)
['M' 'T' 'D' 'L' 'F' 'S' 'S' 'P' 'D']
0


In [7]:
# Find indices with positive and negative labels
X_train_pos = np.array([i for i in range(len(X_train_tmp)) if Y_train_tmp[i] == 1])
X_train_neg = np.array([i for i in range(len(X_train_tmp)) if Y_train_tmp[i] == 0])

# Print how many positive and negative labels => I want same number of labels for each class during training
print(X_train_pos.shape)
print(X_train_neg.shape)

# Get indices from X_train_pos
np.random.seed(42)
X_train_pos_indices = np.random.choice(len(X_train_pos), len(X_train_neg), replace=True)
X_train_selected = X_train_pos[X_train_pos_indices]

# Final X_train data
X_train = np.concatenate((X_train_tmp[X_train_selected], X_train_tmp[X_train_neg]), axis=0)
Y_train = np.concatenate((Y_train_tmp[X_train_selected], Y_train_tmp[X_train_neg]), axis=0)

# Check that labels 1 and 0 are equal
assert(len(np.array([i for i in range(len(X_train)) if Y_train[i] == 1])) == 
    len(np.array([i for i in range(len(X_train)) if Y_train[i] == 0])))

# Shuffle data
shuffle_indices = list(range(len(X_train)))
np.random.shuffle(shuffle_indices)
X_train = X_train[shuffle_indices]
Y_train = Y_train[shuffle_indices]
print(X_train.shape)
print(Y_train.shape)

(19705,)
(51094,)
(102188, 9)
(102188,)


In [8]:
# Create vocabulary of n-grams
vocab = dictionary.LanguageDictionary(X_train)

# Map grams to indices for the embedding matrix and remove samples where unknown words
X_train_mapped = np.array([vocab.text_to_indices(tmp) for tmp in X_train])

# Prepare validation data
X_val_mapped = []
Y_val_mapped = []
for i in range(len(X_val)):
    
    tmp = vocab.text_to_indices(X_val[i])
    if not None in tmp:
        X_val_mapped.append(tmp)
        Y_val_mapped.append(Y_val[i])
        
X_val_mapped = np.array(X_val_mapped)
Y_val_mapped = np.array(Y_val_mapped)

print(X_val_mapped.shape)
print(Y_val_mapped.shape)

(17662, 9)
(17662,)


In [9]:
'''def max_length_sentence(dataset):
    return max([len(line) for line in dataset])


def pad_sentence(tokenized_sentence, max_length_sentence, padding_value=0, pad_before=False):
    
    pad_length = max_length_sentence - len(tokenized_sentence)
    sentence = list(tokenized_sentence)
    
    if pad_length > 0:
        if pad_before:
            return np.pad(tokenized_sentence, (pad_length, 0), mode='constant', constant_values=int(padding_value))
        else:
            return np.pad(tokenized_sentence, (0, pad_length), mode='constant', constant_values=int(padding_value))
    else: # Cut sequence if longer than max_length_sentence
        return sentence[:max_length_sentence]

    
max_length = max_length_sentence(sequences_train)


X_train = np.array([pad_sentence(tmp, max_length) for tmp in X_train_mapped])
Y_train = np.array([pad_sentence(tmp, max_length, padding_value=1) for tmp in labels_train])
print(X_train.shape)
print(Y_train.shape)

X_val = np.array([pad_sentence(tmp, max_length) for tmp in X_val_mapped])
Y_val = np.array([pad_sentence(tmp, max_length, padding_value=1) for tmp in labels_val])
print(X_val.shape)
print(Y_val.shape)'''

"def max_length_sentence(dataset):\n    return max([len(line) for line in dataset])\n\n\ndef pad_sentence(tokenized_sentence, max_length_sentence, padding_value=0, pad_before=False):\n    \n    pad_length = max_length_sentence - len(tokenized_sentence)\n    sentence = list(tokenized_sentence)\n    \n    if pad_length > 0:\n        if pad_before:\n            return np.pad(tokenized_sentence, (pad_length, 0), mode='constant', constant_values=int(padding_value))\n        else:\n            return np.pad(tokenized_sentence, (0, pad_length), mode='constant', constant_values=int(padding_value))\n    else: # Cut sequence if longer than max_length_sentence\n        return sentence[:max_length_sentence]\n\n    \nmax_length = max_length_sentence(sequences_train)\n\n\nX_train = np.array([pad_sentence(tmp, max_length) for tmp in X_train_mapped])\nY_train = np.array([pad_sentence(tmp, max_length, padding_value=1) for tmp in labels_train])\nprint(X_train.shape)\nprint(Y_train.shape)\n\nX_val = np.a

In [10]:
def new_weights(shape, name=None):
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1), name=name)


def new_biases(length, name=None):
    return tf.Variable(tf.constant(0.1, shape=[length]), name=name)


def embedding_layer(input_x, vocabulary_size, embedding_size):
    init_embeds = tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0)
    embeddings = tf.Variable(init_embeds)
    layer = tf.nn.embedding_lookup(embeddings, input_x)
    
    return layer


def create_network(X, Y, vocabulary, embedding_size, verbose):
    
    embedding = embedding_layer(X, len(vocabulary.index_to_word), embedding_size)
    
    
    lstm_fw_cell = tf.contrib.rnn.LSTMCell(128, forget_bias=1.0)
    lstm_bw_cell = tf.contrib.rnn.LSTMCell(128, forget_bias=1.0)
    (outputs_fw, outputs_bw), last_states = tf.nn.bidirectional_dynamic_rnn(lstm_fw_cell, lstm_bw_cell, embedding, dtype=tf.float32)

    # Get last output of LSTM
    outputs_fw = tf.transpose(outputs_fw, [1, 0, 2])
    last_output_fw = tf.gather(outputs_fw, int(outputs_fw.get_shape()[0]) - 1)

    outputs_bw = tf.transpose(outputs_bw, [1, 0, 2])
    last_output_bw = tf.gather(outputs_bw, int(outputs_bw.get_shape()[0]) - 1)
    
    # Concat outputs
    outputs_concat = tf.concat([last_output_fw, last_output_bw], 1)

    # Fully connected
    fc1 = tf.layers.dense(inputs=outputs_concat, units=64)
    
    logits = tf.layers.dense(inputs=fc1, units=2, activation=None)

    
    if verbose:
        print(X)
        print(Y)
        print(embedding)
        print(outputs_concat)
        print(fc1)
        print(logits)
        
    return logits

In [11]:
tf.reset_default_graph()

# Placeholders
tensor_X = tf.placeholder(tf.int32, (None, X_train_mapped.shape[1]), 'inputs')
tensor_Y = tf.placeholder(tf.int32, (None), 'output')

#input_keep_prob = tf.placeholder(tf.float32, (None), 'dropout_input')
#output_keep_prob = tf.placeholder(tf.float32, (None), 'dropout_output')

# Create graph for the network
logits = create_network(tensor_X, tensor_Y, vocab, embedding_size, verbose=1)

Tensor("inputs:0", shape=(?, 9), dtype=int32)
Tensor("output:0", dtype=int32)
Tensor("embedding_lookup/Identity:0", shape=(?, 9, 512), dtype=float32)
Tensor("concat:0", shape=(?, 256), dtype=float32)
Tensor("dense/BiasAdd:0", shape=(?, 64), dtype=float32)
Tensor("dense_1/BiasAdd:0", shape=(?, 2), dtype=float32)


In [12]:
# Cross entropy loss after softmax of logits
ce = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=tensor_Y)
loss = tf.reduce_mean(ce)

# Using Adam (Adaptive learning rate + momentum) for the update of the weights of the network
optimizer = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss)

# Useful tensors
scores = tf.nn.softmax(logits)
predictions = tf.to_int32(tf.argmax(scores, axis=1))
correct_mask = tf.to_float(tf.equal(predictions, tensor_Y))
accuracy = tf.reduce_mean(tf.cast(correct_mask, tf.float32), axis=0)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [13]:
# Training data variables
iterations_training = max((len(X_train) // batch_size), 1)
print("Training iterations per epoch: " + str(iterations_training))
training_overfit = False
consecutive_validation_without_saving = 0

# Validation data variables
max_val_acc = 0
iterations_validation = max((len(X_val_mapped) // batch_size), 1)

# Initializer for variables in the graph
init = tf.global_variables_initializer()
saver = tf.train.Saver()

# Perform each epoch, shuffle training dataset
indices = list(range(len(X_train)))

with tf.Session() as sess:
    
    # Initialize variables in the graph
    sess.run(init)
    
    # Iterate over epochs
    for i in range(epochs):
        
        # Shuffle data (with random seed for debug) to not train the network always with the same order
        np.random.seed(42)
        np.random.shuffle(indices)
        X_train_mapped = X_train_mapped[indices]
        Y_train = Y_train[indices]
        
        # Vector accumulating accuracy and loss during for one epoch
        total_accuracies, total_losses = [], []

        # Iterate over mini-batches
        for j in range(iterations_training):
            start_index = j * batch_size
            end_index = (j + 1) * batch_size 
            
            # If last batch, take also elements that are less than batch_size
            if j == (iterations_training - 1):
                end_index += (batch_size - 1)

            _, avg_accuracy, avg_loss = sess.run([optimizer, accuracy, loss], feed_dict={
                                            tensor_X: X_train_mapped[start_index:end_index],
                                            tensor_Y: Y_train[start_index:end_index] })
            
            # Add values for this mini-batch iterations
            total_losses.append(avg_loss) 
            total_accuracies.append(avg_accuracy)

            # Print loss and accuracy
            if (j) % 100 == 0:
                print("Loss: " + str(avg_loss) + ", Accuracy: " + str(avg_accuracy))
                    
            # Statistics on validation set
            if (j) % 100 == 0:
                
                # Accumulate validation statistics
                val_accuracies, val_losses = [], []

                # Iterate over mini-batches
                for k in range(iterations_validation):
                    start_index = k * batch_size
                    end_index = (k + 1) * batch_size 
                    
                    if j == (iterations_validation - 1):
                        end_index += (batch_size - 1)
                        
                    avg_accuracy, avg_loss = sess.run([accuracy, loss], feed_dict={
                                            tensor_X: X_val_mapped[start_index:end_index],
                                            tensor_Y: Y_val_mapped[start_index:end_index] })
                    
                    # Statistics over the mini-batch
                    val_losses.append(avg_loss) 
                    val_accuracies.append(avg_accuracy)
                    
                # Average validation accuracy over batches
                final_val_accuracy = np.mean(val_accuracies)
                
                # Save model if validation accuracy better
                if final_val_accuracy > max_val_acc:
                    consecutive_validation_without_saving = 0
                    max_val_acc = final_val_accuracy
                    print("VALIDATION loss: " + str(np.mean(val_losses)) + ", accuracy: " + str(final_val_accuracy))
                    save_path = saver.save(sess, "./checkpoints/model.ckpt")
                else:
                    # Count every time check validation accuracy
                    consecutive_validation_without_saving += 1
                
                # If checked validation time many consecutive times without having improvement in accuracy
                if consecutive_validation_without_saving >= 10:
                    training_overfit = True
                    #break
                    
        #if training_overfit:
        #    print("Early stopping training because it starts overfitting")
        #    break
            
        # Epoch statistics
        print("Training epoch: " + str(i+1) + ", AVG loss: " + str(np.mean(np.array(total_losses))) + 
              ", AVG accuracy: " + str(np.mean(np.array(total_accuracies))) + "\n")

Training iterations per epoch: 3193
Loss: 0.72264683, Accuracy: 0.53125
VALIDATION loss: 0.65414816, accuracy: 0.64025635
Loss: 0.6993661, Accuracy: 0.46875
Loss: 0.73302424, Accuracy: 0.46875
Loss: 0.67934746, Accuracy: 0.59375
Loss: 0.67359316, Accuracy: 0.53125
Loss: 0.6587292, Accuracy: 0.5625
Loss: 0.6704196, Accuracy: 0.53125
Loss: 0.6672443, Accuracy: 0.625
Loss: 0.697526, Accuracy: 0.5
Loss: 0.72128445, Accuracy: 0.40625
Loss: 0.70684975, Accuracy: 0.59375
Loss: 0.79136455, Accuracy: 0.34375
Loss: 0.74661684, Accuracy: 0.46875
Loss: 0.69505733, Accuracy: 0.53125
Loss: 0.6881025, Accuracy: 0.59375
Loss: 0.67779255, Accuracy: 0.59375
Loss: 0.63059306, Accuracy: 0.5625
Loss: 0.66302514, Accuracy: 0.5625
Loss: 0.6652262, Accuracy: 0.65625
Loss: 0.65966666, Accuracy: 0.625
Loss: 0.6354507, Accuracy: 0.75
Loss: 0.7217624, Accuracy: 0.5
Loss: 0.6742359, Accuracy: 0.5625
Loss: 0.6495132, Accuracy: 0.59375
Loss: 0.7414073, Accuracy: 0.4375
Loss: 0.6919689, Accuracy: 0.5625
Loss: 0.72138

Loss: 0.3123756, Accuracy: 0.875
Loss: 0.4226761, Accuracy: 0.78125
Loss: 0.57688785, Accuracy: 0.65625
Loss: 0.60094714, Accuracy: 0.75
Loss: 0.56137985, Accuracy: 0.71875
Loss: 0.4705498, Accuracy: 0.8125
Loss: 0.60218656, Accuracy: 0.625
Loss: 0.5687554, Accuracy: 0.625
Loss: 0.445229, Accuracy: 0.75
Loss: 0.4452623, Accuracy: 0.71875
Loss: 0.39030454, Accuracy: 0.8125
Loss: 0.44109452, Accuracy: 0.78125
Loss: 0.31831124, Accuracy: 0.9375
Loss: 0.4212801, Accuracy: 0.84375
Loss: 0.39368972, Accuracy: 0.8125
Loss: 0.31898314, Accuracy: 0.96875
Loss: 0.31178465, Accuracy: 0.90625
Loss: 0.5976896, Accuracy: 0.65625
Loss: 0.49866766, Accuracy: 0.78125
Loss: 0.499685, Accuracy: 0.78125
Loss: 0.46898642, Accuracy: 0.6875
Loss: 0.39917496, Accuracy: 0.8125
Loss: 0.5092752, Accuracy: 0.78125
Loss: 0.4485217, Accuracy: 0.71875
Loss: 0.31891423, Accuracy: 0.84375
Loss: 0.509223, Accuracy: 0.71875
Loss: 0.4236986, Accuracy: 0.78125
Loss: 0.47360513, Accuracy: 0.75
Loss: 0.33595175, Accuracy: 0

KeyboardInterrupt: 

In [14]:
# TF variables
init = tf.global_variables_initializer()
saver = tf.train.Saver()

with tf.Session() as sess:
    sess.run(init)
    saver.restore(sess, "./checkpoints/model.ckpt") 
    
    avg_accuracy, avg_loss, pred = sess.run([accuracy, loss, predictions], feed_dict={
                                            tensor_X: X_val_mapped,
                                            tensor_Y: Y_val_mapped })

INFO:tensorflow:Restoring parameters from ./checkpoints/model.ckpt


In [15]:
print(Y_val_mapped[:50])
print(pred[:50])

[1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 1 0 0 1 1 0 1
 1 1 1 1 1 1 1 1 1 0 1 1 1]
[0 0 1 1 1 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 1 1 1 1
 1 1 1 1 0 0 0 0 1 1 0 1 0]


In [16]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

print("Accuracy :" + str(accuracy_score(Y_val_mapped, pred)))
print("Precision :" + str(precision_score(Y_val_mapped, pred)))
print("Recall :" + str(recall_score(Y_val_mapped, pred)))
print("AUC :" + str(roc_auc_score(Y_val_mapped, pred)))

Accuracy :0.6403012116408108
Precision :0.2791090111373608
Recall :0.16395717684377478
AUC :0.49733759935864447
