In [1]:
import tensorflow as tf
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

import src.dictionary as dictionary
import src.processing as processing
import src.network as network
import pickle

# Update python files
%load_ext autoreload
%autoreload 2

  from ._conv import register_converters as _register_converters


In [2]:
# Parameters
n_grams = 3                        # n-gram size handed to processing.create_input
sliding_window_size = 15           # window size handed to processing.create_input
embedding_size = 256               # embedding size handed to network.create_network
use_pretrained_embeddings = False  # when False, the loaded embeddings are discarded before the graph is built
reg_alpha = 1e-4                   # weight of the L2 penalty added to the loss
lr = 1e-3                          # learning rate given to the Adam optimizer
dropout_prob = 0.6                 # value fed to the `keep_prob` placeholder while training
batch_size = 64                    # number of samples per training mini-batch
epochs = 500                       # upper bound on epochs; training can stop earlier

# Both sizes are required to be odd
assert all(size % 2 == 1 for size in (n_grams, sliding_window_size))

# Save parameters, so that they are available when the model is restored for testing
parameters = dict(
    n_grams=n_grams,
    sliding_window_size=sliding_window_size,
    embedding_size=embedding_size,
)

In [3]:
# Prepare dataset for prediction: empty containers that the file parser below fills
protein_names = []
sequences = []
labels = []


def convert_label(label_string):
    '''
    Convert a single label character into its integer class.

    Labels:
        "+" stands for "binding protein" => 1
        "-" stands for "non-binding" => 0

    Any other value raises ValueError. Returning None instead would leave a
    label that is neither 0 nor 1 in the dataset and silently corrupt the
    class counts and metrics computed later.
    '''
    if label_string == "+":
        return 1
    if label_string == "-":
        return 0
    raise ValueError(
        "Unexpected label character {!r}; expected '+' or '-'".format(label_string)
    )

    
# Open file containing dataset
with open('./dataset/ppi_data.fasta') as f:
    lines = f.read().splitlines()

# The file is consumed in groups of three consecutive lines:
#   1st line -> protein name, 2nd line -> sequence, 3rd line -> label string
protein_names.extend(lines[0::3])
sequences.extend(lines[1::3])
labels.extend([convert_label(letter) for letter in label_line] for label_line in lines[2::3])

protein_names = np.array(protein_names)
sequences = np.array(sequences)

# The per-protein label lists normally have different lengths. NumPy >= 1.24
# raises ValueError for such ragged input unless dtype=object is requested,
# while older versions silently built an object array. Requesting dtype=object
# only in the ragged case keeps the result identical on every NumPy version.
labels_are_ragged = len({len(per_residue) for per_residue in labels}) > 1
labels = np.array(labels, dtype=object if labels_are_ragged else None)

# Every protein must have a name, a sequence and a label line
assert(protein_names.shape[0] == sequences.shape[0] == labels.shape[0])

print(protein_names[0])
print(sequences[0])
print(labels[0])

>P0A8Q6
MGKTNDWLDFDQLAEEKVRDALKPPSMYKVILVNDDYTPMEFVIDVLQKFFSYDVERATQLMLAVHYQGKAICGVFTAEVAETKVAMVNKYARENEHPLLCTLEKA
[0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]


In [4]:
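# Debug aid: uncomment the two lines below to run the rest of the notebook on the first 24 proteins only
#sequences = sequences[:24]
#labels = labels[:24]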

In [5]:
# Fraction of the proteins used for training; the remainder is held out for validation
split_percentage = 0.8

# Total number of proteins and how many of them go into the training dataset
total_dataset = len(sequences)
train_dataset = int(total_dataset * split_percentage)

# Shuffle the protein positions with a fixed seed so the split is reproducible
np.random.seed(97)
indices = list(range(total_dataset))
np.random.shuffle(indices)
train_indices, val_indices = indices[:train_dataset], indices[train_dataset:]

# Train dataset
sequences_train, labels_train = sequences[train_indices], labels[train_indices]

# Validation dataset
sequences_val, labels_val = sequences[val_indices], labels[val_indices]

# Shapes
print("Training samples: " + str(len(sequences_train)))
print("Validation samples: " + str(len(sequences_val)) + "\n")

# Reset seed for randomness (no fixed value, so later draws are not tied to 97)
np.random.seed()

# Sample
print(sequences_train[0])
print(labels_train[0])

Training samples: 432
Validation samples: 108

MPQSFTSIARIGDYILKSPVLSKLCVPVANQFINLAGYKKLGLKFDDLIAEENPIMQTALRRLPEDESYARAYRIIRAHQTELTHHLLPRNEWIKAQEDVPYLLPYILEAEAAAKEKDELDNIEVSK
[0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1]


In [6]:
# Build the model inputs for both splits with the same window / n-gram settings.
# NOTE(review): judging by the printed sample, each row of X is one window of
# n-grams and each Y entry a single 0/1 label - confirm in processing.create_input.
window_settings = (sliding_window_size, n_grams)
X_train, Y_train = processing.create_input(sequences_train, labels_train, *window_settings)
X_val_proc, Y_val_proc = processing.create_input(sequences_val, labels_val, *window_settings)

for split_inputs in (X_train, X_val_proc):
    print(split_inputs.shape)

print(X_train[0])
print(Y_train[0])

# Save dump for parameters, including the second dimension of the training inputs
parameters.update(timestamps=X_train.shape[1])
with open('./dumps/parameters.pickle', 'wb') as parameters_file:
    pickle.dump(parameters, parameters_file, protocol=pickle.HIGHEST_PROTOCOL)

(69953, 13)
(15268, 13)
['MPQ' 'PQS' 'QSF' 'SFT' 'FTS' 'TSI' 'SIA' 'IAR' 'ARI' 'RIG' 'IGD' 'GDY'
 'DYI']
1


In [7]:
# Find indices with positive and negative labels.
# Vectorised lookup: yields the same ascending positions as scanning Y_train
# element by element, and always produces integer index arrays.
train_labels = np.asarray(Y_train)
X_train_pos = np.flatnonzero(train_labels == 1)
X_train_neg = np.flatnonzero(train_labels == 0)

# Print how many positive and negative labels => I want same number of labels for each class during training
print("Positive: " + str(X_train_pos.shape[0]) + " | Negative: " + str(X_train_neg.shape[0]))

# Draw as many positive samples as there are negatives, with replacement and a
# fixed seed; a given positive sample can therefore appear several times or not at all
np.random.seed(97)
X_train_pos_indices = np.random.choice(len(X_train_pos), len(X_train_neg), replace=True)
X_train_selected = X_train_pos[X_train_pos_indices]

# Final X_train data: resampled positives first, then every negative
X_train = np.concatenate((X_train[X_train_selected], X_train[X_train_neg]), axis=0)
Y_train = np.concatenate((Y_train[X_train_selected], Y_train[X_train_neg]), axis=0)

# Check that labels 1 and 0 are equal
assert np.count_nonzero(Y_train == 1) == np.count_nonzero(Y_train == 0)

print("Final training data shape: " + str(X_train.shape))

Positive: 19105 | Negative: 50848
Final training data shape: (101696, 13)


In [8]:
# Create vocabulary of n-grams from the (balanced) training inputs
vocab = dictionary.LanguageDictionary(X_train)

# Load embeddings trained with Protovec for the configured n-gram size
embeddings_path = "./train_embeddings/models/ngram-" + str(n_grams) + ".model"
trained_embeddings = processing.loadEmbeddings(vocab, embeddings_path)

# Both vocabulary mappings and the embedding list must have the same number of entries
embedding_words = len(trained_embeddings)
assert len(vocab.index_to_word) == len(vocab.word_to_index) == embedding_words
print("Embedding layer words: " + str(embedding_words))

# Save vocabulary locally with pickle dump
with open('./dumps/vocab.pickle', 'wb') as vocab_file:
    pickle.dump(vocab, vocab_file, protocol=pickle.HIGHEST_PROTOCOL)

Embedding layer words: 7707


In [9]:
# Map grams to indices for the embedding matrix (training windows are all kept)
X_train = np.array([vocab.text_to_indices(window) for window in X_train])
print(X_train.shape)

# Prepare validation data: a window is dropped, together with its label, when
# the vocabulary returns None for at least one of its grams
X_val, Y_val = [], []
for position, window in enumerate(X_val_proc):
    mapped_window = vocab.text_to_indices(window)
    if None in mapped_window:
        continue
    X_val.append(mapped_window)
    Y_val.append(Y_val_proc[position])

X_val, Y_val = np.array(X_val), np.array(Y_val)

assert len(X_val) == len(Y_val)
print(X_val.shape)

(101696, 13)
(14008, 13)


In [10]:
# Start from an empty default graph so re-running this cell does not stack duplicate ops
tf.reset_default_graph()

# Placeholders (explicitly named 'inputs', 'output' and 'dropout_input' in the graph).
# The shape of the last two is left unspecified: `(None)` is simply `None`.
tensor_X = tf.placeholder(dtype=tf.int32, shape=(None, X_train.shape[1]), name='inputs')
tensor_Y = tf.placeholder(dtype=tf.int32, shape=None, name='output')

keep_prob = tf.placeholder(dtype=tf.float32, shape=None, name='dropout_input')

# Create graph for the network: pretrained vectors are only passed on when
# enabled, and must then match the configured embedding size
if not use_pretrained_embeddings:
    trained_embeddings = None
else:
    assert len(trained_embeddings[0]) == embedding_size

logits = network.create_network(tensor_X, tensor_Y, keep_prob, vocab, embedding_size, trained_embeddings)

In [11]:
# Cross entropy loss after softmax of logits.
# The "sparse" variant takes the integer labels in tensor_Y directly;
# `meaned` is the average of the per-sample losses over the fed batch.
ce = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=tensor_Y)
meaned = tf.reduce_mean(ce)

# L2 penalty summed over every trainable variable in the graph and scaled by reg_alpha
trainable_vars = tf.trainable_variables()
l2_reg = tf.reduce_sum([tf.nn.l2_loss(var) for var in trainable_vars])
loss = meaned + reg_alpha * l2_reg

# Using Adam (Adaptive learning rate + momentum) for the update of the weights of the network.
# Note that `optimizer` holds the op returned by minimize(), i.e. the training step itself.
optimizer = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss)

# Useful tensors:
#   scores      - softmax of the logits
#   predictions - index of the highest score along axis 1, as int32
#   accuracy    - mean of the 0/1 mask marking predictions equal to tensor_Y
# correct_mask is already float after tf.to_float, so the tf.cast below leaves the values unchanged.
scores = tf.nn.softmax(logits)
predictions = tf.to_int32(tf.argmax(scores, axis=1))
correct_mask = tf.to_float(tf.equal(predictions, tensor_Y))
accuracy = tf.reduce_mean(tf.cast(correct_mask, tf.float32), axis=0)

In [12]:
# Training data variables
iterations_training = max((len(X_train) // batch_size), 1)
print("Training iterations per epoch: " + str(iterations_training))

# Validation data variables
max_val_acc = 0
iterations_validation = max((len(X_val) // batch_size), 1)

# A validation pass runs every `validation_every` mini-batches; training stops
# once `early_stopping_patience` consecutive passes brought no improvement
validation_every = 100
early_stopping_patience = 35

# Perform each epoch, shuffle training dataset
indices = list(range(len(X_train)))
consecutive_validation = 0

saver = tf.train.Saver()
with tf.Session() as sess:

    # Initialize variables in the graph
    sess.run(tf.global_variables_initializer())

    # Iterate over epochs
    for i in range(epochs):

        # Shuffle data (with random seed for debug) to not train the network always with the same order.
        # `indices` and the training arrays are permuted in place, so the
        # permutations accumulate from one epoch to the next.
        np.random.seed(97)
        np.random.shuffle(indices)
        X_train = X_train[indices]
        Y_train = Y_train[indices]

        # Vector accumulating accuracy and loss during one epoch
        total_accuracies, total_losses = [], []

        # Iterate over mini-batches
        for j in range(iterations_training):
            start_index = j * batch_size

            # The last batch also takes the leftover samples (fewer than batch_size of them)
            if j == (iterations_training - 1):
                end_index = len(X_train)
            else:
                end_index = (j + 1) * batch_size

            _, avg_accuracy, avg_loss = sess.run([optimizer, accuracy, loss], feed_dict={
                                                        tensor_X: X_train[start_index:end_index],
                                                        tensor_Y: Y_train[start_index:end_index],
                                                        keep_prob: dropout_prob})
            # Add values for this mini-batch iterations
            total_losses.append(avg_loss)
            total_accuracies.append(avg_accuracy)

            # Statistics on validation set (whole set in one run, dropout disabled)
            if (j + 1) % validation_every == 0:
                avg_accuracy, avg_loss, pred = sess.run([accuracy, loss, predictions], feed_dict={
                                                                                    tensor_X: X_val,
                                                                                    tensor_Y: Y_val,
                                                                                    keep_prob: 1.0 })

                # Save model if validation accuracy better
                if avg_accuracy > max_val_acc:
                    max_val_acc = avg_accuracy
                    print("SAVE | Val loss: " + str(avg_loss) + ", accuracy: " + str(avg_accuracy))
                    save_path = saver.save(sess, "./checkpoints/model.ckpt")
                    consecutive_validation = 0
                else:
                    consecutive_validation += 1

        # The patience counter is only inspected once per epoch, after the last mini-batch
        if consecutive_validation >= early_stopping_patience:
            print("Early stopping")
            break

        # Epoch statistics
        #print("Epoch: " + str(i) + ", Loss: " + str(np.mean(np.array(total_losses))) +
        #      ", Acc: " + str(np.mean(np.array(total_accuracies))) + "\n")

Training iterations per epoch: 1589
SAVE | Val loss: 26.927614, accuracy: 0.5357653
SAVE | Val loss: 21.514845, accuracy: 0.53997713
SAVE | Val loss: 17.033527, accuracy: 0.5409052
SAVE | Val loss: 10.658874, accuracy: 0.54575956
SAVE | Val loss: 8.610101, accuracy: 0.559823
SAVE | Val loss: 7.0800824, accuracy: 0.5875928
SAVE | Val loss: 3.9299293, accuracy: 0.6104369
SAVE | Val loss: 2.4537804, accuracy: 0.6121502
SAVE | Val loss: 2.303782, accuracy: 0.62064534
SAVE | Val loss: 1.7967415, accuracy: 0.6226442
SAVE | Val loss: 1.7373092, accuracy: 0.622787
SAVE | Val loss: 1.6823735, accuracy: 0.6425614
SAVE | Val loss: 1.5811579, accuracy: 0.6448458
SAVE | Val loss: 1.9323804, accuracy: 0.6477013
SAVE | Val loss: 1.9318738, accuracy: 0.6484152
SAVE | Val loss: 1.9679573, accuracy: 0.64855796
SAVE | Val loss: 1.8474904, accuracy: 0.6489863
SAVE | Val loss: 1.9610058, accuracy: 0.64927185
SAVE | Val loss: 1.933443, accuracy: 0.6501999
SAVE | Val loss: 1.9238939, accuracy: 0.65969443
Ear

In [13]:
# Saver over the variables of the current graph, used to reload the best checkpoint
saver = tf.train.Saver()

with tf.Session() as sess:
    # restore() assigns every saved variable itself, so running the global
    # initializer first would only compute values that are overwritten at once
    saver.restore(sess, "./checkpoints/model.ckpt")

    # Evaluate the restored model on the whole validation set with dropout disabled
    avg_accuracy, avg_loss, pred = sess.run([accuracy, loss, predictions], feed_dict={
                                            tensor_X: X_val,
                                            tensor_Y: Y_val,
                                            keep_prob: 1.0 })

INFO:tensorflow:Restoring parameters from ./checkpoints/model.ckpt


In [14]:
# Compare the last 150 validation labels with the corresponding predictions
print(Y_val[-150:])
print(pred[-150:])

# Metrics of the restored model on the validation set, printed in this order.
# NOTE(review): `pred` holds hard 0/1 class predictions, so the AUC below is
# computed from thresholded outputs rather than from class scores.
metric_report = (
    ("\nAccuracy :", accuracy_score),
    ("Precision :", precision_score),
    ("Recall :", recall_score),
    ("AUC :", roc_auc_score),
)
for metric_label, metric_fn in metric_report:
    print(metric_label + str(metric_fn(Y_val, pred)))

[1 0 0 0 0 0 0 0 1 1 1 0 1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 0 1 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 1 1 1 1 1
 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 1 0 1 1 0 0 0 1 0
 0 0]
[1 0 0 0 1 0 0 1 1 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 1 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0
 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0
 0 0]

Accuracy :0.6596944603083952
Precision :0.37325227963525837
Recall :0.14114942528735633
AUC :0.5171992725939785
