# =================== Import ===================

In [14]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Dropout, GlobalMaxPooling1D, Embedding
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing import sequence
from sklearn.metrics import confusion_matrix
import math

# =================== Variables ===================

In [15]:
data = np.load('2.ER_dataset.npz')

# =================== Functions ===================

In [12]:
def cross_validation(data):
    '''Function to make the Convolutional Neural Network model and cross validate the model'''
    max_protein_length=100
    input_dim=21 
    n_filters=110
    kernel_size=7
    dropout=0.25
    maxpooling_size=2
    n_layers=[0]  
    activation='relu'
    activation_out='sigmoid'
    padding='same'   
    optimizer='adam'
    loss='binary_crossentropy'
    metrics=['accuracy']   
    n_classes=1
    epochs=12
    batch_size=64
    verbose=1
   
    X = data['input']
    y = data['label']
    part = data['partition']
    #length = data['length']
    
    test = 4
    not_test = [0,1,2,3]
    #test_indices = np.where(part == test)

    performances = []
    # Cross validation loop
    for partition in not_test:
        train_indices = np.where((part != partition)&(part != test))
        val_indices = np.where(part == partition)        

        # Last 100aa in proteins
        X = sequence.pad_sequences(X, maxlen=max_protein_length)
        X = X[:,-max_protein_length:]
        X_train = X[train_indices] 
        y_train = y[train_indices]
        X_val = X[val_indices]
        y_val = y[val_indices]
        
        model = Sequential()
        model.add(Embedding(input_dim, n_filters, input_length=max_protein_length)) #21 is the one hot..
        model.add(MaxPooling1D(pool_size=maxpooling_size))
        model.add(Dropout(dropout))
        
        for n in n_layers:
            model.add(Conv1D(n_filters, kernel_size, activation=activation, padding=padding)) 
            #model.add(MaxPooling1D(pool_size=maxpooling_size))
        
        model.add(GlobalMaxPooling1D())
        
        #model.add(Flatten())
        model.add(Dense(n_filters, activation=activation))
        model.add(Dense(n_classes, activation=activation_out))
        model.compile(optimizer=optimizer, loss=loss, metrics=metrics) #add matthews correlation coefficient, as this will be taken into account.

        # Training model
        print(partition + 1, 'Iteration')
        file_model = "model_%i.hdf5" % partition
        mcp_save = ModelCheckpoint(file_model, save_best_only=True, monitor='val_loss', mode='min')
        model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=epochs, batch_size=batch_size, 
                  callbacks=[mcp_save], verbose=verbose)
        
        model.load_weights(filepath = file_model)
        scores = model.evaluate(X_val, y_val)
        print('Test loss:', scores[0])
        print('Test accuracy:', scores[1], '\n')
        # Running the mcc function
        val_mcc = mcc(X_val,y_val, model)
        
        # Appending performances from each iteration
        performances.append(val_mcc)
            
    print('MCC from each iterations = ', performances) 
    kfold_mcc = np.mean(performances)  
    print('Average of MCC = ', kfold_mcc)
    return kfold_mcc



def mcc(X_val,y_val, model):
    '''Function to calculate the Matthews Correlation Coefficient'''
    predictions = model.predict_classes(X_val)
    y_actu = y_val.tolist()
    y_pred = predictions.tolist()
    cm = confusion_matrix(y_pred, y_actu)
    TN = cm[0][0]
    FP = cm[0][1]
    FN = cm[1][0]
    TP = cm[1][1]
    MCC = (TP*TN-FP*FN)/math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
    print('TP', 'FP', 'FN', 'TN')
    print(TP, FP, FN, TN, '\n')
    print('Confusion Matrix')
    print(cm)
    print ('MCC = ', MCC, '\n')
    return MCC





# =================== Main ===================

In [13]:
kfold_mcc = cross_validation(data)

1 Iteration


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1715 samples, validate on 575 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
Test loss: 0.041421651913532635
Test accuracy: 0.9773913043478261 

TP FP FN TN
72 8 5 490 

Confusion Matrix
[[490   8]
 [  5  72]]
MCC =  0.9043327789599662 

2 Iteration
Train on 1718 samples, validate on 572 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
Test loss: 0.04156058721514614
Test accuracy: 0.9877622377622378 

TP FP FN TN
75 2 5 490 

Confusion Matrix
[[490   2]
 [  5  75]]
MCC =  0.9485574624224709 

3 Iteration
Train on 1719 samples, validate on 571 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
Test loss: 0.028069060731307324
Test accuracy: 0.9894921183586121 

TP FP FN TN
74 2 4 491 

Confusion Matrix