# =================== Import ===================

In [1]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Dropout, GlobalMaxPooling1D, Embedding
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing import sequence
from sklearn.metrics import confusion_matrix
import math

  from ._conv import register_converters as _register_converters


# =================== Variables ===================

In [2]:
# Location of the NumPy .npz archive, relative to the notebook's working directory.
# cross_validation() reads the 'input', 'label' and 'partition' arrays from it.
DATASET_PATH = '2.CNN_Model/ER_dataset.npz'
data = np.load(DATASET_PATH)

# =================== Functions ===================

In [3]:
def cross_validation(data, max_protein_length=100, epochs=30, batch_size=64, verbose=1):
    '''Build a 1D-CNN binary classifier and cross validate it over four folds.

    For each validation partition (0-3) a fresh model is trained on the
    remaining non-test partitions, the checkpoint with the lowest validation
    loss is restored, and accuracy and MCC are computed on the validation fold.
    Partition 4 is held out as the test set and is never touched here.

    Parameters:
        data: mapping (e.g. the object returned by np.load) providing the
            arrays 'input', 'label' and 'partition'.
        max_protein_length: number of trailing sequence positions kept per sample.
        epochs: number of training epochs per fold.
        batch_size: mini-batch size used for training.
        verbose: verbosity level forwarded to model.fit.

    Returns:
        The mean Matthews Correlation Coefficient over the validation folds.

    Side effects:
        Writes one checkpoint file per fold to 2.CNN_Model/model_<fold>.hdf5
        and prints the model summary and per-fold metrics.
    '''
    # Model hyper-parameters
    input_dim = 21  # Embedding vocabulary size (original note: "One Hot Encoding");
                    # presumably 20 amino acids + padding index 0 -- TODO confirm
    n_filters = 110
    n_filters_out = 96
    kernel_size = 6
    maxpooling_size = 2
    n_layers = [0]  # one Conv1D layer is added per entry in this list
    activation = 'relu'
    activation_out = 'sigmoid'
    padding = 'same'
    optimizer = 'adam'
    loss = 'binary_crossentropy'
    metrics = ['accuracy']
    n_classes = 1

    y = data['label']
    part = data['partition']

    # Keep the last `max_protein_length` residues of every sequence.
    # pad_sequences pads AND truncates at the front by default, so a single
    # call is enough; it does not depend on the fold, hence it is done once
    # here instead of on every loop iteration.
    X = sequence.pad_sequences(data['input'], maxlen=max_protein_length)

    # The data set is partitioned in 5: the fifth partition is the test set
    # and the other 4 take turns as the validation fold.
    test = 4
    not_test = [0,1,2,3]

    performances = []
    accscores = []

    # Cross validation loop
    for partition in not_test:
        train_indices = np.where((part != partition)&(part != test))
        val_indices = np.where(part == partition)

        X_train = X[train_indices]
        X_val = X[val_indices]
        y_train = y[train_indices]
        y_val = y[val_indices]

        # Building model
        model = Sequential()
        model.add(Embedding(input_dim, n_filters, input_length=max_protein_length))
        model.add(MaxPooling1D(pool_size=maxpooling_size))
        for _ in n_layers:
            model.add(Conv1D(n_filters, kernel_size, activation=activation, padding=padding))
        model.add(GlobalMaxPooling1D())
        model.add(Dense(n_filters_out, activation=activation))
        model.add(Dense(n_classes, activation=activation_out))
        # Summarize model (summary() prints by itself and returns None,
        # so it must not be wrapped in print())
        model.summary()
        # Compiling model
        model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
        # Training model
        print(partition + 1, 'Iteration')
        file_model = "2.CNN_Model/model_%i.hdf5" % partition
        mcp_save = ModelCheckpoint(file_model, save_best_only=True, monitor='val_loss', mode='min')
        model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=epochs, batch_size=batch_size,
                  callbacks=[mcp_save], verbose=verbose)
        # Restore the best checkpoint (lowest val_loss) before evaluating
        model.load_weights(filepath = file_model)
        # Evaluating model.
        # NOTE: despite the "Test" labels below, these are metrics on the
        # validation fold; the held-out test partition is not used here.
        scores = model.evaluate(X_val, y_val)
        print('Test loss:', scores[0])
        print('Test accuracy:', scores[1], '\n')
        # Appending accuracy from each iteration
        accscores.append(scores[1])
        # Running the "mcc function" and appending MCC from each iteration
        val_mcc = mcc(X_val,y_val, model)
        performances.append(val_mcc)

    print('Accuracy from each iterations = ', accscores)
    acc_scores = np.mean(accscores)
    print('Average accuracy = ', acc_scores)

    print('MCC from each iterations = ', performances)
    kfold_mcc = np.mean(performances)
    print('Average MCC = ', kfold_mcc)
    return kfold_mcc

In [None]:
def mcc(X_val,y_val, model):
    '''Predict on X_val and compute the Matthews Correlation Coefficient.

    Parameters:
        X_val: model inputs to predict on.
        y_val: array-like of true binary labels (0/1) for X_val.
        model: object exposing predict(X) that returns one probability
            per sample (e.g. a Keras model with a single sigmoid output).

    Returns:
        The MCC as a float. When the MCC is undefined (a row or column of the
        confusion matrix sums to zero, e.g. only one class is predicted)
        0.0 is returned instead of dividing by zero.

    Side effects:
        Prints the TP/FP/FN/TN counts, the confusion matrix and the MCC.
    '''
    # Sequential.predict_classes no longer exists in current Keras; for a
    # single sigmoid output it was equivalent to thresholding at 0.5.
    probabilities = np.asarray(model.predict(X_val))
    y_pred = (probabilities > 0.5).astype('int32').ravel()
    y_actu = np.asarray(y_val).ravel()

    # sklearn expects (y_true, y_pred): rows are true labels, columns are
    # predictions. labels=[0, 1] forces a 2x2 matrix even when one class is
    # missing from the fold.
    cm = confusion_matrix(y_actu, y_pred, labels=[0, 1])
    # Plain Python ints so the four-factor product below cannot overflow
    TN, FP, FN, TP = (int(count) for count in cm.ravel())

    denominator = math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
    MCC = (TP*TN-FP*FN)/denominator if denominator else 0.0
    print('TP', 'FP', 'FN', 'TN')
    print(TP, FP, FN, TN, '\n')
    print('Confusion Matrix')
    print(cm)
    print ('MCC = ', MCC, '\n')
    return MCC

# =================== Main ===================

In [None]:
# Train one model per validation fold and keep the mean validation MCC
# returned by cross_validation (this also writes the per-fold checkpoints).
kfold_mcc = cross_validation(data)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 110)          2310      
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 50, 110)           0         
_________________________________________________________________
conv1d (Conv1D)              (None, 50, 110)           72710     
_________________________________________________________________
global_max_pooling1d (Global (None, 110)               0         
_________________________________________________________________
dense (Dense)                (None, 96)                10656     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 97        
Total params: 85,773
Trainable params: 85,773
Non-trainable params: 0
_________________________________________________________________
None
1

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1712 samples, validate on 576 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Test loss: 0.028494358460569913
Test accuracy: 0.9895833333333334 

TP FP FN TN
78 3 3 492 

Confusion Matrix
[[492   3]
 [  3  78]]
MCC =  0.9569023569023569 

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 110)          2310      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 50, 110)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 50, 110) 