# =================== Import ===================

In [8]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Embedding, Bidirectional, LSTM
from keras.preprocessing import sequence
import shap
import matplotlib.pyplot as plt
%matplotlib inline

# =================== Variables ===================

In [9]:
# Destination image written by SHAP_plot().
SHAP_file = '3.DeepLoc_SHAP/Plot_SHAP.png'
# Dataset archive; the model cell reads its 'input', 'label' and 'partition' arrays.
data = np.load('2.CNN_Model/ER_dataset.npz')

# =================== CNN model ===================

In [10]:
# Run the CNN model: train a binary classifier on one cross-validation fold.
# The objects left in the namespace afterwards (model, X_train, X_val, ...)
# are the ones the SHAP cells further down operate on.

# --- Hyperparameters ---
max_protein_length = 100   # residues kept per protein (see padding below)
input_dim = 21             # first argument of the Embedding layer (vocabulary size)
n_filters = 96
kernel_size = 6
maxpooling_size = 2
activation = 'relu'
activation_out = 'sigmoid'
padding = 'same'
optimizer = 'adam'
loss = 'binary_crossentropy'
metrics = ['accuracy']
n_classes = 1              # single sigmoid output unit
epochs = 30
batch_size = 64
verbose = 1

# --- Data ---
X = data['input']
y = data['label']
part = data['partition']
test = 4                   # partition excluded from both training and validation
not_test = [0, 1, 2, 3]    # partitions that take turns as the validation fold

# Last 100aa in proteins.
# pad_sequences defaults to padding='pre' and truncating='pre', so every row
# comes back with exactly `max_protein_length` columns: longer sequences keep
# their tail, shorter ones are zero-padded on the left. This does not depend
# on the fold, so it is done once here rather than inside the loop, and no
# extra `X[:, -max_protein_length:]` slice is needed afterwards.
X = sequence.pad_sequences(X, maxlen=max_protein_length)

performances = []          # [loss, accuracy] of every fold that was trained
# Cross validation loop
for partition in not_test:
    # Train on everything that is neither this fold nor the held-out partition.
    train_indices = np.where((part != partition) & (part != test))
    val_indices = np.where(part == partition)
    X_train, y_train = X[train_indices], y[train_indices]
    X_val, y_val = X[val_indices], y[val_indices]

    # Build model: Embedding -> MaxPooling1D -> Conv1D -> BiLSTM -> Dense -> Dense.
    # NOTE(review): pooling is applied before the convolution; confirm this
    # ordering is intended.
    model = Sequential()
    model.add(Embedding(input_dim, 96))
    model.add(MaxPooling1D(pool_size=maxpooling_size))
    model.add(Conv1D(n_filters, kernel_size, activation=activation, padding=padding))
    model.add(Bidirectional(LSTM(16)))
    model.add(Dense(15, activation=activation))
    model.add(Dense(n_classes, activation=activation_out))
    model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

    # Training model
    model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=epochs,
              batch_size=batch_size, shuffle=True, verbose=verbose)

    # The labels below say "Test", but these scores are computed on the
    # validation fold (X_val, y_val); partition `test` is never evaluated here.
    scores = model.evaluate(X_val, y_val)
    performances.append(scores)
    print('Test loss:', scores[0])
    print('Test accuracy:', scores[1], '\n')

    # Stop after the first fold (partition 0): only one model is trained, and
    # `model`, `X_train` and `X_val` keep the values from that fold.
    break

Train on 1712 samples, validate on 576 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Test loss: 0.056531849517417264
Test accuracy: 0.984375 



# =================== Functions ===================

In [11]:
def SHAP(model, X_train, X_val, n_background=100, n_explain=100):
    """Compute SHAP values for validation samples with shap.DeepExplainer.

    Parameters
    ----------
    model : the trained model handed to shap.DeepExplainer.
    X_train : sliceable collection of training inputs; its last
        `n_background` entries are used as the explainer's background set.
    X_val : sliceable collection of validation inputs; its last `n_explain`
        entries are the samples that get explained.
    n_background : int, default 100
        Number of trailing training samples used as background.
    n_explain : int, default 100
        Number of trailing validation samples to explain.

    Returns
    -------
    Whatever `DeepExplainer.shap_values` returns for the selected samples.

    Raises
    ------
    ValueError
        If `n_background` or `n_explain` is smaller than 1.
    """
    # A count of 0 would turn `X[-0:]` into "the whole array", silently
    # selecting everything instead of nothing, so reject it explicitly.
    if n_background < 1 or n_explain < 1:
        raise ValueError("n_background and n_explain must both be >= 1")

    explainer = shap.DeepExplainer(model, X_train[-n_background:])
    return explainer.shap_values(X_val[-n_explain:])

In [12]:
def SHAP_plot(shap_values, out_file=None):
    """Plot per-position importance from SHAP values and save it as an image.

    Parameters
    ----------
    shap_values : indexable whose first element is an array of SHAP values;
        it is averaged over axis 0 (assumed to be the sample axis — TODO
        confirm against the explainer output).
    out_file : path, optional
        Where to save the figure. Defaults to the module-level `SHAP_file`.

    Returns
    -------
    The matplotlib figure. It has already been saved and closed, so it is not
    rendered again by the pyplot state machine.
    """
    if out_file is None:
        out_file = SHAP_file

    # NOTE(review): this is |mean(SHAP)|, so positive and negative
    # contributions cancel before the absolute value is taken; confirm that
    # mean(|SHAP|) was not intended.
    importance = np.abs(np.mean(shap_values[0], axis=0))

    # Plotting the values. The x-axis length follows the data instead of
    # assuming exactly 100 positions.
    fig_SHAP, ax = plt.subplots(figsize=(20, 10))
    ax.plot(np.arange(importance.shape[0]), importance,
            color='skyblue', linewidth=2.5)
    ax.set_ylabel("Importance of amino acid position", fontsize=15)
    ax.set_xlabel("Amino acid position", fontsize=15)
    fig_SHAP.savefig(out_file, bbox_inches='tight')
    # Close this specific figure rather than whichever one is current.
    plt.close(fig_SHAP)
    return fig_SHAP

# =================== Main ===================

In [16]:
# Explain the validation samples with the model trained in the CNN cell.
shap_values = SHAP(model=model, X_train=X_train, X_val=X_val)

In [17]:
# Draw the per-position importance curve and write it to SHAP_file.
fig_SHAP = SHAP_plot(shap_values=shap_values)