# Deep learning Gray2018 window approach

Here I implement the network used to process the MSA obtained from the filtered output of hhblits. This implementation uses the network that extracts the secondary structure and relative solvent accessibility from an MSA, and then further processes its output with additional layers. The approach is based on a sliding window.

## Obtaining the sliding windows

Here I put the sliding windows for each protein and for each position in a dictionary for easy retrieval.

In [1]:
import joblib
import numpy as np


in_msa_path = '../processing/gray2018/deep_learning/msa_vectors/'
basename_list = '../processing/gray2018/input_list.txt'
out_sliding_windows_path = '../processing/gray2018/deep_learning/sliding_windows_unmapped.joblib.xz'
window_size = 31
msa_depth = 500


def build_sliding_windows(msa_vec, window_size, msa_depth):
    """Return one zero-padded window per column of ``msa_vec``.

    Args:
        msa_vec: 2D array indexed as [msa row, column]. Rows beyond
            ``msa_depth`` are ignored.
        window_size: number of columns in each window; must be odd so that
            the window is centred on a column.
        msa_depth: number of rows of every window; shallower inputs are
            padded with zeros at the bottom.

    Returns:
        Array of shape (number of columns, msa_depth, window_size) in which
        element ``i`` is the window centred on column ``i``.

    Raises:
        ValueError: if ``window_size`` is even.
    """
    if window_size % 2 == 0:
        raise ValueError('window_size must be odd, got ' + str(window_size))
    half_window = (window_size - 1) // 2
    msa_vec = msa_vec[:msa_depth]
    n_rows, n_columns = msa_vec.shape
    # vertical padding in case there are not enough sequences in the msa
    pad_vertical = msa_depth - n_rows
    # Pad the whole matrix once instead of once per column: every window is
    # then a plain slice of the padded matrix.
    # 0 is a special padding value for the keras embedding layer
    padded = np.pad(msa_vec, ((0, pad_vertical), (half_window, half_window)),
                    mode='constant', constant_values=0)
    windows = np.zeros((n_columns, msa_depth, window_size), dtype=padded.dtype)
    for i in range(n_columns):
        windows[i] = padded[:, i:i + window_size]
    return windows


sliding_windows = {}
with open(basename_list) as handle:
    for line in handle:
        basename = line.rstrip()
        if not basename:
            # skip blank lines (e.g. a trailing newline at the end of the list)
            continue
        msa_vec = joblib.load(in_msa_path + basename + '.npy.joblib.xz')[:msa_depth]
        sliding_windows[basename] = build_sliding_windows(msa_vec, window_size, msa_depth)
        assert sliding_windows[basename].shape == (len(msa_vec.T), msa_depth, window_size)

joblib.dump(sliding_windows, out_sliding_windows_path)

['../processing/gray2018/deep_learning/sliding_windows_unmapped.joblib.xz']

## Purge the dataset from wrong mappings

I cross-check that aa1 matches the UniProt sequence at the reported position, and I remove the entries with a wrong mapping.

Noticeable points:
- P62593 has some positions with a wrong mapping (most probably shifted by 2)
- P06654 consistently has a Q in position 228, while UniProt reports a T. The MSA also consistently reports T

In [4]:
import pandas as pd
import numpy as np
import joblib
from Bio import SeqIO


df = pd.read_csv('../dataset/gray2018/dmsTraining_2017-02-20.csv')

# Some of the studies in the training set were excluded from training in the original paper.
# NOTE(review): an earlier comment mentioned temporarily removing beta lactamase as well,
# but no such study appears in the list below - confirm which behaviour is intended.
excluded_studies = ['Brca1_E3', 'Brca1_Y2H', 'E3_ligase']
df = df[~df['dms_id'].isin(excluded_studies)]

# obtain a dictionary with the uniprot sequences for each protein
# (only the first record of each fasta file is used)
sequences = {}
for basename in set(df.uniprot_id):
    fasta_path = '../processing/gray2018/deep_learning/sequences/' + basename + '.fasta'
    sequences[basename] = str(list(SeqIO.parse(fasta_path, 'fasta'))[0].seq)

len_before = len(df)

# Remove inconsistencies.
# Positions are 1-based: both bounds are checked, because a position lower than 1
# would turn into a negative string index and be compared with a residue counted
# from the end of the sequence.
index_in_range = pd.Series(
    [1 <= position <= len(sequences[uniprot_id])
     for uniprot_id, position in zip(df.uniprot_id, df.position)],
    index=df.index, dtype=bool)
df = df[index_in_range]
correct_aa1 = pd.Series(
    [aa1 == sequences[uniprot_id][position - 1]
     for uniprot_id, position, aa1 in zip(df.uniprot_id, df.position, df.aa1)],
    index=df.index, dtype=bool)
df = df[correct_aa1]

# check that there are no more inconsistencies: every (protein, position) pair
# must report a single wild-type residue, equal to the one in the uniprot sequence
for (protein, position), curr_df in df.groupby(['uniprot_id', 'position']):
    aa1_set = set(curr_df.aa1)
    assert len(aa1_set) == 1
    assert aa1_set.pop() == sequences[protein][position - 1]

# print how many entries have been removed (the false values)
len_after = len(df)
print('Removed entries for wrong mapping:', len_before - len_after)

# create the target, protein, and dataset vectors for the edited set (not the x vector since I need to map the sliding windows)
joblib.dump(np.array(df.scaled_effect1), '../processing/gray2018/deep_learning/target_fitness.npy.joblib.xz')
joblib.dump(np.array(df.protein), '../processing/gray2018/deep_learning/protein.npy.joblib.xz')
joblib.dump(np.array(df.dms_id), '../processing/gray2018/deep_learning/dms_ids.npy.joblib.xz')

Removed entries for wrong mapping: 58


['../processing/gray2018/deep_learning/dms_ids.npy.joblib.xz']

## Map the sliding windows to the dataset

I create an array with the correct sliding window for each dataset position.

In [3]:
import joblib
import numpy as np


def map_windows(sliding_windows, df):
    """Return the sliding window of every row of ``df``, in row order.

    Args:
        sliding_windows: mapping from uniprot id to an indexable collection of
            windows, where index ``i`` holds the window of position ``i + 1``.
        df: DataFrame with a 'uniprot_id' column and a 1-based 'position'
            column.

    Returns:
        Array built from the selected windows; its first axis has one entry
        per row of ``df``.

    Raises:
        IndexError: if a position is lower than 1 (it would otherwise be
            silently resolved as a negative index) or exceeds the number of
            windows available for the protein.
    """
    windows = []
    # Iterate over the two needed columns directly: iterrows builds a Series
    # for every row and can upcast the integer position to float.
    for uniprot_id, position in zip(df['uniprot_id'], df['position']):
        if position < 1:
            raise IndexError('position must be >= 1, got ' + str(position)
                             + ' for ' + str(uniprot_id))
        # I remove 1 to position since the index starts from 0 but the position from 1
        windows.append(sliding_windows[uniprot_id][position - 1])
    windows_vec = np.array(windows)
    assert windows_vec.shape[0] == len(df)
    return windows_vec


# load the per-protein windows, pick the one of each dataset row, and store the result
unmapped_windows_path = '../processing/gray2018/deep_learning/sliding_windows_unmapped.joblib.xz'
mapped_windows_path = '../processing/gray2018/deep_learning/sliding_windows_mapped.npy.joblib.xz'
sliding_windows = joblib.load(unmapped_windows_path)
joblib.dump(map_windows(sliding_windows, df), mapped_windows_path)

['../processing/gray2018/deep_learning/sliding_windows_mapped.npy.joblib.xz']

## Create and train the deep learning network

In [None]:
import tensorflow as tf
import tensorflow_probability as tfp
import numpy as np
from scipy import stats
import joblib
import datetime
import os
from sklearn.model_selection import LeaveOneGroupOut
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt

# number of columns of each model input (second axis of the Input shape in create_model)
window_size = 31
# number of MSA rows of each model input (first axis of the Input shape in create_model)
msa_depth = 500
# output size of the embedding layer; also used as the number of Conv2D filters in create_model
embedding_dim = 14

def create_model(window_size=window_size, msa_depth=msa_depth, embedding_dim=embedding_dim):
    """Build and compile the sliding-window regression network.

    The network embeds every symbol of the input window, convolves and
    max-pools along the MSA-depth axis, runs two bidirectional LSTM layers
    along the window axis, and ends with dense layers producing one value.

    Args:
        window_size: number of columns of the input window.
        msa_depth: number of MSA rows of the input window.
        embedding_dim: size of the embedding of each symbol; also used as the
            number of convolution filters.

    Returns:
        A compiled ``tf.keras.Model`` taking inputs of shape
        (batch, msa_depth, window_size) and returning shape (batch, 1),
        trained with RMSprop on the mean squared error.
    """
    pool_size = 20
    pooled_depth = msa_depth // pool_size

    inputs = tf.keras.Input(shape=(msa_depth, window_size))
    x = tf.keras.layers.Flatten()(inputs)
    x = tf.keras.layers.Embedding(input_dim=26, output_dim=embedding_dim, mask_zero=True)(x)
    # Flatten is row-major over (msa_depth, window_size), so the embedded
    # sequence has to be reshaped back to that same layout first. Reshaping
    # directly to (window_size, msa_depth, ...) would reinterpret the buffer
    # and mix MSA rows with window columns instead of transposing them.
    x = tf.keras.layers.Reshape((msa_depth, window_size, embedding_dim))(x)
    # swap the axes so that the window is the first axis and the MSA depth the second one
    x = tf.keras.layers.Permute((2, 1, 3))(x)
    # convolution and pooling both act along the MSA-depth axis only
    x = tf.keras.layers.Conv2D(embedding_dim, [1, 10], activation='relu', padding='same', data_format='channels_last')(x)
    x = tf.keras.layers.MaxPooling2D(pool_size=[1, pool_size], data_format='channels_last')(x)
    # one feature vector per window column, used as the LSTM time steps
    x = tf.keras.layers.Reshape((window_size, embedding_dim * pooled_depth))(x)

    for _ in range(2):
        lstm_layer = tf.keras.layers.LSTM(embedding_dim * pooled_depth, return_sequences=True, recurrent_activation='hard_sigmoid')
        x = tf.keras.layers.Bidirectional(lstm_layer, merge_mode='ave')(x)
        x = tf.keras.layers.Dropout(0.5)(x)

    x = tf.keras.layers.Flatten(data_format='channels_last')(x)
    x = tf.keras.layers.Dense(50, activation='relu')(x)
    x = tf.keras.layers.Dropout(0.5)(x)
    x = tf.keras.layers.Dense(20, activation='relu')(x)
    x = tf.keras.layers.Dropout(0.5)(x)
    outputs = tf.keras.layers.Dense(1, activation='linear')(x)

    model = tf.keras.Model(inputs=inputs, outputs=outputs, name='window_dms_predictor')
    optimizer = tf.keras.optimizers.RMSprop()
    loss = tf.keras.losses.mean_squared_error
    model.compile(optimizer=optimizer, loss=loss)
    return model

# load the data vectors
x = joblib.load('../processing/gray2018/deep_learning/sliding_windows_mapped.npy.joblib.xz')
y = joblib.load('../processing/gray2018/deep_learning/target_fitness.npy.joblib.xz')
datasets = joblib.load('../processing/gray2018/deep_learning/dms_ids.npy.joblib.xz')
proteins = joblib.load('../processing/gray2018/deep_learning/protein.npy.joblib.xz')
#np.random.shuffle(y)

# tensorflow logging: root directory of this run, every fold gets its own subdirectory
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

# leave-one-protein-out cross-validation
cv_indexes = LeaveOneGroupOut().split(x, y, proteins)
# result_vectors[protein][dataset] = (true values, predicted values)
result_vectors = {}

for train, val in cv_indexes:
    assert len(set(proteins[val])) == 1
    curr_protein = set(proteins[val]).pop()
    result_vectors[curr_protein] = {}

    # Index the arrays once per fold: every x[train] / x[val] builds a new copy.
    x_train, y_train = x[train], y[train]
    x_val, y_val, datasets_val = x[val], y[val], datasets[val]

    # A separate callback and log directory per fold keeps the folds from
    # writing their logs into the same directory.
    tensorboard_callback = tf.keras.callbacks.TensorBoard(
        os.path.join(logdir, str(curr_protein)), histogram_freq=1)

    model = create_model()
    model.fit(x_train, y_train, epochs=1, validation_data=(x_val, y_val), callbacks=[tensorboard_callback])
    model.save('../models/window_deep_learning/only_msa_no_mutation/tf2.4_keras_model_' + curr_protein)
    # need to flatten since the output has shape (n,1) but I need (n,) for calculating the correlation
    y_pred = model.predict(x_train).flatten()
    print(curr_protein, 'training scores')
    print('Pearson:', stats.pearsonr(y_train, y_pred)[0], '\tSpearman:', stats.spearmanr(y_train, y_pred)[0])
    del y_pred

    for curr_dataset in set(datasets_val):
        in_dataset = datasets_val == curr_dataset
        y_true = y_val[in_dataset]
        y_pred = model.predict(x_val[in_dataset]).flatten()
        result_vectors[curr_protein][curr_dataset] = (y_true, y_pred)
        # the whole dictionary is rewritten after every dataset
        # (presumably so that partial results survive an interruption)
        joblib.dump(result_vectors, '../models/window_deep_learning/only_msa_no_mutation/predictions.joblib.xz')
        print(curr_protein, curr_dataset, 'validation scores')
        print('Pearson:', stats.pearsonr(y_true, y_pred)[0],
              '\tSpearman:', stats.spearmanr(y_true, y_pred)[0])
        plt.close()
        sns.scatterplot(x=y_true, y=y_pred, s=10)
        plt.show()
        del y_pred, y_true, in_dataset

    # release the fold's model and data before building the next one
    del model, x_train, y_train, x_val, y_val, datasets_val
    tf.keras.backend.clear_session()

