In [None]:
import os
os.environ['KERAS_BACKEND'] = "torch"
import keras
import keras.backend as K
from keras import layers, Model
from keras.optimizers import Adam


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

import tensorflow as tf

from modules.dataset import DatasetFactory
from modules.encoding import NLFEncoder

from modules.data.loader import SequenceEncodingDataGenerator

In [None]:
# Reload imported modules automatically before each cell runs, so edits to the
# project's own modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Folder holding the SAbDab export; both CSV files use ';' as separator.
dataset_dir = "../data/SAbDab"

# df_match <- data.csv (antibody/antigen pairs), df_seq <- sequences.csv (one row per sequence id).
df_match, df_seq = (
    pd.read_csv(os.path.join(dataset_dir, csv_name), sep=";")
    for csv_name in ("data.csv", "sequences.csv")
)

In [None]:
# Preview the first rows of the pairing table.
df_match.head()

### Filter 

Keep only the pairs for which both the antibody (`ab`) and the antigen (`ag`) have an entry in the sequence table.

In [None]:
# Keep a pair only when both of its ids are listed in df_seq["seq_id"].
df_filtered = df_match.loc[
    lambda pairs: pairs["ab"].isin(df_seq["seq_id"]) & pairs["ag"].isin(df_seq["seq_id"])
]
# Number of pairs (rows) that survive the filter.
df_filtered.shape

In [None]:
# Shape of the unfiltered pairing table, for comparison with df_filtered.shape.
df_match.shape

## Encoding

### NLF

We encode every sequence once up front, then look up the encoding of each sequence when the antibody–antigen pairs are assembled.

In [73]:
# Project encoder from modules.encoding; presumably implements the NLF
# amino-acid encoding named in the section title — confirm in the module.
encoder = NLFEncoder()

In [75]:
# Raw sequences, in df_seq row order.
seq = df_seq["sequence"].values
# vector_size=2000 is forwarded to the project encoder; presumably the fixed
# length every sequence is padded/truncated to — TODO confirm in NLFEncoder.encode.
encoded_seq = encoder.encode(seq, vector_size=2000)

In [76]:
# Lookup table: sequence id -> its encoding. Pairs ids and encodings positionally,
# so it assumes encoded_seq follows df_seq row order.
seq_id_to_nlf = dict(zip(df_seq["seq_id"], encoded_seq))

In [77]:
# Features are the (antibody id, antigen id) pairs; the target is the interaction label.
X = df_filtered[["ab", "ag"]] 
y = df_filtered[["interaction"]]

# 80/20 split with a fixed seed for reproducibility.
# NOTE(review): rows are split per pair, so the same sequence id can end up in
# both the train and the test pairs.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [78]:
# Project batch generators (modules.data.loader). Presumably they resolve each
# (ab, ag) id pair to its encodings through seq_id_to_nlf and yield batches of
# 32 pairs — confirm in SequenceEncodingDataGenerator.
train_ds_loader = SequenceEncodingDataGenerator(x_train, y_train, seq_id_to_nlf, batch_size=32)
test_ds_loader = SequenceEncodingDataGenerator(x_test, y_test, seq_id_to_nlf, batch_size=32)

In [80]:
# Pull one batch so the model's input shape can be derived from real data.
example_batch = train_ds_loader[0]
# NOTE(review): presumably example_batch is (inputs, targets) and
# example_batch[0][0] is the first input array. If that array is a whole batch,
# .shape still includes the batch axis, whereas layers.Input(shape=...) expects
# the per-sample shape. The recorded fit error (Conv1D expects ndim=3, received
# (None, None)) points at a shape mismatch here — confirm against
# SequenceEncodingDataGenerator.__getitem__.
input_dimensions = example_batch[0][0].shape

## Siamese network

In [81]:
# Two symbolic inputs with the same per-branch shape, one per sequence of a pair.
# NOTE(review): the inputs are named 'seq_ag' then 'seq_ab', while the feature
# frame is built with columns ["ab", "ag"] — confirm that the order in which the
# generator yields its inputs matches these names.
seq_input1 = layers.Input(shape=input_dimensions, name='seq_ag')
seq_input2 = layers.Input(shape=input_dimensions, name='seq_ab')

In [82]:
# Convolutional modules
# The layers are instantiated once here and reused by siamese_propagation for
# both branches, so the two branches share the same weights.
filters = 16


conv01 = layers.Conv1D(filters, 11, padding='same', activation="relu")
mp1 = layers.MaxPooling1D(3)
conv02 = layers.Conv1D(filters*2, 7, padding='same', activation="relu")
mp2 = layers.MaxPooling1D(3)
# conv03/mp3 and conv04/mp4 are created but currently unused: the only
# references to them in siamese_propagation are commented out.
conv03 = layers.Conv1D(filters*4, 3, padding='same', activation="relu")
mp3 = layers.MaxPooling1D(3)
conv04 = layers.Conv1D(filters*2, 3, padding='same', activation="relu")
mp4 = layers.MaxPooling1D(3)

# Bidirectional GRU; return_sequences=False means one output vector per sequence
# rather than one per time step.
gru = layers.Bidirectional(layers.GRU(filters, return_sequences=False))

def siamese_propagation(x):
    """Pass one branch input through the shared conv/pool stack, then the GRU.

    Applies conv01 -> mp1 -> conv02 -> mp2 in that order and returns the output
    of the shared bidirectional GRU. All layers are the module-level instances,
    so every call reuses the same weights.
    """
    # conv03/mp3 and conv04/mp4 exist at module level but are not part of the stack.
    shared_stack = (conv01, mp1, conv02, mp2)
    for shared_layer in shared_stack:
        x = shared_layer(x)
    return gru(x)



In [83]:
def forward(left, right):
    """Build the siamese head on top of the two branch inputs.

    Both inputs go through siamese_propagation (shared weights), the two branch
    outputs are multiplied element-wise, dropout (rate 0.2) is applied, and a
    single sigmoid unit produces the interaction score.
    """
    branch_outputs = [siamese_propagation(branch) for branch in (left, right)]
    merged = layers.multiply(branch_outputs)

    # An intermediate Dense(filters*2, activation='relu') projection on the
    # merged vector is currently disabled.
    regularised = layers.Dropout(0.2)(merged)
    classifier = layers.Dense(1, activation='sigmoid')
    return classifier(regularised)

In [84]:
def f1(y_true, y_pred):
    """Batch-wise F1 score for binary labels and sigmoid predictions.

    Predictions are rounded to the nearest integer to obtain hard 0/1 decisions.
    Uses keras.ops so that it runs on any Keras 3 backend (the notebook selects
    torch); the Keras 3 keras.backend module no longer provides sum/round/clip.

    Args:
        y_true: tensor of ground-truth labels in [0, 1].
        y_pred: tensor of predicted probabilities, same shape as y_true.

    Returns:
        Scalar tensor 2 * precision * recall / (precision + recall). Epsilon
        terms keep it at 0 instead of NaN when there are no positives.
    """
    ops = keras.ops
    eps = keras.backend.epsilon()

    tp = ops.sum(ops.round(ops.clip(y_true * y_pred, 0, 1)))
    possible_positives = ops.sum(ops.round(ops.clip(y_true, 0, 1)))
    pred_pos = ops.sum(ops.round(ops.clip(y_pred, 0, 1)))

    precision = tp / (pred_pos + eps)
    recall = tp / (possible_positives + eps)
    return 2 * (precision * recall) / (precision + recall + eps)

def mcc(y_true, y_pred):
    """Batch-wise Matthews correlation coefficient for binary classification.

    Labels and predictions are clipped to [0, 1] and rounded to hard 0/1 values,
    from which the confusion-matrix counts are derived. Uses keras.ops so that
    it runs on any Keras 3 backend (the notebook selects torch); the Keras 3
    keras.backend module no longer provides sum/round/clip/sqrt.

    Args:
        y_true: tensor of ground-truth labels in [0, 1].
        y_pred: tensor of predicted probabilities, same shape as y_true.

    Returns:
        Scalar tensor (tp*tn - fp*fn) / sqrt((tp+fp)(tp+fn)(tn+fp)(tn+fn)).
        An epsilon in the denominator yields 0 instead of NaN when a row or
        column of the confusion matrix is empty.
    """
    ops = keras.ops

    y_pred_pos = ops.round(ops.clip(y_pred, 0, 1))
    y_pred_neg = 1 - y_pred_pos
    y_pos = ops.round(ops.clip(y_true, 0, 1))
    y_neg = 1 - y_pos

    tp = ops.sum(y_pos * y_pred_pos)
    tn = ops.sum(y_neg * y_pred_neg)
    fp = ops.sum(y_neg * y_pred_pos)
    fn = ops.sum(y_pos * y_pred_neg)

    numerator = (tp * tn - fp * fn)
    denominator = ops.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    return numerator / (denominator + keras.backend.epsilon())

def accuracy(y_true, y_pred):
    """Batch-wise accuracy for binary labels and sigmoid predictions.

    Labels and predictions are clipped to [0, 1] and rounded to hard 0/1 values.
    Uses keras.ops so that it runs on any Keras 3 backend (the notebook selects
    torch); the Keras 3 keras.backend module no longer provides sum/round/clip.

    Args:
        y_true: tensor of ground-truth labels in [0, 1].
        y_pred: tensor of predicted probabilities, same shape as y_true.

    Returns:
        Scalar tensor (tp + tn) / (tp + tn + fp + fn), i.e. the fraction of
        elements whose rounded prediction equals the rounded label.
    """
    ops = keras.ops

    y_pred_pos = ops.round(ops.clip(y_pred, 0, 1))
    y_pred_neg = 1 - y_pred_pos
    y_pos = ops.round(ops.clip(y_true, 0, 1))
    y_neg = 1 - y_pos

    tp = ops.sum(y_pos * y_pred_pos)
    tn = ops.sum(y_neg * y_pred_neg)
    fp = ops.sum(y_neg * y_pred_pos)
    fn = ops.sum(y_pos * y_pred_neg)
    return (tp + tn) / (tp + tn + fp + fn)

In [85]:
def binary_crossentropy(y_true, y_pred):
    """Binary cross-entropy averaged over every element of the batch.

    Uses keras.ops so that it runs on any Keras 3 backend (the notebook selects
    torch); the Keras 3 keras.backend module no longer provides clip/log/mean.

    Args:
        y_true: tensor of ground-truth labels in [0, 1].
        y_pred: tensor of predicted probabilities, same shape as y_true.

    Returns:
        Scalar tensor -mean(y_true * log(p) + (1 - y_true) * log(1 - p)).
    """
    ops = keras.ops
    eps = keras.backend.epsilon()

    # Keep probabilities strictly inside (0, 1) so that log() stays finite.
    y_pred = ops.clip(y_pred, eps, 1 - eps)
    loss = - ops.mean(y_true * ops.log(y_pred) + (1 - y_true) * ops.log(1 - y_pred))
    return loss

In [86]:
# Assemble the siamese classifier: both inputs go through the shared branch
# built by forward(), which ends in a single sigmoid unit.
model = Model(inputs=[seq_input1, seq_input2],
              outputs=[forward(seq_input1, seq_input2)])

adam = Adam(learning_rate=1e-4, amsgrad=True, epsilon=1e-6)

# save_best_only=True is what makes monitor/mode take effect: without it the
# single checkpoint file is overwritten after every epoch, whatever val_mcc is.
checkpoint_callback = keras.callbacks.ModelCheckpoint('run/siamese/model.h5', monitor='val_mcc', mode='max',
                                                      save_best_only=True)
earlystop_callback = keras.callbacks.EarlyStopping(monitor="val_loss", patience=5)

model.compile(optimizer=adam, loss=binary_crossentropy, metrics=[accuracy, f1, mcc])

# validation_data is required for the val_loss / val_mcc quantities the two
# callbacks monitor; without it early stopping never triggers.
# NOTE(review): the held-out loader doubles as validation set here, so it no
# longer gives an unbiased test estimate — consider a separate validation split.
# batch_size is not passed: batching is done by the generator (built with
# batch_size=32), and Keras rejects or ignores batch_size for such inputs.
model.fit(train_ds_loader, validation_data=test_ds_loader, epochs=50,
          callbacks=[checkpoint_callback, earlystop_callback], verbose=1)


Epoch 1/50


ValueError: in user code:

    File "/home/sortion/.local/share/miniforge3/envs/intelligent-antibodies/lib/python3.11/site-packages/keras/src/engine/training.py", line 1401, in train_function  *
        return step_function(self, iterator)
    File "/home/sortion/.local/share/miniforge3/envs/intelligent-antibodies/lib/python3.11/site-packages/keras/src/engine/training.py", line 1384, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/sortion/.local/share/miniforge3/envs/intelligent-antibodies/lib/python3.11/site-packages/keras/src/engine/training.py", line 1373, in run_step  **
        outputs = model.train_step(data)
    File "/home/sortion/.local/share/miniforge3/envs/intelligent-antibodies/lib/python3.11/site-packages/keras/src/engine/training.py", line 1150, in train_step
        y_pred = self(x, training=True)
    File "/home/sortion/.local/share/miniforge3/envs/intelligent-antibodies/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/home/sortion/.local/share/miniforge3/envs/intelligent-antibodies/lib/python3.11/site-packages/keras/src/engine/input_spec.py", line 253, in assert_input_compatibility
        raise ValueError(

    ValueError: Exception encountered when calling layer 'model_10' (type Functional).
    
    Input 0 of layer "conv1d_20" is incompatible with the layer: expected min_ndim=3, found ndim=2. Full shape received: (None, None)
    
    Call arguments received by layer 'model_10' (type Functional):
      • inputs=('tf.Tensor(shape=(None, None), dtype=float32)', 'tf.Tensor(shape=(None, None), dtype=float32)')
      • training=True
      • mask=None


In [None]:
# Debug: display the first training batch to inspect its structure and shapes
# after the input-shape error raised by model.fit.
train_ds_loader[0]