# Intelligent Antibodies

> **Goal** Generate antibodies protein sequence.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from typing import Tuple, Generator
import keras
from keras import layers
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import Tensor


from modules.models.VAE import VAE
from modules.models.VAEFull import VAEFull
from modules.models.SiameseInteractionClassifier import f1, binary_crossentropy, mcc, forward, accuracy
from modules.encoding import ProteinOneHotEncoder

import pandas as pd
import numpy as np

2025-09-09 22:48:28.065167: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],[tf.config.experimental.VirtualDeviceConfiguration(memory_limit=5120)])
  except RuntimeError as e:
    print(e)

2025-09-09 22:48:30.454222: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-09-09 22:48:30.469778: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-09-09 22:48:30.471066: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero


In [4]:
vector_size = 200
alphabet_size = 18
input_dimensions = (vector_size, alphabet_size)

vae_full = VAEFull(200, 18)
vae_full.vae.reload(f'run/vae-one-hot-{vector_size}.keras')

2025-09-09 22:48:30.505965: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-09-09 22:48:30.506866: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-09-09 22:48:30.508145: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-09-09 22:48:30.509298: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there m

Reloaded.


In [5]:
seq_input1 = layers.Input(shape=input_dimensions, name='seq_ag')
seq_input2 = layers.Input(shape=input_dimensions, name='seq_ab')

In [6]:
siamese = keras.models.load_model(f'run/models/siamese/one-hot-{vector_size}-model.h5', custom_objects=dict(f1=f1, mcc=mcc, binary_crossentropy=binary_crossentropy, forward=forward, accuracy=accuracy))

In [7]:
# Get target antigen
antigen_seq_id = "6xe1"

df_seq = pd.read_csv("../data/SAbDab/sequences.csv", sep=";")
antigen = df_seq[df_seq["seq_id"] == f"{antigen_seq_id}|ag"]

In [8]:
antigen_sequence = antigen["sequence"]

In [9]:
"".join(antigen_sequence)

'RVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCS'

In [10]:
encoder = ProteinOneHotEncoder()

In [11]:
def generate_antibody_sequence(n: int, vector_size: int) -> Generator[Tuple[str, Tensor], None, None]:
    """
    Generate novel protein sequences by sampling from the VAE's latent space.

    This function leverages the trained VAE decoder to reconstruct new protein
    sequences by sampling points from a standard normal distribution in the
    latent space. It then decodes these numerical representations back into
    amino acid sequences and re-encodes them.

    Parameters
    ----------
    n : int
        The number of sequences to generate.
    vector_size : int
        The size of the vector representation for each sequence. This must match
        the input size of the VAE decoder.

    Yields
    ------
    Tuple[str, Tensor]
        A generator that yields a tuple for each generated sequence, containing:
        - The generated protein sequence as a string.
        - The one-hot encoded representation of the generated sequence.

    Notes
    -----
    The VAE decoder is assumed to have a latent space dimension of 2. The
    `x_reconst` output is reshaped to (200, 18), implying that the generated
    sequences have a length of 200 and an alphabet size of 18.
    """
    
    z: tf.Tensor = tf.random.normal(shape=[n, 2])
    x_reconst: np.array = vae_full.decoder.predict(z, verbose=0)
    latent_dim: int = z.shape[1]
    for x in x_reconst:
        x_sample: np.array = x.reshape((200, 18))
        protein_sequence:str = "".join(list(encoder.decode(x_sample)))
        protein_onehot: tf.Tensor = encoder.encode([protein_sequence], vector_size)
        yield protein_sequence, protein_onehot


In [12]:
def test_interaction(onehot_antibody: tf.Tensor, onehot_antigen: tf.Tensor, threshold: float =0.8) -> tf.Tensor:
    """
    Test if there is an interaction between antibody and antigene
    both being one hot encoded.

    Parameters
    ----------
    onehot_antibody : tf.Tensor
        One hot encoded antibody to be tested.
    onehot_antigen : tf.Tensor
        One hot encoded antigene to be tested.
    threshold : float, optional
        Threshold beyond which interaction between antibody and antigene is considered as real, by default 0.8

    Returns
    -------
    tf.Tensor
        Return 0.0 if no interaction, 1.0 otherwise.
    """        
    score = siamese.predict([onehot_antibody, onehot_antigen])
    label = tf.cast(score > threshold, tf.int32)
    return label[0][0]

In [48]:
def generate_interacting_antibody(antigen, limit: int=20): # antigene has to be pd.Series or List[str] -> Change it to accept direct str
    # print(f"antigene: {type(antigen[0])}")
    print(str(antigen))
    if isinstance(antigen, str):
        print('youhou')
        antigen = "".join(antigen)
        print(antigen)
    onehot_antigen = encoder.encode(antigen, vector_size)
    for _ in range(limit):
        for sequence_antibody, onehot_antibody in generate_antibody_sequence(10, vector_size):
            if test_interaction(onehot_antibody, onehot_antigen):
                yield sequence_antibody


In [None]:
with open(f"run/antibody-sequence-{antigen_seq_id}.fasta", "w") as f:
    for i, sequence in enumerate(generate_interacting_antibody(antigen_sequence)): # Also works with list of str seq
        print(sequence)
        f.write(f">{antigen_seq_id}_{i}\n{sequence}")
        
        
print(f"We generated {i} new antibody sequences")

RVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCS
youhou
RVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCS


ValueError: Data cardinality is ambiguous:
  x sizes: 1, 273
Make sure all arrays contain the same number of samples.

In [15]:
G = generate_interacting_antibody('RVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCS', limit = 150)