In [1]:
# Colab-only: mount Google Drive so that files stored there are reachable
# under /content/drive for the rest of the notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install Bio --quiet
!pip install keras==3.0.0 --upgrade --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.6/278.6 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m997.1/997.1 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.15.0 requires keras<2.16,>=2.15.0, but you have keras 3.0.0 which is incompatible.[0m[31m
[0m

In [6]:
from Bio import SeqIO

import os
os.environ["KERAS_BACKEND"] = "torch"

import torch
from torch.utils.data import Dataset

import keras
from keras import backend as K

import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [7]:
# Sanity check: the imported Keras should be the 3.0.0 release pinned in the install cell.
print(keras.__version__)

3.0.0


In [8]:
# Sanity check: Keras should report the backend selected through KERAS_BACKEND ("torch").
print(K.backend())

torch


# Data processing

## Pre-processing

In [72]:
# Global variables
folder_path = 'drive/MyDrive/ae_training'

# Residue alphabet. Position in this string is the integer code of a symbol:
# the blank at index 0 is the padding symbol, '-' (last) marks a missing residue.
amino_acids_str = ' ACDEFGHIKLMNPQRSTVWY-'
amino_acids = list(amino_acids_str)

# One-hot encoder whose column order is pinned to the alphabet above.
onehot_encoder = OneHotEncoder(categories=[amino_acids])
onehot_encoder.fit(np.array(amino_acids).reshape(-1, 1))

# Hyperparameters
max_len = 221  # length every encoded sequence is padded to

In [89]:
# Utility functions

def parse_fasta(file_path) -> list:
    """Read a FASTA file and return the sequence of every record.

    Args:
        file_path: path of the FASTA file to read.

    Returns:
        A list holding the `.seq` attribute of each record, in file order.
    """
    with open(file_path, 'r') as handle:
        return [entry.seq for entry in SeqIO.parse(handle, "fasta")]


def integer_encode(sequence, max_length) -> torch.Tensor:
    """Encode a protein sequence as a column of alphabet indices.

    'X' is treated like the gap symbol '-' (both mean "missing"). Every
    residue is replaced by its position in the module-level `amino_acids`
    list and the result is right-padded with 0 (the index of the blank
    padding symbol) up to `max_length`. A sequence longer than `max_length`
    is NOT truncated; it is returned at its own length.

    Args:
        sequence: the residues to encode, as a `str` or one of the sequence
            objects returned by `parse_fasta`.
        max_length: length the encoding is padded to.

    Returns:
        An int64 tensor of shape (max(len(sequence), max_length), 1).

    Raises:
        ValueError: if the sequence contains a character that is not part
            of the alphabet.
    """
    sequence = sequence.replace('X', '-')  # X also means missing

    # Build the symbol -> index table once instead of scanning the list
    # for every residue.
    index_of = {aa: idx for idx, aa in enumerate(amino_acids)}

    encoding = []
    for position, aa in enumerate(sequence):
        if aa not in index_of:
            raise ValueError(
                f"Unsupported residue {aa!r} at position {position}; "
                f"expected one of {''.join(amino_acids)!r} or 'X'"
            )
        encoding.append(index_of[aa])

    # Pad the sequence to the specified maximum length (a non-positive
    # repeat count yields an empty list, so longer inputs are left as is).
    encoding += [0] * (max_length - len(encoding))

    # Explicit dtype keeps the result integer-typed even when it is empty.
    return torch.tensor(encoding, dtype=torch.long).reshape(-1, 1)


def integer_decode(int_seq) -> str:
    """Turn an integer-encoded sequence back into its string of symbols.

    Args:
        int_seq: tensor of alphabet indices; it is flattened before decoding.

    Returns:
        The string obtained by looking every index up in `amino_acids`
        (padding indices decode to blanks).
    """
    indices = int_seq.flatten().tolist()
    return ''.join(map(amino_acids.__getitem__, indices))


def onehot_encode(sequence, max_length) -> torch.tensor:
    """One-hot encode a protein sequence, one row per position.

    'X' is mapped to the gap symbol '-' and the sequence is right-padded with
    blanks up to `max_length` before being passed to the module-level
    `onehot_encoder`. A sequence longer than `max_length` is not truncated.

    Args:
        sequence: the residues to encode.
        max_length: length the sequence is padded to.

    Returns:
        A tensor with one row per (padded) position and one column per
        alphabet symbol.
    """
    cleaned = sequence.replace('X', '-')  # X also means missing
    # A negative pad width produces an empty string, i.e. no padding.
    padded = cleaned + ' ' * (max_length - len(cleaned))
    column = np.array(list(padded)).reshape(-1, 1)
    # The encoder output is sparse; densify it before building the tensor.
    dense = onehot_encoder.transform(column).toarray()
    return torch.tensor(dense)


def onehot_decode(onehot_seq: torch.tensor) -> str:
    """Turn a one-hot encoded sequence back into its string of symbols.

    Args:
        onehot_seq: matrix with one one-hot row per position, as produced by
            `onehot_encode`.

    Returns:
        The decoded symbols concatenated in row order (padding rows decode to
        blanks).
    """
    symbol_rows = onehot_encoder.inverse_transform(onehot_seq)
    return ''.join(symbol for row in symbol_rows for symbol in row)

In [93]:
# Parse the FASTA file and one-hot encode every sequence, padded to max_len.
file_name = f'{folder_path}/card1_1273x130.fasta'

sequences = parse_fasta(file_name)
onehot_encoded_sequences = [onehot_encode(seq, max_len) for seq in sequences]
# Stack the per-sequence matrices into one (sequences, positions, symbols) tensor.
data = torch.stack(onehot_encoded_sequences)
data.shape

torch.Size([1273, 221, 22])

In [94]:
# One-hot matrix of the second sequence (rows are positions, columns are alphabet symbols).
data[1]

tensor([[0., 0., 0.,  ..., 0., 0., 1.],
        [0., 0., 0.,  ..., 0., 0., 1.],
        [0., 0., 0.,  ..., 0., 0., 1.],
        ...,
        [1., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64)

In [95]:
# Round-trip check: decoding gives back the residues followed by the blank padding.
onehot_decode(data[1])

'---------------KNDPWDVLKNSAM-K---VLKDFCDDLIEQDV-FNQNEIKNMGKQ----LSTVKDK--SEDLVKIVTHK-GSQ-IG---DIFVKRV--LM-------AAK--QLHS---------                                                                                           '

In [96]:
# Same sequences, integer-encoded instead of one-hot encoded.
# NOTE: this rebinds `data`, replacing the one-hot tensor built earlier.
sequences = parse_fasta(file_name)
int_encoded_sequences = [integer_encode(seq, max_len) for seq in sequences]
# Stack the per-sequence columns into one (sequences, positions, 1) tensor.
data = torch.stack(int_encoded_sequences)
data.shape

torch.Size([1273, 221, 1])

In [97]:
# Round-trip check for the integer encoding (index 0 decodes to the blank padding symbol).
integer_decode(data[1])

'---------------KNDPWDVLKNSAM-K---VLKDFCDDLIEQDV-FNQNEIKNMGKQ----LSTVKDK--SEDLVKIVTHK-GSQ-IG---DIFVKRV--LM-------AAK--QLHS---------                                                                                           '

## Build the dataloaders

TODO: Build the training dataset as a PyTorch `Dataset` and serve it through a `DataLoader`.

Ideally, the model (a `keras.Model`) should also be usable as a PyTorch `Module` when Keras runs on the PyTorch backend.

In [None]:
from torch.utils.data import Dataset

# Build the LSTM Variational Autoencoder (VAE)

TODO: Rewrite the code to build the model: **Protein sequence LSTM Variational AutoEncoder (VAE)**

Write it in Keras 3 as a subclass of `keras.Model` so that it can handle variable-length sequences (and missing characters).

Sources:
- VAE in Keras 3: https://keras.io/examples/generative/vae/
- LSTM Autoencoder: https://machinelearningmastery.com/lstm-autoencoders/
- Variable length: https://machinelearningmastery.com/handle-missing-timesteps-sequence-prediction-problems-python/


In [None]:
# an example lstm vae

from keras import backend as K
from keras.models import Sequential, Model
from keras.layers import Input, LSTM, RepeatVector, Masking
from keras.layers import Flatten, Dense, Dropout, Lambda
from keras.optimizers import SGD, RMSprop, Adam
from keras import losses


def create_lstm_vae(input_dim,
                    timesteps,
                    batch_size,
                    intermediate_dim,
                    latent_dim,
                    epsilon_std=1.):
    """
    Creates an LSTM Variational Autoencoder (VAE). Returns VAE, Encoder, Generator.

    The encoder is Masking -> LSTM -> two Dense heads (mean and log-variance
    of the latent distribution). A latent vector is sampled with the
    reparameterization trick, repeated `timesteps` times and decoded by two
    stacked LSTMs. The generator reuses the very same decoder layers, so it
    shares its weights with the VAE.

    # Arguments
        input_dim: int, number of features per timestep.
        timesteps: int, input timestep dimension.
        batch_size: int. No longer needed (the batch dimension is read from
            the tensors at call time, so batches of any size work); kept in
            the signature so existing calls keep working.
        intermediate_dim: int, output shape of LSTM.
        latent_dim: int, latent z-layer shape.
        epsilon_std: float, standard deviation of the noise drawn when
            sampling z.

    # Returns
        vae: end-to-end model (inputs -> reconstruction), compiled with
            RMSprop and a loss of MSE reconstruction error + KL divergence.
        encoder: model mapping inputs to the latent mean.
        generator: model mapping latent vectors to reconstructed inputs.
    """
    ops = keras.ops

    class LatentSampling(keras.layers.Layer):
        """Sample z from (mean, log-variance) and register the KL penalty.

        Defined locally so that the helper travels with this function.
        """

        def __init__(self, noise_std, **kwargs):
            super().__init__(**kwargs)
            self.noise_std = noise_std
            # Dedicated seed generator so every call draws fresh noise.
            self.seed_generator = keras.random.SeedGenerator()

        def call(self, inputs):
            z_mean, z_log_var = inputs

            # KL divergence between N(mean, exp(log_var)) and N(0, 1),
            # averaged over batch and latent dimensions. Registered as a
            # layer loss so it is added to the compiled reconstruction loss;
            # a plain loss function cannot reach these intermediate tensors.
            kl_loss = -0.5 * ops.mean(
                1 + z_log_var - ops.square(z_mean) - ops.exp(z_log_var)
            )
            self.add_loss(kl_loss)

            # Noise takes the runtime shape of z_mean rather than a fixed
            # batch size, so partial batches are handled.
            epsilon = keras.random.normal(
                shape=ops.shape(z_mean),
                mean=0.,
                stddev=self.noise_std,
                seed=self.seed_generator,
            )
            # Reparameterization trick: sigma = exp(0.5 * log_var), which is
            # the scale consistent with the KL term above.
            return z_mean + ops.exp(0.5 * z_log_var) * epsilon

    x = Input(shape=(timesteps, input_dim,))
    # Timesteps whose features are all 0.0 are skipped by the encoder LSTM.
    # NOTE(review): the notebook's `onehot_encode` pads with the ' ' category,
    # i.e. a row with a 1 in column 0, so padded steps encoded that way are
    # not all-zero and would not be masked -- confirm the input encoding.
    masked_inputs = Masking(mask_value=0.0)(x)

    # LSTM encoding
    h = LSTM(intermediate_dim)(masked_inputs)

    # VAE Z layer: mean and log-variance of the approximate posterior
    z_mean = Dense(latent_dim)(h)
    z_log_var = Dense(latent_dim)(h)
    z = LatentSampling(epsilon_std)([z_mean, z_log_var])

    # Decoder layers, instantiated once and shared by the VAE and the generator
    decoder_h = LSTM(intermediate_dim, return_sequences=True)
    decoder_mean = LSTM(input_dim, return_sequences=True)

    h_decoded = RepeatVector(timesteps)(z)
    h_decoded = decoder_h(h_decoded)

    # decoded layer
    x_decoded_mean = decoder_mean(h_decoded)

    # end-to-end autoencoder
    vae = Model(x, x_decoded_mean)

    # encoder, from inputs to latent space
    encoder = Model(x, z_mean)

    # generator, from latent space to reconstructed inputs
    decoder_input = Input(shape=(latent_dim,))

    _h_decoded = RepeatVector(timesteps)(decoder_input)
    _h_decoded = decoder_h(_h_decoded)

    _x_decoded_mean = decoder_mean(_h_decoded)
    generator = Model(decoder_input, _x_decoded_mean)

    # Reconstruction term only; the KL term is contributed by LatentSampling.
    vae.compile(optimizer='rmsprop', loss=losses.MeanSquaredError())

    return vae, encoder, generator

# Training