# Data processing

In [60]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [61]:
!pip install Bio
from Bio import SeqIO
import torch



In [62]:
def parse_fasta(file_path):
    """Read a FASTA file and return the sequence of every record, in file order.

    Args:
        file_path: Path of the FASTA file; it is opened in text mode.

    Returns:
        list: The ``seq`` attribute of each record yielded by ``SeqIO.parse``.
    """
    with open(file_path, 'r') as handle:
        return [entry.seq for entry in SeqIO.parse(handle, "fasta")]

# Alphabet used for integer encoding: a residue's code is its position in this
# string. The final '-' entry also stands in for 'X' (see integer_encode).
amino_acids = 'ACDEFGHIKLMNPQRSTVWY-'

def integer_encode(sequence, max_length, pad_value=0):
    """Encode a protein sequence as a fixed-length 1-D tensor of integer codes.

    Each residue is replaced by its index in ``amino_acids``; 'X' is treated
    as '-' before the lookup. The result is right-padded with ``pad_value``
    until it has exactly ``max_length`` entries.

    Args:
        sequence: The residues to encode. Anything whose ``str()`` is the
            residue string is accepted (a plain ``str`` or a Biopython ``Seq``).
        max_length: Length of the returned tensor.
        pad_value: Integer written into the padding positions. The default 0
            is kept for backward compatibility, but note that 0 is also the
            code of 'A'; pass a value outside ``range(len(amino_acids))`` if
            padding has to be distinguishable from real residues.

    Returns:
        torch.Tensor: ``torch.long`` tensor of shape ``(max_length,)``.

    Raises:
        ValueError: If the sequence is longer than ``max_length`` (it could not
            be stacked with the other encoded sequences), or if it contains a
            character that is not in ``amino_acids`` (after the 'X' -> '-'
            substitution).
    """
    # str() gives plain strings and Seq objects the same code path and avoids
    # depending on Seq.replace being available.
    residues = str(sequence).replace('X', '-')

    if len(residues) > max_length:
        raise ValueError(
            f"sequence length {len(residues)} exceeds max_length {max_length}"
        )

    # Dictionary lookup instead of a linear str.index scan per residue.
    code_of = {aa: idx for idx, aa in enumerate(amino_acids)}
    try:
        encoding = [code_of[aa] for aa in residues]
    except KeyError as err:
        raise ValueError(
            f"unsupported residue {err.args[0]!r}; expected one of {amino_acids!r} or 'X'"
        ) from None

    # Pad the sequence to the specified maximum length
    encoding += [pad_value] * (max_length - len(encoding))
    # An explicit dtype keeps the result integral even when the list is empty.
    return torch.tensor(encoding, dtype=torch.long)

In [63]:
# Location of the training FASTA file.
# NOTE(review): this path is relative, while Drive is mounted at /content/drive;
# it resolves only if the working directory is /content — confirm.
folder_path = 'drive/MyDrive/ae_training'
file_name = f'{folder_path}/card1_1273x130.fasta'
# Length every encoded sequence is padded to.
max_len = 440

# Parse the FASTA records, integer-encode each sequence, and stack the
# per-sequence tensors into a single tensor (one row per sequence).
sequences = parse_fasta(file_name)
encoded_sequences = [integer_encode(seq, max_len) for seq in sequences]
data = torch.stack(encoded_sequences)
data.shape  # last expression: shown as the cell output

torch.Size([1273, 440])

# Build the autoencoder

In [68]:
# !pip install keras-core
import os
# Select the PyTorch backend for Keras Core. This is set before the keras_core
# import below because the backend is picked up when the package is imported.
os.environ["KERAS_BACKEND"] = "torch"

import keras_core as keras
# import torch.nn as nn

In [89]:
from keras_core.layers import Input, Dense, Lambda, LSTM, Masking
from keras_core.models import Model
from keras_core import backend as K

class VAE:
    """Variational autoencoder with an LSTM encoder and a dense decoder.

    Building the object creates three Keras models that share their layers:

    * ``vae``     - input sequence -> sampled latent vector -> reconstruction
    * ``encoder`` - input sequence -> ``[mu, log_var]``
    * ``decoder`` - latent vector  -> reconstruction

    Args:
        input_dim: Number of time steps in an input sequence, and the width of
            the reconstruction. Inputs have shape ``(batch, input_dim, 1)``.
        h_dim: Units in the encoder LSTM and in the decoder's hidden layer.
        z_dim: Size of the latent vector.

    NOTE(review): no loss is attached to ``vae`` (the draft loss in
    ``build_model`` is commented out), so the model can be built and used for
    prediction but not trained as-is.
    """

    def __init__(self, input_dim=784, h_dim=400, z_dim=20):
        self.input_dim = input_dim
        self.h_dim = h_dim
        self.z_dim = z_dim
        self.build_model()

    def build_model(self):
        """Create the layers and assemble ``self.vae``, ``self.encoder`` and ``self.decoder``."""
        # Encoder
        inputs = Input(shape=(self.input_dim, 1))
        # Time steps whose value equals 0.0 are skipped by the LSTM.
        # NOTE(review): integer_encode in this notebook pads with 0 by default
        # and also encodes 'A' as 0, so alanine positions would be masked
        # together with the padding — confirm this is intended.
        masked_inputs = Masking(mask_value=0.0)(inputs)

        h = LSTM(self.h_dim, activation='relu')(masked_inputs)
        mu = Dense(self.z_dim)(h)
        log_var = Dense(self.z_dim)(h)

        # Reparameterization trick: z = mu + sigma * epsilon, epsilon ~ N(0, I).
        # Uses keras.ops / keras.random (`keras` is the keras_core alias imported
        # earlier in the notebook) instead of the Keras 2 backend helpers
        # int_shape / random_normal / exp.
        def sampling(args):
            mu, log_var = args
            epsilon = keras.random.normal(shape=keras.ops.shape(mu))
            return mu + keras.ops.exp(0.5 * log_var) * epsilon

        z = Lambda(sampling, output_shape=(self.z_dim,))([mu, log_var])

        # Decoder layers are created once and reused by both the end-to-end
        # model and the standalone decoder, so they share weights.
        decoder_h = Dense(self.h_dim, activation='relu')
        decoder_mean = Dense(self.input_dim, activation='sigmoid')
        h_decoded = decoder_h(z)
        x_decoded_mean = decoder_mean(h_decoded)

        # Define VAE model
        vae = Model(inputs, x_decoded_mean)

        # Draft VAE loss, kept for reference. It is written against the Keras 2
        # backend API and symbolic add_loss.
        # TODO: port to keras.ops and attach it before enabling training.
        # xent_loss = self.input_dim * K.binary_crossentropy(inputs, x_decoded_mean)
        # kl_loss = - 0.5 * K.sum(1 + log_var - K.square(mu) - K.exp(log_var), axis=-1)
        # vae_loss = K.mean(xent_loss + kl_loss)

        # vae.add_loss(vae_loss)

        self.encoder = Model(inputs, [mu, log_var])

        # The standalone decoder starts from a latent vector. Wiring it from
        # `inputs` would merely duplicate the full VAE and make decode(z)
        # reject latent-shaped input.
        latent_inputs = Input(shape=(self.z_dim,))
        self.decoder = Model(latent_inputs, decoder_mean(decoder_h(latent_inputs)))

        self.vae = vae

    def compile(self, optimizer='adam'):
        """Compile the end-to-end model with the given optimizer (no loss is passed)."""
        self.vae.compile(optimizer=optimizer)

    # Disabled until a loss is attached to self.vae.
    # def train(self, x_train, epochs=10, batch_size=100):
    #     self.vae.fit(x_train,
    #                  shuffle=True,
    #                  epochs=epochs,
    #                  batch_size=batch_size)

    def encode(self, x):
        """Return the encoder's prediction ``[mu, log_var]`` for input sequences ``x``."""
        return self.encoder.predict(x)

    def decode(self, z):
        """Return the decoder's reconstruction for latent vectors ``z`` of width ``z_dim``."""
        return self.decoder.predict(z)


# Example usage:
# vae = VAE()
# vae.compile()
# vae.train(x_train, epochs=10, batch_size=100)  # once train() is enabled
# mu, log_var = vae.encode(x_test)
# decoded = vae.decode(mu)

In [90]:
# Size the model to the encoded sequences (second dimension of `data`) instead
# of relying on the class default of 784, which does not match this dataset.
vae = VAE(input_dim=data.shape[1])
vae.compile()