In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Lambda, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import backend as K


In [2]:
# Hyperparameters shared by the model-building and preprocessing cells.
max_seq_length = 100
"""Number of token ids per (padded) input sequence."""
vocab_size = 10_000
"""Vocabulary cap used by both the Embedding layer and the Tokenizer."""
embedding_dim = 64
"""Width of each token embedding vector."""
latent_dim = 32
"""Unit count for the LSTM and for the z_mean / z_log_sigma Dense heads."""


In [5]:
# Encoder: token ids -> embeddings -> bidirectional LSTM summary.

# Symbolic input holding `max_seq_length` token ids per example.
input_text = Input(shape=(max_seq_length,))

# Map each of the `vocab_size` possible token ids to an `embedding_dim`-wide dense vector.
x = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(input_text)

# Bidirectional wrapper around an LSTM with `latent_dim` units. `return_sequences`
# is not passed, so the LSTM keeps its default of emitting only the final output
# rather than one output per timestep.
encoder_outputs = Bidirectional(layer=LSTM(units=latent_dim))(x)

# VAE Sampling layer
def sampling(inputs):
    """Draw a latent vector using the reparameterization trick.

    Args:
        inputs: A pair ``(z_mean, z_log_sigma)`` of tensors with matching
            shapes. Despite its name, ``z_log_sigma`` is consumed as the
            log-*variance*: it is multiplied by 0.5 before exponentiation to
            obtain the standard deviation.

    Returns:
        ``z_mean + exp(0.5 * z_log_sigma) * epsilon``, where ``epsilon`` is
        standard-normal noise with the same shape and dtype as ``z_mean``.
    """
    z_mean, z_log_sigma = inputs
    # Take the noise shape and dtype straight from z_mean instead of rebuilding a
    # (batch, dim) pair: this works for any input rank and avoids a dtype mismatch
    # when the layer runs in something other than the default float type.
    epsilon = tf.random.normal(shape=tf.shape(z_mean), dtype=z_mean.dtype)
    # exp(0.5 * log_var) is the standard deviation; scale the noise and shift by the mean.
    return z_mean + tf.exp(0.5 * z_log_sigma) * epsilon

# Two parallel Dense heads (no activation specified) over the encoder summary,
# each `latent_dim` wide: one for the mean, one for the spread of the latent Gaussian.
# NOTE: `sampling` multiplies `z_log_sigma` by 0.5 before exponentiating, so despite
# its name this head is treated as the log-variance, not the log-standard-deviation.
z_mean = Dense(units=latent_dim)(encoder_outputs)
z_log_sigma = Dense(units=latent_dim)(encoder_outputs)

# Sample the latent variable `z` from the two heads via the `sampling` function.
z = Lambda(function=sampling)([z_mean, z_log_sigma])

# Encoder model: token ids in; [z_mean, z_log_sigma, sampled z] out.
encoder = Model(inputs=input_text, outputs=[z_mean, z_log_sigma, z])

# Print the layer-by-layer overview of the encoder.
encoder.summary()


In [6]:
# Toy corpus used to exercise the preprocessing steps.
texts = [
    "Hello world",
    "Variational Autoencoders are fun",
]

# Fit a tokenizer (capped at `vocab_size` words) on the corpus, then encode
# every text as a list of integer word indices.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Bring every sequence to exactly `max_seq_length` entries; the padding and
# truncation side are left at pad_sequences' defaults.
data = pad_sequences(sequences=sequences, maxlen=max_seq_length)


In [8]:
# Display the integer-encoded texts: one list of word indices per input text,
# as produced by texts_to_sequences (i.e. before padding).
sequences

[[1, 2], [3, 4, 5, 6]]