In [17]:
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Dense, Embedding, Flatten, Reshape
from tensorflow.keras.models import Model
import numpy as np

# Parameters
vocab_size = 10000  # Size of the vocabulary
max_length = 150    # Maximum length of input sentences
latent_dim = 64     # Size of the latent space
embedding_dim = 100  # Embedding dimension
epochs = 40        # Number of training epochs

In [2]:
# Load IMDB dataset
(x_train, _), (x_test, _) = imdb.load_data(num_words=vocab_size)
x_train = pad_sequences(x_train,maxlen=max_length,padding='post')  # Pad the sequences to the right to have a tensor of shape (batch_size, max_length)
x_test = pad_sequences(x_test,maxlen=max_length,padding='post')   # Pad the sequences to the right to have a tensor of shape (batch_size, max_length)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [3]:
# Load the IMDB word index
word_index = imdb.get_word_index()
# Reverse the word index to map integer indices to words
reverse_word_index = {value:key for key,value in word_index.items()}
# Add padding, start, and unknown tokens
reverse_word_index[0] = '<PAD>'
reverse_word_index[1] = '<START>'
reverse_word_index[2] = '<UNK>'


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [7]:
# Encoder
inputs = Input(shape=(max_length,))
embedded = Embedding(vocab_size,embedding_dim)(inputs)  # Embedding layer
flattened = Flatten()(embedded)  # Flatten the output of the embedding layer
encoded = Dense(latent_dim, activation='relu')(flattened)  # Dense layer with latent_dim
encoder_model = Model(inputs, encoded)  # Encoder model


In [14]:

# Decoder
latent_inputs = Input(shape=(latent_dim,))
reconstructed = Dense(max_length*embedding_dim,activation='relu') (latent_inputs) # Dense layer with max_length*embedding_dim
reshaped = Reshape((max_length,embedding_dim))(reconstructed)  # Reshape the output of the Dense layer to (max_length, embedding_dim)
decoded = Dense(vocab_size,activation='softmax')(reshaped)  # Dense layer with vocab_size as last layer
decoder_model = Model(latent_inputs, decoded)  # Decoder model

# Autoencoder
autoencoder = Model(inputs,decoder_model(encoder_model(inputs))) # Autoencoder model


In [18]:

# Compile the model
autoencoder.compile(optimizer='adam', loss='sparse_categorical_crossentropy')   #  actual loss for the autoencoder model

# Train the model
autoencoder.fit(x_train,x_train,epochs=epochs,batch_size=32,validation_data=(x_test,x_test))

Epoch 1/40
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 31ms/step - loss: 4.7616 - val_loss: 5.2356
Epoch 2/40
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 23ms/step - loss: 4.7119 - val_loss: 5.2929
Epoch 3/40
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 23ms/step - loss: 4.6967 - val_loss: 5.3152
Epoch 4/40
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 23ms/step - loss: 4.6797 - val_loss: 5.3390
Epoch 5/40
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 23ms/step - loss: 4.6656 - val_loss: 5.3499
Epoch 6/40
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 23ms/step - loss: 4.6589 - val_loss: 5.3790
Epoch 7/40
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 23ms/step - loss: 4.6518 - val_loss: 5.3917
Epoch 8/40
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 23ms/step - loss: 4.6396 - val_loss: 5.4073
Epoch 9/40
[1m782/782[

<keras.src.callbacks.history.History at 0x7c75225fe510>

In [20]:
# Usage example (after training)
decoded_texts = autoencoder.predict(x_test[:10])  # AutoEncode the first 10 texts
print(decoded_texts)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
[[[4.65756131e-07 9.99999523e-01 5.48204981e-10 ... 0.00000000e+00
   0.00000000e+00 1.20169715e-29]
  [7.08906214e-07 2.12997367e-43 2.10322614e-04 ... 0.00000000e+00
   0.00000000e+00 1.97733624e-34]
  [2.12311375e-06 1.10919104e-19 4.80970815e-02 ... 2.93659478e-30
   1.53903027e-13 1.33900154e-27]
  ...
  [9.99999642e-01 0.00000000e+00 9.03732547e-13 ... 0.00000000e+00
   0.00000000e+00 0.00000000e+00]
  [1.00000000e+00 0.00000000e+00 1.15695277e-16 ... 0.00000000e+00
   0.00000000e+00 0.00000000e+00]
  [1.00000000e+00 0.00000000e+00 3.73937142e-20 ... 0.00000000e+00
   0.00000000e+00 0.00000000e+00]]

 [[2.31214173e-10 7.57055823e-05 8.20362568e-02 ... 1.26301579e-22
   3.64609124e-12 2.89796063e-08]
  [3.10789368e-08 1.30732068e-13 1.37516335e-01 ... 1.76072226e-10
   1.28211929e-11 2.34959496e-10]
  [2.91504221e-09 2.37055036e-20 1.08677752e-01 ... 6.77242132e-19
   3.84878879e-31 9.57885324e-18]
  ...
  [1.

In [21]:
def decode_sequence(sequence):
    """Decode a sequence of integers back to words."""
    return ' '.join([reverse_word_index.get(i - 3, '?') for i in sequence])

# Assume `decoded_texts` is the output from the decoder and get the argmax
decoded_sequences = np.argmax(decoded_texts,axis=-1)

# Convert each sequence in the decoded_sequences back to text
decoded_texts = [decode_sequence(seq) for seq in decoded_sequences]

# Example: print the first decoded text
print(decoded_texts[0])


? this is this movie a movie ? it ? ? in <START> ? of <START> film ? ? ? <START> ? in ? ? ? br ? is some ? but is ? is ? ? this ? in this movie ? br manages to ? as is supposed to be ? <UNK> his ? is is ? that ? this behaves ? ? <UNK> a strongest ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?
