In [1]:
import random
import numpy as np
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, RepeatVector, TimeDistributed, Embedding, Dropout, Bidirectional, GRU
from tensorflow.keras.optimizers import Adam
from gensim.models import Word2Vec
from tensorflow.keras.callbacks import EarlyStopping
from keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.utils import to_categorical

# Parameters for the model
embedding_dim = 100
latent_dim = 54

# Parameters
vocab_size = 1000
max_length = 100
epochs = 98
batch_size = 32
learning_rate = 0.01

# Load IMDB dataset
(x_train, _), (x_test, _) = imdb.load_data(num_words=vocab_size)
word_index = imdb.get_word_index()

# Create a reverse word index
reverse_word_index = {value + 3: key for key, value in word_index.items()}
reverse_word_index[0] = '<PAD>'
reverse_word_index[1] = '<START>'
reverse_word_index[2] = '<UNK>'
reverse_word_index[3] = '<UNUSED>'

# Convert sequences back to text
train_texts = [[reverse_word_index.get(i, '<UNK>') for i in sequence] for sequence in x_train]
test_texts = [[reverse_word_index.get(i, '<UNK>') for i in sequence] for sequence in x_test]
all_texts = train_texts + test_texts



Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1us/step


In [2]:
!pip uninstall numpy -y
!pip install numpy==1.26.4
!pip install --upgrade --force-reinstall gensim
!pip install --upgrade --force-reinstall tensorflow


Found existing installation: numpy 2.0.2
Uninstalling numpy-2.0.2:
  Successfully uninstalled numpy-2.0.2
Collecting numpy==1.26.4
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m81.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is incompatible.[0m[31m
[0mSuccessfully installed numpy-1.26.4
Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylin

Collecting tensorflow
  Downloading tensorflow-2.19.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Downloading absl_py-2.2.2-py3-none-any.whl.metadata (2.6 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-py2.py3-none-manylinux2010_x86_64.whl.metadata (5.2 kB)
Collecting opt-einsum>=2.3.2 (from tensorflow)
  Downloading opt_einsum-3.4.0-py3-none-any.whl.metadata (6.3 kB)
Collecting packaging (from te

In [4]:

# Train a Word2Vec model
word2vec_model = Word2Vec(sentences=all_texts, vector_size=embedding_dim, window=5, min_count=1, workers=4, epochs=20)

# Convert the word2vec model to a dictionary
# Create an embedding matrix where each row index corresponds to a word index
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in word_index.items():
    if i < vocab_size:
        try:
            embedding_vector = word2vec_model.wv[word]
            embedding_matrix[i] = embedding_vector
        except KeyError:
            # Words not found in the embedding index will be all zeros
            pass


In [5]:

def sequence_to_text(sequence):
    return ' '.join([reverse_word_index.get(i - 3, '?') for i in sequence])

def add_noise_to_text(sequence, noise_factor=0.1):
    noisy_sequence = []
    for i in sequence:
        if random.random() < noise_factor:
            # Randomly replace a word with another word
            noisy_sequence.append(random.randint(1, vocab_size - 1))
        else:
            noisy_sequence.append(i)
    return noisy_sequence

# Add noise to the data
noisy_x_train = [add_noise_to_text(seq) for seq in x_train]
noisy_x_test = [add_noise_to_text(seq) for seq in x_test]

# Convert to padded sequences
noisy_x_train_padded = pad_sequences(noisy_x_train, maxlen=max_length, padding='post')
noisy_x_test_padded = pad_sequences(noisy_x_test, maxlen=max_length, padding='post')

# Example of noisy text
print("Original:", sequence_to_text(x_train[0]))
print("Noisy:", sequence_to_text(noisy_x_train[0]))


Original: ? in with i like horrible business ? ? would killer ? which ? <START> going at fun <UNK> film make like lame character has novel ? a all final sense <UNK> real <START> find character nothing ? second perhaps they <START> find ? ? this city an br overall <START> horror has i <UNUSED> should ? was in with <START> ? ? despite <START> with their people is i like horrible an well it br ? <START> with this genre this is i taken that ? <UNK> she sex is and house and after <UNK> <START> ? ? i final which ? be <START> does is i an annoying <UNK> film where if at man it's film ? be <UNUSED> with is comedy you than some <UNK> in perfect i get ? and <START> think plot ? it fun <START> ? the ? <UNK> sequence at their like horrible wanted on getting night just the <START> ? ? br any other <START> couple it someone then he ? more on why <UNUSED> can't ? that <START> family with for still wanted on final <UNK> such his ? that if at you interesting how film any <START> family would i an ? oth

In [6]:
# Encoder
input_text = Input(shape=(max_length,))
encoder_embedding = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False)(input_text)
encoder_output = Bidirectional(LSTM(2*latent_dim, return_sequences=False))(encoder_embedding)
encoder_output = Dropout(0.15)(encoder_output)
encoder_output = Dense(latent_dim, activation='relu')(encoder_output)

# Decoder
decoder_input_seq = Input(shape=(max_length - 1,))
decoder_embedding = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False)(decoder_input_seq)
decoder_input = RepeatVector(max_length)(encoder_output)
decoder_lstm = LSTM(latent_dim, return_sequences=True)(decoder_input)
decoder_lstm = Dropout(0.15)(decoder_lstm)
decoder_lstm = LSTM(2*latent_dim, return_sequences=True)(decoder_lstm)
decoder_output = TimeDistributed(Dense(vocab_size, activation='softmax'))(decoder_lstm)


# Compile and train with MSE loss
autoencoder = Model([input_text, decoder_input_seq], decoder_output)
autoencoder.compile(optimizer=Adam(learning_rate=learning_rate, clipnorm=1.0), loss='sparse_categorical_crossentropy')

# # First, pad the sequences to max_length
x_train_padded = pad_sequences(x_train, maxlen=max_length, padding='post')
x_test_padded = pad_sequences(x_test, maxlen=max_length, padding='post')

# y_train and y_test are simply the padded original sequences
y_train = x_train_padded
y_test = x_test_padded

# Prepare decoder_input_data by shifting the sequences by one position
decoder_input_train = np.zeros((x_train_padded.shape[0], max_length - 1), dtype=int)
decoder_input_test = np.zeros((x_test_padded.shape[0], max_length - 1), dtype=int)

for i in range(len(x_train_padded)):
    decoder_input_train[i] = x_train_padded[i, 1:]  # Drop the first word

for i in range(len(x_test_padded)):
    decoder_input_test[i] = x_test_padded[i, 1:]  # Drop the first word


In [7]:
autoencoder.summary()

In [8]:
from tensorflow.keras.callbacks import LearningRateScheduler
import tensorflow.keras.backend as K

def custom_lr_scheduler(epoch, lr):
    # Decrease learning rate by 0.1 factor every 5 epochs
    if epoch % 5 == 0 and epoch != 0:
        lr = lr * 0.1
    return lr

# Define the callback
lr_scheduler = LearningRateScheduler(custom_lr_scheduler)
early_stopping = EarlyStopping(monitor='loss', patience=5, restore_best_weights=True)

# Train the model
autoencoder.fit([noisy_x_train_padded, decoder_input_train], np.expand_dims(y_train, -1),
          epochs=epochs,
          batch_size=batch_size,
          validation_data=([noisy_x_test_padded, decoder_input_test], np.expand_dims(y_test, -1)),
          callbacks=[early_stopping, lr_scheduler])



Epoch 1/98
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 45ms/step - loss: 4.8980 - val_loss: 4.8065 - learning_rate: 0.0100
Epoch 2/98
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 44ms/step - loss: 4.7909 - val_loss: 4.8234 - learning_rate: 0.0100
Epoch 3/98
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 43ms/step - loss: 4.7838 - val_loss: 4.8033 - learning_rate: 0.0100
Epoch 4/98
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 45ms/step - loss: 4.7877 - val_loss: 4.8023 - learning_rate: 0.0100
Epoch 5/98
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 43ms/step - loss: 4.7856 - val_loss: 4.8148 - learning_rate: 0.0100
Epoch 6/98
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 45ms/step - loss: 4.7717 - val_loss: 4.7938 - learning_rate: 1.0000e-03
Epoch 7/98
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 44ms/step - loss: 4.7594 - val_loss: 4.663

<keras.src.callbacks.history.History at 0x799b0de9f6d0>

In [9]:
def sample(preds, temperature=1.0):
    # Convert to array and prevent numerical issues with very small numbers
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds + 1e-7) / temperature  # Adjust by temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)  # Softmax
    probas = np.random.multinomial(1, preds, 1)  # Sample from the softmax distribution
    return np.argmax(probas)

In [10]:
import tensorflow as tf
# Prepare the decoder input for prediction
decoder_input_test = np.zeros((noisy_x_test_padded.shape[0], max_length - 1), dtype=int)
for i in range(len(x_test_padded)):
    decoder_input_test[i] = x_test_padded[i, 1:]  # Drop the first word

# Predicting denoised text
denoised_texts = autoencoder.predict([noisy_x_test_padded[:10], decoder_input_test[:10]])

# Function to convert sequences back to text
def decode_denoised_sequence(sequence):
    indices = tf.math.argmax(sequence, axis=1).numpy()
    return ' '.join([reverse_word_index.get(word, '?') for word in indices])

def decode_sequence_with_sampling(prob_distributions, temperature=1.0):
    return ' '.join([reverse_word_index.get(sample(probs, temperature), '?') for probs in prob_distributions])

# Decoding the denoised sequences
for i in range(10):
    print("Original:", sequence_to_text(x_test[i]))
    print("Noisy:", sequence_to_text(noisy_x_test_padded[i]))
    print("Generated:", decode_sequence_with_sampling(denoised_texts[i], temperature=1))
    print("\n")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
Original: ? murder both in have <UNUSED> easily of of ? ? <UNK> <START> boring the <START> again ? understand dead <START> over a ? ? ? of of br how where first lead ? make you ? in have movie not ? are role dark and where in true director and old just <UNK> not last i lot ? an he film ? based both in <UNUSED> easily
Noisy: ? murder both in have <UNUSED> easily of whether ? ? <UNK> <START> boring the <START> again that understand dead <START> more a ? ? ? of of br how where plus lead ? try that's ? in have movie not ? are role dark and where in true director and old just on not last i lot ? poor he ? ? based both in <UNUSED> easily ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?
Generated: <START> no this emotional still and please <UNK> ok there bad also <UNK> br which in a theater and in the of on head <UNK> br became overall and at <UNK> wouldn't do a be little is character the say while were <UNK>