In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, RepeatVector, Bidirectional, Embedding, TimeDistributed, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import gensim.downloader as api

In [2]:
# Prepare the Dataset
print("Loading GloVe embeddings...")
glove_vectors = api.load("glove-wiki-gigaword-50")
embedding_dim = 50  # GloVe vector size
print("GloVe embeddings loaded.")

# Sample Dataset
sentences = [
    "This is an autoencoder example",
    "Autoencoders can learn text representations",
    "Neural networks can compress and reconstruct data",
    "Natural language processing is powerful",
    "Deep learning enables advanced NLP tasks"
]

# Tokenization and Padding
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
vocab_size = len(tokenizer.word_index) + 1
sequences = tokenizer.texts_to_sequences(sentences)
max_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')
print(f"Max sequence length: {max_length}")

# Create Embedding Matrix
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in glove_vectors:
        embedding_matrix[i] = glove_vectors[word]
    else:
        print(f"Warning: '{word}' not found in GloVe vectors.")

Loading GloVe embeddings...
GloVe embeddings loaded.
Max sequence length: 7


In [3]:
# Split data into training and testing sets
X_train, X_test = train_test_split(padded_sequences, test_size=0.2, random_state=42)

In [4]:
# Build the Autoencoder Model
latent_dim = 64  # Dimensionality of the latent space

input_sequence = Input(shape=(max_length,))
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], trainable=False)(input_sequence)
encoded = Bidirectional(LSTM(latent_dim))(embedding_layer)
decoded = RepeatVector(max_length)(encoded)
decoded = Bidirectional(LSTM(latent_dim, return_sequences=True))(decoded)
decoded = TimeDistributed(Dense(vocab_size, activation='softmax'))(decoded)

autoencoder = Model(input_sequence, decoded)
autoencoder.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [5]:
# Train the Autoencoder Model
target_data = np.expand_dims(X_train, axis=-1)  # Required for sparse categorical cross-entropy
autoencoder.fit(X_train, target_data, epochs=100, batch_size=4, verbose=1)

Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - accuracy: 0.0000e+00 - loss: 3.3209
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.0714 - loss: 3.2578
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step - accuracy: 0.1429 - loss: 3.1973
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - accuracy: 0.2143 - loss: 3.1377
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - accuracy: 0.2143 - loss: 3.0776
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - accuracy: 0.2143 - loss: 3.0162
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step - accuracy: 0.2143 - loss: 2.9527
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - accuracy: 0.2143 - loss: 2.8870
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

<keras.src.callbacks.history.History at 0x7e7181aa9750>

In [6]:
# Evaluate the Autoencoder Model
target_data_test = np.expand_dims(X_test, axis=-1)  # Test target data reshaped
loss, accuracy = autoencoder.evaluate(X_test, target_data_test, verbose=0)
print(f"Evaluation Loss: {loss}, Evaluation Accuracy: {accuracy}")

Evaluation Loss: 5.436692714691162, Evaluation Accuracy: 0.2857142984867096


In [7]:
# Encode and Decode a test sentence
def reconstruct_sentence(sentence):
    sequence = tokenizer.texts_to_sequences([sentence])
    sequence = pad_sequences(sequence, maxlen=max_length, padding='post')
    predicted_sequence = autoencoder.predict(sequence)
    predicted_tokens = np.argmax(predicted_sequence, axis=-1)[0]
    reconstructed_words = [tokenizer.index_word.get(token, '[UNK]') for token in predicted_tokens if token > 0]
    reconstructed = ' '.join(reconstructed_words)
    return reconstructed

In [8]:
# Test the Autoencoder with a sentence
input_text = "Deep learning powers complex NLP tasks"
output_text = reconstruct_sentence(input_text)
print("Original:", input_text)
print("Reconstructed:", output_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 443ms/step
Original: Deep learning powers complex NLP tasks
Reconstructed: deep learning enables advanced nlp tasks
