In [53]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Attention, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [54]:
# Sample dataset (can be replaced with a larger corpus)
text_corpus = [
    "deep learning is amazing",
    "lstm networks are powerful",
    "attention improves sequence models",
    "transformers outperform lstm",
    "machine learning is fun",
]

# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_corpus)
tokenizer

<keras.src.legacy.preprocessing.text.Tokenizer at 0x7bd41623c050>

In [55]:
tokenizer.word_index

{'learning': 1,
 'is': 2,
 'lstm': 3,
 'deep': 4,
 'amazing': 5,
 'networks': 6,
 'are': 7,
 'powerful': 8,
 'attention': 9,
 'improves': 10,
 'sequence': 11,
 'models': 12,
 'transformers': 13,
 'outperform': 14,
 'machine': 15,
 'fun': 16}

In [56]:
# Convert text to sequences
sequences = tokenizer.texts_to_sequences(text_corpus)
sequences

[[4, 1, 2, 5], [3, 6, 7, 8], [9, 10, 11, 12], [13, 14, 3], [15, 1, 2, 16]]

In [57]:
vocab_size = len(tokenizer.word_index) + 1  # Adding 1 for padding token
vocab_size

17

In [58]:
# Maximum sequence length
max_sequence_length = max(len(seq) for seq in sequences)
max_sequence_length

4

In [59]:
# Padding sequences
sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding="post")
sequences

array([[ 4,  1,  2,  5],
       [ 3,  6,  7,  8],
       [ 9, 10, 11, 12],
       [13, 14,  3,  0],
       [15,  1,  2, 16]], dtype=int32)

In [60]:
# Split input and target
X_train = sequences[:, :-1]  # All words except last
y_train = sequences[:, -1]   # Last word (target)
X_train, y_train

(array([[ 4,  1,  2],
        [ 3,  6,  7],
        [ 9, 10, 11],
        [13, 14,  3],
        [15,  1,  2]], dtype=int32),
 array([ 5,  8, 12,  0, 16], dtype=int32))

In [61]:
# One-hot encode the target output
y_train = tf.keras.utils.to_categorical(y_train, num_classes=vocab_size)

# Reshape target to match decoder output shape
y_train = np.expand_dims(y_train, axis=1)  # Shape: (batch_size, 1, vocab_size)

y_train

array([[[0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0.]],

       [[0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
         0.]],

       [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
         0.]],

       [[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0.]],

       [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         1.]]])

In [67]:
latent_dim = 128  # LSTM units

# Encoder
encoder_inputs = Input(shape=(max_sequence_length-1,))
encoder_embedding = Embedding(vocab_size, latent_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)

# Decoder
decoder_inputs = Input(shape=(1,))  # Single word input at each timestep
decoder_embedding = Embedding(vocab_size, latent_dim)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])

# Attention Mechanism
attention_layer = Attention()
attention_output = attention_layer([decoder_outputs, encoder_outputs])

# Concatenate LSTM output with attention context
decoder_combined = Concatenate()([decoder_outputs, attention_output])

# Dense layer for final prediction
decoder_dense = Dense(vocab_size, activation="softmax")
decoder_outputs = decoder_dense(decoder_combined)

# Define model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# Model Summary
model.summary()


In [68]:
# Training decoder input (start token for each sequence)
decoder_input_data = np.zeros((len(X_train), 1))  # (batch_size, 1)
decoder_input_data

array([[0.],
       [0.],
       [0.],
       [0.],
       [0.]])

In [69]:
# Train the model
model.fit([X_train, decoder_input_data], y_train, batch_size=32, epochs=100)

Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step - accuracy: 0.0000e+00 - loss: 2.8333
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step - accuracy: 0.4000 - loss: 2.8154
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step - accuracy: 0.6000 - loss: 2.7974
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step - accuracy: 0.8000 - loss: 2.7788
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step - accuracy: 0.8000 - loss: 2.7593
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 141ms/step - accuracy: 1.0000 - loss: 2.7387
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 152ms/step - accuracy: 1.0000 - loss: 2.7166
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step - accuracy: 1.0000 - loss: 2.6927
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[

<keras.src.callbacks.history.History at 0x7bd41a4d7410>

In [71]:
# Encoder Inference Model
encoder_model = Model(encoder_inputs, [encoder_outputs, state_h, state_c])

# Decoder Inference Model
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Create a new Embedding layer for the decoder inference model
decoder_embedding_layer = Embedding(vocab_size, latent_dim)
decoder_embedding_input = decoder_embedding_layer(decoder_inputs)  # Use the new layer

decoder_outputs, state_h, state_c = decoder_lstm(decoder_embedding_input, initial_state=decoder_states_inputs)

attention_output = attention_layer([decoder_outputs, encoder_outputs])
decoder_combined = Concatenate()([decoder_outputs, attention_output])
decoder_outputs = decoder_dense(decoder_combined)

decoder_model = Model([decoder_inputs, encoder_outputs] + decoder_states_inputs, [decoder_outputs] + [state_h, state_c])

In [74]:
def generate_text(seed_text, max_length=30):
    sequence = tokenizer.texts_to_sequences([seed_text])
    sequence = pad_sequences(sequence, maxlen=max_sequence_length-1, padding="post")

    encoder_out, state_h, state_c = encoder_model.predict(sequence)
    input_word = np.zeros((1, 1))  # Start token input

    generated_text = seed_text

    for _ in range(max_length):
        output_tokens, state_h, state_c = decoder_model.predict([input_word, encoder_out, state_h, state_c])
        sampled_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = tokenizer.index_word.get(sampled_index, "")

        if sampled_word == "":
            break

        generated_text += " " + sampled_word
        input_word[0, 0] = sampled_index

    return generated_text


In [75]:
print(generate_text("deep learning"))  # Example output: "deep learning is amazing"
print(generate_text("machine"))  # Example output: "machine learning is fun"
print(generate_text("attention"))  # Example output: "attention improves sequence models"

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44

In [76]:
print(generate_text("i love u"))  # Example output: "attention improves sequence models"

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41