# <center>Text Generation with LSTM-based Sequence-to-Sequence Model</center>

<p> This notebook demonstrates the implementation of a sequence-to-sequence model using LSTM layers for text generation. The model is trained on a dataset of input and target text pairs and is then used to predict summaries for input sentences. The notebook provides a step-by-step explanation of the encoding and decoding process for generating text predictions.</p>

- Import required libraries

In [58]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


- Import the dataset

In [59]:

data = [
    ("A cat sat on the mat.", "A cat rested on the mat."),
    ("The dog was barked loudly.", "The dog was loud."),
    ("Cats are mammals.", "Cats are animals."),
    ("Roses are flowers.", "Rose is a flower."),
    ("Children are playing.", "Kids are playing."),
    ("Schools are open for kids.", "Schools are open for children."),
    # Add more examples here
]


- Tokenise the input and target texts

In [60]:
input_texts = [pair[0] for pair in data]
target_texts = ['<start> ' + pair[1] + ' <end>' for pair in data]  # Add start and end tokens to target sequences
print(target_texts)
tokenizer = Tokenizer(filters='')
tokenizer.fit_on_texts(input_texts + target_texts)

input_sequences = tokenizer.texts_to_sequences(input_texts)
target_sequences = tokenizer.texts_to_sequences(target_texts)

max_encoder_seq_length = max(len(seq) for seq in input_sequences)
max_decoder_seq_length = max(len(seq) for seq in target_sequences)


['<start> A cat rested on the mat. <end>', '<start> The dog was loud. <end>', '<start> Cats are animals. <end>', '<start> Rose is a flower. <end>', '<start> Kids are playing. <end>', '<start> Schools are open for children. <end>']


- Pad the input and target sequences

In [61]:

# Pad sequences to make them the same length
encoder_input_data = pad_sequences(input_sequences, maxlen=max_encoder_seq_length, padding='post')
decoder_input_data = pad_sequences(target_sequences, maxlen=max_decoder_seq_length, padding='post')
decoder_target_data = np.zeros_like(decoder_input_data)
decoder_target_data[:, :-1] = decoder_input_data[:, 1:]

# Define the model architecture
embedding_dim = 256
vocab_size = len(tokenizer.word_index) + 1  # Add 1 for the padding token
hidden_units = 512


- Initialise the encoder and decoder layers

In [62]:

# Encoder
encoder_inputs = Input(shape=(max_encoder_seq_length,))
encoder_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(hidden_units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(max_decoder_seq_length,))
decoder_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(hidden_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)


- Define, compile and train the model

In [63]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit([encoder_input_data, decoder_input_data], decoder_target_data, batch_size=64, epochs=50, validation_split=0.2)


Epoch 1/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step - accuracy: 0.0625 - loss: 3.4667 - val_accuracy: 0.3750 - val_loss: 3.4220
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 177ms/step - accuracy: 0.4062 - loss: 3.4095 - val_accuracy: 0.3750 - val_loss: 3.3613
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 176ms/step - accuracy: 0.3750 - loss: 3.3356 - val_accuracy: 0.3750 - val_loss: 3.2498
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 175ms/step - accuracy: 0.3750 - loss: 3.2086 - val_accuracy: 0.3750 - val_loss: 3.0237
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 176ms/step - accuracy: 0.3750 - loss: 2.9613 - val_accuracy: 0.3750 - val_loss: 2.5985
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 235ms/step - accuracy: 0.3438 - loss: 2.5163 - val_accuracy: 0.3750 - val_loss: 2.2396
Epoch 7/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x21eeb932290>

- Define the inference models

In [64]:
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(hidden_units,))
decoder_state_input_c = Input(shape=(hidden_units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h, state_c = decoder_lstm(decoder_embedding, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_outputs

<KerasTensor shape=(None, 8, 32), dtype=float32, sparse=False, name=keras_tensor_210>

In [65]:
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states
)
reverse_input_word_index = {v: k for k, v in tokenizer.word_index.items()}
reverse_target_word_index = reverse_input_word_index

- Decoding the input sequence

In [66]:
def decode_sequence(input_seq):
    # Encode the input as state vectors
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1
    target_seq = np.zeros((1, 1))

    # Populate the first character of target sequence with the start token
    target_seq[0, 0] = tokenizer.word_index['<start>']

    # Sampling loop for a batch of sequences
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_word_index[sampled_token_index]
        print(sampled_char)
        # Exit condition: either hit max length or find stop token
        if sampled_char == '<end>' or len(decoded_sentence.split()) > max_decoder_seq_length:
            stop_condition = True
        else:
            decoded_sentence += sampled_char + ' '

        # Update the target sequence (of length 1)
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence


- Step by step encoding and decoding of input sequence

In [67]:
for i in range(len(input_texts)):
    input_seq = encoder_input_data[i:i+1]
    print(input_texts,input_seq)
    decoded_sentence = decode_sequence(input_seq)
    print('Input sentence:', input_texts[i])
    print('Predicted summary:', decoded_sentence)

['A cat sat on the mat.', 'The dog was barked loudly.', 'Cats are mammals.', 'Roses are flowers.', 'Children are playing.', 'Schools are open for kids.'] [[ 5  6 16  7  4  8]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 243ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 202ms/step
a
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
cat
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
rested
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
on
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
the
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
mat.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
<end>
Input sentence: A cat sat on the mat.
Predicted summary: a cat rested on the mat. 
['A cat sat on the mat.', 'The dog was barked loudly.', 'Cats are mammals.', 'Roses are flowers.', 'Children are playing.', 'Schools