# Sequence-to-Sequence Prediction of Text

In [2]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding

In [3]:
# Training corpus: the whole nursery rhyme as one string ending in a single period.
# Written as adjacent literals purely for readability; Python joins them into one string.
t_data = (
    'Jack and Jill went up the hill '
    'To fetch a pail of water '
    'Jack fell down and broke his crown, '
    'And Jill came tumbling after.'
)

In [4]:
# Echo the corpus so the exact training text is visible in the notebook output.
t_data

'Jack and Jill went up the hill To fetch a pail of water Jack fell down and broke his crown, And Jill came tumbling after.'

# Tokenization and Encoding

In [5]:
# Fit a Keras Tokenizer on the one-document corpus to build the word -> integer-id vocabulary.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts([t_data])
# Vocabulary size used for the embedding input and softmax output: the printed word_index
# starts at id 1 and id 0 is the value pad_sequences fills with, hence len(word_index) + 1.
total_words = len(tokenizer.word_index) + 1

In [6]:
# Keep a handle on the fitted word -> id mapping and print it for inspection.
word_ids = tokenizer.word_index
print(word_ids)

{'and': 1, 'jack': 2, 'jill': 3, 'went': 4, 'up': 5, 'the': 6, 'hill': 7, 'to': 8, 'fetch': 9, 'a': 10, 'pail': 11, 'of': 12, 'water': 13, 'fell': 14, 'down': 15, 'broke': 16, 'his': 17, 'crown': 18, 'came': 19, 'tumbling': 20, 'after': 21}


# Convert the text into sequences for training the model

In [7]:
# Build the training sequences: for every '.'-separated segment of the corpus, each prefix
# of the encoded segment with at least two tokens becomes one sequence (context + next word).
# Splitting on "." also yields an empty trailing segment, which encodes to [] and adds nothing.
input_seq = []
for sentence in t_data.split("."):
    encoded = tokenizer.texts_to_sequences([sentence])[0]
    print(encoded)
    print(len(encoded))
    # Prefix lengths run from 2 up to the full segment length.
    input_seq.extend(encoded[:end] for end in range(2, len(encoded) + 1))
print(input_seq)

[2, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 2, 14, 15, 1, 16, 17, 18, 1, 3, 19, 20, 21]
25
[]
0
[[2, 1], [2, 1, 3], [2, 1, 3, 4], [2, 1, 3, 4, 5], [2, 1, 3, 4, 5, 6], [2, 1, 3, 4, 5, 6, 7], [2, 1, 3, 4, 5, 6, 7, 8], [2, 1, 3, 4, 5, 6, 7, 8, 9], [2, 1, 3, 4, 5, 6, 7, 8, 9, 10], [2, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11], [2, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [2, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], [2, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 2], [2, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 2, 14], [2, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 2, 14, 15], [2, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 2, 14, 15, 1], [2, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 2, 14, 15, 1, 16], [2, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 2, 14, 15, 1, 16, 17], [2, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 2, 14, 15, 1, 16, 17, 18], [2, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 2, 14, 15, 1, 16, 17, 18, 1], [2, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 2, 14, 15, 1, 16, 17, 18, 1, 3], [2, 1, 3, 4, 5, 6, 7, 8, 9, 10, 

In [8]:
# Number of prefix sequences generated above (a 25-token segment yields 24 prefixes of length >= 2).
print(len(input_seq))

24


In [9]:
# Length of the longest prefix sequence; every sequence is padded to this length below.
maximum_length_of_seq = max(map(len, input_seq))
maximum_length_of_seq

25

In [10]:
# Left-pad every prefix with zeros so that all rows share the length of the longest sequence
# and the most recent words always sit at the right-hand end of each row.
input_seq_padded = tf.keras.preprocessing.sequence.pad_sequences(
    input_seq,
    maxlen=maximum_length_of_seq,
    padding='pre',
)
print(input_seq_padded)

[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  2
   1]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  2  1
   3]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  2  1  3
   4]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  2  1  3  4
   5]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  2  1  3  4  5
   6]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  2  1  3  4  5  6
   7]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  2  1  3  4  5  6  7
   8]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  2  1  3  4  5  6  7  8
   9]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  2  1  3  4  5  6  7  8  9
  10]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  2  1  3  4  5  6  7  8  9 10
  11]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  2  1  3  4  5  6  7  8  9 10 11
  12]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  2  1  3  4  5  6  7  8  9 10 11 12
  13]
 [ 0  0  0  0  0  0  0  0  0  0  0  2  1

# Construct the training inputs (X) and labels (y)

In [12]:
# Inputs are every column except the last (the padded context words) ...
X = input_seq_padded[:, :-1]
# ... and the label is the last column (the id of the word that follows the context).
y = input_seq_padded[:, -1]

In [13]:
# One-hot encode the next-word labels over the whole vocabulary (total_words classes).
# The integer labels are read from the last column of input_seq_padded instead of from `y`
# itself, so re-running this cell cannot one-hot encode an already one-hot array.
y = tf.keras.utils.to_categorical(input_seq_padded[:, -1], num_classes=total_words)

In [14]:
# Inspect the padded context matrix: one row per training sequence, zeros on the left.
print(X)

[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  2]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  2  1]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  2  1  3]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  2  1  3  4]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  2  1  3  4  5]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  2  1  3  4  5  6]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  2  1  3  4  5  6  7]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  2  1  3  4  5  6  7  8]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  2  1  3  4  5  6  7  8  9]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  2  1  3  4  5  6  7  8  9 10]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  2  1  3  4  5  6  7  8  9 10 11]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  2  1  3  4  5  6  7  8  9 10 11 12]
 [ 0  0  0  0  0  0  0  0  0  0  0  2  1  3  4  5  6  7  8  9 10 11 12 13]
 [ 0  0  0  0  0  0  0  0

In [15]:
# Inspect the one-hot label matrix: each row has a single 1 at the id of the next word.
print(y)

[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

# LSTM Model Building using Tensorflow Library

In [16]:
# Next-word classifier: 50-dimensional word embeddings over the padded context
# (maximum_length_of_seq - 1 tokens), a 100-unit LSTM, and a softmax over the vocabulary.
model = Sequential([
    Embedding(total_words, 50, input_length=maximum_length_of_seq-1),
    LSTM(100),
    Dense(total_words, activation='softmax'),
])

In [17]:
# Print the layer-by-layer output shapes and parameter counts of the model.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 24, 50)            1100      
                                                                 
 lstm (LSTM)                 (None, 100)               60400     
                                                                 
 dense (Dense)               (None, 22)                2222      
                                                                 
Total params: 63,722
Trainable params: 63,722
Non-trainable params: 0
_________________________________________________________________


# Compile the model

In [20]:
# Categorical cross-entropy matches the one-hot labels in `y`; accuracy is tracked per epoch.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Training the model

In [21]:
# Train for 100 epochs on all sequences. There is no validation split, so the reported
# accuracy is training accuracy only. As the last expression of the cell, the returned
# History object is shown as the cell's output.
model.fit(X, y, epochs=100, verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x1e52a7fded0>

# Now let's check how the model generates text

In [28]:
# Prompt that generation starts from; the "You:" prefix is removed again when the
# generated text is printed by the generation cell.
your_seed_text = "You:Jack"
# Upper bound on how many words the generation loop appends to the seed.
next_words = 20

In [29]:
# Greedy next-word generation: repeatedly encode the text produced so far, pad it to the
# model's input length, take the most probable next-word id, and append that word.
# NOTE: your_seed_text is extended in place, so re-run the seed cell before re-running this
# one; otherwise generation continues from the previously generated text.
for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([your_seed_text])[0]
    token_list = tf.keras.preprocessing.sequence.pad_sequences([token_list], maxlen=maximum_length_of_seq-1, padding='pre')
    predicted = model.predict(token_list, verbose=0)
    # Reduce the batch-of-one argmax to a plain Python int so it can be used as a dict key.
    predicted_word_index = int(tf.argmax(predicted, axis=-1)[0])
    # index_word is the tokenizer's own id -> word map; no scan over word_index is needed.
    output_word = tokenizer.index_word.get(predicted_word_index, "")
    if not output_word:
        # No word maps to this id (e.g. the padding id 0). The input would not change on the
        # next step, so stop instead of appending blanks for the remaining iterations.
        break
    your_seed_text += " " + output_word
print(f"Model: {your_seed_text.replace('You:', '')}")

Model: Jack and jill went up the hill to fetch a pail of water jack fell down and broke his crown and


In [30]:
# Show the accumulated text, including the original "You:" prefix that the generation cell
# strips only for display.
print(your_seed_text)

You:Jack and jill went up the hill to fetch a pail of water jack fell down and broke his crown and


In [20]:
# Referenced through `tf` because SimpleRNN is not among the layers imported at the top of
# this notebook; the bare name would raise NameError on a fresh kernel.
tf.keras.layers.SimpleRNN