### Create a LSTM model for text analysis using Python and TensorFlow.
(Input corpus = 
* "Anna, embrace change"
* "Change is constant"
* "She is positive thinker"
* "Anna takes life as a challenge")

#### Step 1: Import the required libraries

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
corpus = [
    "Anna, embrace change",
    "Change is constant",
    "She is a positive thinker",
    "Anna takes life as a challenge"
]

#### Step 2: Text-Tokenize

In [3]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1

In [4]:
total_words

14

#### Step 3: Text to sequences conversion

In [5]:
input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+3]
        input_sequences.append(n_gram_sequence)

In [6]:
input_sequences

[[1, 5, 2],
 [1, 5, 2],
 [2, 3, 6],
 [2, 3, 6],
 [7, 3, 4, 8],
 [7, 3, 4, 8, 9],
 [7, 3, 4, 8, 9],
 [7, 3, 4, 8, 9],
 [1, 10, 11, 12],
 [1, 10, 11, 12, 4],
 [1, 10, 11, 12, 4, 13],
 [1, 10, 11, 12, 4, 13],
 [1, 10, 11, 12, 4, 13]]

#### Step 4: Pad sequences

In [7]:
max_sequence_len = max([len(x) for x in input_sequences])
max_sequence_len

6

In [8]:
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
input_sequences

array([[ 0,  0,  0,  1,  5,  2],
       [ 0,  0,  0,  1,  5,  2],
       [ 0,  0,  0,  2,  3,  6],
       [ 0,  0,  0,  2,  3,  6],
       [ 0,  0,  7,  3,  4,  8],
       [ 0,  7,  3,  4,  8,  9],
       [ 0,  7,  3,  4,  8,  9],
       [ 0,  7,  3,  4,  8,  9],
       [ 0,  0,  1, 10, 11, 12],
       [ 0,  1, 10, 11, 12,  4],
       [ 1, 10, 11, 12,  4, 13],
       [ 1, 10, 11, 12,  4, 13],
       [ 1, 10, 11, 12,  4, 13]])

#### Step 5: Split into X(features) y(target)

In [9]:
X, y = input_sequences[:, :-1], input_sequences[:, -1]
X

array([[ 0,  0,  0,  1,  5],
       [ 0,  0,  0,  1,  5],
       [ 0,  0,  0,  2,  3],
       [ 0,  0,  0,  2,  3],
       [ 0,  0,  7,  3,  4],
       [ 0,  7,  3,  4,  8],
       [ 0,  7,  3,  4,  8],
       [ 0,  7,  3,  4,  8],
       [ 0,  0,  1, 10, 11],
       [ 0,  1, 10, 11, 12],
       [ 1, 10, 11, 12,  4],
       [ 1, 10, 11, 12,  4],
       [ 1, 10, 11, 12,  4]])

In [10]:
y = tf.keras.utils.to_categorical(y, num_classes=total_words)
y

array([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])

#### Step 6: Model Creation

In [11]:
model = Sequential([
    Embedding(total_words, 10, input_length=max_sequence_len-1),
    LSTM(100),
    Dense(total_words, activation='softmax')
])



#### Step 7: Compile the model

In [12]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

#### Step 8: Train the model

In [13]:
model.fit(X, y, epochs=200, verbose=1)

Epoch 1/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step - accuracy: 0.0000e+00 - loss: 2.6399
Epoch 2/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step - accuracy: 0.3846 - loss: 2.6348
Epoch 3/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - accuracy: 0.3846 - loss: 2.6297
Epoch 4/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy: 0.6154 - loss: 2.6245
Epoch 5/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.6154 - loss: 2.6190
Epoch 6/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 0.6154 - loss: 2.6132
Epoch 7/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step - accuracy: 0.6154 - loss: 2.6069
Epoch 8/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step - accuracy: 0.6154 - loss: 2.6001
Epoch 9/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

<keras.src.callbacks.history.History at 0x1e97eac2b10>

#### Step 9: Generate Text

In [14]:
def generate_text(model, tokenizer, seed_text, next_words, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict(token_list, verbose=0)
        predicted_word_index = np.argmax(predicted, axis=-1)
        output_word = tokenizer.index_word[predicted_word_index[0]]
        seed_text += " " + output_word
    return seed_text

In [15]:
new_text = "Anna"
generated_text = generate_text(model, tokenizer, new_text, 5, max_sequence_len)
print(generated_text)

Anna constant change as a challenge
