## Next Word Prediction using LSTM

In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

In [2]:
df = pd.read_csv("about_topics.csv", nrows=10)
df.head()

Unnamed: 0,topic,text
0,car,Cars are like four-wheeled engines on wheels t...
1,car,"Once a futuristic dream, cars are now everyday..."
2,car,Cars are like our modern horse and carriage! T...
3,car,"From clunky contraptions to sleek machines, ca..."
4,car,"Imagine a metal cheetah, purring with power! C..."


In [3]:
corpus = [sent for sent in df['text']]

corpus[0]

"Cars are like four-wheeled engines on wheels that zoom us around. They usually have space for a handful of people and come in all shapes and sizes, from tiny and sporty to big and comfy. Cars have a special engine inside that burns fuel, like gasoline or electricity, to give them the power to move. We steer them with a wheel and use pedals to brake and go! Cars help us travel long distances or get around town quickly. They can be a lot of fun to ride in, but it's important to remember to always wear a seatbelt and follow traffic rules to stay safe."

In [4]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

total_words = len(tokenizer.word_index) + 1
print(total_words)

314


In [5]:
input_seq = []

for c in corpus:
    tokens = tokenizer.texts_to_sequences([c])[0]
    for i in range(1, len(tokens)):
        ng_seq = tokens[:i+1]
        input_seq.append(ng_seq)
input_seq[:5]

[[7, 12],
 [7, 12, 6],
 [7, 12, 6, 28],
 [7, 12, 6, 28, 44],
 [7, 12, 6, 28, 44, 72]]

In [6]:
print(input_seq[4])

setence_token = input_seq[4]
sentence = []
for token in setence_token:
    sentence.append(list((tokenizer.word_index).keys())[list((tokenizer.word_index).values()).index(token)])

print(sentence)

[7, 12, 6, 28, 44, 72]
['cars', 'are', 'like', 'four', 'wheeled', 'engines']


In [7]:
max_seq_len = max([len(seq) for seq in input_seq])
input_seq = np.array(pad_sequences(input_seq, maxlen=max_seq_len, padding='pre'))

In [8]:
X = input_seq[:, :-1]
y = np.array(tf.keras.utils.to_categorical(input_seq[:, -1], num_classes=total_words))

In [9]:
model = Sequential([
    Embedding(total_words, 100, input_length=max_seq_len-1),
    Bidirectional(LSTM(150)),
    Dense(total_words, activation='softmax')
])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 123, 100)          31400     
                                                                 
 bidirectional (Bidirection  (None, 300)               301200    
 al)                                                             
                                                                 
 dense (Dense)               (None, 314)               94514     
                                                                 
Total params: 427114 (1.63 MB)
Trainable params: 427114 (1.63 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [10]:
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.01),
              metrics=['accuracy'])

In [11]:
model.fit(X, y, epochs=20, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x28f9a261410>

In [12]:
seed_text = "A car have"
next_words = 3

for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_seq_len-1, padding='pre')
    predicted = np.argmax(model.predict(token_list), axis=-1)
    output_word = ""
    for word, index in tokenizer.word_index.items():
        if index == predicted:
            output_word = word
            break
    seed_text += " " + output_word



In [13]:
seed_text

'A car have machine on wheels'