In [47]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Dense , Embedding , LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical


In [23]:
# Training corpus: replace this placeholder string with the real text.
sample_text = "Put your data"

In [24]:
# Fit a word-level tokenizer on the corpus; this builds tokenize.word_index,
# the word -> integer-id mapping used by every later cell.
tokenize = Tokenizer()
tokenize.fit_on_texts([sample_text])

# Vocabulary size used for the embedding and output layers. The +1 accounts
# for index 0, which the tokenizer never assigns to a word (it is the value
# pad_sequences uses for padding).
total_word = 1 + len(tokenize.word_index)


In [53]:
# Build the training sequences: for each '.'-separated chunk of the corpus,
# emit every prefix of its token ids that has at least two tokens
# (ids[:2], ids[:3], ..., ids[:len]).
input_seq = []
for sentence in sample_text.split('.'):
    encoded = tokenize.texts_to_sequences([sentence])[0]
    input_seq.extend(encoded[:end] for end in range(2, len(encoded) + 1))



In [54]:
# Left-pad every training sequence to the length of the longest one so they
# can be stacked into a single 2-D array.
max_padding_leng = max(map(len, input_seq))
input_seq_padding = np.array(
    pad_sequences(input_seq, maxlen=max_padding_leng, padding='pre')
)


In [56]:
# Split each padded row into model input and target:
#   X -> every column except the last (the context tokens)
#   y -> the last column (the token to predict), one-hot encoded over the
#        total_word classes
X = input_seq_padding[:, :-1]
y = to_categorical(input_seq_padding[:, -1], num_classes=total_word)

In [94]:
# Next-word prediction network:
#   Embedding -> maps each of the total_word token ids to a 60-dim vector
#   LSTM      -> 200-unit recurrent layer summarising the sequence
#   Dense     -> softmax over the total_word vocabulary entries
model = Sequential([
    Embedding(total_word, 60),
    LSTM(200),
    Dense(total_word, activation='softmax'),
])

In [95]:
# Compile for multi-class classification against the one-hot targets in y.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for 20 epochs; fit() is the last expression of the cell, so the
# History object it returns is shown as the cell output.
model.fit(X, y, epochs=20, verbose=1)

Epoch 1/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.0157 - loss: 4.3825  
Epoch 2/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.1062 - loss: 4.3634
Epoch 3/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.1145 - loss: 4.3215 
Epoch 4/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.0833 - loss: 4.2496 
Epoch 5/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.1218 - loss: 4.1108
Epoch 6/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.0947 - loss: 4.1550
Epoch 7/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.1166 - loss: 4.0520
Epoch 8/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.0947 - loss: 4.0751
Epoch 9/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

<keras.src.callbacks.history.History at 0x7b1c662d7610>

In [104]:
#generate new text
def generate_new_text(seed_text, next_word, max_padding_leng):
    """Greedily extend ``seed_text`` with words predicted by the model.

    Uses the module-level ``tokenize`` (fitted tokenizer) and ``model``
    (trained network). On each step the current text is tokenized, left-padded
    (and truncated) to ``max_padding_leng - 1`` tokens, and the highest-scoring
    class is appended as the next word.

    Args:
        seed_text: Starting text to extend.
        next_word: Maximum number of words to append.
        max_padding_leng: Length the training sequences were padded to; the
            model input is one token shorter because the last token was the
            training target.

    Returns:
        ``seed_text`` followed by the generated words, space-separated.
        Generation stops early if the predicted class has no vocabulary entry
        (for example the padding index 0).
    """
    for _ in range(next_word):
        token_list = tokenize.texts_to_sequences([seed_text])[0]
        token_list_padding = pad_sequences([token_list], maxlen=max_padding_leng - 1, padding='pre')

        # predict() returns one row of scores per input row; there is exactly
        # one row here, so reduce the argmax result to a plain Python int.
        predicted = int(np.argmax(model.predict(token_list_padding), axis=-1)[0])

        # Direct id -> word lookup. The previous version initialised the
        # result to a list and scanned word_index, which raised TypeError on
        # string concatenation whenever no word matched the predicted id.
        output_word = tokenize.index_word.get(predicted)
        if output_word is None:
            # No word maps to this class; the text would not change, so every
            # further step would predict the same thing. Stop here.
            break

        seed_text += " " + output_word
    return seed_text


In [106]:
# Extend a seed sentence by four predicted words and print the result.
new_text = generate_new_text(
    "The quick brown fox jumped over the lazy dog",
    4,
    max_padding_leng,
)
print(new_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
The quick brown fox jumped over the lazy dog the the the the
