In [None]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN,Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Sample training text: a tiny corpus that is used to fit the tokenizer
# vocabulary and is then split into next-word training sequences.
data = """
Once upon a time in a land far away,there live a young prince.
The prince was brave,strong and kind.One day, the prince set out
on a adventure to discover new lands and find hidden treasures
"""

Data Preparation:

The sample text is tokenized using Keras's Tokenizer. Input sequences are created with an increasing number of tokens so that the model can learn to predict the next word in each sequence. `tokenizer = Tokenizer()`: Creates an instance of the Tokenizer, which is used to convert the text into sequences of integers. `tokenizer.fit_on_texts([data])`: Fits the tokenizer on the input text (`data`), creating a dictionary that maps each word to a unique integer. `total_words = len(tokenizer.word_index) + 1`: `tokenizer.word_index` is a dictionary that maps words to indices. We add 1 to the number of words because the indices start from 1 and index 0 is reserved for the padding token.

In [None]:
# Fit a word-level tokenizer on the corpus to build the vocabulary
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# Vocabulary size: one slot per known word, plus one extra because word
# indices start at 1 (index 0 is left for padding)
total_words = 1 + len(tokenizer.word_index)

`tokenizer.texts_to_sequences([line])` takes the current sentence (`line`) and converts it into a list of tokens (integers), using the tokenizer fitted above, which maps words to unique integers. Since `texts_to_sequences` returns a list of lists (because it processes batches of sentences), the `[0]` index is used to extract the token list for the current sentence. The inner `for` loop iterates over the tokenized sentence, starting from the second token (index 1). In each iteration, it creates an n-gram sequence by taking the first `i+1` tokens from the token list using slicing: `token_list[:i+1]`. This gives a subsequence of increasing length. Each `n_gram_sequence` is added to the `input_sequences` list.

In [None]:
# Convert the text into sequences of tokens.
# Sentences in `data` end with "." followed by a newline or directly by the
# next word, so split on "." alone: splitting on ". " never matches and
# would treat the whole corpus as a single sentence.
input_sequences = []
for line in data.split('.'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    # Every prefix of length >= 2 is one training example: all tokens but the
    # last are the input, the last token is the target. Empty fragments yield
    # no tokens and therefore contribute no prefixes.
    input_sequences.extend(
        token_list[:end] for end in range(2, len(token_list) + 1)
    )

`max_sequence_len`: The length of the longest sequence in `input_sequences`, so that all sequences can be padded to the same length. `pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')`: Pads shorter sequences with zeros at the beginning (`'pre'`) so that all sequences have the same length.

In [None]:
# Left-pad every sequence with zeros up to the length of the longest one,
# so the model receives inputs of a consistent size
max_sequence_len = max(map(len, input_sequences))
input_sequences = np.array(
    pad_sequences(
        input_sequences,
        maxlen=max_sequence_len,
        padding='pre',
    )
)

`X = input_sequences[:, :-1]`: Takes all but the last word of each sequence as the input (features). This is what the model will use to predict the next word. `y = input_sequences[:, -1]`: The last word in each sequence is treated as the label (the word to be predicted). `y = np.eye(total_words)[y]`: Converts `y` into a one-hot encoded format, which is needed for classification. For example, if `total_words` is 100, the output will be a vector of size 100 where only one position (corresponding to the correct word) is 1 and the rest are 0.

In [None]:
# Split each padded sequence into predictors (every token except the last)
# and label (the last token)
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

# One-hot encode the labels by selecting rows of an identity matrix
y = np.eye(total_words)[y]

Model Architecture:
An embedding layer to represent words in vectors. A simple RNN layer to learn the sequences of words. A dense layer with softmax activation to predict the next word based on the input sequences.

In [None]:
# Build the RNN Model
EMBEDDING_DIM = 10  # size of each learned word vector
RNN_UNITS = 50      # number of hidden units in the recurrent layer

model = Sequential([
    # Maps each word index to a dense vector of EMBEDDING_DIM values
    Embedding(total_words, EMBEDDING_DIM),
    # Reads the embedded sequence and summarises it in its final hidden state
    SimpleRNN(RNN_UNITS),
    # One softmax output per vocabulary entry: the next-word distribution
    Dense(total_words, activation='softmax'),
])

# The `input_length` argument of Embedding is deprecated in Keras 3, so the
# input size is declared by building the model explicitly instead. Building
# here also lets model.summary() report output shapes and parameter counts.
model.build(input_shape=(None, max_sequence_len - 1))



In [None]:
# Configure training: Adam optimizer, cross-entropy over the one-hot labels,
# and accuracy as the reported metric; then print the layer overview
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Fit the model on the prefix/next-word pairs for 100 epochs,
# printing per-epoch progress
model.fit(
    X,
    y,
    epochs=100,
    verbose=1,
)

Epoch 1/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 29ms/step - accuracy: 0.1033 - loss: 3.4580
Epoch 2/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.1706 - loss: 3.4137 
Epoch 3/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.1602 - loss: 3.3818
Epoch 4/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.0853 - loss: 3.3590
Epoch 5/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.1318 - loss: 3.3302
Epoch 6/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.1318 - loss: 3.2953
Epoch 7/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.1602 - loss: 3.2620
Epoch 8/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.1782 - loss: 3.2328
Epoch 9/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

<keras.src.callbacks.history.History at 0x7a678a5cd0f0>

predict_next_word: Function to predict the next num_words given some seed_text.

`tokenizer.texts_to_sequences([seed_text])[0]`: Converts the seed text into a sequence of integers. `pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')`: Pads the token list to match the input length required by the model. `model.predict(token_list)`: The model predicts probabilities for each word in the vocabulary. `np.argmax(..., axis=-1)`: Retrieves the index of the word with the highest probability. The predicted index is then mapped back to its word through the tokenizer's vocabulary (`tokenizer.word_index` maps each word to its index). `seed_text += " " + output_word`: Appends the predicted word to the seed text. `return seed_text`: Returns the seed text with the predicted words appended. Testing the Model: `seed_text = "The prince"`, `next_words = 5`, `print(predict_next_word(seed_text, next_words))`. `seed_text = "The prince"`: The seed text for which you want to generate the next few words. `next_words = 5`: The number of words you want to predict.

print(predict_next_word(seed_text, next_words)): Prints the result

In [None]:
# Function to predict next word
def predict_next_word(seed_text, num_words):
    """Extend ``seed_text`` with up to ``num_words`` predicted words.

    Relies on the notebook-level ``tokenizer``, ``model`` and
    ``max_sequence_len`` defined in earlier cells.

    Args:
        seed_text: Text to start from; it is re-tokenized on every step.
        num_words: Maximum number of words to append.

    Returns:
        ``seed_text`` followed by the predicted words, separated by spaces.
        Generation stops early if the model predicts an index that has no
        word in the vocabulary (e.g. the padding index 0).
    """
    for _ in range(num_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        # Shape the tokens exactly like a training input: left-padded to the
        # number of predictor positions the model was trained on.
        padded = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        # verbose=0 keeps the per-call progress bar out of the cell output.
        probabilities = model.predict(padded, verbose=0)
        # argmax returns a 1-element array for the single-row batch; convert
        # it to a plain int before using it as a dictionary key.
        predicted_index = int(np.argmax(probabilities, axis=-1)[0])
        # index_word is the inverse of word_index, so this is a direct lookup
        # rather than a scan over the whole vocabulary.
        output_word = tokenizer.index_word.get(predicted_index)
        if output_word is None:
            # No such word: stop instead of appending an empty word.
            break
        seed_text += " " + output_word
    return seed_text

In [None]:
# Try the trained model: generate a few words after a short seed phrase
seed_text = "The prince"
next_words = 5
generated_text = predict_next_word(seed_text, next_words)
print(generated_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 181ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
The prince a a a a a
