In [17]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
import numpy as np

# Sample text corpus
corpus = [
    "the sky is blue",
    "the sun is bright",
    "the sun in the sky is bright",
    "we can see the shining sun",
    "the sun is in the sky"
]

# Tokenize the corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1

# Create input sequences for next word prediction
input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    print(token_list)
    for i in range(1, len(token_list)):
        input_sequences.append(token_list[:i+1])

# Pad sequences
max_seq_len = max([len(seq) for seq in input_sequences])
input_sequences = pad_sequences(input_sequences, maxlen=max_seq_len, padding='pre')

# Split into input (X) and output (y)
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

print("Input:\n",X[0:10])
print("Decoded Input")
for x in X[0:10]:
  print([tokenizer.index_word[number] for number in x if number!=0])
print("Output:\n", y[0:10])
print("Decoded Output")
print([tokenizer.index_word[number] for number in y[0:10] ])
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

# Define the RNN model
model = Sequential([
    Embedding(input_dim=total_words, output_dim=10, input_length=max_seq_len - 1),
    SimpleRNN(64),
    Dense(total_words, activation='softmax')
])






[1, 4, 2, 7]
[1, 3, 2, 5]
[1, 3, 6, 1, 4, 2, 5]
[8, 9, 10, 1, 11, 3]
[1, 3, 2, 6, 1, 4]
Input:
 [[0 0 0 0 0 1]
 [0 0 0 0 1 4]
 [0 0 0 1 4 2]
 [0 0 0 0 0 1]
 [0 0 0 0 1 3]
 [0 0 0 1 3 2]
 [0 0 0 0 0 1]
 [0 0 0 0 1 3]
 [0 0 0 1 3 6]
 [0 0 1 3 6 1]]
Decoded Input
['the']
['the', 'sky']
['the', 'sky', 'is']
['the']
['the', 'sun']
['the', 'sun', 'is']
['the']
['the', 'sun']
['the', 'sun', 'in']
['the', 'sun', 'in', 'the']
Output:
 [4 2 7 3 2 5 3 6 1 4]
Decoded Output
['sky', 'is', 'blue', 'sun', 'is', 'bright', 'sun', 'in', 'the', 'sky']




In [11]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, epochs=200, verbose=1)

Epoch 1/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.2273 - loss: 2.4790
Epoch 2/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.3182 - loss: 2.4633
Epoch 3/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - accuracy: 0.3182 - loss: 2.4473
Epoch 4/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.3182 - loss: 2.4306
Epoch 5/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 0.4091 - loss: 2.4128
Epoch 6/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - accuracy: 0.4091 - loss: 2.3935
Epoch 7/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.4091 - loss: 2.3724
Epoch 8/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - accuracy: 0.4545 - loss: 2.3492
Epoch 9/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

<keras.src.callbacks.history.History at 0x7b834a397e50>

In [12]:
# Function to predict the next word
def predict_next_word(seed_text, next_words=1):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_seq_len-1, padding='pre')
        predicted_probs = model.predict(token_list, verbose=0)
        predicted_word_index = np.argmax(predicted_probs, axis=1)[0]
        output_word = tokenizer.index_word[predicted_word_index]
        seed_text += " " + output_word
    return seed_text

# Example
print(predict_next_word("the sun", next_words=4))

the sun is in the sky
