In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
sentences = [
    "i love deep learaing",
    "deep learing is amazing"
]

In [4]:
tokenizer = Tokenizer(filters='')
tokenizer.fit_on_texts(sentences)
vocab_size = len(tokenizer.word_index) + 1

In [5]:
sequences = tokenizer.texts_to_sequences(sentences)
max_len = max(len(seq) for seq in sequences)
sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

X, Y = sequences[:, :-1], sequences[:, 1:]

In [11]:
# Build RNN Model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=10, input_length=max_len - 1),
    tf.keras.layers.LSTM(64, return_sequences=True),  
    tf.keras.layers.Dense(vocab_size, activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, Y, epochs=100, verbose=1)

Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - accuracy: 0.0000e+00 - loss: 2.0787
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step - accuracy: 0.1667 - loss: 2.0765
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step - accuracy: 0.5000 - loss: 2.0743
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 144ms/step - accuracy: 0.3333 - loss: 2.0721
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 99ms/step - accuracy: 0.3333 - loss: 2.0699
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 102ms/step - accuracy: 0.3333 - loss: 2.0676
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step - accuracy: 0.3333 - loss: 2.0653
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step - accuracy: 0.3333 - loss: 2.0629
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0

<keras.src.callbacks.history.History at 0x276babb9330>

In [12]:
# Predict Next Word
def predict_next_word(text):
    tokens = tokenizer.texts_to_sequences([text])[0]
    tokens = pad_sequences([tokens], maxlen=max_len - 1, padding='post')
    prediction = model.predict(tokens)[0, -1]
    
    # distribution probability
    for word, index in tokenizer.word_index.items():
        print(f"{word}: {prediction[index - 1]:.4f}")
    
    predicted_token_id = np.argmax(prediction)

    return tokenizer.index_word.get(predicted_token_id, "[UNK]")
    

In [13]:
# Test
input_text = "is"
predicted_word = predict_next_word(input_text)
print(f"Predicted next word: {predicted_word}")
print("Tokenizer index-word mapping:", tokenizer.index_word)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 275ms/step
deep: 0.0007
i: 0.0012
love: 0.0007
learaing: 0.0004
learing: 0.0173
is: 0.0114
amazing: 0.2496
Predicted next word: amazing
Tokenizer index-word mapping: {1: 'deep', 2: 'i', 3: 'love', 4: 'learaing', 5: 'learing', 6: 'is', 7: 'amazing'}
