In [34]:
# Toy training corpus: five short, lowercase sentences. It is used below both
# to fit the tokenizer vocabulary and to build the n-gram training sequences.
corpus = [
    "i like to play football",
    "i like to watch movies",
    "i love to play games",
    "i love to eat pizza",
    "i enjoy reading books"
]


In [35]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

In [36]:
# Build the word -> integer-index vocabulary from the corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# Vocabulary size plus one extra slot: word indices start at 1, and index 0 is
# the filler value that pad_sequences inserts.
total_words = 1 + len(tokenizer.word_index)

# Display how often each word occurs in the corpus.
tokenizer.word_counts

OrderedDict([('i', 5),
             ('like', 2),
             ('to', 4),
             ('play', 2),
             ('football', 1),
             ('watch', 1),
             ('movies', 1),
             ('love', 2),
             ('games', 1),
             ('eat', 1),
             ('pizza', 1),
             ('enjoy', 1),
             ('reading', 1),
             ('books', 1)])

In [37]:
# Sanity check: encode a single sentence and inspect its word indices.
line = "i like to play football"
# One text goes in, so exactly one encoded sequence comes back; unpack it.
(sequence,) = tokenizer.texts_to_sequences([line])
sequence

[1, 3, 2, 4, 6]

In [38]:
# Build the training examples: for every encoded sentence, collect each prefix
# of length 2 or more (the last token of a prefix later becomes the label).
input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for prefix_len in range(2, len(token_list) + 1):
        prefix = token_list[:prefix_len]
        print(prefix)
        input_sequences.append(prefix)
# token_list still refers to the last sentence processed, so this displays the
# token count of the final corpus entry only.
len(token_list)

[1, 3]
[1, 3, 2]
[1, 3, 2, 4]
[1, 3, 2, 4, 6]
[1, 3]
[1, 3, 2]
[1, 3, 2, 7]
[1, 3, 2, 7, 8]
[1, 5]
[1, 5, 2]
[1, 5, 2, 4]
[1, 5, 2, 4, 9]
[1, 5]
[1, 5, 2]
[1, 5, 2, 10]
[1, 5, 2, 10, 11]
[1, 12]
[1, 12, 13]
[1, 12, 13, 14]


4

In [41]:
# Inspect the training sequences.
# NOTE(review): the recorded output is the zero-padded array, and this cell's
# execution count (41) is higher than that of the padding cell below (40), so
# it was evidently executed after padding. On a fresh top-to-bottom run this
# cell would show the unpadded list of lists instead.
input_sequences

array([[ 0,  0,  0,  1,  3],
       [ 0,  0,  1,  3,  2],
       [ 0,  1,  3,  2,  4],
       [ 1,  3,  2,  4,  6],
       [ 0,  0,  0,  1,  3],
       [ 0,  0,  1,  3,  2],
       [ 0,  1,  3,  2,  7],
       [ 1,  3,  2,  7,  8],
       [ 0,  0,  0,  1,  5],
       [ 0,  0,  1,  5,  2],
       [ 0,  1,  5,  2,  4],
       [ 1,  5,  2,  4,  9],
       [ 0,  0,  0,  1,  5],
       [ 0,  0,  1,  5,  2],
       [ 0,  1,  5,  2, 10],
       [ 1,  5,  2, 10, 11],
       [ 0,  0,  0,  1, 12],
       [ 0,  0,  1, 12, 13],
       [ 0,  1, 12, 13, 14]], dtype=int32)

In [40]:
# Left-pad every sequence with zeros so all rows share the length of the
# longest sequence.
max_seq_len = max(map(len, input_sequences))
input_sequences = pad_sequences(
    input_sequences,
    maxlen=max_seq_len,
    padding='pre',
)


In [8]:
# Every column except the last is the model input (X); the last column holds
# the index of the word to predict (y).
X, y = input_sequences[:, :-1], input_sequences[:, -1]
# One-hot encode the target indices over the whole vocabulary.
y = to_categorical(y, num_classes=total_words)

In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense


In [10]:
# Next-word model: embedding -> simple recurrent layer -> softmax over the
# vocabulary.
# `input_length` is deliberately not passed to Embedding: Keras 3 deprecates
# that argument (it only triggers a warning), and the sequence length is taken
# from the data passed to fit/predict.
model = Sequential([
    # Map each word index to a 10-dimensional vector.
    Embedding(input_dim=total_words, output_dim=10),
    # Recurrent layer with 100 units; only its final state is passed on.
    SimpleRNN(100),
    # One probability per vocabulary index (index 0 is the padding slot).
    Dense(total_words, activation='softmax'),
])


2025-05-05 18:30:01.939116: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [11]:
# Configure training: categorical cross-entropy matches the one-hot targets
# produced by to_categorical; accuracy is tracked as an additional metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [13]:
# Train for five passes over the (X, y) pairs, logging each epoch.
# NOTE(review): the recorded log already starts at accuracy 0.7368 / loss
# 0.4527 in epoch 1, which suggests the model had been trained by earlier runs
# of this cell. A fresh kernel will presumably need many more epochs to reach
# the same state - confirm.
model.fit(
    x=X,
    y=y,
    epochs=5,
    verbose=1,
)


Epoch 1/5
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 27ms/step - accuracy: 0.7368 - loss: 0.4527
Epoch 2/5
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 35ms/step - accuracy: 0.7368 - loss: 0.4520
Epoch 3/5
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 32ms/step - accuracy: 0.7368 - loss: 0.4514
Epoch 4/5
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 29ms/step - accuracy: 0.7368 - loss: 0.4508
Epoch 5/5
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 31ms/step - accuracy: 0.7368 - loss: 0.4502


<keras.src.callbacks.history.History at 0x766f1c53d2b0>

In [14]:
def predict_next_word(seed_text, next_words=1):
    """Extend ``seed_text`` with words predicted by the trained model.

    Each round encodes the current text with the module-level ``tokenizer``,
    left-pads (or truncates) it to ``max_seq_len - 1`` tokens, asks ``model``
    for scores over the vocabulary indices and appends the highest-scoring
    word to the text.

    Args:
        seed_text: Text to continue.
        next_words: Maximum number of words to append.

    Returns:
        ``seed_text`` followed by the predicted words, separated by single
        spaces. Fewer than ``next_words`` words are appended when the model
        predicts an index that has no word (e.g. the padding index 0).
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_seq_len-1, padding='pre')
        predicted = model.predict(token_list, verbose=0)
        # Exactly one sequence was submitted, so take the argmax of its row
        # rather than of the flattened batch.
        predicted_word_index = int(np.argmax(predicted[0]))

        # index_word is the tokenizer's index -> word map; a direct lookup
        # replaces a linear scan over word_index.
        word = tokenizer.index_word.get(predicted_word_index)
        if word is None:
            # No word for this index: the text would stay unchanged, so the
            # remaining rounds would only repeat the same input. Stop here.
            break
        seed_text += " " + word
    return seed_text

# Test: predict a single next word for two seed phrases.
for seed in ("i like to", "i love to"):
    print(predict_next_word(seed, next_words=1))


i like to play
i love to play
