# RNN - Next Word Prediction

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense


In [2]:
# Toy training corpus: six short sentences that share a small vocabulary,
# so the same word appears in several different contexts.
corpus = [
    # "I love <topic>" sentences
    "I love deep learning",
    "I love natural language processing",
    "I love machine learning",
    # "<topic> is <description>" sentences
    "deep learning is amazing",
    "natural language processing is fun",
    "machine learning is the future",
]

In [3]:
# Fit a word-level tokenizer on the corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
# Word indices start at 1 and index 0 is reserved for padding, hence the +1.
total_words = len(tokenizer.word_index) + 1

# Build the training n-grams: for every encoded sentence keep each prefix of
# length >= 2, e.g. [1, 2, 3, 4] -> [1, 2], [1, 2, 3], [1, 2, 3, 4].
# The last token of each prefix later becomes the word to predict.
input_sequences = [
    encoded[:end]
    for encoded in tokenizer.texts_to_sequences(corpus)
    for end in range(2, len(encoded) + 1)
]

# Left-pad every n-gram with zeros up to the longest one so they stack into
# a single 2-D array, e.g. [1, 2] -> [0, 0, 1, 2] when the longest has 4 tokens.
max_seq_len = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_seq_len, padding='pre')

# Split each padded row into predictors (all but the last token) and the
# label (the last token), then one-hot encode the labels for the softmax output.
xs, labels = input_sequences[:, :-1], input_sequences[:, -1]
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

| Variable | Meaning                          | Example                      |
| -------- | -------------------------------- | ---------------------------- |
| `xs`     | Input word sequences             | `[[0, 0, 1], [0, 1, 2]]`     |
| `labels` | Target next word (as index)      | `[2, 3]`                     |
| `ys`     | One-hot encoded labels for model | `[[0,0,1,0,0], [0,0,0,1,0]]` |


In [12]:
# Define RNN model
# Fix the Python / NumPy / TensorFlow seeds so weight initialisation and
# shuffling, and therefore the prediction shown below, are reproducible.
tf.keras.utils.set_random_seed(42)

model = Sequential()
# `input_length` is deprecated in Keras 3; the sequence length is supplied
# through `model.build(...)` below instead.
model.add(Embedding(input_dim=total_words, output_dim=10))
model.add(SimpleRNN(64))
# One output unit per vocabulary index; softmax turns them into probabilities.
model.add(Dense(total_words, activation='softmax'))
# Inputs are the n-grams without their last token, hence max_seq_len - 1.
model.build(input_shape=(None, max_seq_len-1))
model.summary()


# Labels are one-hot vectors (`ys`), so categorical cross-entropy applies.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [13]:
# Fit the next-word model on the padded n-gram prefixes (xs) and their
# one-hot encoded next words (ys).
model.fit(
    x=xs,
    y=ys,
    epochs=200,
    verbose=1,
)

Epoch 1/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.0952 - loss: 2.6342
Epoch 2/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.2381 - loss: 2.6170
Epoch 3/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.2857 - loss: 2.5996
Epoch 4/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - accuracy: 0.4286 - loss: 2.5817
Epoch 5/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step - accuracy: 0.4286 - loss: 2.5630
Epoch 6/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step - accuracy: 0.4762 - loss: 2.5434
Epoch 7/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.4762 - loss: 2.5226
Epoch 8/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - accuracy: 0.4286 - loss: 2.5006
Epoch 9/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

<keras.src.callbacks.history.History at 0x79e53ee1d220>

In [14]:
# Predict next word
def predict_next_word(seed_text):
    """Return the word the current model considers most likely to follow `seed_text`.

    The function reads the module-level `tokenizer`, `model` and `max_seq_len`
    at call time, so it always uses whichever objects those names are bound
    to when it is called.

    Args:
        seed_text: Text whose continuation should be predicted, e.g. "I love".

    Returns:
        The predicted next word as a string.
    """
    # Encode the seed text as a list of word indices.
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    # Left-pad to the same length the model was trained on.
    token_list = pad_sequences([token_list], maxlen=max_seq_len-1, padding='pre')
    # `predicted` has one row (the single input) with one score per class.
    predicted = model.predict(token_list, verbose=0)
    # Class 0 is the padding index and has no entry in `tokenizer.index_word`,
    # so looking it up would raise KeyError. Take the argmax over the real
    # word classes only (indices 1..) and shift back by one.
    best_index = int(np.argmax(predicted[0][1:])) + 1
    return tokenizer.index_word[best_index]

# Show the model's continuation for a seed phrase from the corpus.
print("Next word prediction:")
next_word = predict_next_word("I love")
print("Input: 'I love' ->", next_word)

Next word prediction:
Input: 'I love' -> natural


# BiRNN - Next Word Prediction

In [16]:
from tensorflow.keras.layers import Bidirectional

# Define BiRNN model
# Fix the Python / NumPy / TensorFlow seeds so this run is reproducible.
tf.keras.utils.set_random_seed(42)

model = Sequential()
# `input_length` is deprecated in Keras 3; the sequence length is supplied
# through `model.build(...)` below instead.
model.add(Embedding(total_words, 10))
# Bidirectional wraps the SimpleRNN so the sequence is read in both directions.
model.add(Bidirectional(SimpleRNN(64)))
model.add(Dense(total_words, activation='softmax'))
# Inputs are the n-grams without their last token, hence max_seq_len - 1.
model.build(input_shape=(None, max_seq_len-1))
model.summary()

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


# Train
model.fit(xs, ys, epochs=200, verbose=1)

# Predict
# predict_next_word reads the module-level `model`, which now refers to the
# BiRNN defined above.
print("Next word prediction (BiRNN):")
print("Input: 'I love' ->", predict_next_word("I love"))


Epoch 1/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.0476 - loss: 2.6280
Epoch 2/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.1905 - loss: 2.6067
Epoch 3/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - accuracy: 0.2381 - loss: 2.5852
Epoch 4/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - accuracy: 0.2857 - loss: 2.5633
Epoch 5/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.2857 - loss: 2.5406
Epoch 6/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 0.2857 - loss: 2.5170
Epoch 7/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - accuracy: 0.2857 - loss: 2.4923
Epoch 8/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - accuracy: 0.2857 - loss: 2.4662
Epoch 9/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

# RNN - Sentiment Analysis

In [17]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [18]:
# Toy sentiment dataset: four positive reviews followed by four negative ones.
texts = [
    # positive
    "I love this movie",
    "This film was fantastic",
    "What a great experience",
    "Absolutely wonderful acting",
    # negative
    "I hate this movie",
    "This film was terrible",
    "What a bad experience",
    "Absolutely horrible acting",
]
# One label per text, in the same order: 1 = positive, 0 = negative.
labels = [1] * 4 + [0] * 4

# Fit a fresh tokenizer on the reviews and encode each one as word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index

# Right-pad every encoded review with zeros up to the longest review.
max_len = max(map(len, sequences))
X = pad_sequences(sequences, maxlen=max_len, padding='post')
y = np.array(labels)

In [19]:
# Build RNN model
# Fix the Python / NumPy / TensorFlow seeds so this run is reproducible.
tf.keras.utils.set_random_seed(42)

model = Sequential()
# Vocabulary size is len(word_index) + 1 because index 0 is used for padding.
# `input_length` is deprecated in Keras 3; the sequence length is supplied
# through `model.build(...)` below instead.
model.add(Embedding(len(word_index)+1, 8))
model.add(SimpleRNN(16))
# Single sigmoid unit: the output is compared against the 0/1 labels.
model.add(Dense(1, activation='sigmoid'))
# Build with the padded review length so that summary() can report output
# shapes and parameter counts.
model.build(input_shape=(None, max_len))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [20]:
# Fit the sentiment model on the padded reviews (X) and their 0/1 labels (y).
model.fit(
    x=X,
    y=y,
    epochs=20,
    verbose=1,
)

Epoch 1/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.6250 - loss: 0.6925
Epoch 2/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - accuracy: 0.6250 - loss: 0.6904
Epoch 3/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - accuracy: 0.6250 - loss: 0.6883
Epoch 4/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - accuracy: 0.6250 - loss: 0.6861
Epoch 5/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - accuracy: 0.6250 - loss: 0.6840
Epoch 6/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.6250 - loss: 0.6817
Epoch 7/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 0.6250 - loss: 0.6795
Epoch 8/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - accuracy: 0.6250 - loss: 0.6772
Epoch 9/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

<keras.src.callbacks.history.History at 0x79e53ee1d2e0>

In [21]:
# Classify one unseen sentence with the trained sentiment model.
test_text = "I really love this"
# Encode and pad exactly as the training data was prepared.
seq = tokenizer.texts_to_sequences([test_text])
padded = pad_sequences(seq, maxlen=max_len, padding='post')
pred = model.predict(padded)
# The model emits a single sigmoid score; above 0.5 is reported as positive.
score = pred[0][0]
if score > 0.5:
    sentiment = "Positive"
else:
    sentiment = "Negative"
print(f"Prediction for '{test_text}':", sentiment)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 159ms/step
Prediction for 'I really love this': Positive


# BiRNN - Sentiment Analysis

In [22]:
from tensorflow.keras.layers import Bidirectional

# Build BiRNN model
# Fix the Python / NumPy / TensorFlow seeds so this run is reproducible.
tf.keras.utils.set_random_seed(42)

model = Sequential()
# `input_length` is deprecated in Keras 3; the sequence length is supplied
# through `model.build(...)` below instead.
model.add(Embedding(len(word_index)+1, 8))
# Bidirectional wraps the SimpleRNN so each review is read in both directions.
model.add(Bidirectional(SimpleRNN(16)))
model.add(Dense(1, activation='sigmoid'))
# Build with the padded review length so that summary() can report output
# shapes and parameter counts.
model.build(input_shape=(None, max_len))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Train
model.fit(X, y, epochs=20, verbose=1)

# Test
test_text = "I really love this"
# Encode and pad exactly as the training data was prepared.
seq = tokenizer.texts_to_sequences([test_text])
padded = pad_sequences(seq, maxlen=max_len, padding='post')
pred = model.predict(padded)
# The model emits a single sigmoid score; above 0.5 is reported as positive.
print(f"Prediction for '{test_text}' (BiRNN):", "Positive" if pred[0][0] > 0.5 else "Negative")


Epoch 1/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.3750 - loss: 0.6866
Epoch 2/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step - accuracy: 0.6250 - loss: 0.6841
Epoch 3/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step - accuracy: 0.6250 - loss: 0.6817
Epoch 4/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step - accuracy: 0.6250 - loss: 0.6792
Epoch 5/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step - accuracy: 0.7500 - loss: 0.6767
Epoch 6/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 134ms/step - accuracy: 0.7500 - loss: 0.6741
Epoch 7/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 145ms/step - accuracy: 0.7500 - loss: 0.6714
Epoch 8/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.8750 - loss: 0.6687
Epoch 9/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 263ms/step
Prediction for 'I really love this' (BiRNN): Positive
