In [1]:
import nltk
import numpy as np
from nltk.corpus import brown
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Embedding, Input, SimpleRNN
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
# Fetch the NLTK resources this notebook requests: the Brown corpus
# (read through `brown.sents()` in Step 1) and the 'punkt' tokenizer models.
# NOTE(review): no visible cell appears to use punkt — confirm it is still needed.
nltk.download('brown')
nltk.download('punkt')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
# Step 1: Load and clean data
min_tokens = 3        # sentences with fewer tokens than this are skipped
max_sentences = 5000  # Limit for quick training

# Collect the first `max_sentences` sentences that are long enough, each one
# re-joined into a single lower-cased, space-separated string. Stopping as soon
# as the quota is reached avoids joining and lower-casing the rest of the
# corpus only to throw it away afterwards.
sentences = []
for sent in brown.sents():
    if len(sentences) >= max_sentences:
        break
    if len(sent) >= min_tokens:
        sentences.append(' '.join(sent).lower())

In [3]:
# Step 2: Tokenize
# Fit a Keras tokenizer on the cleaned sentences so that every distinct word
# is assigned an integer id.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

# word -> id mapping learned above; one extra slot is added to the vocabulary
# size on top of the number of known words.
word_index = tokenizer.word_index
vocab_size = 1 + len(word_index)

In [4]:
# Step 3: Create training samples (trigrams: context → next word)
context_size = 2  # number of preceding words used to predict the next one

# Convert all sentences to id sequences in one batched call rather than
# invoking the tokenizer once per sentence.
token_sequences = tokenizer.texts_to_sequences(sentences)

X, y = [], []
for tokens in token_sequences:
    # Slide a window over the sentence: each position from `context_size`
    # onward becomes a target, with the ids just before it as its context.
    # Sequences no longer than `context_size` contribute no samples.
    for i in range(context_size, len(tokens)):
        X.append(tokens[i - context_size:i])
        y.append(tokens[i])

# reshape keeps X two-dimensional (n_samples, context_size) even when no
# samples were produced, so downstream code always sees the same rank.
X = np.array(X).reshape(-1, context_size)
y = np.array(y)

In [5]:
# Step 4: Train-test split
# Hold out 20% of the (context, target) pairs for evaluation; the fixed
# random_state makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Step 5: Model
embedding_dim = 100
rnn_units = 128

# Architecture: word ids -> dense embeddings -> single SimpleRNN layer ->
# softmax over the whole vocabulary.
# The input shape (two context word ids per sample) is declared with an
# explicit Input layer. Passing `input_shape` to the first layer of a
# Sequential model makes Keras emit a warning recommending Input instead.
model = Sequential([
    Input(shape=(2,)),
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    SimpleRNN(rnn_units),
    Dense(vocab_size, activation='softmax'),
])

# Targets are integer word ids (not one-hot vectors), hence the sparse loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

  super().__init__(**kwargs)


In [7]:
# Step 6: Train
# Five passes over the training split in mini-batches of 256; the held-out
# split is scored after every epoch.
model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=256,
    validation_data=(X_test, y_test),
)

Epoch 1/5
[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 160ms/step - accuracy: 0.0623 - loss: 8.2834 - val_accuracy: 0.0789 - val_loss: 7.2762
Epoch 2/5
[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 149ms/step - accuracy: 0.0874 - loss: 6.9165 - val_accuracy: 0.0942 - val_loss: 7.1359
Epoch 3/5
[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 150ms/step - accuracy: 0.1071 - loss: 6.5347 - val_accuracy: 0.1055 - val_loss: 7.1107
Epoch 4/5
[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 149ms/step - accuracy: 0.1213 - loss: 6.2280 - val_accuracy: 0.1078 - val_loss: 7.1452
Epoch 5/5
[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 145ms/step - accuracy: 0.1305 - loss: 5.9682 - val_accuracy: 0.1088 - val_loss: 7.2201


<keras.src.callbacks.history.History at 0x787a1e5ffa50>

In [8]:

# Step 7: Evaluation on Test Set
# evaluate() yields the loss followed by the metrics given at compile time
# (accuracy only), so the result unpacks into exactly two values.
loss, accuracy = model.evaluate(x=X_test, y=y_test)
print(f"Test Accuracy: {accuracy:.4f}")

# Optional: Top-3 Accuracy
def top_k_accuracy(model, X, y_true, k=3):
    """Fraction of samples whose true label is among the k highest-scoring classes.

    Args:
        model: Object exposing ``predict(X, verbose=0)`` that returns a 2-D
            array-like of per-class scores, one row per sample.
        X: Inputs, forwarded unchanged to ``model.predict``.
        y_true: Integer class labels, one per sample (NumPy array or any
            sequence convertible to one).
        k: How many of the top-scoring classes count as a hit; must be >= 1.

    Returns:
        The mean hit rate over all samples (a NumPy float between 0 and 1).

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        # Without this guard, k == 0 would slice with ``-0:``, which selects
        # *every* class and reports a meaningless 100% accuracy.
        raise ValueError(f"k must be a positive integer, got {k}")

    preds = np.asarray(model.predict(X, verbose=0))
    # Column vector so the comparison broadcasts against the (n, k) index array.
    labels = np.asarray(y_true).reshape(-1, 1)

    # argsort is ascending, so the last k columns hold the k best class ids.
    top_k_preds = np.argsort(preds, axis=1)[:, -k:]
    match = np.any(top_k_preds == labels, axis=1)
    return np.mean(match)

# Share of test samples whose true next word is among the model's three
# highest-probability predictions.
top3 = top_k_accuracy(model, X_test, y_test, 3)
print(f"Top-3 Accuracy: {top3:.4f}")

[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 12ms/step - accuracy: 0.1124 - loss: 7.2202
Test Accuracy: 0.1088
Top-3 Accuracy: 0.1805


In [9]:
# Step 8: Sample Predictions
# id -> word lookup, the inverse of the tokenizer's word -> id mapping.
reverse_word_index = {v: k for k, v in word_index.items()}

# Show at most five examples, without indexing past the end of a small test set.
n_samples = min(5, len(X_test))
sample_contexts = X_test[:n_samples]
sample_targets = y_test[:n_samples]

# One batched forward pass instead of a separate predict() call per example.
# Skipped entirely when there is nothing to predict.
if n_samples:
    sample_pred_ids = np.argmax(model.predict(sample_contexts, verbose=0), axis=1)
else:
    sample_pred_ids = []

print("Sample Predictions:")
for context, true_id, pred_id in zip(sample_contexts, sample_targets, sample_pred_ids):
    # Every id goes through .get() so an id without a word prints as "<UNK>"
    # rather than raising KeyError.
    context_words = ' '.join(reverse_word_index.get(int(idx), "<UNK>") for idx in context)
    true_word = reverse_word_index.get(int(true_id), "<UNK>")
    pred_word = reverse_word_index.get(int(pred_id), "<UNK>")
    print(f"Context: '{context_words}' → Prediction: '{pred_word}' | Actual: '{true_word}'")


Sample Predictions:
Context: 'catholic atmosphere' → Prediction: 'of' | Actual: 'is'
Context: 'any test' → Prediction: 'the' | Actual: 'of'
Context: 'that an' → Prediction: 'own' | Actual: 'increase'
Context: 'city's snow' → Prediction: 'in' | Actual: 'clearing'
Context: 'interstate commerce' → Prediction: 'and' | Actual: 'commission'


In [10]:
context = 'i am'

# Tokenize the raw string exactly as the training sentences were tokenized in
# Step 3 (whole string in, id sequence out), so that lower-casing and
# punctuation handling match what the model saw during training.
# NOTE(review): the tokenizer was created without an oov_token, so words it has
# never seen are presumably dropped here rather than mapped to a placeholder.
context_sequence = tokenizer.texts_to_sequences([context])[0]

if len(context_sequence) < 2:
    # Handle cases where the context has fewer than two usable words
    print("Error: Context must contain at least two words.")
    pred_word = "<Error>"
else:
    # The model consumes exactly two word ids: keep the last two. Longer
    # contexts are truncated and then predicted on. Previously they were
    # truncated but never predicted, leaving a stale `pred_word` from an
    # earlier cell to be printed.
    context_sequence = np.array([context_sequence[-2:]])

    # Predict the next word probabilities
    pred = model.predict(context_sequence, verbose=0)[0]

    # Get the index of the word with the highest probability
    predicted_word_index = int(np.argmax(pred))

    # Get the predicted word from the reverse word index
    pred_word = reverse_word_index.get(predicted_word_index, "<UNK>")

print(pred_word)

a
