In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.metrics import SparseCategoricalAccuracy, SparseTopKCategoricalAccuracy
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
import numpy as np
import re

# Seed every RNG the pipeline can draw from in one call: Python's `random`,
# NumPy and TensorFlow. Seeding only NumPy and TensorFlow is not sufficient on
# Keras 3, where layers built without an explicit seed take their default seed
# from Python's `random` module.
tf.keras.utils.set_random_seed(42)


2025-10-18 19:33:05.215390: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-10-18 19:33:05.250833: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-10-18 19:33:05.878059: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
def file_to_sentence_list(file_path):
    """Read a UTF-8 text file and return its sentences as a list of strings.

    The text is cut at every run of whitespace that directly follows a
    '.', '!' or '?'. Each piece is stripped of surrounding whitespace and
    pieces that end up empty are discarded.

    Args:
        file_path: Path of the text file to read.

    Returns:
        List of non-empty, stripped sentence strings in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()

    collected = []
    for fragment in re.split(r'(?<=[.!?])\s+', raw_text):
        cleaned = fragment.strip()
        if cleaned:
            collected.append(cleaned)
    return collected


file_path = '../data/raw/sherlock_holmes.txt'  # update to your dataset path
text_data = file_to_sentence_list(file_path)

# Fit the vocabulary on the whole corpus. Index 0 is never assigned to a word
# by the tokenizer, hence the +1 so that the output layer covers it as well.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_data)
total_words = len(tokenizer.word_index) + 1

# Build every prefix of length >= 2 for each sentence: the last token of a
# prefix is the prediction target, the preceding tokens are the context.
# The corpus is encoded with a single tokenizer call instead of one per line.
input_sequences = [
    token_list[: i + 1]
    for token_list in tokenizer.texts_to_sequences(text_data)
    for i in range(1, len(token_list))
]
if not input_sequences:
    # Without this guard the max() below fails with an unhelpful
    # "max() arg is an empty sequence" error.
    raise ValueError(
        f'No training sequences could be built from {file_path!r}: '
        'the file needs at least one sentence with two or more known tokens.'
    )

# Left-pad so that the target is always the final column.
max_sequence_len = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(
    input_sequences, maxlen=max_sequence_len, padding='pre'
)
X, y = input_sequences[:, :-1], input_sequences[:, -1]

# Split into train/validation/test sets (80/10/10).
# NOTE(review): the split is applied to individual prefixes, so prefixes of the
# same sentence can land in different splits. Validation/test scores may
# therefore be optimistic; splitting `text_data` by sentence before building
# prefixes would avoid that -- confirm whether this matters for the reported
# numbers before changing it.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, shuffle=True
)

# Sparse categorical loss/metrics take integer class ids as labels.
y_train = y_train.astype('int32')
y_val = y_val.astype('int32')
y_test = y_test.astype('int32')

print(f'Training samples: {X_train.shape[0]}')
print(f'Validation samples: {X_val.shape[0]}')
print(f'Test samples: {X_test.shape[0]}')
print(f'Vocabulary size: {total_words}')
print(f'Max sequence length: {max_sequence_len}')


Training samples: 84699
Validation samples: 10587
Test samples: 10588
Vocabulary size: 8922
Max sequence length: 251


In [3]:
# Define the model: embedding -> single LSTM -> softmax over the vocabulary.
# The context length is declared through an explicit Input layer rather than
# the deprecated `input_length` argument of Embedding; this also builds the
# model immediately, so no separate `model.build(...)` call is needed before
# `summary()`.
# NOTE(review): the left-padding zeros are fed to the LSTM unmasked. Passing
# `mask_zero=True` to Embedding would skip them, but it changes training
# results, so it is left as is here.
model = Sequential(
    [
        tf.keras.Input(shape=(max_sequence_len - 1,)),
        Embedding(total_words, 64),
        LSTM(128),
        Dense(total_words, activation='softmax'),
    ]
)

# Labels are integer word ids, hence the sparse loss and sparse metrics.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=[
        SparseCategoricalAccuracy(name='accuracy'),
        SparseTopKCategoricalAccuracy(k=5, name='top_5_accuracy'),
    ],
)

model.summary()


W0000 00:00:1760796206.851607  359237 gpu_device.cc:2342] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [4]:
# Fit the model on the training split, validating after every epoch.
EPOCHS = 100  # upper bound; the EarlyStopping callback below may end the run sooner
BATCH_SIZE = 128

# Stop once val_loss has not improved for 10 epochs and roll back to the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
# Halve the learning rate after 3 epochs without val_loss improvement.
lr_on_plateau = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, verbose=1)
callbacks = [early_stopping, lr_on_plateau]

history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=callbacks,
    verbose=1,
)


Epoch 1/100
[1m662/662[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m128s[0m 192ms/step - accuracy: 0.0546 - loss: 6.6431 - top_5_accuracy: 0.1582 - val_accuracy: 0.0660 - val_loss: 6.4415 - val_top_5_accuracy: 0.1777 - learning_rate: 0.0010
Epoch 2/100
[1m662/662[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 192ms/step - accuracy: 0.0801 - loss: 6.0935 - top_5_accuracy: 0.2145 - val_accuracy: 0.0887 - val_loss: 6.1379 - val_top_5_accuracy: 0.2445 - learning_rate: 0.0010
Epoch 3/100
[1m662/662[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 191ms/step - accuracy: 0.1088 - loss: 5.7192 - top_5_accuracy: 0.2708 - val_accuracy: 0.1185 - val_loss: 5.9466 - val_top_5_accuracy: 0.2808 - learning_rate: 0.0010
Epoch 4/100
[1m662/662[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 196ms/step - accuracy: 0.1286 - loss: 5.4611 - top_5_accuracy: 0.2981 - val_accuracy: 0.1274 - val_loss: 5.8692 - val_top_5_accuracy: 0.2898 - learning_rate: 0.0010
Epoch 5/100
[1m662/

In [5]:
# Evaluate metrics and perplexity
import math

def evaluate_split(features, labels, eval_model=None):
    """Evaluate a model on one data split and derive its perplexity.

    Metrics are read by name from the dictionary returned by
    `evaluate(..., return_dict=True)`, so the result does not depend on the
    order in which the metrics were passed to `compile`.

    Args:
        features: Model inputs for the split.
        labels: Integer target ids for the split.
        eval_model: Model to evaluate. Defaults to the notebook-level `model`.
            It must report metrics named 'loss', 'accuracy' and
            'top_5_accuracy'.

    Returns:
        Dict with float values for 'loss', 'accuracy', 'top_5_accuracy' and
        'perplexity' (exp(loss)). Perplexity is reported as infinity when the
        loss is too large for exp() to be represented as a float.
    """
    target_model = model if eval_model is None else eval_model
    results = target_model.evaluate(features, labels, verbose=0, return_dict=True)

    loss = float(results['loss'])
    try:
        perplexity = math.exp(loss)
    except OverflowError:
        # A diverged loss should show up in the report, not abort it.
        perplexity = float('inf')

    return {
        'loss': loss,
        'accuracy': float(results['accuracy']),
        'top_5_accuracy': float(results['top_5_accuracy']),
        'perplexity': float(perplexity),
    }

# Evaluate the three splits in a fixed order and keep the results by split name.
split_inputs = (
    ('train', X_train, y_train),
    ('validation', X_val, y_val),
    ('test', X_test, y_test),
)
metrics_summary = {
    split_name: evaluate_split(split_features, split_labels)
    for split_name, split_features, split_labels in split_inputs
}

# Plain-text report: one header per split followed by its metrics.
for split, stats in metrics_summary.items():
    print(f"{split.capitalize()} metrics:")
    for metric_name, value in stats.items():
        print(f"  {metric_name}: {value:.4f}")
    print()


Train metrics:
  loss: 4.9235
  accuracy: 0.1611
  top_5_accuracy: 0.3450
  perplexity: 137.4884

Validation metrics:
  loss: 5.8176
  accuracy: 0.1358
  top_5_accuracy: 0.3076
  perplexity: 336.1527

Test metrics:
  loss: 5.8391
  accuracy: 0.1357
  top_5_accuracy: 0.3087
  perplexity: 343.4813



In [6]:
# Persist model and tokenizer for later inference
from pathlib import Path
import json

# Output locations.
# NOTE(review): this directory is resolved against the current working
# directory, while the dataset is read from '../data/...'. Confirm the
# artifacts end up where later inference code expects them.
artifacts_dir = Path("Notebook", "artifacts")
artifacts_dir.mkdir(parents=True, exist_ok=True)

model_path = artifacts_dir.joinpath("notebook_model.keras")
tokenizer_path = artifacts_dir.joinpath("tokenizer.json")
metadata_path = artifacts_dir.joinpath("notebook_metadata.json")

# 1) Model, saved without optimizer state.
model.save(model_path, include_optimizer=False)

# 2) Tokenizer configuration, needed to encode text the same way at inference.
tokenizer_json = tokenizer.to_json()
with tokenizer_path.open("w", encoding="utf-8") as tokenizer_file:
    tokenizer_file.write(tokenizer_json)

# 3) Shapes and sizes required to rebuild inputs, plus the split sizes.
metadata = {
    "max_sequence_len": int(max_sequence_len),
    "vocab_size": int(total_words),
    "train_samples": int(X_train.shape[0]),
    "val_samples": int(X_val.shape[0]),
    "test_samples": int(X_test.shape[0]),
}
with metadata_path.open("w", encoding="utf-8") as metadata_file:
    metadata_file.write(json.dumps(metadata, indent=2))

print(f"Saved model to {model_path}")
print(f"Saved tokenizer config to {tokenizer_path}")
print(f"Saved metadata to {metadata_path}")


Saved model to Notebook/artifacts/notebook_model.keras
Saved tokenizer config to Notebook/artifacts/tokenizer.json
Saved metadata to Notebook/artifacts/notebook_metadata.json


In [7]:
# Text generation helper with top-k probabilities
def generate_text(seed_text, num_words=30, top_k=5, temperature=1.0, sample=False):
    """Extend `seed_text` word by word and record the top-k candidates per step.

    Uses the notebook-level `model`, `tokenizer` and `max_sequence_len`.

    Args:
        seed_text: Text to continue. It is re-tokenized at every step together
            with the words generated so far.
        num_words: Number of generation steps.
        top_k: How many of the most probable words to report per step.
            Must be >= 1.
        temperature: Rescales the predicted distribution when it is positive
            and different from 1.0; other values leave it untouched. With
            greedy decoding (`sample=False`) this only changes the reported
            probabilities, never the chosen word, because rescaling keeps the
            ranking intact.
        sample: When False (default) the most probable word is always chosen.
            When True the next word is drawn from the top-k candidates in
            proportion to their probabilities, using NumPy's global RNG.

    Returns:
        Tuple `(generated, step_details)`: the extended text and a list with
        one dict per step holding 'step' (1-based), 'next_word' and 'top_k'
        (list of `(word, probability)` pairs, most probable first).

    Raises:
        ValueError: If `top_k` is smaller than 1.
    """
    if top_k < 1:
        # A top_k of 0 would turn the slice below into [-0:], i.e. the whole
        # vocabulary instead of nothing.
        raise ValueError(f"top_k must be >= 1, got {top_k}")

    generated = seed_text.strip()
    step_details = []

    for step in range(num_words):
        token_list = tokenizer.texts_to_sequences([generated])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        predictions = model.predict(token_list, verbose=0)[0]
        predictions = predictions.astype('float64')

        if temperature != 1.0 and temperature > 0:
            scaled = np.log(predictions + 1e-8) / temperature
            # Shifting by the maximum leaves the normalised result unchanged
            # but keeps exp() from underflowing to all zeros (0/0 -> NaN) at
            # low temperatures.
            scaled -= scaled.max()
            predictions = np.exp(scaled)
            predictions = predictions / np.sum(predictions)

        # Index 0 is the padding id and has no entry in `index_word`. If it
        # were picked, an empty word would be appended and every later step
        # would repeat the same prediction. The other probabilities are left
        # as they are.
        predictions[0] = 0.0

        top_indices = predictions.argsort()[-top_k:][::-1]
        top_predictions = [
            (tokenizer.index_word.get(idx, ''), float(predictions[idx]))
            for idx in top_indices
        ]

        next_index = int(top_indices[0])
        if sample:
            top_probs = predictions[top_indices]
            prob_mass = top_probs.sum()
            # Fall back to the greedy choice if the candidates carry no mass.
            if prob_mass > 0:
                next_index = int(np.random.choice(top_indices, p=top_probs / prob_mass))

        next_word = tokenizer.index_word.get(next_index, '')
        generated = f"{generated} {next_word}".strip()

        step_details.append(
            {
                'step': step + 1,
                'next_word': next_word,
                'top_k': top_predictions,
            }
        )

    return generated, step_details


In [8]:
# Generate sample outputs with detailed top-5 breakdown
# (seed text, number of words to generate) pairs; the first one gets the
# detailed per-step breakdown, the remaining ones only print the final text.
examples = [
    ("I saw Holmes", 30),
    ("The adventure", 30),
    ("Dr Watson", 30),
]

(primary_seed, primary_len), *remaining_examples = examples
primary_text, primary_steps = generate_text(primary_seed, num_words=primary_len)

print(f'Seed: "{primary_seed}"')
print(f'Generated text (length {primary_len} words):\n{primary_text}\n')
print('Top-5 predictions at each generation step:')
for info in primary_steps:
    topk_formatted = ', '.join(f"{word} ({prob:.3f})" for word, prob in info['top_k'])
    print(f"Step {info['step']:>2}: next word = {info['next_word']} | top-5 -> {topk_formatted}")

divider = '-' * 80
for seed, length in remaining_examples:
    text, _ = generate_text(seed, num_words=length)
    print('\n' + divider)
    print(f'Seed: "{seed}"')
    print(f'Generated text (length {length} words):\n{text}')


Seed: "I saw Holmes"
Generated text (length 30 words):
I saw Holmes in the door and i had been a very man of the matter ” said holmes was a very man of the door of the door of the door of

Top-5 predictions at each generation step:
Step  1: next word = in | top-5 -> in (0.047), and (0.047), to (0.036), ” (0.030), as (0.023)
Step  2: next word = the | top-5 -> the (0.367), a (0.099), my (0.061), his (0.051), this (0.025)
Step  3: next word = door | top-5 -> door (0.015), room (0.012), very (0.011), matter (0.009), house (0.009)
Step  4: next word = and | top-5 -> and (0.218), of (0.149), which (0.053), ” (0.050), in (0.047)
Step  5: next word = i | top-5 -> i (0.098), he (0.042), the (0.039), a (0.035), we (0.025)
Step  6: next word = had | top-5 -> had (0.105), have (0.073), was (0.052), am (0.034), could (0.030)
Step  7: next word = been | top-5 -> been (0.147), not (0.039), a (0.029), no (0.024), seen (0.023)
Step  8: next word = a | top-5 -> a (0.041), in (0.033), to (0.024), the (0