In [None]:
# WikiText Next-Word Prediction Model (LSTM)
# Complete implementation for Google Colab; the data cell loads the "wikitext-2-raw-v1" configuration

# Install required packages
!pip install transformers tensorflow

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from datasets import load_dataset
import re
import pickle
import os
from collections import Counter
import matplotlib.pyplot as plt

# Seed NumPy and TensorFlow with one shared value for reproducibility
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:


# The configuration loaded here is the raw WikiText-2 variant, so the log line
# reports the configuration name actually used instead of a hard-coded
# "WikiText-103" that did not match the call below.
WIKITEXT_CONFIG = "wikitext-2-raw-v1"
print(f" Loading WikiText dataset ({WIKITEXT_CONFIG})...")
dataset = load_dataset("wikitext", WIKITEXT_CONFIG)

# Data preprocessing functions
def clean_text(text):
    """Normalise a raw text line into lowercase words and basic punctuation.

    Non-string or blank input yields an empty string. Otherwise the text is
    lowercased and a fixed series of regex substitutions is applied in order,
    after which surrounding whitespace is stripped.
    """
    if not isinstance(text, str):
        return ""
    if not text.strip():
        return ""

    # (pattern, replacement) pairs, applied top to bottom
    substitutions = (
        (r'@-@', '-'),                               # WikiText hyphen token
        (r'= = .+ = =', ''),                         # headers
        (r'= .+ =', ''),                             # subheaders
        (r'[^a-zA-Z0-9\s\.\,\!\?\;\:]', ' '),        # anything but letters, digits, whitespace, . , ! ? ; :
        (r'\s+', ' '),                               # collapse whitespace runs
    )

    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

def create_sequences_and_targets(texts, tokenizer, seq_length=5):
    """Build fixed-length input windows and their next-token targets.

    Each text is converted to token ids with ``tokenizer.texts_to_sequences``
    and a window of ``seq_length`` ids is slid over it one position at a time;
    the id immediately following each window is that window's target.

    Args:
        texts: Iterable of strings; falsy or whitespace-only entries are skipped.
        tokenizer: Object exposing ``texts_to_sequences(list_of_str)``.
        seq_length: Number of token ids in every input window.

    Returns:
        Tuple ``(sequences, targets)`` of NumPy arrays with shapes
        ``(n, seq_length)`` and ``(n,)``. When no text is long enough to yield
        a window, both arrays are empty integer arrays that still have those
        shapes, instead of collapsing to 1-D float arrays of shape ``(0,)``.
    """
    sequences = []
    targets = []

    for text in texts:
        if not text or not text.strip():
            continue

        tokens = tokenizer.texts_to_sequences([text])[0]

        # The range bound already guarantees a full window plus one target id,
        # so no per-window length check is needed.
        for start in range(len(tokens) - seq_length):
            end = start + seq_length
            sequences.append(tokens[start:end])  # input window
            targets.append(tokens[end])          # id that follows it

    if not sequences:
        # Keep the 2-D / 1-D layout downstream code expects even with no data.
        return np.empty((0, seq_length), dtype=int), np.empty((0,), dtype=int)

    return np.array(sequences), np.array(targets)


In [None]:

# Clean every training example and keep only the ones with enough words
print(" Preprocessing training data...")
MIN_WORDS = 10              # an example must contain more than this many words
MAX_TRAIN_EXAMPLES = 50000  # cap on examples, for faster training in Colab

cleaned_examples = (clean_text(example['text']) for example in dataset['train'])
train_texts = [
    cleaned
    for cleaned in cleaned_examples
    if cleaned and len(cleaned.split()) > MIN_WORDS
]

# Report the full count first, then truncate to the cap
print(f"Total training examples: {len(train_texts)}")
train_texts = train_texts[:MAX_TRAIN_EXAMPLES]
print(f"Using {len(train_texts)} training examples for faster training")



In [None]:
# Create and fit tokenizer
print(" Creating tokenizer...")
MAX_VOCAB_WORDS = 10000
tokenizer = Tokenizer(num_words=MAX_VOCAB_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)

# `word_index` holds every distinct word seen while fitting, regardless of
# `num_words`. With `num_words` set, `texts_to_sequences` only emits ids below
# `num_words` (rarer words are mapped to the OOV id), so sizing the model from
# the full `word_index` would inflate the embedding table, the softmax layer
# and the one-hot targets with ids that never occur. Use the capped size,
# falling back to the real vocabulary when it is smaller than the cap.
vocab_size = min(MAX_VOCAB_WORDS, len(tokenizer.word_index) + 1)
print(f"Vocabulary size: {vocab_size}")


In [None]:

# Build (context window, next word) training pairs from the cleaned texts
print(" Creating training sequences...")
seq_length = 5  # number of preceding words used to predict the next one
X_train, y_train = create_sequences_and_targets(
    texts=train_texts,
    tokenizer=tokenizer,
    seq_length=seq_length,
)

print(f"Training sequences shape: {X_train.shape}")
print(f"Training targets shape: {y_train.shape}")


In [None]:

# One-hot encode the integer targets, one class per vocabulary id.
# NOTE(review): this builds a dense array with one row per target and
# `vocab_size` columns, which can exhaust memory for large corpora. Consider
# integer targets with a sparse categorical loss instead -- TODO confirm.
n_classes = vocab_size
y_train_categorical = to_categorical(y_train, num_classes=n_classes)


In [None]:

# Build LSTM model: embedding -> two stacked LSTM layers -> softmax over the vocabulary
print(" Building LSTM model...")
EMBEDDING_DIM = 100
LSTM_UNITS = 150
DROPOUT_RATE = 0.2

model = Sequential()
model.add(Embedding(vocab_size, EMBEDDING_DIM, input_length=seq_length))
# The first LSTM returns its full output sequence so the second LSTM can consume it
model.add(LSTM(LSTM_UNITS, dropout=DROPOUT_RATE, recurrent_dropout=DROPOUT_RATE, return_sequences=True))
model.add(LSTM(LSTM_UNITS, dropout=DROPOUT_RATE, recurrent_dropout=DROPOUT_RATE))
model.add(Dense(vocab_size, activation='softmax'))

# Targets are one-hot vectors, hence the (non-sparse) categorical cross-entropy
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()


In [None]:

# Train the model, holding out 10% of the samples for validation
print(" Training model...")
fit_options = dict(epochs=5, batch_size=64, validation_split=0.1, verbose=1)
history = model.fit(X_train, y_train_categorical, **fit_options)

# Plot training history: loss in the left panel, accuracy in the right panel
fig, (loss_ax, accuracy_ax) = plt.subplots(1, 2, figsize=(12, 4))

loss_ax.plot(history.history['loss'], label='Training Loss')
loss_ax.plot(history.history['val_loss'], label='Validation Loss')
loss_ax.set_title('Model Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

accuracy_ax.plot(history.history['accuracy'], label='Training Accuracy')
accuracy_ax.plot(history.history['val_accuracy'], label='Validation Accuracy')
accuracy_ax.set_title('Model Accuracy')
accuracy_ax.set_xlabel('Epoch')
accuracy_ax.set_ylabel('Accuracy')
accuracy_ax.legend()

plt.show()


In [None]:

# Persist the trained model and the fitted tokenizer to the working directory
print(" Saving model and tokenizer...")
model_path = 'next_word_model.h5'
tokenizer_path = 'tokenizer.pickle'

model.save(model_path)

# The tokenizer is pickled so the exact word-to-id mapping can be restored later
with open(tokenizer_path, 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

print(" Model and tokenizer saved!")

# Prediction functions
def predict_next_word(input_text, model=model, tokenizer=tokenizer, seq_length=5, top_k=3):
    """
    Predict the most likely next words for an input text.

    The text is cleaned with ``clean_text``, tokenized, and reduced to its last
    ``seq_length`` token ids (left-padded with 0 when shorter) before being
    passed to ``model.predict``.

    Args:
        input_text: String of input text
        model: Trained Keras model; the default is whatever ``model`` referred
            to when this function was defined
        tokenizer: Fitted tokenizer; the default is bound the same way
        seq_length: Length of input sequence
        top_k: Number of top predictions to return

    Returns:
        List of (word, probability) tuples ordered from most to least likely.
        The list is empty when ``top_k`` is not positive or when the input
        yields no tokens. It may hold fewer than ``top_k`` entries, because
        predicted ids with no entry in ``tokenizer.index_word`` are skipped.
    """
    # A slice of [-0:] selects the whole array, so a non-positive top_k must
    # be rejected before slicing; otherwise top_k=0 would return every word.
    if top_k <= 0:
        return []

    # Clean and tokenize input
    tokens = tokenizer.texts_to_sequences([clean_text(input_text)])[0]
    if not tokens:
        # Nothing to condition on: an all-padding window would only produce
        # an arbitrary guess, so report "no prediction" instead.
        return []

    # Keep the most recent seq_length ids, left-padding with zeros if short
    if len(tokens) >= seq_length:
        window = tokens[-seq_length:]
    else:
        window = [0] * (seq_length - len(tokens)) + tokens

    # Batch of one sequence in, one probability vector out
    probabilities = model.predict(np.array([window]), verbose=0)[0]

    # Indices of the top_k largest probabilities, highest first
    top_indices = np.argsort(probabilities)[-top_k:][::-1]

    # Convert indices back to words, skipping ids the tokenizer cannot name
    index_word = tokenizer.index_word
    return [
        (index_word[idx], probabilities[idx])
        for idx in top_indices
        if idx in index_word
    ]

def predict_next_word_simple(input_text):
    """Return only the single most likely next word.

    Falls back to the string "<unknown>" when ``predict_next_word`` returns
    no predictions.
    """
    best = predict_next_word(input_text, top_k=1)
    return best[0][0] if best else "<unknown>"

# Smoke-test the model on a few hand-written prompts
print("\n🎯 Testing predictions...")

test_sentences = [
    "once upon a time in", "the quick brown fox", "artificial intelligence is",
    "machine learning algorithms", "deep learning models can",
]

for sentence in test_sentences:
    # Ask for the three most likely continuations of each prompt
    predictions = predict_next_word(sentence, top_k=3)

    print(f"\nInput: '{sentence}'")
    print("Top 3 predictions:")
    for i, (word, prob) in enumerate(predictions, start=1):
        print(f"  {i}. {word} (probability: {prob:.4f})")

# Interactive prediction function
def interactive_prediction():
    """Prompt for text in a loop and print the top five next-word predictions.

    The loop ends when the user types 'quit' (case-insensitive). Blank input
    is rejected with a reminder. Any exception raised while predicting or
    printing results is reported and the loop continues.
    """
    print("\n🎮 Interactive Next-Word Prediction")
    print("Enter a sentence and get next word predictions!")
    print("Type 'quit' to exit\n")

    while True:
        user_input = input("Enter text: ").strip()

        if user_input.lower() == 'quit':
            return

        if user_input:
            try:
                predictions = predict_next_word(user_input, top_k=5)
                if not predictions:
                    print("No predictions available for this input.")
                else:
                    print(f"\nNext word predictions for: '{user_input}'")
                    for i, (word, prob) in enumerate(predictions, start=1):
                        print(f"  {i}. {word} (probability: {prob:.4f})")
            except Exception as e:
                print(f"Error making prediction: {e}")

            # Blank line between rounds
            print()
        else:
            print("Please enter some text!")

# Function to load saved model and tokenizer
def load_saved_model():
    """Load the model and tokenizer from their saved files.

    Reads 'next_word_model.h5' with ``load_model`` and unpickles
    'tokenizer.pickle'.

    Returns:
        ``(model, tokenizer)`` on success. If anything raises, the error is
        printed and ``(None, None)`` is returned.
    """
    result = (None, None)
    try:
        loaded_model = load_model('next_word_model.h5')

        # NOTE: unpickling can execute arbitrary code; only load trusted files
        with open('tokenizer.pickle', 'rb') as tokenizer_file:
            loaded_tokenizer = pickle.load(tokenizer_file)

        print(" Model and tokenizer loaded successfully!")
        result = (loaded_model, loaded_tokenizer)
    except Exception as e:
        print(f" Error loading model: {e}")

    return result


In [None]:

# Display final results
banner = "=" * 60
print("\n" + banner)
print(" NEXT-WORD PREDICTION MODEL READY!")
print(banner)

print("\nAvailable functions:")
for description in (
    "• predict_next_word(text, top_k=3) - Get top k predictions",
    "• predict_next_word_simple(text) - Get single best prediction",
    "• interactive_prediction() - Interactive testing",
    "• load_saved_model() - Load saved model",
):
    print(description)

print(f"\nModel Info:")
print(f"• Vocabulary size: {vocab_size:,}")
print(f"• Sequence length: {seq_length}")
print(f"• Training examples: {len(X_train):,}")
print(f"• Model parameters: {model.count_params():,}")

print("\n Try running: interactive_prediction()")

# Save model and tokenizer
# NOTE(review): this writes the same two files as the earlier "Save model and
# tokenizer" step in this notebook; it is kept so running this cell still
# leaves up-to-date artifacts on disk.
print(" Saving model and tokenizer...")
model.save('next_word_model.h5')

with open('tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

print(" Model and tokenizer saved!")
