# Character Prediction with LSTM - Adapted for Project

In [16]:
# ========================
# 1. Imports
# ========================
import torch
import torch.nn as nn
import torch.nn.functional as F
import json
import os
import nltk

nltk.download('gutenberg')
from nltk.corpus import gutenberg


[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


# 1. Characters Setup from characters.txt

In [17]:
# Location of the vocabulary file. The path is relative, so it is resolved
# against the working directory the notebook/script is run from.
characters_file_path = "../../data/characters.txt"

def load_characters(file_path):
    """Load the model vocabulary from a text file.

    The file is read as a single UTF-8 string and surrounding whitespace is
    stripped. Duplicate characters are dropped (the first occurrence wins)
    and a space is appended when the vocabulary does not already contain one.

    Uniqueness matters because callers derive index lookup tables from the
    position of each character in the returned string; a repeated character
    would make the string longer than the number of distinct indices.

    Args:
        file_path: Path of the characters file.

    Returns:
        The vocabulary as a string of unique characters that always contains
        a space. If the file does not exist, a built-in default character
        set is returned instead.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            raw = f.read().strip()
    except FileNotFoundError:
        print(f"Warning: Characters file not found at {file_path}. Using default character set.")
        # Default character set if file not found
        return "ABCDEFGHIJKLMNOPQRSTUVWXYZ123456789_ "

    # dict.fromkeys keeps insertion order, so this de-duplicates while
    # preserving the order in which characters first appear in the file.
    chars = "".join(dict.fromkeys(raw))
    if " " not in chars:
        chars += " "
    print(f"Loaded {len(chars)} characters from {file_path} (including added space)")
    return chars

# Build the vocabulary string and the two lookup tables derived from it.
all_chars = load_characters(characters_file_path)
# Character -> integer index (the character's position in all_chars).
char2idx = dict(zip(all_chars, range(len(all_chars))))
# Integer index -> character (inverse of char2idx).
idx2char = {position: symbol for symbol, position in char2idx.items()}
vocab_size = len(all_chars)

print("Vocabulary size:", vocab_size)
print("Characters in vocabulary:", all_chars)


Loaded 37 characters from ../../data/characters.txt (including added space)
Vocabulary size: 37
Characters in vocabulary: ABCDEFGHIJKLMNOPQRSTUVWXYZ123456789_ 


# 2. Data Cleaning Functions

In [18]:

def clean_text(text, allowed_chars):
    """Upper-case ``text`` and keep only characters found in ``allowed_chars``.

    Characters outside the allowed set are dropped rather than replaced, so
    the text on either side of a removed character (a newline, for example,
    when newline is not allowed) ends up adjacent in the result.

    Args:
        text: The string to clean.
        allowed_chars: Container supporting ``in`` tests for single
            characters (e.g. a set or a string).

    Returns:
        The upper-cased string with every disallowed character removed.
    """
    uppercased = text.upper()
    return "".join(filter(lambda symbol: symbol in allowed_chars, uppercased))


# 3. Model Definition

In [19]:
class CharPredictor(nn.Module):
    """Next-character classifier: embedding -> single-layer LSTM -> linear.

    Args:
        vocab_size: Number of distinct character indices; also the size of
            the output layer.
        embed_dim: Width of the character embedding.
        hidden_dim: Width of the LSTM hidden state.
    """

    def __init__(self, vocab_size, embed_dim=64, hidden_dim=128):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.lstm = nn.LSTM(input_size=embed_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x):
        """Return one score per vocabulary entry for the character after ``x``.

        ``x`` holds character indices; callers in this file pass a
        (batch, seq_len) integer tensor. Only the LSTM's final hidden state
        is used, so the result has one row per input sequence.
        """
        embedded = self.embedding(x)
        _, lstm_state = self.lstm(embedded)
        # lstm_state is (h_n, c_n); drop the leading num_layers axis of h_n.
        final_hidden = lstm_state[0].squeeze(0)
        return self.fc(final_hidden)


# 4. Training Function

In [20]:
def train_model(model, dataset, epochs=5, seq_len=10, batch_size=64):
    """Train ``model`` to predict the character following each text window.

    Every position in ``dataset`` that has ``seq_len`` characters followed by
    a target character yields one training example, including the final such
    position. Examples are taken in text order (no shuffling) and grouped
    into batches of up to ``batch_size``. Windows that contain a character
    missing from the module-level ``char2idx`` table are skipped.

    Args:
        model: Module called as ``model(inputs)`` with a (batch, seq_len)
            tensor of character indices; its output is fed to
            ``nn.CrossEntropyLoss`` together with the target indices.
        dataset: Sequence of characters (typically one long string).
        epochs: Number of full passes over ``dataset``.
        seq_len: Number of context characters per example.
        batch_size: Maximum number of examples per optimizer step.

    Returns:
        None. The summed batch loss is printed after every epoch.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    loss_fn = nn.CrossEntropyLoss()

    # Encode the text once instead of re-looking-up every character for every
    # window in every epoch. None marks a character outside the vocabulary.
    encoded = [char2idx.get(ch) for ch in dataset]
    # Start positions for which both the window and its target exist.
    num_windows = len(encoded) - seq_len

    for epoch in range(epochs):
        total_loss = 0.0
        model.train()

        for batch_start in range(0, num_windows, batch_size):
            batch_end = min(batch_start + batch_size, num_windows)
            inputs = []
            targets = []

            for start in range(batch_start, batch_end):
                # seq_len context indices followed by the target index.
                window = encoded[start: start + seq_len + 1]

                # Skip sequences with characters not in our vocabulary
                if None in window:
                    continue
                inputs.append(window[:-1])
                targets.append(window[-1])

            if not inputs:
                continue

            inputs = torch.tensor(inputs)
            targets = torch.tensor(targets)

            outputs = model(inputs)
            loss = loss_fn(outputs, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss:.4f}")


# 5. Prediction Function

In [21]:
def predict_next_chars(model, sentence, top_k=None):
    """
    Predict the next character probabilities given a prefix sentence.

    The sentence is cleaned with ``clean_text`` using the module vocabulary
    (the same cleaning applied to the training text) before being encoded
    and passed to the model as a single-sequence batch.

    Args:
        model: The trained CharPredictor model. It is switched to eval mode.
        sentence: The prefix sentence to predict from.
        top_k: If specified, return only the top k predictions (capped at the
            vocabulary size). If None, return all.

    Returns:
        Dictionary mapping characters to their probabilities, rounded to four
        decimal places. If the cleaned sentence is empty, a warning is printed
        and a uniform distribution (unrounded ``1 / vocab_size`` per
        character) is returned; with ``top_k`` set it is limited to the first
        ``top_k`` vocabulary characters.
    """
    # Clean the input sentence
    sentence = clean_text(sentence, set(all_chars))

    model.eval()

    # Convert to indices, skipping unknown characters
    input_seq = [char2idx[ch] for ch in sentence if ch in char2idx]
    if not input_seq:
        print("Warning: Input sentence contains no known characters. Returning uniform distribution.")
        fallback_chars = all_chars
        if top_k is not None:
            # All characters tie, so honour top_k by taking the first ones.
            fallback_chars = all_chars[:max(0, min(top_k, vocab_size))]
        return {ch: 1.0 / vocab_size for ch in fallback_chars}

    with torch.no_grad():
        # unsqueeze(0) adds the batch axis the model is called with elsewhere.
        output = model(torch.tensor(input_seq).unsqueeze(0))
        probs = F.softmax(output, dim=-1).squeeze(0)

    if top_k is not None:
        top_k = min(top_k, vocab_size)  # Ensure top_k doesn't exceed vocab size
        top_probs, top_indices = torch.topk(probs, top_k)
        return {
            idx2char[idx]: round(prob, 4)
            for prob, idx in zip(top_probs.tolist(), top_indices.tolist())
        }

    # Return all probabilities
    return {idx2char[idx]: round(prob, 4) for idx, prob in enumerate(probs.tolist())}


# 6. Save and Load Model

In [22]:
def save_model(model, path="../../model/api/char_predictor.pth"):
    """Save ``model``'s state dict to ``path``.

    Missing parent directories are created first. A bare filename (no
    directory component) is saved into the current working directory.

    Args:
        model: Object exposing ``state_dict()``.
        path: Destination file path.
    """
    parent_dir = os.path.dirname(path)
    # dirname is "" for a bare filename, and os.makedirs("") raises
    # FileNotFoundError, so only create directories when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    torch.save(model.state_dict(), path)
    print(f"Model saved to {path}.")

def load_model(path="../../model/api/char_predictor.pth"):
    """Build a CharPredictor and try to restore its weights from ``path``.

    Loading is best-effort: if anything goes wrong while reading or applying
    the checkpoint, the error is printed and the freshly initialised
    (untrained) model is returned instead.

    Args:
        path: Checkpoint file written by ``save_model``.

    Returns:
        A CharPredictor sized for the module-level ``vocab_size``. It is put
        in eval mode only when the weights were loaded successfully.
    """
    model = CharPredictor(vocab_size)
    try:
        # NOTE(review): torch.load unpickles the file; only load checkpoints
        # from trusted sources.
        state = torch.load(path, map_location=torch.device('cpu'))
        model.load_state_dict(state)
        model.eval()
        print(f"Model loaded from {path}.")
    except Exception as err:
        print(f"Error loading model: {err}")
        print("Initializing new model.")
    return model


# 7. Train the Model

In [None]:
mode = "train"  # Change to "predict" when needed

if mode == "train":
    model = CharPredictor(vocab_size)

    # Gutenberg corpus files used as training text
    book_ids = [
        'austen-emma.txt',
        'bible-kjv.txt',
#        'blake-poems.txt',
#        'melville-moby_dick.txt',
#        'shakespeare-macbeth.txt',
    ]

    # Concatenate the raw books, then restrict the text to our vocabulary
    raw_text = "".join(map(gutenberg.raw, book_ids))
    text = clean_text(raw_text, set(all_chars))

    print(f"Training dataset size: {len(text)} characters.")
    print(f"Sample of cleaned text: {text[:100]}...")

    # Fit the model and persist its weights
    train_model(model, text, epochs=10)
    save_model(model)

elif mode == "predict":
    model = load_model()

    # Prefix whose next character we want to predict
    test_sentence = "A DARK HAND HEL"

    # No top_k: ask for the probability of every vocabulary character
    result = predict_next_chars(model, test_sentence)

    # Show the distribution, most probable character first
    print("\nPrediction Probabilities for all characters:")
    ranked = sorted(result.items(), key=lambda pair: pair[1], reverse=True)
    sorted_result = dict(ranked)
    print(json.dumps(sorted_result, indent=2))

    # Append the single most likely character to the prefix
    best_char = max(result, key=result.get)
    completed_sentence = test_sentence + best_char
    print("\nMost likely next character:", best_char)
    print("Completed sentence:", completed_sentence)

    # Sanity check: one probability per vocabulary character
    output_count = len(result)
    print(f"\nNumber of probability outputs: {output_count}")
    if output_count != vocab_size:
        print(f"✗ Output size ({output_count}) does not match vocabulary size ({vocab_size})")
    else:
        print("✓ Output matches vocabulary size")


Training dataset size: 4903353 characters.
Sample of cleaned text: EMMA BY JANE AUSTEN 1816VOLUME ICHAPTER IEMMA WOODHOUSE HANDSOME CLEVER AND RICH WITH A COMFORTABLE ...


# 8. Example Usage in External Code

In [None]:
"""
# Example of how to use this model in external code:

from NLP_model_adapted import load_model, predict_next_chars, all_chars

# Load the model
model = load_model("path/to/model.pth")

# Get prediction for a prefix
prefix = "THE QUICK"
predictions = predict_next_chars(model, prefix)

# Print top 3 predictions
top_3 = sorted(predictions.items(), key=lambda x: x[1], reverse=True)[:3]
print(f"Top 3 predictions for '{prefix}':")
for char, prob in top_3:
    print(f"  {char}: {prob:.4f}")
"""