# Character Prediction with LSTM - Adapted for Project

In [1]:
# Imports used directly by the cells below.
import torch.nn.functional as F
import json
import nltk
# NOTE(review): later cells use `torch`, `nn`, `os`, `load_characters`,
# `CharPredictor` and `load_nlp_model` without importing them here; presumably
# they are provided by these wildcard imports — confirm against the bundle modules.
from bundle.DataCraft import * 
from bundle.ApiCraft  import * 

# Download the NLTK 'gutenberg' package, then import its corpus reader.
nltk.download('gutenberg')
from nltk.corpus import gutenberg

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


# 1. Characters Setup from characters.txt

In [2]:
# Build the vocabulary lookups from the project's character list
all_chars = load_characters()
vocab_size = len(all_chars)

# Forward map (character -> index) and its inverse (index -> character)
char2idx = dict(zip(all_chars, range(vocab_size)))
idx2char = {index: char for char, index in char2idx.items()}

print("Vocabulary size:", vocab_size)
print("Characters in vocabulary:", all_chars)

Loaded 37 characters from ../../data/characters.txt (including added space)
Vocabulary size: 37
Characters in vocabulary: ABCDEFGHIJKLMNOPQRSTUVWXYZ123456789_ 


# 2. Data Cleaning Functions

In [3]:
def clean_text(text, allowed_chars):
    """Upper-case ``text`` and drop every character not in ``allowed_chars``.

    Args:
        text: Input string.
        allowed_chars: Container of permitted characters; membership is
            tested with ``in`` after the text has been upper-cased.

    Returns:
        The upper-cased text with all disallowed characters removed.
    """
    uppercased = text.upper()
    kept = filter(lambda ch: ch in allowed_chars, uppercased)
    return "".join(kept)

# 3. Training Function

In [4]:
def train_model(model, dataset, epochs=5, seq_len=10, batch_size=64):
    """Train ``model`` to predict the character that follows each window.

    Every run of ``seq_len`` consecutive characters in ``dataset`` is one input
    and the character immediately after it is the target. Windows that contain
    a character missing from the global ``char2idx`` mapping are skipped.

    Args:
        model: Module called with a ``(batch, seq_len)`` integer tensor of
            character indices; its output is passed to ``nn.CrossEntropyLoss``
            together with a ``(batch,)`` tensor of target indices.
        dataset: Sequence of characters (for example one long cleaned string).
        epochs: Number of full passes over ``dataset``.
        seq_len: Number of context characters per training sample.
        batch_size: Number of consecutive windows grouped into one optimizer step.

    Returns:
        None. Prints the accumulated loss after every epoch.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    loss_fn = nn.CrossEntropyLoss()

    # Build batches on the device the model lives on. Adam above has already
    # rejected a model without parameters, so next() cannot hit StopIteration.
    device = next(iter(model.parameters())).device

    # Encode the dataset once instead of re-mapping every character for each
    # window in each epoch; None marks a character outside the vocabulary.
    encoded = [char2idx.get(ch) for ch in dataset]

    # Window `start` reads encoded[start:start + seq_len] and predicts
    # encoded[start + seq_len], so valid starts are 0 .. len - seq_len - 1.
    # (The previous bounds stopped one short and never used the last character.)
    num_windows = max(len(encoded) - seq_len, 0)

    for epoch in range(epochs):
        total_loss = 0.0
        model.train()

        for batch_start in range(0, num_windows, batch_size):
            batch_end = min(batch_start + batch_size, num_windows)
            inputs = []
            targets = []

            for start in range(batch_start, batch_end):
                window = encoded[start:start + seq_len + 1]
                # Skip windows touching an out-of-vocabulary character
                if None in window:
                    continue
                inputs.append(window[:-1])
                targets.append(window[-1])

            if not inputs:
                continue

            inputs = torch.tensor(inputs, device=device)
            targets = torch.tensor(targets, device=device)

            outputs = model(inputs)
            loss = loss_fn(outputs, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # total_loss is the sum of the per-batch losses for this epoch
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss:.4f}")


# 4. Prediction Function

In [5]:
def predict_next_chars(model, sentence, top_k=None):
    """Return next-character probabilities for ``sentence``.

    The sentence is cleaned with ``clean_text`` against the notebook's
    vocabulary, encoded through ``char2idx`` and fed to ``model`` as a single
    batch; the softmax of the model output gives the probabilities.

    Args:
        model: Trained model; switched to eval mode by this call.
        sentence: Context text to continue.
        top_k: If given, only the ``top_k`` most probable characters are
            returned (capped at ``vocab_size``); otherwise all of them.

    Returns:
        Dict mapping character -> probability rounded to 4 decimals. If the
        cleaned sentence is empty, a warning is printed and an unrounded
        uniform distribution over ``all_chars`` is returned instead.
    """
    # Normalise the sentence to the notebook's vocabulary
    sentence = clean_text(sentence, set(all_chars))

    model.eval()

    # Encode the known characters as vocabulary indices
    encoded = [char2idx[ch] for ch in sentence if ch in char2idx]

    # Nothing usable to condition on: fall back to a uniform distribution
    if len(encoded) == 0:
        print("Warning: Input sentence contains no known characters. Using empty sequence.")
        return {ch: 1.0/vocab_size for ch in all_chars}

    with torch.no_grad():
        batch = torch.tensor(encoded).unsqueeze(0)
        probs = F.softmax(model(batch), dim=-1).squeeze(0)

        # No limit requested: report every character's probability
        if top_k is None:
            return {
                idx2char[position]: round(prob.item(), 4)
                for position, prob in enumerate(probs)
            }

        # Never ask topk for more entries than the vocabulary holds
        limit = min(top_k, vocab_size)
        best_probs, best_indices = torch.topk(probs, limit)
        return {
            idx2char[index.item()]: round(prob.item(), 4)
            for prob, index in zip(best_probs, best_indices)
        }


# 5. Save Model

In [6]:
def save_model(model, path="../../model/api/char_predictor.pth"):
    """Save ``model``'s state dict to ``path``, creating parent folders as needed.

    Args:
        model: Object exposing ``state_dict()`` (for example a torch module).
        path: Destination file. May be a bare filename, in which case the
            file is written to the current working directory.
    """
    # os.path.dirname() returns "" for a bare filename and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when there is one.
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)
    torch.save(model.state_dict(), path)
    print(f"Model saved to {path}.")

# 6. Train the Model or Run a Prediction

In [7]:
# Selects what this cell does:
#   "train"   - build a fresh model, train it on the Gutenberg books, save it
#   "predict" - load the saved model and predict the next character
mode = "predict"  # Change to "train" to retrain and save the model

if mode == "train":
    model = CharPredictor(vocab_size)

    # Load dataset
    book_ids = [
        'austen-emma.txt',
        'bible-kjv.txt',
#        'blake-poems.txt',
#        'melville-moby_dick.txt',
#        'shakespeare-macbeth.txt',
    ]

    # Merge all books into one string
    text = "".join(gutenberg.raw(book_id) for book_id in book_ids)

    # Clean the text according to our requirements
    text = clean_text(text, set(all_chars))

    print(f"Training dataset size: {len(text)} characters.")
    print(f"Sample of cleaned text: {text[:100]}...")

    # Train the model
    train_model(model, text, epochs=10)

    # Save the model
    save_model(model)

elif mode == "predict":
    model = load_nlp_model(vocab_size)

    # Provide test input
    test_sentence = "A DARK HAND HEL"

    # Get all probabilities
    result = predict_next_chars(model, test_sentence)

    # Display result
    print("\nPrediction Probabilities for all characters:")
    # Sort by probability (descending)
    sorted_result = dict(sorted(result.items(), key=lambda item: item[1], reverse=True))
    print(json.dumps(sorted_result, indent=2))

    # Show most likely completion
    best_char = max(result, key=result.get)
    completed_sentence = test_sentence + best_char
    print("\nMost likely next character:", best_char)
    print("Completed sentence:", completed_sentence)

    # Verify we have exactly the right number of probabilities
    print(f"\nNumber of probability outputs: {len(result)}")
    if len(result) == vocab_size:
        print("✓ Output matches vocabulary size")
    else:
        print(f"✗ Output size ({len(result)}) does not match vocabulary size ({vocab_size})")

else:
    # A mistyped mode used to fall through silently and do nothing
    raise ValueError(f"Unknown mode {mode!r}; expected 'train' or 'predict'.")


Model loaded from ../../model/api/char_predictor.pth.

Prediction Probabilities for all characters:
{
  "D": 0.4425,
  "L": 0.235,
  "P": 0.1832,
  "I": 0.0519,
  "O": 0.0347,
  "A": 0.0222,
  "V": 0.0083,
  "E": 0.0054,
  "U": 0.0053,
  " ": 0.0041,
  "H": 0.002,
  "B": 0.0007,
  "S": 0.0007,
  "F": 0.0006,
  "K": 0.0006,
  "W": 0.0005,
  "T": 0.0004,
  "M": 0.0003,
  "N": 0.0003,
  "Y": 0.0003,
  "C": 0.0001,
  "R": 0.0001,
  "1": 0.0001,
  "2": 0.0001,
  "4": 0.0001,
  "5": 0.0001,
  "9": 0.0001,
  "G": 0.0,
  "J": 0.0,
  "Q": 0.0,
  "X": 0.0,
  "Z": 0.0,
  "3": 0.0,
  "6": 0.0,
  "7": 0.0,
  "8": 0.0,
  "_": 0.0
}

Most likely next character: D
Completed sentence: A DARK HAND HELD

Number of probability outputs: 37
✓ Output matches vocabulary size
