In [3]:
from bundle.DataCraft import * 
from bundle.ApiCraft  import * 
import torch.nn.functional as F

# Build the character vocabulary and the two lookup tables derived from it.
# `load_characters` is not defined in this cell (presumably it arrives via the
# star imports above — verify).
all_chars = load_characters()
vocab_size = len(all_chars)

# character -> integer index (position of the character in `all_chars`)
char2idx = dict(zip(all_chars, range(vocab_size)))
# integer index -> character (exact inverse of `char2idx`)
idx2char = dict((idx, ch) for ch, idx in char2idx.items())

print("Vocabulary size:", vocab_size)
print("Characters in vocabulary:", all_chars)


def predict_next_chars(model, sentence, top_k=None):
    """Return next-character probabilities for the text typed so far.

    Characters of ``sentence`` that are not keys of ``char2idx`` are dropped
    before the model is called.

    Args:
        model: Callable model. It is switched to eval mode and called with a
            ``(1, n)`` integer tensor holding the indices of the known
            characters. NOTE(review): the code assumes the model returns
            ``(1, vocab_size)`` scores for the next character — confirm
            against the model definition.
        sentence: Input text (any iterable of characters).
        top_k: When given, only the ``top_k`` most probable characters are
            returned (capped at ``vocab_size``). ``None`` returns one entry
            per vocabulary character.

    Returns:
        dict mapping character -> probability. Model probabilities are
        rounded to 4 decimals; in the top-k case entries are inserted in
        descending probability order. When ``sentence`` contains no known
        character, a uniform ``1 / vocab_size`` value (not rounded) is
        returned instead, limited to the first ``top_k`` vocabulary
        characters when ``top_k`` is given.
    """
    model.eval()

    # Convert to indices, skipping unknown characters
    known_indices = [char2idx[ch] for ch in sentence if ch in char2idx]

    if not known_indices:
        print("Warning: Input sentence contains no known characters. Using empty sequence.")
        # Uniform fallback. Honour `top_k` here as well so callers always get
        # at most `top_k` entries, exactly like the model-backed path.
        if top_k is None:
            limit = vocab_size
        else:
            limit = max(0, min(top_k, vocab_size))
        return {ch: 1.0 / vocab_size for ch in all_chars[:limit]}

    with torch.no_grad():
        batch = torch.tensor(known_indices).unsqueeze(0)  # add batch dimension
        probs = F.softmax(model(batch), dim=-1).squeeze(0)

        if top_k is not None:
            # Ensure top_k doesn't exceed vocab size
            k = min(top_k, vocab_size)
            top_probs, top_indices = torch.topk(probs, k)
            pairs = zip(top_indices.tolist(), top_probs.tolist())
        else:
            pairs = enumerate(probs.tolist())

    return {idx2char[idx]: round(prob, 4) for idx, prob in pairs}


# Read Sentences
# Loads every non-empty, whitespace-stripped line of the sentences file into
# `sentences`, aborts the cell/script if that fails or yields nothing, and then
# loads the model.
sentences_filepath = "../../data/sentences.txt"             # Input: Sentences file

sentences = []
print(f"Reading sentences from: {sentences_filepath}")
try:
    with open(sentences_filepath, "r") as f:
        # Strip each line once and keep only the non-empty results.
        sentences = [text for text in (line.strip() for line in f) if text]
    print(f"Read {len(sentences)} sentences.")
except FileNotFoundError:
    print(f"Error: Sentences file not found at {sentences_filepath}")
    # `raise SystemExit` instead of `exit()`: under IPython/Jupyter `exit()`
    # only requests a kernel shutdown and does not raise, so the remainder of
    # the cell would keep executing. Raising stops execution right here, both
    # in a notebook and in a plain script (exit status 1).
    raise SystemExit(1)
except Exception as e:
    print(f"Error reading sentences file: {e}")
    raise SystemExit(1)

if not sentences:
    print("No sentences found in the file. Exiting.")
    raise SystemExit(1)

# `load_nlp_model` is not defined in this cell (presumably provided by the
# star imports at the top — verify). It receives the vocabulary size.
model = load_nlp_model(vocab_size)


Loaded 37 characters from ../../data/characters.txt (including added space)
Vocabulary size: 37
Characters in vocabulary: ABCDEFGHIJKLMNOPQRSTUVWXYZ123456789_ 
Reading sentences from: ../../data/sentences.txt
Read 400 sentences.
Model loaded from ../../model/api/char_predictor.pth.


In [4]:
# ===========================
# Accuracy Evaluation
# ===========================
# For every sentence, each proper prefix is fed to the model and the character
# that follows the prefix is the label. A prediction counts as correct when the
# label is among the `top_k` most probable characters, so the figure printed
# below is a top-k accuracy.

top_k = 5  # Change this to top-1, top-5, etc.

correct = 0
total = 0

print("Evaluating model accuracy on next character prediction...")

for sentence in sentences:
    # pos runs over 1 .. len(sentence) - 1; sentence[:pos] is the input so far
    # and target_char == sentence[pos] is the correct next character.
    for pos, target_char in enumerate(sentence[1:], start=1):
        prediction = predict_next_chars(model, sentence[:pos], top_k=top_k)

        # Skip empty predictions
        if not prediction:
            continue

        # Rank candidates by probability (highest first) and keep the best top_k
        ranked = sorted(prediction.items(), key=lambda pair: pair[1], reverse=True)
        predicted_chars = [ch for ch, _ in ranked[:top_k]]

        total += 1
        if target_char in predicted_chars:
            correct += 1


if total > 0:
    accuracy = (correct / total) * 100
else:
    accuracy = 0.0
print(f"Accuracy on next-character prediction: {accuracy:.2f}%")

Evaluating model accuracy on next character prediction...
Accuracy on next-character prediction: 76.67%
