# NanoGPT Model Evaluation

This notebook loads a trained NanoGPT model and evaluates it on train.bin and val.bin datasets using multiple metrics:
- **Perplexity**: Measures how well the model predicts the next token
- **BLEU Score**: Measures similarity between generated and reference text
- **ROUGE Scores**: Measures overlap of n-grams between generated and reference text

## Usage
1. Configure the paths and parameters in the configuration cell
2. Run all cells to perform the evaluation
3. View results in the final summary

## 1. Import Required Libraries

In [1]:
import os
import logging
import time
import traceback
from pathlib import Path
from typing import List, Tuple, Dict, Any

import numpy as np
import torch
import sys
import torch.nn.functional as F

# Configure logging to both file and console
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Create formatters
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S')

file_handler = logging.FileHandler('nanogpt_evaluation.log', mode='a')
file_handler.setLevel(logging.INFO)
file_handler.setFormatter(formatter)

console_handler = logging.StreamHandler(sys.stdout)
console_handler.setLevel(logging.INFO)
console_handler.setFormatter(formatter)

logger.addHandler(file_handler)
logger.addHandler(console_handler)


# Add the required paths for importing
current_dir = Path.cwd()
# sys.path.append(str(current_dir / "baselines/nanogpt/shakespeare-char/models"))
# sys.path.append(str(current_dir / "notebooks"))

logging.info(f"Current directory: {current_dir}")

2025-10-12 02:56:06 - INFO - Current directory: c:\Users\hayk_\OneDrive\Desktop\05_LMU_Masters\04_applied_dl\adl-bnn-textgen\notebooks


In [2]:
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


## 2. Configuration

Set your model paths and evaluation parameters here:

In [36]:
# Configuration
CONFIG = {
    # 'data_dir': "/Users/sofianikolenko/Downloads",
    # 'model_path': parent_dir / 'baselines/nanogpt/shakespeare-char/models/baseline_model_2k.pt',
    # 'meta_path': parent_dir / 'baselines/nanogpt/shakespeare-char/models/meta.pkl',
    'data_dir': 'nanoGPT/data/shakespeare_char',
    'model_path': '../checkpoints/baseline_nanogpt/baseline_nanogpt.pt',
    'meta_path': '../checkpoints/baseline_nanogpt/nanogpt_meta.pkl',

    'batch_size': 16,
    'max_eval_samples': 1_000,
    'device': 'auto',  # 'auto', 'cpu', or 'cuda'
    'splits': ['val', 'train'],  # Dataset splits to evaluate
    'num_text_samples': 30,  # Number of text samples for BLEU/ROUGE
    'prompt_length': 20,  # Length of prompt for text generation
    'generation_length': 30,  # Length of generated text,
    "max_tokens": None # for fast debug, None = all
}

print("Configuration:")
for key, value in CONFIG.items():
    print(f"  {key}: {value}")

# Check if paths exist
for path_key in ['data_dir', 'model_path', 'meta_path']:
    path = Path(CONFIG[path_key])
    if not path.exists():
        raise FileNotFoundError(f"Required path not found: {path}")

logger.debug(f"Configuration: {CONFIG}")

Configuration:
  data_dir: nanoGPT/data/shakespeare_char
  model_path: ../checkpoints/baseline_nanogpt/baseline_nanogpt.pt
  meta_path: ../checkpoints/baseline_nanogpt/nanogpt_meta.pkl
  batch_size: 16
  max_eval_samples: 1000
  device: auto
  splits: ['val', 'train']
  num_text_samples: 30
  prompt_length: 20
  generation_length: 30
  max_tokens: None


<!-- ## 3. Alternative Utility Functions

These functions provide fallback implementations if the utils module is not available: -->

In [37]:
import sys
sys.path.append('../src')

from nanogpt_utils import load_model, load_tokenizer, encode, decode



## 4. NanoGPT Evaluator Class

This class handles model loading and evaluation with multiple metrics:

In [38]:
class NanoGPTEvaluator:
    """Evaluator for NanoGPT models with multiple metrics"""
    
    def __init__(self, model_path: str, meta_path: str, device: str = 'auto'):
        """
        Initialize the evaluator
        
        Args:
            model_path: Path to the model checkpoint
            meta_path: Path to the meta.pkl file containing tokenizer info
            device: Device to use ('cpu', 'cuda', or 'auto')
        """
        # Set device
        if device == 'auto':
            self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        else:
            self.device = device
        
        print(f"Using device: {self.device}")
        

        self.model, self.checkpoint = load_model(Path(model_path), self.device)
        self.stoi, self.itos = load_tokenizer(Path(meta_path))
        
        self.vocab_size = len(self.itos)
        
        # Set model to evaluation mode
        self.model.eval()
        
        # Initialize metrics if available
        self.metrics = {}
        
        # Load evaluation metrics from HuggingFace evaluate
        self.bleu_metric = evaluate.load("bleu")
        self.rouge_metric = evaluate.load("rouge")
        self.perplexity_metric = evaluate.load("perplexity", module_type="metric")
        print("HuggingFace evaluation metrics loaded successfully")
        
            
            
print("NanoGPTEvaluator class defined")

NanoGPTEvaluator class defined


In [47]:
# Add data loading methods to the evaluator
def load_data(self, data_dir: str, split: str = 'val', max_tokens: int = None) -> np.ndarray:
    """
    Load train.bin or val.bin data
    
    Args:
        data_dir: Directory containing the data files
        split: 'train' or 'val'
        max_tokens: Optional limit on number of tokens to load (returns first x tokens)
        
    Returns:
        Numpy array of token indices
    """
    filename = f"{split}.bin"
    filepath = os.path.join(data_dir, filename)
    # filepath = "/Users/sofianikolenko/Downloads/val.bin"
    
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"Data file not found: {filepath}")
    
    if max_tokens is not None:
        # Load only the first max_tokens tokens
        data = np.memmap(filepath, dtype=np.uint16, mode='r', shape=(max_tokens,))
    else:
        data = np.memmap(filepath, dtype=np.uint16, mode='r')
    print(f"Loaded {split} data: {len(data):,} tokens")
    return data

def get_batch(self, data: np.ndarray, batch_size: int, block_size: int) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Get a random batch of data for evaluation
    
    Args:
        data: Token data array
        batch_size: Number of sequences in the batch
        block_size: Length of each sequence
        
    Returns:
        Tuple of (input_tokens, target_tokens)
    """
    if len(data) <= block_size:
        # If data is smaller than block_size, just use what we have
        ix = [0] * batch_size
        max_len = len(data) - 1
        x = torch.stack([torch.from_numpy(data[0:max_len].astype(np.int64)) for _ in range(batch_size)])
        y = torch.stack([torch.from_numpy(data[1:max_len+1].astype(np.int64)) for _ in range(batch_size)])
    else:
        ix = torch.randint(len(data) - block_size, (batch_size,))
        x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix])
        y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in ix])
    
    x, y = x.to(self.device), y.to(self.device)
    return x, y

# Add methods to the class
NanoGPTEvaluator.load_data = load_data
NanoGPTEvaluator.get_batch = get_batch

print("Data loading methods added to NanoGPTEvaluator")

Data loading methods added to NanoGPTEvaluator


In [48]:
# Add text generation and metric calculation methods
def generate_samples_for_metrics(self, data: np.ndarray, num_samples: int = 50, 
                               prompt_length: int = 20, generation_length: int = 30) -> Tuple[List[str], List[str]]:
    """
    Generate text samples for BLEU/ROUGE evaluation
    
    Args:
        data: Token data array
        num_samples: Number of samples to generate
        prompt_length: Length of prompt in tokens
        generation_length: Length of generated text in tokens
        
    Returns:
        Tuple of (references, predictions)
    """
    print(f"Generating {num_samples} samples for BLEU/ROUGE evaluation...")
    
    references = []
    predictions = []
    
    # Limit samples based on data size
    max_possible_samples = max(1, (len(data) - prompt_length - generation_length) // 100)
    num_samples = min(num_samples, max_possible_samples)
    
    print(f"Generating {num_samples} text samples...")
    
    for i in range(num_samples):
        try:
            # Select a random starting position
            if len(data) > prompt_length + generation_length + 10:
                start_idx = np.random.randint(0, len(data) - prompt_length - generation_length - 10)
            else:
                start_idx = 0
            
            # Extract prompt and reference
            prompt_tokens = data[start_idx:start_idx + prompt_length].astype(np.int64)
            reference_tokens = data[start_idx + prompt_length:start_idx + prompt_length + generation_length].astype(np.int64)
            
            # Decode reference
            reference_text = decode(reference_tokens.tolist(), self.itos)
            
            # Generate prediction
            x = torch.tensor(prompt_tokens, dtype=torch.long, device=self.device)[None, ...]
            
            with torch.no_grad():
                generated_tokens = []
                for _ in range(generation_length):
                    # Crop if sequence gets too long
                    x_cond = x if x.size(1) <= self.model.config.block_size else x[:, -self.model.config.block_size:]
                    
                    # Forward pass
                    logits, _ = self.model(x_cond)
                    logits = logits[:, -1, :] / 0.8  # temperature
                    
                    # Sample next token
                    probs = F.softmax(logits, dim=-1)
                    next_token = torch.multinomial(probs, num_samples=1)
                    generated_tokens.append(next_token.item())
                    
                    # Append to sequence
                    x = torch.cat((x, next_token), dim=1)
            
            # Decode prediction
            prediction_text = decode(generated_tokens, self.itos)
            
            # Clean up texts
            reference_text = reference_text.strip()
            prediction_text = prediction_text.strip()
            
            if len(reference_text) > 0 and len(prediction_text) > 0:
                references.append(reference_text)
                predictions.append(prediction_text)
            
            if (i + 1) % 10 == 0:
                print(f"  Generated {i + 1}/{num_samples} samples")
                
        except Exception as e:
            print(f"Error generating sample {i}: {e}")
            continue
    
    print(f"Successfully generated {len(references)} sample pairs")
    return references, predictions

# Add method to the class
NanoGPTEvaluator.generate_samples_for_metrics = generate_samples_for_metrics

print("Text generation method added")

Text generation method added


In [49]:
# Add custom tokenizer method for evaluation metrics
def get_tokenizer(self):
    """
    Get a tokenizer function that matches the model's vocabulary
    
    This works for both character-level and token-level models:
    - Character-level: stoi maps each character to an index
    - Token-level: stoi maps tokens/words to indices
    
    Returns:
        Tokenizer function that splits text according to model's vocabulary
    """
    def custom_tokenizer(text):
        """Tokenizer that matches the model's vocabulary"""
        tokens = []
        for char in text:
            if char in self.stoi:
                tokens.append(char)
        return tokens
    
    return custom_tokenizer

# Add method to the class
NanoGPTEvaluator.get_tokenizer = get_tokenizer

print("Custom tokenizer method added to NanoGPTEvaluator")

Custom tokenizer method added to NanoGPTEvaluator


In [50]:
# Fix the perplexity calculation method
def calculate_perplexity(self, data: np.ndarray, batch_size: int = 16, max_batches: int = 100) -> float:
    """
    Calculate perplexity using HuggingFace evaluate library only
    
    Args:
        data: Token data array
        batch_size: Batch size for evaluation
        max_batches: Maximum number of batches to evaluate
        
    Returns:
        Perplexity value or None if calculation fails
    """
    print(f"Calculating perplexity with {batch_size} batch size...")
    
    if self.perplexity_metric is not None:
        try:
            # Prepare data for HuggingFace perplexity metric
            # Convert tokens to text for the metric
            
            # Calculate number of batches
            seq_len = 256  # Standard sequence length
            max_start = len(data) - seq_len
            
            if max_start <= 0:
                print("Dataset too small for perplexity calculation")
                return None
                
            # Limit the number of batches
            num_batches = min(max_batches, max_start // batch_size)
            
            # Collect text samples for perplexity calculation
            texts = []
            for i in range(num_batches * batch_size):
                start_idx = i * (max_start // (num_batches * batch_size))
                if start_idx + seq_len <= len(data):
                    tokens = data[start_idx:start_idx + seq_len].astype(np.int64)
                    text = decode(tokens.tolist(), self.itos)
                    if len(text.strip()) > 0:
                        texts.append(text)
            
            if len(texts) == 0:
                print("No valid texts extracted for perplexity calculation")
                return None
                
            print(f"Computing perplexity for {len(texts)} text samples...")
            
            # Use HuggingFace evaluate perplexity metric
            result = self.perplexity_metric.compute(
                predictions=texts,
                model_id="gpt2"  # Use a standard reference model
            )
            perplexity_value = result.get('mean_perplexity', None)
            if perplexity_value is not None:
                print(f"Perplexity: {perplexity_value:.4f}")
                return float(perplexity_value)
            else:
                print("Perplexity calculation returned None")
                return None
                
        except Exception as e:
            print(f"Error with evaluate library perplexity: {e} | {traceback.format_exc()}")
            print("Perplexity calculation failed - returning None")
            return None
    else:
        print("Perplexity metric not available")
        return None

# Update the method in the class
NanoGPTEvaluator.calculate_perplexity = calculate_perplexity

print("Fixed perplexity calculation method - using only HuggingFace evaluate library")

Fixed perplexity calculation method - using only HuggingFace evaluate library


In [51]:
# Add BLEU and ROUGE score calculation methods
def calculate_bleu_score(self, references: List[str], predictions: List[str]) -> float:
    """
    Calculate BLEU score using HuggingFace evaluate library with custom tokenization
    
    Args:
        references: List of reference texts
        predictions: List of predicted texts
        
    Returns:
        BLEU score (0-1 range)
    """
    if self.bleu_metric is None:
        raise Exception("BLEU metric not available")
    
    try:
        # Get tokenizer that matches the model's vocabulary
        custom_tokenizer = self.get_tokenizer()
        
        # HuggingFace BLEU expects references as list of lists
        formatted_references = [[ref] for ref in references]
        
        # Use custom tokenizer for BLEU calculation
        result = self.bleu_metric.compute(
            predictions=predictions,
            references=formatted_references,
            tokenizer=custom_tokenizer
        )
        
        bleu_score = result.get('bleu', 0.0)
        print(f"  BLEU details: {result}")
        return float(bleu_score)
        
    except Exception as e:
        raise Exception(f"Error calculating BLEU score: {e} | {traceback.format_exc()}")

def calculate_rouge_score(self, references: List[str], predictions: List[str]) -> Dict[str, float]:
    """
    Calculate ROUGE scores using HuggingFace evaluate library with custom tokenization
    
    Args:
        references: List of reference texts
        predictions: List of predicted texts
        
    Returns:
        Dictionary with ROUGE-1, ROUGE-2, and ROUGE-L scores
    """
    if self.rouge_metric is None:
        raise ValueError("ROUGE metric not available")
    
    try:
        # Get tokenizer that matches the model's vocabulary
        custom_tokenizer = self.get_tokenizer()
        
        # Use custom tokenizer for ROUGE calculation
        result = self.rouge_metric.compute(
            predictions=predictions,
            references=references,
            tokenizer=custom_tokenizer
        )
        
        # Extract F1 scores for each ROUGE variant
        rouge_scores = {
            'rouge1': float(result.get('rouge1', 0.0)),
            'rouge2': float(result.get('rouge2', 0.0)),
            'rougeL': float(result.get('rougeL', 0.0))
        }
        
        print(f"  ROUGE details: {result}")
        return rouge_scores
        
    except Exception as e:
        error_msg = f"Error calculating ROUGE scores: {e} | {traceback.format_exc()}"
        
        raise Exception(error_msg)

# Add methods to the class
NanoGPTEvaluator.calculate_bleu_score = calculate_bleu_score
NanoGPTEvaluator.calculate_rouge_score = calculate_rouge_score

print("BLEU and ROUGE score calculation methods added")

BLEU and ROUGE score calculation methods added


In [52]:
def evaluate_dataset(self, data_dir: str, split: str = 'val', batch_size: int = 16, 
                    max_eval_samples: int = 1000, num_text_samples: int = 50,
                    prompt_length: int = 20, generation_length: int = 30, max_tokens = None) -> Dict[str, Any]:
    """
    Evaluate the model on a dataset split with proper None handling
    
    Args:
        data_dir: Directory containing train.bin and val.bin
        split: 'train' or 'val'
        batch_size: Batch size for evaluation
        max_eval_samples: Maximum number of samples for evaluation
        num_text_samples: Number of text samples for BLEU/ROUGE
        prompt_length: Length of prompt for text generation
        generation_length: Length of generated text
        
    Returns:
        Dictionary of evaluation metrics
    """
    print(f"\n{'='*50}")
    print(f"Evaluating on {split} set")
    print(f"{'='*50}")
    
    # Load data
    data = self.load_data(data_dir, split, max_tokens)
    
    results = {'split': split, 'total_tokens': len(data)}
    
    # Calculate perplexity
    print("\n1. Calculating Perplexity...")
    start_time = time.time()
    perplexity = self.calculate_perplexity(data, batch_size, max_batches=min(100, max_eval_samples//batch_size))
    results['perplexity'] = perplexity if perplexity is not None else 0.0
    
    # Handle None perplexity properly
    if perplexity is not None:
        print(f"Perplexity: {perplexity:.4f} (took {time.time() - start_time:.2f}s)")
    else:
        print(f"Perplexity: Failed to calculate (took {time.time() - start_time:.2f}s)")
    
    # Generate samples and calculate BLEU/ROUGE
    if len(data) > 100:  # Only if we have enough data
        print("\n2. Generating samples for BLEU/ROUGE evaluation...")
        start_time = time.time()
        num_samples = min(num_text_samples, max_eval_samples//20, len(data)//100)  # Reasonable number of samples
        references, predictions = self.generate_samples_for_metrics(
            data, num_samples, prompt_length, generation_length
        )
        
        if references and predictions:
            print("\n3. Calculating BLEU score...")
            bleu_score = self.calculate_bleu_score(references, predictions)
            results['bleu'] = bleu_score
            print(f"BLEU Score: {bleu_score:.4f}")
            
            print("\n4. Calculating ROUGE scores...")
            rouge_scores = self.calculate_rouge_score(references, predictions)
            results.update(rouge_scores)
            print(f"ROUGE-1: {rouge_scores['rouge1']:.4f}")
            print(f"ROUGE-2: {rouge_scores['rouge2']:.4f}")
            print(f"ROUGE-L: {rouge_scores['rougeL']:.4f}")
            
            # Show some example generations
            print("\n5. Example generations:")
            for i in range(min(3, len(references))):
                print(f"\nExample {i+1}:")
                print(f"Reference: {references[i][:100]}...")
                print(f"Generated: {predictions[i][:100]}...")
        else:
            raise Exception("Could not generate samples for BLEU/ROUGE evaluation")
    else:
        raise Exception("Dataset too small for text generation evaluation")
    
    print(f"\nEvaluation completed in {time.time() - start_time:.2f}s")
    return results

# Replace the method in the class
NanoGPTEvaluator.evaluate_dataset = evaluate_dataset

print("Fixed main evaluation method - properly handles None perplexity values")

Fixed main evaluation method - properly handles None perplexity values


## 5. Initialize the Evaluator

Load the model and initialize the evaluator:

In [53]:
# Initialize evaluator
print("Initializing NanoGPT Evaluator...")
print("=" * 50)
print(f"Model: {CONFIG['model_path']}")
print(f"Data: {CONFIG['data_dir']} | {os.listdir(CONFIG['data_dir'])}")
print(f"Meta: {CONFIG['meta_path']}")
print(f"Batch size: {CONFIG['batch_size']}")
print(f"Max eval samples: {CONFIG['max_eval_samples']}")
print(f"Splits: {CONFIG['splits']}")

try:
    evaluator = NanoGPTEvaluator(
        CONFIG['model_path'], 
        CONFIG['meta_path'], 
        CONFIG['device'],
    )
    print("\nEvaluator initialized successfully!")
except Exception as e:
    print(f"Error initializing evaluator: {e}")
    evaluator = None

Initializing NanoGPT Evaluator...
Model: ../checkpoints/baseline_nanogpt/baseline_nanogpt.pt
Data: nanoGPT/data/shakespeare_char | ['input.txt', 'meta.pkl', 'prepare.py', 'readme.md', 'train.bin', 'val.bin']
Meta: ../checkpoints/baseline_nanogpt/nanogpt_meta.pkl
Batch size: 16
Max eval samples: 1000
Splits: ['val', 'train']
Using device: cpu


Model arguments: {'n_layer': 6, 'n_head': 6, 'n_embd': 384, 'block_size': 256, 'bias': False, 'vocab_size': 65, 'dropout': 0.2}
number of parameters: 10.65M
Model loaded successfully!
Number of parameters: 10,745,088
number of parameters: 10.65M
Model loaded successfully!
Number of parameters: 10,745,088
HuggingFace evaluation metrics loaded successfully

Evaluator initialized successfully!
HuggingFace evaluation metrics loaded successfully

Evaluator initialized successfully!


## 6. Run Evaluation

Evaluate the model on the specified dataset splits:

In [34]:
# Run evaluation on all specified splits
if evaluator is not None:
    all_results = {}
    
    for split in CONFIG['splits']:#[CONFIG['splits'][0]]:
        print(f"\nEvaluating {split} split...")
        try:
            results = evaluator.evaluate_dataset(
                CONFIG['data_dir'], 
                split, 
                CONFIG['batch_size'], 
                CONFIG['max_eval_samples'],
                CONFIG['num_text_samples'],
                CONFIG['prompt_length'],
                CONFIG['generation_length'],
                max_tokens=CONFIG['max_tokens']
            )
            all_results[split] = results
            print(f"{split} evaluation completed")
        except Exception as e:
            print(f"Error evaluating {split} split: {e}")
            continue
else:
    print("Cannot run evaluation - evaluator not initialized")
    all_results = {}


Evaluating val split...

Evaluating on val set
Loaded val data: 111,540 tokens

1. Calculating Perplexity...
Calculating perplexity with 16 batch size...
Computing perplexity for 992 text samples...


  6%|â–‹         | 4/62 [01:29<21:35, 22.34s/it]



KeyboardInterrupt: 

## 7. Results Summary

Display a comprehensive summary of all evaluation results:

In [14]:
# Print comprehensive summary
print(f"\n{'='*60}")
print("EVALUATION SUMMARY")
print(f"{'='*60}")

if all_results:
    # Create a summary table
    import pandas as pd
    
    summary_data = []
    for split, results in all_results.items():
        summary_data.append({
            'Split': split.upper(),
            'Total Tokens': f"{results.get('total_tokens', 0):,}",
            'Perplexity': f"{results.get('perplexity', 0):.4f}",
            'BLEU': f"{results.get('bleu', 0):.4f}",
            'ROUGE-1': f"{results.get('rouge1', 0):.4f}",
            'ROUGE-2': f"{results.get('rouge2', 0):.4f}",
            'ROUGE-L': f"{results.get('rougeL', 0):.4f}"
        })
    
    try:
        df = pd.DataFrame(summary_data)
        print(df.to_string(index=False))
    except:
        # Fallback if pandas is not available
        for split, results in all_results.items():
            print(f"\n{split.upper()} SET:")
            print(f"  Total tokens: {results.get('total_tokens', 0):,}")
            print(f"  Perplexity:   {results.get('perplexity', 0):.4f}")
            print(f"  BLEU:         {results.get('bleu', 0):.4f}")
            print(f"  ROUGE-1:      {results.get('rouge1', 0):.4f}")
            print(f"  ROUGE-2:      {results.get('rouge2', 0):.4f}")
            print(f"  ROUGE-L:      {results.get('rougeL', 0):.4f}")
    
    print(f"\nEvaluation completed successfully!")
    
    # Store results for further analysis
    evaluation_results = all_results
    print(f"\nResults stored in 'evaluation_results' variable for further analysis")
else:
    print("No evaluation results to display")
    evaluation_results = {}


EVALUATION SUMMARY
Split Total Tokens Perplexity   BLEU ROUGE-1 ROUGE-2 ROUGE-L
  VAL      111,540     0.0000 0.0775  0.4879  0.1386  0.3138
TRAIN    1,003,854     0.0000 0.0602  0.5093  0.1217  0.3126

Evaluation completed successfully!

Results stored in 'evaluation_results' variable for further analysis


## 8. Additional Analysis (Optional)

You can use this cell for additional analysis of the results:

In [15]:
# Additional analysis cell - customize as needed

if evaluation_results:
    print("Additional Analysis:")
    print("=" * 30)
    
    # Compare train vs validation performance
    if 'train' in evaluation_results and 'val' in evaluation_results:
        train_ppl = evaluation_results['train'].get('perplexity', 0)
        val_ppl = evaluation_results['val'].get('perplexity', 0)
        
        print(f"\nPerplexity Comparison:")
        print(f"  Training:   {train_ppl:.4f}")
        print(f"  Validation: {val_ppl:.4f}")
        
        if train_ppl > 0 and val_ppl > 0:
            ratio = val_ppl / train_ppl
            print(f"  Val/Train ratio: {ratio:.4f}")
            
            if ratio > 1.5:
                print("  High validation perplexity suggests overfitting")
            elif ratio < 1.1:
                print(f"  Good generalization - low overfitting")
            else:
                print(f"  Moderate generalization gap")
    
    # Text generation quality assessment
    for split in evaluation_results:
        results = evaluation_results[split]
        bleu = results.get('bleu', 0)
        rouge1 = results.get('rouge1', 0)
        
        print(f"\nText Generation Quality ({split}):")
        if bleu > 0.3:
            print(f"  BLEU {bleu:.4f}: Good text similarity")
        elif bleu > 0.1:
            print(f"  BLEU {bleu:.4f}: Moderate text similarity")
        else:
            print(f"  BLEU {bleu:.4f}: Low text similarity")
        
        if rouge1 > 0.3:
            print(f"  ROUGE-1 {rouge1:.4f}: Good word overlap")
        elif rouge1 > 0.15:
            print(f"  ROUGE-1 {rouge1:.4f}: Moderate word overlap")
        else:
            print(f"  ROUGE-1 {rouge1:.4f}: Low word overlap")
else:
    print("No results available for analysis")

Additional Analysis:

Perplexity Comparison:
  Training:   0.0000
  Validation: 0.0000

Text Generation Quality (val):
  BLEU 0.0775: Low text similarity
  ROUGE-1 0.4879: Good word overlap

Text Generation Quality (train):
  BLEU 0.0602: Low text similarity
  ROUGE-1 0.5093: Good word overlap


## 9. Export Results (Optional)

Save the evaluation results to a file for later analysis:

In [52]:
# Export results to JSON file
import json
from datetime import datetime

if evaluation_results:
    # Add metadata
    export_data = {
        'timestamp': datetime.now().isoformat(),
        'config': CONFIG,
        'results': evaluation_results,
        'model_info': {
            'model_path': CONFIG['model_path'],
            'meta_path': CONFIG['meta_path'],
            'vocab_size': evaluator.vocab_size if evaluator else None,
            'device': evaluator.device if evaluator else None
        }
    }
    
    # Save to file
    output_file = f"evaluation_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    
    try:
        with open(output_file, 'w') as f:
            json.dump(export_data, f, indent=2)
        print(f"Results exported to: {output_file}")
    except Exception as e:
        print(f"Error exporting results: {e}")
else:
    print("No results to export")

Results exported to: evaluation_results_20250919_144940.json
