# NanoGPT Model Evaluation

This notebook loads a trained NanoGPT model and evaluates it on train.bin and val.bin datasets using multiple metrics:
- **Perplexity**: Measures how well the model predicts the next token
- **BLEU Score**: Measures similarity between generated and reference text
- **ROUGE Scores**: Measures overlap of n-grams between generated and reference text

## Usage
1. Configure the paths and parameters in the configuration cell
2. Run all cells to perform the evaluation
3. View results in the final summary

## 1. Import Required Libraries

In [1]:
import os
import logging
import time
import traceback
from pathlib import Path
from typing import List, Tuple, Dict, Any

import numpy as np
import torch
import sys
import torch.nn.functional as F

# Configure logging to both file and console
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Create formatters
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S')

file_handler = logging.FileHandler('nanogpt_evaluation.log', mode='a')
file_handler.setLevel(logging.INFO)
file_handler.setFormatter(formatter)

console_handler = logging.StreamHandler(sys.stdout)
console_handler.setLevel(logging.INFO)
console_handler.setFormatter(formatter)

logger.addHandler(file_handler)
logger.addHandler(console_handler)


# Add the required paths for importing
current_dir = Path.cwd()
sys.path.append(str(current_dir / "baselines/nanogpt/shakespeare-char/models"))
sys.path.append(str(current_dir / "notebooks"))

print(f"Current directory: {current_dir}")

Current directory: c:\Users\hayk_\OneDrive\Desktop\05_LMU_Masters\04_applied_dl\adl-bnn-textgen\notebooks


In [2]:
import evaluate

## 2. Configuration

Set your model paths and evaluation parameters here:

In [34]:
# Configuration
CONFIG = {
    # 'data_dir': "/Users/sofianikolenko/Downloads",
    # 'model_path': parent_dir / 'checkpoints/baseline/models/baseline_model_2k.pt',
    # 'meta_path': parent_dir / 'checkpoints/baseline/models/meta.pkl',
    # 'data_dir': 'nanoGPT/data/shakespeare',
    # 'model_path': '../checkpoints/baseline_token_level_nano/token_level_1500_iter.pt',
    # 'meta_path': '../checkpoints/baseline_nanogpt/nanogpt_meta.pkl',
    # 'data_dir': "/Users/sofianikolenko/Downloads",
    # 'model_path': parent_dir / 'baselines/nanogpt/shakespeare-char/models/baseline_model_2k.pt',
    # 'meta_path': parent_dir / 'baselines/nanogpt/shakespeare-char/models/meta.pkl',
    'data_dir': 'nanoGPT/data/shakespeare_char',
    'model_path': '../checkpoints/baseline_nanogpt/baseline_nanogpt.pt',
    'meta_path': '../checkpoints/baseline_nanogpt/meta.pkl',

    'batch_size': 32,
    'max_eval_samples': 100,
    'device': 'auto',  # 'auto', 'cpu', or 'cuda'
    'splits': ['val', 'train'],  # Dataset splits to evaluate
    'num_text_samples': 10,  # Number of text samples for BLEU/ROUGE
    'prompt_length': 20,  # Length of prompt for text generation
    'generation_length': 30,  # Length of generated text,
    "max_tokens": None # for fast debug, None = all
}

print("Configuration:")
for key, value in CONFIG.items():
    print(f"  {key}: {value}")

# Check if paths exist
for path_key in ['data_dir', 'model_path', 'meta_path']:
    path = Path(CONFIG[path_key])
    if not path.exists():
        raise FileNotFoundError(f"Required path not found: {path}")

logger.debug(f"Configuration: {CONFIG}")

Configuration:
  data_dir: nanoGPT/data/shakespeare_char
  model_path: ../checkpoints/baseline_nanogpt/baseline_nanogpt.pt
  meta_path: ../checkpoints/baseline_nanogpt/meta.pkl
  batch_size: 32
  max_eval_samples: 100
  device: auto
  splits: ['val', 'train']
  num_text_samples: 10
  prompt_length: 20
  generation_length: 30
  max_tokens: None


<!-- ## 3. Alternative Utility Functions

These functions provide fallback implementations if the utils module is not available: -->

In [35]:
import sys
import os
from pathlib import Path

# --- Project Root Resolution ---
# Notebook lives in <repo>/notebooks; project root is parent of current working directory.
ROOT_DIR = Path.cwd().parent
if not (ROOT_DIR / 'baselines' / 'nanogpt' / 'model.py').exists():
    # Fallback: walk up until we find baselines/nanogpt/model.py
    for p in Path.cwd().parents:
        if (p / 'baselines' / 'nanogpt' / 'model.py').exists():
            ROOT_DIR = p
            break

# Set BASE_DIR so config.py uses the proper root (config.py defaults to cwd if BASE_DIR unset)
os.environ['BASE_DIR'] = str(ROOT_DIR)
os.environ['MODEL_PATH'] = CONFIG['model_path']
os.environ['META_PATH'] = CONFIG['meta_path']
os.environ['DATA_DIR'] = CONFIG['data_dir']

# Ensure src and baselines are on sys.path for imports
src_path = ROOT_DIR / 'src'
baselines_path = ROOT_DIR / 'baselines'
for p in (src_path, baselines_path):
    sp = str(p)
    if sp not in sys.path:
        sys.path.append(sp)

print(f"Resolved ROOT_DIR: {ROOT_DIR}")
print(f"sys.path contains src? {'src' in sp.lower() if 'sp' in locals() else 'N/A'}")
print(f"MODEL_PATH env: {os.environ['MODEL_PATH']}")
print(f"META_PATH env: {os.environ['META_PATH']}")
print(f"DATA_DIR env: {os.environ['DATA_DIR']}")

try:
    from nanogpt_utils import load_model, load_tokenizer, encode, decode
    print("Imported nanogpt_utils successfully.")
except ModuleNotFoundError as e:
    print("Failed to import nanogpt_utils. sys.path:")
    for p in sys.path:
        if 'baselines' in p or 'src' in p:
            print('  ', p)
    raise


Resolved ROOT_DIR: c:\Users\hayk_\OneDrive\Desktop\05_LMU_Masters\04_applied_dl\adl-bnn-textgen
sys.path contains src? False
MODEL_PATH env: ../checkpoints/baseline_nanogpt/baseline_nanogpt.pt
META_PATH env: ../checkpoints/baseline_nanogpt/meta.pkl
DATA_DIR env: nanoGPT/data/shakespeare_char
Imported nanogpt_utils successfully.


## 4. NanoGPT Evaluator Class

This class handles model loading and evaluation with multiple metrics:

In [None]:
# Replaced inline class definition with import from reusable module
from evaluation.nanogpt_evaluator import NanoGPTEvaluator, evaluate_splits
print("Imported NanoGPTEvaluator from evaluation.nanogpt_evaluator")

NanoGPTEvaluator class defined


## 5. Initialize the Evaluator

Load the model and initialize the evaluator:

In [44]:
# Initialize evaluator
print("Initializing NanoGPT Evaluator...")
print("=" * 50)
print(f"Model: {CONFIG['model_path']}")
print(f"Data: {CONFIG['data_dir']} | {os.listdir(CONFIG['data_dir'])}")
print(f"Meta: {CONFIG['meta_path']}")
print(f"Batch size: {CONFIG['batch_size']}")
print(f"Max eval samples: {CONFIG['max_eval_samples']}")
print(f"Splits: {CONFIG['splits']}")

try:
    evaluator = NanoGPTEvaluator(
        CONFIG['model_path'], 
        CONFIG['meta_path'], 
        CONFIG['device'],
    )
    print("\nEvaluator initialized successfully!")
except Exception as e:
    print(f"Error initializing evaluator: {e}")
    evaluator = None

Initializing NanoGPT Evaluator...
Model: ../checkpoints/baseline_nanogpt/baseline_nanogpt.pt
Data: nanoGPT/data/shakespeare_char | ['input.txt', 'meta.pkl', 'prepare.py', 'readme.md', 'train.bin', 'val.bin']
Meta: ../checkpoints/baseline_nanogpt/meta.pkl
Batch size: 32
Max eval samples: 100
Splits: ['val', 'train']
Using device: cpu


Model arguments: {'n_layer': 6, 'n_head': 6, 'n_embd': 384, 'block_size': 256, 'bias': False, 'vocab_size': 65, 'dropout': 0.2}
number of parameters: 10.65M
Model loaded successfully!
Number of parameters: 10,745,088
number of parameters: 10.65M
Model loaded successfully!
Number of parameters: 10,745,088
HuggingFace evaluation metrics loaded successfully

Evaluator initialized successfully!
HuggingFace evaluation metrics loaded successfully

Evaluator initialized successfully!


## 6. Run Evaluation

Evaluate the model on the specified dataset splits:

In [None]:
# Run evaluation concisely using the imported module
if 'evaluator' in globals() and evaluator is not None:
    all_results = evaluate_splits(evaluator, CONFIG)
else:
    print("Cannot run evaluation - evaluator not initialized")
    all_results = {}


Evaluating val split...

Evaluating on val set
Loaded val data: 111,540 tokens

1. Calculating Perplexity...
Calculating perplexity with 32 batch size...
Computing perplexity for 96 text samples...


  0%|          | 0/6 [00:00<?, ?it/s]

Perplexity: 139.7433
Perplexity: 139.7433 (took 35.06s)

2. Generating samples for BLEU/ROUGE evaluation...
Generating 5 samples for BLEU/ROUGE evaluation...
Generating 5 text samples...
Successfully generated 5 sample pairs

3. Calculating BLEU score...
  BLEU details: {'bleu': 0.1466420090326979, 'precisions': [0.5369127516778524, 0.20833333333333334, 0.07913669064748201, 0.05223880597014925], 'brevity_penalty': 1.0, 'length_ratio': 1.0067567567567568, 'translation_length': 149, 'reference_length': 148}
BLEU Score: 0.1466

4. Calculating ROUGE scores...
Successfully generated 5 sample pairs

3. Calculating BLEU score...
  BLEU details: {'bleu': 0.1466420090326979, 'precisions': [0.5369127516778524, 0.20833333333333334, 0.07913669064748201, 0.05223880597014925], 'brevity_penalty': 1.0, 'length_ratio': 1.0067567567567568, 'translation_length': 149, 'reference_length': 148}
BLEU Score: 0.1466

4. Calculating ROUGE scores...
  ROUGE details: {'rouge1': np.float64(0.5389401909214884), 'ro

  0%|          | 0/6 [00:00<?, ?it/s]

Perplexity: 182.1334
Perplexity: 182.1334 (took 30.14s)

2. Generating samples for BLEU/ROUGE evaluation...
Generating 5 samples for BLEU/ROUGE evaluation...
Generating 5 text samples...
Successfully generated 5 sample pairs

3. Calculating BLEU score...
  BLEU details: {'bleu': 0.0, 'precisions': [0.5763888888888888, 0.1223021582733813, 0.022388059701492536, 0.0], 'brevity_penalty': 0.9862071167439163, 'length_ratio': 0.9863013698630136, 'translation_length': 144, 'reference_length': 146}
BLEU Score: 0.0000

4. Calculating ROUGE scores...
Successfully generated 5 sample pairs

3. Calculating BLEU score...
  BLEU details: {'bleu': 0.0, 'precisions': [0.5763888888888888, 0.1223021582733813, 0.022388059701492536, 0.0], 'brevity_penalty': 0.9862071167439163, 'length_ratio': 0.9863013698630136, 'translation_length': 144, 'reference_length': 146}
BLEU Score: 0.0000

4. Calculating ROUGE scores...
  ROUGE details: {'rouge1': np.float64(0.572763088316935), 'rouge2': np.float64(0.1224068094304

## 7. Results Summary

Display a comprehensive summary of all evaluation results:

In [14]:
# Print comprehensive summary
print(f"\n{'='*60}")
print("EVALUATION SUMMARY")
print(f"{'='*60}")

if all_results:
    # Create a summary table
    import pandas as pd
    
    summary_data = []
    for split, results in all_results.items():
        summary_data.append({
            'Split': split.upper(),
            'Total Tokens': f"{results.get('total_tokens', 0):,}",
            'Perplexity': f"{results.get('perplexity', 0):.4f}",
            'BLEU': f"{results.get('bleu', 0):.4f}",
            'ROUGE-1': f"{results.get('rouge1', 0):.4f}",
            'ROUGE-2': f"{results.get('rouge2', 0):.4f}",
            'ROUGE-L': f"{results.get('rougeL', 0):.4f}"
        })
    
    try:
        df = pd.DataFrame(summary_data)
        print(df.to_string(index=False))
    except:
        # Fallback if pandas is not available
        for split, results in all_results.items():
            print(f"\n{split.upper()} SET:")
            print(f"  Total tokens: {results.get('total_tokens', 0):,}")
            print(f"  Perplexity:   {results.get('perplexity', 0):.4f}")
            print(f"  BLEU:         {results.get('bleu', 0):.4f}")
            print(f"  ROUGE-1:      {results.get('rouge1', 0):.4f}")
            print(f"  ROUGE-2:      {results.get('rouge2', 0):.4f}")
            print(f"  ROUGE-L:      {results.get('rougeL', 0):.4f}")
    
    print(f"\nEvaluation completed successfully!")
    
    # Store results for further analysis
    evaluation_results = all_results
    print(f"\nResults stored in 'evaluation_results' variable for further analysis")
else:
    print("No evaluation results to display")
    evaluation_results = {}


EVALUATION SUMMARY
Split Total Tokens Perplexity   BLEU ROUGE-1 ROUGE-2 ROUGE-L
  VAL      111,540     0.0000 0.0775  0.4879  0.1386  0.3138
TRAIN    1,003,854     0.0000 0.0602  0.5093  0.1217  0.3126

Evaluation completed successfully!

Results stored in 'evaluation_results' variable for further analysis


## 8. Additional Analysis (Optional)

You can use this cell for additional analysis of the results:

In [15]:
# Additional analysis cell - customize as needed

if evaluation_results:
    print("Additional Analysis:")
    print("=" * 30)
    
    # Compare train vs validation performance
    if 'train' in evaluation_results and 'val' in evaluation_results:
        train_ppl = evaluation_results['train'].get('perplexity', 0)
        val_ppl = evaluation_results['val'].get('perplexity', 0)
        
        print(f"\nPerplexity Comparison:")
        print(f"  Training:   {train_ppl:.4f}")
        print(f"  Validation: {val_ppl:.4f}")
        
        if train_ppl > 0 and val_ppl > 0:
            ratio = val_ppl / train_ppl
            print(f"  Val/Train ratio: {ratio:.4f}")
            
            if ratio > 1.5:
                print("  High validation perplexity suggests overfitting")
            elif ratio < 1.1:
                print(f"  Good generalization - low overfitting")
            else:
                print(f"  Moderate generalization gap")
    
    # Text generation quality assessment
    for split in evaluation_results:
        results = evaluation_results[split]
        bleu = results.get('bleu', 0)
        rouge1 = results.get('rouge1', 0)
        
        print(f"\nText Generation Quality ({split}):")
        if bleu > 0.3:
            print(f"  BLEU {bleu:.4f}: Good text similarity")
        elif bleu > 0.1:
            print(f"  BLEU {bleu:.4f}: Moderate text similarity")
        else:
            print(f"  BLEU {bleu:.4f}: Low text similarity")
        
        if rouge1 > 0.3:
            print(f"  ROUGE-1 {rouge1:.4f}: Good word overlap")
        elif rouge1 > 0.15:
            print(f"  ROUGE-1 {rouge1:.4f}: Moderate word overlap")
        else:
            print(f"  ROUGE-1 {rouge1:.4f}: Low word overlap")
else:
    print("No results available for analysis")

Additional Analysis:

Perplexity Comparison:
  Training:   0.0000
  Validation: 0.0000

Text Generation Quality (val):
  BLEU 0.0775: Low text similarity
  ROUGE-1 0.4879: Good word overlap

Text Generation Quality (train):
  BLEU 0.0602: Low text similarity
  ROUGE-1 0.5093: Good word overlap


## 9. Export Results (Optional)

Save the evaluation results to a file for later analysis:

In [52]:
# Export results to JSON file
import json
from datetime import datetime

if evaluation_results:
    # Add metadata
    export_data = {
        'timestamp': datetime.now().isoformat(),
        'config': CONFIG,
        'results': evaluation_results,
        'model_info': {
            'model_path': CONFIG['model_path'],
            'meta_path': CONFIG['meta_path'],
            'vocab_size': evaluator.vocab_size if evaluator else None,
            'device': evaluator.device if evaluator else None
        }
    }
    
    # Save to file
    output_file = f"evaluation_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    
    try:
        with open(output_file, 'w') as f:
            json.dump(export_data, f, indent=2)
        print(f"Results exported to: {output_file}")
    except Exception as e:
        print(f"Error exporting results: {e}")
else:
    print("No results to export")

Results exported to: evaluation_results_20250919_144940.json
