# Batch Evaluation of Llama 3.2 Vision on Safaitic Inscriptions

This notebook runs systematic evaluations across multiple inscriptions to:
1. Test VLM performance at scale
2. Generate aggregate statistics
3. Identify strengths and weaknesses
4. Export results for analysis

**Note**: A full run can take a long time. Every selected image is sent to the model in a separate call, so the runtime grows with the number of inscriptions and images tested.

## Setup

In [None]:
import sys
from pathlib import Path
import json
from datetime import datetime
import time

# Add src to path
project_root = Path().absolute().parent
sys.path.insert(0, str(project_root))

from src import (
    load_metadata,
    get_inscription_data,
    list_available_inscriptions,
    LlamaVision,
    SafaiticPrompts,
    InscriptionEvaluator
)

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

print("✓ Imports successful")

In [None]:
# Build the three collaborators used by every later cell: the vision-model
# client, the prompt library and the response scorer.
llama = LlamaVision(model_name="llama3.2-vision", timeout=180)
prompts = SafaiticPrompts()
evaluator = InscriptionEvaluator()

# Stop here if the model cannot be reached; nothing below works without it.
if llama.check_availability():
    print("✓ Model ready for batch processing")
else:
    raise RuntimeError("Llama 3.2 Vision not available. Cannot proceed.")

In [None]:
# Locate the image directory and the metadata CSV relative to the project root
data_dir = project_root / "data"
metadata_path = project_root / "metadata" / "BES15.csv"

# Read the metadata table, then ask which inscriptions can actually be evaluated
df = load_metadata(str(metadata_path))
available_inscriptions = list_available_inscriptions(df)

print("Total available inscriptions:", len(available_inscriptions))

## Configure Batch Evaluation

In [None]:
# Batch settings: the evaluation, run and export cells all read from this mapping.
CONFIG = {
    # Inscriptions to test: the first 10 available IDs as an initial run
    "inscription_ids": available_inscriptions[:10],

    # Name of the prompt template to use
    # (options: see SafaiticPrompts.get_all_prompts())
    "prompt_template": "transliteration_attempt",

    # Image selection per inscription. The strategy is "first", "random" or
    # "all"; the count is not consulted when the strategy is "all".
    "images_per_inscription": 1,
    "image_selection": "first",

    # Model parameters
    "temperature": 0.1,

    # Output
    "save_results": True,
    "results_dir": project_root / "results",
}

# Echo the effective settings so they are recorded in the notebook output
print("Configuration:")
print("\n".join(f"  {key}: {value}" for key, value in CONFIG.items()))

## Define Evaluation Function

In [None]:
def _select_images(images, config: dict):
    """
    Choose which of an inscription's images are sent to the model.

    Args:
        images: Sequence of loaded images for one inscription.
        config: Batch configuration; reads 'image_selection' and, unless the
            strategy is 'all', 'images_per_inscription'.

    Returns:
        The selected images. 'all' returns every image, 'first' returns the
        leading images, and any other strategy draws a random sample without
        replacement. A count of None or a negative number (e.g. -1) means
        "no limit" for every strategy.
    """
    strategy = config['image_selection']
    if strategy == 'all':
        return images

    limit = config['images_per_inscription']
    unlimited = limit is None or limit < 0

    if strategy == 'first':
        # A plain images[:-1] would silently drop the last image, so the
        # "no limit" case has to be handled before slicing.
        return images if unlimited else images[:limit]

    # Any other strategy is treated as 'random'. random.sample rejects a
    # negative or oversized count, so clamp it to the population size.
    import random
    sample_size = len(images) if unlimited else min(limit, len(images))
    return random.sample(images, sample_size)


def evaluate_inscription(inscription_id: str, config: dict) -> dict:
    """
    Evaluate a single inscription.

    Loads the inscription via the notebook-level ``df`` and ``data_dir``,
    sends the selected images to ``llama`` with the prompt named by
    ``config['prompt_template']``, scores every successful response with
    ``evaluator`` and averages the scores over the successful images.

    Args:
        inscription_id: Identifier of the inscription to evaluate.
        config: Batch configuration; reads 'image_selection',
            'images_per_inscription', 'prompt_template' and 'temperature'.

    Returns:
        Dictionary with evaluation results. On success it has 'status' set to
        'success' plus the ground truth, the per-image results and the
        averaged metrics (all 0.0 when no image was analyzed successfully;
        'num_images_succeeded' tells the two cases apart). If there are no
        images, or any exception is raised, it has 'status' set to 'error'
        and an 'error' message instead, so one bad inscription never aborts
        the batch.
    """
    try:
        # Load inscription data
        inscription = get_inscription_data(df, str(data_dir), inscription_id, load_images=True)

        if not inscription.images:
            return {
                'inscription_id': inscription_id,
                'status': 'error',
                'error': 'No images found',
            }

        # Select images to test
        test_images = _select_images(inscription.images, config)

        # The template name is the name of a method on the prompts object
        prompt = getattr(prompts, config['prompt_template'])()

        # Analyze images
        image_results = []
        for idx, image in enumerate(test_images):
            result = llama.analyze_image(
                image,
                prompt,
                temperature=config['temperature']
            )

            if not result['success']:
                image_results.append({
                    'image_index': idx,
                    'success': False,
                    'error': result.get('error'),
                })
                continue

            # Score the response against the ground truth
            evaluation = evaluator.evaluate_inscription(
                inscription.transliteration,
                inscription.translation,
                result['response']
            )

            # Add script identification check
            script_check = evaluator.check_script_identification(result['response'])

            image_results.append({
                'image_index': idx,
                'success': True,
                'response': result['response'],
                'evaluation': evaluation,
                'script_identification': script_check,
                'duration': result.get('total_duration'),
            })

        # Calculate aggregate metrics over the successful images only
        successful_evals = [r['evaluation'] for r in image_results if r['success']]

        if successful_evals:
            # float() keeps the record made of plain Python numbers
            avg_score = float(np.mean([e['overall_score'] for e in successful_evals]))
            avg_char_sim = float(np.mean([e['character_metrics']['similarity_ratio'] for e in successful_evals]))
            avg_word_acc = float(np.mean([e['word_metrics']['word_accuracy'] for e in successful_evals]))
        else:
            avg_score = avg_char_sim = avg_word_acc = 0.0

        return {
            'inscription_id': inscription_id,
            'status': 'success',
            'ground_truth': {
                'transliteration': inscription.transliteration,
                'translation': inscription.translation,
                'transliteration_length': len(inscription.transliteration),
            },
            'num_images_tested': len(test_images),
            'num_images_succeeded': len(successful_evals),
            'image_results': image_results,
            'aggregate_metrics': {
                'avg_overall_score': avg_score,
                'avg_character_similarity': avg_char_sim,
                'avg_word_accuracy': avg_word_acc,
            },
        }

    except Exception as e:
        # Deliberately broad: this is the per-inscription boundary of the
        # batch, and the failure is reported in the returned record.
        return {
            'inscription_id': inscription_id,
            'status': 'error',
            'error': str(e),
        }

print("✓ Evaluation function defined")

## Run Batch Evaluation

In [None]:
# Run evaluations: one record per inscription, in the configured order
results = []
start_time = time.time()

print(f"Starting evaluation of {len(CONFIG['inscription_ids'])} inscriptions...\n")

for inscription_id in tqdm(CONFIG['inscription_ids'], desc="Evaluating"):
    result = evaluate_inscription(inscription_id, CONFIG)
    results.append(result)

    # Brief pause between inscriptions to avoid overwhelming the system
    time.sleep(0.5)

elapsed_time = time.time() - start_time
print(f"\n✓ Evaluation complete in {elapsed_time:.1f} seconds")
# An empty selection would otherwise divide by zero here
if results:
    print(f"  Average time per inscription: {elapsed_time/len(results):.1f}s")

## Analyze Results

In [None]:
# Split the batch records by outcome
successful_results = []
failed_results = []
for record in results:
    outcome = record['status']
    if outcome == 'success':
        successful_results.append(record)
    elif outcome == 'error':
        failed_results.append(record)

print("Results Summary:")
print(f"  Total inscriptions: {len(results)}")
print(f"  Successful: {len(successful_results)}")
print(f"  Failed: {len(failed_results)}")

# List each failure with the reason recorded for it
if failed_results:
    print("\nFailed inscriptions:")
    for failure in failed_results:
        print(f"  - {failure['inscription_id']}: {failure['error']}")

In [None]:
# Performance metrics across all successfully evaluated inscriptions
if successful_results:
    # One value per inscription for each averaged metric; the plotting cell
    # reuses these three lists.
    metric_rows = [r['aggregate_metrics'] for r in successful_results]
    scores = [m['avg_overall_score'] for m in metric_rows]
    char_sims = [m['avg_character_similarity'] for m in metric_rows]
    word_accs = [m['avg_word_accuracy'] for m in metric_rows]

    print("\nPerformance Metrics:")

    # The overall score gets the full set of descriptive statistics
    print("  Overall Score:")
    print(f"    Mean: {np.mean(scores):.3f}")
    print(f"    Median: {np.median(scores):.3f}")
    print(f"    Std Dev: {np.std(scores):.3f}")
    print(f"    Range: {np.min(scores):.3f} - {np.max(scores):.3f}")

    # The two component metrics only report mean and median
    for heading, values in (("Character Similarity", char_sims), ("Word Accuracy", word_accs)):
        print(f"\n  {heading}:")
        print(f"    Mean: {np.mean(values):.3f}")
        print(f"    Median: {np.median(values):.3f}")

In [None]:
# Script identification accuracy, counted per successfully analyzed image
if successful_results:
    # Flatten the per-inscription image results, then keep the script checks
    # of the images the model actually answered for.
    all_image_results = [img for r in successful_results for img in r['image_results']]
    script_identifications = [
        img['script_identification']
        for img in all_image_results
        if img['success'] and 'script_identification' in img
    ]

    if script_identifications:
        total_ids = len(script_identifications)
        correct_ids = len([s for s in script_identifications if s['correctly_identified']])
        confidences = [s['confidence'] for s in script_identifications]

        print("\nScript Identification:")
        print(f"  Correctly identified as Safaitic: {correct_ids}/{total_ids} ({100*correct_ids/total_ids:.1f}%)")
        print(f"  Average confidence: {np.mean(confidences):.3f}")

## Visualize Results

In [None]:
if not successful_results:
    print("No successful results to visualize")
else:
    # 2x2 grid of panels, unpacked row by row into named axes
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    (ax_hist, ax_bars), (ax_metrics, ax_length) = axes

    # 1. Score distribution, with the mean marked
    mean_score = np.mean(scores)
    ax_hist.hist(scores, bins=20, edgecolor='black', alpha=0.7)
    ax_hist.axvline(mean_score, color='red', linestyle='--', label=f'Mean: {mean_score:.3f}')
    ax_hist.set_xlabel('Overall Score')
    ax_hist.set_ylabel('Frequency')
    ax_hist.set_title('Distribution of Overall Scores')
    ax_hist.legend()
    ax_hist.grid(True, alpha=0.3)

    # 2. Performance by inscription: one horizontal bar per inscription
    inscr_ids = [r['inscription_id'] for r in successful_results]
    inscr_scores = [r['aggregate_metrics']['avg_overall_score'] for r in successful_results]
    bar_positions = range(len(inscr_ids))

    ax_bars.barh(bar_positions, inscr_scores, color='skyblue', edgecolor='black')
    ax_bars.set_yticks(bar_positions)
    ax_bars.set_yticklabels(inscr_ids, fontsize=8)
    ax_bars.set_xlabel('Overall Score')
    ax_bars.set_title('Performance by Inscription')
    ax_bars.grid(True, alpha=0.3, axis='x')

    # 3. Character vs Word accuracy, with the y = x diagonal as a reference
    ax_metrics.scatter(char_sims, word_accs, alpha=0.6, s=100)
    ax_metrics.set_xlabel('Character Similarity')
    ax_metrics.set_ylabel('Word Accuracy')
    ax_metrics.set_title('Character Similarity vs Word Accuracy')
    ax_metrics.grid(True, alpha=0.3)
    ax_metrics.plot([0, 1], [0, 1], 'r--', alpha=0.5, label='Perfect correlation')
    ax_metrics.legend()

    # 4. Score vs transliteration length
    translit_lengths = [len(r['ground_truth']['transliteration']) for r in successful_results]
    ax_length.scatter(translit_lengths, inscr_scores, alpha=0.6, s=100)
    ax_length.set_xlabel('Transliteration Length (characters)')
    ax_length.set_ylabel('Overall Score')
    ax_length.set_title('Performance vs Inscription Length')
    ax_length.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

## Identify Best and Worst Performers

In [None]:
if successful_results:
    def _overall_score(record):
        """Return the averaged overall score of one inscription record."""
        return record['aggregate_metrics']['avg_overall_score']

    # Rank inscriptions from highest to lowest overall score
    sorted_results = sorted(successful_results, key=_overall_score, reverse=True)

    # The worst group is the tail of the ranking, listed lowest score first.
    # With fewer than ten results the two groups overlap.
    leaderboards = (
        ("Top 5 Best Performers:", sorted_results[:5]),
        ("\nTop 5 Worst Performers:", sorted_results[-5:][::-1]),
    )

    for heading, group in leaderboards:
        print(heading)
        for rank, entry in enumerate(group, 1):
            print(f"  {rank}. {entry['inscription_id']}: {_overall_score(entry):.3f}")
            print(f"     Ground truth: {entry['ground_truth']['transliteration']}")

## Save Results

In [None]:
if CONFIG['save_results']:
    # Create results directory, including any missing parent directories
    CONFIG['results_dir'].mkdir(parents=True, exist_ok=True)

    # Generate filename with timestamp (the CSV export cell reuses `timestamp`)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"batch_eval_{CONFIG['prompt_template']}_{timestamp}.json"
    output_path = CONFIG['results_dir'] / filename

    def _batch_mean(metric_key):
        """Average one aggregate metric over the successful inscriptions (0 if none)."""
        # Computed from successful_results rather than the scores / char_sims /
        # word_accs lists: those are only defined when the metrics cell found
        # at least one success, so a fully failed batch could not be saved.
        values = [r['aggregate_metrics'][metric_key] for r in successful_results]
        return float(np.mean(values)) if values else 0

    def _to_jsonable(obj):
        """json.dump fallback that converts NumPy scalars and arrays to plain Python."""
        # NOTE(review): whether the evaluator output contains NumPy values is
        # not visible here; this only runs for objects json cannot encode.
        if isinstance(obj, np.generic):
            return obj.item()
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")

    # Prepare output data
    output = {
        'metadata': {
            'timestamp': timestamp,
            'model': 'llama3.2-vision',
            'num_inscriptions': len(CONFIG['inscription_ids']),
            'prompt_template': CONFIG['prompt_template'],
            # Values are stringified; results_dir is left out of the saved config
            'config': {k: str(v) for k, v in CONFIG.items() if k != 'results_dir'},
            'duration_seconds': elapsed_time,
        },
        'summary': {
            'total_inscriptions': len(results),
            'successful': len(successful_results),
            'failed': len(failed_results),
            'avg_overall_score': _batch_mean('avg_overall_score'),
            'avg_character_similarity': _batch_mean('avg_character_similarity'),
            'avg_word_accuracy': _batch_mean('avg_word_accuracy'),
        },
        'results': results,
    }

    # Save to JSON; ensure_ascii=False keeps non-ASCII characters readable
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2, ensure_ascii=False, default=_to_jsonable)

    print(f"✓ Results saved to: {output_path}")
    print(f"  File size: {output_path.stat().st_size / 1024:.1f} KB")
else:
    print("Results not saved (CONFIG['save_results'] = False)")

## Export Summary CSV

In [None]:
# Flatten the successful records into one table row per inscription
if successful_results:
    def _summary_row(record):
        """Return the flat summary columns for one inscription record."""
        truth = record['ground_truth']
        metrics = record['aggregate_metrics']
        return {
            'inscription_id': record['inscription_id'],
            'transliteration': truth['transliteration'],
            'transliteration_length': truth['transliteration_length'],
            'overall_score': metrics['avg_overall_score'],
            'character_similarity': metrics['avg_character_similarity'],
            'word_accuracy': metrics['avg_word_accuracy'],
            'num_images_tested': record['num_images_tested'],
        }

    summary_data = [_summary_row(r) for r in successful_results]
    summary_df = pd.DataFrame(summary_data)

    # Display
    print("\nSummary DataFrame:")
    display(summary_df)

    # Save to CSV next to the JSON export, reusing that cell's timestamp
    if CONFIG['save_results']:
        csv_name = f"summary_{CONFIG['prompt_template']}_{timestamp}.csv"
        csv_path = CONFIG['results_dir'] / csv_name
        summary_df.to_csv(csv_path, index=False)
        print(f"\n✓ Summary CSV saved to: {csv_path}")

## Next Steps

- Analyze the detailed results JSON file
- Compare performance across different prompt templates
- Test on larger subsets of inscriptions
- Investigate worst-performing cases
- Try different temperature settings
- Test multi-image analysis strategies