# Annotate Latin Inscriptions with Gemini API

This notebook provides a complete workflow for:
1. Downloading epigraphic data from EDH (Epigraphic Database Heidelberg)
2. Annotating inscriptions using the Gemini Flash API (the exact model is set by `GEMINI_MODEL` in the configuration cell)
3. Validating annotation quality
4. Saving results and committing to git

## Prerequisites

1. **Google AI API Key**: Get one at https://aistudio.google.com/app/apikey
2. **Installed packages**: Run the setup cell below

## Cost Estimate

- Gemini Flash: ~$0.20-0.50 USD per 1000 inscriptions (rough estimate; actual cost depends on the model set in `GEMINI_MODEL`)
- Processing time: ~20-40 minutes per 1000 inscriptions (with 1s delay)

## 1. Setup and Installation

In [None]:
# Install required packages
!pip install -q google-generativeai pandas requests

# Install latinepi if not already installed
import os
if not os.path.exists('latinepi'):
    print("Installing latinepi package...")
    !pip install -e . -q
else:
    print("latinepi already available")

In [None]:
# Imports
import json
import time
import re
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Optional
import pandas as pd
from collections import Counter, defaultdict

try:
    import google.generativeai as genai
    print("‚úÖ Google Generative AI imported successfully")
except ImportError:
    print("‚ùå Please run the installation cell above")
    raise

## 2. Configuration

In [None]:
# ========================================
# CONFIGURATION - Adjust these settings
# ========================================

# Google AI API Key (read from the environment; empty string when unset)
GOOGLE_AI_API_KEY = os.environ.get("GOOGLE_AI_API_KEY", "")

if not GOOGLE_AI_API_KEY:
    print("‚ö†Ô∏è  No API key found in environment.")
    print("   Option 1: Set environment variable: export GOOGLE_AI_API_KEY='your-key'")
    print("   Option 2: Enter it directly in the next cell")
else:
    # Never echo any part of the key: cell output is stored in the notebook
    # file and can end up in version control.
    print(f"‚úÖ API key loaded from environment ({len(GOOGLE_AI_API_KEY)} characters)")

# Gemini model to use
GEMINI_MODEL = "gemini-2.0-flash-exp"  # Fast and cost-effective
# Alternative: "gemini-1.5-pro" for higher quality (more expensive)

# Output paths
OUTPUT_DIR = Path("assets")
OUTPUT_DIR.mkdir(exist_ok=True)

ANNOTATIONS_FILE = OUTPUT_DIR / "gemini_annotations.jsonl"
CHECKPOINT_FILE = OUTPUT_DIR / "gemini_annotations.jsonl.tmp"

# Processing parameters
SAVE_CHECKPOINT_EVERY = 50  # Save progress every N inscriptions
API_DELAY = 1.0  # Seconds between API calls (rate limiting)
MAX_RETRIES = 3  # Retry failed API calls

print("\nüìã Configuration loaded:")
print(f"   Model: {GEMINI_MODEL}")
print(f"   Output: {ANNOTATIONS_FILE}")
print(f"   Checkpoint every: {SAVE_CHECKPOINT_EVERY} inscriptions")
print(f"   API delay: {API_DELAY}s")

In [None]:
# If you need to set API key directly (not from environment):
# GOOGLE_AI_API_KEY = "paste-your-key-here"
# genai.configure(api_key=GOOGLE_AI_API_KEY)

## 3. Download Inscription Data from EDH

We'll use the `latinepi` package to search and download inscriptions from the Epigraphic Database Heidelberg.

In [None]:
# Query settings for the EDH search.
# Edit these values to pull a different set of inscriptions.
SEARCH_PARAMS = dict(
    year_from=1,    # earliest year: 1 CE
    year_to=100,    # latest year: 100 CE (1st century)
    limit=2000,     # how many inscriptions to download
)

# Every run gets its own folder, named after the moment this cell executed.
DOWNLOAD_DIR = Path("edh_downloads") / ("batch_" + datetime.now().strftime("%Y%m%d_%H%M%S"))

print("üîç Search parameters:")
print("\n".join(f"   {name}: {SEARCH_PARAMS[name]}" for name in SEARCH_PARAMS))
print(f"\nüìÇ Will download to: {DOWNLOAD_DIR}")

In [None]:
# Download inscriptions from EDH using latinepi command-line tool
import subprocess

DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True)

cmd = [
    "latinepi",
    "--search-edh",
    "--search-year-from", str(SEARCH_PARAMS["year_from"]),
    "--search-year-to", str(SEARCH_PARAMS["year_to"]),
    "--search-limit", str(SEARCH_PARAMS["limit"]),
    "--download-dir", str(DOWNLOAD_DIR),
]

print(f"‚è≥ Downloading {SEARCH_PARAMS['limit']} inscriptions from EDH...")
print(f"   Command: {' '.join(cmd)}")
print("\n" + "="*60)

try:
    result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
    print(result.stdout)
    if result.stderr:
        print("Errors/Warnings:")
        print(result.stderr)
    # subprocess.run does not raise on a failing exit status unless check=True,
    # so report it explicitly instead of silently claiming success.
    if result.returncode != 0:
        print(f"‚ùå latinepi exited with status {result.returncode}")

except subprocess.TimeoutExpired:
    print("‚ö†Ô∏è  Download timed out after 5 minutes")
except Exception as e:
    print(f"‚ùå Error during download: {e}")

# Count downloaded files. This runs whether or not the command succeeded so
# that `json_files` is always defined for later cells, and so that files
# written before a timeout are still reported.
json_files = list(DOWNLOAD_DIR.glob("*.json"))
print("\n" + "="*60)
if json_files:
    print(f"‚úÖ Downloaded {len(json_files)} inscription files")
else:
    print(f"‚ö†Ô∏è  No inscription files found in {DOWNLOAD_DIR}")

## 4. Load and Prepare Inscription Data

In [None]:
def clean_leiden_text(text):
    """
    Strip Leiden-convention editorial markup from an inscription transcription.

    Missing values (anything ``pd.isna`` reports as NA) and the empty string
    yield ``""``. Any other input is converted with ``str()`` and then:

    * lost-text markers such as ``[---]``, ``[--?]``, ``---]`` and ``[---``
      are dropped;
    * ``/`` (line break) becomes a space;
    * ``(``, ``)``, ``[``, ``]`` and ``?`` are deleted, keeping the letters
      around them;
    * runs of whitespace collapse to one space and the ends are trimmed.
    """
    if pd.isna(text) or text == "":
        return ""

    cleaned = str(text)

    # Lost-text markers: the fully bracketed form first, then the half-open ones.
    for lacuna_pattern in (r"\[\s*-+\??\s*\]", r"-+\]", r"\[-+"):
        cleaned = re.sub(lacuna_pattern, "", cleaned)

    # Single pass over the string: "/" -> " ", and delete ( ) [ ] ?
    cleaned = cleaned.translate(str.maketrans("/", " ", "()[]?"))

    return re.sub(r"\s+", " ", cleaned).strip()


def load_inscriptions_from_json_dir(json_dir: Path) -> List[Dict]:
    """
    Load and clean inscriptions from a directory of JSON files.

    Files are read in sorted path order so the returned list is the same on
    every run; the checkpoint/resume logic addresses inscriptions by their
    position in this list and therefore needs a stable order.

    Each ``*.json`` file must contain a JSON object. Its ``transcription``
    field is cleaned with ``clean_leiden_text``; records whose cleaned
    transcription has two characters or fewer are dropped. Files that cannot
    be read or parsed are reported with a printed warning and skipped.

    Args:
        json_dir: Directory to scan (non-recursively) for ``*.json`` files.

    Returns:
        List of dicts with keys: id, text, transcription. ``id`` falls back
        to the file stem when the record has none, ``text`` is the record's
        ``diplomatic_text`` (empty string when absent or null), and
        ``transcription`` is the cleaned transcription.
    """
    # glob() yields entries in arbitrary filesystem order; sort for determinism.
    json_files = sorted(json_dir.glob("*.json"))

    inscriptions = []

    for json_file in json_files:
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            if not isinstance(data, dict):
                raise ValueError(f"expected a JSON object, got {type(data).__name__}")

            # Get diplomatic text and transcription; treat JSON null like "missing"
            raw_text = data.get("diplomatic_text") or ""
            transcription = data.get("transcription") or ""

            # Clean the transcription (the result is already whitespace-trimmed)
            cleaned = clean_leiden_text(transcription)

            # Only include if we have text
            if len(cleaned) > 2:
                inscriptions.append({
                    "id": data.get("id", json_file.stem),
                    "text": raw_text,
                    "transcription": cleaned
                })

        except Exception as e:
            # Best effort: one unreadable file must not abort the whole batch.
            print(f"‚ö†Ô∏è  Error loading {json_file.name}: {e}")
            continue

    return inscriptions


# Read every downloaded record and keep the ones with usable text.
print(f"üìö Loading inscriptions from {DOWNLOAD_DIR}...")
inscriptions = load_inscriptions_from_json_dir(DOWNLOAD_DIR)
loaded_count = len(inscriptions)

print(f"\n‚úÖ Loaded {loaded_count} inscriptions with valid text")

# Preview the first record so the result of the cleaning step can be inspected.
if loaded_count > 0:
    sample = inscriptions[0]
    sample_text = sample['transcription']
    print("\n".join([
        "\nüìã Sample inscription:",
        f"   ID: {sample['id']}",
        f"   Text: {sample_text[:80]}...",
        f"   Length: {len(sample_text)} characters",
    ]))

## 5. Load Annotation Prompt

In [None]:
# Load the annotation prompt
PROMPT_FILE = Path("gemini_annotation_prompt.md")

if not PROMPT_FILE.exists():
    # Fail fast: without the prompt, ANNOTATION_PROMPT would stay undefined and
    # the annotation cell would only fail later with a confusing NameError.
    raise FileNotFoundError(
        f"Prompt file not found: {PROMPT_FILE}\n"
        "Make sure you're running this notebook from the repository root"
    )

with open(PROMPT_FILE, 'r', encoding='utf-8') as f:
    ANNOTATION_PROMPT = f.read()

if not ANNOTATION_PROMPT.strip():
    # An empty prompt would send every inscription to the API without instructions.
    raise ValueError(f"Prompt file is empty: {PROMPT_FILE}")

print(f"‚úÖ Loaded annotation prompt from {PROMPT_FILE}")
print(f"   Prompt length: {len(ANNOTATION_PROMPT)} characters")
# Rough heuristic only: whitespace-separated words times 1.3.
print(f"   Estimated tokens: ~{len(ANNOTATION_PROMPT.split()) * 1.3:.0f}")

## 6. Initialize Gemini API

In [None]:
# Configure Gemini API -- refuse to continue without credentials.
if not GOOGLE_AI_API_KEY:
    raise ValueError(
        "No API key set! Either:\n"
        "1. Set environment variable: export GOOGLE_AI_API_KEY='your-key'\n"
        "2. Or set it directly in the configuration cell above"
    )

genai.configure(api_key=GOOGLE_AI_API_KEY)

# Sampling settings passed to the model; the low temperature keeps the
# output as repeatable as possible.
generation_settings = dict(
    temperature=0.1,
    top_p=0.95,
    top_k=40,
    max_output_tokens=2048,
)

# Initialize the model
model = genai.GenerativeModel(model_name=GEMINI_MODEL, generation_config=generation_settings)

for banner_line in (
    "‚úÖ Gemini API configured",
    f"   Model: {GEMINI_MODEL}",
    "   Temperature: 0.1 (deterministic)",
    "\nüß™ Testing API connection...",
):
    print(banner_line)

# One tiny request up front, so a bad key or model name surfaces here rather
# than in the middle of the batch run.
try:
    test_response = model.generate_content("Say 'API connection successful' in exactly those words.")
    print(f"   Response: {test_response.text.strip()}")
    print("   ‚úÖ API is working!")
except Exception as e:
    print(f"   ‚ùå API test failed: {e}")
    raise

## 7. Annotation Functions

In [None]:
def annotate_single_inscription(
    inscription: Dict,
    prompt: str,
    model,
    max_retries: Optional[int] = None,
) -> Optional[Dict]:
    """
    Annotate a single inscription using Gemini API.

    The inscription's ``id``, ``text`` and ``transcription`` are serialised to
    JSON, appended to ``prompt`` and sent through ``model.generate_content``.
    The reply is expected to be a JSON object (optionally wrapped in a
    Markdown code fence) containing an ``annotations`` list.

    Args:
        inscription: Dict with keys: id, text, transcription
        prompt: The annotation prompt (system instructions)
        model: Object exposing ``generate_content(str)`` whose return value
            has a ``.text`` attribute (the Gemini model instance)
        max_retries: Maximum number of attempts. Defaults to the notebook-level
            ``MAX_RETRIES`` setting when omitted.

    Returns:
        The parsed reply dict, with any of ``id`` / ``text`` / ``transcription``
        the model left out filled in from ``inscription``; or None if the call
        kept failing, the reply was not valid JSON, or the reply was not an
        object with a list-valued ``annotations`` field. Failures are reported
        with a printed message.
    """
    if max_retries is None:
        max_retries = MAX_RETRIES

    # Prepare input JSON
    source_fields = {
        "id": inscription.get("id", ""),
        "text": inscription.get("text", ""),
        "transcription": inscription.get("transcription", "")
    }
    input_json = json.dumps(source_fields, ensure_ascii=False, indent=2)

    # Construct full prompt
    full_prompt = f"""{prompt}

---

Please annotate the following inscription:

{input_json}

Return ONLY the JSON object with annotations added. No other text.
"""

    # Try API call with retries
    for attempt in range(max_retries):
        try:
            response = model.generate_content(full_prompt)
            response_text = response.text.strip()

            # Clean up response: drop a surrounding Markdown code fence,
            # including a language tag in any letter case (```json, ```JSON, ...).
            response_text = re.sub(r"^```[A-Za-z]*", "", response_text)
            if response_text.endswith("```"):
                response_text = response_text[:-3]
            response_text = response_text.strip()

            # Parse JSON
            result = json.loads(response_text)

            # Validate structure. A bare JSON string/list/number would otherwise
            # reach the membership test below and be misread.
            if not isinstance(result, dict):
                print(f"‚ö†Ô∏è  Response is not a JSON object for {inscription.get('id')}")
                return None

            if "annotations" not in result:
                print(f"‚ö†Ô∏è  Missing 'annotations' field for {inscription.get('id')}")
                return None

            if not isinstance(result["annotations"], list):
                print(f"‚ö†Ô∏è  Invalid annotations format for {inscription.get('id')}")
                return None

            # Later steps read id/transcription from the saved record, so make
            # sure they are present even when the model did not echo them back.
            for key, value in source_fields.items():
                result.setdefault(key, value)

            return result

        except json.JSONDecodeError:
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)  # Exponential backoff
                continue
            print(f"‚ùå JSON parse error for {inscription.get('id')} after {max_retries} attempts")
            return None

        except Exception as e:
            # Covers API/network errors and replies without usable text.
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)
                continue
            print(f"‚ùå API error for {inscription.get('id')}: {e}")
            return None

    return None


def save_checkpoint(results: List[Dict], checkpoint_file: Path):
    """
    Save intermediate results to checkpoint file.

    Records are written one JSON object per line (JSON Lines, UTF-8,
    non-ASCII characters kept as-is). The data is first written to a sibling
    ``<name>.partial`` file which then replaces ``checkpoint_file``, so an
    interruption in the middle of a save leaves the previous checkpoint
    intact instead of a truncated one.

    Args:
        results: Records to persist, in order.
        checkpoint_file: Destination path; overwritten if it exists.
    """
    checkpoint_file = Path(checkpoint_file)
    partial_file = checkpoint_file.with_name(checkpoint_file.name + ".partial")

    with open(partial_file, 'w', encoding='utf-8') as f:
        for result in results:
            f.write(json.dumps(result, ensure_ascii=False) + '\n')

    # Swap the finished file into place only after it was written completely.
    partial_file.replace(checkpoint_file)


def save_final(results: List[Dict], output_file: Path, checkpoint_file: Path):
    """
    Write the finished records to ``output_file`` and drop the checkpoint.

    The output is JSON Lines (UTF-8, one record per line). The internal
    ``_error`` marker is left out of every written record. Afterwards
    ``checkpoint_file`` is deleted if it exists.
    """
    with open(output_file, 'w', encoding='utf-8') as out:
        out.writelines(
            json.dumps(
                {field: value for field, value in record.items() if field != "_error"},
                ensure_ascii=False,
            ) + '\n'
            for record in results
        )

    # The checkpoint is only needed while a run is in progress.
    if checkpoint_file.exists():
        checkpoint_file.unlink()

print("‚úÖ Annotation functions defined")

## 8. Run Batch Annotation

⚠️ **Important**: This cell will make API calls and may take 30-60 minutes for 2000 inscriptions.

Progress is saved every 50 inscriptions, so you can interrupt and resume if needed.

In [None]:
# Optional: Limit number of inscriptions for testing
TEST_LIMIT = None  # Set to e.g. 10 for testing, None for all

if TEST_LIMIT:
    inscriptions_to_process = inscriptions[:TEST_LIMIT]
    print(f"‚ö†Ô∏è  TEST MODE: Processing only {TEST_LIMIT} inscriptions")
else:
    inscriptions_to_process = inscriptions
    print(f"üìù Processing all {len(inscriptions)} inscriptions")

# Check for existing checkpoint
completed = []
resume_from = 0

if CHECKPOINT_FILE.exists():
    print(f"\nüìÇ Found checkpoint file: {CHECKPOINT_FILE}")
    with open(CHECKPOINT_FILE, 'r', encoding='utf-8') as f:
        completed = [json.loads(line) for line in f if line.strip()]
    # Resuming is positional: record k of the checkpoint belongs to input k.
    # Never keep more checkpoint records than there are inputs in this run
    # (e.g. when TEST_LIMIT was lowered since the checkpoint was written).
    completed = completed[:len(inscriptions_to_process)]
    resume_from = len(completed)
    print(f"   Resuming from index {resume_from}")

# Statistics (records restored from the checkpoint keep their original outcome)
resumed_failures = sum(1 for r in completed if r.get('_error'))
stats = {
    'start_time': datetime.now(),
    'total': len(inscriptions_to_process),
    'successful': len(completed) - resumed_failures,
    'failed': resumed_failures,
    'total_entities': sum(len(r.get('annotations', [])) for r in completed)
}

print(f"\nüöÄ Starting annotation...")
print(f"   Total: {stats['total']}")
print(f"   Already completed: {len(completed)}")
print(f"   Remaining: {stats['total'] - resume_from}")
print(f"   Save checkpoint every: {SAVE_CHECKPOINT_EVERY}")
print(f"   API delay: {API_DELAY}s")
print("\n" + "="*70)

# Process inscriptions
try:
    for i, inscription in enumerate(inscriptions_to_process[resume_from:], start=resume_from):
        inscription_id = inscription.get('id', f'index_{i}')

        # Progress indicator
        progress_pct = (i + 1) / stats['total'] * 100
        print(f"[{i+1}/{stats['total']} ({progress_pct:.1f}%)] {inscription_id}... ", end="", flush=True)

        # Annotate
        result = annotate_single_inscription(inscription, ANNOTATION_PROMPT, model)

        if result:
            completed.append(result)
            num_entities = len(result.get('annotations', []))
            stats['successful'] += 1
            stats['total_entities'] += num_entities
            print(f"‚úì ({num_entities} entities)")
        else:
            # Save failed case with error marker
            completed.append({
                **inscription,
                "annotations": [],
                "_error": True
            })
            stats['failed'] += 1
            print("‚úó (failed)")

        # Save checkpoint
        if (i + 1) % SAVE_CHECKPOINT_EVERY == 0:
            save_checkpoint(completed, CHECKPOINT_FILE)
            elapsed = (datetime.now() - stats['start_time']).total_seconds()
            # Only items handled in this session took `elapsed` seconds.
            processed_this_run = i + 1 - resume_from
            rate = processed_this_run / elapsed if elapsed > 0 else 0
            remaining = (stats['total'] - i - 1) / rate if rate > 0 else 0
            print(f"   üíæ Checkpoint saved ({len(completed)} completed, ~{remaining/60:.1f} min remaining)")

        # Rate limiting
        if i < stats['total'] - 1:
            time.sleep(API_DELAY)
except KeyboardInterrupt:
    # Keep everything finished so far, so re-running this cell resumes from
    # here rather than from the last periodic checkpoint.
    save_checkpoint(completed, CHECKPOINT_FILE)
    print(f"\n‚ö†Ô∏è  Interrupted - progress saved to {CHECKPOINT_FILE} ({len(completed)} completed)")
    raise

# Save final results
print("\n" + "="*70)
print("üíæ Saving final results...")
save_final(completed, ANNOTATIONS_FILE, CHECKPOINT_FILE)

# Final statistics (guarded so an empty batch reports zeros instead of crashing)
elapsed = (datetime.now() - stats['start_time']).total_seconds()
avg_entities = stats['total_entities'] / max(1, stats['successful'])
success_rate = stats['successful'] / stats['total'] * 100 if stats['total'] else 0.0
processed_this_run = stats['total'] - resume_from
rate_per_minute = processed_this_run / elapsed * 60 if elapsed > 0 else 0.0

print("\n" + "="*70)
print("‚úÖ ANNOTATION COMPLETE")
print("="*70)
print(f"Total inscriptions: {stats['total']}")
print(f"Successful: {stats['successful']}")
print(f"Failed: {stats['failed']}")
print(f"Success rate: {success_rate:.1f}%")
print(f"\nTotal entities annotated: {stats['total_entities']}")
print(f"Avg entities per inscription: {avg_entities:.1f}")
print(f"\nTime elapsed: {elapsed/60:.1f} minutes")
print(f"Rate: {rate_per_minute:.1f} inscriptions/minute")
print(f"\nOutput saved to: {ANNOTATIONS_FILE}")
print("="*70)

## 9. Validate Annotations

Run quality checks on the annotated data.

In [None]:
# Quick validation
print("üîç Running validation checks...\n")

# Load results (blank lines are tolerated)
with open(ANNOTATIONS_FILE, 'r', encoding='utf-8') as f:
    results = [json.loads(line) for line in f if line.strip()]

# Count entity types
entity_counts = Counter()
errors = []
overlapping = []

for record in results:
    text = record.get('transcription') or ''
    annotations = record.get('annotations', [])

    if not annotations:
        continue

    # Annotations are expected to be [start, end, label] with integer offsets
    # into the transcription. Anything else is recorded as an error and kept
    # out of the overlap check, which needs comparable offsets.
    well_formed = []
    for ann in annotations:
        is_span = (
            isinstance(ann, (list, tuple))
            and len(ann) == 3
            and all(isinstance(v, int) and not isinstance(v, bool) for v in ann[:2])
            and isinstance(ann[2], str)
        )
        if not is_span:
            errors.append({
                'id': record.get('id'),
                'error': f'Malformed annotation {ann!r} (expected [start, end, label])'
            })
            continue

        start, end, label = ann
        entity_counts[label] += 1
        well_formed.append(ann)

        # Check bounds
        if start < 0 or end > len(text) or start >= end:
            errors.append({
                'id': record.get('id'),
                'error': f'Invalid bounds [{start}:{end}] for text length {len(text)}'
            })

    # Check for overlaps. Walking the spans in start order while remembering
    # the one that reaches furthest also catches a long span that covers
    # several later ones, not just direct neighbours.
    sorted_anns = sorted(well_formed, key=lambda x: (x[0], x[1]))
    furthest = None
    for ann in sorted_anns:
        if furthest is not None and furthest[1] > ann[0]:  # end[furthest] > start[ann]
            overlapping.append({
                'id': record.get('id'),
                'overlap': f'{furthest} overlaps with {ann}'
            })
        if furthest is None or ann[1] > furthest[1]:
            furthest = ann

# Print results
print("üìä Entity Type Distribution:")
print(f"{'Entity Type':<30} {'Count':>8}")
print("-" * 40)
for label, count in entity_counts.most_common():
    print(f"{label:<30} {count:8d}")

print(f"\nüîç Quality Checks:")
print(f"   Total records: {len(results)}")
print(f"   Records with annotations: {sum(1 for r in results if r.get('annotations'))}")
print(f"   Total entities: {sum(entity_counts.values())}")

if errors:
    print(f"\n‚ùå Found {len(errors)} errors:")
    for err in errors[:5]:
        print(f"   {err['id']}: {err['error']}")
    if len(errors) > 5:
        print(f"   ... and {len(errors) - 5} more")
else:
    print(f"\n‚úÖ No boundary errors found")

if overlapping:
    print(f"\n‚ö†Ô∏è  Found {len(overlapping)} overlapping spans:")
    for ovr in overlapping[:5]:
        print(f"   {ovr['id']}: {ovr['overlap']}")
    if len(overlapping) > 5:
        print(f"   ... and {len(overlapping) - 5} more")
else:
    print(f"\n‚úÖ No overlapping spans found")

# Overall quality. Both rates are findings per record, so a record with
# several problems counts several times.
error_rate = len(errors) / max(1, len(results))
overlap_rate = len(overlapping) / max(1, len(results))

if error_rate == 0 and overlap_rate == 0:
    quality = "üåü EXCELLENT"
elif error_rate < 0.01 and overlap_rate < 0.05:
    quality = "‚úÖ GOOD"
elif error_rate < 0.05 and overlap_rate < 0.10:
    quality = "‚ö†Ô∏è  FAIR"
else:
    quality = "‚ùå NEEDS REVIEW"

print(f"\nüéØ Overall Quality: {quality}")
print(f"   Error rate: {error_rate:.2%}")
print(f"   Overlap rate: {overlap_rate:.2%}")

## 10. Show Sample Annotations

In [None]:
# Display a few sample annotations
import random

samples = [r for r in results if r.get('annotations')]
if samples:
    print("üìã Sample Annotated Inscriptions:\n")

    # Up to three records, picked at random (different on every run).
    for i, record in enumerate(random.sample(samples, min(3, len(samples))), 1):
        # Model output is not guaranteed to carry a transcription; fall back
        # to an empty string rather than failing with a KeyError.
        transcription = record.get('transcription') or ''
        annotations = record['annotations']

        print("=" * 70)
        print(f"Sample {i}")
        print("=" * 70)
        print(f"ID: {record.get('id', 'N/A')}")
        print(f"Text: {transcription}")
        print(f"\nEntities ({len(annotations)}):")

        for ann in annotations[:15]:  # Max 15 entities
            is_span = (
                isinstance(ann, (list, tuple))
                and len(ann) == 3
                and isinstance(ann[0], int)
                and isinstance(ann[1], int)
            )
            if not is_span:
                # Show the raw value instead of crashing on unpacking/formatting.
                print(f"  (malformed annotation skipped: {ann!r})")
                continue
            start, end, label = ann
            entity_text = transcription[start:end]
            print(f"  [{start:3d}:{end:3d}] {str(label):25s} = '{entity_text}'")

        if len(annotations) > 15:
            print(f"  ... and {len(annotations) - 15} more")
        print()
else:
    print("No annotated samples to display")

## 11. Commit and Push Results

In [None]:
# Stage the annotation output and the raw download folder, commit them with a
# generated summary message and push the current branch to `origin`.
# Nothing is touched unless the user types exactly "yes" at the prompt.
import subprocess
from datetime import datetime

# Generate commit message
# Uses `stats` and `avg_entities` produced by the batch-annotation cell, so
# that cell must have run to completion in this kernel session.
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M")
commit_message = f"""Add Gemini-annotated inscriptions ({timestamp})

Annotated {stats['successful']} inscriptions using {GEMINI_MODEL}
- Total entities: {stats['total_entities']}
- Avg entities per inscription: {avg_entities:.1f}
- Success rate: {stats['successful']/stats['total']*100:.1f}%
- Search params: {SEARCH_PARAMS['year_from']}-{SEARCH_PARAMS['year_to']} CE

Files:
- {ANNOTATIONS_FILE}
- {DOWNLOAD_DIR}/ (raw EDH data)
"""

print("üìù Commit message:")
print("=" * 70)
print(commit_message)
print("=" * 70)
print()

# Ask for confirmation
# input() blocks until answered, so this cell needs an interactive session.
confirm = input("Commit and push these files? (yes/no): ").strip().lower()

if confirm == 'yes':
    try:
        # Check git status
        # Informational only: no check=True, so a failure here does not abort.
        print("\nüìä Git status:")
        result = subprocess.run(['git', 'status', '--short'], capture_output=True, text=True)
        print(result.stdout)

        # Add files
        # From here on every git call uses check=True: the first failing
        # command raises CalledProcessError and the remaining steps are skipped.
        print("\nüì¶ Adding files to git...")
        subprocess.run(['git', 'add', str(ANNOTATIONS_FILE)], check=True)
        subprocess.run(['git', 'add', str(DOWNLOAD_DIR)], check=True)
        print("   ‚úÖ Files staged")

        # Commit
        print("\nüíæ Creating commit...")
        subprocess.run(['git', 'commit', '-m', commit_message], check=True)
        print("   ‚úÖ Commit created")

        # Get current branch
        branch_result = subprocess.run(
            ['git', 'rev-parse', '--abbrev-ref', 'HEAD'],
            capture_output=True,
            text=True,
            check=True
        )
        current_branch = branch_result.stdout.strip()

        # Push
        print(f"\nüöÄ Pushing to origin/{current_branch}...")
        subprocess.run(['git', 'push', 'origin', current_branch], check=True)
        print("   ‚úÖ Pushed successfully")

        print("\n‚úÖ All done!")

    except subprocess.CalledProcessError as e:
        # Reported rather than re-raised; whatever was staged or committed
        # before the failing step stays in place.
        print(f"\n‚ùå Git operation failed: {e}")
        print("   You may need to commit and push manually")

else:
    # Any answer other than "yes" leaves the repository untouched.
    print("\n‚ÑπÔ∏è  Skipping git operations")
    print(f"   To commit later, run:")
    print(f"   git add {ANNOTATIONS_FILE} {DOWNLOAD_DIR}")
    print(f"   git commit -m 'Add Gemini annotations'")
    print(f"   git push")

## 12. Summary and Next Steps

In [None]:
# Final recap of the run plus suggested follow-up steps.
print("="*70)
print("üéâ WORKFLOW COMPLETE")
print("="*70)
print(f"\nüìä Results:")
# Count the files on disk instead of relying on a variable that the download
# cell only sets when the download succeeds.
print(f"   Inscriptions downloaded: {len(list(DOWNLOAD_DIR.glob('*.json')))}")
print(f"   Inscriptions annotated: {stats['successful']}")
print(f"   Total entities: {stats['total_entities']}")
print(f"   Output file: {ANNOTATIONS_FILE}")

print(f"\nüî¨ Next Steps:")
print(f"\n1. Review annotation quality:")
print(f"   python validate_annotations.py {ANNOTATIONS_FILE}")

print(f"\n2. Use in your training notebook:")
print(f"   INPUT_FILE = '{ANNOTATIONS_FILE}'")
print(f"   partition_data(INPUT_FILE, CLEAN_OUTPUT_FILE, FIX_OUTPUT_FILE)")

print(f"\n3. Compare with synthetic data:")
print(f"   - Train model A: synthetic data only (463 examples)")
print(f"   - Train model B: real data only ({stats['successful']} examples)")
print(f"   - Train model C: combined")
print(f"   - Evaluate all three on held-out real inscriptions")

print(f"\n4. If quality is good, annotate more:")
# SEARCH_PARAMS is defined under the "3. Download Inscription Data from EDH" heading.
print(f"   - Adjust SEARCH_PARAMS in section 3")
print(f"   - Download different time periods/regions")
print(f"   - Aim for 5000+ total inscriptions")

print("\n" + "="*70)