# Batch Judge Evaluation

Simple interface for running and monitoring Gemini API batch evaluation jobs with context caching.

In [7]:
# Setup
import sys
from pathlib import Path
import pandas as pd
import time
import warnings

project_root = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()
sys.path.insert(0, str(project_root))
sys.path.insert(1, str(project_root / "src"))

warnings.filterwarnings("ignore", category=UserWarning,
module="google.genai._common")

import json
from config.config import settings
from xai_pkg.evaluator.judgeLLM import JudgeLLM
from xai_pkg.evaluator.utils import save_batch_results, process_batch_results

print(f"✅ Project root: {project_root}")
print(f"✅ Config loaded")

✅ Project root: /Users/timo/Documents/Thesis/xai-credit-decisions
✅ Config loaded


## Configuration

In [8]:
# Load configuration
judge_model = settings.get("default_judge", "gemini-2.5-flash")
fallback_model = settings.get("judge_fallback", "llama3.2-3b")

# Initialize evaluator
judge = JudgeLLM(judge_name=judge_model)

## Submit Batch Job

**Data Sources:**
- **Explanations**: DynamoDB (primary) → JSON fallback  
- **Predictions Context**: JSON file (for age, income, etc.)
- Why both? Explanations only contain the text, not input data needed for evaluation

In [9]:
# Print available explanations
explanations = judge._load_explanations_from_dynamo()
print(len(explanations))

1200


In [10]:
# Submit batch evaluation job

NUM_EXPLANATIONS_LIMIT = None # Adjust as needed - start small to test
USE_CONTEXT_CACHING = True # Enables massive token savings
USE_DYNAMODB = True # Try DynamoDB first, fallback to JSON

if input(f"Do you want to submit a new batch job with {NUM_EXPLANATIONS_LIMIT} explanations? (y/n)") == "y":

    job_info = judge.submit_batch_job(
        predictions_file=project_root / "output" / "predictions" / "prediction_results.json",
        num_explanations=NUM_EXPLANATIONS_LIMIT,
        cache_system_prompt=USE_CONTEXT_CACHING,
        use_dynamo=USE_DYNAMODB,
        force_submit=False
    )
    print(job_info)

## Monitor Job Status

In [11]:
# Check job status
recent_jobs = judge.list_batch_jobs(limit=5)
jobs_df = pd.DataFrame(recent_jobs)


# judge.cancel_batch_job("batches/h4p7s1ddp2m0mzalzq77ejoex82w8gnlbzwh")

if not jobs_df.empty:
    display(jobs_df)

    job_status = judge.check_batch_status(jobs_df.iloc[0]['job_id'])
    print(f"Target job status: {job_status}")
else:
    print("No batch jobs found")

Unnamed: 0,job_id,display_name,state,create_time
0,batches/65s8qwgnmlqfjs5x8irnv6yktjl9ibt64ph0,credit-explanation-evaluation-1200-explanation...,JOB_STATE_SUCCEEDED,2025-08-29 15:15:05.178404+00:00
1,batches/x9hd8fajgq4fc6tbkznsakpyy0945v0kv1oa,credit-explanation-evaluation-1200-explanation...,JOB_STATE_SUCCEEDED,2025-08-28 17:11:48.961958+00:00
2,batches/h4p7s1ddp2m0mzalzq77ejoex82w8gnlbzwh,credit-explanation-evaluation-669-explanations...,JOB_STATE_CANCELLED,2025-08-28 17:10:07.882755+00:00
3,batches/p57vv3ftits6f8qe3ef4h2ji409o559u1229,credit-explanation-evaluation-40-explanations-...,JOB_STATE_SUCCEEDED,2025-08-28 12:20:53.221625+00:00
4,batches/33hspbyeegev3lr8n998mx7gygflsrm5tyxx,credit-explanation-evaluation-1200-explanation...,JOB_STATE_SUCCEEDED,2025-08-27 21:57:01.656648+00:00


Target job status: {'job_id': 'batches/65s8qwgnmlqfjs5x8irnv6yktjl9ibt64ph0', 'state': 'JOB_STATE_SUCCEEDED', 'create_time': '2025-08-29 15:15:05.178404+00:00', 'update_time': '2025-08-29 15:42:54.352836+00:00', 'request_count': 0}


## Get Batch Results for specific job

In [13]:
# Get batch results
target_job_id = "batches/65s8qwgnmlqfjs5x8irnv6yktjl9ibt64ph0" # Final evaluation ID

job_info = {
    'job_id': target_job_id,
    'request_count': 1200,
    'timestamp': int(time.time())
}

results = judge.get_batch_results(job_info)
print(f"Retrieved {len(results)} results")

if results:
    # Save results
    from xai_pkg.evaluator.utils import save_batch_results
    
    output_dir = project_root / "output" / "evaluations"
    saved_files = save_batch_results(results, job_info, output_dir, filename_prefix="batch_results", return_format="json")
    
    print("Files saved:")
    for file_type, filepath in saved_files.items():
        print(f"  {filepath}")
        
    # Quick stats
    print(f"\nSample result keys: {list(results[0].keys())}")

Retrieved 1199 results
Files saved:
  /Users/timo/Documents/Thesis/xai-credit-decisions/output/evaluations/batch_results_raw_65s8qwgnmlqfjs5x8irnv6yktjl9ibt64ph0.json

Sample result keys: ['evaluation_result', 'metadata', 'batch_index']
