In [42]:
# Load all result files from Sample-Harness_1
from glob import glob
import json
from collections import defaultdict

files = glob("results/Sample-Harness_1/run_*.json")
print(f"Total files found: {len(files)}")

Total files found: 160


In [43]:
# Group files by model
collection = defaultdict(list)
for file in files:
    with open(file, "r") as f:
        data = json.load(f)
    collection[str(data["model"] or "unknown")].append(file)

print("Files per model:")
for model, file_list in collection.items():
    print(f"  {model}: {len(file_list)}")

Files per model:
  gemini-2.5-pro: 40
  claude-sonnet-4-5: 40
  grok-4: 40
  gpt-5: 40


In [44]:
# Define success checker function
def did_model_succeed(f):
    with open(f, "r") as f:
        data = json.load(f)
    is_success = all(scenario["success"] for scenario in data["scenarios"][0]["verifiers"])
    return is_success


In [45]:
# Calculate total success rate per model
print(f"{'='*80}")
print(f"TOTAL SUCCESS RATES BY MODEL")
print(f"{'='*80}\n")

model_success = {}
for model, files_list in collection.items():
    total_runs = len(files_list)
    successful_runs = sum(1 for f in files_list if did_model_succeed(f))
    success_rate = (successful_runs / total_runs * 100) if total_runs > 0 else 0
    
    model_success[model] = {
        "total": total_runs,
        "success": successful_runs,
        "failed": total_runs - successful_runs,
        "success_rate": success_rate
    }
    
    print(f"{model}:")
    print(f"  Total runs: {total_runs}")
    print(f"  Successful: {successful_runs}")
    print(f"  Failed: {total_runs - successful_runs}")
    print(f"  Success rate: {success_rate:.2f}%")
    print()

model_success


TOTAL SUCCESS RATES BY MODEL

gemini-2.5-pro:
  Total runs: 40
  Successful: 18
  Failed: 22
  Success rate: 45.00%

claude-sonnet-4-5:
  Total runs: 40
  Successful: 40
  Failed: 0
  Success rate: 100.00%

grok-4:
  Total runs: 40
  Successful: 33
  Failed: 7
  Success rate: 82.50%

gpt-5:
  Total runs: 40
  Successful: 30
  Failed: 10
  Success rate: 75.00%



{'gemini-2.5-pro': {'total': 40,
  'success': 18,
  'failed': 22,
  'success_rate': 45.0},
 'claude-sonnet-4-5': {'total': 40,
  'success': 40,
  'failed': 0,
  'success_rate': 100.0},
 'grok-4': {'total': 40, 'success': 33, 'failed': 7, 'success_rate': 82.5},
 'gpt-5': {'total': 40, 'success': 30, 'failed': 10, 'success_rate': 75.0}}

In [46]:
# File naming convention and extraction function
import os
import re

def extract_test_info(filepath, model):
    """Extract test name and run number from filepath.
    
    Pattern: run_{test_name}-{model_name}-{run_number}.json
    Example: run_new_sys_task10_3_c1_p1_r8_v2_harness-claude-sonnet-4-5-1.json
    
    The challenge is that model names contain hyphens (grok-4, gpt-5, gemini-2.5-pro, claude-sonnet-4-5)
    """
    basename = os.path.basename(filepath)
    
    # Remove 'run_' prefix
    if basename.startswith('run_'):
        basename = basename[4:]
    
    # Remove '.json' suffix
    if basename.endswith('.json'):
        basename = basename[:-5]
    
    # Strategy: find the model name followed by a hyphen and single/double digit number
    # Escape special chars in model name for regex
    model_escaped = re.escape(model)
    
    # Pattern: anything, then model, then hyphen, then 1-2 digits at the end
    pattern = f'^(.+)-{model_escaped}-(\\d+)$'
    match = re.match(pattern, basename)
    
    if match:
        test_name = match.group(1)
        run_number = match.group(2)
        return test_name, run_number
    
    # Fallback: just extract last number
    match = re.match(r'(.+)-(\d+)$', basename)
    if match:
        test_name = match.group(1)
        run_number = match.group(2)
        return test_name, run_number
    
    return basename, "unknown"

# Test the extraction
print("Sample file name extraction (first 3 files per model):")
for model, files_list in list(collection.items())[:2]:
    print(f"\n{model}:")
    for f in files_list[:3]:
        test_name, run_num = extract_test_info(f, model)
        print(f"  {os.path.basename(f)}")
        print(f"    -> Test: {test_name}, Run: {run_num}")


Sample file name extraction (first 3 files per model):

gemini-2.5-pro:
  run_task4-google-gemini-2.5-pro-6.json
    -> Test: task4-google, Run: 6
  run_task2-google-gemini-2.5-pro-1.json
    -> Test: task2-google, Run: 1
  run_task5-google-gemini-2.5-pro-5.json
    -> Test: task5-google, Run: 5

claude-sonnet-4-5:
  run_task5-anthropic-claude-sonnet-4-5-2.json
    -> Test: task5-anthropic, Run: 2
  run_task4-anthropic-claude-sonnet-4-5-8.json
    -> Test: task4-anthropic, Run: 8
  run_task3-anthropic-claude-sonnet-4-5-3.json
    -> Test: task3-anthropic, Run: 3


In [47]:
# Group runs by task and analyze success per task per model
print(f"{'='*80}")
print(f"SUCCESS RATES PER TASK PER MODEL")
print(f"{'='*80}\n")

# Structure: task_results[model][task_name] = {"total": X, "success": Y, "runs": [...]}
def create_task_stats():
    return {"total": 0, "success": 0, "runs": []}

task_results = defaultdict(lambda: defaultdict(create_task_stats))

for model, files_list in collection.items():
    for file_path in files_list:
        test_name, run_num = extract_test_info(file_path, model)
        is_success = did_model_succeed(file_path)
        
        task_results[model][test_name]["total"] += 1
        if is_success:
            task_results[model][test_name]["success"] += 1
        task_results[model][test_name]["runs"].append({
            "run_number": run_num,
            "success": is_success,
            "file": file_path
        })

# Display summary
print(f"Summary: Found unique tasks per model:")
for model in task_results.keys():
    unique_tasks = len(task_results[model])
    print(f"  {model}: {unique_tasks} unique tasks")

print("\n" + "="*80)
print("Checking run counts per task:")
print("="*80 + "\n")

for model, tasks in task_results.items():
    print(f"\n{model}:")
    
    # Group by run count
    run_counts = defaultdict(int)
    for task_name, stats in tasks.items():
        run_counts[stats["total"]] += 1
    
    for count in sorted(run_counts.keys(), reverse=True):
        print(f"  {run_counts[count]} tasks with {count} run(s)")
    
    # Show examples of non-standard counts
    non_standard = []
    for task_name, stats in sorted(tasks.items()):
        if stats["total"] != 1:  # Assuming sample data has 1 run per task
            non_standard.append((task_name, stats["total"]))
    
    if non_standard:
        print(f"  \n  Examples of tasks with multiple runs:")
        for task_name, count in non_standard[:3]:
            print(f"    - {task_name}: {count} runs")


SUCCESS RATES PER TASK PER MODEL

Summary: Found unique tasks per model:
  gemini-2.5-pro: 5 unique tasks
  claude-sonnet-4-5: 5 unique tasks
  grok-4: 5 unique tasks
  gpt-5: 5 unique tasks

Checking run counts per task:


gemini-2.5-pro:
  5 tasks with 8 run(s)
  
  Examples of tasks with multiple runs:
    - task1-google: 8 runs
    - task2-google: 8 runs
    - task3-google: 8 runs

claude-sonnet-4-5:
  5 tasks with 8 run(s)
  
  Examples of tasks with multiple runs:
    - task1-anthropic: 8 runs
    - task2-anthropic: 8 runs
    - task3-anthropic: 8 runs

grok-4:
  5 tasks with 8 run(s)
  
  Examples of tasks with multiple runs:
    - task1-xai: 8 runs
    - task2-xai: 8 runs
    - task3-xai: 8 runs

gpt-5:
  5 tasks with 8 run(s)
  
  Examples of tasks with multiple runs:
    - task1-openai: 8 runs
    - task2-openai: 8 runs
    - task3-openai: 8 runs


In [48]:
# Detailed success rate per task per model
print(f"{'='*80}")
print(f"DETAILED TASK SUCCESS RATES")
print(f"{'='*80}\n")

for model in sorted(task_results.keys()):
    print(f"\n{'='*80}")
    print(f"{model.upper()}")
    print(f"{'='*80}\n")
    
    tasks = task_results[model]
    
    # Sort tasks by success rate (lowest first to highlight problems)
    sorted_tasks = sorted(tasks.items(), key=lambda x: (x[1]["success"] / x[1]["total"], x[0]))
    
    for task_name, stats in sorted_tasks:
        success_rate = (stats["success"] / stats["total"] * 100) if stats["total"] > 0 else 0
        
        # Use different symbols based on success rate
        if success_rate == 100:
            symbol = "✅"
        elif success_rate >= 50:
            symbol = "⚠️ "
        else:
            symbol = "❌"
        
        print(f"{symbol} {task_name}")
        print(f"   Success: {stats['success']}/{stats['total']} ({success_rate:.1f}%)")
        
        # Show which specific runs failed
        failed_runs = [r for r in stats["runs"] if not r["success"]]
        if failed_runs:
            failed_run_nums = [r["run_number"] for r in failed_runs]
            print(f"   Failed runs: {', '.join(failed_run_nums)}")
        print()


DETAILED TASK SUCCESS RATES


CLAUDE-SONNET-4-5

✅ task1-anthropic
   Success: 8/8 (100.0%)

✅ task2-anthropic
   Success: 8/8 (100.0%)

✅ task3-anthropic
   Success: 8/8 (100.0%)

✅ task4-anthropic
   Success: 8/8 (100.0%)

✅ task5-anthropic
   Success: 8/8 (100.0%)


GEMINI-2.5-PRO

❌ task2-google
   Success: 0/8 (0.0%)
   Failed runs: 1, 7, 6, 5, 4, 8, 3, 2

⚠️  task3-google
   Success: 4/8 (50.0%)
   Failed runs: 3, 6, 7, 1

⚠️  task4-google
   Success: 4/8 (50.0%)
   Failed runs: 6, 7, 2, 5

⚠️  task1-google
   Success: 5/8 (62.5%)
   Failed runs: 6, 4, 5

⚠️  task5-google
   Success: 5/8 (62.5%)
   Failed runs: 8, 3, 2


GPT-5

❌ task4-openai
   Success: 3/8 (37.5%)
   Failed runs: 1, 6, 5, 4, 2

⚠️  task1-openai
   Success: 5/8 (62.5%)
   Failed runs: 7, 8, 2

⚠️  task2-openai
   Success: 7/8 (87.5%)
   Failed runs: 7

⚠️  task5-openai
   Success: 7/8 (87.5%)
   Failed runs: 1

✅ task3-openai
   Success: 8/8 (100.0%)


GROK-4

⚠️  task1-xai
   Success: 5/8 (62.5%)
   Failed runs

In [49]:
# Export results to CSV for analysis
import csv

print(f"{'='*80}")
print(f"EXPORTING RESULTS TO CSV")
print(f"{'='*80}\n")

# Export 1: Summary by model
with open("sample_model_summary.csv", "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["Model", "Total Runs", "Successful", "Failed", "Success Rate %"])
    
    for model, stats in sorted(model_success.items()):
        writer.writerow([
            model,
            stats["total"],
            stats["success"],
            stats["failed"],
            f"{stats['success_rate']:.2f}"
        ])

print("✅ Exported: sample_model_summary.csv")

# Export 2: Detailed task results
with open("sample_task_results_detailed.csv", "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["Model", "Task", "Total Runs", "Successful", "Failed", "Success Rate %", "Failed Run Numbers"])
    
    for model in sorted(task_results.keys()):
        for task_name, stats in sorted(task_results[model].items()):
            success_rate = (stats["success"] / stats["total"] * 100) if stats["total"] > 0 else 0
            failed_runs = [r for r in stats["runs"] if not r["success"]]
            failed_run_nums = ", ".join([r["run_number"] for r in failed_runs])
            
            writer.writerow([
                model,
                task_name,
                stats["total"],
                stats["success"],
                stats["total"] - stats["success"],
                f"{success_rate:.1f}",
                failed_run_nums
            ])

print("✅ Exported: sample_task_results_detailed.csv")

# Export 3: Per-run details
with open("sample_run_details.csv", "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["Model", "Task", "Run Number", "Success", "File Path"])
    
    for model in sorted(task_results.keys()):
        for task_name, stats in sorted(task_results[model].items()):
            for run in stats["runs"]:
                writer.writerow([
                    model,
                    task_name,
                    run["run_number"],
                    "Yes" if run["success"] else "No",
                    run["file"]
                ])

print("✅ Exported: sample_run_details.csv")

print(f"\n{'='*80}")
print(f"EXPORT COMPLETE")
print(f"{'='*80}")
print(f"\nGenerated files:")
print(f"  1. sample_model_summary.csv - Overall model statistics")
print(f"  2. sample_task_results_detailed.csv - Success rates per task per model")
print(f"  3. sample_run_details.csv - Individual run details")


EXPORTING RESULTS TO CSV

✅ Exported: sample_model_summary.csv
✅ Exported: sample_task_results_detailed.csv
✅ Exported: sample_run_details.csv

EXPORT COMPLETE

Generated files:
  1. sample_model_summary.csv - Overall model statistics
  2. sample_task_results_detailed.csv - Success rates per task per model
  3. sample_run_details.csv - Individual run details


In [50]:
# Final Summary
print(f"{'='*80}")
print(f"SAMPLE DATA ANALYSIS SUMMARY")
print(f"{'='*80}\n")

print(f"Dataset: results/Sample-Harness_1/")
print(f"Total files analyzed: {len(files)}\n")

print(f"Models analyzed: {len(task_results)}")
for model in sorted(task_results.keys()):
    print(f"  - {model}")

print(f"\n{'='*80}")
print(f"SUCCESS RATES BY MODEL (sorted)")
print(f"{'='*80}\n")

for model, stats in sorted(model_success.items(), key=lambda x: x[1]['success_rate'], reverse=True):
    print(f"{model}: {stats['success_rate']:.2f}% ({stats['success']}/{stats['total']})")

print(f"\n{'='*80}")
print(f"KEY FINDINGS")
print(f"{'='*80}\n")

# Count tasks with issues per model
for model in sorted(task_results.keys()):
    failed_tasks = sum(1 for stats in task_results[model].values() if stats["success"] < stats["total"])
    total_tasks = len(task_results[model])
    print(f"{model}:")
    print(f"  - Total unique tasks: {total_tasks}")
    print(f"  - Tasks with at least one failure: {failed_tasks}")
    print(f"  - Perfect tasks (all runs passed): {total_tasks - failed_tasks}")
    print()

print(f"{'='*80}")
print(f"All CSV files have been exported with 'sample_' prefix.")
print(f"{'='*80}")


SAMPLE DATA ANALYSIS SUMMARY

Dataset: results/Sample-Harness_1/
Total files analyzed: 160

Models analyzed: 4
  - claude-sonnet-4-5
  - gemini-2.5-pro
  - gpt-5
  - grok-4

SUCCESS RATES BY MODEL (sorted)

claude-sonnet-4-5: 100.00% (40/40)
grok-4: 82.50% (33/40)
gpt-5: 75.00% (30/40)
gemini-2.5-pro: 45.00% (18/40)

KEY FINDINGS

claude-sonnet-4-5:
  - Total unique tasks: 5
  - Tasks with at least one failure: 0
  - Perfect tasks (all runs passed): 5

gemini-2.5-pro:
  - Total unique tasks: 5
  - Tasks with at least one failure: 5
  - Perfect tasks (all runs passed): 0

gpt-5:
  - Total unique tasks: 5
  - Tasks with at least one failure: 4
  - Perfect tasks (all runs passed): 1

grok-4:
  - Total unique tasks: 5
  - Tasks with at least one failure: 3
  - Perfect tasks (all runs passed): 2

All CSV files have been exported with 'sample_' prefix.


In [51]:
# Categorize prompts by difficulty based on failure rate per model
print(f"{'='*80}")
print(f"PROMPT DIFFICULTY CATEGORIZATION BY MODEL")
print(f"{'='*80}\n")

# Categories:
# Easy: 0-1 failures (close to perfect success)
# Medium: 2-4 failures (50-80% success range)
# Hard: 5+ failures (more failures than successes)

def categorize_difficulty(success_count, total_runs):
    """Categorize based on number of failures"""
    failures = total_runs - success_count
    if failures <= 1:
        return "easy"
    elif failures <= 4:
        return "medium"
    else:
        return "hard"

# Structure: difficulty_by_model[model][category] = [list of tasks]
difficulty_by_model = {}

for model in sorted(task_results.keys()):
    difficulty_by_model[model] = {
        "easy": [],
        "medium": [],
        "hard": []
    }
    
    for task_name, stats in task_results[model].items():
        category = categorize_difficulty(stats["success"], stats["total"])
        difficulty_by_model[model][category].append({
            "task": task_name,
            "success": stats["success"],
            "total": stats["total"],
            "failures": stats["total"] - stats["success"]
        })

# Display results
for model in sorted(difficulty_by_model.keys()):
    print(f"\n{'='*80}")
    print(f"{model.upper()}")
    print(f"{'='*80}\n")
    
    categories = difficulty_by_model[model]
    
    print(f"✅ EASY (0-1 failures): {len(categories['easy'])} prompts")
    print(f"⚠️  MEDIUM (2-4 failures): {len(categories['medium'])} prompts")
    print(f"❌ HARD (5+ failures): {len(categories['hard'])} prompts")
    
    total_prompts = len(categories['easy']) + len(categories['medium']) + len(categories['hard'])
    print(f"\nTotal prompts: {total_prompts}")
    
    if total_prompts > 0:
        easy_pct = len(categories['easy']) / total_prompts * 100
        medium_pct = len(categories['medium']) / total_prompts * 100
        hard_pct = len(categories['hard']) / total_prompts * 100
        
        print(f"\nDistribution:")
        print(f"  Easy: {easy_pct:.1f}%")
        print(f"  Medium: {medium_pct:.1f}%")
        print(f"  Hard: {hard_pct:.1f}%")
    
    # Show some examples from each category
    if categories['hard']:
        print(f"\n❌ HARD prompts (up to 5):")
        for item in sorted(categories['hard'], key=lambda x: x['failures'], reverse=True)[:5]:
            print(f"  - {item['task']}")
            print(f"    Failures: {item['failures']}/{item['total']}")
    
    if categories['medium']:
        print(f"\n⚠️  MEDIUM prompts (up to 3):")
        for item in sorted(categories['medium'], key=lambda x: x['failures'], reverse=True)[:3]:
            print(f"  - {item['task']}")
            print(f"    Failures: {item['failures']}/{item['total']}")

print(f"\n{'='*80}")
print(f"SUMMARY ACROSS ALL MODELS")
print(f"{'='*80}\n")

# Aggregate summary
summary_table = []
for model in sorted(difficulty_by_model.keys()):
    categories = difficulty_by_model[model]
    total = len(categories['easy']) + len(categories['medium']) + len(categories['hard'])
    summary_table.append({
        "model": model,
        "easy": len(categories['easy']),
        "medium": len(categories['medium']),
        "hard": len(categories['hard']),
        "total": total
    })

# Print as table
print(f"{'Model':<25} {'Easy':<10} {'Medium':<10} {'Hard':<10} {'Total':<10}")
print(f"{'-'*65}")
for row in summary_table:
    print(f"{row['model']:<25} {row['easy']:<10} {row['medium']:<10} {row['hard']:<10} {row['total']:<10}")


PROMPT DIFFICULTY CATEGORIZATION BY MODEL


CLAUDE-SONNET-4-5

✅ EASY (0-1 failures): 5 prompts
⚠️  MEDIUM (2-4 failures): 0 prompts
❌ HARD (5+ failures): 0 prompts

Total prompts: 5

Distribution:
  Easy: 100.0%
  Medium: 0.0%
  Hard: 0.0%

GEMINI-2.5-PRO

✅ EASY (0-1 failures): 0 prompts
⚠️  MEDIUM (2-4 failures): 4 prompts
❌ HARD (5+ failures): 1 prompts

Total prompts: 5

Distribution:
  Easy: 0.0%
  Medium: 80.0%
  Hard: 20.0%

❌ HARD prompts (up to 5):
  - task2-google
    Failures: 8/8

⚠️  MEDIUM prompts (up to 3):
  - task4-google
    Failures: 4/8
  - task3-google
    Failures: 4/8
  - task5-google
    Failures: 3/8

GPT-5

✅ EASY (0-1 failures): 3 prompts
⚠️  MEDIUM (2-4 failures): 1 prompts
❌ HARD (5+ failures): 1 prompts

Total prompts: 5

Distribution:
  Easy: 60.0%
  Medium: 20.0%
  Hard: 20.0%

❌ HARD prompts (up to 5):
  - task4-openai
    Failures: 5/8

⚠️  MEDIUM prompts (up to 3):
  - task1-openai
    Failures: 3/8

GROK-4

✅ EASY (0-1 failures): 3 prompts
⚠️  MEDIU

In [None]:
# Find tasks with perfect success rate per model
print(f"{'='*80}")
print(f"PERFECT TASKS ANALYSIS (All runs passing)")
print(f"{'='*80}\n")

perfect_tasks_by_model = {}

for model in sorted(task_results.keys()):
    perfect_tasks = []
    
    for task_name, stats in task_results[model].items():
        # Perfect means all runs succeeded
        if stats["success"] == stats["total"] and stats["total"] > 0:
            perfect_tasks.append(task_name)
    
    perfect_tasks_by_model[model] = perfect_tasks
    
    print(f"\n{model}:")
    print(f"  Tasks with perfect success: {len(perfect_tasks)}/{len(task_results[model])}")
    if len(task_results[model]) > 0:
        print(f"  Percentage: {len(perfect_tasks)/len(task_results[model])*100:.1f}%")

# Summary table
print(f"\n{'='*80}")
print(f"SUMMARY TABLE")
print(f"{'='*80}\n")

print(f"{'Model':<25} {'Perfect Tasks':<20} {'Total Tasks':<15} {'Perfect %':<15}")
print(f"{'-'*75}")

for model in sorted(perfect_tasks_by_model.keys()):
    perfect_count = len(perfect_tasks_by_model[model])
    total_count = len(task_results[model])
    perfect_pct = perfect_count / total_count * 100 if total_count > 0 else 0
    
    print(f"{model:<25} {perfect_count:<20} {total_count:<15} {perfect_pct:.1f}%")

# Compare which tasks are perfect across all models
print(f"\n{'='*80}")
print(f"TASKS PERFECT FOR ALL MODELS")
print(f"{'='*80}\n")

all_tasks_set = set()
for model in task_results.keys():
    all_tasks_set.update(task_results[model].keys())

perfect_for_all = []
for task in all_tasks_set:
    is_perfect_for_all = True
    
    for model in task_results.keys():
        if task in task_results[model]:
            stats = task_results[model][task]
            if stats["success"] != stats["total"] or stats["total"] == 0:
                is_perfect_for_all = False
                break
        else:
            is_perfect_for_all = False
            break
    
    if is_perfect_for_all:
        perfect_for_all.append(task)

print(f"Tasks with perfect success across ALL models: {len(perfect_for_all)}/{len(all_tasks_set)}")
if len(all_tasks_set) > 0:
    print(f"Percentage: {len(perfect_for_all)/len(all_tasks_set)*100:.1f}%\n")

if perfect_for_all:
    print(f"These tasks are perfect for all models:")
    for i, task in enumerate(sorted(perfect_for_all), 1):
        print(f"  {i}. {task}")
else:
    print("No tasks are perfect across all models.")

# Find tasks that are perfect for some models but not all
print(f"\n{'='*80}")
print(f"TASKS WITH MIXED PERFECT SCORES")
print(f"{'='*80}\n")

mixed_perfect = {}
for task in all_tasks_set:
    perfect_models = []
    imperfect_models = []
    
    for model in sorted(task_results.keys()):
        if task in task_results[model]:
            stats = task_results[model][task]
            if stats["success"] == stats["total"] and stats["total"] > 0:
                perfect_models.append(model)
            else:
                imperfect_models.append(model)
    
    # Only include if some (but not all) models have perfect score
    if perfect_models and imperfect_models:
        mixed_perfect[task] = {
            "perfect": perfect_models,
            "imperfect": imperfect_models
        }

print(f"Tasks with mixed perfect scores: {len(mixed_perfect)}\n")

if mixed_perfect:
    # Show top examples (tasks where most models are perfect)
    mixed_sorted = sorted(mixed_perfect.items(), 
                         key=lambda x: len(x[1]["perfect"]), 
                         reverse=True)
    
    print(f"Top examples (most models perfect):")
    for i, (task, data) in enumerate(mixed_sorted[:10], 1):
        print(f"\n{i}. {task}")
        print(f"   Perfect for ({len(data['perfect'])}/{len(task_results)}): {', '.join(data['perfect'])}")
        print(f"   Not perfect for: {', '.join(data['imperfect'])}")
else:
    print("No tasks with mixed perfect scores found.")


PERFECT TASKS ANALYSIS (All runs passing)


claude-sonnet-4-5:
  Tasks with perfect success: 5/5
  Percentage: 100.0%

gemini-2.5-pro:
  Tasks with perfect success: 0/5
  Percentage: 0.0%

gpt-5:
  Tasks with perfect success: 1/5
  Percentage: 20.0%

grok-4:
  Tasks with perfect success: 2/5
  Percentage: 40.0%

SUMMARY TABLE

Model                     Perfect Tasks        Total Tasks     Perfect %      
---------------------------------------------------------------------------
claude-sonnet-4-5         5                    5               100.0%
gemini-2.5-pro            0                    5               0.0%
gpt-5                     1                    5               20.0%
grok-4                    2                    5               40.0%

TASKS PERFECT FOR ALL MODELS

Tasks with perfect success across ALL models: 0/20
Percentage: 0.0%

No tasks are perfect across all models.

TASKS WITH MIXED PERFECT SCORES

Tasks with mixed perfect scores: 0

No tasks with mixed perfect sc

In [None]:
# Find tasks where ALL runs fail (complete failure)
print(f"{'='*80}")
print(f"COMPLETE FAILURE ANALYSIS (All runs failing)")
print(f"{'='*80}\n")

failed_tasks_by_model = {}

for model in sorted(task_results.keys()):
    failed_tasks = []
    
    for task_name, stats in task_results[model].items():
        # Complete failure means zero successes
        if stats["success"] == 0 and stats["total"] > 0:
            failed_tasks.append({
                "task": task_name,
                "runs": stats["total"]
            })
    
    failed_tasks_by_model[model] = failed_tasks
    
    print(f"\n{model}:")
    print(f"  Tasks with complete failure: {len(failed_tasks)}/{len(task_results[model])}")
    if len(task_results[model]) > 0:
        print(f"  Percentage: {len(failed_tasks)/len(task_results[model])*100:.1f}%")
    
    if failed_tasks:
        print(f"\n  Failed tasks:")
        for item in sorted(failed_tasks, key=lambda x: x['runs'], reverse=True):
            print(f"    - {item['task']}")
            print(f"      All {item['runs']} run(s) failed")

# Summary table
print(f"\n{'='*80}")
print(f"COMPLETE FAILURE SUMMARY TABLE")
print(f"{'='*80}\n")

print(f"{'Model':<25} {'Failed Tasks':<20} {'Total Tasks':<15} {'Failure %':<15}")
print(f"{'-'*75}")

for model in sorted(failed_tasks_by_model.keys()):
    failed_count = len(failed_tasks_by_model[model])
    total_count = len(task_results[model])
    failed_pct = failed_count / total_count * 100 if total_count > 0 else 0
    
    print(f"{model:<25} {failed_count:<20} {total_count:<15} {failed_pct:.1f}%")

# Find tasks that fail for ALL models
print(f"\n{'='*80}")
print(f"TASKS THAT FAIL FOR ALL MODELS")
print(f"{'='*80}\n")

all_tasks_set = set()
for model in task_results.keys():
    all_tasks_set.update(task_results[model].keys())

failed_for_all = []
for task in all_tasks_set:
    is_failed_for_all = True
    
    for model in task_results.keys():
        if task in task_results[model]:
            stats = task_results[model][task]
            if stats["success"] > 0 or stats["total"] == 0:
                is_failed_for_all = False
                break
        else:
            is_failed_for_all = False
            break
    
    if is_failed_for_all:
        failed_for_all.append(task)

print(f"Tasks that fail across ALL models: {len(failed_for_all)}/{len(all_tasks_set)}")
if len(all_tasks_set) > 0:
    print(f"Percentage: {len(failed_for_all)/len(all_tasks_set)*100:.1f}%\n")

if failed_for_all:
    print(f"⚠️  These tasks fail for ALL models:")
    for i, task in enumerate(sorted(failed_for_all), 1):
        print(f"  {i}. {task}")
        # Show how many runs failed for each model
        print(f"     Failures per model:")
        for model in sorted(task_results.keys()):
            if task in task_results[model]:
                total = task_results[model][task]["total"]
                print(f"       - {model}: {total}/{total} runs failed")
else:
    print("✅ No tasks fail across all models.")

# Find tasks that fail for some models
print(f"\n{'='*80}")
print(f"TASKS WITH MIXED FAILURE PATTERNS")
print(f"{'='*80}\n")

mixed_failure = {}
for task in all_tasks_set:
    failed_models = []
    passed_models = []
    
    for model in sorted(task_results.keys()):
        if task in task_results[model]:
            stats = task_results[model][task]
            if stats["success"] == 0 and stats["total"] > 0:
                failed_models.append(model)
            elif stats["success"] > 0:
                passed_models.append(model)
    
    # Only include if some (but not all) models have complete failure
    if failed_models and passed_models:
        mixed_failure[task] = {
            "failed": failed_models,
            "passed": passed_models
        }

print(f"Tasks with mixed failure patterns: {len(mixed_failure)}\n")

if mixed_failure:
    # Show tasks that fail for most models
    mixed_sorted = sorted(mixed_failure.items(), 
                         key=lambda x: len(x[1]["failed"]), 
                         reverse=True)
    
    print(f"Tasks that fail for multiple models (showing up to 10):")
    for i, (task, data) in enumerate(mixed_sorted[:10], 1):
        print(f"\n{i}. {task}")
        print(f"   ❌ Failed for ({len(data['failed'])}/{len(task_results)}): {', '.join(data['failed'])}")
        print(f"   ✅ Has successes for: {', '.join(data['passed'])}")
else:
    print("No tasks with mixed failure patterns found.")


COMPLETE FAILURE ANALYSIS (All runs failing)


claude-sonnet-4-5:
  Tasks with complete failure: 0/5
  Percentage: 0.0%

gemini-2.5-pro:
  Tasks with complete failure: 1/5
  Percentage: 20.0%

  Failed tasks:
    - task2-google
      All 8 run(s) failed

gpt-5:
  Tasks with complete failure: 0/5
  Percentage: 0.0%

grok-4:
  Tasks with complete failure: 0/5
  Percentage: 0.0%

COMPLETE FAILURE SUMMARY TABLE

Model                     Failed Tasks         Total Tasks     Failure %      
---------------------------------------------------------------------------
claude-sonnet-4-5         0                    5               0.0%
gemini-2.5-pro            1                    5               20.0%
gpt-5                     0                    5               0.0%
grok-4                    0                    5               0.0%

TASKS THAT FAIL FOR ALL MODELS

Tasks that fail across ALL models: 0/20
Percentage: 0.0%

✅ No tasks fail across all models.

TASKS WITH MIXED FAILURE PATTERNS

In [None]:
# Export additional analysis to CSV
print(f"{'='*80}")
print(f"EXPORTING ADDITIONAL ANALYSIS TO CSV")
print(f"{'='*80}\n")

# Export 1: Difficulty categorization summary
with open("sample_difficulty_summary.csv", "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["Model", "Easy (0-1 failures)", "Medium (2-4 failures)", "Hard (5+ failures)", "Total Prompts", "Easy %", "Medium %", "Hard %"])
    
    for model in sorted(difficulty_by_model.keys()):
        categories = difficulty_by_model[model]
        total = len(categories['easy']) + len(categories['medium']) + len(categories['hard'])
        
        easy_pct = len(categories['easy']) / total * 100 if total > 0 else 0
        medium_pct = len(categories['medium']) / total * 100 if total > 0 else 0
        hard_pct = len(categories['hard']) / total * 100 if total > 0 else 0
        
        writer.writerow([
            model,
            len(categories['easy']),
            len(categories['medium']),
            len(categories['hard']),
            total,
            f"{easy_pct:.1f}",
            f"{medium_pct:.1f}",
            f"{hard_pct:.1f}"
        ])

print("✅ Exported: sample_difficulty_summary.csv")

# Export 2: Detailed difficulty per task
with open("sample_difficulty_detailed.csv", "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["Model", "Task", "Difficulty", "Successes", "Failures", "Total Runs", "Success Rate %"])
    
    for model in sorted(difficulty_by_model.keys()):
        for category in ['easy', 'medium', 'hard']:
            for item in sorted(difficulty_by_model[model][category], key=lambda x: x['failures'], reverse=True):
                success_rate = (item['success'] / item['total'] * 100) if item['total'] > 0 else 0
                writer.writerow([
                    model,
                    item['task'],
                    category.upper(),
                    item['success'],
                    item['failures'],
                    item['total'],
                    f"{success_rate:.1f}"
                ])

print("✅ Exported: sample_difficulty_detailed.csv")

# Export 3: Perfect tasks summary
with open("sample_perfect_tasks.csv", "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["Model", "Perfect Tasks", "Total Tasks", "Perfect %", "Imperfect Tasks"])
    
    for model in sorted(perfect_tasks_by_model.keys()):
        perfect_count = len(perfect_tasks_by_model[model])
        total_count = len(task_results[model])
        imperfect_count = total_count - perfect_count
        perfect_pct = perfect_count / total_count * 100 if total_count > 0 else 0
        
        writer.writerow([
            model,
            perfect_count,
            total_count,
            f"{perfect_pct:.1f}",
            imperfect_count
        ])

print("✅ Exported: sample_perfect_tasks.csv")

# Export 4: Perfect tasks list
with open("sample_perfect_tasks_list.csv", "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["Model", "Task Name"])
    
    for model in sorted(perfect_tasks_by_model.keys()):
        for task in sorted(perfect_tasks_by_model[model]):
            writer.writerow([model, task])

print("✅ Exported: sample_perfect_tasks_list.csv")

# Export 5: Complete failure summary
with open("sample_complete_failures.csv", "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["Model", "Failed Tasks", "Total Tasks", "Failure %"])
    
    for model in sorted(failed_tasks_by_model.keys()):
        failed_count = len(failed_tasks_by_model[model])
        total_count = len(task_results[model])
        failed_pct = failed_count / total_count * 100 if total_count > 0 else 0
        
        writer.writerow([
            model,
            failed_count,
            total_count,
            f"{failed_pct:.1f}"
        ])

print("✅ Exported: sample_complete_failures.csv")

# Export 6: Failed tasks list
with open("sample_failed_tasks_list.csv", "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["Model", "Task Name", "Total Runs Failed"])
    
    for model in sorted(failed_tasks_by_model.keys()):
        for item in sorted(failed_tasks_by_model[model], key=lambda x: x['runs'], reverse=True):
            writer.writerow([model, item['task'], item['runs']])

print("✅ Exported: sample_failed_tasks_list.csv")

# Export 7: Universal perfect and failed tasks
with open("sample_universal_analysis.csv", "w", newline='') as f:
    writer = csv.writer(f)
    
    writer.writerow(["Category", "Task Name", "Details"])
    
    writer.writerow(["=== PERFECT FOR ALL MODELS ===", "", ""])
    for task in sorted(perfect_for_all):
        writer.writerow(["Perfect for all", task, "All models succeeded"])
    
    writer.writerow(["", "", ""])
    writer.writerow(["=== FAILED FOR ALL MODELS ===", "", ""])
    for task in sorted(failed_for_all):
        writer.writerow(["Failed for all", task, "All models failed"])
    
    writer.writerow(["", "", ""])
    writer.writerow(["=== MIXED PATTERNS ===", "", ""])
    writer.writerow(["Type", "Task", "Perfect Models", "Imperfect/Failed Models"])
    
    for task, data in mixed_sorted:
        writer.writerow([
            "Mixed perfect",
            task,
            ", ".join(data["perfect"]),
            ", ".join(data["imperfect"])
        ])

print("✅ Exported: sample_universal_analysis.csv")

print(f"\n{'='*80}")
print(f"ALL EXPORTS COMPLETE")
print(f"{'='*80}\n")

print(f"Generated additional analysis files:")
print(f"  1. sample_difficulty_summary.csv - Difficulty distribution per model")
print(f"  2. sample_difficulty_detailed.csv - Detailed difficulty categorization")
print(f"  3. sample_perfect_tasks.csv - Perfect task summary")
print(f"  4. sample_perfect_tasks_list.csv - List of all perfect tasks")
print(f"  5. sample_complete_failures.csv - Complete failure summary")
print(f"  6. sample_failed_tasks_list.csv - List of all failed tasks")
print(f"  7. sample_universal_analysis.csv - Universal patterns across models")


EXPORTING ADDITIONAL ANALYSIS TO CSV

✅ Exported: sample_difficulty_summary.csv
✅ Exported: sample_difficulty_detailed.csv
✅ Exported: sample_perfect_tasks.csv
✅ Exported: sample_perfect_tasks_list.csv
✅ Exported: sample_complete_failures.csv
✅ Exported: sample_failed_tasks_list.csv


NameError: name 'mixed_sorted' is not defined

In [None]:
# Comprehensive Final Summary
print(f"{'='*80}")
print(f"COMPREHENSIVE SAMPLE DATA ANALYSIS SUMMARY")
print(f"{'='*80}\n")

print(f"Dataset: results/Sample-Harness_1/")
print(f"Total files analyzed: {len(files)}\n")

print(f"Models analyzed: {len(task_results)}")
for model in sorted(task_results.keys()):
    print(f"  - {model}")

print(f"\n{'='*80}")
print(f"SUCCESS RATES BY MODEL (sorted)")
print(f"{'='*80}\n")

for model, stats in sorted(model_success.items(), key=lambda x: x[1]['success_rate'], reverse=True):
    print(f"{model}: {stats['success_rate']:.2f}% ({stats['success']}/{stats['total']})")

print(f"\n{'='*80}")
print(f"KEY FINDINGS BY MODEL")
print(f"{'='*80}\n")

for model in sorted(task_results.keys()):
    total_tasks = len(task_results[model])
    failed_tasks = sum(1 for stats in task_results[model].values() if stats["success"] < stats["total"])
    perfect_tasks = len(perfect_tasks_by_model[model])
    complete_failures = len(failed_tasks_by_model[model])
    
    print(f"{model}:")
    print(f"  - Total unique tasks: {total_tasks}")
    print(f"  - Perfect tasks (100% success): {perfect_tasks}")
    print(f"  - Tasks with partial failures: {failed_tasks - complete_failures}")
    print(f"  - Complete failures (0% success): {complete_failures}")
    
    # Difficulty breakdown
    easy = len(difficulty_by_model[model]['easy'])
    medium = len(difficulty_by_model[model]['medium'])
    hard = len(difficulty_by_model[model]['hard'])
    print(f"  - Difficulty: Easy={easy}, Medium={medium}, Hard={hard}")
    print()

print(f"{'='*80}")
print(f"UNIVERSAL PATTERNS")
print(f"{'='*80}\n")

print(f"Tasks perfect for ALL models: {len(perfect_for_all)}/{len(all_tasks_set)}")
print(f"Tasks failed for ALL models: {len(failed_for_all)}/{len(all_tasks_set)}")
print(f"Tasks with mixed patterns: {len(mixed_perfect)}")

print(f"\n{'='*80}")
print(f"EXPORTED CSV FILES (all with 'sample_' prefix)")
print(f"{'='*80}\n")

print(f"Basic Analysis:")
print(f"  1. sample_model_summary.csv - Overall model statistics")
print(f"  2. sample_task_results_detailed.csv - Per-task success rates")
print(f"  3. sample_run_details.csv - Individual run details")

print(f"\nDifficulty Analysis:")
print(f"  4. sample_difficulty_summary.csv - Difficulty distribution per model")
print(f"  5. sample_difficulty_detailed.csv - Detailed difficulty categorization")

print(f"\nPerfect Tasks Analysis:")
print(f"  6. sample_perfect_tasks.csv - Perfect task summary")
print(f"  7. sample_perfect_tasks_list.csv - List of all perfect tasks")

print(f"\nFailure Analysis:")
print(f"  8. sample_complete_failures.csv - Complete failure summary")
print(f"  9. sample_failed_tasks_list.csv - List of all failed tasks")

print(f"\nCross-Model Patterns:")
print(f"  10. sample_universal_analysis.csv - Universal patterns across models")

print(f"\n{'='*80}")
print(f"✅ ANALYSIS COMPLETE!")
print(f"{'='*80}")
