In [59]:
from glob import glob

files = glob("results/batch_*/run_*.json")
len(files)

1512

In [61]:
files_extra = glob("results/redo/**/run_*.json")
len(files_extra)

88

In [62]:
total_files = files + files_extra
len(total_files)

1600

In [63]:
import json
from collections import defaultdict
collection = defaultdict(list)
for file in total_files:
    with open(file, "r") as f:
        data = json.load(f)
    collection[str(data["model"] or "unknown")].append(file)

In [64]:
for model, file in collection.items():
    print(model, len(file))
    # break

grok-4 400
gpt-5 400
gemini-2.5-pro 400
claude-sonnet-4-5 400


In [66]:
def did_model_succeed(f):
    with open(f, "r") as f:
        data = json.load(f)
    is_success = all(scenario["success"] for scenario in data["scenarios"][0]["verifiers"])
    return is_success

In [67]:
# Calculate total success rate per model
print(f"{'='*80}")
print(f"TOTAL SUCCESS RATES BY MODEL")
print(f"{'='*80}\n")

model_success = {}
for model, files_list in collection.items():
    total_runs = len(files_list)
    successful_runs = sum(1 for f in files_list if did_model_succeed(f))
    success_rate = (successful_runs / total_runs * 100) if total_runs > 0 else 0
    
    model_success[model] = {
        "total": total_runs,
        "success": successful_runs,
        "failed": total_runs - successful_runs,
        "success_rate": success_rate
    }
    
    print(f"{model}:")
    print(f"  Total runs: {total_runs}")
    print(f"  Successful: {successful_runs}")
    print(f"  Failed: {total_runs - successful_runs}")
    print(f"  Success rate: {success_rate:.2f}%")
    print()

model_success



TOTAL SUCCESS RATES BY MODEL

grok-4:
  Total runs: 400
  Successful: 359
  Failed: 41
  Success rate: 89.75%

gpt-5:
  Total runs: 400
  Successful: 319
  Failed: 81
  Success rate: 79.75%

gemini-2.5-pro:
  Total runs: 400
  Successful: 299
  Failed: 101
  Success rate: 74.75%

claude-sonnet-4-5:
  Total runs: 400
  Successful: 361
  Failed: 39
  Success rate: 90.25%



{'grok-4': {'total': 400, 'success': 359, 'failed': 41, 'success_rate': 89.75},
 'gpt-5': {'total': 400, 'success': 319, 'failed': 81, 'success_rate': 79.75},
 'gemini-2.5-pro': {'total': 400,
  'success': 299,
  'failed': 101,
  'success_rate': 74.75},
 'claude-sonnet-4-5': {'total': 400,
  'success': 361,
  'failed': 39,
  'success_rate': 90.25}}

In [69]:
# Understand the file naming convention for multiple runs
import os

print(f"{'='*80}")
print(f"UNDERSTANDING FILE NAMING CONVENTION")
print(f"{'='*80}\n")

# Sample some files to understand the pattern
sample_files = collection["grok-4"][:10]
print("Sample file names (grok-4):")
for f in sample_files:
    basename = os.path.basename(f)
    print(f"  {basename}")

print("\n" + "="*80)
print("PATTERN ANALYSIS")
print("="*80 + "\n")

# The pattern appears to be: run_<test_name>-<model>-<run_number>.json
# Let's verify this and extract the test name

def extract_test_info(filepath, model):
    """Extract test name and run number from filepath.
    
    Pattern: run_{test_name}-{model_name}-{run_number}.json
    Example: run_new_sys_task10_3_c1_p1_r8_v2_harness-claude-sonnet-4-5-1.json
    
    The challenge is that model names contain hyphens (grok-4, gpt-5, gemini-2.5-pro, claude-sonnet-4-5)
    """
    basename = os.path.basename(filepath)
    
    # Remove 'run_' prefix
    if basename.startswith('run_'):
        basename = basename[4:]
    
    # Remove '.json' suffix
    if basename.endswith('.json'):
        basename = basename[:-5]
    
    # The pattern is: {test_name}-{model}-{run_number}
    # We know the model, so we can split on it
    # But we need to be careful because model might be in the test name too
    
    # Strategy: find the model name followed by a hyphen and single/double digit number
    import re
    
    # Escape special chars in model name for regex
    model_escaped = re.escape(model)
    
    # Pattern: anything, then model, then hyphen, then 1-2 digits at the end
    pattern = f'^(.+)-{model_escaped}-(\\d+)$'
    match = re.match(pattern, basename)
    
    if match:
        test_name = match.group(1)
        run_number = match.group(2)
        return test_name, run_number
    
    # Fallback: just extract last number
    match = re.match(r'(.+)-(\d+)$', basename)
    if match:
        test_name = match.group(1)
        run_number = match.group(2)
        return test_name, run_number
    
    return basename, "unknown"

# Test the extraction
print("Testing extraction on sample files:")
for f in sample_files[:3]:
    test_name, run_num = extract_test_info(f, "grok-4")
    print(f"  Test: {test_name}, Run: {run_num}")



UNDERSTANDING FILE NAMING CONVENTION

Sample file names (grok-4):
  run_task6_2_c1_p1_r8_v1_gpt5_v1_harness-grok-4-1.json
  run_task24_8_c1_p1_r8_v1_gpt_v1_harness-grok-4-2.json
  run_task21_10_c1_p1_r8_v6_harness-grok-4-6.json
  run_task2_7_c1_p1_r8_v2_harness-grok-4-7.json
  run_task10_7_c1_p1_r8_v3_harness-grok-4-6.json
  run_task26_8_c1_p1_r8_v1_gpt_v1_harness-grok-4-3.json
  run_new_sys_task_task16_6_c1_p1_r8_v3_harness-grok-4-7.json
  run_task7_2_c1_p1_r8_v9_gpt5_v1_harness-grok-4-5.json
  run_task7_2_c1_p1_r8_v9_gpt5_v1_harness-grok-4-4.json
  run_new_sys_task_task16_6_c1_p1_r8_v3_harness-grok-4-6.json

PATTERN ANALYSIS

Testing extraction on sample files:
  Test: task6_2_c1_p1_r8_v1_gpt5_v1_harness, Run: 1
  Test: task24_8_c1_p1_r8_v1_gpt_v1_harness, Run: 2
  Test: task21_10_c1_p1_r8_v6_harness, Run: 6


In [70]:
# Group runs by task and analyze success per task per model
print(f"{'='*80}")
print(f"SUCCESS RATES PER TASK PER MODEL")
print(f"{'='*80}\n")

# Structure: task_results[model][task_name] = {"total": X, "success": Y, "runs": [...]}
def create_task_stats():
    return {"total": 0, "success": 0, "runs": []}

task_results = defaultdict(lambda: defaultdict(create_task_stats))

for model, files_list in collection.items():
    for file_path in files_list:
        test_name, run_num = extract_test_info(file_path, model)
        is_success = did_model_succeed(file_path)
        
        task_results[model][test_name]["total"] += 1
        if is_success:
            task_results[model][test_name]["success"] += 1
        task_results[model][test_name]["runs"].append({
            "run_number": run_num,
            "success": is_success,
            "file": file_path
        })

# Display summary
print(f"Summary: Found unique tasks per model:")
for model in task_results.keys():
    unique_tasks = len(task_results[model])
    print(f"  {model}: {unique_tasks} unique tasks")

print("\n" + "="*80)
print("Verifying 8 runs per task:")
print("="*80 + "\n")

for model, tasks in task_results.items():
    print(f"\n{model}:")
    non_eight_count = 0
    for task_name, stats in sorted(tasks.items()):
        if stats["total"] != 8:
            non_eight_count += 1
            print(f"  ‚ö†Ô∏è  {task_name}: {stats['total']} runs (expected 8)")
    
    if non_eight_count == 0:
        print(f"  ‚úÖ All tasks have exactly 8 runs")
    else:
        print(f"  ‚ùå {non_eight_count} tasks don't have 8 runs")



SUCCESS RATES PER TASK PER MODEL

Summary: Found unique tasks per model:
  grok-4: 50 unique tasks
  gpt-5: 50 unique tasks
  gemini-2.5-pro: 50 unique tasks
  claude-sonnet-4-5: 50 unique tasks

Verifying 8 runs per task:


grok-4:
  ‚úÖ All tasks have exactly 8 runs

gpt-5:
  ‚úÖ All tasks have exactly 8 runs

gemini-2.5-pro:
  ‚úÖ All tasks have exactly 8 runs

claude-sonnet-4-5:
  ‚úÖ All tasks have exactly 8 runs


In [71]:
# Detailed success rate per task per model
print(f"{'='*80}")
print(f"DETAILED TASK SUCCESS RATES")
print(f"{'='*80}\n")

for model in sorted(task_results.keys()):
    print(f"\n{'='*80}")
    print(f"{model.upper()}")
    print(f"{'='*80}\n")
    
    tasks = task_results[model]
    
    # Sort tasks by success rate (lowest first to highlight problems)
    sorted_tasks = sorted(tasks.items(), key=lambda x: (x[1]["success"] / x[1]["total"], x[0]))
    
    for task_name, stats in sorted_tasks:
        success_rate = (stats["success"] / stats["total"] * 100) if stats["total"] > 0 else 0
        
        # Use different symbols based on success rate
        if success_rate == 100:
            symbol = "‚úÖ"
        elif success_rate >= 50:
            symbol = "‚ö†Ô∏è "
        else:
            symbol = "‚ùå"
        
        print(f"{symbol} {task_name}")
        print(f"   Success: {stats['success']}/{stats['total']} ({success_rate:.1f}%)")
        
        # Show which specific runs failed
        failed_runs = [r for r in stats["runs"] if not r["success"]]
        if failed_runs:
            failed_run_nums = [r["run_number"] for r in failed_runs]
            print(f"   Failed runs: {', '.join(failed_run_nums)}")
        print()



DETAILED TASK SUCCESS RATES


CLAUDE-SONNET-4-5

‚ùå new_sys_task17_9_c1_p1_r8_v4_harness
   Success: 0/8 (0.0%)
   Failed runs: 1, 6, 7, 8, 4, 5, 2, 3

‚ùå new_sys_task6_4_c1_p1_r8_v3_harness
   Success: 0/8 (0.0%)
   Failed runs: 3, 2, 5, 4, 8, 7, 6, 1

‚ùå new_sys_task9_7_c1_p1_r8_v3_harness
   Success: 0/8 (0.0%)
   Failed runs: 1, 6, 7, 4, 8, 5, 2, 3

‚ö†Ô∏è  new_sys_task6_6_c1_p1_r8_v3_harness
   Success: 4/8 (50.0%)
   Failed runs: 8, 3, 1, 7

‚ö†Ô∏è  new_sys_task2_1_c1_p1_r8_v1_claude_v1_harness
   Success: 5/8 (62.5%)
   Failed runs: 1, 5, 4

‚ö†Ô∏è  new_sys_task2_8_c1_p1_r8_v2_harness
   Success: 6/8 (75.0%)
   Failed runs: 1, 8

‚ö†Ô∏è  new_sys_task5_3_c1_p1_r8_v1_harness
   Success: 6/8 (75.0%)
   Failed runs: 2, 7

‚ö†Ô∏è  claude-sonnet-4-5
   Success: 7/8 (87.5%)
   Failed runs: 2

‚ö†Ô∏è  new_sys_task15_3_c1_p1_r8_v1_harness
   Success: 7/8 (87.5%)
   Failed runs: 2

‚ö†Ô∏è  new_sys_task1_8_c1_p1_r8_v4_gpt5_v2_harness
   Success: 7/8 (87.5%)
   Failed runs: 7

‚ö†Ô∏è  ne

In [72]:
# Export results to CSV for analysis
import csv

print(f"{'='*80}")
print(f"EXPORTING RESULTS TO CSV")
print(f"{'='*80}\n")

# Export 1: Summary by model
with open("model_summary.csv", "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["Model", "Total Runs", "Successful", "Failed", "Success Rate %"])
    
    for model, stats in sorted(model_success.items()):
        writer.writerow([
            model,
            stats["total"],
            stats["success"],
            stats["failed"],
            f"{stats['success_rate']:.2f}"
        ])

print("‚úÖ Exported: model_summary.csv")

# Export 2: Detailed task results
with open("task_results_detailed.csv", "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["Model", "Task", "Total Runs", "Successful", "Failed", "Success Rate %", "Failed Run Numbers"])
    
    for model in sorted(task_results.keys()):
        for task_name, stats in sorted(task_results[model].items()):
            success_rate = (stats["success"] / stats["total"] * 100) if stats["total"] > 0 else 0
            failed_runs = [r for r in stats["runs"] if not r["success"]]
            failed_run_nums = ", ".join([r["run_number"] for r in failed_runs])
            
            writer.writerow([
                model,
                task_name,
                stats["total"],
                stats["success"],
                stats["total"] - stats["success"],
                f"{success_rate:.1f}",
                failed_run_nums
            ])

print("‚úÖ Exported: task_results_detailed.csv")

# Export 3: Per-run details
with open("run_details.csv", "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["Model", "Task", "Run Number", "Success", "File Path"])
    
    for model in sorted(task_results.keys()):
        for task_name, stats in sorted(task_results[model].items()):
            for run in stats["runs"]:
                writer.writerow([
                    model,
                    task_name,
                    run["run_number"],
                    "Yes" if run["success"] else "No",
                    run["file"]
                ])

print("‚úÖ Exported: run_details.csv")

print(f"\n{'='*80}")
print(f"EXPORT COMPLETE")
print(f"{'='*80}")
print(f"\nGenerated files:")
print(f"  1. model_summary.csv - Overall model statistics")
print(f"  2. task_results_detailed.csv - Success rates per task per model")
print(f"  3. run_details.csv - Individual run details")



EXPORTING RESULTS TO CSV

‚úÖ Exported: model_summary.csv
‚úÖ Exported: task_results_detailed.csv
‚úÖ Exported: run_details.csv

EXPORT COMPLETE

Generated files:
  1. model_summary.csv - Overall model statistics
  2. task_results_detailed.csv - Success rates per task per model
  3. run_details.csv - Individual run details


In [73]:
# Cross-model task comparison - which tasks are hardest?
print(f"{'='*80}")
print(f"CROSS-MODEL TASK DIFFICULTY ANALYSIS")
print(f"{'='*80}\n")

# Get all unique tasks across all models
all_tasks = set()
for model, tasks in task_results.items():
    all_tasks.update(tasks.keys())

print(f"Total unique tasks: {len(all_tasks)}\n")

# Build comparison matrix
task_comparison = {}
for task in all_tasks:
    task_comparison[task] = {}
    total_success = 0
    total_runs = 0
    
    for model in sorted(task_results.keys()):
        if task in task_results[model]:
            stats = task_results[model][task]
            task_comparison[task][model] = {
                "success": stats["success"],
                "total": stats["total"],
                "rate": (stats["success"] / stats["total"] * 100) if stats["total"] > 0 else 0
            }
            total_success += stats["success"]
            total_runs += stats["total"]
        else:
            task_comparison[task][model] = {"success": 0, "total": 0, "rate": 0}
    
    task_comparison[task]["overall_rate"] = (total_success / total_runs * 100) if total_runs > 0 else 0

# Sort tasks by overall difficulty (lowest success rate first)
sorted_tasks_by_difficulty = sorted(task_comparison.items(), key=lambda x: x[1]["overall_rate"])

print("MOST DIFFICULT TASKS (lowest success rate):")
print("="*80 + "\n")

for i, (task, stats) in enumerate(sorted_tasks_by_difficulty[:10], 1):
    print(f"{i}. {task}")
    print(f"   Overall success rate: {stats['overall_rate']:.1f}%")
    print(f"   Per model:")
    for model in sorted(task_results.keys()):
        model_stats = stats[model]
        if model_stats["total"] > 0:
            print(f"     - {model}: {model_stats['success']}/{model_stats['total']} ({model_stats['rate']:.1f}%)")
    print()

print("\n" + "="*80)
print("EASIEST TASKS (highest success rate):")
print("="*80 + "\n")

for i, (task, stats) in enumerate(reversed(sorted_tasks_by_difficulty[-10:]), 1):
    print(f"{i}. {task}")
    print(f"   Overall success rate: {stats['overall_rate']:.1f}%")
    print(f"   Per model:")
    for model in sorted(task_results.keys()):
        model_stats = stats[model]
        if model_stats["total"] > 0:
            print(f"     - {model}: {model_stats['success']}/{model_stats['total']} ({model_stats['rate']:.1f}%)")
    print()



CROSS-MODEL TASK DIFFICULTY ANALYSIS

Total unique tasks: 52

MOST DIFFICULT TASKS (lowest success rate):

1. gemini-2.5-pro
   Overall success rate: 0.0%
   Per model:
     - gemini-2.5-pro: 0/8 (0.0%)

2. new_sys_task9_7_c1_p1_r8_v3_harness
   Overall success rate: 0.0%
   Per model:
     - claude-sonnet-4-5: 0/8 (0.0%)
     - gpt-5: 0/8 (0.0%)
     - grok-4: 0/8 (0.0%)

3. new_sys_task6_4_c1_p1_r8_v3_harness
   Overall success rate: 0.0%
   Per model:
     - claude-sonnet-4-5: 0/8 (0.0%)
     - gemini-2.5-pro: 0/8 (0.0%)
     - gpt-5: 0/8 (0.0%)
     - grok-4: 0/8 (0.0%)

4. new_sys_task17_9_c1_p1_r8_v4_harness
   Overall success rate: 6.2%
   Per model:
     - claude-sonnet-4-5: 0/8 (0.0%)
     - gemini-2.5-pro: 0/8 (0.0%)
     - gpt-5: 0/8 (0.0%)
     - grok-4: 2/8 (25.0%)

5. new_sys_task2_8_c1_p1_r8_v2_harness
   Overall success rate: 34.4%
   Per model:
     - claude-sonnet-4-5: 6/8 (75.0%)
     - gemini-2.5-pro: 4/8 (50.0%)
     - gpt-5: 1/8 (12.5%)
     - grok-4: 0/8 (0.0%)



In [74]:
# Export task comparison matrix
print(f"{'='*80}")
print(f"EXPORTING TASK COMPARISON MATRIX")
print(f"{'='*80}\n")

with open("task_comparison_matrix.csv", "w", newline='') as f:
    writer = csv.writer(f)
    
    # Header
    models = sorted(task_results.keys())
    header = ["Task", "Overall Success Rate %"]
    for model in models:
        header.extend([f"{model} Success", f"{model} Total", f"{model} Rate %"])
    writer.writerow(header)
    
    # Sort by overall rate (hardest first)
    for task, stats in sorted_tasks_by_difficulty:
        row = [task, f"{stats['overall_rate']:.1f}"]
        
        for model in models:
            model_stats = stats[model]
            row.extend([
                model_stats["success"],
                model_stats["total"],
                f"{model_stats['rate']:.1f}" if model_stats["total"] > 0 else "N/A"
            ])
        
        writer.writerow(row)

print("‚úÖ Exported: task_comparison_matrix.csv")

print(f"\n{'='*80}")
print(f"ANALYSIS COMPLETE")
print(f"{'='*80}\n")

print(f"Summary of findings:")
print(f"  - Total models: {len(task_results)}")
print(f"  - Total unique tasks: {len(all_tasks)}")
print(f"  - Total runs analyzed: {sum(m['total'] for m in model_success.values())}")
print(f"\nSuccess rates by model:")
for model, stats in sorted(model_success.items(), key=lambda x: x[1]['success_rate'], reverse=True):
    print(f"  {model}: {stats['success_rate']:.2f}% ({stats['success']}/{stats['total']})")

print(f"\nExported files:")
print(f"  1. model_summary.csv")
print(f"  2. task_results_detailed.csv")
print(f"  3. run_details.csv")
print(f"  4. task_comparison_matrix.csv")



EXPORTING TASK COMPARISON MATRIX

‚úÖ Exported: task_comparison_matrix.csv

ANALYSIS COMPLETE

Summary of findings:
  - Total models: 4
  - Total unique tasks: 52
  - Total runs analyzed: 1600

Success rates by model:
  claude-sonnet-4-5: 90.25% (361/400)
  grok-4: 89.75% (359/400)
  gpt-5: 79.75% (319/400)
  gemini-2.5-pro: 74.75% (299/400)

Exported files:
  1. model_summary.csv
  2. task_results_detailed.csv
  3. run_details.csv
  4. task_comparison_matrix.csv


In [75]:
# Categorize prompts by difficulty based on failure rate per model
print(f"{'='*80}")
print(f"PROMPT DIFFICULTY CATEGORIZATION BY MODEL")
print(f"{'='*80}\n")

# Categories:
# Easy: 0-1 failures (7-8 successes out of 8)
# Medium: 2-4 failures (4-6 successes out of 8)
# Hard: 5-8 failures (0-3 successes out of 8)

def categorize_difficulty(success_count, total_runs=8):
    """Categorize based on number of failures"""
    failures = total_runs - success_count
    if failures <= 1:
        return "easy"
    elif failures <= 4:
        return "medium"
    else:
        return "hard"

# Structure: difficulty_by_model[model][category] = [list of tasks]
difficulty_by_model = {}

for model in sorted(task_results.keys()):
    difficulty_by_model[model] = {
        "easy": [],
        "medium": [],
        "hard": []
    }
    
    for task_name, stats in task_results[model].items():
        category = categorize_difficulty(stats["success"], stats["total"])
        difficulty_by_model[model][category].append({
            "task": task_name,
            "success": stats["success"],
            "total": stats["total"],
            "failures": stats["total"] - stats["success"]
        })

# Display results
for model in sorted(difficulty_by_model.keys()):
    print(f"\n{'='*80}")
    print(f"{model.upper()}")
    print(f"{'='*80}\n")
    
    categories = difficulty_by_model[model]
    
    print(f"‚úÖ EASY (0-1 failures): {len(categories['easy'])} prompts")
    print(f"‚ö†Ô∏è  MEDIUM (2-4 failures): {len(categories['medium'])} prompts")
    print(f"‚ùå HARD (5-8 failures): {len(categories['hard'])} prompts")
    
    total_prompts = len(categories['easy']) + len(categories['medium']) + len(categories['hard'])
    print(f"\nTotal prompts: {total_prompts}")
    
    if total_prompts > 0:
        easy_pct = len(categories['easy']) / total_prompts * 100
        medium_pct = len(categories['medium']) / total_prompts * 100
        hard_pct = len(categories['hard']) / total_prompts * 100
        
        print(f"\nDistribution:")
        print(f"  Easy: {easy_pct:.1f}%")
        print(f"  Medium: {medium_pct:.1f}%")
        print(f"  Hard: {hard_pct:.1f}%")
    
    # Show some examples from each category
    if categories['hard']:
        print(f"\n‚ùå HARD prompts (showing up to 5):")
        for item in sorted(categories['hard'], key=lambda x: x['failures'], reverse=True)[:5]:
            print(f"  - {item['task']}")
            print(f"    Failures: {item['failures']}/{item['total']}")
    
    if categories['medium']:
        print(f"\n‚ö†Ô∏è  MEDIUM prompts (showing up to 3):")
        for item in sorted(categories['medium'], key=lambda x: x['failures'], reverse=True)[:3]:
            print(f"  - {item['task']}")
            print(f"    Failures: {item['failures']}/{item['total']}")

print(f"\n{'='*80}")
print(f"SUMMARY ACROSS ALL MODELS")
print(f"{'='*80}\n")

# Aggregate summary
summary_table = []
for model in sorted(difficulty_by_model.keys()):
    categories = difficulty_by_model[model]
    total = len(categories['easy']) + len(categories['medium']) + len(categories['hard'])
    summary_table.append({
        "model": model,
        "easy": len(categories['easy']),
        "medium": len(categories['medium']),
        "hard": len(categories['hard']),
        "total": total
    })

# Print as table
print(f"{'Model':<20} {'Easy':<10} {'Medium':<10} {'Hard':<10} {'Total':<10}")
print(f"{'-'*60}")
for row in summary_table:
    print(f"{row['model']:<20} {row['easy']:<10} {row['medium']:<10} {row['hard']:<10} {row['total']:<10}")



PROMPT DIFFICULTY CATEGORIZATION BY MODEL


CLAUDE-SONNET-4-5

‚úÖ EASY (0-1 failures): 43 prompts
‚ö†Ô∏è  MEDIUM (2-4 failures): 4 prompts
‚ùå HARD (5-8 failures): 3 prompts

Total prompts: 50

Distribution:
  Easy: 86.0%
  Medium: 8.0%
  Hard: 6.0%

‚ùå HARD prompts (showing up to 5):
  - new_sys_task6_4_c1_p1_r8_v3_harness
    Failures: 8/8
  - new_sys_task9_7_c1_p1_r8_v3_harness
    Failures: 8/8
  - new_sys_task17_9_c1_p1_r8_v4_harness
    Failures: 8/8

‚ö†Ô∏è  MEDIUM prompts (showing up to 3):
  - new_sys_task6_6_c1_p1_r8_v3_harness
    Failures: 4/8
  - new_sys_task2_1_c1_p1_r8_v1_claude_v1_harness
    Failures: 3/8
  - new_sys_task2_8_c1_p1_r8_v2_harness
    Failures: 2/8

GEMINI-2.5-PRO

‚úÖ EASY (0-1 failures): 32 prompts
‚ö†Ô∏è  MEDIUM (2-4 failures): 9 prompts
‚ùå HARD (5-8 failures): 9 prompts

Total prompts: 50

Distribution:
  Easy: 64.0%
  Medium: 18.0%
  Hard: 18.0%

‚ùå HARD prompts (showing up to 5):
  - gemini-2.5-pro
    Failures: 8/8
  - new_sys_task17_9_c1_p1_r8

In [76]:
# Export difficulty categorization to CSV
print(f"{'='*80}")
print(f"EXPORTING DIFFICULTY CATEGORIZATION")
print(f"{'='*80}\n")

# Export 1: Difficulty summary by model
with open("difficulty_summary_by_model.csv", "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["Model", "Easy (0-1 failures)", "Medium (2-4 failures)", "Hard (5-8 failures)", "Total Prompts", "Easy %", "Medium %", "Hard %"])
    
    for model in sorted(difficulty_by_model.keys()):
        categories = difficulty_by_model[model]
        total = len(categories['easy']) + len(categories['medium']) + len(categories['hard'])
        
        easy_pct = len(categories['easy']) / total * 100 if total > 0 else 0
        medium_pct = len(categories['medium']) / total * 100 if total > 0 else 0
        hard_pct = len(categories['hard']) / total * 100 if total > 0 else 0
        
        writer.writerow([
            model,
            len(categories['easy']),
            len(categories['medium']),
            len(categories['hard']),
            total,
            f"{easy_pct:.1f}",
            f"{medium_pct:.1f}",
            f"{hard_pct:.1f}"
        ])

print("‚úÖ Exported: difficulty_summary_by_model.csv")

# Export 2: Detailed prompt difficulty per model
with open("prompt_difficulty_detailed.csv", "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["Model", "Prompt/Task", "Difficulty", "Successes", "Failures", "Total Runs", "Success Rate %"])
    
    for model in sorted(difficulty_by_model.keys()):
        for category in ['easy', 'medium', 'hard']:
            for item in sorted(difficulty_by_model[model][category], key=lambda x: x['failures'], reverse=True):
                success_rate = (item['success'] / item['total'] * 100) if item['total'] > 0 else 0
                writer.writerow([
                    model,
                    item['task'],
                    category.upper(),
                    item['success'],
                    item['failures'],
                    item['total'],
                    f"{success_rate:.1f}"
                ])

print("‚úÖ Exported: prompt_difficulty_detailed.csv")

# Export 3: Cross-model difficulty comparison (which prompts are hard for which models)
print("\nAnalyzing cross-model difficulty patterns...")

# Get all unique tasks
all_tasks_set = set()
for model in task_results.keys():
    all_tasks_set.update(task_results[model].keys())

with open("prompt_difficulty_cross_model.csv", "w", newline='') as f:
    writer = csv.writer(f)
    
    # Header
    header = ["Prompt/Task"]
    for model in sorted(task_results.keys()):
        header.extend([f"{model} Difficulty", f"{model} Failures"])
    header.append("Models Finding This Hard")
    writer.writerow(header)
    
    # For each task, show difficulty across models
    for task in sorted(all_tasks_set):
        row = [task]
        hard_count = 0
        
        for model in sorted(task_results.keys()):
            if task in task_results[model]:
                stats = task_results[model][task]
                category = categorize_difficulty(stats["success"], stats["total"])
                failures = stats["total"] - stats["success"]
                
                row.extend([category.upper(), failures])
                
                if category == "hard":
                    hard_count += 1
            else:
                row.extend(["N/A", "N/A"])
        
        row.append(hard_count)
        writer.writerow(row)

print("‚úÖ Exported: prompt_difficulty_cross_model.csv")

print(f"\n{'='*80}")
print(f"EXPORT COMPLETE")
print(f"{'='*80}\n")

print(f"Generated difficulty analysis files:")
print(f"  1. difficulty_summary_by_model.csv - Summary table of easy/medium/hard counts per model")
print(f"  2. prompt_difficulty_detailed.csv - Detailed list of all prompts with difficulty ratings per model")
print(f"  3. prompt_difficulty_cross_model.csv - Cross-model comparison showing which prompts are hard for which models")



EXPORTING DIFFICULTY CATEGORIZATION

‚úÖ Exported: difficulty_summary_by_model.csv
‚úÖ Exported: prompt_difficulty_detailed.csv

Analyzing cross-model difficulty patterns...
‚úÖ Exported: prompt_difficulty_cross_model.csv

EXPORT COMPLETE

Generated difficulty analysis files:
  1. difficulty_summary_by_model.csv - Summary table of easy/medium/hard counts per model
  2. prompt_difficulty_detailed.csv - Detailed list of all prompts with difficulty ratings per model
  3. prompt_difficulty_cross_model.csv - Cross-model comparison showing which prompts are hard for which models


In [77]:
# Identify universally difficult prompts and model-specific challenges
print(f"{'='*80}")
print(f"UNIVERSAL vs MODEL-SPECIFIC DIFFICULTY ANALYSIS")
print(f"{'='*80}\n")

# Get all unique tasks
all_tasks_set = set()
for model in task_results.keys():
    all_tasks_set.update(task_results[model].keys())

# Analyze each task across all models
universal_hard = []
mostly_hard = []
some_hard = []
model_specific_hard = {}

for task in all_tasks_set:
    hard_models = []
    medium_models = []
    easy_models = []
    
    for model in sorted(task_results.keys()):
        if task in task_results[model]:
            stats = task_results[model][task]
            category = categorize_difficulty(stats["success"], stats["total"])
            
            if category == "hard":
                hard_models.append(model)
            elif category == "medium":
                medium_models.append(model)
            else:
                easy_models.append(model)
    
    num_hard = len(hard_models)
    total_models = len(hard_models) + len(medium_models) + len(easy_models)
    
    if num_hard == total_models and total_models > 0:
        # All models find this hard
        universal_hard.append({
            "task": task,
            "hard_count": num_hard,
            "models": hard_models
        })
    elif num_hard >= total_models * 0.75 and total_models > 0:
        # Most models find this hard (75%+)
        mostly_hard.append({
            "task": task,
            "hard_count": num_hard,
            "total": total_models,
            "hard_models": hard_models,
            "other_models": medium_models + easy_models
        })
    elif num_hard > 0:
        # Some models find this hard
        if num_hard == 1:
            # Model-specific difficulty
            model = hard_models[0]
            if model not in model_specific_hard:
                model_specific_hard[model] = []
            model_specific_hard[model].append({
                "task": task,
                "failures": task_results[model][task]["total"] - task_results[model][task]["success"]
            })
        else:
            some_hard.append({
                "task": task,
                "hard_count": num_hard,
                "total": total_models,
                "hard_models": hard_models
            })

print(f"üî¥ UNIVERSAL HARD PROMPTS (Hard for ALL models): {len(universal_hard)}")
if universal_hard:
    for i, item in enumerate(universal_hard, 1):
        print(f"\n{i}. {item['task']}")
        print(f"   Hard for: {', '.join(item['models'])}")

print(f"\n{'='*80}")
print(f"üü† MOSTLY HARD PROMPTS (Hard for 75%+ of models): {len(mostly_hard)}")
if mostly_hard:
    for i, item in enumerate(mostly_hard[:5], 1):
        print(f"\n{i}. {item['task']}")
        print(f"   Hard for: {', '.join(item['hard_models'])} ({item['hard_count']}/{item['total']})")
        if item['other_models']:
            print(f"   Other models: {', '.join(item['other_models'])}")

print(f"\n{'='*80}")
print(f"üü° SOME HARD PROMPTS (Hard for multiple but not all models): {len(some_hard)}")
if some_hard:
    for i, item in enumerate(sorted(some_hard, key=lambda x: x['hard_count'], reverse=True)[:5], 1):
        print(f"\n{i}. {item['task']}")
        print(f"   Hard for: {', '.join(item['hard_models'])} ({item['hard_count']}/{item['total']})")

print(f"\n{'='*80}")
print(f"üîµ MODEL-SPECIFIC HARD PROMPTS (Hard for only ONE model)")
print(f"{'='*80}\n")

for model in sorted(model_specific_hard.keys()):
    prompts = model_specific_hard[model]
    print(f"\n{model}: {len(prompts)} unique hard prompts")
    if prompts:
        print(f"  Top 3 (by failures):")
        for item in sorted(prompts, key=lambda x: x['failures'], reverse=True)[:3]:
            print(f"    - {item['task']}")
            print(f"      Failures: {item['failures']}/8")

# Summary statistics
print(f"\n{'='*80}")
print(f"DIFFICULTY PATTERN SUMMARY")
print(f"{'='*80}\n")

total_analyzed = len(all_tasks_set)
print(f"Total unique prompts analyzed: {total_analyzed}")
print(f"\nDifficulty patterns:")
print(f"  üî¥ Universal (all models): {len(universal_hard)} ({len(universal_hard)/total_analyzed*100:.1f}%)")
print(f"  üü† Mostly hard (75%+ models): {len(mostly_hard)} ({len(mostly_hard)/total_analyzed*100:.1f}%)")
print(f"  üü° Some hard (2-3 models): {len(some_hard)} ({len(some_hard)/total_analyzed*100:.1f}%)")

model_specific_total = sum(len(prompts) for prompts in model_specific_hard.values())
print(f"  üîµ Model-specific (1 model): {model_specific_total} ({model_specific_total/total_analyzed*100:.1f}%)")

# Calculate prompts that are easy for all
all_easy_count = 0
for task in all_tasks_set:
    all_easy = True
    for model in task_results.keys():
        if task in task_results[model]:
            stats = task_results[model][task]
            category = categorize_difficulty(stats["success"], stats["total"])
            if category != "easy":
                all_easy = False
                break
    if all_easy:
        all_easy_count += 1

print(f"  ‚úÖ Easy for all models: {all_easy_count} ({all_easy_count/total_analyzed*100:.1f}%)")



UNIVERSAL vs MODEL-SPECIFIC DIFFICULTY ANALYSIS

üî¥ UNIVERSAL HARD PROMPTS (Hard for ALL models): 4

1. new_sys_task17_9_c1_p1_r8_v4_harness
   Hard for: claude-sonnet-4-5, gemini-2.5-pro, gpt-5, grok-4

2. gemini-2.5-pro
   Hard for: gemini-2.5-pro

3. new_sys_task9_7_c1_p1_r8_v3_harness
   Hard for: claude-sonnet-4-5, gpt-5, grok-4

4. new_sys_task6_4_c1_p1_r8_v3_harness
   Hard for: claude-sonnet-4-5, gemini-2.5-pro, gpt-5, grok-4

üü† MOSTLY HARD PROMPTS (Hard for 75%+ of models): 0

üü° SOME HARD PROMPTS (Hard for multiple but not all models): 2

1. new_sys_task2_8_c1_p1_r8_v2_harness
   Hard for: gpt-5, grok-4 (2/4)

2. task6_2_c1_p1_r8_v1_gpt5_v1_harness
   Hard for: gpt-5, grok-4 (2/4)

üîµ MODEL-SPECIFIC HARD PROMPTS (Hard for only ONE model)


gemini-2.5-pro: 6 unique hard prompts
  Top 3 (by failures):
    - new_sys_task7_6_c1_p1_r8_v1
      Failures: 8/8
    - new_sys_task8_1_c1_p1_r8_v1_claude_v1_harness
      Failures: 7/8
    - new_sys_task6_6_c1_p1_r8_v3_harness
  

In [79]:
# Find tasks with perfect 8/8 success rate per model
print(f"{'='*80}")
print(f"PERFECT SUCCESS RATE ANALYSIS (8/8 runs passing)")
print(f"{'='*80}\n")

perfect_tasks_by_model = {}

for model in sorted(task_results.keys()):
    perfect_tasks = []
    
    for task_name, stats in task_results[model].items():
        if stats["success"] == stats["total"] and stats["total"] == 8:
            perfect_tasks.append(task_name)
    
    perfect_tasks_by_model[model] = perfect_tasks
    
    print(f"\n{model}:")
    print(f"  Tasks with 8/8 success: {len(perfect_tasks)}/{len(task_results[model])}")
    print(f"  Percentage: {len(perfect_tasks)/len(task_results[model])*100:.1f}%")

# Summary table
print(f"\n{'='*80}")
print(f"SUMMARY TABLE")
print(f"{'='*80}\n")

print(f"{'Model':<25} {'Perfect Tasks (8/8)':<20} {'Total Tasks':<15} {'Perfect %':<15}")
print(f"{'-'*75}")

for model in sorted(perfect_tasks_by_model.keys()):
    perfect_count = len(perfect_tasks_by_model[model])
    total_count = len(task_results[model])
    perfect_pct = perfect_count / total_count * 100 if total_count > 0 else 0
    
    print(f"{model:<25} {perfect_count:<20} {total_count:<15} {perfect_pct:.1f}%")

# Compare which tasks are perfect across all models
print(f"\n{'='*80}")
print(f"TASKS PERFECT FOR ALL MODELS")
print(f"{'='*80}\n")

all_tasks_set = set()
for model in task_results.keys():
    all_tasks_set.update(task_results[model].keys())

perfect_for_all = []
for task in all_tasks_set:
    is_perfect_for_all = True
    
    for model in task_results.keys():
        if task in task_results[model]:
            stats = task_results[model][task]
            if stats["success"] != 8 or stats["total"] != 8:
                is_perfect_for_all = False
                break
        else:
            is_perfect_for_all = False
            break
    
    if is_perfect_for_all:
        perfect_for_all.append(task)

print(f"Tasks with 8/8 success across ALL models: {len(perfect_for_all)}/{len(all_tasks_set)}")
print(f"Percentage: {len(perfect_for_all)/len(all_tasks_set)*100:.1f}%\n")

if perfect_for_all:
    print(f"These tasks are perfect for all models:")
    for i, task in enumerate(sorted(perfect_for_all), 1):
        print(f"  {i}. {task}")

# Find tasks that are perfect for some models but not all
print(f"\n{'='*80}")
print(f"TASKS WITH MIXED PERFECT SCORES")
print(f"{'='*80}\n")

mixed_perfect = {}
for task in all_tasks_set:
    perfect_models = []
    imperfect_models = []
    
    for model in sorted(task_results.keys()):
        if task in task_results[model]:
            stats = task_results[model][task]
            if stats["success"] == 8 and stats["total"] == 8:
                perfect_models.append(model)
            else:
                imperfect_models.append(model)
    
    # Only include if some (but not all) models have perfect score
    if perfect_models and imperfect_models:
        mixed_perfect[task] = {
            "perfect": perfect_models,
            "imperfect": imperfect_models
        }

print(f"Tasks with mixed perfect scores: {len(mixed_perfect)}\n")

# Show top examples (tasks where only 1-2 models fail)
mixed_sorted = sorted(mixed_perfect.items(), 
                     key=lambda x: len(x[1]["perfect"]), 
                     reverse=True)

print(f"Top examples (most models perfect):")
for i, (task, data) in enumerate(mixed_sorted[:10], 1):
    print(f"\n{i}. {task}")
    print(f"   Perfect for ({len(data['perfect'])}/4): {', '.join(data['perfect'])}")
    print(f"   Not perfect for: {', '.join(data['imperfect'])}")



PERFECT SUCCESS RATE ANALYSIS (8/8 runs passing)


claude-sonnet-4-5:
  Tasks with 8/8 success: 39/50
  Percentage: 78.0%

gemini-2.5-pro:
  Tasks with 8/8 success: 21/50
  Percentage: 42.0%

gpt-5:
  Tasks with 8/8 success: 25/50
  Percentage: 50.0%

grok-4:
  Tasks with 8/8 success: 40/50
  Percentage: 80.0%

SUMMARY TABLE

Model                     Perfect Tasks (8/8)  Total Tasks     Perfect %      
---------------------------------------------------------------------------
claude-sonnet-4-5         39                   50              78.0%
gemini-2.5-pro            21                   50              42.0%
gpt-5                     25                   50              50.0%
grok-4                    40                   50              80.0%

TASKS PERFECT FOR ALL MODELS

Tasks with 8/8 success across ALL models: 13/52
Percentage: 25.0%

These tasks are perfect for all models:
  1. new_sys_task11_9_c1_p1_r8_v2_harness
  2. new_sys_task12_3_c1_p1_r8_v4_harness
  3. new_sys_task15

In [80]:
# Export perfect score analysis
print(f"{'='*80}")
print(f"EXPORTING PERFECT SCORE ANALYSIS")
print(f"{'='*80}\n")

# Export 1: Perfect score summary by model
with open("perfect_score_summary.csv", "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["Model", "Perfect Tasks (8/8)", "Total Tasks", "Perfect %", "Imperfect Tasks"])
    
    for model in sorted(perfect_tasks_by_model.keys()):
        perfect_count = len(perfect_tasks_by_model[model])
        total_count = len(task_results[model])
        imperfect_count = total_count - perfect_count
        perfect_pct = perfect_count / total_count * 100 if total_count > 0 else 0
        
        writer.writerow([
            model,
            perfect_count,
            total_count,
            f"{perfect_pct:.1f}",
            imperfect_count
        ])

print("‚úÖ Exported: perfect_score_summary.csv")

# Export 2: Perfect tasks per model
with open("perfect_tasks_by_model.csv", "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["Model", "Task Name"])
    
    for model in sorted(perfect_tasks_by_model.keys()):
        for task in sorted(perfect_tasks_by_model[model]):
            writer.writerow([model, task])

print("‚úÖ Exported: perfect_tasks_by_model.csv")

# Export 3: Cross-model perfect score matrix
with open("perfect_score_cross_model.csv", "w", newline='') as f:
    writer = csv.writer(f)
    
    # Header
    models_sorted = sorted(task_results.keys())
    header = ["Task Name"] + [f"{model} (8/8?)" for model in models_sorted] + ["Perfect for # Models", "Perfect for All?"]
    writer.writerow(header)
    
    # For each task
    for task in sorted(all_tasks_set):
        row = [task]
        perfect_count = 0
        
        for model in models_sorted:
            if task in task_results[model]:
                stats = task_results[model][task]
                is_perfect = stats["success"] == 8 and stats["total"] == 8
                row.append("Yes" if is_perfect else "No")
                if is_perfect:
                    perfect_count += 1
            else:
                row.append("N/A")
        
        row.append(perfect_count)
        row.append("Yes" if perfect_count == len(models_sorted) else "No")
        writer.writerow(row)

print("‚úÖ Exported: perfect_score_cross_model.csv")

# Export 4: Universal perfect tasks
with open("universal_perfect_tasks.csv", "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["Task Name", "Status"])
    
    writer.writerow(["=== PERFECT FOR ALL MODELS ===", ""])
    for task in sorted(perfect_for_all):
        writer.writerow([task, "Perfect for all"])
    
    writer.writerow(["", ""])
    writer.writerow(["=== MIXED PERFECT SCORES ===", ""])
    writer.writerow(["Task Name", "Perfect Models", "Imperfect Models"])
    
    for task, data in mixed_sorted:
        writer.writerow([
            task,
            ", ".join(data["perfect"]),
            ", ".join(data["imperfect"])
        ])

print("‚úÖ Exported: universal_perfect_tasks.csv")

print(f"\n{'='*80}")
print(f"PERFECT SCORE EXPORT COMPLETE")
print(f"{'='*80}\n")

print(f"Generated files:")
print(f"  1. perfect_score_summary.csv - Summary of perfect scores per model")
print(f"  2. perfect_tasks_by_model.csv - List of all perfect tasks per model")
print(f"  3. perfect_score_cross_model.csv - Matrix showing which tasks are perfect for which models")
print(f"  4. universal_perfect_tasks.csv - Tasks perfect for all vs some models")



EXPORTING PERFECT SCORE ANALYSIS

‚úÖ Exported: perfect_score_summary.csv
‚úÖ Exported: perfect_tasks_by_model.csv
‚úÖ Exported: perfect_score_cross_model.csv
‚úÖ Exported: universal_perfect_tasks.csv

PERFECT SCORE EXPORT COMPLETE

Generated files:
  1. perfect_score_summary.csv - Summary of perfect scores per model
  2. perfect_tasks_by_model.csv - List of all perfect tasks per model
  3. perfect_score_cross_model.csv - Matrix showing which tasks are perfect for which models
  4. universal_perfect_tasks.csv - Tasks perfect for all vs some models
