In [6]:
import json
import pandas as pd
import os
import glob
from pathlib import Path

# Evaluation methods to aggregate. Each name is looked up as a
# sub-directory of results/ by the cells below.
methods = ["default_essay"]

# Accumulator for the final table: results_data[model_name][method] holds
# the normalized score of that model for that method.
results_data = dict()

In [7]:
def process_json_file(filepath, max_score=7):
    """
    Process a single evaluated-results JSON file and compute its aggregate score.

    The model name is derived from the file name: a leading
    'evaluated_results_' and a trailing '.json' are stripped, and only the
    part before the first '__' is kept.

    Args:
        filepath: Path to the JSON file. The file is expected to contain a
            list of per-question dicts. Entries that are not dicts, or whose
            'self_grade_score' is missing or None, are ignored.
        max_score: Maximum achievable score per question, used as the
            denominator when normalizing. Defaults to 7.

    Returns:
        (model_name, total_score, num_questions, normalized_score) where
        normalized_score = total_score / (num_questions * max_score), or 0
        when no entry carries a score. If the file cannot be read or
        processed, the error is printed and (None, 0, 0, 0) is returned.
    """
    try:
        # JSON is UTF-8 by specification; do not rely on the platform default
        # encoding, which differs between operating systems.
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Strip the prefix/suffix only at the ends of the file name, so a
        # model name that itself contains '.json' or the prefix stays intact.
        prefix, suffix = 'evaluated_results_', '.json'
        model_name = os.path.basename(filepath)
        if model_name.startswith(prefix):
            model_name = model_name[len(prefix):]
        if model_name.endswith(suffix):
            model_name = model_name[:-len(suffix)]
        model_name = model_name.split('__')[0]

        # Only graded questions count; ungraded ones are left out of both the
        # numerator and the denominator. Non-dict entries are skipped so one
        # stray value does not discard the whole file.
        scores = [
            item['self_grade_score']
            for item in data
            if isinstance(item, dict) and item.get('self_grade_score') is not None
        ]
        total_score = sum(scores)
        num_questions = len(scores)

        # Normalized score = total achieved / total possible
        normalized_score = total_score / (num_questions * max_score) if num_questions > 0 else 0

        return model_name, total_score, num_questions, normalized_score

    except Exception as e:
        # Deliberately best-effort: report the failure and let the caller
        # continue with the remaining files.
        print(f"Error processing {filepath}: {e}")
        return None, 0, 0, 0

In [8]:
def check_scores_over_7(max_score=7, methods_to_check=None, results_dir="results",
                        output_csv="scoring_issues.csv"):
    """
    Scan evaluated-results JSON files for self_grade_score values above the
    allowed maximum and report them.

    For every method, all 'evaluated_results_*.json' files under
    '<results_dir>/<method>' are read. Each offending entry is printed, a
    per-model summary is printed at the end, and, if anything was found, the
    issues are written to a CSV file.

    Args:
        max_score: Highest valid score; anything strictly greater is flagged.
            Defaults to 7.
        methods_to_check: Iterable of method names. Defaults to the
            module-level `methods` list.
        results_dir: Root directory holding one sub-directory per method.
        output_csv: Path of the CSV written when issues are found.

    Returns:
        A list with one dict per offending entry (keys: 'method', 'model',
        'question_index', 'score', 'file', 'question_id'); empty when no
        score exceeds max_score.
    """
    if methods_to_check is None:
        methods_to_check = methods

    prefix, suffix = 'evaluated_results_', '.json'
    issues_found = []

    print(f"Checking for scores over {max_score}...")
    print("=" * 60)

    for method in methods_to_check:
        method_dir = f"{results_dir}/{method}"

        if not os.path.exists(method_dir):
            # A missing directory means nothing was validated for this
            # method; say so instead of silently reporting success.
            print(f"Directory {method_dir} not found")
            continue

        # Sorted so that the report order is reproducible across runs.
        json_files = sorted(glob.glob(os.path.join(method_dir, "evaluated_results_*.json")))

        for filepath in json_files:
            try:
                with open(filepath, 'r', encoding='utf-8') as f:
                    data = json.load(f)

                # Model name: file name without the leading prefix and the
                # trailing suffix, up to the first '__'.
                filename = os.path.basename(filepath)
                model_name = filename
                if model_name.startswith(prefix):
                    model_name = model_name[len(prefix):]
                if model_name.endswith(suffix):
                    model_name = model_name[:-len(suffix)]
                model_name = model_name.split('__')[0]

                for idx, item in enumerate(data):
                    if not isinstance(item, dict):
                        continue  # not a per-question record

                    score = item.get('self_grade_score')
                    if score is not None and score > max_score:
                        # Prefer an explicit identifier; fall back to the position.
                        question_id = item.get('question_id', item.get('id', f"Question_{idx}"))

                        issues_found.append({
                            'method': method,
                            'model': model_name,
                            'question_index': idx,
                            'score': score,
                            'file': filepath,
                            'question_id': question_id,
                        })

                        print("🚨 ISSUE FOUND:")
                        print(f"   Method: {method}")
                        print(f"   Model: {model_name}")
                        print(f"   Question ID: {question_id}")
                        print(f"   Question Index: {idx}")
                        print(f"   Score: {score} (over {max_score}!)")
                        print(f"   File: {filename}")
                        print("-" * 40)

            except Exception as e:
                # Best-effort: report the unreadable file and keep checking the rest.
                print(f"Error checking {filepath}: {e}")

    if not issues_found:
        print(f"✅ No scores over {max_score} found. All data looks good!")
        return []

    # Summary
    print(f"\n⚠️  TOTAL ISSUES FOUND: {len(issues_found)}")
    print("\nSummary by Model:")

    # Group issues by model, preserving first-seen order.
    issues_by_model = {}
    for issue in issues_found:
        issues_by_model.setdefault(issue['model'], []).append(issue)

    for model, model_issues in issues_by_model.items():
        print(f"  {model}: {len(model_issues)} issues")
        for issue in model_issues:
            print(f"    - {issue['method']}, Q{issue['question_index']}: {issue['score']}")

    # Persist the findings for easier follow-up analysis.
    print(f"\nSaving issues to '{output_csv}'")
    pd.DataFrame(issues_found).to_csv(output_csv, index=False)

    return issues_found

# Run the validation check. `issues` receives the list returned by
# check_scores_over_7(): one dict per out-of-range score, or an empty list
# when none was found.
issues = check_scores_over_7()

Checking for scores over 7...
✅ No scores over 7 found. All data looks good!


In [12]:
# Aggregate every method directory into results_data[model_name][method].
for method in methods:
    method_dir = f"results/{method}"

    if not os.path.exists(method_dir):
        print(f"Directory {method_dir} not found")
        continue

    # glob returns matches in an arbitrary, filesystem-dependent order. Sort
    # them so that, when several files resolve to the same model name, the
    # file that ends up in the table is the same on every run.
    json_files = sorted(glob.glob(os.path.join(method_dir, "evaluated_results_*.json")))

    print(f"Processing {method}: Found {len(json_files)} files")

    models_seen = set()  # model names already stored for this method in this run
    for filepath in json_files:
        model_name, total_score, num_questions, normalized_score = process_json_file(filepath)

        if not model_name:
            # No usable model name (process_json_file returns None on error).
            continue

        if model_name in models_seen:
            # Only the part of the file name before '__' identifies the model,
            # so several files can collide; make the overwrite visible.
            print(f"  Warning: multiple files for {model_name} in {method}; "
                  f"using {os.path.basename(filepath)}")
        models_seen.add(model_name)

        # Create the per-model entry on first use, then store this method's score.
        results_data.setdefault(model_name, {})[method] = normalized_score

Processing default_essay: Found 25 files


In [17]:
# Build the results table in one chain:
#   - one row per model, one column per method (in the order given by `methods`)
#   - a model without a result for some method gets 0 in that column
#   - rows sorted alphabetically by model name
df = (
    pd.DataFrame.from_dict(results_data, orient="index")
    .reindex(columns=methods)
    .fillna(0)
    .sort_index()
)

print("Final Results Table:")
print("=" * 80)
print(df.round(4))

Final Results Table:
                              default_essay
claude-3.5-haiku                     0.4086
claude-3.5-sonnet                    0.4319
claude-3.7-sonnet                    0.4286
claude-opus-4                        0.3821
claude-sonnet-4                      0.4053
codestral-latest-official            0.3156
deepseek-r1                          0.4518
gemini-2.5-flash                     0.4452
gemini-2.5-pro                       0.4452
gpt-4.1                              0.3123
gpt-4.1-mini                         0.3588
gpt-4.1-nano                         0.3123
gpt-4o                               0.4120
grok-3                               0.4319
grok-3-mini-beta-high-effort         0.4530
grok-3-mini-beta-low-effort          0.4518
groq-llama-4-maverick                0.4618
groq-llama-4-scout                   0.4053
groq-llama-guard-4                   0.0000
groq-llama3.1-8b-instant             0.3322
groq-llama3.3-70b                    0.3953
mistral-lar