### Load the methods

In [61]:
import json
import pandas as pd
import os
import glob
from pathlib import Path

# Prompting methods to evaluate; each name is also a sub-directory of results/
methods = [
    'default_essay',
    'self_consistency_essay_n3',
    'self_consistency_essay_n5',
    'self_discover_essay',
]

# Accumulator for the aggregated scores, filled in by a later cell
results_data = dict()

In [62]:
# Read the per-question maximum scores from the grading-details file.
# The questions are assumed to appear in the same order as in the evaluated results.
try:
    with open('data/answer_grading_details.json', 'r') as f:
        grading_details_list = json.load(f)

    # One max score per question, in file order
    max_scores_per_question = list(
        map(lambda detail: detail['max_score'], grading_details_list)
    )
    total_possible_points = sum(max_scores_per_question)

    print(
        f"✅ Loaded grading details successfully: {len(max_scores_per_question)} questions"
    )
    print(
        f"📊 Total possible points across all questions: {total_possible_points}"
    )
    print(f"📈 Max scores per question: {max_scores_per_question}")

except Exception as e:
    # Best effort: report the problem and fall back to "no questions known"
    print(f"❌ Error loading grading details: {e}")
    max_scores_per_question, total_possible_points = [], 0

✅ Loaded grading details successfully: 43 questions
📊 Total possible points across all questions: 149
📈 Max scores per question: [6, 2, 2, 5, 2, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 3, 3, 5, 5, 4, 3, 7, 3, 4, 4, 4, 3, 3, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3]


In [63]:
def process_json_file(filepath, total_points=None):
    """
    Aggregate the self-graded scores of one evaluated-results JSON file.

    Every item that has a non-null 'self_grade_score' contributes to the total.
    The total is normalized by the full number of attainable points, not by the
    points of the answered questions only, so unanswered questions count as 0.

    Args:
        filepath: Path to an 'evaluated_results_<model>__<suffix>.json' file.
        total_points: Denominator for the normalized score. When None (default),
            the notebook-level `total_possible_points` is used, which keeps
            existing single-argument calls working unchanged.

    Returns:
        (model_name, total_score, num_questions_with_scores, normalized_score).
        On any error the problem is printed and (None, 0, 0, 0) is returned.
    """
    try:
        with open(filepath, 'r') as f:
            data = json.load(f)

        # Model name = file name without the prefix, the extension and anything after '__'
        filename = os.path.basename(filepath)
        model_name = filename.replace('evaluated_results_', '').replace('.json', '').split('__')[0]

        # Only items that actually carry a score are counted
        scores = [
            item['self_grade_score']
            for item in data
            if 'self_grade_score' in item and item['self_grade_score'] is not None
        ]
        total_score = sum(scores)
        num_questions_with_scores = len(scores)

        # Resolve the default lazily so the global is only required when no
        # explicit denominator is supplied
        if total_points is None:
            total_points = total_possible_points

        # Guard against a zero denominator (e.g. grading details failed to load)
        normalized_score = total_score / total_points if total_points > 0 else 0

        return model_name, total_score, num_questions_with_scores, normalized_score

    except Exception as e:
        print(f"Error processing {filepath}: {e}")
        return None, 0, 0, 0

In [64]:
def check_invalid_scores(methods_to_check=None, max_scores=None,
                         results_root="results", output_csv='scoring_issues.csv'):
    """
    Scan every evaluated-results file for self_grade_score values that are
    above the question's maximum score or below 0.

    Questions are matched to their maximum score by position, i.e. item i of a
    results file is compared against max_scores[i].

    Args:
        methods_to_check: Method names (sub-directories of `results_root`) to scan.
            Defaults to the notebook-level `methods` list.
        max_scores: Per-question maximum scores, in question order.
            Defaults to the notebook-level `max_scores_per_question`.
        results_root: Directory that contains one sub-directory per method.
        output_csv: Where the issue table is written when issues are found.

    Returns:
        A list with one dict per invalid score (keys: method, model,
        question_index, score, max_score, issue_type, file, question_preview);
        an empty list when every score is within range.

    Side effects:
        Prints a report and, when issues exist, writes them to `output_csv`.
    """
    # Function-scope import keeps this cell runnable on its own
    from collections import defaultdict

    # Resolve notebook-level defaults at call time so no-argument calls still work
    if methods_to_check is None:
        methods_to_check = methods
    if max_scores is None:
        max_scores = max_scores_per_question

    # Used when a results file has more items than there are known max scores
    fallback_max_score = 7

    issues_found = []

    print("Checking for invalid scores (> max_score or < 0)...")
    print("=" * 60)

    for method in methods_to_check:
        method_dir = f"{results_root}/{method}"

        if not os.path.exists(method_dir):
            continue

        # glob returns files in arbitrary order; sort for a reproducible report
        json_files = sorted(glob.glob(os.path.join(method_dir, "evaluated_results_*.json")))

        for filepath in json_files:
            try:
                with open(filepath, 'r') as f:
                    data = json.load(f)

                # Model name = file name without the prefix, the extension and anything after '__'
                filename = os.path.basename(filepath)
                model_name = filename.replace('evaluated_results_', '').replace('.json', '').split('__')[0]

                for idx, item in enumerate(data):
                    if 'self_grade_score' not in item or item['self_grade_score'] is None:
                        continue

                    score = item['self_grade_score']

                    # Max score for this question (same order as the grading details)
                    if idx < len(max_scores):
                        max_score = max_scores[idx]
                    else:
                        max_score = fallback_max_score
                        print(f"⚠️  Question index {idx} out of range, using default max_score={fallback_max_score}")

                    if score > max_score:
                        issue_type = f"OVER MAX (score: {score}, max: {max_score})"
                    elif score < 0:
                        issue_type = f"BELOW MIN (score: {score}, min: 0)"
                    else:
                        continue  # score is within the valid range

                    # A missing or null question falls back to a positional label and
                    # other non-string values are stringified, so a malformed question
                    # can no longer raise and abort the scan of the rest of the file.
                    raw_question = item.get('question')
                    question_text = f"Question_{idx}" if raw_question is None else str(raw_question)
                    question_preview = question_text[:100] + "..." if len(question_text) > 100 else question_text

                    issues_found.append({
                        'method': method,
                        'model': model_name,
                        'question_index': idx,
                        'score': score,
                        'max_score': max_score,
                        'issue_type': issue_type,
                        'file': filepath,
                        'question_preview': question_preview,
                    })

                    print("🚨 ISSUE FOUND:")
                    print(f"   Method: {method}")
                    print(f"   Model: {model_name}")
                    print(f"   Question Index: {idx}")
                    print(f"   Issue: {issue_type}")
                    print(f"   Question: {question_preview}")
                    print(f"   File: {os.path.basename(filepath)}")
                    print("-" * 40)

            except Exception as e:
                # Best effort: report the unreadable/malformed file and keep scanning
                print(f"Error checking {filepath}: {e}")

    if not issues_found:
        print("✅ No invalid scores found. All scores are within valid ranges!")
        return []

    # Summary
    print(f"\n⚠️  TOTAL ISSUES FOUND: {len(issues_found)}")

    over_max_count = sum(1 for issue in issues_found if "OVER MAX" in issue['issue_type'])
    below_min_count = sum(1 for issue in issues_found if "BELOW MIN" in issue['issue_type'])

    print(f"   - Scores over maximum: {over_max_count}")
    print(f"   - Scores below minimum (0): {below_min_count}")

    print("\nSummary by Model:")
    issues_by_model = defaultdict(list)
    for issue in issues_found:
        issues_by_model[issue['model']].append(issue)

    for model, model_issues in issues_by_model.items():
        print(f"  {model}: {len(model_issues)} issues")
        for issue in model_issues:
            print(f"    - {issue['method']}, Q{issue['question_index']}: {issue['issue_type']}")

    print("\nSummary by Method:")
    issues_by_method = defaultdict(list)
    for issue in issues_found:
        issues_by_method[issue['method']].append(issue)

    for method_name, method_issues in issues_by_method.items():
        print(f"  {method_name}: {len(method_issues)} issues")

    # Persist the issues as a table for easier follow-up analysis
    issues_df = pd.DataFrame(issues_found)
    print(f"\nSaving issues to '{output_csv}'")
    issues_df.to_csv(output_csv, index=False)

    return issues_found

# Run the validation check; `issues` receives the list of issue dicts
# (an empty list when every score is within its valid range)
issues = check_invalid_scores()

Checking for invalid scores (> max_score or < 0)...
✅ No invalid scores found. All scores are within valid ranges!


In [65]:
# Walk every method directory and record one normalized score per (model, method)
for method in methods:
    method_dir = f"results/{method}"

    # Nothing to aggregate for a method whose directory is missing
    if not os.path.exists(method_dir):
        print(f"Directory {method_dir} not found")
        continue

    # Collect the evaluated-results files of this method
    json_files = glob.glob(os.path.join(method_dir, "evaluated_results_*.json"))
    print(f"Processing {method}: Found {len(json_files)} files")

    for filepath in json_files:
        model_name, total_score, num_questions, normalized_score = process_json_file(filepath)

        # Files that could not be processed come back without a model name
        if not model_name:
            continue

        # Create the per-model entry on first sight, then store this method's score
        results_data.setdefault(model_name, {})[method] = normalized_score

        print(f"  {model_name}: {total_score}/{total_possible_points} = {normalized_score:.4f} ({num_questions} questions)")

Processing default_essay: Found 25 files
  codestral-latest-official: 65/149 = 0.4362 (43 questions)
  gpt-4o: 97/149 = 0.6510 (43 questions)
  grok-3: 100/149 = 0.6711 (43 questions)
  groq-llama3.3-70b: 89/149 = 0.5973 (43 questions)
  claude-sonnet-4: 95/149 = 0.6376 (43 questions)
  claude-3.7-sonnet: 91/149 = 0.6107 (43 questions)
  groq-llama-guard-4: 0/149 = 0.0000 (43 questions)
  claude-3.5-haiku: 79/149 = 0.5302 (43 questions)
  gemini-2.5-pro: 100/149 = 0.6711 (43 questions)
  groq-llama-4-scout: 83/149 = 0.5570 (43 questions)
  o4-mini: 117/149 = 0.7852 (43 questions)
  gemini-2.5-flash: 100/149 = 0.6711 (43 questions)
  grok-3-mini-beta-high-effort: 91/149 = 0.6107 (41 questions)
  claude-opus-4: 89/149 = 0.5973 (43 questions)
  mistral-large-official: 78/149 = 0.5235 (43 questions)
  gpt-4.1-mini: 83/149 = 0.5570 (43 questions)
  groq-llama-4-maverick: 103/149 = 0.6913 (43 questions)
  deepseek-r1: 93/149 = 0.6242 (43 questions)
  gpt-4.1: 81/149 = 0.5436 (43 questions)
 

In [66]:
# Build the score table: one row per model, one column per method (in the
# order given by `methods`); a missing model/method combination becomes 0
df = (
    pd.DataFrame.from_dict(results_data, orient="index")
    .reindex(columns=methods)
    .fillna(0)
)

# ════════════════════════════════════════════════════════════════════════════
# Print one ranking per method, best score first
# ════════════════════════════════════════════════════════════════════════════
for method in methods:
    ordered_tbl = df[[method]].sort_values(by=method, ascending=False).round(4)

    print(f"\nResults ordered by {method}:")
    print("=" * 80)
    print(ordered_tbl.to_string())



Results ordered by default_essay:
                              default_essay
o4-mini                              0.7852
o3-mini                              0.7114
groq-llama-4-maverick                0.6913
grok-3-mini-beta-low-effort          0.6779
grok-3                               0.6711
gemini-2.5-pro                       0.6711
gemini-2.5-flash                     0.6711
gpt-4o                               0.6510
claude-sonnet-4                      0.6376
deepseek-r1                          0.6242
grok-3-mini-beta-high-effort         0.6107
claude-3.7-sonnet                    0.6107
claude-opus-4                        0.5973
groq-llama3.3-70b                    0.5973
palmyra-fin-default                  0.5839
claude-3.5-sonnet                    0.5705
groq-llama-4-scout                   0.5570
gpt-4.1-mini                         0.5570
gpt-4.1                              0.5436
claude-3.5-haiku                     0.5302
mistral-large-official               0.52