In [6]:
import json
from collections import defaultdict
import re

def _resolve_score(problem_id, raw_value):
    """Turn a raw ``is_correct`` field into an integer score.

    Resolution order:
      1. ``int(raw_value)`` directly (covers ints, bools and clean numeric strings).
      2. The first whitespace-separated token that parses as an integer.
      3. The integer following a ``Score:`` marker anywhere in the text.
      4. Fallback of 0, with a diagnostic message printed.

    Args:
        problem_id: Identifier used only in the diagnostic messages.
        raw_value: Value stored under ``'is_correct'`` for the problem.

    Returns:
        int: The resolved score; always defined, never carried over from a
        previously processed record.
    """
    try:
        return int(raw_value)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / list / dict; OverflowError: float infinity.
        pass

    text = raw_value if isinstance(raw_value, str) else str(raw_value)

    # Multi-token answers such as "Score: 1" or "1\nexplanation": take the
    # first token that is a valid integer.
    for token in text.split():
        try:
            return int(token)
        except ValueError:
            continue

    print(f"Error parsing problem_id {problem_id} with data: {raw_value}")
    match = re.search(r'Score:\s*(-?\d+)', text)
    if match:
        return int(match.group(1))

    print(f"No score found in problem_id {problem_id}. Assuming score of 0")
    return 0


def _accuracy_percent(counts):
    """Return correct/total as a percentage, or 0.0 when total is zero."""
    total = counts['total']
    return counts['correct'] / total * 100 if total else 0.0


def calculate_accuracy(input_file):
    """Print accuracy statistics for a JSONL file of graded problems.

    Each non-blank line must be a JSON object; the value under its first key
    is a dict providing ``'label'``, ``'level'`` and ``'is_correct'``. A
    problem counts as correct only when its resolved score equals 1.

    Three reports are printed: accuracy per "label - level" combination,
    accuracy per level, and accuracy over the whole file. Blank lines are
    skipped, and an empty file reports ``0.00% (0/0)`` instead of raising
    ``ZeroDivisionError``.

    Args:
        input_file: Path of the JSONL file to read (opened as UTF-8).

    Returns:
        None. All results are written to standard output.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``'label'``, ``'level'`` or ``'is_correct'``.
    """
    results = defaultdict(lambda: {'correct': 0, 'total': 0})
    level_results = defaultdict(lambda: {'correct': 0, 'total': 0})
    overall = {'correct': 0, 'total': 0}  # Overall accuracy counter

    with open(input_file, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                # Tolerate blank lines, e.g. a trailing newline at end of file.
                continue

            data = json.loads(line)
            problem_id = next(iter(data))  # Retrieve the problem identifier
            problem_data = data[problem_id]

            level = problem_data['level']
            label_level = f"{problem_data['label']} - {level}"
            is_correct = _resolve_score(problem_id, problem_data['is_correct'])
            gained = 1 if is_correct == 1 else 0

            # The same record feeds all three tallies.
            for bucket in (results[label_level], level_results[level], overall):
                bucket['total'] += 1
                bucket['correct'] += gained

    # Print accuracy for each label-level category
    print("Accuracy by Label and Level:")
    for category, counts in results.items():
        accuracy = _accuracy_percent(counts)
        print(f"  {category}: {accuracy:.2f}% ({counts['correct']}/{counts['total']})")

    # Print overall accuracy by level
    print("\nOverall Accuracy by Level:")
    for level, counts in level_results.items():
        accuracy = _accuracy_percent(counts)
        print(f"  {level}: {accuracy:.2f}% ({counts['correct']}/{counts['total']})")

    # Print overall accuracy for the entire dataset
    total_accuracy = _accuracy_percent(overall)
    print(f"\nOverall Accuracy for the Entire Dataset: {total_accuracy:.2f}% ({overall['correct']}/{overall['total']})")

# JSONL results file to score (one JSON object per line)
input_file_path = 'jsonl/eval/result-gpt-4-turbo-2024-04-09-solution-clean.jsonl'

# Print the label/level, per-level and overall accuracy reports for that file
calculate_accuracy(input_file=input_file_path)


Accuracy by Label and Level:
  Algebra - high school competition: 8.54% (7/82)
  NumberTheory - high school competition: 0.00% (0/4)
  Geometry - high school competition: 16.00% (4/25)
  Combinatorics - high school competition: 10.81% (4/37)
  Algebra - high school math: 88.41% (61/69)
  Geometry - high school math: 92.86% (13/14)
  PreCalculus - high school math: 76.60% (36/47)
  Trigonometry - high school math: 50.00% (1/2)
  Calculus - high school math: 100.00% (5/5)
  Series - high school math: 100.00% (1/1)
  Differential Equations - college math: 71.43% (10/14)
  Linear Algebra and Abstract Algebra - college math: 44.00% (11/25)
  Calculus and Analysis - college math: 62.50% (15/24)
  Probability - college math: 14.29% (3/21)
  Statistics - college math: 64.71% (11/17)

Overall Accuracy by Level:
  high school competition: 10.14% (15/148)
  high school math: 84.78% (117/138)
  college math: 49.50% (50/101)

Overall Accuracy for the Entire Dataset: 47.03% (182/387)


In [7]:
# JSONL results file to score (one JSON object per line)
input_file_path = 'jsonl/eval/result-gpt-4-0125-preview-solution-clean.jsonl'

# Print the label/level, per-level and overall accuracy reports for that file
calculate_accuracy(input_file=input_file_path)

Accuracy by Label and Level:
  Algebra - high school competition: 14.63% (12/82)
  NumberTheory - high school competition: 0.00% (0/4)
  Geometry - high school competition: 4.00% (1/25)
  Combinatorics - high school competition: 10.81% (4/37)
  Algebra - high school math: 71.01% (49/69)
  Geometry - high school math: 92.86% (13/14)
  PreCalculus - high school math: 74.47% (35/47)
  Trigonometry - high school math: 50.00% (1/2)
  Calculus - high school math: 80.00% (4/5)
  Series - high school math: 100.00% (1/1)
  Differential Equations - college math: 42.86% (6/14)
  Linear Algebra and Abstract Algebra - college math: 40.00% (10/25)
  Calculus and Analysis - college math: 58.33% (14/24)
  Probability - college math: 9.52% (2/21)
  Statistics - college math: 58.82% (10/17)

Overall Accuracy by Level:
  high school competition: 11.49% (17/148)
  high school math: 74.64% (103/138)
  college math: 41.58% (42/101)

Overall Accuracy for the Entire Dataset: 41.86% (162/387)


In [8]:
# JSONL results file to score (one JSON object per line)
input_file_path = 'jsonl/eval/result-gpt-4-1106-preview-solution-clean.jsonl'

# Print the label/level, per-level and overall accuracy reports for that file
calculate_accuracy(input_file=input_file_path)

Accuracy by Label and Level:
  Algebra - high school competition: 7.32% (6/82)
  NumberTheory - high school competition: 0.00% (0/4)
  Geometry - high school competition: 8.00% (2/25)
  Combinatorics - high school competition: 2.70% (1/37)
  Algebra - high school math: 47.83% (33/69)
  Geometry - high school math: 92.86% (13/14)
  PreCalculus - high school math: 65.96% (31/47)
  Trigonometry - high school math: 50.00% (1/2)
  Calculus - high school math: 100.00% (5/5)
  Series - high school math: 0.00% (0/1)
  Differential Equations - college math: 50.00% (7/14)
  Linear Algebra and Abstract Algebra - college math: 36.00% (9/25)
  Calculus and Analysis - college math: 41.67% (10/24)
  Probability - college math: 9.52% (2/21)
  Statistics - college math: 52.94% (9/17)

Overall Accuracy by Level:
  high school competition: 6.08% (9/148)
  high school math: 60.14% (83/138)
  college math: 36.63% (37/101)

Overall Accuracy for the Entire Dataset: 33.33% (129/387)


In [9]:
# JSONL results file to score (one JSON object per line)
input_file_path = 'jsonl/eval/result-gpt-4-0613-solution-clean.jsonl'

# Print the label/level, per-level and overall accuracy reports for that file
calculate_accuracy(input_file=input_file_path)

Accuracy by Label and Level:
  Algebra - high school competition: 4.88% (4/82)
  NumberTheory - high school competition: 0.00% (0/4)
  Geometry - high school competition: 4.00% (1/25)
  Combinatorics - high school competition: 8.11% (3/37)
  Algebra - high school math: 73.91% (51/69)
  Geometry - high school math: 92.86% (13/14)
  PreCalculus - high school math: 72.34% (34/47)
  Trigonometry - high school math: 50.00% (1/2)
  Calculus - high school math: 60.00% (3/5)
  Series - high school math: 100.00% (1/1)
  Differential Equations - college math: 42.86% (6/14)
  Linear Algebra and Abstract Algebra - college math: 44.00% (11/25)
  Calculus and Analysis - college math: 45.83% (11/24)
  Probability - college math: 0.00% (0/21)
  Statistics - college math: 29.41% (5/17)

Overall Accuracy by Level:
  high school competition: 5.41% (8/148)
  high school math: 74.64% (103/138)
  college math: 32.67% (33/101)

Overall Accuracy for the Entire Dataset: 37.21% (144/387)


In [10]:
# JSONL results file to score (one JSON object per line)
input_file_path = 'jsonl/eval/result-gpt-3.5-turbo-0125-solution-clean.jsonl'

# Print the label/level, per-level and overall accuracy reports for that file
calculate_accuracy(input_file=input_file_path)

Accuracy by Label and Level:
  Algebra - high school competition: 2.44% (2/82)
  NumberTheory - high school competition: 0.00% (0/4)
  Geometry - high school competition: 4.00% (1/25)
  Combinatorics - high school competition: 0.00% (0/37)
  Algebra - high school math: 39.13% (27/69)
  Geometry - high school math: 71.43% (10/14)
  PreCalculus - high school math: 34.04% (16/47)
  Trigonometry - high school math: 50.00% (1/2)
  Calculus - high school math: 40.00% (2/5)
  Series - high school math: 100.00% (1/1)
  Differential Equations - college math: 28.57% (4/14)
  Linear Algebra and Abstract Algebra - college math: 16.00% (4/25)
  Calculus and Analysis - college math: 16.67% (4/24)
  Probability - college math: 9.52% (2/21)
  Statistics - college math: 11.76% (2/17)

Overall Accuracy by Level:
  high school competition: 2.03% (3/148)
  high school math: 41.30% (57/138)
  college math: 15.84% (16/101)

Overall Accuracy for the Entire Dataset: 19.64% (76/387)


In [11]:
# JSONL results file to score (one JSON object per line)
input_file_path = 'jsonl/eval/result-llama3-70b-solution-clean.jsonl'

# Print the label/level, per-level and overall accuracy reports for that file
calculate_accuracy(input_file=input_file_path)

Accuracy by Label and Level:
  Algebra - high school competition: 8.54% (7/82)
  NumberTheory - high school competition: 25.00% (1/4)
  Geometry - high school competition: 8.00% (2/25)
  Combinatorics - high school competition: 10.81% (4/37)
  Algebra - high school math: 44.93% (31/69)
  Geometry - high school math: 71.43% (10/14)
  PreCalculus - high school math: 55.32% (26/47)
  Trigonometry - high school math: 50.00% (1/2)
  Calculus - high school math: 80.00% (4/5)
  Series - high school math: 0.00% (0/1)
  Differential Equations - college math: 50.00% (7/14)
  Linear Algebra and Abstract Algebra - college math: 24.00% (6/25)
  Calculus and Analysis - college math: 20.83% (5/24)
  Probability - college math: 0.00% (0/21)
  Statistics - college math: 23.53% (4/17)

Overall Accuracy by Level:
  high school competition: 9.46% (14/148)
  high school math: 52.17% (72/138)
  college math: 21.78% (22/101)

Overall Accuracy for the Entire Dataset: 27.91% (108/387)


In [13]:
# JSONL results file to score (one JSON object per line)
input_file_path = 'jsonl/eval/result-dbrx-instruct-solution-clean.jsonl'

# Print the label/level, per-level and overall accuracy reports for that file
calculate_accuracy(input_file=input_file_path)

Accuracy by Label and Level:
  Algebra - high school competition: 4.88% (4/82)
  NumberTheory - high school competition: 0.00% (0/4)
  Geometry - high school competition: 20.00% (5/25)
  Combinatorics - high school competition: 8.11% (3/37)
  Algebra - high school math: 39.13% (27/69)
  Geometry - high school math: 57.14% (8/14)
  PreCalculus - high school math: 40.43% (19/47)
  Trigonometry - high school math: 50.00% (1/2)
  Calculus - high school math: 60.00% (3/5)
  Series - high school math: 100.00% (1/1)
  Differential Equations - college math: 28.57% (4/14)
  Linear Algebra and Abstract Algebra - college math: 28.00% (7/25)
  Calculus and Analysis - college math: 12.50% (3/24)
  Probability - college math: 4.76% (1/21)
  Statistics - college math: 35.29% (6/17)

Overall Accuracy by Level:
  high school competition: 8.11% (12/148)
  high school math: 42.75% (59/138)
  college math: 20.79% (21/101)

Overall Accuracy for the Entire Dataset: 23.77% (92/387)
