In [1]:
import json
from collections import defaultdict
import re

def _parse_is_correct(raw_value, problem_id):
    """Turn a raw ``is_correct`` field into an integer score.

    Strategies are tried in order and the first one that succeeds wins:

    1. ``int(raw_value)`` directly (ints, bools, clean strings such as ``"1"``).
    2. If the text contains a space or newline, the first whitespace-separated
       token that parses as an integer.
    3. The integer following a ``Score:`` marker anywhere in the text.
    4. Otherwise ``0``, after printing a diagnostic.

    Args:
        raw_value: The value stored under ``'is_correct'`` for one problem.
        problem_id: Identifier used only in diagnostic messages.

    Returns:
        int: The parsed score. A value is always returned, so a malformed
        record can never inherit the score of the record before it.
    """
    try:
        return int(raw_value)
    except (TypeError, ValueError, OverflowError):
        pass

    # Values int() rejects outright (e.g. None) are searched via their text
    # form instead of aborting the whole run.
    text = raw_value if isinstance(raw_value, str) else str(raw_value)

    if '\n' in text or ' ' in text:
        for token in text.split():
            try:
                return int(token)
            except ValueError:
                continue
    else:
        print(f"Error parsing problem_id {problem_id} with data: {text}")

    # Reached when no bare integer token exists: look for an explicit marker.
    match = re.search(r'Score:\s*(-?\d+)', text)
    if match:
        return int(match.group(1))

    print(f"No score found in problem_id {problem_id}. Assuming score of 0")
    return 0


def _accuracy_percent(counts):
    """Return ``correct / total`` as a percentage, or 0.0 when total is 0."""
    if not counts['total']:
        return 0.0
    return counts['correct'] / counts['total'] * 100


def calculate_accuracy(input_file):
    """Print accuracy by label+level, by level, and overall for a JSONL file.

    Each non-blank line must be a JSON object whose first key is the problem
    identifier, mapping to a dict with ``'label'``, ``'level'`` and
    ``'is_correct'`` entries. A problem counts as correct only when its parsed
    score equals 1.

    Args:
        input_file: Path to the JSONL results file (read as UTF-8).

    Returns:
        None. The report is written to stdout.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``'label'``, ``'level'`` or ``'is_correct'``.
    """
    results = defaultdict(lambda: {'correct': 0, 'total': 0})
    level_results = defaultdict(lambda: {'correct': 0, 'total': 0})
    overall = {'correct': 0, 'total': 0}  # Overall accuracy counter

    with open(input_file, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                # Tolerate blank or trailing empty lines in the JSONL file.
                continue

            data = json.loads(line)
            problem_id = next(iter(data))  # Retrieve the problem identifier
            problem_data = data[problem_id]

            label_level = f"{problem_data['label']} - {problem_data['level']}"
            level = problem_data['level']
            is_correct = _parse_is_correct(problem_data['is_correct'], problem_id)

            # The same record feeds all three tallies.
            for counts in (results[label_level], level_results[level], overall):
                counts['total'] += 1
                if is_correct == 1:
                    counts['correct'] += 1

    # Print accuracy for each label-level category
    print("Accuracy by Label and Level:")
    for category, counts in results.items():
        accuracy = _accuracy_percent(counts)
        print(f"  {category}: {accuracy:.2f}% ({counts['correct']}/{counts['total']})")

    # Print overall accuracy by level
    print("\nOverall Accuracy by Level:")
    for level, counts in level_results.items():
        accuracy = _accuracy_percent(counts)
        print(f"  {level}: {accuracy:.2f}% ({counts['correct']}/{counts['total']})")

    # Print overall accuracy for the entire dataset (0/0 for an empty file)
    total_accuracy = _accuracy_percent(overall)
    print(f"\nOverall Accuracy for the Entire Dataset: {total_accuracy:.2f}% ({overall['correct']}/{overall['total']})")

# Results file to score (JSONL, one problem record per line). The file name
# suggests the gpt-4-turbo-2024-04-09 "solution-clean" run. The path is
# relative, so it resolves against the kernel's working directory.
input_file_path = 'jsonl/eval/result-gpt-4-turbo-2024-04-09-solution-clean.jsonl'

# Print accuracy by label+level, by level, and overall for this file.
calculate_accuracy(input_file_path)


Accuracy by Label and Level:
  Algebra - high school competition: 8.54% (7/82)
  NumberTheory - high school competition: 0.00% (0/4)
  Geometry - high school competition: 16.00% (4/25)
  Combinatorics - high school competition: 10.81% (4/37)
  Algebra - high school math: 88.41% (61/69)
  Geometry - high school math: 92.86% (13/14)
  PreCalculus - high school math: 76.60% (36/47)
  Trigonometry - high school math: 50.00% (1/2)
  Calculus - high school math: 100.00% (5/5)
  Series - high school math: 100.00% (1/1)
  Differential Equations - college math: 71.43% (10/14)
  Linear Algebra and Abstract Algebra - college math: 44.00% (11/25)
  Calculus and Analysis - college math: 62.50% (15/24)
  Probability - college math: 14.29% (3/21)
  Statistics - college math: 64.71% (11/17)

Overall Accuracy by Level:
  high school competition: 10.14% (15/148)
  high school math: 84.78% (117/138)
  college math: 49.50% (50/101)

Overall Accuracy for the Entire Dataset: 47.03% (182/387)


In [3]:
# Results file to score (JSONL, one problem record per line). The file name
# suggests the deepseek-v3-Instruct "solution-clean" run.
# NOTE(review): absolute, machine-specific path; other cells use the relative
# form 'jsonl/eval/...'. Confirm and switch for portability.
input_file_path = '/Users/elricwan/Downloads/NetmindAI/odyssey-math/jsonl/eval/result-deepseek-v3-Instruct-solution-clean.jsonl'

# Print accuracy by label+level, by level, and overall for this file.
calculate_accuracy(input_file_path)

Accuracy by Label and Level:
  Algebra - high school competition: 26.83% (22/82)
  NumberTheory - high school competition: 0.00% (0/4)
  Geometry - high school competition: 20.00% (5/25)
  Combinatorics - high school competition: 16.22% (6/37)
  Algebra - high school math: 94.20% (65/69)
  Geometry - high school math: 92.86% (13/14)
  PreCalculus - high school math: 72.34% (34/47)
  Trigonometry - high school math: 100.00% (2/2)
  Calculus - high school math: 80.00% (4/5)
  Series - high school math: 100.00% (1/1)
  Differential Equations - college math: 78.57% (11/14)
  Linear Algebra and Abstract Algebra - college math: 80.00% (20/25)
  Calculus and Analysis - college math: 70.83% (17/24)
  Probability - college math: 42.86% (9/21)
  Statistics - college math: 82.35% (14/17)

Overall Accuracy by Level:
  high school competition: 22.30% (33/148)
  high school math: 86.23% (119/138)
  college math: 70.30% (71/101)

Overall Accuracy for the Entire Dataset: 57.62% (223/387)


In [3]:
# Results file to score (JSONL, one problem record per line). The file name
# suggests the gpt-3.5-turbo-0125 "solution-clean-new" run.
# NOTE(review): absolute, machine-specific path; other cells use the relative
# form 'jsonl/eval/...'. Confirm and switch for portability.
input_file_path = '/root/Xiangpeng/odyssey-math/jsonl/eval/result-gpt-3.5-turbo-0125-solution-clean-new.jsonl'

# Print accuracy by label+level, by level, and overall for this file.
calculate_accuracy(input_file_path)

Accuracy by Label and Level:
  Algebra - high school competition: 3.66% (3/82)
  NumberTheory - high school competition: 0.00% (0/4)
  Geometry - high school competition: 8.00% (2/25)
  Combinatorics - high school competition: 0.00% (0/37)
  Algebra - high school math: 39.13% (27/69)
  Geometry - high school math: 50.00% (7/14)
  PreCalculus - high school math: 34.04% (16/47)
  Trigonometry - high school math: 50.00% (1/2)
  Calculus - high school math: 40.00% (2/5)
  Series - high school math: 100.00% (1/1)
  Differential Equations - college math: 35.71% (5/14)
  Linear Algebra and Abstract Algebra - college math: 12.00% (3/25)
  Calculus and Analysis - college math: 16.67% (4/24)
  Probability - college math: 0.00% (0/21)
  Statistics - college math: 29.41% (5/17)

Overall Accuracy by Level:
  high school competition: 3.38% (5/148)
  high school math: 39.13% (54/138)
  college math: 16.83% (17/101)

Overall Accuracy for the Entire Dataset: 19.64% (76/387)


In [4]:
# Results file to score (JSONL, one problem record per line). The file name
# suggests the gpt-4-o1-preview "solution-clean" run. The path is relative,
# so it resolves against the kernel's working directory.
input_file_path = 'jsonl/eval/result-gpt-4-o1-preview-solution-clean.jsonl'

# Print accuracy by label+level, by level, and overall for this file.
calculate_accuracy(input_file_path)

Accuracy by Label and Level:
  Algebra - high school competition: 51.22% (42/82)
  NumberTheory - high school competition: 75.00% (3/4)
  Geometry - high school competition: 56.00% (14/25)
  Combinatorics - high school competition: 21.62% (8/37)
  Algebra - high school math: 81.16% (56/69)
  Geometry - high school math: 92.86% (13/14)
  PreCalculus - high school math: 74.47% (35/47)
  Trigonometry - high school math: 50.00% (1/2)
  Calculus - high school math: 80.00% (4/5)
  Series - high school math: 100.00% (1/1)
  Differential Equations - college math: 71.43% (10/14)
  Linear Algebra and Abstract Algebra - college math: 92.00% (23/25)
  Calculus and Analysis - college math: 79.17% (19/24)
  Probability - college math: 52.38% (11/21)
  Statistics - college math: 70.59% (12/17)

Overall Accuracy by Level:
  high school competition: 45.27% (67/148)
  high school math: 79.71% (110/138)
  college math: 74.26% (75/101)

Overall Accuracy for the Entire Dataset: 65.12% (252/387)


In [4]:
# Results file to score (JSONL, one problem record per line). The file name
# suggests the gpt-4-turbo-2024-04-09 "second-clean-new" run.
# NOTE(review): absolute, machine-specific path; other cells use the relative
# form 'jsonl/eval/...'. Confirm and switch for portability.
input_file_path = '/root/Xiangpeng/odyssey-math/jsonl/eval/result-gpt-4-turbo-2024-04-09-second-clean-new.jsonl'

# Print accuracy by label+level, by level, and overall for this file.
calculate_accuracy(input_file_path)

Accuracy by Label and Level:
  Algebra - high school competition: 10.98% (9/82)
  NumberTheory - high school competition: 0.00% (0/4)
  Geometry - high school competition: 8.00% (2/25)
  Combinatorics - high school competition: 13.51% (5/37)
  Algebra - high school math: 85.51% (59/69)
  Geometry - high school math: 78.57% (11/14)
  PreCalculus - high school math: 82.98% (39/47)
  Trigonometry - high school math: 100.00% (2/2)
  Calculus - high school math: 100.00% (5/5)
  Series - high school math: 100.00% (1/1)
  Differential Equations - college math: 64.29% (9/14)
  Linear Algebra and Abstract Algebra - college math: 52.00% (13/25)
  Calculus and Analysis - college math: 79.17% (19/24)
  Probability - college math: 14.29% (3/21)
  Statistics - college math: 58.82% (10/17)

Overall Accuracy by Level:
  high school competition: 10.81% (16/148)
  high school math: 84.78% (117/138)
  college math: 53.47% (54/101)

Overall Accuracy for the Entire Dataset: 48.32% (187/387)


In [2]:
# Results file to score (JSONL, one problem record per line). The file name
# suggests the gpt-4-turbo-2024-04-09 "clean-new" run.
# NOTE(review): absolute, machine-specific path; other cells use the relative
# form 'jsonl/eval/...'. Confirm and switch for portability.
input_file_path = '/root/Xiangpeng/odyssey-math/jsonl/eval/result-gpt-4-turbo-2024-04-09-clean-new.jsonl'

# Print accuracy by label+level, by level, and overall for this file.
calculate_accuracy(input_file_path)

Accuracy by Label and Level:
  Algebra - high school competition: 12.20% (10/82)
  NumberTheory - high school competition: 0.00% (0/4)
  Geometry - high school competition: 4.00% (1/25)
  Combinatorics - high school competition: 13.51% (5/37)
  Algebra - high school math: 85.51% (59/69)
  Geometry - high school math: 85.71% (12/14)
  PreCalculus - high school math: 80.85% (38/47)
  Trigonometry - high school math: 100.00% (2/2)
  Calculus - high school math: 80.00% (4/5)
  Series - high school math: 100.00% (1/1)
  Differential Equations - college math: 64.29% (9/14)
  Linear Algebra and Abstract Algebra - college math: 72.00% (18/25)
  Calculus and Analysis - college math: 70.83% (17/24)
  Probability - college math: 23.81% (5/21)
  Statistics - college math: 58.82% (10/17)

Overall Accuracy by Level:
  high school competition: 10.81% (16/148)
  high school math: 84.06% (116/138)
  college math: 58.42% (59/101)

Overall Accuracy for the Entire Dataset: 49.35% (191/387)


In [6]:
# Results file to score (JSONL, one problem record per line). The file name
# suggests the gpt-4-0125-preview "solution-clean-new" run.
# NOTE(review): absolute, machine-specific path; other cells use the relative
# form 'jsonl/eval/...'. Confirm and switch for portability.
input_file_path = '/root/Xiangpeng/odyssey-math/jsonl/eval/result-gpt-4-0125-preview-solution-clean-new.jsonl'

# Print accuracy by label+level, by level, and overall for this file.
calculate_accuracy(input_file_path)

Accuracy by Label and Level:
  Algebra - high school competition: 6.10% (5/82)
  NumberTheory - high school competition: 0.00% (0/4)
  Geometry - high school competition: 4.00% (1/25)
  Combinatorics - high school competition: 5.41% (2/37)
  Algebra - high school math: 88.41% (61/69)
  Geometry - high school math: 92.86% (13/14)
  PreCalculus - high school math: 82.98% (39/47)
  Trigonometry - high school math: 50.00% (1/2)
  Calculus - high school math: 60.00% (3/5)
  Series - high school math: 100.00% (1/1)
  Differential Equations - college math: 42.86% (6/14)
  Linear Algebra and Abstract Algebra - college math: 40.00% (10/25)
  Calculus and Analysis - college math: 58.33% (14/24)
  Probability - college math: 19.05% (4/21)
  Statistics - college math: 64.71% (11/17)

Overall Accuracy by Level:
  high school competition: 5.41% (8/148)
  high school math: 85.51% (118/138)
  college math: 44.55% (45/101)

Overall Accuracy for the Entire Dataset: 44.19% (171/387)


In [5]:
# Results file to score (JSONL, one problem record per line). The file name
# suggests the gpt-4-1106-preview "solution-clean-new" run.
# NOTE(review): absolute, machine-specific path; other cells use the relative
# form 'jsonl/eval/...'. Confirm and switch for portability.
input_file_path = '/root/Xiangpeng/odyssey-math/jsonl/eval/result-gpt-4-1106-preview-solution-clean-new.jsonl'

# Print accuracy by label+level, by level, and overall for this file.
calculate_accuracy(input_file_path)

Accuracy by Label and Level:
  Algebra - high school competition: 8.54% (7/82)
  NumberTheory - high school competition: 0.00% (0/4)
  Geometry - high school competition: 8.00% (2/25)
  Combinatorics - high school competition: 5.41% (2/37)
  Algebra - high school math: 76.81% (53/69)
  Geometry - high school math: 92.86% (13/14)
  PreCalculus - high school math: 51.06% (24/47)
  Trigonometry - high school math: 50.00% (1/2)
  Calculus - high school math: 40.00% (2/5)
  Series - high school math: 100.00% (1/1)
  Differential Equations - college math: 57.14% (8/14)
  Linear Algebra and Abstract Algebra - college math: 40.00% (10/25)
  Calculus and Analysis - college math: 50.00% (12/24)
  Probability - college math: 9.52% (2/21)
  Statistics - college math: 47.06% (8/17)

Overall Accuracy by Level:
  high school competition: 7.43% (11/148)
  high school math: 68.12% (94/138)
  college math: 39.60% (40/101)

Overall Accuracy for the Entire Dataset: 37.47% (145/387)


In [3]:
# Results file to score (JSONL, one problem record per line). The file name
# suggests the gpt-4-0613 "solution-clean-last" run. The path is relative,
# so it resolves against the kernel's working directory.
input_file_path = 'jsonl/eval/result-gpt-4-0613-solution-clean-last.jsonl'

# Print accuracy by label+level, by level, and overall for this file.
calculate_accuracy(input_file_path)

Accuracy by Label and Level:
  Algebra - high school competition: 9.76% (8/82)
  NumberTheory - high school competition: 0.00% (0/4)
  Geometry - high school competition: 4.00% (1/25)
  Combinatorics - high school competition: 10.81% (4/37)
  Algebra - high school math: 78.26% (54/69)
  Geometry - high school math: 85.71% (12/14)
  PreCalculus - high school math: 68.09% (32/47)
  Trigonometry - high school math: 50.00% (1/2)
  Calculus - high school math: 20.00% (1/5)
  Series - high school math: 100.00% (1/1)
  Differential Equations - college math: 64.29% (9/14)
  Linear Algebra and Abstract Algebra - college math: 20.00% (5/25)
  Calculus and Analysis - college math: 33.33% (8/24)
  Probability - college math: 0.00% (0/21)
  Statistics - college math: 17.65% (3/17)

Overall Accuracy by Level:
  high school competition: 8.78% (13/148)
  high school math: 73.19% (101/138)
  college math: 24.75% (25/101)

Overall Accuracy for the Entire Dataset: 35.92% (139/387)


In [4]:
# Results file to score (JSONL, one problem record per line).
# NOTE(review): the file name 'result-output.jsonl' does not identify which
# model or run produced it; record that here once confirmed.
input_file_path = 'jsonl/eval/result-output.jsonl'

# Print accuracy by label+level, by level, and overall for this file.
calculate_accuracy(input_file_path)

Accuracy by Label and Level:
  Algebra - high school competition: 7.32% (6/82)
  NumberTheory - high school competition: 0.00% (0/4)
  Geometry - high school competition: 12.00% (3/25)
  Combinatorics - high school competition: 2.70% (1/37)
  Algebra - high school math: 65.22% (45/69)
  Geometry - high school math: 78.57% (11/14)
  PreCalculus - high school math: 63.83% (30/47)
  Trigonometry - high school math: 100.00% (2/2)
  Calculus - high school math: 40.00% (2/5)
  Series - high school math: 100.00% (1/1)
  Differential Equations - college math: 78.57% (11/14)
  Linear Algebra and Abstract Algebra - college math: 24.00% (6/25)
  Calculus and Analysis - college math: 33.33% (8/24)
  Probability - college math: 14.29% (3/21)
  Statistics - college math: 47.06% (8/17)

Overall Accuracy by Level:
  high school competition: 6.76% (10/148)
  high school math: 65.94% (91/138)
  college math: 35.64% (36/101)

Overall Accuracy for the Entire Dataset: 35.40% (137/387)


In [2]:
# Results file to score (JSONL, one problem record per line).
# NOTE(review): the file name 'result-output-original.jsonl' does not identify
# which model or run produced it; record that here once confirmed.
input_file_path = 'jsonl/eval/result-output-original.jsonl'

# Print accuracy by label+level, by level, and overall for this file.
calculate_accuracy(input_file_path)

Accuracy by Label and Level:
  Algebra - high school competition: 8.54% (7/82)
  NumberTheory - high school competition: 25.00% (1/4)
  Geometry - high school competition: 16.00% (4/25)
  Combinatorics - high school competition: 8.11% (3/37)
  Algebra - high school math: 44.93% (31/69)
  Geometry - high school math: 78.57% (11/14)
  PreCalculus - high school math: 53.19% (25/47)
  Trigonometry - high school math: 50.00% (1/2)
  Calculus - high school math: 80.00% (4/5)
  Series - high school math: 0.00% (0/1)
  Differential Equations - college math: 57.14% (8/14)
  Linear Algebra and Abstract Algebra - college math: 28.00% (7/25)
  Calculus and Analysis - college math: 20.83% (5/24)
  Probability - college math: 4.76% (1/21)
  Statistics - college math: 23.53% (4/17)

Overall Accuracy by Level:
  high school competition: 10.14% (15/148)
  high school math: 52.17% (72/138)
  college math: 24.75% (25/101)

Overall Accuracy for the Entire Dataset: 28.94% (112/387)


In [2]:
# Results file to score (JSONL, one problem record per line).
# NOTE(review): the file name 'result-output-last.jsonl' does not identify
# which model or run produced it; record that here once confirmed.
input_file_path = 'jsonl/eval/result-output-last.jsonl'

# Print accuracy by label+level, by level, and overall for this file.
calculate_accuracy(input_file_path)

Accuracy by Label and Level:
  Algebra - high school competition: 8.54% (7/82)
  NumberTheory - high school competition: 0.00% (0/4)
  Geometry - high school competition: 8.00% (2/25)
  Combinatorics - high school competition: 5.41% (2/37)
  Algebra - high school math: 72.46% (50/69)
  Geometry - high school math: 85.71% (12/14)
  PreCalculus - high school math: 63.83% (30/47)
  Trigonometry - high school math: 100.00% (2/2)
  Calculus - high school math: 80.00% (4/5)
  Series - high school math: 100.00% (1/1)
  Differential Equations - college math: 42.86% (6/14)
  Linear Algebra and Abstract Algebra - college math: 36.00% (9/25)
  Calculus and Analysis - college math: 41.67% (10/24)
  Probability - college math: 14.29% (3/21)
  Statistics - college math: 17.65% (3/17)

Overall Accuracy by Level:
  high school competition: 7.43% (11/148)
  high school math: 71.74% (99/138)
  college math: 30.69% (31/101)

Overall Accuracy for the Entire Dataset: 36.43% (141/387)


In [2]:
# Results file to score (JSONL, one problem record per line). The file name
# suggests the gpt-4-0125-preview "solution-clean" run. The path is relative,
# so it resolves against the kernel's working directory.
input_file_path = 'jsonl/eval/result-gpt-4-0125-preview-solution-clean.jsonl'

# Print accuracy by label+level, by level, and overall for this file.
calculate_accuracy(input_file_path)

Accuracy by Label and Level:
  Algebra - high school competition: 14.63% (12/82)
  NumberTheory - high school competition: 0.00% (0/4)
  Geometry - high school competition: 4.00% (1/25)
  Combinatorics - high school competition: 10.81% (4/37)
  Algebra - high school math: 71.01% (49/69)
  Geometry - high school math: 92.86% (13/14)
  PreCalculus - high school math: 74.47% (35/47)
  Trigonometry - high school math: 50.00% (1/2)
  Calculus - high school math: 80.00% (4/5)
  Series - high school math: 100.00% (1/1)
  Differential Equations - college math: 42.86% (6/14)
  Linear Algebra and Abstract Algebra - college math: 40.00% (10/25)
  Calculus and Analysis - college math: 58.33% (14/24)
  Probability - college math: 9.52% (2/21)
  Statistics - college math: 58.82% (10/17)

Overall Accuracy by Level:
  high school competition: 11.49% (17/148)
  high school math: 74.64% (103/138)
  college math: 41.58% (42/101)

Overall Accuracy for the Entire Dataset: 41.86% (162/387)


In [8]:
# Results file to score (JSONL, one problem record per line). The file name
# suggests the gpt-4-1106-preview "solution-clean" run. The path is relative,
# so it resolves against the kernel's working directory.
input_file_path = 'jsonl/eval/result-gpt-4-1106-preview-solution-clean.jsonl'

# Print accuracy by label+level, by level, and overall for this file.
calculate_accuracy(input_file_path)

Accuracy by Label and Level:
  Algebra - high school competition: 7.32% (6/82)
  NumberTheory - high school competition: 0.00% (0/4)
  Geometry - high school competition: 8.00% (2/25)
  Combinatorics - high school competition: 2.70% (1/37)
  Algebra - high school math: 47.83% (33/69)
  Geometry - high school math: 92.86% (13/14)
  PreCalculus - high school math: 65.96% (31/47)
  Trigonometry - high school math: 50.00% (1/2)
  Calculus - high school math: 100.00% (5/5)
  Series - high school math: 0.00% (0/1)
  Differential Equations - college math: 50.00% (7/14)
  Linear Algebra and Abstract Algebra - college math: 36.00% (9/25)
  Calculus and Analysis - college math: 41.67% (10/24)
  Probability - college math: 9.52% (2/21)
  Statistics - college math: 52.94% (9/17)

Overall Accuracy by Level:
  high school competition: 6.08% (9/148)
  high school math: 60.14% (83/138)
  college math: 36.63% (37/101)

Overall Accuracy for the Entire Dataset: 33.33% (129/387)


In [25]:
# Results file to score (JSONL, one problem record per line). The file name
# suggests the gpt-4-0613 "solution-clean" run. The path is relative, so it
# resolves against the kernel's working directory.
input_file_path = 'jsonl/eval/result-gpt-4-0613-solution-clean.jsonl'

# Print accuracy by label+level, by level, and overall for this file.
calculate_accuracy(input_file_path)

Accuracy by Label and Level:
  Algebra - high school competition: 4.88% (4/82)
  NumberTheory - high school competition: 0.00% (0/4)
  Geometry - high school competition: 4.00% (1/25)
  Combinatorics - high school competition: 8.11% (3/37)
  Algebra - high school math: 78.26% (54/69)
  Geometry - high school math: 85.71% (12/14)
  PreCalculus - high school math: 74.47% (35/47)
  Trigonometry - high school math: 50.00% (1/2)
  Calculus - high school math: 60.00% (3/5)
  Series - high school math: 0.00% (0/1)
  Differential Equations - college math: 35.71% (5/14)
  Linear Algebra and Abstract Algebra - college math: 32.00% (8/25)
  Calculus and Analysis - college math: 37.50% (9/24)
  Probability - college math: 0.00% (0/21)
  Statistics - college math: 35.29% (6/17)

Overall Accuracy by Level:
  high school competition: 5.41% (8/148)
  high school math: 76.09% (105/138)
  college math: 27.72% (28/101)

Overall Accuracy for the Entire Dataset: 36.43% (141/387)


In [10]:
# Results file to score (JSONL, one problem record per line). The file name
# suggests the gpt-3.5-turbo-0125 "solution-clean" run. The path is relative,
# so it resolves against the kernel's working directory.
input_file_path = 'jsonl/eval/result-gpt-3.5-turbo-0125-solution-clean.jsonl'

# Print accuracy by label+level, by level, and overall for this file.
calculate_accuracy(input_file_path)

Accuracy by Label and Level:
  Algebra - high school competition: 2.44% (2/82)
  NumberTheory - high school competition: 0.00% (0/4)
  Geometry - high school competition: 4.00% (1/25)
  Combinatorics - high school competition: 0.00% (0/37)
  Algebra - high school math: 39.13% (27/69)
  Geometry - high school math: 71.43% (10/14)
  PreCalculus - high school math: 34.04% (16/47)
  Trigonometry - high school math: 50.00% (1/2)
  Calculus - high school math: 40.00% (2/5)
  Series - high school math: 100.00% (1/1)
  Differential Equations - college math: 28.57% (4/14)
  Linear Algebra and Abstract Algebra - college math: 16.00% (4/25)
  Calculus and Analysis - college math: 16.67% (4/24)
  Probability - college math: 9.52% (2/21)
  Statistics - college math: 11.76% (2/17)

Overall Accuracy by Level:
  high school competition: 2.03% (3/148)
  high school math: 41.30% (57/138)
  college math: 15.84% (16/101)

Overall Accuracy for the Entire Dataset: 19.64% (76/387)


In [11]:
# Results file to score (JSONL, one problem record per line). The file name
# suggests the llama3-70b "solution-clean" run. The path is relative, so it
# resolves against the kernel's working directory.
input_file_path = 'jsonl/eval/result-llama3-70b-solution-clean.jsonl'

# Print accuracy by label+level, by level, and overall for this file.
calculate_accuracy(input_file_path)

Accuracy by Label and Level:
  Algebra - high school competition: 8.54% (7/82)
  NumberTheory - high school competition: 25.00% (1/4)
  Geometry - high school competition: 8.00% (2/25)
  Combinatorics - high school competition: 10.81% (4/37)
  Algebra - high school math: 44.93% (31/69)
  Geometry - high school math: 71.43% (10/14)
  PreCalculus - high school math: 55.32% (26/47)
  Trigonometry - high school math: 50.00% (1/2)
  Calculus - high school math: 80.00% (4/5)
  Series - high school math: 0.00% (0/1)
  Differential Equations - college math: 50.00% (7/14)
  Linear Algebra and Abstract Algebra - college math: 24.00% (6/25)
  Calculus and Analysis - college math: 20.83% (5/24)
  Probability - college math: 0.00% (0/21)
  Statistics - college math: 23.53% (4/17)

Overall Accuracy by Level:
  high school competition: 9.46% (14/148)
  high school math: 52.17% (72/138)
  college math: 21.78% (22/101)

Overall Accuracy for the Entire Dataset: 27.91% (108/387)


In [13]:
# Results file to score (JSONL, one problem record per line). The file name
# suggests the dbrx-instruct "solution-clean" run. The path is relative, so
# it resolves against the kernel's working directory.
input_file_path = 'jsonl/eval/result-dbrx-instruct-solution-clean.jsonl'

# Print accuracy by label+level, by level, and overall for this file.
calculate_accuracy(input_file_path)

Accuracy by Label and Level:
  Algebra - high school competition: 4.88% (4/82)
  NumberTheory - high school competition: 0.00% (0/4)
  Geometry - high school competition: 20.00% (5/25)
  Combinatorics - high school competition: 8.11% (3/37)
  Algebra - high school math: 39.13% (27/69)
  Geometry - high school math: 57.14% (8/14)
  PreCalculus - high school math: 40.43% (19/47)
  Trigonometry - high school math: 50.00% (1/2)
  Calculus - high school math: 60.00% (3/5)
  Series - high school math: 100.00% (1/1)
  Differential Equations - college math: 28.57% (4/14)
  Linear Algebra and Abstract Algebra - college math: 28.00% (7/25)
  Calculus and Analysis - college math: 12.50% (3/24)
  Probability - college math: 4.76% (1/21)
  Statistics - college math: 35.29% (6/17)

Overall Accuracy by Level:
  high school competition: 8.11% (12/148)
  high school math: 42.75% (59/138)
  college math: 20.79% (21/101)

Overall Accuracy for the Entire Dataset: 23.77% (92/387)


In [23]:
import json
import pandas as pd

def calculate_accuracy_by_category(jsonl_file_path, txt_file_path):
    """Compute the mean ``is_correct`` score for each question category.

    The JSONL results are joined to a tab-separated metadata file on the
    numeric problem id, then ``is_correct`` (cast to int) is averaged per
    ``Category``.

    Args:
        jsonl_file_path: Path to a JSONL file. Each non-blank line is a JSON
            object mapping ``"Problem_<n>"`` to a dict that contains an
            ``is_correct`` entry convertible with ``int``.
        txt_file_path: Path to a header-less, tab-separated file whose three
            columns are read as ``Problem_ID``, ``Level`` and ``Category``.

    Returns:
        pandas.Series: Unnamed float Series indexed by ``Category`` holding
        the mean of the integer ``is_correct`` values in each category.

    Raises:
        ValueError: If an ``is_correct`` value or a problem id suffix cannot
            be converted to ``int``.

    Note:
        The merge is an inner join, so problems present in only one of the
        two files are dropped from the result without any message.
    """
    # Load the JSONL file into a list of dictionaries
    def load_jsonl(file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            # Skip blank lines so stray empty lines do not crash json.loads.
            return [json.loads(line) for line in f if line.strip()]

    # Load the TXT file into a DataFrame
    def load_txt(file_path):
        return pd.read_csv(file_path, sep='\t', header=None, names=['Problem_ID', 'Level', 'Category'])

    # Calculate accuracy based on categories. Named differently from the
    # notebook's top-level calculate_accuracy to avoid confusing the two.
    def accuracy_per_category(jsonl_data, txt_data):
        # Flatten {problem_id: fields} records into one row per problem
        jsonl_df = pd.DataFrame([{'Problem_ID': k, **v} for d in jsonl_data for k, v in d.items()])
        # 'Problem_' is a literal prefix; regex=False pins that regardless of
        # the pandas version's default for str.replace.
        jsonl_df['Problem_ID'] = jsonl_df['Problem_ID'].str.replace('Problem_', '', regex=False).astype(int)

        # Merge the two DataFrames on Problem_ID (inner join)
        merged_df = pd.merge(jsonl_df, txt_data, on='Problem_ID')

        # Group only the score column. The former groupby(...).apply(lambda)
        # also passed the grouping column to the lambda, which pandas
        # deprecates with a warning.
        scores = merged_df['is_correct'].astype(int)
        accuracy = scores.groupby(merged_df['Category']).mean()
        accuracy.name = None  # keep the unnamed Series the apply() form returned

        return accuracy

    # Load the data
    jsonl_data = load_jsonl(jsonl_file_path)
    txt_data = load_txt(txt_file_path)

    # Calculate and return accuracy
    return accuracy_per_category(jsonl_data, txt_data)

# Example usage: mean is_correct per question category for one results file.
# NOTE(review): jsonl_file_path is an absolute, machine-specific path while
# txt_file_path is relative to the kernel's working directory. Confirm both
# resolve before running.
jsonl_file_path = '/root/Xiangpeng/odyssey-math/jsonl/eval/result-dbrx-instruct-solution-clean.jsonl'
# Header-less tab-separated file, read as Problem_ID / Level / Category columns.
txt_file_path = 'jsonl/processed.txt'

# Series indexed by Category; values are fractions, not percentages.
accuracy_results = calculate_accuracy_by_category(jsonl_file_path, txt_file_path)

print("Accuracy by Category:")
print(accuracy_results)


Accuracy by Category:
Category
Multiple Choice    0.460938
Open Question      0.098361
True-False         0.600000
dtype: float64


  accuracy = merged_df.groupby('Category').apply(lambda x: x['is_correct'].astype(int).mean())
