In [2]:
import os
import json
import re
import time

from datasets import Dataset
from pyprojroot import here
from pydantic import BaseModel, Field
from langchain_mistralai import ChatMistralAI
from langchain_core.rate_limiters import InMemoryRateLimiter
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_core.prompts import ChatPromptTemplate
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
# Saved evaluation run to be judged; the relative path is resolved through here().
eval_run_dir = here("evals/logs/phased_self_discover/llama/structured/few_shot_0/math/math_eval")
dataset = Dataset.load_from_disk(eval_run_dir)
dataset

Dataset({
    features: ['problem', 'level', 'type', 'solution', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 200
})

In [3]:
# Sanity check: rows without a predicted answer. Identity comparison is the
# correct test for None.
dataset.filter(lambda x: x["answer_pred"] is None)

Dataset({
    features: ['problem', 'level', 'type', 'solution', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 0
})

In [4]:
# Peek at one predicted answer (row index 2).
dataset[2]["answer_pred"]

'y^4 - 2y^3 + 7y^2 + y - 5.'

In [7]:
# Client-side throttle handed to the chat model below.
rate_limiter = InMemoryRateLimiter(
    max_bucket_size=1,
    check_every_n_seconds=1,
    requests_per_second=0.5,
)

# Judge model; every request goes through the limiter above.
llm = ChatMistralAI(rate_limiter=rate_limiter, model="mistral-large-2407")

class ReasoningState(BaseModel):
    """Structured verdict the judge model returns for one student answer."""

    # The text must be passed as `description=`: Field's first positional
    # argument is the default value, so a bare string would become the field's
    # default and never reach the schema shown to the model.
    is_correct: bool = Field(
        description="True if reasoning answer is same as the answer in the solution, False otherwise"
    )
    correction_reasoning: str = Field(
        description="Your reasoning to why you determined if your answer is True or False"
    )

# Bind the ReasoningState schema to the chat model for structured output.
structured_llm = llm.with_structured_output(ReasoningState)

In [8]:
# Judge instructions, one entry per paragraph; paragraphs are separated by a
# blank line in the final prompt.
_system_prompt_paragraphs = (
    "You are an expert mathematician who is tasked with comparing a student's reasoning and final answer with a given reference answer.",
    "You don't need to be worried about the math problem. Only if the student's final answer is correct.",
    "The final answer in the student's answer is given inside latex \\boxed command and their total reasoning is wrapped within <reasoning> tags.",
    "The reference answer is given within <reference> tags.",
    "You will provide your answer in the given schema. You will determine True if the student's final answer(within \\boxed latex) is same as the <reference> answer.",
)
system_prompt = "\n\n".join(_system_prompt_paragraphs)

# Human turn template: {reasoning} and {reference} are filled in per row.
user_message = "\n".join(
    [
        "<reasoning>",
        "{reasoning}",
        "</reasoning>",
        "",
        "<reference>",
        "{reference}",
        "</reference>",
    ]
)

In [9]:
def compare(instance):
    """Ask the judge model whether one row's reasoning matches the reference.

    Args:
        instance: A dataset row; its "reasoning" and "solution" fields are read.

    Returns:
        A dict with "is_correct" and "correction_reasoning". On success the
        values come from the model's structured response. If the call or the
        parsing of its result fails, both values are empty strings so that
        failed rows can be located afterwards.
    """
    answer = instance["solution"]
    reasoning = instance["reasoning"]

    prompt_template = ChatPromptTemplate([
        ("system", system_prompt),
        ("user", user_message)
    ])

    chain = prompt_template | structured_llm

    try:
        response = chain.invoke({"reasoning": reasoning, "reference": answer})
        # Dumping inside the try also covers a response that is missing or
        # not a ReasoningState, instead of aborting the whole batch.
        return response.model_dump()
    except Exception as exc:
        # `Exception` (not a bare except) lets KeyboardInterrupt stop the run,
        # and the message keeps the failure reason visible.
        print(f"compare failed ({type(exc).__name__}): {exc}")
        return {
            "is_correct": "",
            "correction_reasoning": ""
        }

In [None]:
batch_size = 10
output_dir = "math_processed_batches"
os.makedirs(output_dir, exist_ok=True)

# Indices of batches already on disk. Only names of the exact form
# "batch_<n>.json" count, so unrelated or oddly named files cannot crash the
# integer parsing.
saved_batches = set()
for file_name in os.listdir(output_dir):
    name_match = re.fullmatch(r"batch_(\d+)\.json", file_name)
    if name_match:
        saved_batches.add(int(name_match.group(1)))

for i in range(0, len(dataset), batch_size):
    batch_index = i // batch_size
    if batch_index in saved_batches:
        print(f"Skipping already processed batch {batch_index}")
        continue

    batch = dataset.select(range(i, min(i + batch_size, len(dataset))))
    processed_batch = batch.map(compare)

    # Save batch to JSON. Write to a temporary name first and rename once the
    # write has finished, so an interrupted run never leaves a truncated
    # batch file that a later resume would skip as "already processed".
    batch_path = os.path.join(output_dir, f"batch_{batch_index}.json")
    tmp_path = batch_path + ".tmp"
    processed_batch.to_json(tmp_path)
    os.replace(tmp_path, batch_path)
    print(f"Saved batch {batch_index} to {batch_path}")

    # Pause between batches on top of the per-request rate limiter.
    time.sleep(5)

print("Processing complete!")

# Load and calculate accuracy

In [12]:
from datasets import concatenate_datasets

In [16]:
# Collect only the per-batch files, keyed by batch index. The merged
# math.json export is written to this same directory, so loading every
# "*.json" would pull it in and duplicate rows on a re-run.
batch_files = {}
for file_name in os.listdir(output_dir):
    name_match = re.fullmatch(r"batch_(\d+)\.json", file_name)
    if name_match:
        batch_files[int(name_match.group(1))] = file_name

# os.listdir returns names in arbitrary order; load in numeric batch order so
# rows keep the order of the source dataset.
processed_dataset_list = [
    Dataset.from_json(os.path.join(output_dir, batch_files[batch_index]))
    for batch_index in sorted(batch_files)
]

processed_dataset = concatenate_datasets(processed_dataset_list)

processed_dataset

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['problem', 'level', 'type', 'solution', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred', 'is_correct', 'correction_reasoning'],
    num_rows: 200
})

In [26]:
# Persist the merged, judged dataset as a single file next to the batch files.
merged_json_path = f"{output_dir}/math.json"
processed_dataset.to_json(merged_json_path)

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

3032811

In [23]:
# Fraction of rows judged correct. The denominator follows the dataset size
# rather than assuming exactly 200 rows.
processed_dataset.filter(lambda x: x["is_correct"] is True).num_rows / processed_dataset.num_rows

0.605

In [None]:
# Rows where compare() returned its empty-string fallback, i.e. the judge call
# failed for that row.
processed_dataset.filter(lambda x: x["is_correct"] == "")

Filter:   0%|          | 0/200 [00:00<?, ? examples/s]

Dataset({
    features: ['problem', 'level', 'type', 'solution', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred', 'is_correct', 'correction_reasoning'],
    num_rows: 0
})

# Corrected results accuracy

In [12]:
# Reload the merged results file from disk.
merged_results_path = f"{output_dir}/math.json"
ds = Dataset.from_json(merged_results_path)
ds

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['problem', 'level', 'type', 'solution', 'self_discover_input', 'task_description', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred', 'is_correct', 'correction_reasoning'],
    num_rows: 200
})

In [13]:
# Fraction of reloaded rows judged correct. The denominator follows the
# dataset size rather than assuming exactly 200 rows.
ds.filter(lambda x: x["is_correct"] is True).num_rows / ds.num_rows

Filter:   0%|          | 0/200 [00:00<?, ? examples/s]

0.635