In [22]:
class MathPrompts:
    """Namespace holding the prompt text used when asking the model to solve math word problems."""

    # Instruction prompt passed as the first argument to callGPT by the
    # evaluation helpers in this notebook. It asks for step-by-step prose with
    # <<calculation=result>> annotations and a last line of the form
    # "#### <answer>"; that "####" marker is what extract_final_answer looks for.
    # NOTE: the literal keeps the leading indentation of every line (no dedent
    # is applied here), so editing whitespace changes the prompt text.
    REGULAR_PROMPT = """
        You are an expert math tutor. When given a word problem, solve it following these exact requirements:
        Present your solution as a sequence of logical steps
        Write in clear, complete sentences
        Show EVERY calculation inside double angle brackets with an equals sign: <<calculation=result>>
        After each calculation, state the result in a descriptive sentence
        End with the final answer preceded by four hash symbols (####)
        Include relevant units in all answers
        Do not use bullet points or numbered lists in your answer

        For example:
        Given: "Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?"
        Your response should be exactly in this format:
        Natalia sold 48/2 = <<48/2=24>>24 clips in May.
        Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.
        #### 72

        Now solve the math word problem provided, following this exact format.
    """


In [23]:
import re
import sys

import pandas as pd

sys.path.append('..')
from src.utils import initialize_openai_model, callGPT

In [24]:
def extract_final_answer(response):
    """Extract the numeric final answer that follows the '####' marker.

    The number may carry a leading minus sign, a dollar sign, thousands
    separators and a decimal part (e.g. ``#### 72``, ``#### $1,250``,
    ``#### -3``, ``#### 7.5``). Anything after the number, such as units,
    is ignored. The first ``####`` that is followed by a number is used.

    Args:
        response: Text to search. Non-string values (e.g. ``None``) are
            treated as containing no answer.

    Returns:
        An ``int`` when the answer is a whole number, a ``float`` when it has
        a non-zero fractional part, or ``None`` when no answer is found.
    """
    if not isinstance(response, str):
        return None

    match = re.search(
        r'####\s*(?P<sign>-?)\s*\$?\s*(?P<number>\d[\d,]*(?:\.\d+)?)',
        response,
    )
    if match is None:
        return None

    # Thousands separators ("1,000") are formatting only; drop them before parsing.
    digits = match.group('number').replace(',', '')
    text = match.group('sign') + digits

    if '.' not in digits:
        # Pure integer: parse directly so large values never pass through float.
        return int(text)

    value = float(text)
    # Keep whole-number answers as int so "24.0" still compares equal to 24
    # and callers that expect an int keep getting one.
    return int(value) if value.is_integer() else value

In [26]:
import sys
sys.path.append("..")
from src.data_utils import CoTDataset

In [9]:
# Load the two benchmarks used in this notebook. Each CoTDataset call prints
# "Loaded dataset ..." / "Processed ..." messages (see the cell output below).
# The two assignments are kept separate so math_ds stays bound even if loading
# the second dataset fails.
math_ds = CoTDataset("openai/gsm8k")  # math word problems; used by the evaluation cells
commonsense_ds = CoTDataset("tau/commonsense_qa")  # not used by the cells visible here

Loaded dataset 'openai/gsm8k'.
Processed 'openai/gsm8k' dataset.
Loaded dataset 'tau/commonsense_qa'.
Processed 'tau/commonsense_qa' dataset.


In [None]:
def evaluate_single_question(dataset, client, index=0):
    """Evaluate the GPT model on a single question and print a report.

    Args:
        dataset: Indexable collection of samples. Each sample is a mapping
            with a 'question' entry and an 'answer' entry; the expected
            answer is parsed from 'answer' with extract_final_answer.
        client: Client object forwarded unchanged to callGPT.
        index: Position of the sample in ``dataset`` to evaluate. Defaults to
            0, which matches the previous behaviour of always using the first
            sample.

    Returns:
        None. The problem, the expected answer, the raw response, the
        extracted answer and the verdict are printed.
    """
    sample = dataset[index]
    problem = sample['question']
    expected_answer = extract_final_answer(sample['answer'])

    gpt_response = callGPT(MathPrompts.REGULAR_PROMPT, problem, client)

    if gpt_response is None:
        print("Error: Failed to get a response from GPT-4.")
        return

    gpt_answer = extract_final_answer(gpt_response)

    print("Problem:", problem)
    print("Expected Answer:", expected_answer)
    print("GPT's Response:", gpt_response)
    print("GPT's Extracted Answer:", gpt_answer)

    if expected_answer is None:
        # Without a reference answer nothing can be graded. Previously a failed
        # extraction on both sides compared None == None and reported success.
        print("Could not grade: no expected answer could be extracted.")
    elif gpt_answer is not None and gpt_answer == expected_answer:
        print("The answer is correct!")
    else:
        print("The answer is incorrect.")

# Create the API client used for this spot check.
client = initialize_openai_model()

# evaluate_single_question reads element 0 of whatever it is given, so sample
# 238 is wrapped in a one-element list.
evaluate_single_question(dataset=[math_ds[238]], client=client)

Problem: Jack is running a bake sale to help pay for his basketball team's uniforms. He's already sold 4 brownies for $3 each and 5 lemon squares for $2 each. If Jack's goal is to make $50 and he sells cookies for $4 each, how many cookies does he need to sell to reach his goal?
Expected Answer: 7
GPT's Response: Jack has sold 4 brownies for $3 each. To find out how much money he made from the brownies, I will calculate the total earnings from the brownies. 

First, I can calculate the total earnings from the brownies as follows: 
4 brownies * $3 per brownie = <<4*3=12>>12 dollars from brownies. 
Jack made $12 from selling brownies. 

Next, Jack has sold 5 lemon squares for $2 each. I will calculate the total earnings from the lemon squares now: 
5 lemon squares * $2 per lemon square = <<5*2=10>>10 dollars from lemon squares. 
Jack made $10 from selling lemon squares. 

Now I will find the total earnings Jack has made so far by adding his earnings from brownies and lemon squares: 
$12 

In [None]:
def evaluate_100_questions(dataset, client, num_questions=100):
    """Evaluate the GPT model on the first ``num_questions`` questions.

    Questions whose expected answer cannot be parsed, or for which callGPT
    returns None, are skipped. Skipped questions are excluded from the
    accuracy so that they are not silently counted as wrong answers.

    Args:
        dataset: Sized, indexable collection of samples. Each sample is a
            mapping with a 'question' entry and an 'answer' entry.
        client: Client object forwarded unchanged to callGPT.
        num_questions: Maximum number of questions to attempt, taken from the
            start of ``dataset``. Defaults to 100.

    Returns:
        pandas.DataFrame with one row per evaluated question and the columns
        'problem', 'expected_answer', 'gpt_response', 'gpt_answer' and
        'is_correct'. The columns are present even when no question was
        evaluated.
    """
    columns = ['problem', 'expected_answer', 'gpt_response', 'gpt_answer', 'is_correct']
    total = min(len(dataset), num_questions)
    correct = 0
    results = []

    for i in range(total):
        sample = dataset[i]
        problem = sample['question']
        expected_answer = extract_final_answer(sample['answer'])

        if expected_answer is None:
            print(f"Skipping question {i}: Invalid expected answer format.")
            continue

        gpt_response = callGPT(MathPrompts.REGULAR_PROMPT, problem, client)
        if gpt_response is None:
            print(f"Skipping question {i}: API error.")
            continue

        gpt_answer = extract_final_answer(gpt_response)

        # expected_answer is never None here, so a failed extraction
        # (gpt_answer is None) is always graded as incorrect.
        is_correct = (gpt_answer == expected_answer)
        if is_correct:
            correct += 1

        results.append({
            'problem': problem,
            'expected_answer': expected_answer,
            'gpt_response': gpt_response,
            'gpt_answer': gpt_answer,
            'is_correct': is_correct
        })

    # Only graded questions enter the denominator; guard against an empty
    # dataset (or every question being skipped) to avoid ZeroDivisionError.
    evaluated = len(results)
    accuracy = correct / evaluated * 100 if evaluated else 0.0
    print(f"\nTotal questions: {total}, Correct answers: {correct}, Accuracy: {accuracy:.2f}%")

    skipped = total - evaluated
    if skipped:
        print(f"Skipped questions: {skipped} (excluded from the accuracy above).")

    # Passing the columns explicitly keeps the frame's schema stable when empty.
    results_df = pd.DataFrame(results, columns=columns)
    return results_df


In [None]:
# Re-create the client so this cell does not depend on the spot-check cell
# having been run first.
client = initialize_openai_model()

# Run the batch evaluation on the GSM8K samples and keep the per-question rows.
results_df = evaluate_100_questions(dataset=math_ds, client=client)

# Preview the first five rows.
results_df.head(n=5)


Total questions: 100, Correct answers: 87, Accuracy: 87.00%


Unnamed: 0,problem,expected_answer,gpt_response,gpt_answer,is_correct
0,Natalia sold clips to 48 of her friends in Apr...,72,Natalia sold clips to 48 of her friends in Apr...,72.0,True
1,Weng earns $12 an hour for babysitting. Yester...,10,Weng earns $12 for every hour she babysits. Fi...,10.0,True
2,Betty is saving money for a new wallet which c...,5,Betty needs a total of $100 for the wallet. Sh...,5.0,True
3,"Julie is reading a 120-page book. Yesterday, s...",42,Julie read 12 pages yesterday. Since she read ...,42.0,True
4,James writes a 3-page letter to 2 different fr...,624,James writes 3 pages per letter. Since he writ...,624.0,True
