## Imports and setup

In [2]:
import concurrent.futures
import json
import os
import re

import openai

import utils

In [3]:
# Prefer the OPENAI_API_KEY environment variable so the secret is never saved
# (or committed) with the notebook; the placeholder is only a local fallback.
openai.api_key = os.environ.get('OPENAI_API_KEY', '<API_KEY>') # Or insert your API key here

## Read problems

In [4]:
def read_problems(filename: str) -> list[utils.Problem]:
    """Load a problem set from a JSON file.

    Args:
        filename: Path to a JSON file whose top-level value is a list of
            objects. Each object is expanded as keyword arguments to
            ``utils.Problem``.

    Returns:
        One ``utils.Problem`` per JSON object, in file order.
    """
    # The context manager closes the handle deterministically (the original
    # bare open() left it to the garbage collector). JSON is UTF-8 by
    # specification, so do not depend on the platform's locale encoding.
    with open(filename, encoding='utf-8') as problems_file:
        records = json.load(problems_file)
    return [utils.Problem(**record) for record in records]

# Problem set shared by the scoring cells below.
problems = read_problems(filename='data/2022/math_1.json')

## Call model

In [13]:
def get_system_message(problem_number: str) -> str:
    """Return the system prompt for the paper section a problem belongs to.

    Problems numbered up to 8 get the numerical-answer prompt, 9-14 the
    multiple-correct-options prompt, and 15-18 the matching-lists prompt.

    Args:
        problem_number: Problem number; converted with ``int()``.

    Returns:
        The prompt text for the matching section.

    Raises:
        ValueError: If the number is greater than 18 (or is not an integer).
    """
    number = int(problem_number)

    numeric_prompt = r'Answer the following math question. The answer to the question is a numerical value. If the numerical value has more than two decimal places, truncate/roundoff the value to two decimal places. State the answer as a number - do not use LaTeX. Include the final answer in a \boxed{} tag. For example, \boxed{2.33}.'

    multiple_correct_prompt = r'''Answer the following math question. Each question has FOUR options (A), (B), (C) and (D). ONE OR MORE THAN ONE of these four option(s) is(are) correct answer(s).

For each question, choose the option(s) corresponding to (all) the correct answer(s).

Include your final answer with comma-separated options in a \boxed{} tag, eg \boxed{A,C} or \boxed{B,C,D}.

Answer to each question will be evaluated according to the following marking scheme:

Full Marks : +4 ONLY if (all) the correct option(s) is(are) chosen;
Partial Marks : +3 If all the four options are correct but ONLY three options are chosen;
Partial Marks : +2 If three or more options are correct but ONLY two options are chosen, both of which are correct;
Partial Marks : +1 If two or more options are correct but ONLY one option is chosen and it is a correct option;
Zero Marks : 0 If none of the options is chosen (i.e. the question is unanswered);
Negative Marks : -2 In all other cases.
'''

    matching_lists_prompt = r'''Answer the following math question. The question is a Multiple Choice Question with Matching List Sets.

The set has TWO lists: List-I and List-II.

List-I has Four entries (I), (II), (III) and (IV) and List-II has Five entries (P), (Q),(R), (S) and (T).

FOUR options are given based on List-I and List-II and ONLY ONE of these four options satisfies the condition asked in the Multiple Choice Question.

The answer will be evaluated according to the following marking scheme: 

Full Marks :+3 ONLY if the option corresponding to the correct combination is chosen;
Zero Marks : 0 If none of the options is chosen (i.e. the question is unanswered);
Negative Marks : -1 In all other cases.

Include the final answer in a \boxed{} tag, eg \boxed{A} or \boxed{B}.
'''

    # Sections listed by the highest problem number they contain, in
    # ascending order, so the first bound that fits selects the prompt.
    sections = (
        (8, numeric_prompt),
        (14, multiple_correct_prompt),
        (18, matching_lists_prompt),
    )
    for last_number, prompt in sections:
        if number <= last_number:
            return prompt
    raise ValueError(f'Unknown problem number: {number}')

def strip_latex(s: str) -> str:
    r"""Unwrap LaTeX commands of the form ``\name{...}``, keeping the argument.

    Every command whose braces are balanced is replaced by its argument,
    including several commands side by side (``\text{A}, \text{C}`` becomes
    ``A, C``) and nested ones (``\mathbf{\text{B}}`` becomes ``B``). Text
    following a command whose brace is never closed is returned unchanged.

    Args:
        s: Text that may contain LaTeX commands.

    Returns:
        ``s`` with the command wrappers removed.
    """
    command_start = re.compile(r'\\[a-zA-Z]+\{')
    pieces = []
    position = 0
    while True:
        match = command_start.search(s, position)
        if match is None:
            pieces.append(s[position:])
            break
        # Walk forward to the brace that closes this command. A greedy regex
        # would instead run to the LAST '}' in the string and swallow any
        # later commands into this one's argument.
        depth = 1
        index = match.end()
        while index < len(s) and depth:
            if s[index] == '{':
                depth += 1
            elif s[index] == '}':
                depth -= 1
            index += 1
        if depth:
            # Unbalanced braces: leave the remainder untouched.
            pieces.append(s[position:])
            break
        pieces.append(s[position:match.start()])
        # The argument may itself contain commands, so unwrap it recursively.
        pieces.append(strip_latex(s[match.end():index - 1]))
        position = index
    return ''.join(pieces)

def strip_parens(s: str) -> str:
    """Remove every balanced pair of parentheses, keeping what they enclose.

    ``(A)`` becomes ``A`` and ``(A), (C)`` becomes ``A, C``. Unmatched
    parentheses are left in place.

    Args:
        s: Text that may contain parenthesised parts.

    Returns:
        ``s`` without its balanced parentheses.
    """
    # Match only innermost pairs (no parentheses inside) and repeat until the
    # text stops changing. A greedy ``.*`` would pair the first '(' with the
    # last ')' and leave stray parentheses in between.
    innermost_pair = re.compile(r'\(([^()]*)\)')
    previous = None
    while previous != s:
        previous = s
        s = innermost_pair.sub(r'\1', s)
    return s

def score_answer(problem: utils.Problem, answer: str | None, debug: bool = False) -> int:
    """Score an extracted answer against the marking scheme of its section.

    Sections are selected by ``problem.number``:

    * up to 8 (numerical): 3 if the answer equals the expected value, either
      as the same string or as the same number (``2.360`` matches ``2.36``);
      otherwise 0.
    * 9-14 (one or more correct options): 4 for exactly the correct set; one
      mark per chosen option when the choice is a non-empty proper subset of
      the correct set (the +3/+2/+1 partial-mark rows); 0 if nothing was
      chosen; otherwise -2.
    * 15-18 (matching lists): 3 if correct, 0 if unanswered, otherwise -1.

    Args:
        problem: The problem being scored; ``number`` and ``answer`` are read.
        answer: The model's extracted answer, or ``None`` if none was found.
        debug: When true, print a banner whenever marks are awarded.

    Returns:
        The marks awarded (may be negative).

    Raises:
        ValueError: If the problem number is greater than 18.
    """
    # TODO(mehtarishi): Set up model-based grading.
    number = int(problem.number)
    def print_debug(message):
        if debug:
            print(f'\n\n#####{message}\n{problem.number=}\n{problem.answer=}\n{answer=}\n#####')
    if number <= 8:
        if answer is None:
            return 0
        matches = problem.answer == answer
        if not matches:
            # Fall back to a numeric comparison so formatting differences
            # such as a trailing zero or stray whitespace are not penalised.
            try:
                matches = float(problem.answer) == float(answer)
            except (TypeError, ValueError):
                matches = False
        if matches:
            print_debug('Correct!')
            return 3
        return 0
    elif number <= 14:
        if answer is None:
            return 0
        answer = strip_parens(strip_latex(answer))
        expected_choices = {option.strip('() ') for option in problem.answer.split(',')} - {''}
        actual_choices = {option.strip('() ') for option in answer.split(',')} - {''}
        if not actual_choices:
            # An empty box means no option was chosen: unanswered, not wrong.
            return 0
        if expected_choices == actual_choices:
            print_debug('Correct!')
            return 4
        elif actual_choices < expected_choices:
            # Any non-empty proper subset earns one mark per chosen option.
            # Requiring exactly one missing option would wrongly give -2 for,
            # e.g., a single correct choice when three options are correct.
            print_debug('Partially correct!')
            return len(actual_choices)
        return -2
    elif number <= 18:
        # Check for a missing answer before normalising: the helpers apply a
        # regex to the text and would raise TypeError on None.
        if answer is None:
            return 0
        answer = strip_parens(strip_latex(answer)).strip()
        if not answer:
            return 0
        if problem.answer == answer:
            print_debug('Correct!')
            return 3
        return -1
    else:
        raise ValueError(f'Unknown problem number: {number}')

def extract_answer(completion: str) -> str | None:
    m = re.search(r'\\boxed{(.*)}', completion)
    if m:
        return m.group(1)
    return None

def call_api_with_retries(number, model, messages, num_retries: int = 3, debug: bool = False, max_tokens: int = 2048):
    """Call the chat completion API, retrying when a call raises.

    Args:
        number: Problem number, used only to label log output.
        model: Model name passed to the API.
        messages: Chat messages passed to the API.
        num_retries: Maximum number of attempts.
        debug: When true, print a line before each attempt.
        max_tokens: Completion token limit passed to the API.

    Returns:
        The response object returned by ``openai.ChatCompletion.create``.

    Raises:
        RuntimeError: If every attempt failed (or ``num_retries`` is not
            positive). The last underlying exception, if any, is attached as
            ``__cause__`` so the reason for the failure is not lost.
    """
    last_error = None
    for attempt in range(1, num_retries + 1):
        if debug:
            print(f'Calling API for problem {number}: {attempt}/{num_retries}...')
        try:
            return openai.ChatCompletion.create(
                model=model,
                messages=messages,
                max_tokens=max_tokens,
            )
        except Exception as e:
            # Deliberately broad: any failure is retried. Calls run in
            # parallel, so tag the log line with the problem and attempt to
            # make interleaved errors attributable.
            last_error = e
            print(f'Error calling API for problem {number} (attempt {attempt}/{num_retries}): {e}')
    raise RuntimeError('Failed to call API') from last_error

def call_model(problem: utils.Problem, model: str, debug=True) -> int:
    """Ask ``model`` to solve ``problem`` and return the marks it earns.

    Sends the section-specific system prompt plus the question text, pulls
    the boxed answer out of the first choice of the response, and scores it.

    Args:
        problem: The problem to solve and score against.
        model: Model name passed to the API.
        debug: When true, print the problem, completion, answer and score.

    Returns:
        The score computed by ``score_answer``.
    """
    messages = [
        {"role": "system", "content": get_system_message(problem.number)},
        {"role": "user", "content": problem.question},
    ]
    api_response = call_api_with_retries(problem.number, model, messages, debug=debug)

    # NOTE: these three local names appear verbatim in the debug output below
    # (via the f-string `=` specifier), so renaming them changes what is printed.
    completion = api_response.choices[0].message['content']
    answer = extract_answer(completion)
    score = score_answer(problem, answer, debug=debug)
    # TODO(mehtarishi): Persist the response so we can inspect attempts later.
    if not debug:
        return score
    print(f'\n\n\n\n\n{problem=}\ncorrect answer={problem.answer}\n{completion=}\n{answer=}\n{score=}')
    return score

# call_model(problems[0], model='gpt-3.5-turbo')

In [15]:
def score_model(model: str, debug=False) -> int:
    """Score ``model`` on every problem in the module-level ``problems`` list.

    Problems are sent concurrently from a pool of up to 10 worker threads.

    Args:
        model: Model name passed through to ``call_model``.
        debug: Passed through to ``call_model``.

    Returns:
        The sum of the per-problem scores.
    """
    # The `with` block shuts the pool down (joining its worker threads) even
    # if a call raises; otherwise each invocation would leak its threads.
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        scores = executor.map(lambda problem: call_model(problem, model=model, debug=debug), problems)
        # Consume the lazy result iterator while the pool is still open.
        return sum(scores)

# Run the full problem set against GPT-4 with per-problem debug output.
score = score_model(model='gpt-4', debug=True)
print(f'Final score: {score}')

Calling API 1/3...
Calling API 1/3...
Calling API 1/3...
Calling API 1/3...
Calling API 1/3...
Calling API 1/3...
Calling API 1/3...
Calling API 1/3...
Calling API 1/3...
Calling API 1/3...





problem=Problem(number='1', question='Considering only the principal values of the inverse trigonometric functions, the value of $\\frac{3}{2} \\cos ^{-1} \\sqrt{\\frac{2}{2+\\pi^{2}}}+\\frac{1}{4} \\sin ^{-1} \\frac{2 \\sqrt{2} \\pi}{2+\\pi^{2}}+\\tan ^{-1} \\frac{\\sqrt{2}}{\\pi}$ is', answer='2.36')
correct answer=2.36
completion="Let's first rewrite the expression:\n\n$$\\frac{3}{2} \\cos ^{-1} \\sqrt{\\frac{2}{2+\\pi^{2}}}+\\frac{1}{4} \\sin ^{-1} \\frac{2 \\sqrt{2} \\pi}{2+\\pi^{2}}+\\tan ^{-1} \\frac{\\sqrt{2}}{\\pi}$$\n\nNow we find the principal values of the inverse trigonometric functions:\n\n$$\\cos^{-1}\\sqrt{\\frac{2}{2+\\pi^2}} \\approx 1.20599$$\n$$\\sin^{-1}\\frac{2\\sqrt2\\pi}{2+\\pi^2} \\approx 0.86234$$\n$$\\tan^{-1}\\frac{\\sqrt2}{\\pi} \\approx 0.61548$$\n\nNow substitute 

Bad pipe message: %s [b'\x888\xf9A(\xaa*\xda\x0f\x11+\xee9\xfe7\xa5\xd5\xdc \x94\xfd']
Bad pipe message: %s [b'i1\x83\x9e\xcd\xf9\xa5\x9e[#\x03\xed\x04{20\xe6h\x00\x00\xf4\xc00\xc0,\xc0(\xc0$\xc0\x14\xc0\n\x00\xa5\x00\xa3\x00\xa1\x00\x9f\x00k\x00j\x00i\x00h\x009\x008\x007\x006\x00\x88\x00\x87\x00\x86\x00\x85\xc0\x19\x00\xa7\x00', b":\x00\x89\xc02\xc0.\xc0*\xc0&\xc0\x0f\xc0\x05\x00\x9d\x00=\x005\x00\x84\xc0/\xc0+\xc0'\xc0#\xc0\x13\xc0\t\x00\xa4\x00\xa2\x00\xa0\x00\x9e\x00g\x00@\x00?\x00>\x003\x002\x001\x000\x00\x9a\x00\x99\x00\x98\x00\x97\x00E\x00D\x00C\x00B\xc0\x18\x00\xa6\x00l\x004\x00\x9b\x00F\xc01\xc0-\xc0)\xc0%\xc0\x0e\xc0\x04\x00\x9c\x00<\x00/\x00\x96\x00"]
Bad pipe message: %s [b"\x81\xae\x94Q\xc5\x16'\xf1\x06\x9c\x90\xfb\xd3\x8c\xccOD\x8c \xe5H}\x98\xdcO\xa1\x82\xa2\x1b\xa5\x8c\xc3]\x13\x00\x18\xa4\xddq\x17\x9c\x83\x1865\x88k"]
Bad pipe message: %s [b'-\x1eRh\xb7[\xb8k\xe0Z\x1a\x12\x89\xf6\xf5\x8d\n\xf1\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\x

Error: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)
Calling API 2/3...
Error: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)
Calling API 2/3...





problem=Problem(number='18', question='Consider the ellipse\n\n$$\n\\frac{x^{2}}{4}+\\frac{y^{2}}{3}=1\n$$\n\nLet $\\mathrm{H}(\\alpha, 0), 0<\\alpha<2$, be a point. A straight line drawn through $H$ parallel to the $y$-axis crosses the ellipse and its auxiliary circle at points $E$ and $F$ respectively, in the first quadrant. The tangent to the ellipse at the point $E$ intersects the positive $x$-axis at a point $G$. Suppose the straight line joining $F$ and the origin makes an angle $\\phi$ with the positive $x$-axis.\n\nList-I\n(I) If $\\phi=\\frac{\\pi}{4}$, then the area of the triangle $F G H$ is\n\n(II) If $\\phi=\\frac{\\pi}{3}$, then the area of the triangle $F G H$ is\n\n(III) If $\\phi=\\frac{\\pi}{6}$, the

## Results

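Current scores (sampling temperature = 1, the API default, so results vary between runs):
* GPT-3.5 Turbo: -9 / 64
* GPT-4: -5 / 64

Note: the marking scheme implemented in `score_answer` allows at most 8×3 + 6×4 + 4×3 = 60 marks for problems 1–18, so the denominator of 64 should be double-checked.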