In [9]:
from dotenv import load_dotenv, find_dotenv
from tqdm.asyncio import tqdm_asyncio

In [10]:
load_dotenv(find_dotenv())

True

In [11]:
from openai import OpenAI

load_dotenv(override=True)

client = OpenAI()

In [12]:
import os

openai_api_key = os.getenv('OPENAI_API_KEY')

if openai_api_key:
    print(f"OpenAI API Key exists and begins {openai_api_key[:8]}")
else:
    print("OpenAI API Key not set - please head to the troubleshooting guide in the setup folder")

OpenAI API Key exists and begins sk-proj-


In [13]:
openai = OpenAI()

In [14]:
messages = [{"role": "user", "content": "What is 2+2?"}]

In [15]:
from openai import OpenAI

client = OpenAI()

response = client.chat.completions.create(
    model="gpt-5-mini",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of France?"}
    ]
)

In [16]:
print(response.choices[0].message.content)

The capital of France is Paris.


In [17]:
from anthropic import Anthropic

client2 = Anthropic()

In [18]:
response2 = client2.messages.create(
    model="claude-sonnet-4-5",
    max_tokens=1024,
    messages=[
        {"role": "user", "content": "What is the capital of France?"}
    ]
)

In [19]:
print(response2.content[0].text)

The capital of France is Paris.


In [20]:
from litellm import completion

messages = []
# First exchange
messages.append({"role": "user", "content": "My name is Sajjad."})
response1 = completion(model="gpt-5-mini", messages=messages)
assistant_message1 = response1.choices[0].message.content
messages.append({"role": "assistant", "content": assistant_message1})
print(assistant_message1)
# Second exchange - includes previous conversation history
messages.append({"role": "user", "content": "What is my name?"})
response2 = completion(model="gpt-5-mini", messages=messages)
assistant_message2 = response2.choices[0].message.content
print(assistant_message2)

You told me your name is Sajjad.


In [21]:
from pydantic import BaseModel
from litellm import completion

In [22]:
class ExtractedInfo(BaseModel):
    name: str
    email: str
    phone: str | None = None

In [23]:
response = completion(
    model="gpt-5-mini",
    messages=[{
        "role": "user",
        "content": "My name is Hamad Bin Abu Bakar and my email is hamadba@gmail.com and my phone is 1234-5678",
    }],
    response_format=ExtractedInfo
)

result = response.choices[0].message.content
print(result)

{"name":"Hamad Bin Abu Bakar","email":"hamadba@gmail.com","phone":"1234-5678"}


In [24]:
import asyncio
from litellm import acompletion


async def get_response(prompt: str) -> str:
    response = await acompletion(
        model="gpt-5-mini",
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content


prompts = [
    "whats is 2 + 2?",
    "what is the capital of France?",
    "Who wrote the Romeo and Juliet?"
]

tasks = [get_response(p) for p in prompts]

results = await asyncio.gather(*tasks)

for prompt, result in zip(prompts, results):
    print(f"Q: {prompt}")
    print(f"A: {result}")

semaphore = asyncio.Semaphore(10)


async def call_llm(prompt: str) -> str:
    """LLM call with rate limiting and automatic retry"""
    async with semaphore:
        response = await acompletion(
            model="gpt-5-mini",
            messages=[{"role": "user", "content": prompt}],
            num_retries=3,  # automatic retry with exponential backuff
        )

        return response.choices[0].message.content


# Even with 100 concurrent tasks, only 10 API calls run at a time
prompts = [f"Whats is {i} + {i} ? " for i in range(100)]
results = await asyncio.gather(*tasks, return_exceptions=True)

Q: whats is 2 + 2?
A: 2 + 2 = 4.
Q: what is the capital of France?
A: Paris.
Q: Who wrote the Romeo and Juliet?
A: Romeo and Juliet was written by William Shakespeare. It’s a tragic play likely written and first performed in the mid-1590s (c. 1595–1597), based on earlier Italian stories.


In [25]:
from huggingface_hub import login

login(token=os.getenv("HF_TOKEN"))

from datasets import load_dataset

level1_problems = load_dataset("gaia-benchmark/GAIA", "2023_level1",
                               split="validation")
print(f"Number of Level 1 problems: {len(level1_problems)}")

Number of Level 1 problems: 53


DEFINING THE RESPONSE FORMAT

In [26]:
from pydantic import BaseModel


class GaiaOutput(BaseModel):
    is_solvable: bool
    unsolvable_reason: str = ""
    final_answer: str = ""

Managing API rate limits

In [60]:
PROVIDER_SEMAPHORES = {
    "openai": asyncio.Semaphore(30),
    "anthropic": asyncio.Semaphore(10),
}


def get_provider(model: str) -> str:
    """Extract provider name from model"""
    return "anthropic" if model.startswith("anthropic/") else "openai"

In [52]:
gaia_prompt = """You are a general AI assistant. I will ask you a question.
First, determine if you can solve this problem with your current capabilities and
set "is_solvable" accordingly.
If you can solve it, set "is_solvable" to true and provide your answer in
"final_answer".
If you cannot solve it, set "is_solvable" to false and explain why in
"unsolvable_reason".
Your final answer should be a number OR as few words as possible OR a comma
separated list of numbers and/or strings.
If you are asked for a number, don't use comma to write your number neither
use units such as $ or percent sign unless specified otherwise.
If you are asked for a string, don't use articles, neither abbreviations (e.g. for
cities), and write the digits in plain text unless specified otherwise.
If you are asked for a comma separated list, apply the above rules depending on
whether the element is a number or a string."""

In [68]:
# defined somewhere ABOVE this function
gaia_prompt = """You are a careful problem-solving assistant.
Return ONLY valid JSON matching the GaiaOutput schema.
"""

async def solve_problem(model: str, question: str) -> GaiaOutput:
    provider = get_provider(model)

    async with PROVIDER_SEMAPHORE[provider]:
        response = await acompletion(
            model=model,
            messages=[
                {"role": "system", "content": gaia_prompt},
                {"role": "user", "content": question},
            ],
            response_format=GaiaOutput,
            num_retries=2,
        )

    finish_reason = response.choices[0].finish_reason
    content = response.choices[0].message.content

    if finish_reason == "refusal" or content is None:
        return GaiaOutput(
            is_solvable=False,
            unsolvable_reason=f"Model refused (finish_reason={finish_reason})",
            final_answer="",
        )

    return GaiaOutput.model_validate_json(content)

In [63]:
def is_correct(prediction: str | None, answer: str) -> bool:
    """Check exact match between prediction and answer(case insensitive"""
    if prediction is None:
        return False
    return prediction.strip().lower() == answer.strip().lower()

In [72]:
async def evaluate_gaia_single(problem: dict, model: str) -> dict:
    try:
        output = await solve_problem(model, problem["Question"])

        gold_answer = (
            problem.get("Final Answer")
            or problem.get("final_answer")
            or problem.get("answer")
            or problem.get("Answer")
        )

        if gold_answer is None:
            raise KeyError("No gold answer field found in problem")

        return {
            "task_id": problem.get("task_id"),
            "model": model,
            "correct": is_correct(output.final_answer, gold_answer),
            "is_solvable": output.is_solvable,
            "prediction": output.final_answer,
            "answer": gold_answer,
            "unsolvable_reason": output.unsolvable_reason,
        }

    except Exception as e:
        return {
            "task_id": problem.get("task_id"),
            "model": model,
            "correct": False,
            "is_solvable": False,
            "prediction": "",
            "answer": (
                problem.get("Final Answer")
                or problem.get("final_answer")
                or problem.get("answer")
                or problem.get("Answer")
            ),
            "error": f"{type(e).__name__}: {e}",
        }

In [73]:
from tqdm.asyncio import tqdm_asyncio

async def run_experiment(
        problems: list[dict],
        models: list[str],
) -> dict[str, str]:
    """Evaluate all models on all problems"""
    tasks = [
        evaluate_gaia_single(problem, model)
        for problem in problems
        for model in models
    ]

    all_results = await tqdm_asyncio.gather(*tasks)

    # Group results by model
    results = {model: [] for model in models}

    for result in all_results:
        results[result["model"]].append(result)

    return results


In [74]:
MODELS = [
    "gpt-5",
    "gpt-5-mini",
    "anthropic/claude-sonnet-4-5",
    "anthropic/claude-haiku-4-5"
]

subset = level1_problems.select(range(20)).to_list()
results = await run_experiment(subset, MODELS)




In [75]:
print(results)

{'gpt-5': [{'task_id': 'e1fc63a2-da7a-432f-be78-7c4a95598703', 'model': 'gpt-5', 'correct': False, 'is_solvable': False, 'prediction': '', 'answer': None, 'error': "KeyError: 'No gold answer field found in problem'"}, {'task_id': '8e867cd7-cff9-4e6c-867a-ff5ddc2550be', 'model': 'gpt-5', 'correct': False, 'is_solvable': False, 'prediction': '', 'answer': None, 'error': "KeyError: 'No gold answer field found in problem'"}, {'task_id': 'ec09fa32-d03f-4bf8-84b0-1f16922c3ae4', 'model': 'gpt-5', 'correct': False, 'is_solvable': False, 'prediction': '', 'answer': None, 'error': "KeyError: 'No gold answer field found in problem'"}, {'task_id': '5d0080cb-90d7-4712-bc33-848150e917d3', 'model': 'gpt-5', 'correct': False, 'is_solvable': False, 'prediction': '', 'answer': None, 'error': "KeyError: 'No gold answer field found in problem'"}, {'task_id': 'a1e91b78-d3d8-4675-bb8d-62741b4b68a6', 'model': 'gpt-5', 'correct': False, 'is_solvable': False, 'prediction': '', 'answer': None, 'error': "KeyErro