In [None]:
from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv())



True

In [None]:
from openai import OpenAI
client = OpenAI()

response = client.chat.completions.create(
    model = 'gpt-5-mini',
    messages = [
        {'role': 'system', 'content' : 'You are a helpful assistant !'},
        {'role': 'user', 'content': 'What is the capital of India ?'}
    ]
)

print(response.choices[0].message)
print(response.choices[0].message.content)


ChatCompletionMessage(content='The capital of India is New Delhi.', refusal=None, role='assistant', annotations=[], audio=None, function_call=None, tool_calls=None)
The capital of India is New Delhi.


In [5]:
## with this we can unify all providers

from litellm import completion
response = completion(
    model = 'gpt-5-mini',
    messages = [{'role' : 'user', 'content' : 'Hello !' }]
)

print(response.choices[0].message.content)

Hello! How can I help you today?


In [None]:
from litellm import completion

response1 = completion(
        model = 'gpt-5-mini',
    messages = [{'role' : 'user', 'content':'My name is Akhil'}]
)

response2 = completion(
        model = 'gpt-5-mini',
    messages = [{'role' : 'user', 'content':'what\'s my name'}]
)

print(response1.choices[0].message.content)
print(response2.choices[0].message.content)

### This proves that each LLM call is independent. Our Model doesn't have memory

Nice to meet you, Akhil — how can I help you today?
I don't know — I don't have access to personal details unless you tell me. What would you like me to call you in this chat? (I can use that name for this conversation, but I can't remember it across separate sessions unless you set it in your app/profile.)


In [8]:
### Managing conversation history


from litellm import completion

## Maintain a messages object
messages = []

## append your message/conversation
messages.append({'role':'user', 'content':'My name is Akhil and I am going to be the Future of AI'})
response3 = completion(model = 'gpt-5-mini', messages = messages)

print(response3.choices[0].message.content)

## append the message from assistant
messages.append({'role':'assistant', 'content':response3.choices[0].message.content})

## write a new message
messages.append({'role':'user', 'content':'who am i'})
response4 = completion(model = 'gpt-5-mini', messages = messages)

print(response4.choices[0].message.content)




Great — love the ambition, Akhil. If you want to be “the future of AI,” I can help you get there. How would you like me to help right now? (Pick one: roadmap, project ideas, resume/LinkedIn copy, interview prep, or a 12‑month actionable plan.)

Below are a few immediately useful things you can use or ask me to expand.

Quick elevator pitch / LinkedIn headline
- Headline: Akhil — Building safe, scalable AI that augments human creativity and solves real-world problems
- 1‑line pitch: “I build trustworthy AI systems that turn complex data into products people love — with a focus on safety, scalability, and real-world impact.”

High‑level skills to prioritize
- Foundations: probability, linear algebra, optimization
- Core ML: supervised learning, neural networks, transfer learning, transformers
- Systems & infra: PyTorch/TensorFlow, Docker, Kubernetes, model serving, MLOps
- Specialized: LLMs, RL, generative models, multimodal models (vision+language)
- Soft skills: product sense, communic

In [None]:
### Structured output

from pydantic import BaseModel
from litellm import completion

class ExtractedInfo(BaseModel):
    name  : str
    email : str
    phone : str | None = None

response = completion(
    model="gpt-5-mini",
    messages=[{
        "role": "user", 
        "content": "My name is Akhil, my email is akhil.masters21@gmail.com, and my phone is 9550303420."
    }],
    response_format=ExtractedInfo
)

print(response.choices[0].message.content)

{"name":"Akhil","email":"akhil.masters21@gmail.com","phone":"9550303420"}


In [None]:
### Asynchronus calls

import asyncio
from litellm import acompletion
async def get_response(prompt: str) -> str:
    response = await acompletion(
        model = 'gpt-5-mini',
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content
    
prompts = [
    "What is 2 + 2?",
    "What is the capital of Japan?",
    "Who wrote Romeo and Juliet?"
]

### here 
## tasks = [get_response(What is 2 + 2?), get_response(What is the capital of Japan?)] 
## doesnt run the function, it just creates a coroutine object. Thats the difference in async.
## functions are called in gather step

tasks = [get_response(p) for p in prompts]
results = await asyncio.gather(*tasks)

for prompt, result in zip(prompts, results):
      print(f"Q: {prompt}")
      print(f"A: {result}\n")


Q: What is 2 + 2?
A: 2 + 2 = 4.

Q: What is the capital of Japan?
A: The capital of Japan is Tokyo.

Q: Who wrote Romeo and Juliet?
A: Romeo and Juliet was written by William Shakespeare. It was likely written and first performed in the mid-1590s (published in 1597).



In [14]:
### rate limiting queries
semaphore = asyncio.Semaphore(10)

async def call_llm(prompt : str) -> str:
    async with semaphore:
        response = await acompletion(
            model="gpt-5-mini",
            messages=[{"role": "user", "content": prompt}],
            num_retries=3  # Automatic retry with exponential backoff
        )
        return response.choices[0].message.content
prompts = [f"What is {i} + {i}?" for i in range(20)]
tasks = [call_llm(p) for p in prompts]
results = await asyncio.gather(*tasks, return_exceptions=True)


for prompt, result in zip(prompts, results):
      print(f"Q: {prompt}")
      print(f"A: {result}\n")


Q: What is 0 + 0?
A: 0

Because adding zero to zero yields zero.

Q: What is 1 + 1?
A: 1 + 1 = 2.

Q: What is 2 + 2?
A: 2 + 2 = 4.

Q: What is 3 + 3?
A: 3 + 3 = 6.

Q: What is 4 + 4?
A: 8

Q: What is 5 + 5?
A: 10

Q: What is 6 + 6?
A: 12

Q: What is 7 + 7?
A: 14

Q: What is 8 + 8?
A: 16

Q: What is 9 + 9?
A: 18

Q: What is 10 + 10?
A: 10 + 10 = 20

Q: What is 11 + 11?
A: 22

Q: What is 12 + 12?
A: 24

Q: What is 13 + 13?
A: 26

Q: What is 14 + 14?
A: 28

Q: What is 15 + 15?
A: 30

Q: What is 16 + 16?
A: 32

Q: What is 17 + 17?
A: 34

Q: What is 18 + 18?
A: 36

Q: What is 19 + 19?
A: 38



In [16]:
## loading the GAIA dataset

from datasets import load_dataset
level1_problems = load_dataset("gaia-benchmark/GAIA", "2023_level1", split="validation")
print(f"Number of Level 1 problems: {len(level1_problems)}")


Generating test split: 100%|██████████| 93/93 [00:00<00:00, 1653.78 examples/s]
Generating validation split: 100%|██████████| 53/53 [00:00<00:00, 32022.20 examples/s]

Number of Level 1 problems: 53





In [17]:
level1_problems[1]

{'task_id': '8e867cd7-cff9-4e6c-867a-ff5ddc2550be',
 'Question': 'How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.',
 'Level': '1',
 'Final answer': '3',
 'file_name': '',
 'file_path': '',
 'Annotator Metadata': {'Steps': '1. I did a search for Mercedes Sosa\n2. I went to the Wikipedia page for her\n3. I scrolled down to "Studio albums"\n4. I counted the ones between 2000 and 2009',
  'Number of steps': '4',
  'How long did this take?': '5 minutes',
  'Tools': '1. web browser\n2. google search',
  'Number of tools': '2'}}

In [19]:
## defining a respose for gaia
from pydantic import BaseModel
from tqdm.asyncio import tqdm
gaia_prompt = """You are a general AI assistant. I will ask you a question.
First, determine if you can solve this problem with your current capabilities and set "is_solvable" accordingly.
If you can solve it, set "is_solvable" to true and provide your answer in "final_answer".
If you cannot solve it, set "is_solvable" to false and explain why in "unsolvable_reason".
Your final answer should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
If you are asked for a comma separated list, apply the above rules depending on whether the element is a number or a string."""

class GaiaOutput(BaseModel):
    is_solvable: bool
    unsolvable_reason: str = ""
    final_answer: str = ""

PROVIDER_SEMAPHORES = {'openai': asyncio.Semaphore(30), 'anthropic': asyncio.Semaphore(10)}

def get_provider(model: str) -> str:
    return "anthropic" if model.startswith("anthropic/") else "openai"


async def solve_problem(model: str, question: str) -> GaiaOutput:
    provider = get_provider(model)
    async with PROVIDER_SEMAPHORES[provider]:
        response = await acompletion(
            model = model,
            messages=[
                {"role": "system", "content": gaia_prompt},
                {"role": "user", "content": question},
            ],
            response_format=GaiaOutput,
            num_retries=2,
        )
        finish_reason = response.choices[0].finish_reason
        content = response.choices[0].message.content
        if finish_reason == "refusal" or content is None:
            return GaiaOutput(
                is_solvable=False,
                unsolvable_reason=f"Model refused to answer (finish_reason: {finish_reason})",
                final_answer=""
            )
        return GaiaOutput.model_validate_json(content)

def is_correct(prediction: str | None, answer: str) -> bool:
    """Check exact match between prediction and answer (case-insensitive)."""
    if prediction is None:
        return False
    return prediction.strip().lower() == answer.strip().lower()

async def evaluate_gaia_single(problem: dict, model: str) -> dict:
    """Evaluate a single problem-model pair and return result."""
    try:
        output = await solve_problem(model, problem["Question"])
        return {
            "task_id": problem["task_id"],
            "model": model,
            "correct": is_correct(output.final_answer, problem["Final answer"]),
            "is_solvable": output.is_solvable,
            "prediction": output.final_answer,
            "answer": problem["Final answer"],
            "unsolvable_reason": output.unsolvable_reason,
        }
    except Exception as e:
        return {
            "task_id": problem["task_id"],
            "model": model,
            "correct": False,
            "is_solvable": None,
            "prediction": None,
            "answer": problem["Final answer"],
            "error": str(e),
        }

async def run_experiment(
    problems: list[dict],
    models: list[str],
) -> dict[str, list]:
    """Evaluate all models on all problems."""
    tasks = [
        evaluate_gaia_single(problem, model)
        for problem in problems
        for model in models
    ]
    
    all_results = await tqdm.gather(*tasks)
    
    # Group results by model
    results = {model: [] for model in models}
    for result in all_results:
        results[result["model"]].append(result)
    
    return results

MODELS = [
    "gpt-5",
    "gpt-5-mini"
]
 
subset = level1_problems.select(range(20))
results = await run_experiment(subset, MODELS)

100%|██████████| 40/40 [02:23<00:00,  3.58s/it]


In [20]:
results

{'gpt-5': [{'task_id': 'e1fc63a2-da7a-432f-be78-7c4a95598703',
   'model': 'gpt-5',
   'correct': True,
   'is_solvable': True,
   'prediction': '17',
   'answer': '17',
   'unsolvable_reason': ''},
  {'task_id': '8e867cd7-cff9-4e6c-867a-ff5ddc2550be',
   'model': 'gpt-5',
   'correct': False,
   'is_solvable': True,
   'prediction': '4',
   'answer': '3',
   'unsolvable_reason': ''},
  {'task_id': 'ec09fa32-d03f-4bf8-84b0-1f16922c3ae4',
   'model': 'gpt-5',
   'correct': True,
   'is_solvable': True,
   'prediction': '3',
   'answer': '3',
   'unsolvable_reason': ''},
  {'task_id': '5d0080cb-90d7-4712-bc33-848150e917d3',
   'model': 'gpt-5',
   'correct': False,
   'is_solvable': False,
   'prediction': '',
   'answer': '0.1777',
   'unsolvable_reason': 'I don’t have access to the specific paper text or its figures and can’t browse to retrieve the exact calculated volume.'},
  {'task_id': 'a1e91b78-d3d8-4675-bb8d-62741b4b68a6',
   'model': 'gpt-5',
   'correct': False,
   'is_solvable

## Tool Usage