In [1]:
import datasets

# Load the "2023_all" configuration of the gaia-benchmark/GAIA dataset,
# then keep only its "validation" split for the evaluation below.
gaia_splits = datasets.load_dataset("gaia-benchmark/GAIA", "2023_all")
eval_ds = gaia_splits["validation"]

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Substrings that identify the GAIA validation questions used in this comparison.
# A row is kept when its "Question" text contains at least one of them.
to_keep = [
    "What's the last line of the rhyme under the flavor",
    'Of the authors (First M. Last) that worked on the paper "Pie Menus or Linear Menus',
    "In Series 9, Episode 11 of Doctor Who, the Doctor is trapped inside an ever-shifting maze. What is this location called in the official script for the episode? Give the setting exactly as it appears in the first scene heading.",
    "Which contributor to the version of OpenCV where support was added for the Mask-RCNN model has the same name as a former Chinese head of government when the names are transliterated to the Latin alphabet?",
    "The photograph in the Whitney Museum of American Art's collection with accession number 2022.128 shows a person holding a book. Which military unit did the author of this book join in 1813? Answer without using articles.",
    "I went to Virtue restaurant & bar in Chicago for my birthday on March 22, 2021 and the main course I had was delicious! Unfortunately, when I went back about a month later on April 21, it was no longer on the dinner menu.",
    "In Emily Midkiff's June 2014 article in a journal named for the one of Hreidmar's ",
    "Under DDC 633 on Bielefeld University Library's BASE, as of 2020",
    "In the 2018 VSCode blog post on replit.com, what was the command they clicked on in the last video to remove extra lines?",
    "The Metropolitan Museum of Art has a portrait in its collection with an accession number of 29.100.5. Of the consecrators and co-consecrators",
    "In Nature journal's Scientific Reports conference proceedings from 2012, in the article that did not mention plasmons or plasmonics, what nano-compound is studied?",
    'In the year 2022, and before December, what does "R" stand for in the three core policies of the type of content',
    "Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?",
]

# NOTE: this cell rebinds `eval_ds` and renames its columns, so running it a second
# time without re-running the dataset load will fail: the "Question" column read by
# the filter below has already been renamed to "question".
# A generator (rather than a list) lets any() stop at the first matching substring
# instead of testing every entry of `to_keep` for every row.
eval_ds = eval_ds.filter(lambda row: any(snippet in row["Question"] for snippet in to_keep))

# Rename the GAIA columns to `question`, `true_answer` and `task`.
eval_ds = eval_ds.rename_columns({"Question": "question", "Final answer": "true_answer", "Level": "task"})


In [3]:
import os

from dotenv import load_dotenv
from huggingface_hub import login

# Read variables from a .env file; override=True lets them replace values that are
# already present in the process environment.
load_dotenv(override=True)

# Log in to the Hugging Face Hub with the token found in HF_TOKEN (None if unset).
login(token=os.environ.get("HF_TOKEN"))

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


### Text browser

In [4]:
from scripts.run_agents import answer_questions
from scripts.text_inspector_tool import TextInspectorTool
from scripts.text_web_browser import (
    ArchiveSearchTool,
    FinderTool,
    FindNextTool,
    NavigationalSearchTool,
    PageDownTool,
    PageUpTool,
    SearchInformationTool,
    VisitTool,
)
from scripts.visual_qa import VisualQAGPT4Tool

from smolagents import CodeAgent, LiteLLMModel

Chat templates should be in a 'chat_template.jinja' file but found key='chat_template' in the processor's config. Make sure to move your template to its own file.


In [5]:
# One LiteLLM-backed model, used both by the agent and by the text inspector tool.
proprietary_model = LiteLLMModel("gpt-4o")

### BUILD AGENTS & TOOLS

# One instance of every text-browser tool, created in the order listed.
WEB_TOOLS = [
    tool_cls()
    for tool_cls in (
        SearchInformationTool,
        NavigationalSearchTool,
        VisitTool,
        PageUpTool,
        PageDownTool,
        FinderTool,
        FindNextTool,
        ArchiveSearchTool,
    )
]

# Code-writing agent that browses the web through the text tools above.
surfer_agent = CodeAgent(
    tools=WEB_TOOLS,
    model=proprietary_model,
    max_steps=20,
    verbosity_level=2,
)

# Run the agent over the selected questions under the run name "code_o1_27-01_text".
# Per the logged output, previously saved answers are loaded from
# output_browsers/<run name>.jsonl.
# NOTE(review): 40000 is passed positionally to TextInspectorTool; presumably a
# text-length limit -- confirm against scripts/text_inspector_tool.
results_text = answer_questions(
    eval_ds,
    surfer_agent,
    "code_o1_27-01_text",
    output_folder="output_browsers",
    visual_inspection_tool=VisualQAGPT4Tool(),
    text_inspector_tool=TextInspectorTool(proprietary_model, 40000),
)

* 'fields' has been removed


Loading answers from output_browsers/code_o1_27-01_text.jsonl...
Found 12 previous results!


100%|██████████| 12/12 [00:00<00:00, 4817.35it/s]


### Vision browser

In [6]:
from scripts.visual_qa import VisualQAGPT4Tool
from scripts.vlm_web_browser import helium_instructions, make_browser_agent

from smolagents import CodeAgent, LiteLLMModel

### BUILD AGENTS & TOOLS

# Same "gpt-4o" model configuration as the text-browser run.
proprietary_model = LiteLLMModel("gpt-4o")
vision_browser_agent = make_browser_agent(proprietary_model)

# Inspection tools handed to the runner, created in the same order as before.
vision_qa_tool = VisualQAGPT4Tool()
vision_text_inspector = TextInspectorTool(proprietary_model, 40000)

# Run the vision agent over the same questions under the run name
# "code_o1_27-01_vision"; `helium_instructions` is passed as the post-prompt.
results_vision = answer_questions(
    eval_ds,
    vision_browser_agent,
    "code_o1_27-01_vision",
    output_folder="output_browsers",
    visual_inspection_tool=vision_qa_tool,
    text_inspector_tool=vision_text_inspector,
    postprompt=helium_instructions,
)

Loading answers from output_browsers/code_o1_27-01_vision.jsonl...
Found 12 previous results!


100%|██████████| 12/12 [00:00<00:00, 4047.25it/s]


### Get results

In [None]:
import pandas as pd
from scripts.gaia_scorer import question_scorer


def _add_is_correct(run_results):
    """Return `run_results` as a DataFrame with an `is_correct` column.

    Each row's `prediction` is compared with its `true_answer` by
    `question_scorer`, and the scorer's result is stored in `is_correct`.
    """
    frame = pd.DataFrame(run_results)
    frame["is_correct"] = frame.apply(
        lambda row: question_scorer(row["prediction"], row["true_answer"]),
        axis=1,
    )
    return frame


# Score both runs with the same logic.
results_vision = _add_is_correct(results_vision)
results_text = _add_is_correct(results_text)

In [8]:
# Stack the two runs row-wise so both agents can be compared in one frame.
results = pd.concat([results_vision, results_text], axis=0)

# Mean of `is_correct` for each agent, i.e. the share of rows marked correct.
accuracy_by_agent = results.groupby("agent_name")["is_correct"].mean()
accuracy_by_agent

agent_name
code_o1_27-01_text      0.416667
code_o1_27-01_vision    0.333333
Name: is_correct, dtype: float64

In [9]:
# Rows of the vision run whose `is_correct` flag is set.
correct_vision_results = results_vision.loc[lambda frame: frame["is_correct"]]
correct_vision_results

Unnamed: 0,agent_name,question,augmented_question,prediction,intermediate_steps,parsing_error,iteration_limit_exceeded,agent_error,start_time,end_time,task,true_answer,is_correct
3,code_o1_27-01_vision,Which contributor to the version of OpenCV whe...,It is paramount that you complete this task an...,Li Peng,[SystemPromptStep(system_prompt='You are an ex...,False,False,,2025-01-27 23:00:46,2025-01-27 23:01:46,2,Li Peng,True
7,code_o1_27-01_vision,"In the 2018 VSCode blog post on replit.com, wh...",It is paramount that you complete this task an...,Format Document,[SystemPromptStep(system_prompt='You are an ex...,True,False,,2025-01-27 23:14:24,2025-01-27 23:17:17,2,Format Document,True
9,code_o1_27-01_vision,In Nature journal's Scientific Reports confere...,It is paramount that you complete this task an...,diamond,[SystemPromptStep(system_prompt='You are an ex...,False,False,,2025-01-27 23:17:58,2025-01-27 23:19:12,1,diamond,True
11,code_o1_27-01_vision,Who nominated the only Featured Article on Eng...,It is paramount that you complete this task an...,FunkMonk,[SystemPromptStep(system_prompt='You are an ex...,False,False,,2025-01-27 23:19:31,2025-01-27 23:19:55,1,FunkMonk,True


In [11]:
# Rows of the text run whose `is_correct` flag is not set.
false_text_results = results_text.loc[lambda frame: ~frame["is_correct"]]
false_text_results

Unnamed: 0,agent_name,question,augmented_question,prediction,intermediate_steps,parsing_error,iteration_limit_exceeded,agent_error,start_time,end_time,task,true_answer,is_correct
0,code_o1_27-01_text,What's the last line of the rhyme under the fl...,It is paramount that you complete this task an...,Caused its demise,[SystemPromptStep(system_prompt='You are an ex...,False,False,,2025-01-27 22:24:18,2025-01-27 22:24:47,2,So we had to let it die.,False
1,code_o1_27-01_text,Of the authors (First M. Last) that worked on ...,It is paramount that you complete this task an...,Anthropomorphic Vs Non-Anthropomorphic Softwar...,[SystemPromptStep(system_prompt='You are an ex...,False,False,,2025-01-27 22:43:03,2025-01-27 22:44:11,1,Mapping Human Oriented Information to Software...,False
2,code_o1_27-01_text,"In Series 9, Episode 11 of Doctor Who, the Doc...",It is paramount that you complete this task an...,[Teleport chamber room],[SystemPromptStep(system_prompt='You are an ex...,True,False,,2025-01-27 22:44:11,2025-01-27 22:44:58,1,THE CASTLE,False
3,code_o1_27-01_text,Which contributor to the version of OpenCV whe...,It is paramount that you complete this task an...,Peng Xiao,[SystemPromptStep(system_prompt='You are an ex...,True,False,,2025-01-27 22:44:58,2025-01-27 22:46:27,2,Li Peng,False
4,code_o1_27-01_text,The photograph in the Whitney Museum of Americ...,It is paramount that you complete this task an...,Russo-German Legion,[SystemPromptStep(system_prompt='You are an ex...,False,False,,2025-01-27 22:46:27,2025-01-27 22:46:47,2,Russian-German Legion,False
6,code_o1_27-01_text,Under DDC 633 on Bielefeld University Library'...,It is paramount that you complete this task an...,Ukraine,[SystemPromptStep(system_prompt='You are an ex...,True,False,,2025-01-27 22:48:03,2025-01-27 22:49:27,1,Guatemala,False
8,code_o1_27-01_text,The Metropolitan Museum of Art has a portrait ...,It is paramount that you complete this task an...,Silvio Savelli,[SystemPromptStep(system_prompt='You are an ex...,True,False,,2025-01-27 22:50:04,2025-01-27 22:50:39,2,Alfonso Visconti,False
