# Compare a text-based vs a vision-based browser

Warning: this notebook is experimental, it probably won't work out of the box!

In [23]:
!pip install "smolagents[litellm]" -q


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [1]:
import pandas as pd

eval_df = pd.read_json("GAIA_web.jsonl", lines=True)

In [4]:
import os

from dotenv import load_dotenv
from huggingface_hub import login
from scripts.reformulator import prepare_response


load_dotenv(override=True)

login(os.getenv("HF_TOKEN"))

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


### Text browser

In [None]:
from scripts.text_inspector_tool import TextInspectorTool
from scripts.text_web_browser import (
    ArchiveSearchTool,
    FinderTool,
    FindNextTool,
    PageDownTool,
    PageUpTool,
    SearchInformationTool,
    VisitTool,
    SimpleTextBrowser,
)
from scripts.visual_qa import visualizer

from smolagents import CodeAgent, LiteLLMModel, GoogleSearchTool


model = LiteLLMModel("gpt-4o")

user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
BROWSER_CONFIG = {
    "viewport_size": 1024 * 5,
    "downloads_folder": "downloads_folder",
    "request_kwargs": {
        "headers": {"User-Agent": user_agent},
        "timeout": 300,
    },
    "serpapi_key": os.getenv("SERPAPI_API_KEY"),
}

os.makedirs(f"./{BROWSER_CONFIG['downloads_folder']}", exist_ok=True)

browser = SimpleTextBrowser(**BROWSER_CONFIG)

WEB_TOOLS = [
    GoogleSearchTool(provider="serper"),
    VisitTool(browser),
    PageUpTool(browser),
    PageDownTool(browser),
    FinderTool(browser),
    FindNextTool(browser),
    ArchiveSearchTool(browser),
]


surfer_agent = CodeAgent(
    model=model,
    tools=WEB_TOOLS,
    max_steps=20,
    verbosity_level=1,
)

answers_text = []

for example in eval_df.to_dict(orient="index").values():
    try:
        answer = surfer_agent.run(example["ques"])
        final_answer = prepare_response(
            example["ques"], f"Here is the final report from the agent: {answer}", reformulation_model=model
        )

        example["prediction"] = final_answer
        answers_text.append(example)
    except Exception as e:
        print(e)

import json

with open("gaia_text.jsonl", "w") as f:
    for item in answers_text:
        json_line = json.dumps(item)
        f.write(json_line + "\n")

> Reformulated answer:  17000


> Reformulated answer:  3


> Reformulated answer:  0.1777


> Reformulated answer:  3


> Reformulated answer:  Anthropomorphic Vs Non-Anthropomorphic Software Interface Feedback for Online Factual Delivery


### Vision browser

In [None]:
!pip install helium -q

In [None]:
from scripts.visual_qa import VisualQAGPT4Tool

from smolagents import CodeAgent, DuckDuckGoSearchTool, LiteLLMModel
from smolagents.vision_web_browser import (
    close_popups,
    go_back,
    helium_instructions,
    initialize_agent,
    save_screenshot,
    search_item_ctrl_f,
)


proprietary_model = LiteLLMModel("gpt-4o")
vision_browser_agent = initialize_agent(proprietary_model)
### BUILD AGENTS & TOOLS

CodeAgent(
    tools=[DuckDuckGoSearchTool(), go_back, close_popups, search_item_ctrl_f],
    model=proprietary_model,
    additional_authorized_imports=["helium"],
    step_callbacks=[save_screenshot],
    max_steps=20,
    verbosity_level=2,
)

results_vision = answer_questions(
    eval_ds,
    vision_browser_agent,
    "code_gpt4o_27-01_vision",
    reformulation_model=proprietary_model,
    output_folder="output_browsers",
    visual_inspection_tool=VisualQAGPT4Tool(),
    text_inspector_tool=TextInspectorTool(proprietary_model, 40000),
    postprompt=helium_instructions
    + "Any web browser controls won't work on .pdf urls, rather use the tool 'inspect_file_as_text' to read them",
)

### Browser-use browser

In [None]:
!pip install browser-use lxml_html_clean -q
!playwright install -q

In [None]:
import asyncio

import nest_asyncio


nest_asyncio.apply()

from browser_use import ActionResult, Agent, Browser, Controller
from browser_use.browser.context import BrowserContext, BrowserContextConfig
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from smolagents import GoogleSearchTool, LiteLLMModel


load_dotenv()


config = BrowserContextConfig(
    wait_for_network_idle_page_load_time=3.0,
    browser_window_size={"width": 1280, "height": 1000},
    locale="en-US",
    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36",
    highlight_elements=True,
    viewport_expansion=500,
)

browser = Browser()
context = BrowserContext(browser=browser, config=config)

model = LiteLLMModel("gpt-4o")

# Initialize the controller
controller = Controller()


@controller.action("Use this to run google searches")
def search_google(question: str) -> str:
    answer = GoogleSearchTool(provider="serper")(question)
    return ActionResult(extracted_content=answer)


class BrowserUseAgent:
    logs = []

    def write_inner_memory_from_logs(self, summary_mode):
        return self.results

    def run(self, task, **kwargs):
        agent = Agent(
            browser_context=context,
            task=task,
            llm=ChatOpenAI(model="gpt-4o"),
            controller=controller,
        )
        self.results = asyncio.get_event_loop().run_until_complete(agent.run())
        return self.results.history[-1].result[0].extracted_content


browser_use_agent = BrowserUseAgent()

answers_browser_use = []

for example in eval_df.to_dict(orient="index").values():
    try:
        answer = browser_use_agent.run(
            example["ques"] + "Use your search_google action rather than browsing google", max_steps=30
        )
        final_answer = prepare_response(
            example["ques"], f"Here is the final report from the agent: {answer}", reformulation_model=model
        )

        example["prediction"] = final_answer
        answers_browser_use.append(example)
    except Exception as e:
        print(e)

import json

with open("gaia_browser-use.jsonl", "w") as f:
    for item in answers_browser_use:
        json_line = json.dumps(item)
        f.write(json_line + "\n")

INFO     [agent] 🚀 Starting task: In terms of geographical distance between capital cities, which 2 countries are the furthest from each other within the ASEAN bloc according to wikipedia? Answer using a comma separated list, ordering the countries by alphabetical order.Use your search_google action rather than browsing google
INFO     [agent] 
📍 Step 1
INFO     [agent] 🤷 Eval: Unknown - Starting with a blank page, need to initiate a search.
INFO     [agent] 🧠 Memory: 
INFO     [agent] 🎯 Next goal: Search for information on the geographical distance between ASEAN capitals.
INFO     [agent] 🛠️  Action 1/1: {"search_google":{"question":"which 2 countries are the furthest from each other within the ASEAN bloc according to wikipedia"}}
INFO     [agent] 
📍 Step 2
INFO     [agent] 👍 Eval: Success - Retrieved search results with relevant links to ASEAN and Wikipedia.
INFO     [agent] 🧠 Memory: Need to find the information on geographical distance between ASEAN capitals.
INFO     [agent] 🎯 Nex

[92m14:51:02 - LiteLLM:INFO[0m: utils.py:2944 - 
LiteLLM completion() model= gpt-4o; provider = openai


INFO     [LiteLLM] 
LiteLLM completion() model= gpt-4o; provider = openai


[92m14:51:03 - LiteLLM:INFO[0m: utils.py:1120 - Wrapper: Completed Call, calling success_handler


INFO     [LiteLLM] Wrapper: Completed Call, calling success_handler
> Reformulated answer:  Myanmar, Philippines
INFO     [agent] 🚀 Starting task: I need to fact-check a citation. This is the citation from the bibliography:

Greetham, David. "Uncoupled: OR, How I Lost My Author(s)." Textual Cultures: Texts, Contexts, Interpretation, vol. 3 no. 1, 2008, p. 45-46. Project MUSE, doi:10.2979/tex.2008.3.1.44.

And this is the in-line citation:

Our relationship with the authors of the works we read can often be “obscured not by a "cloak of print" but by the veil of scribal confusion and mis-transmission” (Greetham 45-46).

Does the quoted text match what is actually in the article? If Yes, answer Yes, otherwise, give me the word in my citation that does not match with the correct one (without any article).Use your search_google action rather than browsing google
INFO     [agent] 
📍 Step 1
INFO     [agent] 👍 Eval: Success - Browser started successfully.
INFO     [agent] 🧠 Memory: Started bro

[92m14:53:18 - LiteLLM:INFO[0m: utils.py:2944 - 
LiteLLM completion() model= gpt-4o; provider = openai


INFO     [LiteLLM] 
LiteLLM completion() model= gpt-4o; provider = openai


[92m14:53:19 - LiteLLM:INFO[0m: utils.py:1120 - Wrapper: Completed Call, calling success_handler


INFO     [LiteLLM] Wrapper: Completed Call, calling success_handler
> Reformulated answer:  Unable to determine
INFO     [agent] 🚀 Starting task: Which contributor to the version of OpenCV where support was added for the Mask-RCNN model has the same name as a former Chinese head of government when the names are transliterated to the Latin alphabet?Use your search_google action rather than browsing google
INFO     [agent] 
📍 Step 1
INFO     [agent] 🤷 Eval: Unknown - Unable to evaluate goal since this is a CAPTCHA verification page.
INFO     [agent] 🧠 Memory: Need to bypass captcha to access page content.
INFO     [agent] 🎯 Next goal: Attempt to solve the CAPTCHA by selecting all images with motorcycles and verify access.
INFO     [agent] 🛠️  Action 1/10: {"click_element":{"index":12}}
INFO     [agent] 🛠️  Action 2/10: {"click_element":{"index":14}}
INFO     [agent] 🛠️  Action 3/10: {"click_element":{"index":16}}
INFO     [agent] 🛠️  Action 4/10: {"click_element":{"index":20}}
INFO     [

[92m14:55:54 - LiteLLM:INFO[0m: utils.py:2944 - 
LiteLLM completion() model= gpt-4o; provider = openai


INFO     [LiteLLM] 
LiteLLM completion() model= gpt-4o; provider = openai


[92m14:55:55 - LiteLLM:INFO[0m: utils.py:1120 - Wrapper: Completed Call, calling success_handler


INFO     [LiteLLM] Wrapper: Completed Call, calling success_handler
> Reformulated answer:  Unable to determine
INFO     [agent] 🚀 Starting task: What is the maximum length in meters of #9 in the first National Geographic short on YouTube that was ever released according to the Monterey Bay Aquarium website? Just give the number.Use your search_google action rather than browsing google
INFO     [agent] 
📍 Step 1
ERROR    [agent] ❌ Result failed 1/3 times:
 Browser closed: no valid pages available
INFO     [agent] 
📍 Step 1
ERROR    [agent] ❌ Result failed 2/3 times:
 Browser closed: no valid pages available
INFO     [agent] 
📍 Step 1
ERROR    [agent] ❌ Result failed 3/3 times:
 Browser closed: no valid pages available
ERROR    [agent] ❌ Stopping due to 3 consecutive failures


IndexError: list index out of range

In [None]:
import json

with open("gaia_browser-use2.jsonl", "w") as f:
    for item in list(previous_answers.to_dict(orient="index").values()) + answers_browser_use:
        json_line = json.dumps(item)
        f.write(json_line + "\n")

### Omniparser vision agent

In [2]:
!pip install ultralytics easyocr paddleocr paddlepaddle numpy==1.26.4 supervision==0.18.0 -q


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
from scripts.vision_web_browser import make_vision_agent
from scripts.visual_qa import visualizer
import pandas as pd
from smolagents import LiteLLMModel

instructions = """
Use your web_search tool when you want to get Google search results.
Then you can use helium to access websites. Don't use helium for Google search, only for navigating websites!
Don't bother about the helium driver, it's already managed.
We've already ran "from helium import *"

You can navigate to pages.
Code:
```py
go_to('github.com/trending')
```<end_code>

You can directly click a box by giving its number
Code:
```py
click_box(18)
```<end_code>

In general stop your action after each button click to see what happens on your screenshot.
Never try to login in a page.

To scroll up or down, use scroll_down or scroll_up with as an argument the number of pixels to scroll from.
Code:
```py
scroll_down(num_pixels=1200) # This will scroll one viewport down
```<end_code>

When you have pop-ups with a cross icon to close, don't try to click the close icon by finding its element or targeting an 'X' element (this most often fails).
Just use your built-in tool `close_popups` to close them:
Code:
```py
close_popups()
```<end_code>

You can use .exists() to check for the existence of an element. For example:
Code:
```py
if Text('Accept cookies?').exists():
    click('I accept')
```<end_code>

Proceed in several steps rather than trying to solve the task in one shot.
And at the end, only when you have your answer, return your final answer.
Code:
```py
final_answer("YOUR_ANSWER_HERE")
```<end_code>

If pages seem stuck on loading, you might have to wait, for instance `import time` and run `time.sleep(5.0)`. But don't overuse this!
To list elements on page, DO NOT try code-based element searches like 'contributors = find_all(S("ol > li"))': just look at the latest screenshot you have and read it visually, or use your tool search_item_ctrl_f.
Of course, you can act on buttons like a user would do when navigating.
After each code blob you write, you will be automatically provided with an updated screenshot of the browser and the current browser url.
But beware that the screenshot will only be taken at the end of the whole action, it won't see intermediate states.
Don't kill the browser.
When you have modals or cookie banners on screen, you should get rid of them before you can click anything else.
Any web browser controls won't work on .pdf urls, rather use the tool 'inspect_file_as_text' to read them
"""

vision_browser_agent = make_vision_agent()

model = LiteLLMModel("gpt-4o")

results = []
for example in eval_ds:
    example["question"] += instructions
    answer = answer_single_question(example, model, "out.jsonl", visualizer)
    results += answer

results_omniparser = pd.DataFrame(results)

### Get results

In [3]:
results_browseruse

Unnamed: 0,task_id,Level,true_answer,id,web,ques,prediction,agent_name
0,e1fc63a2-da7a-432f-be78-7c4a95598703,1,17,level1-0,https://www.google.com/,If Eliud Kipchoge could maintain his record-ma...,,browser use
1,8e867cd7-cff9-4e6c-867a-ff5ddc2550be,1,3,level1-1,https://www.google.com/,How many studio albums were published by Merce...,,browser use
2,5d0080cb-90d7-4712-bc33-848150e917d3,1,0.1777,level1-2,https://www.google.com/,What was the volume in m^3 of the fish bag tha...,,browser use
3,a1e91b78-d3d8-4675-bb8d-62741b4b68a6,1,3,level1-3,https://www.google.com/,In the video https://www.youtube.com/watch?v=L...,,browser use
4,46719c30-f4c3-4cad-be07-d5cb21eee6bb,1,Mapping Human Oriented Information to Software...,level1-4,https://www.google.com/,Of the authors (First M. Last) that worked on ...,,browser use
...,...,...,...,...,...,...,...,...
88,853c8244-429e-46ca-89f2-addf40dfb2bd,2,11,level2-62,https://www.google.com/,In the 2015 Metropolitan Museum of Art exhibit...,,browser use
89,7a4a336d-dcfa-45a0-b014-824c7619e8de,2,1:41.614,level2-63,https://www.google.com/,At the two-minute mark in the YouTube video up...,,browser use
90,f0f46385-fc03-4599-b5d3-f56496c3e69f,2,"Indonesia, Myanmar",level2-10,https://www.google.com/,In terms of geographical distance between capi...,"Myanmar, Philippines",browser use
91,e4e91f1c-1dcd-439e-9fdd-cb976f5293fd,2,cloak,level2-11,https://www.google.com/,I need to fact-check a citation. This is the c...,Unable to determine,browser use


In [5]:
import pandas as pd
from scripts.gaia_scorer import question_scorer

results_browseruse = pd.read_json("gaia_browser-use.jsonl", lines=True)
results_browseruse["agent_name"] = "browser use"
results_browseruse["is_correct"] = results_browseruse.apply(
    lambda x: question_scorer(x["prediction"], x["true_answer"]), axis=1
)

String Unable to determine cannot be normalized to number str.
String Unable to determine cannot be normalized to number str.




In [9]:
results = pd.concat([results_browseruse])

display(results.groupby("agent_name")["is_correct"].count())
results.groupby("agent_name")["is_correct"].mean()

agent_name
browser use    36
Name: is_correct, dtype: int64

agent_name
browser use    0.333333
Name: is_correct, dtype: float64

In [None]:
correct_vision_results = results_vision.loc[results_vision["is_correct"]]
correct_vision_results

In [None]:
false_text_results = results_text.loc[~results_text["is_correct"]]
false_text_results