In [1]:
from src.agents.browser_agent import BrowserAgent
from src.models.models import ModelConfig
import os
import dotenv

# Load environment variables via python-dotenv before the API keys are read
# with os.getenv in the lines that follow.
dotenv.load_dotenv()

# Provider credentials read from the process environment.
# Each name is None when the corresponding variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")

In [2]:
# Model settings for the main browser agent (navigation and page interaction).
browser_model_config = ModelConfig(
    # Which model to call and how to authenticate
    provider="openrouter",
    name="google/gemini-2.5-flash",
    api_key=OPENROUTER_API_KEY,
    type="api",
    # Generation settings
    temperature=0.1,  # low temperature, favouring precise actions
    max_tokens=5000,
    thinking_budget=None,
)

# Model settings for the vision agent (interactive element detection).
# Commented-out alternative kept for reference:
#   provider="openai", name="gpt-4.1", api_key=OPENAI_API_KEY
vision_model_config = ModelConfig(
    # Which model to call and how to authenticate
    provider="openrouter",
    name="google/gemini-2.5-flash",  # vision-capable model
    api_key=OPENROUTER_API_KEY,
    type="api",
    # Generation settings
    temperature=0.1,
    max_tokens=5000,
    thinking_budget=None,
)

# JSON-schema-style description of the agent's input:
# a mandatory search query and an optional result count.
web_search_input_schema = dict(
    type="object",
    properties=dict(
        search_query=dict(
            type="string",
            description="The search query to execute on Google",
        ),
        num_results=dict(
            type="integer",
            description="Number of results to return",
        ),
    ),
    required=["search_query"],
)

# JSON-schema-style description of the agent's output:
# an object holding one required array of URL strings.
web_search_output_schema = dict(
    type="object",
    properties=dict(
        search_results_urls=dict(
            type="array",
            items=dict(type="string"),
            description="List of clean URLs from search results",
        ),
    ),
    required=["search_results_urls"],
)

In [3]:
# Agent description text: a role statement, a blank line, then the numbered
# workflow. Each list entry becomes one line of the final string.
WEB_SEARCH_DESCRIPTION = "\n".join(
    [
        "You are a specialized Web Search Agent using a browser. "
        "Your single responsibility is to take a search query, perform a Google search, "
        "and return a list of the top organic search result URLs.",
        "",
        "Your workflow:",
        "1. Navigate to google.com.",
        "2. Handle any popups.",
        "3. Enter the search query and execute the search.",
        "4. Scrape the search result links from the page using the `get_attribute_all` tool "
        "on relevant selectors (e.g., 'h3 a', 'a[data-ved]').",
        "5. Clean the URLs to remove any Google tracking or redirect parameters.",
        "6. Return a JSON object containing a list of the clean URLs in the "
        "`search_results_urls` field.",
    ]
)

In [4]:
# Create Web Search Agent with vision capabilities
web_search_agent = await BrowserAgent.create_safe(
    model_config=browser_model_config,
    vision_model_config=vision_model_config,  # Enable vision analysis
    description=WEB_SEARCH_DESCRIPTION,
    agent_name="WebSearchAgent",
    headless=False,  # Set to False for debugging
    browser_channel="chrome",  # Use system Chrome if available
    input_schema=web_search_input_schema,
    output_schema=web_search_output_schema,
    tmp_dir="./tmp",
    timeout=5000,
    viewport_width=1440,
    viewport_height=960,
    auto_screenshot=True,
)


In [5]:
# Open Google Scholar with the agent's browser tool.
# NOTE(review): the workflow in WEB_SEARCH_DESCRIPTION names google.com, while
# this cell opens scholar.google.com - confirm which site is intended.
await web_search_agent.browser_tool.goto("https://scholar.google.com")

In [6]:
# Click at a hard-coded position (x=800, y=810).
# NOTE(review): presumably viewport pixels inside the 1440x960 viewport set at
# agent creation; the target element is not identifiable from here - confirm.
await web_search_agent.browser_tool.mouse_click(x=800, y=810)

In [6]:
# Type the example query text.
# Presumably it is sent to whichever element currently has focus - TODO confirm.
await web_search_agent.browser_tool.type_text(text="Elon Musk")

In [7]:
# Press Enter, presumably to submit the typed query.
# NOTE(review): a later cell calls `browser_tool.press_key(...)` instead of
# `keyboard_press(...)` - confirm that both method names exist.
await web_search_agent.browser_tool.keyboard_press(key="Enter")

In [None]:
# Fetch the accessibility tree from the browser tool and show it as the
# cell's output (bare name on the last line).
content = await web_search_agent.browser_tool.get_accessibility_tree()
content

In [None]:
# Ask the browser tool to highlight a pixel grid with a spacing of 100.
# Presumably a visual aid for reading coordinates off the page - TODO confirm.
# `reasoning` is passed as a free-text string.
await web_search_agent.browser_tool.highlight_pixel_grid(pixel_spacing=100, reasoning="Highlight pixel grid on page")

In [None]:
# Run the agent's interactive-element prediction and display the result.
# Presumably this uses the vision model config - TODO confirm.
# `elements` is reused by the bbox-highlighting cell that follows.
elements = await web_search_agent.predict_interactive_elements()
elements

In [None]:
# Hand the predicted `elements` to the browser tool's bounding-box highlighter.
# Depends on `elements` from the prediction cell, so that cell must run first.
await web_search_agent.browser_tool.highlight_bbox(elements=elements)

In [6]:
await web_search_agent.highlight_interactive_elements(
    visible_only=True,
    use_prediction=True,
    use_rule_based=True,
    intersection_threshold=0.2,
)



{'screenshot_path': 'tmp/screenshots/highlighted_elements_1751398960975.png',
 'elements': [{'label': '', 'number': 1, 'center': [27, 31]},
  {'label': ': Mein Profil', 'number': 2, 'center': [115, 31]},
  {'label': ': Meine Bibliothek', 'number': 3, 'center': [254, 31]},
  {'label': ': Anmelden', 'number': 4, 'center': [1383, 31]},
  {'label': 'text', 'number': 5, 'center': [700, 253]},
  {'label': 'submit', 'number': 6, 'center': [989, 253]},
  {'label': ': Beliebige Sprache', 'number': 7, 'center': [646, 296]},
  {'label': ': Seiten auf Deutsch', 'number': 8, 'center': [791, 296]},
  {'label': 'Link - Auf den Schultern von Riesen',
   'number': 9,
   'center': [718.5, 373.5]},
  {'label': ': DE', 'number': 10, 'center': [39, 940]},
  {'label': ': Datenschutzerklärung', 'number': 11, 'center': [1148, 940]},
  {'label': ': Nutzungsbedingungen', 'number': 12, 'center': [1300, 940]},
  {'label': ': Hilfe', 'number': 13, 'center': [1402, 940]},
  {'label': 'radio', 'number': 14, 'center

In [10]:
# Send the key "l" through the browser tool.
# NOTE(review): an earlier cell uses `browser_tool.keyboard_press(...)` for a
# key press - confirm that `press_key` exists and which name is intended.
await web_search_agent.browser_tool.press_key(key="l")

In [None]:
# Take a screenshot named "latest_screenshot" with full_page=True.
# NOTE(review): called on the agent itself, whereas most cells go through
# `web_search_agent.browser_tool` - confirm the agent exposes this method.
await web_search_agent.screenshot("latest_screenshot", full_page=True)

In [None]:
# Click the element matching the CSS id selector "#L2AGLb".
# Presumably the accept button of Google's consent dialog - verify, since the
# id is opaque and may change.
await web_search_agent.click(selector="#L2AGLb")

In [None]:
# Take another screenshot under the same name, this time without full_page.
# NOTE(review): the name "latest_screenshot" is reused from the earlier
# screenshot cell; presumably the previous file is overwritten - confirm.
await web_search_agent.screenshot("latest_screenshot")

In [8]:
from bs4 import BeautifulSoup

In [9]:
# html_content = await web_search_agent.page.content()

In [None]:
html_content = await web_search_agent.get_clean_html()
soup = BeautifulSoup(html_content, 'html.parser')

# Print nicely formatted indented html tags
print(soup.prettify())

In [None]:
## Define the search request
# initial_request = {
#     "search_query": "Latest papers for reasoning in large thinking models LLMs",
#     "num_results": 10,
# }

# output = await web_search_agent.auto_run(initial_request=initial_request)