In [None]:
%pip install -q helium pillow selenium langgraph langchain-core langchain-community langchain_ollama

In [None]:
import os, getpass

def _set_env(var: str):
    # Check if the variable is set in the OS environment
    env_value = os.environ.get(var)
    if not env_value:
        # If not set, prompt the user for input
        env_value = getpass.getpass(f"{var}: ")
    
    # Set the environment variable for the current process
    os.environ[var] = env_value

_set_env("LANGCHAIN_API_KEY")

import os
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "web browsing agent"

In [None]:
import warnings
warnings.filterwarnings("ignore")

# LangGraph

In [None]:
import logging
import helium

from IPython.display import Image, display

from typing import Annotated, Dict, List, Literal, Optional

from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import ElementNotInteractableException, TimeoutException

from langchain_core.tools import tool
from langgraph.prebuilt import InjectedState, ToolNode, tools_condition
from langgraph.graph import END, START, MessagesState, StateGraph
from langchain_fireworks import ChatFireworks

from langchain_core.prompts import ChatPromptTemplate

from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
from langgraph.checkpoint.memory import MemorySaver

from langchain.agents import AgentType, initialize_agent
from langgraph.prebuilt import create_react_agent
from langchain_ollama import ChatOllama

In [None]:
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

logging = logging.getLogger(__name__)

BROWSER_DRIVER = None

## Utils

In [None]:
def open_browser(
    headless: bool = False, 
    window_width: int = 980, 
    window_height: int = 1300
) -> Optional[webdriver.Chrome]:
    """
    Open a Chrome browser with specified options.
    
    Args:
        headless (bool): Run browser in headless mode.
        window_width (int): Browser window width.
        window_height (int): Browser window height.
    
    Returns:
        Optional[webdriver.Chrome]: Opened browser driver.
    """
    try:
        # Configure Chrome options
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument("--force-device-scale-factor=1")
        chrome_options.add_argument(f"--window-size={window_width},{window_height}")
        chrome_options.add_argument("--disable-pdf-viewer")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        
        if headless:
            chrome_options.add_argument("--headless")
        
        # Start Chrome with Helium
        driver = helium.start_chrome(headless=headless, options=chrome_options)
        
        logging.info("Browser opened successfully.")
        return driver
    
    except Exception as e:
        logging.error(f"Failed to open browser: {e}")
        return None

def is_browser_open(driver: Optional[webdriver.Chrome]) -> bool:
    """
    Check if the browser driver is open and functional
    
    Args:
        driver (WebDriver): Selenium WebDriver instance
    
    Returns:
        bool: True if browser is open, False otherwise
    """
    try:
        # Multiple checks to ensure browser is truly open
        return (
            driver is not None and 
            hasattr(driver, 'current_window_handle') and
            driver.current_window_handle is not None and
            driver.title is not None
        )
    except Exception:
        return False

def close_browser(driver: Optional[webdriver.Chrome]):
    """
    Close the current browser session.
    
    Args:
        driver (WebDriver): Selenium WebDriver instance to close
    """
    try:
        if is_browser_open(driver):
            driver.quit()
            logging.info("Browser closed successfully.")
    except Exception as e:
        logging.error(f"Error closing browser: {e}")


## State

In [None]:
class BrowserAgentState(MessagesState):
    current_url: Optional[str]
    error: Optional[str]
    next: Optional[Literal["continue", END]]

## Tools

In [None]:
@tool
def search_item_ctrl_f(text: str, nth_result: int = 1) -> str:
    """
    Searches for the specified text on the current webpage using a Ctrl+F-like functionality.
    """
    global BROWSER_DRIVER
    if not BROWSER_DRIVER:
        return "No browser driver available"
    
    try:
        elements = BROWSER_DRIVER.find_elements(By.XPATH, f"//*[contains(text(), '{text}')]")
        if nth_result > len(elements):
            raise Exception(f"Match nÂ°{nth_result} not found (only {len(elements)} matches found)")
        
        result = f"Found {len(elements)} matches for '{text}'."
        elem = elements[nth_result - 1]
        BROWSER_DRIVER.execute_script("arguments[0].scrollIntoView(true);", elem)
        result += f"Focused on element {nth_result} of {len(elements)}"
        return result
    
    except Exception as e:
        return f"Search error: {str(e)}"

@tool
def go_to(url: str) -> str:
    """
    Navigates the browser to the specified URL.
    """
    global BROWSER_DRIVER
    if not BROWSER_DRIVER:
        return "No browser driver available"
    
    try:
        BROWSER_DRIVER.get(url)
        return f"Navigated to {url}"
    except Exception as e:
        return f"Navigation error: {str(e)}"

@tool
def go_back() -> str:
    """
    Navigates the browser to the previous page in the session history.
    """
    global BROWSER_DRIVER
    if not BROWSER_DRIVER:
        return "No browser driver available"
    
    try:
        BROWSER_DRIVER.back()
        return "Navigated to previous page"
    except Exception as e:
        return f"Navigation error: {str(e)}"

@tool
def close_popups() -> str:
    """
    Closes any visible modal dialogs or pop-up windows on the current webpage.
    """
    global BROWSER_DRIVER
    if not BROWSER_DRIVER:
        return "No browser driver available"

    modal_selectors = [
        "button[class*='close']",
        "[class*='modal']",
        "[class*='modal'] button",
        "[class*='CloseButton']",
        "[aria-label*='close']",
        ".modal-close",
        ".close-modal",
        ".modal .close",
        ".modal-backdrop",
        ".modal-overlay",
        "[class*='overlay']",
    ]

    wait = WebDriverWait(BROWSER_DRIVER, timeout=0.5)

    for selector in modal_selectors:
        try:
            elements = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector)))

            for element in elements:
                if element.is_displayed():
                    try:
                        BROWSER_DRIVER.execute_script("arguments[0].click();", element)
                    except ElementNotInteractableException:
                        element.click()

        except TimeoutException:
            continue
        except Exception as e:
            print(f"Error handling selector {selector}: {str(e)}")
            continue
    return "Modals closed"

In [None]:
tools = [search_item_ctrl_f, go_to, go_back, close_popups]

# Prompt

In [None]:
prompt = """
You can use helium to access websites. Don't bother about the helium driver, it's already managed.
First you need to import everything from helium, then you can do other actions!
Code:
```py
from helium import *
go_to('github.com/trending')
```<end_code>

You can directly click clickable elements by inputting the text that appears on them.
Code:
```py
click("Top products")
```<end_code>

If it's a link:
Code:
```py
click(Link("Top products"))
```<end_code>

If you try to interact with an element and it's not found, you'll get a LookupError.
In general stop your action after each button click to see what happens on your screenshot.
Never try to login in a page.

To scroll up or down, use scroll_down or scroll_up with as an argument the number of pixels to scroll from.
Code:
```py
scroll_down(num_pixels=1200) # This will scroll one viewport down
```<end_code>

When you have pop-ups with a cross icon to close, don't try to click the close icon by finding its element or targeting an 'X' element (this most often fails).
Just use your built-in tool `close_popups` to close them:
Code:
```py
close_popups()
```<end_code>

You can use .exists() to check for the existence of an element. For example:
Code:
```py
if Text('Accept cookies?').exists():
    click('I accept')
```<end_code>

Proceed in several steps rather than trying to solve the task in one shot.
And at the end, only when you have your answer, return your final answer.
Code:
```py
final_answer("YOUR_ANSWER_HERE")
```<end_code>

If pages seem stuck on loading, you might have to wait, for instance `import time` and run `time.sleep(5.0)`. But don't overuse this!
To list elements on page, DO NOT try code-based element searches like 'contributors = find_all(S("ol > li"))': just look at the latest screenshot you have and read it visually, or use your tool search_item_ctrl_f.
Of course, you can act on buttons like a user would do when navigating.
After each code blob you write, you will be automatically provided with an updated screenshot of the browser and the current browser url.
But beware that the screenshot will only be taken at the end of the whole action, it won't see intermediate states.
Don't kill the browser.
"""

## Graph

In [None]:
model = ChatOllama(
    base_url="http://localhost:11434",
    model="llama3.2",
    temperature=0,
    streaming=True
).bind_tools(tools)

In [None]:
def agent_node(state: BrowserAgentState):
    global BROWSER_DRIVER
    
    if not BROWSER_DRIVER or not is_browser_open(BROWSER_DRIVER):
        try:
            BROWSER_DRIVER = open_browser(headless=False)
        except Exception as e:
            return BrowserAgentState(
                messages=state.messages + [AIMessge(content=f"Error opening browser: {str(e)}")],
                current_url=None,
                error=str(e),
                next=None
            )
            
    messages = [
        SystemMessage(content=prompt),
        *state["messages"]
    ]

    response = model.invoke(messages)

    return {
        **state,
        "current_url": BROWSER_DRIVER.current_url if BROWSER_DRIVER else None,
        "messages": state["messages"] + [response]
    }


In [None]:
def should_continue(state: BrowserAgentState) -> str:    
    messages = state['messages']
    last_message = messages[-1]
    
    return "continue" if last_message.tool_calls else END

In [None]:
builder = StateGraph(BrowserAgentState)
    
builder.add_node("agent", agent_node)
builder.add_node("tools", ToolNode(tools))

builder.set_entry_point("agent")

builder.add_conditional_edges(
    "agent",
    should_continue,
    {
        "continue": "tools",
        END: END
    }
)

builder.add_edge("tools", "agent")


graph = builder.compile()

In [None]:
from IPython.display import Image, display

try:
    display(Image(graph.get_graph().draw_mermaid_png()))
except Exception:
    pass

# Test graph

In [27]:
message = "please navigate to https://www.langchain.com/about and explain me their mission"

In [28]:
config = {"recursion_limit": 10}
messages = [HumanMessage(content=message)]
messages = graph.invoke({"messages": messages}, config)
for m in messages['messages']:
    m.pretty_print()

2025-01-25 22:00:15,584 - INFO - Browser opened successfully.
2025-01-25 22:00:25,505 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
2025-01-25 22:00:33,837 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"



please navigate to https://www.langchain.com/about and explain me their mission
Tool Calls:
  go_to (461a8eac-7ddb-4757-a7f7-7bf94cc3f068)
 Call ID: 461a8eac-7ddb-4757-a7f7-7bf94cc3f068
  Args:
    url: https://www.langchain.com/about
Name: go_to

Navigated to https://www.langchain.com/about

According to the website, LangChain's mission is to "empower developers and researchers to build scalable, reliable, and maintainable AI systems by providing a set of tools and frameworks that simplify the process of integrating multiple AI models and services into a single workflow."
