# Web browsing agents with LlamaIndex
### Setup agent functions

In [1]:
%pip install llama-index
%pip install playwright
%pip install tarsier
!playwright install

Collecting llama-index
  Using cached llama_index-0.8.67-py3-none-any.whl (859 kB)
Collecting aiostream<0.6.0,>=0.5.2 (from llama-index)
  Using cached aiostream-0.5.2-py3-none-any.whl (39 kB)
Collecting dataclasses-json<0.6.0,>=0.5.7 (from llama-index)
  Using cached dataclasses_json-0.5.14-py3-none-any.whl (26 kB)
Collecting deprecated>=1.2.9.3 (from llama-index)
  Using cached Deprecated-1.2.14-py2.py3-none-any.whl (9.6 kB)
Collecting fsspec>=2023.5.0 (from llama-index)
  Using cached fsspec-2023.10.0-py3-none-any.whl (166 kB)
Collecting nltk<4.0.0,>=3.8.1 (from llama-index)
  Using cached nltk-3.8.1-py3-none-any.whl (1.5 MB)
Collecting pandas (from llama-index)
  Downloading pandas-2.1.3-cp311-cp311-macosx_11_0_arm64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m71.3 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
Collecting tiktoken>=0.3.3 (from llama-index)
  Downloading tiktoken-0.5.1-cp311-cp311-macosx_11_0_arm64.whl (924 kB)

In [17]:
import os
from getpass import getpass

from llama_index.tools import FunctionTool
from playwright.async_api import async_playwright
from tarsier import Tarsier, GoogleVisionOCRService

# Setup Playwright
p = await async_playwright().__aenter__()
browser = await p.chromium.launch(headless=False)
page = await browser.new_page()

# Setup Creds
os.environ["OPENAI_API_KEY"] = "sk-..."
google_cloud_credentials = {...}

# Setup Tarsier
ocr_service = GoogleVisionOCRService(google_cloud_credentials)
tarsier = Tarsier(ocr_service)
tag_to_xpath = {}


# Define tools/actions
async def read_page() -> str:
    """
    Use to read the current state of the page
    """
    # NOTE: FunctionTool.from_defaults builds the tool description from the
    # docstring above, so its wording is part of what the LLM is shown.
    snapshot = await tarsier.page_to_text(page)
    rendered_text, fresh_tag_map = snapshot
    # Refresh the shared lookup in place (rather than rebinding it) so the ids
    # from this snapshot replace those of the previous one in the same dict
    # the other tools read.
    tag_to_xpath.clear()
    tag_to_xpath.update(fresh_tag_map)
    return rendered_text


# Expose `read_page` to the agent. The same coroutine function is supplied for
# both the `fn` and `async_fn` slots.
# NOTE(review): the agent is driven through `achat` below, so presumably only
# the async slot is exercised - confirm before calling this tool synchronously.
read_page_tool = FunctionTool.from_defaults(
    fn=read_page,
    async_fn=read_page,
)


async def click(element_id: int) -> str:
    """
    Click on an element based on element_id and return the new page state
    """
    # NOTE: FunctionTool.from_defaults builds the tool description from the
    # docstring above, so it is left unchanged.
    x_path = tag_to_xpath.get(element_id)
    if x_path is None:
        # The id is chosen by the LLM and may be invented or come from a stale
        # snapshot. Report that back as the tool result instead of raising
        # KeyError, so the agent can re-read the page and try again.
        return (
            f"Error: unknown element_id {element_id}. "
            "Use the read page tool to get the current element ids."
        )
    # Trace which element is being targeted.
    print(x_path)
    element = page.locator(x_path)
    await element.scroll_into_view_if_needed()
    # Short pause after scrolling, then a longer one after the click, before
    # the page is read again.
    await page.wait_for_timeout(1000)
    await element.click()
    await page.wait_for_timeout(2000)
    return await read_page()


# Expose `click` to the agent; the coroutine function fills both the `fn` and
# `async_fn` slots.
click_tool = FunctionTool.from_defaults(
    fn=click,
    async_fn=click,
)


async def type_text(element_id: int, text: str) -> str:
    """
    Input text into a textbox based on element_id and return the new page state
    """
    # NOTE: FunctionTool.from_defaults builds the tool description from the
    # docstring above, so it is left unchanged.
    x_path = tag_to_xpath.get(element_id)
    if x_path is None:
        # The id is chosen by the LLM and may be invented or come from a stale
        # snapshot. Report that back as the tool result instead of raising
        # KeyError, so the agent can re-read the page and try again.
        return (
            f"Error: unknown element_id {element_id}. "
            "Use the read page tool to get the current element ids."
        )
    # Trace which element is being targeted.
    print(x_path)
    # Sends the text to the element one key press at a time.
    await page.locator(x_path).press_sequentially(text)
    return await read_page()


# Expose `type_text` to the agent; the coroutine function fills both the `fn`
# and `async_fn` slots.
type_text_tool = FunctionTool.from_defaults(
    fn=type_text,
    async_fn=type_text,
)


async def press_key(key: str) -> str:
    """
    Press a key on the keyboard and return the new page state
    """
    # NOTE: FunctionTool.from_defaults builds the tool description from the
    # docstring above, so it is left unchanged.
    settle_ms = 2000  # pause between the key press and re-reading the page
    await page.keyboard.press(key)
    await page.wait_for_timeout(settle_ms)
    new_state = await read_page()
    return new_state


# Expose `press_key` to the agent; the coroutine function fills both the `fn`
# and `async_fn` slots.
press_key_tool = FunctionTool.from_defaults(
    fn=press_key,
    async_fn=press_key,
)

### GPT-4 + Tarsier

In [18]:
from llama_index.agent import OpenAIAgent
from llama_index.llms import OpenAI

# GPT-4 drives the agent. It receives the page as the text returned by the
# tools defined above, with element ids embedded in that text.
llm = OpenAI(model="gpt-4")
tarsier_agent = OpenAIAgent.from_tools(
    tools=[read_page_tool, click_tool, type_text_tool, press_key_tool],
    # The prompt tells the model how to interpret the tagged OCR text and to
    # read the page before acting.
    system_prompt="You are a web interaction agent. Start first by using the read page tool to understand where you currently are. You will be passed in OCR text of a web page where element ids are to the left of elements.",
    # verbose=True prints each tool call and its output as the agent runs.
    verbose=True,
    llm=llm,
)

In [19]:
# Try out the agent
# Load the Google home page first so the agent's initial `read_page` call has
# a page to describe.
await page.goto("https://www.google.com/")
# The awaited `achat` call is the last expression of the cell, so the returned
# response object is what the notebook displays below the tool-call trace.
await tarsier_agent.achat(
    """
    Read the page, search for OpenAI Dev day, go to the first video
    """
)

STARTING TURN 1
---------------

=== Calling Function ===
Calling function: read_page with args: {}
Got output: --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
0 ] About  [ 1 ] Store
    [																		    [ 2 ] Gmail			[ 5 ] Sign
																				   [ 3 ]
																					 Images
																						 [ 4 ]
												       GOOGLE
																				 Sign in to    Google
																				 Save your     passwords securely with
																						Account
																				       Google
																				 your
																	   EY
								     [ 6 ]
																					  signed out
																				    Stay		    Sign in
																	 [ 9 ] [ 10
							   Q   { 8 }									      ]
											      Search
										11 )  Google
									      (			  { 12 } I'm  Feeling Lucky
										      Learn how  Goog

AgentChatResponse(response='The first video titled "OpenAI DevDay, Opening Keynote" has been opened.', sources=[ToolOutput(content="--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n0 ] About  [ 1 ] Store\n    [\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t    [ 2 ] Gmail\t\t\t[ 5 ] Sign\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t   [ 3 ]\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t Images\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t [ 4 ]\n\t\t\t\t\t\t\t\t\t\t\t\t       GOOGLE\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t Sign in to    Google\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t Save your     passwords securely with\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tAccount\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t       Google\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t your\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t   EY\n\t\t\t\t\t\t\t\t     [ 6 ]\n\t\t\t\t\