# Web browsing agents with LangChain
### Set up agent functions

In [1]:
# Install the third-party packages imported by the cells below.
# NOTE(review): versions are unpinned; the later cells rely on
# `langchain.chat_models`, `initialize_agent` and `arun` - consider pinning
# to releases known to provide them.
%pip install langchain
%pip install playwright
%pip install tarsier
# Separate step from the pip install: runs Playwright's own installer CLI.
!playwright install


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
from langchain.chat_models import ChatOpenAI
from langchain.agents import tool
from playwright.async_api import async_playwright
from tarsier import Tarsier, GoogleVisionOCRService
import os

# Setup Playwright
p = await async_playwright().__aenter__()
browser = await p.chromium.launch(headless=False)
page = await browser.new_page()

# Setup Creds
os.environ['OPENAI_API_KEY'] = "sk-..."
google_cloud_credentials = {...}

# Setup Tarsier
ocr_service = GoogleVisionOCRService(google_cloud_credentials)
tarsier = Tarsier(ocr_service)
tag_to_xpath = {}

# Define tools/actions
# Agent-facing wrapper around read_page_impl(), which the other tools also call.
# The docstring is left verbatim because @tool exposes it as the tool description.
@tool
async def read_page() -> str:
    """
    Use to read the current state of the page
    """
    current_state = await read_page_impl()
    return current_state

    
async def read_page_impl() -> str:
    """Capture the current page via Tarsier and refresh the shared id -> XPath map.

    Returns:
        The text representation of the page produced by ``tarsier.page_to_text``.
    """
    snapshot = await tarsier.page_to_text(page)
    rendered_text, fresh_mapping = snapshot

    # Mutate the module-level dict in place (rather than rebinding it) so the
    # tools that look ids up in ``tag_to_xpath`` see the new mapping.
    tag_to_xpath.clear()
    tag_to_xpath.update(fresh_mapping)

    return rendered_text

@tool
async def click(element_id: int) -> str:
    """
    Click on an element based on element_id and return the new page state
    """
    # The docstring above is kept verbatim: @tool exposes it to the model as
    # the tool description.
    x_path = tag_to_xpath.get(element_id)
    if x_path is None:
        # Unknown or stale id (e.g. the page has not been read yet). Hand the
        # problem back to the agent as an observation it can act on instead of
        # aborting the whole run with a KeyError.
        return (
            f"Error: no element with id {element_id} on the current page. "
            "Use read_page to get the current element ids."
        )
    print(x_path)
    element = page.locator(x_path)
    await element.scroll_into_view_if_needed()
    await page.wait_for_timeout(1000)  # pause after scrolling, before the click
    await element.click()
    await page.wait_for_timeout(2000)  # pause after the click, before re-reading the page
    return await read_page_impl()
    

@tool
async def type_text(element_id: int, text: str) -> str:
    """
    Input text into a textbox based on element_id and return the new page state
    """
    # The docstring above is kept verbatim: @tool exposes it to the model as
    # the tool description.
    x_path = tag_to_xpath.get(element_id)
    if x_path is None:
        # Unknown or stale id (e.g. the page has not been read yet). Hand the
        # problem back to the agent as an observation it can act on instead of
        # aborting the whole run with a KeyError.
        return (
            f"Error: no element with id {element_id} on the current page. "
            "Use read_page to get the current element ids."
        )
    print(x_path)
    # Sends the text to the located element one key press at a time.
    await page.locator(x_path).press_sequentially(text)
    return await read_page_impl()

@tool
async def press_key(key: str) -> str:
    """
    Press a key on the keyboard and return the new page state
    """
    # Docstring left untouched: @tool exposes it as the tool description.
    keyboard = page.keyboard
    await keyboard.press(key)

    settle_ms = 2000  # pause before re-reading the page
    await page.wait_for_timeout(settle_ms)

    updated_state = await read_page_impl()
    return updated_state

### GPT-4 + Tarsier

In [3]:
from langchain.prompts import ChatPromptTemplate
from langchain.agents import initialize_agent, AgentType
from langchain.chains import LLMChain
# Prompt text describing a Thought / Action / Action Input / Observation loop.
# Placeholders: {tools}, {tool_names}, {input}, {agent_scratchpad}.
# NOTE(review): the "These were previous tasks you completed:" section is
# followed by no content and no placeholder.
template = """
You are a web interaction agent. Use the read page tool to understand where you currently are. 
You will be passed in OCR text of a web page where element ids are to the left of elements. 

You have access to the following tools:
{tools}

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question

These were previous tasks you completed:

Begin!

Question: {input}
{agent_scratchpad}"""
# Wrap the raw string in a chat prompt template.
prompt = ChatPromptTemplate.from_template(template)

# temperature=0 is requested for the chat model that drives the agent.
llm = ChatOpenAI(model_name="gpt-4", temperature=0)
# NOTE(review): llm_chain (and the custom prompt it wraps) is not passed to
# initialize_agent below; it is kept in case other cells use it.
llm_chain = LLMChain(llm=llm, prompt=prompt)

# Setup chain
# press_key is registered together with the other tools: it was defined but
# left out of this list, so the agent could never press a key (e.g. Enter to
# submit a search).
tarsier_agent_chain = initialize_agent(
    [read_page, click, type_text, press_key],
    llm,
    agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
)

# Go to Google and run the agent

In [4]:
await page.goto("https://www.google.com/")
await tarsier_agent_chain.arun(
    """
    Read the page, search for OpenAI Dev day, go to the first video
    """)




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: The first step is to read the current state of the page to understand where we are and what elements are available. 

Action:
```
{
  "action": "read_page",
  "action_input": {}
}
```[0m
Observation: [36;1m[1;3m--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
0 ] About  [ 1 ] Store
    [																		    [ 2 ] Gmail			[ 5 ] Sign
																				   [ 3 ]
																					 Images
																						 [ 4 ]
												       GOOGLE
																				 Sign in to    Google
																				 Save your     passwords securely with
																						Account
																				       Google
																				 your
																	   EY
								     [ 6 ]
																					  signed out
																				    Stay		    Sign in
																	 

"We are now on the page of the first video titled 'OpenAI DevDay, Opening Keynote' on YouTube."