In [10]:
# Setup
from langchain.chat_models import ChatOpenAI
from langchain.agents import Tool, initialize_agent, AgentType
from PIL import Image
import pytesseract
import os
from dotenv import load_dotenv
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time

# Load env variables
load_dotenv()

# Set tesseract path
pytesseract.pytesseract.tesseract_cmd = r"C:\Users\FahRe\AppData\Local\Programs\Tesseract-OCR\tesseract.exe"

# LangSmith setup (optional)
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "ReAct-Demo"

# Define a constant for the screenshot path
SCREENSHOT_PATH = "compendium_screenshot.png"

# Function to open compendium.ch and take screenshot
def take_screenshot_of_compendium(output_path=SCREENSHOT_PATH):
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # comment this out if you want to see the browser
    driver = webdriver.Chrome(options=chrome_options)
    driver.get("https://compendium.ch")
    time.sleep(3)  # wait for page to load
    driver.save_screenshot(output_path)
    driver.quit()
    return output_path

# OCR function
def extract_text_from_image(image_path):
    image = Image.open(image_path)
    text = pytesseract.image_to_string(image)
    return text

# --- New function: type into search field ---
def type_into_search_field(search_term="Dafalgan EXTRA"):
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Optional: remove if you want to see it
    driver = webdriver.Chrome(options=chrome_options)
    driver.get("https://compendium.ch")
    time.sleep(3)

    search_input = driver.find_element("css selector", "input[placeholder*='Suche']")
    search_input.send_keys(search_term)
    search_input.submit()

    time.sleep(3)
    screenshot_path = "search_result.png"
    driver.save_screenshot(screenshot_path)
    driver.quit()
    return f"Typed '{search_term}' into search. Screenshot saved as {screenshot_path}"


# LLM setup
llm = ChatOpenAI(temperature=0, model="gpt-4", openai_api_key=os.getenv("OPENAI_KEY"))

# Tools for the agent
tools = [
    Tool(
        name="TakeScreenshotOfCompendium",
        func=take_screenshot_of_compendium,
        description="Use this to take a screenshot of compendium.ch"
    ),
    Tool(
        name="ExtractTextFromImage",
        func=extract_text_from_image,
        description="Use this to read text from a screenshot"
    ),
    Tool(
            name="TypeIntoSearch",
            func=type_into_search_field,
            description="Use this to type 'Dafalgan EXTRA' into the website's search bar"
        )
]

# Initialize the agent
agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True
)

# Run the workflow
result = agent.run("Take a screenshot of compendium.ch, extract text from it, and tell me what action to take.")
print(result)




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mFirst, I need to take a screenshot of the website.
Action: TakeScreenshotOfCompendium
Action Input: output_path='compendium_screenshot.png'[0m

  return self.get_screenshot_as_file(filename)



Observation: [36;1m[1;3moutput_path='compendium_screenshot.png'[0m
Thought:[32;1m[1;3mNow that I have the screenshot, I need to extract the text from it.
Action: ExtractTextFromImage
Action Input: image_path='compendium_screenshot.png'[0m

FileNotFoundError: [Errno 2] No such file or directory: "image_path='compendium_screenshot.png'"

In [11]:
# === 🧱 SETUP ===
from langchain.chat_models import ChatOpenAI
from langchain.agents import Tool, initialize_agent, AgentType
from PIL import Image
import pytesseract
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Setup LangSmith (optional)
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "ReAct-HCI-Agent"

# Point Tesseract to the installed path
pytesseract.pytesseract.tesseract_cmd = r"C:\Users\FahRe\AppData\Local\Programs\Tesseract-OCR\tesseract.exe"

# Screenshot file path constant
SCREENSHOT_PATH = "compendium_screenshot.png"


In [13]:
# Take screenshot of compendium.ch
def take_screenshot_of_compendium(output_path=SCREENSHOT_PATH):
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(options=chrome_options)
    driver.get("https://compendium.ch")
    time.sleep(3)
    driver.save_screenshot(output_path)
    driver.quit()
    return f"Screenshot saved to {output_path}"

# OCR from screenshot
def extract_text_from_image(image_path=SCREENSHOT_PATH):
    image = Image.open(image_path)
    text = pytesseract.image_to_string(image)
    return text

# Type into the search bar
def type_into_search_field(search_term="Dafalgan EXTRA"):
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(options=chrome_options)
    driver.get("https://compendium.ch")
    time.sleep(3)

    # Locate search input and enter term
    search_input = driver.find_element("css selector", "input[placeholder*='Suche']")
    search_input.send_keys(search_term)
    search_input.submit()

    time.sleep(3)
    screenshot_path = "search_result.png"
    driver.save_screenshot(screenshot_path)
    driver.quit()
    return f"Searched for '{search_term}'. Screenshot saved as {screenshot_path}"


In [14]:
# Load your LLM
llm = ChatOpenAI(temperature=0, model="gpt-4", openai_api_key=os.getenv("OPENAI_KEY"))

# Define tools
tools = [
    Tool(
        name="TakeScreenshotOfCompendium",
        func=take_screenshot_of_compendium,
        description="Takes a screenshot of the compendium.ch homepage."
    ),
    Tool(
        name="ExtractTextFromImage",
        func=extract_text_from_image,
        description="Extracts visible text from a screenshot using OCR."
    ),
    Tool(
        name="TypeIntoSearchField",
        func=type_into_search_field,
        description="Types 'Dafalgan EXTRA' into the search bar and returns new screenshot."
    )
]

# Initialize ReAct agent
agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True
)


In [15]:
# Set a goal and run the agent multiple times
goal = """
Visit https://compendium.ch, extract what is visible, and search for 'Dafalgan EXTRA'.
Repeat until you have found the search results.
"""

# Loop over steps, simulating ReAct-style behavior
for step in range(3):
    print(f"\n🔁 Step {step + 1} ----------------------------")
    result = agent.run(goal)
    print("📌 Agent output:", result)

    # Optional: Break early if successful
    if "500/65 mg" in result or "Filmtabl" in result:
        print("\n✅ Goal achieved: Found product info.")
        break



🔁 Step 1 ----------------------------


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mFirst, I need to take a screenshot of the compendium.ch homepage.

Action: 
TakeScreenshotOfCompendium

Action Input: 
output_path='compendium_screenshot.png'
[0m

  return self.get_screenshot_as_file(filename)



Observation: [36;1m[1;3mScreenshot saved to output_path='compendium_screenshot.png'
[0m
Thought:[32;1m[1;3mNow I need to extract the visible text from the screenshot.

Action: 
ExtractTextFromImage

Action Input: 
image_path='compendium_screenshot.png'
[0m

OSError: [Errno 22] Invalid argument: "image_path='compendium_screenshot.png'\n"

In [45]:
# === SETUP ===
from langchain.chat_models import ChatOpenAI
from langchain.agents import Tool, initialize_agent, AgentType
from selenium.webdriver import Edge, EdgeOptions
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from PIL import Image
import pytesseract
import os
import time
from dotenv import load_dotenv

# Load environment
load_dotenv()
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "HCI-Live-Agent"
pytesseract.pytesseract.tesseract_cmd = r"C:\Users\FahRe\AppData\Local\Programs\Tesseract-OCR\tesseract.exe"

SCREENSHOT_PATH = "edge_screenshot.png"
driver_path = r"C:\Users\FahRe\Downloads\edgedriver_win64\msedgedriver.exe"

def slow_type(element, text, delay=0.12):
    for char in text:
        element.send_keys(char)
        time.sleep(delay)

def open_edge_and_search(search_term="Dafalgan EXTRA"):
    from selenium.common.exceptions import SessionNotCreatedException, WebDriverException

    try:
        edge_options = EdgeOptions()
        edge_options.use_chromium = True
        edge_options.add_argument("start-maximized")
        edge_options.add_argument("--disable-extensions")

        service = Service(executable_path=driver_path)
        driver = Edge(service=service, options=edge_options)

        driver.get("https://compendium.ch")
        time.sleep(4)

        search_input = driver.find_element(By.CSS_SELECTOR, "input[placeholder*='Suche']")
        slow_type(search_input, search_term, delay=0.15)
        time.sleep(0.5)
        search_input.send_keys(Keys.RETURN)

        time.sleep(4)
        driver.save_screenshot(SCREENSHOT_PATH)
        driver.quit()

        return f"Searched '{search_term}' and saved screenshot to '{SCREENSHOT_PATH}'"

    except (SessionNotCreatedException, WebDriverException) as e:
        return f"❌ Failed to launch Edge or complete search: {str(e)}"

## OCR function
def extract_text_from_screenshot(image_path=SCREENSHOT_PATH):
    if isinstance(image_path, str):
        image_path = image_path.strip().replace("image_path=", "").replace("'", "")
    image = Image.open(image_path)
    text = pytesseract.image_to_string(image)
    return f"OCR Result:\n{text}"

def safe_search_wrapper(term, retries=2):
    for attempt in range(retries):
        print(f"🔁 Attempt {attempt + 1}")
        result = open_edge_and_search(term)
        print(result)
        if "Screenshot saved" in result:
            return result
        time.sleep(1)
    return "❌ All retries failed."


In [46]:
llm = ChatOpenAI(temperature=0, model="gpt-4", openai_api_key=os.getenv("OPENAI_KEY"))

tools = [
    Tool(
        name="SearchInCompendium",
        func=lambda term: safe_search_wrapper(term),
        description="Searches compendium.ch for a medication like 'Dafalgan EXTRA' and takes a screenshot."
    ),
    Tool(
        name="ExtractTextFromScreenshot",
        func=extract_text_from_screenshot,
        description="Extracts visible text from a screenshot using OCR."
    )
]

agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True
)


In [47]:
goal = """
Search for 'Dafalgan EXTRA' on compendium.ch, extract the visible text from the results page.
Use tools step-by-step. Screenshot filename is 'edge_screenshot.png'.
"""

for i in range(3):
    print(f"\n🔄 Step {i+1}")
    result = agent.run(goal)
    print(result)

    if "Filmtabl" in result or "500/65 mg" in result:
        print("✅ Found product.")
        break



🔄 Step 1


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI need to search for 'Dafalgan EXTRA' on compendium.ch and then extract the visible text from the screenshot of the results page.

Action: SearchInCompendium
Action Input: 'Dafalgan EXTRA'[0m🔁 Attempt 1
Searched ''Dafalgan EXTRA'' and saved screenshot to 'edge_screenshot.png'
🔁 Attempt 2
Searched ''Dafalgan EXTRA'' and saved screenshot to 'edge_screenshot.png'

Observation: [36;1m[1;3m❌ All retries failed.[0m
Thought:[32;1m[1;3mThe search for 'Dafalgan EXTRA' on compendium.ch failed. I can't proceed to the next step without this information.
Final Answer: The search for 'Dafalgan EXTRA' on compendium.ch failed. I was unable to extract the visible text from the results page.[0m

[1m> Finished chain.[0m
The search for 'Dafalgan EXTRA' on compendium.ch failed. I was unable to extract the visible text from the results page.

🔄 Step 2


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI need to search 

KeyboardInterrupt: 