In [1]:
import numpy as np
import torch
import torch.fx
import transformers
from transformers import AutoTokenizer, LlamaForCausalLM
from vllm import LLM, SamplingParams

import openai
import os 
from gymnasium import envs
from tests.utils import get_all_registered_miniwob_envs
import time
import gymnasium as gym
import re

In [2]:
from miniwob.action import (
    create_coord_click_action,
    create_element_click_action,
    create_focus_and_type_action,
    create_type_action,
    create_none_action   
)

# Predefined Function

In [3]:
# Modified click_action function
def click_action1(tc, it, observation): 
    for element in observation["dom_elements"]:
        if (element["tag"] == tc or element["classes"] == tc) and (element["id"] == it or element["text"] == it):
            return create_element_click_action(element["ref"])
    return 'Cannot find in the DOM_element'

In [4]:
# Find the input text box and click on it
def find_input_action(tag, observation):
    for element in obs["dom_elements"]:
        if element["tag"] == tag:
            return create_element_click_action(element["ref"])

In [5]:
# Type Text
def enter_text_action(input_text, observation):
    target = input_text
    return create_type_action(target)

In [6]:
# Click on the screen based on input coordinates
def coord_click_action(x ,y):
    action = create_coord_click_action(x, y)
    return action

In [7]:
# Action function of doing Nothing
def do_nothing_action():
    return create_none_action()

In [8]:
# Count number of text or shape
def count_type(t, observation):
    shape_type = ['rect', 'circle', 'polygon']
    count = 0
    for element in obs['dom_elements']:
        if element['classes'] == 'SVG_CLASS':
            if t == 'text' and element['tag'] == t: 
                count += 1
            elif t == 'shape' and element['tag'] in shape_type:
                count += 1
    return count

# Get DOM_element

In [9]:
# after switching positions
def get_DOM_element(observation, useful_tag, useful_classes):
    objects_in_the_image = []
    for d in observation['dom_elements']:
        if d['classes'] in useful_classes:
            if d['text'] != '':
                objects_in_the_image.append(d['classes']+' '+d['text'])
            else:
                objects_in_the_image.append(d['classes']+' '+d['id'])
        elif d['tag'] in useful_tag:
            if d['text'] != '':
                objects_in_the_image.append(d['tag']+' '+d['text'])
            else:
                objects_in_the_image.append(d['tag']+' '+d['id'])
                
    return objects_in_the_image

In [10]:
useful_tag = {'button', 'text', 'input_time', 'textarea', 'polygon','label','input_password','rect',"tt",\
              'circle' ,'input_password', 'span', 'input_text', 'input_number', 'input_date', 'input_radio', \
              'tspan', 'input_checkbox', 't', 'button', 'h3', 'ul', 'a', 'p', 'div', 'span','input_text'}

useful_classes = {'folder'}

In [11]:
classes = {'alink'}

# Regex

In [12]:
class filter_prompt:
    def __init__(self):
        self.click_format = r'.*click_action1\s*\(\s*(?<!\\)[^\\,]+\s*,\s*(?<!\\)[^\\,]+\s*,\s*observation\s*\)$'
        self.click_format1 = r'.*click_action1\s*\(\s*[^\\,]+\s*,\s*[^\\,]+\s*,\s*observation\s*\)|.*click_action1\s*\(\s*[^\\,]+\s*,\s*observation\s*,\s*[^\\,]+\s*\)$'
        self.find_input_format = r'.*find_input_action\s*\(\s*observation\s*\)$'
        self.enter_text_format = r'.*enter_text_action\s*\(\s*(?<!\\)[^\\,]+\s*,\s*observation\s*\)$'
        self.coord_click_format = r'.*coord_click_action\s*\(\s*\d+\s*,\s*\d+\s*\)$'
        self.do_nothing_format = r'.*do_nothing_action\s*\(\s*\)$'
        self.count_format = r'.*count_type\s*\(\s*[^\\]*,\s*observation\s*\)$'
        self.make_sure = r'\s*\b(action|observation)'
        self.obs = r'\s*observation\s*,\s*reward\s*,\s*terminated\s*,\s*truncated\s*,\s*info\s*=\s*env\.step\(([^)]+)\)'
        
    def check_promt(self, expression=None):
        if expression == None:
            raise ValueError("Expression cannot be None")
           
        if not(bool(re.match(self.make_sure, expression, re.IGNORECASE))):
            return 'not_useful'
        
        if bool(re.match(self.obs, expression, re.IGNORECASE)):
            return expression
        
        exp = expression.split('=')[1].lstrip()
        action_type = exp.split('(')[0].lstrip()
        if action_type == 'click_action1':
            x = re.search(self.click_format1, expression)
            if x == None:
                return False
            return x.string
        
        if action_type == 'find_input_action':
            x = re.search(self.find_input_format, expression)
            if x == None:
                return False
            return x.string
        
        if action_type == 'enter_text_action':
            x = re.search(self.enter_text_format, expression)
            if x == None:
                return False
            return x.string
            
        if action_type == 'coord_click_action':
            x = re.search(self.coord_click_format, expression)
            if x == None:
                return False
            return x.string
        
        if action_type == 'do_nothing_action':
            x = re.search(self.do_nothing_format, expression)
            if x == None:
                return False
            return x.string
        
        if action_type == 'count_type':
            x = re.search(self.count_format, expression)
            if x == None:
                return False
            return x.string
        
        return False

# Llama2

In [13]:
System_Message = """You are designed to generate programs to solve a wide range of complex web interface tasks.
You should be able to generate the program using either one or a composition of predefined action functions 
along with general python codes to solve different tasks. You should not conversate with human in any context.
"""

# Can also set up example to use it later for each task
Function_Description = """
You should only use the functions provided herewith in the function description. 
Here is the list for the pre-defined functions [click_action1, enter_text_action, coord_click_action, count_type]

To use a function, please refer to the Name, Input, Output, Description of the functions, and usage examples below. 
Action functions should be called correctly in the solution. 

Function Name: click_action1
Input: tag or element, id or text, observation
Output: create_click_element_action
Description: useful when you want to click on an element in the web interface. This functions cannot be 
            generalized on names. Normally first input is one of tag or element and second is test or id.
            The output is given as the action by calling click_action1 function or 
            'Cannot find in the DOM_element' if no such thing to be clicked on.
Example: Objects in Image: Button One;
         Task: Click button ONE;
         Solution: action = click_action1('button', 'ONE', observation)
                   observation, reward, terminated, truncated, info = env.step(action)

Function Name: enter_text_action
Input: input_text, observation
Output: create_type_action
Description: useful when you want to type the input_text into input text box or a similar object like input_number
            that can accept 
             text given the observation of the task interface. Need to call click_action1 to click on it before
             calling this function.
             The output is given as the action by calling enter_text_action function.
Example: Objects in Image: input_text textbox; 
         Task: Type 'Heyi' into textbox;
         Solution: ction = click_action1('input_text', 'textbox', observation)
                   observation, reward, terminated, truncated, info = env.step(action)
                   action = enter_text_action('Heyi', observation)
                   observation, reward, terminated, truncated, info = env.step(action)
            
Function Name: coord_click_action
Input: x, y
Output: create_coord_click_action
Description: useful when you want to click at a the (x, y) coordinate on the task interface given the left
             upper corner of the task interface as the coordinate(0,0). 
             The output is given as the action by calling coord_click_action function.
Example: Task: Click on coordinate (x, y) on the screen 
         Solution: action = coord_click_action(x, y)
                   observation, reward, terminated, truncated, info = env.step(action)

Name: count_type
Input: tag, observation
Output: count
Description: useful when you want count the number with the type tag given by the input. Observation of the 
             task interface is also given. The output is given as the find count of that element by calling 
             count_type function.
Example: Objects in Image: Shape Circle, Shape Square 
         Task: Count number of Shape objects in the screen 
         Solution: num = count(Shape, observation)
"""


Solution_Description ="""
Objects in the image are given by the concatenation of either tag or classes and either text or id. Solution
should be consistent with objects in the image when calling functions.
The final solution should be given after the text 'Solution?' based on the Object in the image and task given, 
and should only give the solution by either calling the predfined functions' names 
with or without using python format data structure. No need to specify extra text solution when giving solution.
Any other texts can only be given as comment in python format.
Assume that you can direclty use observation before calling the function without checking it.

Cannot assume there are other functions beside previous given functions or unknown imformation from the system.
Solution actions will be executed directly, no need to regive solution again as a list, but add 
observation, reward, terminated, truncated, info = env.step(action) after every action. Actions are independent
from each other.

If you cannot directly solve a given task as shown in the examples in function description or as a hierarchy of actions
to achieve the final goal, you should try to first perform a reasonable action which you can do given the objects
in the image and interact with the environment, post that observe the change in objects in the image, 
and then try to solve the task for new objects in the environment. 
Remember, you can add any other python data structure like if-else, for loop, while loop etc to 
generated program to solve the question. You should generate the program based on the objects in the image and use 
your knowledge from your training data on DOM Elements to understand the objects in the image. 
"""

# Currently not including the do_nothing_action, can add it later
# """
# Function Name: do_nothing_action
# Input: 
# Output: create_none_action
# Description: useful when you find there is no correct way of solving the task by calling any other function. 
#              You can call this function multiple times to create a long period of no action. 
#              The output is given as the action by calling do_nothing_action function.
# Example: Objects in Image: Button One, Button Two 
#          Task: Click button three 
#          Solution: action = do_nothing_action()
# """

In [14]:
device="cuda:0"
model = LlamaForCausalLM.from_pretrained("llama/model_2/",load_in_8bit=True, device_map="cuda:0", max_position_embeddings=4096)
tokenizer = AutoTokenizer.from_pretrained("llama/model_2/")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [15]:
prompt = "<s>[INST] <<SYS>> {{" + System_Message + " }}<</SYS>> {{" + Function_Description + "}} [/INST] \
{{" + Solution_Description + "}} [/INST]"

In [15]:
# inputs = tokenizer(prompt, return_tensors="pt").to(device)
# generate_ids = model.generate(inputs.input_ids, max_length = 1600)
# tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

# Task 1 - Click button Sequence

In [68]:
env = gym.make("miniwob/click-button-sequence-v1" , render_mode="human")
observation, info = env.reset()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [69]:
print(type(prompt))

<class 'str'>


In [70]:
objects_in_the_image = get_DOM_element(observation, useful_tag, useful_classes)
task=observation["utterance"]
task_input = prompt + "Objects in Image: {}; \
                       Task:{}'  \
                       Solution?".format(objects_in_the_image,task)

In [72]:
inputs = tokenizer(task_input, return_tensors="pt").to(device)
generate_ids = model.generate(inputs.input_ids, max_length = 1600)
solution = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

In [181]:
solution

"[INST] <<SYS>> {{You are designed to generate programs to solve a wide range of complex web interface tasks.\nYou should be able to generate the program using either one or a composition of predefined action functions \nalong with general python codes to solve different tasks. You should not conversate with human in any context.\n }}<</SYS>> {{\nYou should only use the functions provided herewith in the function description. \nHere is the list for the pre-defined functions [click_action1, enter_text_action]\n\nTo use a function, please refer to the Name, Input, Output, Description of the functions, and usage examples below. \nAction functions should be called correctly in the solution. \n\nFunction Name: click_action1\nInput: tag or element, id or text, observation\nOutput: create_click_element_action\nDescription: useful when you want to click on an element in the web interface. This functions cannot be \n            generalized on names. Normally first input is one of tag or element

In [57]:
# def llama_chat(system_prompt, conversation):
#     conversation_prompt = f"<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>"
#     for user_msg, assistant_response in conversation:
#         conversation_prompt += f"<s>[INST] {{ user_message }} [/INST] {user_msg}"
#         if assistant_response:
#             conversation_prompt += f" {{ assistant_message }} [/INST] {assistant_response}"
            
#     conversation_prompt += "[/INST]"

#     assistant_response = response.choices[0].text.strip()
#     return assistant_response

# system_prompt = "You are a helpful assistant that provides information."
# conversation = [
#     ("What is the capital of France?", "The capital of France is Paris."),
#     ("Tell me more about Paris.", "Paris is known for its rich history and cultural attractions."),
#     ("Recommend a famous French dish.", "One famous French dish is Coq au Vin, a delicious chicken stew."),
# ]

# response = llama_chat(system_prompt, conversation)
# print("Assistant:", response)


In [29]:
class ChatAppLlama:
    def __init__(self, model, tokenizer):
        self.device = "cuda:0"
#         self.model = LlamaForCausalLM.from_pretrained("llama/model_2/", load_in_8bit=True, device_map="cuda:0", max_position_embeddings=4096)
#         self.tokenizer = AutoTokenizer.from_pretrained("llama/model_2/")
        self.system_message = ""
        self.function_description = ""
        self.solution_description = ""
        self.model_response = ""
        self.conversation = []
        
    def build_prompt(self, include_model_response=False, include_user_message=False):
        prompt = f"<s>[INST] <<SYS>> {{ {self.system_message} }}<</SYS>> {{ {self.function_description} }} [/INST] {{ {self.solution_description} }} [/INST]"

        if include_user_message and hasattr(self, 'user_message'):
            prompt += f"{{ {self.user_message} }}"

        if include_model_response and self.model_response:
            prompt += f"{{ {self.model_response} }}"

        return prompt
    
#     def build_prompt(self, include_model_response=False):
#         prompt = f"<s>[INST] <<SYS>> {{ {self.system_message} }}<</SYS>> {{ {self.function_description} }} [/INST] {{ {self.solution_description} }} [/INST]"
#         if include_model_response and self.model_response:
#             prompt += f"{{ {self.model_response} }}"
#         return prompt

    def set_system_message(self, message):
        self.system_message = message
        self.conversation.append(("System: " + message, None))

    def set_function_description(self, description):
        self.function_description = description
        self.conversation.append(("Function Description: " + description, None))

    def set_solution_description(self, description):
        self.solution_description = description
        self.conversation.append(("Solution Description: " + description, None))
    
    def set_user_message(self, user_message):
        self.user_message = user_message
        self.conversation.append(("User: " + user_message, None))
    
    def chat(self, user_message, model_response=None, include_model_response=False, include_user_message=False):
        self.set_user_message(user_message)
        self.model_response = model_response
        if model_response:
            self.conversation.append(("Model: " + model_response, None))
        prompt = self.build_prompt(include_model_response, include_user_message)
        return prompt
    
model = LlamaForCausalLM.from_pretrained("llama/model_2/", load_in_8bit=True, device_map="cuda:0", max_position_embeddings=4096)
tokenizer = AutoTokenizer.from_pretrained("llama/model_2/")

chat_app = ChatAppLlama(model, tokenizer)

# chat_app = ChatAppLlama()
chat_app.set_system_message(System_Message)
chat_app.set_function_description(Function_Description)
chat_app.set_solution_description(Solution_Description)

user_input = ""

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [30]:
env = gym.make("miniwob/click-button-sequence-v1" , render_mode="human")
observation, info = env.reset()

In [31]:
objects_in_the_image = get_DOM_element(observation, useful_tag, useful_classes)
task=observation["utterance"]

In [32]:
task

'Click button ONE, then click button TWO.'

In [33]:
# prompt = chat_app.chat(user_input)
model_response = " "
prompt_with_model_response = chat_app.chat(user_input, model_response, include_model_response=False, include_user_message=True)
# task_input = prompt + f"Objects in Image: {}; Task:{} Solution?".format(objects_in_the_image, task)
task_input_with_model_response = (
    f"{prompt_with_model_response} Objects in Image: {objects_in_the_image}; Task: {task} Solution?"
)

In [34]:
inputs = tokenizer(task_input_with_model_response, return_tensors="pt").to(chat_app.device)
generate_ids = model.generate(inputs.input_ids, max_length=1600)
solution = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [35]:
# print(task_input_with_model_response)

In [36]:
print(solution)

[INST] <<SYS>> { You are designed to generate programs to solve a wide range of complex web interface tasks.
You should be able to generate the program using either one or a composition of predefined action functions 
along with general python codes to solve different tasks. You should not conversate with human in any context.
 }<</SYS>> { 
You should only use the functions provided herewith in the function description. 
Here is the list for the pre-defined functions [click_action1, enter_text_action, coord_click_action, count_type]

To use a function, please refer to the Name, Input, Output, Description of the functions, and usage examples below. 
Action functions should be called correctly in the solution. 

Function Name: click_action1
Input: tag or element, id or text, observation
Output: create_click_element_action
Description: useful when you want to click on an element in the web interface. This functions cannot be 
            generalized on names. Normally first input is one o

In [37]:
p = filter_prompt()

In [38]:
final_solution=p.check_promt(solution)

In [39]:
final_solution

'not_useful'

In [40]:
import re

message = """
[INST] <<SYS>> { You are designed to generate programs to solve a wide range of complex web interface tasks.
You should be able to generate the program using either one or a composition of predefined action functions 
along with general python codes to solve different tasks. You should not conversate with human in any context.
 }<</SYS>> { 
You should only use the functions provided herewith in the function description. 
Here is the list for the pre-defined functions [click_action1, enter_text_action, coord_click_action, count_type]

To use a function, please refer to the Name, Input, Output, Description of the functions, and usage examples below. 
Action functions should be called correctly in the solution. 

Function Name: click_action1
Input: tag or element, id or text, observation
Output: create_click_element_action
Description: useful when you want to click on an element in the web interface. This functions cannot be 
            generalized on names. Normally first input is one of tag or element and second is test or id.
            The output is given as the action by calling click_action1 function or 
            'Cannot find in the DOM_element' if no such thing to be clicked on.
Example: Objects in Image: Button One;
         Task: Click button ONE;
         Solution: action = click_action1('button', 'ONE', observation)
                   observation, reward, terminated, truncated, info = env.step(action)

Function Name: enter_text_action
Input: input_text, observation
Output: create_type_action
Description: useful when you want to type the input_text into input text box or a similar object that can accept 
             text given the observation of the task interface. Need to call click_action1 to click on it before
             calling this function.
             The output is given as the action by calling enter_text_action function.
Example: Objects in Image: input_text textbox; 
         Task: Type 'Heyi' into textbox;
         Solution: ction = click_action1('input_text', 'textbox', observation)
                   observation, reward, terminated, truncated, info = env.step(action)
                   action = enter_text_action('Heyi', observation)
                   observation, reward, terminated, truncated, info = env.step(action)
            
Function Name: coord_click_action
Input: x, y
Output: create_coord_click_action
Description: useful when you want to click at a the (x, y) coordinate on the task interface given the left
             upper corner of the task interface as the coordinate(0,0). 
             The output is given as the action by calling coord_click_action function.
Example: Task: Click on coordinate (x, y) on the screen 
         Solution: action = coord_click_action(x, y)
                   observation, reward, terminated, truncated, info = env.step(action)

Name: count_type
Input: tag, observation
Output: count
Description: useful when you want count the number with the type tag given by the input. Observation of the 
             task interface is also given. The output is given as the find count of that element by calling 
             count_type function.
Example: Objects in Image: Shape Circle, Shape Square 
         Task: Count number of Shape objects in the screen 
         Solution: num = count(Shape, observation)
 } [/INST] { 
Objects in the image are given by the concatenation of either tag or classes and either text or id. Solution
should be consistent with objects in the image when calling functions.
The final solution should be given after the text 'Solution?' based on the Object in the image and task given, 
and should only give the solution by either calling the predfined functions' names 
with or without using python format data structure. No need to specify extra text solution when giving solution.
Any other texts can only be given as comment in python format.
Assume that you can direclty use observation before calling the function without checking it.

Cannot assume there are other functions beside previous given functions or unknown imformation from the system.
Solution actions will be executed directly, no need to regive solution again as a list, but add 
observation, reward, terminated, truncated, info = env.step(action) after every action. Actions are independent
from each other.

If you cannot directly solve a given task as shown in the examples in function description or as a hierarchy of actions
to achieve the final goal, you should try to first perform a reasonable action which you can do given the objects
in the image and interact with the environment, post that observe the change in objects in the image, 
and then try to solve the task for new objects in the environment. 
Remember, you can add any other python data structure like if-else, for loop, while loop etc to 
generated program to solve the question. 
 } [/INST] Objects in Image: ['div wrap', 'div area', 'button ONE', 'button TWO']; Task: Click button ONE, then click button TWO. Solution?

To solve this task, we can use the predefined functions provided in the question. Here's one way to do it:
action = click_action1('button', 'ONE')
observation, reward, terminated, truncated, info = env.step(action)
action = enter_text_action('Click button TWO', observation)
observation, reward, terminated, truncated, info = env.step(action)

In this solution, we first use the `click_action1` function to click on the `button ONE` element in the web interface. Then, we use the `enter_text_action` function to type the text "Click button TWO" into the input box. Finally, we use the `env.step` function to execute the action and observe the result.
Note that we don't need to specify the `x` and `y` coordinates for the `coord_click_action` function since we're already clicking on an element with the `click_action1` function. Also, we don't need to use the `count_type` function since we're only dealing with two buttons in this task.
"""

capture_actions = False
action_steps = []
for line in message.split('\n'):
    if "Solution:" in line:
        capture_actions = False
    elif capture_actions:
        action_steps.append(line.strip())
    elif "Example:" in line:
        capture_actions = True

for action_step in action_steps:
    print(action_step)

Task: Click button ONE;
Task: Type 'Heyi' into textbox;
Task: Count number of Shape objects in the screen


In [41]:
actions = re.findall(r'action = .+', solution)
for action in actions:
    print(action)

action = click_action1('button', 'ONE', observation)
action = enter_text_action('Heyi', observation)
action = coord_click_action(x, y)
action = click_action1('button', 'ONE', observation)
action = click_action1('button', 'TWO', observation)


In [42]:

actions_and_observations = re.findall(r'(action = .+|observation, .+ = env\.step\(action\))', solution)

for line in actions_and_observations:
    print(line)

action = click_action1('button', 'ONE', observation)
observation, reward, terminated, truncated, info = env.step(action)
observation, reward, terminated, truncated, info = env.step(action)
action = enter_text_action('Heyi', observation)
observation, reward, terminated, truncated, info = env.step(action)
action = coord_click_action(x, y)
observation, reward, terminated, truncated, info = env.step(action)
observation, reward, terminated, truncated, info = env.step(action)
action = click_action1('button', 'ONE', observation)
observation, reward, terminated, truncated, info = env.step(action)
action = click_action1('button', 'TWO', observation)
observation, reward, terminated, truncated, info = env.step(action)


In [43]:
actions_and_observations = re.findall(r'(action = .+|observation, .+ = env\.step\(action\))\n(?!Solution:)', solution)

for line in actions_and_observations:
    print(line)

action = click_action1('button', 'ONE', observation)
observation, reward, terminated, truncated, info = env.step(action)
observation, reward, terminated, truncated, info = env.step(action)
action = enter_text_action('Heyi', observation)
observation, reward, terminated, truncated, info = env.step(action)
action = coord_click_action(x, y)
observation, reward, terminated, truncated, info = env.step(action)
action = click_action1('button', 'ONE', observation)
observation, reward, terminated, truncated, info = env.step(action)
action = click_action1('button', 'TWO', observation)
observation, reward, terminated, truncated, info = env.step(action)


In [44]:
actions_and_observations = re.findall(r'(action = .+|observation, .+ = env\.step\(action\))\n(?!Solution:)', solution)
actions_and_observations = actions_and_observations[5:]
for line in actions_and_observations:
    print(line)

action = coord_click_action(x, y)
observation, reward, terminated, truncated, info = env.step(action)
action = click_action1('button', 'ONE', observation)
observation, reward, terminated, truncated, info = env.step(action)
action = click_action1('button', 'TWO', observation)
observation, reward, terminated, truncated, info = env.step(action)


# Click-Sequence

In [45]:
count = 0
iteration = 50
incorrect_program_list=[]
for i in range(iteration):
    program=[]
    chat_app = ChatAppLlama(model, tokenizer)
    env = gym.make("miniwob/click-button-sequence-v1" , render_mode="human")
    observation, info = env.reset()
    task = observation["utterance"]
    objects_in_the_image = get_DOM_element(observation, useful_tag, useful_classes)
    chat_app.set_system_message(System_Message)
    chat_app.set_function_description(Function_Description)
    chat_app.set_solution_description(Solution_Description)
    prompt_with_model_response = chat_app.chat(user_input, model_response, include_model_response=False)
    task_input_with_model_response = (
        f"{prompt_with_model_response} Objects in Image: {objects_in_the_image}; Task: {task} Solution?"
    )
#     print(task_input_with_model_response)
    inputs = tokenizer(task_input_with_model_response, return_tensors="pt").to(chat_app.device)
    generate_ids = model.generate(inputs.input_ids, max_length=1600)
    solution = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
#     print(solution)
    actions_and_observations = re.findall(r'(action = .+|observation, .+ = env\.step\(action\))\n(?!Solution:)', solution)
    actions_and_observations = actions_and_observations[5:]
    for line in actions_and_observations:
        print(line)
        exec(line)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


action = coord_click_action(x, y)


NameError: name 'x' is not defined

# Click Button

In [47]:
env = gym.make("miniwob/click-button-v1" , render_mode="human")
observation, info = env.reset()
task = observation["utterance"]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [16]:
count = 0
iteration = 50
incorrect_program_list=[]
for i in range(iteration):
    program=[]
    chat_app = ChatAppLlama(model, tokenizer)
    env = gym.make("miniwob/click-checkboxes-large-v1", render_mode="human")
    observation, info = env.reset()
    task = observation["utterance"]
    objects_in_the_image = get_DOM_element(observation, useful_tag, useful_classes)
    chat_app.set_system_message(System_Message)
    chat_app.set_function_description(Function_Description)
    chat_app.set_solution_description(Solution_Description)
    prompt_with_model_response = chat_app.chat(user_input, model_response, include_model_response=False)
    task_input_with_model_response = (
        f"{prompt_with_model_response} Objects in Image: {objects_in_the_image}; Task: {task} Solution?"
    )
#     print(task_input_with_model_response)
    inputs = tokenizer(task_input_with_model_response, return_tensors="pt").to(chat_app.device)
    generate_ids = model.generate(inputs.input_ids, max_length=1600)
    solution = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
#     print(solution)
    actions_and_observations = re.findall(r'(action = .+|observation, .+ = env\.step\(action\))\n(?!Solution:)', solution)
    actions_and_observations = actions_and_observations[5:]
#     print(actions_and_observations)
    for line in actions_and_observations:
        print(line)
        try:
            exec(line)
        except:
            print("error")

NameError: name 'ChatAppLlama' is not defined

# click checkbox

In [51]:
env = gym.make("miniwob/click-checkboxes-large-v1" , render_mode="human")
observation, info = env.reset()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [52]:
observation["dom_elements"]

[{'ref': 1,
  'parent': 0,
  'pos': array([0., 0.], dtype=float32),
  'size': array([485., 210.], dtype=float32),
  'tag': 'body',
  'text': '',
  'value': '',
  'id': '',
  'classes': '',
  'bg_color': array([0.33333334, 0.33333334, 0.33333334, 1.        ], dtype=float32),
  'fg_color': array([0., 0., 0., 1.], dtype=float32),
  'flags': array([1, 0, 0, 0], dtype=int8)},
 {'ref': 2,
  'parent': 1,
  'pos': array([0., 0.], dtype=float32),
  'size': array([160., 210.], dtype=float32),
  'tag': 'div',
  'text': '',
  'value': '',
  'id': 'wrap',
  'classes': '',
  'bg_color': array([1., 1., 1., 1.], dtype=float32),
  'fg_color': array([0., 0., 0., 1.], dtype=float32),
  'flags': array([0, 0, 0, 0], dtype=int8)},
 {'ref': 3,
  'parent': 2,
  'pos': array([ 0., 50.], dtype=float32),
  'size': array([160., 130.], dtype=float32),
  'tag': 'div',
  'text': '',
  'value': '',
  'id': 'area',
  'classes': '',
  'bg_color': array([0., 0., 0., 0.], dtype=float32),
  'fg_color': array([0., 0., 0., 

In [53]:
task=observation["utterance"]
task

'Select ft3N, GJS2cJ, 558, zC, uGjx, 2FwR08z, nG7LZRd, yd and click Submit.'

In [54]:
objects_in_the_image = get_DOM_element(observation, useful_tag, useful_classes)
objects_in_the_image

['div wrap',
 'div area',
 'div boxes-left',
 'label ',
 'input_checkbox ch0',
 't GJS2cJ',
 'label ',
 'input_checkbox ch1',
 't 2FwR08z',
 'label ',
 'input_checkbox ch2',
 't yd',
 'label ',
 'input_checkbox ch3',
 't uGjx',
 'label ',
 'input_checkbox ch4',
 't 558',
 'div boxes-right',
 'label ',
 'input_checkbox ch5',
 't nG7LZRd',
 'label ',
 'input_checkbox ch6',
 't zC',
 'label ',
 'input_checkbox ch7',
 't qlM60k4',
 'label ',
 'input_checkbox ch8',
 't ft3N',
 'button Submit']

In [57]:
user_input = '''This is an example that you can use to understand and solve the task at hand:

Objects in Image: 'input_checkbox ch0','t X6sllIy','input_checkbox ch1','t jp0TH','input_checkbox ch2','t ZLfc','input_checkbox ch3','t 1kM17h','input_checkbox ch4','t wUHC6N','input_checkbox ch5','t j2bBA','input_checkbox ch6','t Vn8V','input_checkbox ch7','t GFUkK','input_checkbox ch8','t 30V','input_checkbox ch9','t qNN0oQd','button Submit';
             Task: Select wUHC6N, j2bBA, X6sllIy, GFUkK, jp0TH, 30V, Vn8V, ZLfc, qNN0oQd, 1kM17h and click Submit.;
             Solutions? 
action1 = click_action1('input_checkbox', 'ch4', observation)
observation, reward, terminated, truncated, info = env.step(action1)

action2 = click_action1('input_checkbox', 'ch5', observation)
observation, reward, terminated, truncated, info = env.step(action2)

action3 = click_action1('input_checkbox', 'ch0', observation)
observation, reward, terminated, truncated, info = env.step(action3)

action4 = click_action1('input_checkbox', 'ch7', observation)
observation, reward, terminated, truncated, info = env.step(action4)

action5 = click_action1('input_checkbox', 'ch1', observation)
observation, reward, terminated, truncated, info = env.step(action5)

action6 = click_action1('input_checkbox', 'ch8', observation)
observation, reward, terminated, truncated, info = env.step(action6)

action7 = click_action1('input_checkbox', 'ch6', observation)
observation, reward, terminated, truncated, info = env.step(action7)

action8 = click_action1('input_checkbox', 'ch2', observation)
observation, reward, terminated, truncated, info = env.step(action8)

action9 = click_action1('input_checkbox', 'ch9', observation)
observation, reward, terminated, truncated, info = env.step(action9)

action10 = click_action1('input_checkbox', 'ch3', observation)
observation, reward, terminated, truncated, info = env.step(action10)

action11 = click_action1('button', 'Submit', observation)
observation, reward, terminated, truncated, info = env.step(action11)
          '''

In [60]:
count = 0
iteration = 1
incorrect_program_list=[]
for i in range(iteration):
    program=[]
    model_response=""
    user_input=example
    chat_app = ChatAppLlama(model, tokenizer)
    env = gym.make("miniwob/click-checkboxes-large-v1" , render_mode="human")
    observation, info = env.reset()
    task = observation["utterance"]
    objects_in_the_image = get_DOM_element(observation, useful_tag, useful_classes)
    chat_app.set_system_message(System_Message)
    chat_app.set_function_description(Function_Description)
    chat_app.set_solution_description(Solution_Description)
    prompt_with_model_response = chat_app.chat(user_input, model_response, include_model_response=False, include_user_message=True)
    task_input_with_model_response = (
        f"{prompt_with_model_response} Objects in Image: {objects_in_the_image}; Task: {task} Solution?"
    )
    print(task_input_with_model_response)
    inputs = tokenizer(task_input_with_model_response, return_tensors="pt").to(chat_app.device)
    generate_ids = model.generate(inputs.input_ids, max_length=1600)
    solution = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
#     print(solution)
    actions_and_observations = re.findall(r'(action = .+|observation, .+ = env\.step\(action\))\n(?!Solution:)', solution)
#     print(actions_and_observations)
    actions_and_observations = actions_and_observations[5:]
#     print(actions_and_observations)
#     for line in actions_and_observations:
#         print(line)
#         try:
#             exec(line)
#         except:
#             print("error")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s>[INST] <<SYS>> { You are designed to generate programs to solve a wide range of complex web interface tasks.
You should be able to generate the program using either one or a composition of predefined action functions 
along with general python codes to solve different tasks. You should not conversate with human in any context.
 }<</SYS>> { 
You should only use the functions provided herewith in the function description. 
Here is the list for the pre-defined functions [click_action1, enter_text_action]

To use a function, please refer to the Name, Input, Output, Description of the functions, and usage examples below. 
Action functions should be called correctly in the solution. 

Function Name: click_action1
Input: tag or element, id or text, observation
Output: create_click_element_action
Description: useful when you want to click on an element in the web interface. This functions cannot be 
            generalized on names. Normally first input is one of tag or element and second i



In [61]:
print(solution)

[INST] <<SYS>> { You are designed to generate programs to solve a wide range of complex web interface tasks.
You should be able to generate the program using either one or a composition of predefined action functions 
along with general python codes to solve different tasks. You should not conversate with human in any context.
 }<</SYS>> { 
You should only use the functions provided herewith in the function description. 
Here is the list for the pre-defined functions [click_action1, enter_text_action]

To use a function, please refer to the Name, Input, Output, Description of the functions, and usage examples below. 
Action functions should be called correctly in the solution. 

Function Name: click_action1
Input: tag or element, id or text, observation
Output: create_click_element_action
Description: useful when you want to click on an element in the web interface. This functions cannot be 
            generalized on names. Normally first input is one of tag or element and second is t

# Click