In [1]:
# Clone the repository
!git clone https://github.com/sathishkumar67/Browser-Automation.git

# move the files to the current directory
!mv /teamspace/studios/this_studio/Browser-Automation/* .

# upgrade pip
!pip install --upgrade pip

# install latest version pytorch
!pip install torch torchvision --index-url https://download.pytorch.org/whl/cu126

# install the required packages
!pip install -r requirements.txt

fatal: destination path 'Browser-Automation' already exists and is not an empty directory.


[38;5;57m[1m⚡️ Tip[0m	Connect GitHub to Studios: [4mhttps://lightning.ai/pt-sk/home?settings=integrations[0m



zsh:1: no matches found: /teamspace/studios/this_studio/Browser-Automation/*
Looking in indexes: https://download.pytorch.org/whl/cu126


In [2]:
# import the necessary libraries
from typing import Any, Literal, TypeAlias
import requests
import torch
from PIL import Image
from pydantic import BaseModel, Field
from transformers import AutoModelForImageTextToText, AutoProcessor
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

In [3]:
# Checkpoint to run. Larger variants: "Hcompany/Holo1.5-7B", "Hcompany/Holo1.5-72B".
model_name = "Hcompany/Holo1.5-3B"

# Weights are loaded in bfloat16 with automatic device placement.
model_load_kwargs = {"dtype": torch.bfloat16, "device_map": "auto"}
model = AutoModelForImageTextToText.from_pretrained(model_name, **model_load_kwargs)

# Matching processor for the same checkpoint; predict() uses it for the chat
# template, input tensors, and decoding.
processor = AutoProcessor.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# Schema of the one action the model is asked to emit. get_chat_messages embeds
# ClickAbsoluteAction.model_json_schema() in the prompt, and pydantic copies the
# class docstring and the Field descriptions into that schema, so the strings
# below are effectively prompt text: changing them changes what the model sees.
class ClickAbsoluteAction(BaseModel):
    """Click at absolute coordinates."""

    # Constant tag identifying the action type; only "click_absolute" is accepted.
    action: Literal["click_absolute"] = "click_absolute"
    # Click position in pixels, measured from the left and top edges of the image.
    # NOTE(review): presumably relative to the resized image that predict() feeds
    # to the model, not the original upload — confirm before using the values.
    x: int = Field(description="The x coordinate, number of pixels from the left edge.")
    y: int = Field(description="The y coordinate, number of pixels from the top edge.")


# One chat turn: a loosely typed mapping holding a role and its content parts.
ChatMessage: TypeAlias = dict[str, Any]


def get_chat_messages(task: str, image: Image.Image) -> list[ChatMessage]:
    """Build the single-turn chat payload for a localization request.

    Args:
        task: Description of the GUI element to locate; appended after the
            instructions on its own line.
        image: Screenshot attached to the user turn.

    Returns:
        A one-element list holding a ``user`` message whose content is the image
        followed by the instruction text.
    """
    # The schema is rendered with str() inside the f-string (Python dict repr).
    schema = ClickAbsoluteAction.model_json_schema()
    instructions = f"""Localize an element on the GUI image according to the provided target and output a click position.
     * You must output a valid JSON following the format: {schema}
     Your target is:"""

    image_part = {"type": "image", "image": image}
    text_part = {"type": "text", "text": f"{instructions}\n{task}"}
    user_turn: ChatMessage = {"role": "user", "content": [image_part, text_part]}
    return [user_turn]

In [8]:
def predict(image, task):
    """Ask the model where to click in ``image`` to accomplish ``task``.

    Args:
        image: PIL image of the GUI (must expose ``height``, ``width``, ``resize``).
        task: Text describing the target element.

    Returns:
        The decoded model completion as a string (the prompt asks for JSON
        matching ``ClickAbsoluteAction``; the text is returned unparsed).

    NOTE(review): the model sees the resized image, so any coordinates in the
    answer presumably refer to the resized dimensions rather than the original
    ones — confirm and rescale in the caller if needed.
    """
    img_proc = processor.image_processor

    # Dimensions are passed and unpacked as (height, width); the limits and the
    # factor come from the processor's own settings.
    target_h, target_w = smart_resize(
        image.height,
        image.width,
        factor=img_proc.patch_size * img_proc.merge_size,
        min_pixels=img_proc.min_pixels,
        max_pixels=img_proc.max_pixels,
    )

    # PIL takes (width, height), the reverse of the order used above.
    resized = image.resize(size=(target_w, target_h), resample=Image.Resampling.LANCZOS)

    # Render the chat into a prompt string that ends with the generation prompt.
    chat = get_chat_messages(task, resized)
    prompt_text = processor.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

    # Batch of one: tokenize text + image and move the tensors to the model's device.
    batch = processor(
        text=[prompt_text],
        images=[resized],
        padding=True,
        return_tensors="pt",
    )
    batch = batch.to(model.device)

    output_ids = model.generate(**batch, max_new_tokens=256)

    # Drop the leading prompt tokens from each sequence so only new tokens are decoded.
    completions = []
    for prompt_ids, full_ids in zip(batch.input_ids, output_ids):
        completions.append(full_ids[len(prompt_ids) :])

    decoded = processor.batch_decode(
        completions,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return decoded[0]

In [10]:

!pip install gradio

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting gradio
  Downloading gradio-5.47.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting brotli>=1.1.0 (from gradio)
  Downloading Brotli-1.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (5.5 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.6.1-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.13.2 (from gradio)
  Downloading gradio_client-1.13.2-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.11.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (41 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.13.1-py3-none-manylinux_

In [13]:
import gradio as gr

# Inputs are passed to predict() positionally: the uploaded image (as a PIL
# object) first, then the free-text task.
image_input = gr.Image(type="pil")
task_input = gr.Textbox()

iface = gr.Interface(fn=predict, inputs=[image_input, task_input], outputs=gr.JSON())

# share=True additionally requests a public gradio.live URL for the app.
iface.launch(share=True)

* Running on local URL:  http://127.0.0.1:7860


* Running on public URL: https://5cf7a28d59bceee215.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


