In [None]:
!pip install -U datasets beautifulsoup4 pillow transformers accelerate bitsandbytes transformers peft



In [None]:
import datasets
from PIL import Image

# Open the flattened Multimodal-Mind2Web training split from the Hugging Face Hub.
# streaming=True is used so the split does not have to be stored on disk first.
multimodal_dataset = datasets.load_dataset(
    "osunlp/Multimodal-Mind2Web",
    split="train",
    streaming=True,
)

Resolving data files:   0%|          | 0/27 [00:00<?, ?it/s]

In [None]:
import datasets
from PIL import Image

# Pull one row off the stream; each row is a single action step of a task
first_example = next(iter(multimodal_dataset))

# --- Show the fields the rest of the notebook relies on ---
print("--- First Training Example (Single Action) ---")
print(f"Goal: {first_example['confirmed_task']}")

# Textual description of the action recorded for this row
print(f"\nTarget Action Representation: {first_example['target_action_reprs']}")

# The cleaned page HTML is a direct field of the row; preview only its beginning
print("\nCleaned HTML Snippet (first 500 chars):", first_example['cleaned_html'][:500], sep="\n")

# The 'screenshot' field is an image object exposing .mode and .size
screenshot_image = first_example['screenshot']
print("\nScreenshot object:", screenshot_image, sep="\n")
print(f"Image mode: {screenshot_image.mode}, Image size: {screenshot_image.size}")

# You can display the image in Colab by having `screenshot_image` as the last line
# screenshot_image

--- First Training Example (Single Action) ---
Goal: rent a car in Brooklyn - Central, NY on from April 9 to April 15.

Target Action Representation: [heading]  CAR -> CLICK

Cleaned HTML Snippet (first 500 chars):
<html backend_node_id="208">
  <body backend_node_id="500">
    <div backend_node_id="1054">
      <div backend_node_id="1055">
        <div backend_node_id="1056">
          <div backend_node_id="1057">
            <div backend_node_id="1060">
              <div backend_node_id="1064">
                <h1 backend_node_id="1065">
                  <text backend_node_id="1066">Welcome to United.com</text>
                </h1>
                <a backend_node_id="1067">
                  <text bac

Screenshot object:
<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1280x5429 at 0x7A133282AE40>
Image mode: RGB, Image size: (1280, 5429)


In [None]:
import hashlib
import json
import re

from bs4 import BeautifulSoup

In [None]:
def clean_text(text, max_len=60):
    """
    Strip tag-like fragments, normalise whitespace, and cap the length.

    Falsy input (None, "") yields "". Otherwise every "<...>" fragment is
    removed, runs of whitespace collapse to single spaces (leading and
    trailing whitespace is dropped), and the result is cut to at most
    `max_len` characters so prompts stay small.
    """
    if text:
        # Drop anything that looks like a tag before normalising whitespace
        without_tags = re.sub(r'<[^>]+>', '', text)
        words = without_tags.split()
        return " ".join(words)[:max_len]
    return ""

In [None]:
def format_attributes(attrs):
    """
    Render an element's attributes as a compact "(k='v', flag, ...)" suffix.

    Only attributes useful for identifying the element are kept, in this
    order: `type`, then descriptive attributes, then boolean state flags.
    Returns "" when nothing is worth emitting.
    """
    parts = []

    # 1. type: the values 'text', 'button' and 'reset' are omitted to save
    #    tokens; every other value (including 'submit') is emitted.
    if "type" in attrs:
        type_value = attrs["type"].lower().strip()
        if type_value not in ("text", "button", "reset"):
            parts.append(f"type='{type_value}'")

    # 2. Descriptive attributes, cleaned and capped at 40 characters each.
    #    Two long keys are abbreviated for compactness.
    short_names = {"aria-label": "aria", "placeholder": "ph"}
    for key in ("role", "name", "value", "aria-label", "placeholder", "title", "alt"):
        if key not in attrs or not attrs[key]:
            continue
        cleaned = clean_text(attrs[key], 40)
        if cleaned:
            parts.append(f"{short_names.get(key, key)}='{cleaned}'")

    # 3. State flags: presence alone matters, the value is ignored.
    parts.extend(
        flag
        for flag in ("checked", "disabled", "selected", "required", "readonly")
        if flag in attrs
    )

    if not parts:
        return ""
    return "(" + ", ".join(parts) + ")"

In [None]:
def get_element_uid(el):
    """
    Return the element's 'backend_node_id' attribute, or "" when it has none.

    This id is what ties an element shown in the prompt back to the
    ground-truth label, so it must be used as the element identifier.
    """
    node_attrs = el.attrs
    return node_attrs.get("backend_node_id", "")

In [None]:
def distill_dom(html_string, max_elements=200):
    """
    Reduce a page's HTML to a compact, one-line-per-element candidate list.

    Steps:
    1. Parse the HTML and remove subtrees that never yield candidates
       (script, style, svg, footer, head, ...).
    2. Walk every remaining tag in document order.
    3. Emit headings (h1-h6) as context lines marked "[-]".
    4. Emit elements that carry a backend_node_id and are interactive,
       either by tag name (button, input, a, ...) or by ARIA role
       (tab, link, ...).
    5. Truncate the list to `max_elements` lines.

    The first line is always the sentinel
    "[0] <option> Target element is not in this list".

    Args:
        html_string: HTML markup to distill.
        max_elements: Maximum number of lines returned, counting the
            sentinel and heading lines. Defaults to 200.

    Returns:
        str: Newline-joined lines shaped like "[uid] <tag> text (attributes)".
    """
    soup = BeautifulSoup(html_string, "html.parser")

    # 1. Prune structural junk
    for tag in soup.find_all(["script", "style", "meta", "link", "noscript", "svg", "path", "footer", "head"]):
        tag.decompose()

    # START with the Sentinel Token
    # It is formatted like a regular element line so it can be selected by ID 0
    candidates = [
        "[0] <option> Target element is not in this list"
    ]

    # Natively interactive tags
    INTERACTIVE_TAGS = {"a", "button", "input", "select", "textarea", "option", "label", "li", "summary"}
    # Roles that make a generic element (like div/span) interactive
    INTERACTIVE_ROLES = {"button", "tab", "link", "checkbox", "menuitem", "radio", "combobox", "listbox", "option", "switch", "searchbox"}
    # Headers for context
    HEADER_TAGS = {"h1", "h2", "h3", "h4", "h5", "h6"}
    # Form controls are kept even with no text/attributes (they may be icon-only)
    ALWAYS_KEEP_TAGS = {"input", "button", "select", "textarea"}

    # 2. Traverse ALL tags in document order
    for tag in soup.find_all(True):
        uid = get_element_uid(tag)

        # Condition A: headings are kept purely as context, without an ID.
        # NOTE(review): a heading's uid is never printed, so a `[uid]` lookup on
        # the output cannot find a heading even when it is the labelled target.
        if tag.name in HEADER_TAGS:
            text = clean_text(tag.get_text(separator=" ", strip=True))
            if text:
                candidates.append(f"[-] <{tag.name}> {text}")
            continue

        # Condition B: It must have a UID to be actionable
        if not uid:
            continue

        # Condition C: interactive by tag name or by role
        is_interactive_tag = tag.name in INTERACTIVE_TAGS

        # The parser may hand back a list for an attribute; an empty list must
        # not crash the lookup, so fall back to "" in that case.
        role = tag.attrs.get("role", "")
        if isinstance(role, list):
            role = role[0] if role else ""
        is_interactive_role = role in INTERACTIVE_ROLES

        if not (is_interactive_tag or is_interactive_role):
            continue

        text = clean_text(tag.get_text(separator=" ", strip=True))
        attr_str = format_attributes(tag.attrs)

        # Skip elements that would render as a bare "[uid] <tag>" line,
        # unless they are form controls that might be icon-only
        if not text and not attr_str and tag.name not in ALWAYS_KEEP_TAGS:
            continue

        # Formatting: [1250] <li> Car (role='tab')
        line = f"[{uid}] <{tag.name}> {text} {attr_str}"

        # Collapse the double spaces left by empty text/attribute parts
        candidates.append(" ".join(line.split()))

    # 3. Safety limit on prompt size: keep only the first `max_elements` lines
    if len(candidates) > max_elements:
        candidates = candidates[:max_elements]

    return "\n".join(candidates)

In [None]:
def build_selector_from_candidate(candidate):
    """
    Build a simple CSS selector for a candidate element.

    Preference order: "tag#id" when the element has an id, "tag.cls1.cls2"
    when it has at least one class name, otherwise the bare tag name.

    Args:
        candidate: Mapping with an optional "tag" string and an optional
            "attributes" entry holding either a JSON-encoded object or an
            already-decoded dict.

    Returns:
        str: The selector string.
    """
    tag = candidate.get("tag", "")

    # Accept a missing/None entry and an already-decoded dict as well as JSON text
    raw_attributes = candidate.get("attributes") or "{}"
    if isinstance(raw_attributes, str):
        attributes = json.loads(raw_attributes) or {}
    else:
        attributes = raw_attributes

    element_id = attributes.get("id")
    if element_id:
        return f"{tag}#{element_id}"

    class_value = attributes.get("class") or ""
    if isinstance(class_value, str):
        class_names = class_value.split()
    else:
        class_names = list(class_value)
    # A whitespace-only class attribute yields no names; fall through to the
    # bare tag instead of returning a dangling "tag."
    if class_names:
        return f"{tag}." + ".".join(class_names)

    return tag

In [None]:
def process_example(example):
    """
    Turn one dataset row into an (image, prompt, label) training sample.

    The page HTML is distilled into an element list, the ground-truth element
    is looked up in that list, and a JSON label is built for the action.

    Target resolution:
    - If the first positive candidate's backend_node_id appears as a line in
      the distilled list, that id is the label's element_id.
    - Otherwise (no positive candidate, or the element did not survive
      distillation/truncation) the label uses the sentinel id "0", matching
      the prompt's "select ID 0" instruction. These negatives are
      downsampled: only rows whose goal falls into 3 of 10 stable hash
      buckets (~30%) are kept.

    Args:
        example: Mapping with the keys "screenshot", "confirmed_task",
            "target_action_index", "action_reprs", "cleaned_html",
            "operation" (JSON string) and optionally "pos_candidates"
            (list of JSON strings).

    Returns:
        dict | None: {"image", "prompt", "label"} where label is a JSON
        string, or None when a negative sample is dropped by downsampling.
    """
    image = example["screenshot"]
    goal = example["confirmed_task"]
    target_index = int(example["target_action_index"])
    total_actions = len(example["action_reprs"])
    # The row is the task's final step when it is the last listed action
    is_finished = (target_index == total_actions - 1)

    # 1. Generate the Compact DOM
    distilled_dom = distill_dom(example["cleaned_html"])

    # 2. Determine the Target UID from the first positive candidate, if any.
    #    Candidates are stored as a list of serialized JSON strings.
    target_uid = None
    cand_list = example.get("pos_candidates")
    if cand_list:
        candidate = json.loads(cand_list[0])
        target_uid = candidate.get("backend_node_id")

    # 3. Handle Visibility (The Sentinel Logic)
    #    Match on the line prefix "[uid] " rather than a raw substring so that
    #    bracketed numbers inside element text cannot produce a false hit.
    target_visible = False
    if target_uid:
        line_prefix = f"[{target_uid}] "
        target_visible = any(
            line.startswith(line_prefix) for line in distilled_dom.split("\n")
        )

    if target_visible:
        # Case A: Target is present in the list
        final_target_id = target_uid
    else:
        # Case B: Target is absent from the list -> Sentinel ID "0"
        final_target_id = "0"

        # Downsample negatives to limit class imbalance. Python's built-in
        # hash() is salted per process for str, so a SHA-256 digest is used to
        # make the selection reproducible across runs.
        # Buckets 0-2 are kept (~30%); lower the threshold to keep fewer.
        digest = hashlib.sha256(goal.encode("utf-8")).digest()
        bucket = int.from_bytes(digest[:8], "big") % 10
        if bucket > 2:
            return None

    # 4. Get Operation details
    op = json.loads(example["operation"])
    action = op["op"].lower() # click, type, select
    text_value = op.get("value", "")

    # 5. prompt
    prompt = (
        f"You are a web agent. Analyze the screenshot and the list of elements.\n"
        f"The element list is formatted as: [ID] <Tag> Text (Attributes).\n"
        f"If the target element is not in the list, select ID 0.\n"
        f"Your task is to select the correct Element ID to perform the action on.\n\n"
        f"TASK: {goal}\n\n"
        f"ELEMENTS:\n{distilled_dom}\n\n"
        f"Generate a JSON with keys: action, element_id, value, is_finished."
    )

    # 6. Construct the Label (keyed by 'element_id', not a CSS selector)
    label_dict = {
        "action": action,
        "element_id": final_target_id,
        "value": text_value,
        "is_finished": is_finished
    }

    return {
        "image": image,
        "prompt": prompt,
        "label": json.dumps(label_dict)
    }

In [None]:
# Run the full pipeline on the inspected row to sanity-check the prompt/label format
processed_example = process_example(first_example)

if processed_example is None:
    # process_example returns None for samples removed by its downsampling step,
    # so there is nothing to index into in that case
    print("First example was dropped by process_example (returned None); nothing to display.")
else:
    print("--- FULLY PROCESSED EXAMPLE ---")
    print("\n--- PROMPT (Input for VLM) ---")
    print(processed_example['prompt'])

    print("\n--- LABEL (Output for VLM) ---")
    print(processed_example['label'])

    print("\n--- IMAGE (Input for VLM) ---")
    print(processed_example['image'])

--- FULLY PROCESSED EXAMPLE ---

--- PROMPT (Input for VLM) ---
You are a web agent. Analyze the screenshot and the list of elements.
The element list is formatted as: [ID] <Tag> Text (Attributes).
If the target element is not in the list, select ID 0.
Your task is to select the correct Element ID to perform the action on.

TASK: rent a car in Brooklyn - Central, NY on from April 9 to April 15.

ELEMENTS:
[0] <option> Target element is not in this list
[-] <h1> Welcome to United.com
[1067] <a> Skip to book
[1085] <li> English - United States$
[1088] <button> English - United States$
[1096] <button>
[1100] <li> Search
[1101] <a> Search
[1112] <button> Hi, James 0 miles
[1129] <button>
[1133] <li> Menu
[1134] <button> Menu
[1145] <a> BOOK (role='tab')
[1148] <a> MY TRIPS (role='tab')
[1151] <a> TRAVEL INFO (role='tab')
[1154] <a> MILEAGEPLUS PROGRAM (role='tab')
[1157] <a> DEALS (role='tab')
[1163] <button>
[1168] <button>
[1173] <button>
[1178] <button>
[1183] <button>
[1186] <a> Help
[

## Process ALL Data

In [None]:
# Colab-only: mount Google Drive at /content/drive. Later cells read and write
# their JSONL files under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def to_py(x):
    """
    Best-effort conversion of a tensor-like value to a plain Python value.

    Values without a `.numpy()` method are returned unchanged. Otherwise the
    result of `.numpy()` is converted: bytes are decoded as UTF-8, objects
    exposing `.tolist()` are turned into Python lists/scalars (so they can be
    JSON-serialized), and anything else is returned as-is.

    If the conversion raises an ordinary exception, the original value is
    returned. KeyboardInterrupt/SystemExit are deliberately not swallowed.
    """
    try:
        if not hasattr(x, "numpy"):
            return x
        val = x.numpy()
        if isinstance(val, bytes):
            return val.decode("utf-8")
        if hasattr(val, "tolist"):
            return val.tolist()
        return val
    except Exception:
        # Conversion is best-effort: fall back to the raw value
        return x

In [None]:
import json
from tqdm.auto import tqdm

# Number of rows to pull from the stream.
# NOTE(review): presumably the size of the train split — TODO confirm.
NUM_EXAMPLES_TO_PROCESS = 7775
OUTPUT_FILENAME = "/content/drive/MyDrive/mind2web_processed_train.jsonl"

skip_ct = 0      # rows process_example chose to drop (returned None)
error_ct = 0     # rows that raised while being processed or written
written_ct = 0   # rows actually written to the output file

# Only text fields are exported; the screenshot returned by process_example
# is not written to the JSONL file.
with open(OUTPUT_FILENAME, "w", encoding="utf-8") as f:
    for example in tqdm(multimodal_dataset.take(NUM_EXAMPLES_TO_PROCESS),
                        total=NUM_EXAMPLES_TO_PROCESS):

        try:
            processed = process_example(example)

            if processed is None:
                skip_ct += 1
                continue

            annotation_id = to_py(example["annotation_id"])
            action_uid = to_py(example["action_uid"])

            data_to_save = {
                "annotation_id": annotation_id,
                "action_uid": action_uid,
                "prompt": processed["prompt"],
                "label": processed["label"]
            }

            f.write(json.dumps(data_to_save) + "\n")
            written_ct += 1

        except Exception as e:
            # Keep going on a bad row, but count it so failures are not silent
            error_ct += 1
            print(f"Skipping example due to error: {e}")

print(f"total skipped examples: {skip_ct}\n")
print(f"total errored examples: {error_ct}")
print(f"total written examples: {written_ct}")
print("Done!")

  0%|          | 0/7775 [00:00<?, ?it/s]

'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: 60189a81-2fb5-42af-b2a9-eb86b8601517)')' thrown while requesting GET https://huggingface.co/datasets/osunlp/Multimodal-Mind2Web/resolve/1b4c6a8cf9f77b7a5e0d641959935c80c4a05889/data/train-00020-of-00027-a3f17abfa6315328.parquet
Retrying in 1s [Retry 1/5].


total skipped examples: 2032

Done!


In [None]:
import json
import os
from transformers import AutoProcessor
from tqdm.auto import tqdm

# --- CONFIGURATION ---
INPUT_FILE = "/content/drive/MyDrive/mind2web_processed_train.jsonl"
OUTPUT_FILE = "/content/drive/MyDrive/mind2web_train_filtered_8k.jsonl"

# Upper bound on the estimated sequence length of a kept sample.
# Author's guidance: 8192 is a safe standard limit for A100 LoRA training;
# with an A100 80GB this could be pushed to 12k or 14k.
MAX_SEQ_LENGTH = 8192

# Flat per-sample allowance for image tokens. The author's estimate is that a
# 1280x896 image takes ~1300-1500 tokens, plus a safety buffer.
# NOTE(review): the screenshot inspected earlier in this notebook was
# 1280x5429, so this estimate only holds if images are resized/cropped before
# training — TODO confirm.
IMAGE_TOKEN_ESTIMATE = 1600

# Allowance for chat-template/system tokens that the raw text does not include
CHAT_TEMPLATE_OVERHEAD = 50

print(f"Filtering dataset to max {MAX_SEQ_LENGTH} tokens...")

# Only the processor's tokenizer is needed for counting; no model is loaded
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct", trust_remote_code=True)
tokenizer = processor.tokenizer

kept = 0
dropped = 0
dropped_samples = []

with open(INPUT_FILE, "r", encoding="utf-8") as fin, open(OUTPUT_FILE, "w", encoding="utf-8") as fout:
    for line in tqdm(fin):
        # Tolerate blank lines (e.g. a trailing newline) instead of crashing in json.loads
        if not line.strip():
            continue

        data = json.loads(line)
        prompt = data["prompt"]

        # 1. Text tokens: the prompt plus the label the model must generate
        text_tokens = len(tokenizer.encode(prompt))
        label_tokens = len(tokenizer.encode(data["label"]))

        # 2. Total Estimated Length
        total_tokens = text_tokens + label_tokens + IMAGE_TOKEN_ESTIMATE + CHAT_TEMPLATE_OVERHEAD

        # 3. Decision
        if total_tokens <= MAX_SEQ_LENGTH:
            fout.write(line)
            kept += 1
        else:
            dropped += 1
            dropped_samples.append((data["annotation_id"], total_tokens))

total_count = kept + dropped
# Guard against an empty input file, which would otherwise divide by zero
dropped_pct = (dropped / total_count * 100) if total_count else 0.0

print("\n--- FILTERING COMPLETE ---")
print(f"Original Count: {total_count}")
print(f"Kept:           {kept}")
print(f"Dropped:        {dropped} ({dropped_pct:.2f}%)")
print(f"Output saved to: {OUTPUT_FILE}")

if dropped > 0:
    print("\nExample Dropped IDs (Top 5):")
    # Show the five longest dropped samples so "Top 5" means what it says
    longest_dropped = sorted(dropped_samples, key=lambda sample: sample[1], reverse=True)[:5]
    for clean_id, length in longest_dropped:
        print(f"- ID: {clean_id} | Length: ~{length} tokens")

Filtering dataset to max 8192 tokens...


0it [00:00, ?it/s]


--- FILTERING COMPLETE ---
Original Count: 5743
Kept:           5743
Dropped:        0 (0.00%)
Output saved to: /content/drive/MyDrive/mind2web_train_filtered_8k.jsonl
