In [1]:
# ===== Basic =====
import os
import sys
import json
import torch
import torch.nn.functional as F
import numpy as np
from PIL import Image, ImageDraw
from tqdm import tqdm
from glob import glob
import cv2

# ===== CLIP (Spotlight) =====
from transformers import CLIPProcessor, CLIPModel
from sklearn.metrics.pairwise import cosine_similarity

# ===== V2P =====
sys.path.append('/home/phamlong/Downloads/v2p/AWorld-RL/V2P/src')
from transformers import AutoProcessor
from V2P.constants import DEFAULT_POINTER_PAD_TOKEN, DEFAULT_POINTER_END_TOKEN
from V2P.modeling import Qwen2VLForConditionalGenerationWithPointer
from V2P.modeling_qwen25vl import Qwen2_5_VLForConditionalGenerationWithPointer
from V2P.inference import inference, ForceFollowTokensLogitsProcessor


In [None]:
# Spotlight ROI tuning constants read by spotlight_bbox().
TOP_K_REGIONS = 2  # number of highest-scoring connected regions kept
REGION_THRESH = 0.28  # grid cells with a normalized score >= this value are kept

FEATHER_SIGMA = 16        # Gaussian-blur sigma for feathering the mask (same as feather_sigma in ver5)
SOFT_MASK_THRESH = 0.25  # threshold for extracting the bbox (spotlight_bbox thresholds the blurred mask at this same value)
CROP_PAD = 8              # padding in pixels added around the bbox; adjust as you like


In [None]:
# ================= 2️⃣ Parameter configuration =================
MODEL_TYPE = "qwen25vl"
MODEL_NAME_OR_PATH = "inclusionAI/V2P-7B"
# MODEL_NAME_OR_PATH = "microsoft/GUI-Actor-7B-Qwen2.5-VL"

SAVE_PATH = "results/"  # equivalent to --save_path
DATA_PATH = "/home/phamlong/Downloads/v2p/ScreenSpot-Pro"
RESIZE_TO_PIXELS = 3200*1800
USE_PLACEHOLDER = True
TOPK = 3
IMAGE_PATCH_SIZE = 14
# ===== Spotlight Config =====
BASE_CLIP = "/home/phamlong/Downloads/clip finetune/clip-vit-base-patch32"
CKPT_PATH = "/home/phamlong/Downloads/clip finetune/clip_finetuned/clip_finetuned_epoch5.pth"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

NUM_GRID = 36
# NOTE: these three values are also assigned in the Spotlight constants cell; keep both in sync.
TOP_K_REGIONS = 2
REGION_THRESH = 0.28
CROP_PAD = 8

print("Loading CLIP...")
clip_model = CLIPModel.from_pretrained(BASE_CLIP)
clip_processor = CLIPProcessor.from_pretrained(BASE_CLIP)

if os.path.exists(CKPT_PATH):
    # SECURITY NOTE: torch.load unpickles the file; only load checkpoints you trust.
    state = torch.load(CKPT_PATH, map_location="cpu")
    if "model" in state:
        state = state["model"]
    # strict=False tolerates key mismatches, so report them instead of hiding them:
    # a checkpoint whose keys do not line up would otherwise load nothing silently.
    load_report = clip_model.load_state_dict(state, strict=False)
    print(
        f"Loaded fine-tuned CLIP weights from {CKPT_PATH} "
        f"(missing keys: {len(load_report.missing_keys)}, "
        f"unexpected keys: {len(load_report.unexpected_keys)})"
    )
else:
    # Make the fallback visible: results then come from the base CLIP weights.
    print(f"WARNING: checkpoint not found at {CKPT_PATH}; using base CLIP weights")

clip_model = clip_model.to(DEVICE).eval()
# Disables autograd globally for the rest of the session (inference only).
torch.set_grad_enabled(False)


IMAGE_DIR = os.path.join(DATA_PATH, "images")
# The original joined DATA_PATH with an absolute path, which discards DATA_PATH;
# the relative file name resolves to the same location and follows DATA_PATH.
DATA_FN = os.path.join(DATA_PATH, "merged_annotations.json")
PRED_PATH = os.path.join(SAVE_PATH, "screenspot-Pro_all_preds_StandardResize.json")
METRIC_PATH = os.path.join(SAVE_PATH, "screenspot-Pro_all_preds_StandardResize.txt")
# NOTE(review): DATA_FN is a file, so this pattern matches nothing and the list is
# always empty. Kept unchanged in case another cell references the name.
annotation_files = sorted(glob(os.path.join(DATA_FN, "*.json")))
os.makedirs(SAVE_PATH, exist_ok=True)

# ================= 3️⃣ Helper functions =================
def normalize_bbox(bbox_x1y1x2y2, img_width, img_height):
    """Express an (x1, y1, x2, y2) box in normalized image coordinates.

    If all four coordinates already lie in [0, 1], the input object is
    returned unchanged. Otherwise the x values are divided by ``img_width``
    and the y values by ``img_height``, and a new 4-tuple is returned.
    """
    left, top, right, bottom = bbox_x1y1x2y2

    already_normalized = all(0 <= coord <= 1 for coord in (left, top, right, bottom))
    if already_normalized:
        return bbox_x1y1x2y2

    return (
        left / img_width,
        top / img_height,
        right / img_width,
        bottom / img_height,
    )

Loading CLIP...


In [4]:
def hybrid_vector_field_grid(image, instruction, num_grid=36, sigma=0.4, batch_size=8):
    """Score a ``num_grid`` x ``num_grid`` tiling of ``image`` against ``instruction``.

    Uses the module-level ``clip_model`` / ``clip_processor`` on ``DEVICE``.

    Args:
        image: PIL image (converted to RGB internally).
        instruction: text passed to the CLIP text encoder.
        num_grid: number of tiles per axis.
        sigma: bandwidth of the Gaussian weight used when building the field.
        batch_size: number of tiles encoded per CLIP forward pass.

    Returns:
        ``(scores, (H, W))`` where ``scores`` is a ``(num_grid, num_grid)``
        array min-max normalized to [0, 1] (row index = tile row) and
        ``(H, W)`` is the image height and width in pixels.

    Note:
        Tile size is ``W // num_grid`` by ``H // num_grid``, so remainder
        pixels on the right/bottom edge are not covered by any tile.
        Presumably the image is at least ``num_grid`` pixels per side;
        smaller images yield zero-sized tiles (not handled here).
    """
    img = image.convert("RGB")
    W, H = img.size
    pw, ph = W // num_grid, H // num_grid

    # Row-major tiling: index = gy * num_grid + gx.
    patches = []
    for gy in range(num_grid):
        for gx in range(num_grid):
            x1, y1 = gx * pw, gy * ph
            x2, y2 = x1 + pw, y1 + ph
            patches.append(img.crop((x1, y1, x2, y2)))

    # Run CLIP without autograd so `.numpy()` works even when the global
    # grad switch has not been turned off by another cell.
    with torch.no_grad():
        # text embedding (unit length)
        text_inputs = clip_processor(text=instruction, return_tensors="pt").to(DEVICE)
        text_emb = clip_model.get_text_features(**text_inputs)
        text_emb = text_emb / text_emb.norm(dim=-1, keepdim=True)
        text_emb = text_emb.cpu().numpy()

        # patch embeddings (unit length), encoded in batches
        feats = []
        for i in range(0, len(patches), batch_size):
            inputs = clip_processor(images=patches[i:i+batch_size], return_tensors="pt").to(DEVICE)
            emb = clip_model.get_image_features(**inputs)
            emb = emb / emb.norm(dim=-1, keepdim=True)
            feats.append(emb.cpu().numpy())
    patch_embs = np.concatenate(feats, axis=0)

    # Text relevance per tile, squashed by a sigmoid (slope 3, centered at 0.5).
    cos_sim = cosine_similarity(patch_embs, text_emb).flatten()
    gates = 1 / (1 + np.exp(-3.0 * (cos_sim - 0.5)))
    gated = patch_embs * gates[:, None]

    # Field vector of tile i: gate- and Gaussian-weighted sum of the
    # differences from tile i to every tile in gated-embedding space.
    N = gated.shape[0]
    F_field = np.zeros_like(gated)
    for i in range(N):
        diff = gated - gated[i]
        dist = np.linalg.norm(diff, axis=1)
        w = np.exp(-dist**2 / sigma**2) * gates
        F_field[i] = np.sum(w[:, None] * diff, axis=0)

    F_field /= (np.linalg.norm(F_field, axis=1, keepdims=True) + 1e-8)

    # Divergence-like value: mean difference to the 8 closest field vectors
    # (the closest is typically the tile itself), projected on the tile's own vector.
    div = np.zeros(N)
    for i in range(N):
        idx = np.argsort(np.linalg.norm(F_field - F_field[i], axis=1))[:8]
        grad = (F_field[idx] - F_field[i]).mean(axis=0)
        div[i] = np.dot(grad, F_field[i])

    # Only negative divergence contributes; weight by text relevance, then min-max normalize.
    score = np.log1p(np.maximum(0, -div)) * cos_sim * gates
    score = (score - score.min()) / (score.max() - score.min() + 1e-8)

    return score.reshape(num_grid, num_grid), (H, W)


In [None]:

def spotlight_bbox(image, instruction):
    """Derive a padded region-of-interest box and a soft mask for ``instruction``.

    Steps: score the tile grid with ``hybrid_vector_field_grid``, keep tiles
    scoring at least ``REGION_THRESH``, pick the ``TOP_K_REGIONS`` connected
    regions ranked by ``mean score * area`` (regions under 3 tiles are
    dropped), upsample their union to image resolution, feather it with a
    Gaussian blur and box the pixels above ``SOFT_MASK_THRESH``.

    Returns:
        ``(bbox, mask_blur)`` where ``bbox`` is ``(x1, y1, x2, y2)`` in pixel
        coordinates as plain Python ints, padded by ``CROP_PAD`` and clamped
        to the image, and ``mask_blur`` is the ``(H, W)`` float mask.
        ``(None, None)`` when no region survives.
    """
    grid_scores, (H, W) = hybrid_vector_field_grid(image, instruction)

    binary = (grid_scores >= REGION_THRESH).astype(np.uint8)
    num_labels, labels = cv2.connectedComponents(binary)

    # Label 0 is the background; rank the remaining components.
    regions = []
    for l in range(1, num_labels):
        mask = (labels == l)
        area = mask.sum()
        if area < 3:
            continue
        score = grid_scores[mask].mean() * area
        regions.append((score, l))

    regions = sorted(regions, reverse=True)[:TOP_K_REGIONS]

    if len(regions) == 0:
        return None, None

    # ---- patch mask: union of the selected regions on the tile grid ----
    patch_mask = np.zeros_like(grid_scores, dtype=np.float32)
    for _, l in regions:
        patch_mask += (labels == l).astype(np.float32)
    patch_mask = np.clip(patch_mask, 0, 1)

    # ---- upsample to image resolution (same as ver5) ----
    t = torch.tensor(patch_mask)[None, None]
    mask_full = F.interpolate(
        t, size=(H, W), mode="bicubic", align_corners=False
    ).squeeze().numpy()

    # Rescale to [0, 1] after interpolation.
    mask_full = (mask_full - mask_full.min()) / (mask_full.max() - mask_full.min() + 1e-8)

    # ---- feather ----
    mask_blur = cv2.GaussianBlur(mask_full, (0, 0), FEATHER_SIGMA)

    # ---- bbox from mask (uses the declared threshold instead of a repeated literal) ----
    bin_mask = mask_blur > SOFT_MASK_THRESH
    ys, xs = np.where(bin_mask)
    if len(xs) == 0:
        return None, None

    x1, x2 = xs.min(), xs.max()
    y1, y2 = ys.min(), ys.max()

    # Cast to built-in ints so callers receive ordinary numbers, not numpy scalars.
    bbox = (
        int(max(0, x1 - CROP_PAD)),
        int(max(0, y1 - CROP_PAD)),
        int(min(W - 1, x2 + CROP_PAD)),
        int(min(H - 1, y2 + CROP_PAD)),
    )

    return bbox, mask_blur



In [6]:
# def evaluate(
#     model_name_or_path,
#     model_type,
#     data_fn,
#     image_dir,
#     use_placeholder,
#     topk,
#     resize_to_pixels=None
# ):
#     import os, json, torch
#     from PIL import Image, ImageDraw
#     from tqdm import tqdm
#     from transformers import AutoProcessor

#     # ================= LOAD PROCESSOR & TOKENIZER =================
#     data_processor = AutoProcessor.from_pretrained(model_name_or_path)
#     tokenizer = data_processor.tokenizer

#     if model_type == "qwen2vl":
#         model = Qwen2VLForConditionalGenerationWithPointer.from_pretrained(
#             model_name_or_path,
#             torch_dtype=torch.bfloat16,
#             device_map="cuda:0",
#             attn_implementation="flash_attention_2"
#         ).eval()

#         grounding_system_message = (
#             "You are a GUI agent. You are given a task and a screenshot of the screen. "
#             "You need to locate the UI element corresponding to the instruction."
#         )

#     elif model_type == "qwen25vl":
#         model = Qwen2_5_VLForConditionalGenerationWithPointer.from_pretrained(
#             model_name_or_path,
#             torch_dtype=torch.bfloat16,
#             device_map="cuda:0",
#             attn_implementation="flash_attention_2"
#         ).eval()

#         grounding_system_message = (
#             "You are a GUI agent. Given a screenshot of the current GUI and a human instruction, "
#             "your task is to locate the screen element that corresponds to the instruction."
#         )
#     else:
#         raise ValueError("Invalid model type")

#     logits_processor_pointer = ForceFollowTokensLogitsProcessor(
#         token_a_id=tokenizer.encode(DEFAULT_POINTER_PAD_TOKEN)[0],
#         forced_sequence=[tokenizer.encode(DEFAULT_POINTER_END_TOKEN)[0]]
#     )

#     # ================= LOAD DATA =================
#     with open(data_fn, "r") as f:
#         data = json.load(f)

#     results = []
#     os.makedirs("vis_debug", exist_ok=True)

#     # ================= MAIN LOOP =================
#     for example in tqdm(data):

#         # ---------- metadata ----------
#         ele = {
#             "file_name": example["img_filename"],
#             "ui_type": example["ui_type"],
#             "group": example["group"],
#             "platform": example["platform"],
#             "application": example["application"],
#             "id": example["id"],
#             "instruction": example["instruction"],
#             "img_size": example["img_size"],
#             "bbox_x1y1x2y2": normalize_bbox(
#                 example["bbox"],
#                 example["img_size"][0],
#                 example["img_size"][1]
#             ),
#             "hit_top1": 0,
#         }

#         # ---------- load image ----------
#         image = Image.open(
#             os.path.join(image_dir, example["img_filename"])
#         ).convert("RGB")
#         W, H = image.size

#         # ================= STAGE 0: SPOTLIGHT ROI =================
#         # roi_bbox = spotlight_bbox(image, example["instruction"])
#         roi_bbox, mask_blur = spotlight_bbox(image, example["instruction"])

#         if roi_bbox is None:
#             rx1, ry1, rx2, ry2 = 0, 0, W, H
#         else:
#             rx1, ry1, rx2, ry2 = roi_bbox

#         roi = image.crop((rx1, ry1, rx2, ry2))
#         roi_w, roi_h = roi.size

#         # ================= STAGE 1: TOP-3 COARSE GUESS =================
#         conv1 = [
#             {"role": "system", "content": [{"type": "text", "text": grounding_system_message}]},
#             {"role": "user", "content": [
#                 {"type": "image", "image": roi},
#                 {"type": "text", "text": example["instruction"]}
#             ]}
#         ]

#         pred1 = inference(
#             conv1,
#             model,
#             tokenizer,
#             data_processor,
#             logits_processor=logits_processor_pointer,
#             use_placeholder=use_placeholder,
#             topk=3
#         )

#         top3_roi = pred1["topk_points"][:3]

#         # remap to global px
#         pts = []
#         for px, py in top3_roi:
#             gx = px * roi_w + rx1
#             gy = py * roi_h + ry1
#             pts.append((gx, gy))

#         # ================= STAGE 2: TIGHT CROP + ZOOM =================
#         xs, ys = zip(*pts)
#         xmin, xmax = min(xs), max(xs)
#         ymin, ymax = min(ys), max(ys)

#         margin = 0.25
#         bw, bh = xmax - xmin, ymax - ymin
#         xmin = max(0, int(xmin - bw * margin))
#         ymin = max(0, int(ymin - bh * margin))
#         xmax = min(W, int(xmax + bw * margin))
#         ymax = min(H, int(ymax + bh * margin))

#         crop1 = image.crop((xmin, ymin, xmax, ymax))
#         zoom1 = crop1.resize((W, H), Image.BILINEAR)

#         conv2 = [
#             {"role": "system", "content": [{"type": "text", "text": grounding_system_message}]},
#             {"role": "user", "content": [
#                 {"type": "image", "image": zoom1},
#                 {"type": "text", "text": example["instruction"]}
#             ]}
#         ]

#         pred2 = inference(
#             conv2,
#             model,
#             tokenizer,
#             data_processor,
#             logits_processor=logits_processor_pointer,
#             use_placeholder=use_placeholder,
#             topk=1
#         )

#         zx1, zy1 = pred2["topk_points"][0]
#         px1 = (xmin + zx1 * (xmax - xmin)) / W
#         py1 = (ymin + zy1 * (ymax - ymin)) / H

#         # ================= STAGE 3: FINAL CONFIRM ZOOM =================
#         cx, cy = int(px1 * W), int(py1 * H)
#         box = int(0.25 * min(W, H))

#         x1 = max(0, cx - box)
#         y1 = max(0, cy - box)
#         x2 = min(W, cx + box)
#         y2 = min(H, cy + box)

#         crop2 = image.crop((x1, y1, x2, y2))
#         zoom2 = crop2.resize((W, H), Image.BILINEAR)

#         conv3 = [
#             {"role": "system", "content": [{"type": "text", "text": grounding_system_message}]},
#             {"role": "user", "content": [
#                 {"type": "image", "image": zoom2},
#                 {"type": "text", "text": example["instruction"]}
#             ]}
#         ]

#         pred3 = inference(
#             conv3,
#             model,
#             tokenizer,
#             data_processor,
#             logits_processor=logits_processor_pointer,
#             use_placeholder=use_placeholder,
#             topk=1
#         )

#         zx2, zy2 = pred3["topk_points"][0]
#         final_px = (x1 + zx2 * (x2 - x1)) / W
#         final_py = (y1 + zy2 * (y2 - y1)) / H

#         # ================= EVALUATION =================
#         gx1, gy1, gx2, gy2 = ele["bbox_x1y1x2y2"]
#         if gx1 <= final_px <= gx2 and gy1 <= final_py <= gy2:
#             ele["hit_top1"] = 1

#         #================= VIS DEBUG =================
#         # vis = image.copy()
#         # draw = ImageDraw.Draw(vis)

#         # # 1️⃣ GT bbox (green)
#         # draw.rectangle(
#         #     [
#         #         int(gx1 * W),
#         #         int(gy1 * H),
#         #         int(gx2 * W),
#         #         int(gy2 * H)
#         #     ],
#         #     outline="lime",
#         #     width=3
#         # )

#         # # 2️⃣ Spotlight crop bbox (yellow)
#         # draw.rectangle(
#         #     [rx1, ry1, rx2, ry2],
#         #     outline="yellow",
#         #     width=2
#         # )

#         # # 3️⃣ Final confirmed Top-1 point (red)
#         # fx, fy = int(final_px * W), int(final_py * H)
#         # r = 6
#         # draw.ellipse(
#         #     [fx - r, fy - r, fx + r, fy + r],
#         #     fill="red"
#         # )

#         # vis.save(f"vis_debug/{example['id']}_final.png")
#         # ================= VIS WITH SOFT SPOTLIGHT MASK =================
#         img_np = np.array(image).astype(np.float32) / 255.0

#         # ===============================
#         # Apply soft spotlight (VIS ONLY)
#         # ===============================
#         if mask_blur is not None:
#             darken_factor = 0.07

#             # ensure mask shape [H, W]
#             if mask_blur.ndim == 3:
#                 mask_blur = mask_blur.squeeze(-1)

#             mask3 = np.stack([mask_blur] * 3, axis=-1)

#             dark_np = img_np * darken_factor
#             vis_np = dark_np * (1 - mask3) + img_np * mask3
#             vis_np = np.clip(vis_np, 0, 1)
#         else:
#             # fallback: no spotlight, use original image
#             vis_np = img_np

#         vis = Image.fromarray((vis_np * 255).astype(np.uint8))
#         draw = ImageDraw.Draw(vis)


#         # 1️⃣ GT bbox (green)
#         draw.rectangle(
#             [
#                 int(gx1 * W),
#                 int(gy1 * H),
#                 int(gx2 * W),
#                 int(gy2 * H)
#             ],
#             outline="lime",
#             width=3
#         )

#         # 2️⃣ Spotlight crop bbox (yellow)
#         draw.rectangle(
#             [rx1, ry1, rx2, ry2],
#             outline="yellow",
#             width=2
#         )

#         # 3️⃣ Final confirmed Top-1 point (red)
#         fx, fy = int(final_px * W), int(final_py * H)
#         r = 6
#         draw.ellipse(
#             [fx - r, fy - r, fx + r, fy + r],
#             fill="red"
#         )

#         os.makedirs("vis_debug", exist_ok=True)
#         vis.save(f"vis_debug/{example['id']}_final.png")



#         results.append(ele)

#     return results


In [None]:
# evaluate(): variant WITHOUT the additional zoom-refine stage
def evaluate(
    model_name_or_path,
    model_type,
    data_fn,
    image_dir,
    use_placeholder,
    topk,
    resize_to_pixels=None
):
    """Run the Spotlight + zoom-in grounding pipeline over an annotation file.

    For every example: crop the Spotlight ROI, ask the pointer model for
    3 candidate points, zoom into the box spanned by those points, take the
    best point there, zoom into a fixed-size box around it and take the final
    point. The final point is compared with the ground-truth box and a debug
    image is written to ``vis_debug/``.

    Args:
        model_name_or_path: checkpoint passed to ``from_pretrained``.
        model_type: ``"qwen2vl"`` or ``"qwen25vl"``.
        data_fn: path of a JSON file containing a list of examples.
        image_dir: directory that contains the ``img_filename`` images.
        use_placeholder: forwarded to ``inference``.
        topk: accepted for interface compatibility; not used by this
            implementation (the per-stage top-k values are fixed).
        resize_to_pixels: accepted for interface compatibility; not used.

    Returns:
        A list with one dict per example (metadata, normalized ground-truth
        box and ``hit_top1`` set to 1 when the final point lies inside the
        box on BOTH axes, else 0).

    Raises:
        ValueError: if ``model_type`` is not one of the supported values.
    """
    import os, json, torch
    import numpy as np
    from PIL import Image, ImageDraw
    from tqdm import tqdm
    from transformers import AutoProcessor

    # ================= LOAD PROCESSOR & TOKENIZER =================
    data_processor = AutoProcessor.from_pretrained(model_name_or_path)
    tokenizer = data_processor.tokenizer

    if model_type == "qwen2vl":
        model_cls = Qwen2VLForConditionalGenerationWithPointer
        grounding_system_message = (
            "You are a GUI agent. You are given a task and a screenshot of the screen. "
            "You need to locate the UI element corresponding to the instruction."
        )
    elif model_type == "qwen25vl":
        model_cls = Qwen2_5_VLForConditionalGenerationWithPointer
        grounding_system_message = (
            "You are a GUI agent. Given a screenshot of the current GUI and a human instruction, "
            "your task is to locate the screen element that corresponds to the instruction."
        )
    else:
        raise ValueError("Invalid model type")

    model = model_cls.from_pretrained(
        model_name_or_path,
        torch_dtype=torch.bfloat16,
        device_map="cuda:0",
        attn_implementation="flash_attention_2"
    ).eval()

    logits_processor_pointer = ForceFollowTokensLogitsProcessor(
        token_a_id=tokenizer.encode(DEFAULT_POINTER_PAD_TOKEN)[0],
        forced_sequence=[tokenizer.encode(DEFAULT_POINTER_END_TOKEN)[0]]
    )

    def locate(view, instruction, k):
        """Run one pointer-inference pass on ``view`` and return its ``topk_points``.

        The points are treated by the caller as (x, y) fractions of ``view``.
        """
        conversation = [
            {"role": "system", "content": [{"type": "text", "text": grounding_system_message}]},
            {"role": "user", "content": [
                {"type": "image", "image": view},
                {"type": "text", "text": instruction}
            ]}
        ]
        pred = inference(
            conversation,
            model,
            tokenizer,
            data_processor,
            logits_processor=logits_processor_pointer,
            use_placeholder=use_placeholder,
            topk=k
        )
        return pred["topk_points"]

    # ================= LOAD DATA =================
    with open(data_fn, "r") as f:
        data = json.load(f)

    results = []
    os.makedirs("vis_debug", exist_ok=True)

    for example in tqdm(data):

        # ---------- instruction selection ----------
        # AutoCAD / SolidWorks Windows screenshots use the Chinese instruction when present.
        instruction_text = example["instruction"]
        fname = example.get("img_filename", "").lower()

        if fname.startswith("autocad_windows") or fname.startswith("solidworks_windows"):
            if "instruction_cn" in example and example["instruction_cn"]:
                instruction_text = example["instruction_cn"]

        # ---------- metadata ----------
        ele = {
            "file_name": example["img_filename"],
            "ui_type": example["ui_type"],
            "group": example["group"],
            "platform": example["platform"],
            "application": example["application"],
            "id": example["id"],
            "instruction": instruction_text,
            "img_size": example["img_size"],
            "bbox_x1y1x2y2": normalize_bbox(
                example["bbox"],
                example["img_size"][0],
                example["img_size"][1]
            ),
            "hit_top1": 0,
        }

        # ---------- load image ----------
        # Context manager releases the file handle; convert() returns an in-memory copy.
        with Image.open(os.path.join(image_dir, example["img_filename"])) as raw_image:
            image = raw_image.convert("RGB")
        W, H = image.size

        # ---------- stage 0: Spotlight ROI (whole image when nothing is found) ----------
        roi_bbox, mask_blur = spotlight_bbox(image, instruction_text)

        if roi_bbox is None:
            rx1, ry1, rx2, ry2 = 0, 0, W, H
        else:
            rx1, ry1, rx2, ry2 = roi_bbox

        roi = image.crop((rx1, ry1, rx2, ry2))
        roi_w, roi_h = roi.size

        # ---------- stage 1: 3 coarse candidates inside the ROI, mapped to image pixels ----------
        pts = [
            (px * roi_w + rx1, py * roi_h + ry1)
            for px, py in locate(roi, instruction_text, 3)[:3]
        ]

        # ---------- stage 2: zoom into the box spanned by the candidates ----------
        xs, ys = zip(*pts)
        xmin, xmax = min(xs), max(xs)
        ymin, ymax = min(ys), max(ys)

        margin = 0.25
        bw, bh = xmax - xmin, ymax - ymin
        xmin = max(0, int(xmin - bw * margin))
        ymin = max(0, int(ymin - bh * margin))
        xmax = min(W, int(xmax + bw * margin))
        ymax = min(H, int(ymax + bh * margin))

        # Candidates sharing an x or y coordinate give a zero-area box;
        # widen it to one pixel so the crop is never empty.
        if xmax <= xmin:
            xmax = min(W, xmin + 1)
            xmin = max(0, xmax - 1)
        if ymax <= ymin:
            ymax = min(H, ymin + 1)
            ymin = max(0, ymax - 1)

        crop1 = image.crop((xmin, ymin, xmax, ymax))
        zoom1 = crop1.resize((W, H), Image.BILINEAR)

        zx, zy = locate(zoom1, instruction_text, 3)[0]
        px_mid = (xmin + zx * (xmax - xmin)) / W
        py_mid = (ymin + zy * (ymax - ymin)) / H

        # ---------- stage 3: confirm inside a fixed-size box around the stage-2 point ----------
        cx, cy = int(px_mid * W), int(py_mid * H)
        box = int(0.25 * min(W, H))

        x1 = max(0, cx - box)
        y1 = max(0, cy - box)
        x2 = min(W, cx + box)
        y2 = min(H, cy + box)

        crop2 = image.crop((x1, y1, x2, y2))
        zoom2 = crop2.resize((W, H), Image.BILINEAR)

        zx2, zy2 = locate(zoom2, instruction_text, 1)[0]
        final_px = (x1 + zx2 * (x2 - x1)) / W
        final_py = (y1 + zy2 * (y2 - y1)) / H

        # ---------- evaluation ----------
        # A hit needs the point inside the box on both axes ("or" counted
        # points that matched on a single axis only).
        gx1, gy1, gx2, gy2 = ele["bbox_x1y1x2y2"]
        if gx1 <= final_px <= gx2 and gy1 <= final_py <= gy2:
            ele["hit_top1"] = 1

        # ================= VIS DEBUG =================
        img_np = np.array(image).astype(np.float32) / 255.0

        if mask_blur is not None:
            # Darken everything outside the soft Spotlight mask.
            darken_factor = 0.07
            if mask_blur.ndim == 3:
                mask_blur = mask_blur.squeeze(-1)
            mask3 = np.stack([mask_blur] * 3, axis=-1)
            dark_np = img_np * darken_factor
            vis_np = dark_np * (1 - mask3) + img_np * mask3
            vis_np = np.clip(vis_np, 0, 1)
        else:
            vis_np = img_np

        vis = Image.fromarray((vis_np * 255).astype(np.uint8))
        draw = ImageDraw.Draw(vis)

        # Ground-truth box (lime), Spotlight ROI (yellow), final point (red).
        draw.rectangle(
            [int(gx1 * W), int(gy1 * H), int(gx2 * W), int(gy2 * H)],
            outline="lime", width=3
        )
        draw.rectangle([rx1, ry1, rx2, ry2], outline="yellow", width=2)

        fx, fy = int(final_px * W), int(final_py * H)
        r = 6
        draw.ellipse([fx - r, fy - r, fx + r, fy + r], fill="red")

        vis.save(f"vis_debug/{example['id']}_final.png")

        results.append(ele)

    return results


In [8]:
# # evaluate(): variant WITH an additional zoom-refine stage (commented out)
# def evaluate(
#     model_name_or_path,
#     model_type,
#     data_fn,
#     image_dir,
#     use_placeholder,
#     topk,
#     resize_to_pixels=None
# ):
#     import os, json, torch
#     import numpy as np
#     from PIL import Image, ImageDraw
#     from tqdm import tqdm
#     from transformers import AutoProcessor

#     # ================= LOAD PROCESSOR & TOKENIZER =================
#     data_processor = AutoProcessor.from_pretrained(model_name_or_path)
#     tokenizer = data_processor.tokenizer

#     if model_type == "qwen2vl":
#         model = Qwen2VLForConditionalGenerationWithPointer.from_pretrained(
#             model_name_or_path,
#             torch_dtype=torch.bfloat16,
#             device_map="cuda:0",
#             attn_implementation="flash_attention_2"
#         ).eval()

#         grounding_system_message = (
#             "You are a GUI agent. You are given a task and a screenshot of the screen. "
#             "You need to locate the UI element corresponding to the instruction."
#         )

#     elif model_type == "qwen25vl":
#         model = Qwen2_5_VLForConditionalGenerationWithPointer.from_pretrained(
#             model_name_or_path,
#             torch_dtype=torch.bfloat16,
#             device_map="cuda:0",
#             attn_implementation="flash_attention_2"
#         ).eval()

#         grounding_system_message = (
#             "You are a GUI agent. Given a screenshot of the current GUI and a human instruction, "
#             "your task is to locate the screen element that corresponds to the instruction."
#         )
#     else:
#         raise ValueError("Invalid model type")

#     logits_processor_pointer = ForceFollowTokensLogitsProcessor(
#         token_a_id=tokenizer.encode(DEFAULT_POINTER_PAD_TOKEN)[0],
#         forced_sequence=[tokenizer.encode(DEFAULT_POINTER_END_TOKEN)[0]]
#     )

#     # ================= LOAD DATA =================
#     with open(data_fn, "r") as f:
#         data = json.load(f)

#     results = []
#     os.makedirs("vis_debug", exist_ok=True)

#     # ================= MAIN LOOP =================
#     for example in tqdm(data):

#         # ---------- instruction selection (KEY CHANGE) ----------
#         instruction_text = example["instruction"]
#         fname = example.get("img_filename", "").lower()

#         if fname.startswith("autocad_windows") or fname.startswith("solidworks_windows"):
#             if "instruction_cn" in example and example["instruction_cn"]:
#                 instruction_text = example["instruction_cn"]

#         # ---------- metadata ----------
#         ele = {
#             "file_name": example["img_filename"],
#             "ui_type": example["ui_type"],
#             "group": example["group"],
#             "platform": example["platform"],
#             "application": example["application"],
#             "id": example["id"],
#             "instruction": instruction_text,
#             "img_size": example["img_size"],
#             "bbox_x1y1x2y2": normalize_bbox(
#                 example["bbox"],
#                 example["img_size"][0],
#                 example["img_size"][1]
#             ),
#             "hit_top1": 0,
#         }

#         # ---------- load image ----------
#         image = Image.open(
#             os.path.join(image_dir, example["img_filename"])
#         ).convert("RGB")
#         W, H = image.size

#         # ================= STAGE 0: SPOTLIGHT ROI =================
#         roi_bbox, mask_blur = spotlight_bbox(image, instruction_text)

#         if roi_bbox is None:
#             rx1, ry1, rx2, ry2 = 0, 0, W, H
#         else:
#             rx1, ry1, rx2, ry2 = roi_bbox

#         roi = image.crop((rx1, ry1, rx2, ry2))
#         roi_w, roi_h = roi.size

#         # ================= STAGE 1: COARSE TOP-3 =================
#         conv1 = [
#             {"role": "system", "content": [{"type": "text", "text": grounding_system_message}]},
#             {"role": "user", "content": [
#                 {"type": "image", "image": roi},
#                 {"type": "text", "text": instruction_text}
#             ]}
#         ]

#         pred1 = inference(
#             conv1,
#             model,
#             tokenizer,
#             data_processor,
#             logits_processor=logits_processor_pointer,
#             use_placeholder=use_placeholder,
#             topk=3
#         )

#         pts = []
#         for px, py in pred1["topk_points"][:3]:
#             gx = px * roi_w + rx1
#             gy = py * roi_h + ry1
#             pts.append((gx, gy))

#         # ================= STAGE 2: ZOOM-REFINE-1 =================
#         xs, ys = zip(*pts)
#         xmin, xmax = min(xs), max(xs)
#         ymin, ymax = min(ys), max(ys)

#         margin = 0.25
#         bw, bh = xmax - xmin, ymax - ymin
#         xmin = max(0, int(xmin - bw * margin))
#         ymin = max(0, int(ymin - bh * margin))
#         xmax = min(W, int(xmax + bw * margin))
#         ymax = min(H, int(ymax + bh * margin))

#         crop1 = image.crop((xmin, ymin, xmax, ymax))
#         zoom1 = crop1.resize((W, H), Image.BILINEAR)

#         conv2 = [
#             {"role": "system", "content": [{"type": "text", "text": grounding_system_message}]},
#             {"role": "user", "content": [
#                 {"type": "image", "image": zoom1},
#                 {"type": "text", "text": instruction_text}
#             ]}
#         ]

#         pred2 = inference(
#             conv2,
#             model,
#             tokenizer,
#             data_processor,
#             logits_processor=logits_processor_pointer,
#             use_placeholder=use_placeholder,
#             topk=3
#         )

#         # ================= STAGE 3: ZOOM-REFINE-2 =================
#         pts2 = []
#         for zx, zy in pred2["topk_points"]:
#             gx = xmin + zx * (xmax - xmin)
#             gy = ymin + zy * (ymax - ymin)
#             pts2.append((gx, gy))

#         xs, ys = zip(*pts2)
#         xmin2, xmax2 = min(xs), max(xs)
#         ymin2, ymax2 = min(ys), max(ys)

#         margin = 0.15
#         bw, bh = xmax2 - xmin2, ymax2 - ymin2
#         xmin2 = max(0, int(xmin2 - bw * margin))
#         ymin2 = max(0, int(ymin2 - bh * margin))
#         xmax2 = min(W, int(xmax2 + bw * margin))
#         ymax2 = min(H, int(ymax2 + bh * margin))

#         crop_mid = image.crop((xmin2, ymin2, xmax2, ymax2))
#         zoom_mid = crop_mid.resize((W, H), Image.BILINEAR)

#         conv_mid = [
#             {"role": "system", "content": [{"type": "text", "text": grounding_system_message}]},
#             {"role": "user", "content": [
#                 {"type": "image", "image": zoom_mid},
#                 {"type": "text", "text": instruction_text}
#             ]}
#         ]

#         pred_mid = inference(
#             conv_mid,
#             model,
#             tokenizer,
#             data_processor,
#             logits_processor=logits_processor_pointer,
#             use_placeholder=use_placeholder,
#             topk=1
#         )

#         zx_mid, zy_mid = pred_mid["topk_points"][0]
#         px_mid = (xmin2 + zx_mid * (xmax2 - xmin2)) / W
#         py_mid = (ymin2 + zy_mid * (ymax2 - ymin2)) / H

#         # ================= STAGE 4: FINAL CONFIRM =================
#         cx, cy = int(px_mid * W), int(py_mid * H)
#         box = int(0.25 * min(W, H))

#         x1 = max(0, cx - box)
#         y1 = max(0, cy - box)
#         x2 = min(W, cx + box)
#         y2 = min(H, cy + box)

#         crop2 = image.crop((x1, y1, x2, y2))
#         zoom2 = crop2.resize((W, H), Image.BILINEAR)

#         conv3 = [
#             {"role": "system", "content": [{"type": "text", "text": grounding_system_message}]},
#             {"role": "user", "content": [
#                 {"type": "image", "image": zoom2},
#                 {"type": "text", "text": instruction_text}
#             ]}
#         ]

#         pred3 = inference(
#             conv3,
#             model,
#             tokenizer,
#             data_processor,
#             logits_processor=logits_processor_pointer,
#             use_placeholder=use_placeholder,
#             topk=1
#         )

#         zx2, zy2 = pred3["topk_points"][0]
#         final_px = (x1 + zx2 * (x2 - x1)) / W
#         final_py = (y1 + zy2 * (y2 - y1)) / H

#         # ================= EVALUATION =================
#         gx1, gy1, gx2, gy2 = ele["bbox_x1y1x2y2"]
#         if gx1 <= final_px <= gx2 or gy1 <= final_py <= gy2:
#             ele["hit_top1"] = 1

#         # ================= VIS DEBUG =================
#         img_np = np.array(image).astype(np.float32) / 255.0

#         if mask_blur is not None:
#             darken_factor = 0.07
#             if mask_blur.ndim == 3:
#                 mask_blur = mask_blur.squeeze(-1)
#             mask3 = np.stack([mask_blur] * 3, axis=-1)
#             dark_np = img_np * darken_factor
#             vis_np = dark_np * (1 - mask3) + img_np * mask3
#             vis_np = np.clip(vis_np, 0, 1)
#         else:
#             vis_np = img_np

#         vis = Image.fromarray((vis_np * 255).astype(np.uint8))
#         draw = ImageDraw.Draw(vis)

#         draw.rectangle(
#             [int(gx1 * W), int(gy1 * H), int(gx2 * W), int(gy2 * H)],
#             outline="lime", width=3
#         )
#         draw.rectangle([rx1, ry1, rx2, ry2], outline="yellow", width=2)

#         fx, fy = int(final_px * W), int(final_py * H)
#         r = 6
#         draw.ellipse([fx - r, fy - r, fx + r, fy + r], fill="red")

#         vis.save(f"vis_debug/{example['id']}_final.png")

#         results.append(ele)

#     return results


In [9]:
# ================= 5️⃣ get_metric function =================
def get_metric(list_of_examples):
    metrics = ["hit_top1", "overlap_top1", "hit_topk", "overlap_topk"]
    groups=["Dev", "Creative", "CAD", "Scientific", "Office", "OS"]
    ui_types=["text", "icon"]

    def compute_mean(examples, key):
        if not examples: return None
        return sum(example.get(key, 0) for example in examples)/len(examples)

    results = {metric:{} for metric in metrics}

    for group in groups:
        group_examples = [ex for ex in list_of_examples if ex.get("group")==group]
        for ui in ui_types:
            group_ui_examples = [ex for ex in group_examples if ex.get("ui_type")==ui]
            col_name=f"{group}-{ui}"
            for metric in metrics:
                results[metric][col_name]=compute_mean(group_ui_examples, metric)
        col_name_avg=f"{group}-avg"
        for metric in metrics:
            results[metric][col_name_avg]=compute_mean(group_examples, metric)

    for ui in ui_types:
        ui_examples = [ex for ex in list_of_examples if ex.get("ui_type")==ui]
        col_name=f"All-{ui}"
        for metric in metrics:
            results[metric][col_name]=compute_mean(ui_examples, metric)
    for metric in metrics:
        results[metric]["All-avg"]=compute_mean(list_of_examples, metric)

    return results


In [None]:
# ================= 6️⃣ Run evaluation and save the results =================
# A non-positive RESIZE_TO_PIXELS is handed to evaluate() as None.
resize_to_pixels = None
if RESIZE_TO_PIXELS > 0:
    resize_to_pixels = RESIZE_TO_PIXELS

if not os.path.exists(METRIC_PATH):
    # Reuse cached predictions when present; otherwise run the full
    # evaluation and cache its output before computing metrics.
    if not os.path.exists(PRED_PATH):
        print(f"Evaluating {MODEL_NAME_OR_PATH} ...")
        results = evaluate(
            MODEL_NAME_OR_PATH,
            MODEL_TYPE,
            DATA_FN,
            IMAGE_DIR,
            USE_PLACEHOLDER,
            TOPK,
            resize_to_pixels,
        )
        with open(PRED_PATH, "w") as f:
            json.dump(results, f)
        print(f"Saved {len(results)} predictions to {PRED_PATH}")
    else:
        print(f"Loading predictions from {PRED_PATH}")
        with open(PRED_PATH, "r") as f:
            results = json.load(f)

    # Aggregate the per-example records and persist the metric table.
    metric_info = get_metric(results)
    with open(METRIC_PATH, "w") as f:
        json.dump(metric_info, f, indent=2)
    print(f"Saved metric to {METRIC_PATH}")
else:
    # Metrics file already on disk: nothing is recomputed or loaded.
    print(f"Metrics already exist at {METRIC_PATH}")


Evaluating microsoft/GUI-Actor-7B-Qwen2.5-VL ...


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/248 [00:00<?, ?B/s]

  1%|▏         | 23/1581 [08:27<8:33:56, 19.79s/it] 