In [None]:
import os
import sys
import json
import torch
import torch.nn.functional as F
import numpy as np
from PIL import Image, ImageDraw
from tqdm import tqdm
from glob import glob
import cv2
from glob import glob
from transformers import CLIPProcessor, CLIPModel
from sklearn.metrics.pairwise import cosine_similarity

sys.path.append('/home/phamlong/Downloads/v2p/AWorld-RL/V2P/src')
from transformers import AutoProcessor
from V2P.constants import DEFAULT_POINTER_PAD_TOKEN, DEFAULT_POINTER_END_TOKEN
from V2P.modeling import Qwen2VLForConditionalGenerationWithPointer
from V2P.modeling_qwen25vl import Qwen2_5_VLForConditionalGenerationWithPointer
from V2P.inference import inference, ForceFollowTokensLogitsProcessor


In [None]:


# Gaussian sigma passed to cv2.GaussianBlur in spotlight_bbox to feather the
# full-resolution spotlight mask.
FEATHER_SIGMA = 16       
# Cut-off for binarising the feathered mask; spotlight_bbox thresholds the
# blurred mask at this same value (0.25) when deriving the ROI box.
SOFT_MASK_THRESH = 0.25  


In [None]:
# ===== Grounding model / benchmark config =====
# MODEL_TYPE selects the model class and system prompt inside evaluate()
# ("qwen2vl" or "qwen25vl").
MODEL_TYPE = "qwen25vl"
MODEL_NAME_OR_PATH = "inclusionAI/V2P-7B"

SAVE_PATH = "results/"
# NOTE(review): machine-specific absolute path; adjust per environment.
DATA_PATH = "/home/phamlong/Downloads/v2p/OSWorld-G/benchmark"

RESIZE_TO_PIXELS = 3200 * 1800

USE_PLACEHOLDER = True
TOPK = 3
IMAGE_PATCH_SIZE = 14

# ===== Spotlight Config =====
# NOTE(review): machine-specific absolute paths; adjust per environment.
BASE_CLIP = "/home/phamlong/Downloads/clip finetune/clip-vit-base-patch32"
CKPT_PATH = "/home/phamlong/Downloads/clip finetune/clip_finetuned/clip_finetuned_epoch5.pth"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

NUM_GRID = 36
TOP_K_REGIONS = 4      # max number of connected regions kept by spotlight_bbox
REGION_THRESH = 0.25   # grid-score cut-off used by spotlight_bbox
CROP_PAD = 8           # pixel padding added around the spotlight box

print("Loading CLIP...")
clip_model = CLIPModel.from_pretrained(BASE_CLIP)
clip_processor = CLIPProcessor.from_pretrained(BASE_CLIP)

if os.path.exists(CKPT_PATH):
    # NOTE(review): torch.load unpickles the file, which can execute arbitrary
    # code; only point CKPT_PATH at checkpoints you trust.
    state = torch.load(CKPT_PATH, map_location="cpu")
    if "model" in state:
        state = state["model"]
    # strict=False tolerates key mismatches, so report them instead of letting
    # a badly-matching checkpoint silently leave the base weights in place.
    load_result = clip_model.load_state_dict(state, strict=False)
    if load_result.missing_keys or load_result.unexpected_keys:
        print(
            f"Warning: checkpoint {CKPT_PATH} only partially matched the CLIP model "
            f"({len(load_result.missing_keys)} missing, "
            f"{len(load_result.unexpected_keys)} unexpected keys)"
        )
else:
    print(f"Warning: fine-tuned checkpoint not found at {CKPT_PATH}; using base CLIP weights")

clip_model = clip_model.to(DEVICE).eval()
# Disables autograd globally for every subsequent cell in this kernel.
torch.set_grad_enabled(False)

IMAGE_DIR = os.path.join(DATA_PATH, "images")
DATA_FN = os.path.join(DATA_PATH, "classification_result.json")

PRED_PATH = os.path.join(SAVE_PATH, "osworld_results.json")
METRIC_PATH = os.path.join(SAVE_PATH, "osworld_metrics.json")

assert os.path.isfile(DATA_FN), \
    f"Missing classification_result.json at {DATA_FN}"

assert os.path.isdir(IMAGE_DIR), \
    f"Missing images directory at {IMAGE_DIR}"

# exist_ok=True already makes this a no-op when the directory is present.
os.makedirs(SAVE_PATH, exist_ok=True)


def normalize_bbox(bbox_x1y1x2y2, img_width, img_height):
    """Express an (x1, y1, x2, y2) box as fractions of the image size.

    If all four coordinates already lie in [0, 1] the input object is returned
    unchanged. Otherwise a new tuple is returned with the x values divided by
    ``img_width`` and the y values divided by ``img_height``.
    """
    left, top, right, bottom = bbox_x1y1x2y2

    already_normalized = all(
        0 <= coord <= 1 for coord in (left, top, right, bottom)
    )
    if already_normalized:
        return bbox_x1y1x2y2

    return (
        left / img_width,
        top / img_height,
        right / img_width,
        bottom / img_height,
    )


Loading CLIP...


In [4]:
def hybrid_vector_field_grid(image, instruction, num_grid=36, sigma=0.4, batch_size=8):
    """Score a ``num_grid`` x ``num_grid`` grid of image patches against a text instruction.

    The image is cut into equal patches, each patch and the instruction are
    embedded with the module-level CLIP model, and a per-patch score is built
    from (a) the patch/text cosine similarity, (b) a sigmoid gate on that
    similarity and (c) a divergence-like term computed on a gate-weighted
    field of embedding differences.

    Args:
        image: PIL image (converted to RGB internally).
        instruction: text passed to the CLIP processor.
        num_grid: number of patches per side.
        sigma: bandwidth of the Gaussian kernel over embedding distances.
        batch_size: number of patches embedded per CLIP forward pass.

    Returns:
        ``(scores, (H, W))`` where ``scores`` has shape ``(num_grid, num_grid)``
        indexed as ``scores[gy, gx]`` and is min-max normalised, and ``(H, W)``
        is the image height and width in pixels.
    """
    img = image.convert("RGB")
    W, H = img.size
    # Integer division: remainder pixels on the right/bottom edge are not
    # covered by any patch.
    pw, ph = W // num_grid, H // num_grid

    # Row-major order (gy outer, gx inner) so the final reshape gives [gy, gx].
    patches = []
    for gy in range(num_grid):
        for gx in range(num_grid):
            x1, y1 = gx * pw, gy * ph
            x2, y2 = x1 + pw, y1 + ph
            patches.append(img.crop((x1, y1, x2, y2)))

    # Inference only: a local no_grad keeps the .numpy() calls valid even when
    # the caller has not disabled autograd globally.
    with torch.no_grad():
        # text embedding; truncate so instructions longer than CLIP's context
        # length do not overflow the text encoder's position embeddings
        text_inputs = clip_processor(
            text=instruction, return_tensors="pt", truncation=True
        ).to(DEVICE)
        text_emb = clip_model.get_text_features(**text_inputs)
        text_emb = text_emb / text_emb.norm(dim=-1, keepdim=True)
        text_emb = text_emb.cpu().numpy()

        # patch embeddings, L2-normalised, computed in mini-batches
        feats = []
        for i in range(0, len(patches), batch_size):
            inputs = clip_processor(images=patches[i:i+batch_size], return_tensors="pt").to(DEVICE)
            emb = clip_model.get_image_features(**inputs)
            emb = emb / emb.norm(dim=-1, keepdim=True)
            feats.append(emb.cpu().numpy())
    patch_embs = np.concatenate(feats, axis=0)

    # Sigmoid gate centred at cosine similarity 0.5 with slope 3.
    cos_sim = cosine_similarity(patch_embs, text_emb).flatten()
    gates = 1 / (1 + np.exp(-3.0 * (cos_sim - 0.5)))
    gated = patch_embs * gates[:, None]

    # For each patch, sum the differences to every other gated embedding,
    # weighted by a Gaussian kernel on embedding distance and by the gate.
    N = gated.shape[0]
    F_field = np.zeros_like(gated)
    for i in range(N):
        diff = gated - gated[i]
        dist = np.linalg.norm(diff, axis=1)
        w = np.exp(-dist**2 / sigma**2) * gates
        F_field[i] = np.sum(w[:, None] * diff, axis=0)

    # Unit-normalise each field vector (epsilon avoids division by zero).
    F_field /= (np.linalg.norm(F_field, axis=1, keepdims=True) + 1e-8)

    # Divergence-like term: mean difference to the 8 nearest field vectors
    # (the patch itself is among them), projected on the patch's own vector.
    div = np.zeros(N)
    for i in range(N):
        idx = np.argsort(np.linalg.norm(F_field - F_field[i], axis=1))[:8]
        grad = (F_field[idx] - F_field[i]).mean(axis=0)
        div[i] = np.dot(grad, F_field[i])

    # Only negative divergence contributes; rescale to [0, 1].
    score = np.log1p(np.maximum(0, -div)) * cos_sim * gates
    score = (score - score.min()) / (score.max() - score.min() + 1e-8)

    return score.reshape(num_grid, num_grid), (H, W)


In [None]:

def spotlight_bbox(image, instruction):
    """Find a region of interest for ``instruction`` and return its box and soft mask.

    Grid scores from ``hybrid_vector_field_grid`` are thresholded at
    ``REGION_THRESH``, grouped into connected components, and the
    ``TOP_K_REGIONS`` best components (ranked by mean score times area) are
    merged into a mask. The mask is upsampled to image size, feathered with a
    Gaussian blur, and the bounding box of the pixels above
    ``SOFT_MASK_THRESH`` (padded by ``CROP_PAD``) is returned.

    Args:
        image: PIL image.
        instruction: text instruction forwarded to the grid scorer.

    Returns:
        ``(bbox, mask_blur)``. ``bbox`` is a tuple of plain ints
        ``(left, top, right, bottom)`` in pixels with ``right``/``bottom``
        exclusive, so it can be passed directly to ``Image.crop``.
        ``mask_blur`` is the feathered mask at image resolution.
        Returns ``(None, None)`` when no region survives.
    """
    grid_scores, (H, W) = hybrid_vector_field_grid(image, instruction)

    # Connected components over grid cells whose score clears the threshold.
    binary = (grid_scores >= REGION_THRESH).astype(np.uint8)
    num_labels, labels = cv2.connectedComponents(binary)

    regions = []
    for l in range(1, num_labels):  # label 0 is the background
        mask = (labels == l)
        area = mask.sum()
        if area < 3:
            # ignore components smaller than 3 grid cells
            continue
        score = grid_scores[mask].mean() * area
        regions.append((score, l))

    regions = sorted(regions, reverse=True)[:TOP_K_REGIONS]

    if len(regions) == 0:
        return None, None

    # Union of the selected components as a 0/1 grid mask.
    patch_mask = np.zeros_like(grid_scores, dtype=np.float32)
    for _, l in regions:
        patch_mask += (labels == l).astype(np.float32)
    patch_mask = np.clip(patch_mask, 0, 1)

    # Upsample the grid mask to full image resolution.
    t = torch.tensor(patch_mask)[None, None]
    mask_full = F.interpolate(
        t, size=(H, W), mode="bicubic", align_corners=False
    ).squeeze().numpy()

    # Min-max rescale. NOTE: a constant mask (every grid cell selected)
    # collapses to all zeros here, which makes the function return (None, None).
    mask_full = (mask_full - mask_full.min()) / (mask_full.max() - mask_full.min() + 1e-8)

    # ---- feather ----
    mask_blur = cv2.GaussianBlur(mask_full, (0, 0), FEATHER_SIGMA)

    # ---- bbox from mask ----
    bin_mask = mask_blur > SOFT_MASK_THRESH
    ys, xs = np.where(bin_mask)
    if len(xs) == 0:
        return None, None

    # Plain ints so the box does not carry numpy scalar types to callers.
    x1, x2 = int(xs.min()), int(xs.max())
    y1, y2 = int(ys.min()), int(ys.max())

    # xs.max()/ys.max() are inclusive pixel indices; add 1 so the right/bottom
    # bounds are exclusive and the last selected column/row is kept when the
    # box is used as a crop rectangle.
    bbox = (
        max(0, x1 - CROP_PAD),
        max(0, y1 - CROP_PAD),
        min(W, x2 + CROP_PAD + 1),
        min(H, y2 + CROP_PAD + 1),
    )

    return bbox, mask_blur



In [6]:
def point_in_polygon(point, polygon):
    """Even-odd (ray casting) test of whether ``point`` lies inside ``polygon``.

    Args:
        point: ``(x, y)`` pair.
        polygon: flat sequence ``[x0, y0, x1, y1, ...]`` of vertex coordinates.
            A trailing unpaired value is ignored; an empty sequence yields
            ``False``.

    Returns:
        ``True`` if a horizontal ray from the point towards +x crosses the
        polygon boundary an odd number of times, else ``False``.
    """
    x, y = point
    inside = False
    n = len(polygon) // 2
    j = n - 1  # index of the previous vertex; starts at the last one to close the ring
    for i in range(n):
        xi, yi = polygon[2 * i], polygon[2 * i + 1]
        xj, yj = polygon[2 * j], polygon[2 * j + 1]
        # The edge straddles the ray's y only when exactly one endpoint is
        # above it, which guarantees yi != yj: the division below is safe
        # without an epsilon (which would bias the result and could itself
        # cancel the denominator to zero).
        if (yi > y) != (yj > y):
            x_intersect = (xj - xi) * (y - yi) / (yj - yi) + xi
            if x < x_intersect:
                inside = not inside
        j = i
    return inside



In [None]:
import glob

def evaluate(
    model_name_or_path,
    model_type,
    data_fn,
    image_dir,
    use_placeholder,
    topk,
    resize_to_pixels=None
):
    """Run spotlight-cropped grounding over the benchmark and score hit@1.

    For every example the function computes a region of interest with
    ``spotlight_bbox``, crops the screenshot to it (or uses the whole image if
    no region is found), runs ``inference`` on the crop, maps the first
    predicted point back to the full image, checks it against the ground
    truth, and writes a debug visualisation to ``vis_debug/``.

    Args:
        model_name_or_path: identifier passed to ``from_pretrained`` for both
            the processor and the model.
        model_type: ``"qwen2vl"`` or ``"qwen25vl"``; selects the model class
            and the system prompt.
        data_fn: path to a JSON file with a top-level ``"classified"`` mapping
            of task type -> list of samples.
        image_dir: directory that the samples' ``image_path`` is relative to.
        use_placeholder: forwarded to ``inference``.
        topk: forwarded to ``inference``.
        resize_to_pixels: NOTE(review): accepted but never referenced in the
            body, so it currently has no effect.

    Returns:
        A list with one dict per example containing ``file_name``,
        ``platform``, ``ui_type``, ``instruction``, ``img_size``,
        ``task_type`` and ``hit_top1`` (0 or 1).

    Raises:
        ValueError: if ``model_type`` is not one of the supported values.
        AssertionError: if ``data_fn`` is missing or lacks ``"classified"``.
    """
    import os, json, torch
    import numpy as np
    from PIL import Image, ImageDraw
    from tqdm import tqdm
    from transformers import AutoProcessor

    data_processor = AutoProcessor.from_pretrained(model_name_or_path)
    tokenizer = data_processor.tokenizer

    # Pick the model class and matching system prompt for the requested family.
    if model_type == "qwen2vl":
        model = Qwen2VLForConditionalGenerationWithPointer.from_pretrained(
            model_name_or_path,
            torch_dtype=torch.bfloat16,
            device_map="cuda:0",
            attn_implementation="flash_attention_2"
        ).eval()

        grounding_system_message = (
            "You are a GUI agent. You are given a task and a screenshot of the screen. "
            "You need to locate the UI element corresponding to the instruction."
        )

    elif model_type == "qwen25vl":
        model = Qwen2_5_VLForConditionalGenerationWithPointer.from_pretrained(
            model_name_or_path,
            torch_dtype=torch.bfloat16,
            device_map="cuda:0",
            attn_implementation="flash_attention_2"
        ).eval()

        grounding_system_message = (
            "You are a GUI agent. Given a screenshot of the current GUI and a human instruction, "
            "your task is to locate the screen element that corresponds to the instruction."
        )
    else:
        raise ValueError("Invalid model type")

    # Built from the first token id of the pointer pad / end tokens.
    # NOTE(review): presumably forces the end token to follow the pad token
    # during generation -- confirm in V2P.inference.
    logits_processor_pointer = ForceFollowTokensLogitsProcessor(
        token_a_id=tokenizer.encode(DEFAULT_POINTER_PAD_TOKEN)[0],
        forced_sequence=[tokenizer.encode(DEFAULT_POINTER_END_TOKEN)[0]]
    )

    assert os.path.isfile(data_fn)
    with open(data_fn, "r") as f:
        raw_json = json.load(f)

    assert "classified" in raw_json

    # Flatten {task_type: [samples]} into one list, tagging each sample with its task type.
    data = []
    for task_type, samples in raw_json["classified"].items():
        for ex in samples:
            data.append({
                "id": ex["id"],
                "instruction": ex["instruction"],
                "img_filename": ex["image_path"],
                "img_size": ex["image_size"],
                "box_type": ex["box_type"],
                "box_coordinates": ex["box_coordinates"],
                "platform": "unknown",
                "ui_type": (
                    "+".join(ex["GUI_types"])
                    if isinstance(ex.get("GUI_types"), list)
                    else "unknown"
                ),
                "task_type": task_type
            })

    results = []
    os.makedirs("vis_debug", exist_ok=True)

    for example in tqdm(data):
        instruction_text = example["instruction"]

        # Per-example result record; hit_top1 is filled in after scoring.
        ele = {
            "file_name": example["img_filename"],
            "platform": example["platform"],
            "ui_type": example["ui_type"],
            "instruction": instruction_text,
            "img_size": example["img_size"],
            "task_type": example["task_type"],
            "hit_top1": 0
        }

        image = Image.open(os.path.join(image_dir, example["img_filename"])).convert("RGB")
        W, H = image.size

        roi_bbox, mask_blur = spotlight_bbox(image, instruction_text)

        # No spotlight region found: fall back to the whole screenshot.
        if roi_bbox is None:
            rx1, ry1, rx2, ry2 = 0, 0, W, H
        else:
            rx1, ry1, rx2, ry2 = roi_bbox

        roi = image.crop((rx1, ry1, rx2, ry2))
        roi_w, roi_h = roi.size

        conv = [
            {"role": "system", "content": [{"type": "text", "text": grounding_system_message}]},
            {"role": "user", "content": [
                {"type": "image", "image": roi},
                {"type": "text", "text": instruction_text}
            ]}
        ]

        pred = inference(
            conv,
            model,
            tokenizer,
            data_processor,
            logits_processor=logits_processor_pointer,
            use_placeholder=use_placeholder,
            topk=topk
        )

        # ================= FINAL POINT (NO STAGE 2) =================
        # Take the top-1 point from the ROI and map it back to the original image.
        # Assumes topk_points are fractions of the ROI size -- TODO confirm
        # against V2P.inference. The result is normalised by the full image size.
        zx, zy = pred["topk_points"][0]
        final_px = (rx1 + zx * roi_w) / W
        final_py = (ry1 + zy * roi_h) / H

        # ================= HIT@1 =================
        px, py = final_px, final_py
        hit = 0

        if example["box_type"] == "bbox":
            # box_coordinates is unpacked as (x, y, width, height) and converted to corners.
            x, y, bw, bh = example["box_coordinates"]
            gt = normalize_bbox([x, y, x + bw, y + bh], W, H)
            if gt[0] <= px <= gt[2] and gt[1] <= py <= gt[3]:
                hit = 1

        elif example["box_type"] == "polygon":
            # The polygon test is done in pixel coordinates.
            cx, cy = px * W, py * H
            if point_in_polygon((cx, cy), example["box_coordinates"]):
                hit = 1

        elif example["box_type"] == "refusal":
            # NOTE(review): refusal examples are always counted as a hit,
            # regardless of the prediction -- confirm this is intended.
            hit = 1

        ele["hit_top1"] = hit

        # Debug visualisation: darken everything outside the spotlight mask.
        img_np = np.array(image).astype(np.float32) / 255.0
        if mask_blur is not None and mask_blur.shape[:2] == img_np.shape[:2]:
            darken_factor = 0.07
            if mask_blur.ndim == 3:
                mask_blur = mask_blur.squeeze(-1)
            mask3 = np.stack([mask_blur] * 3, axis=-1)
            dark_np = img_np * darken_factor
            # Blend: original pixels where the mask is 1, darkened where it is 0.
            vis_np = dark_np * (1 - mask3) + img_np * mask3
            vis_np = np.clip(vis_np, 0, 1)
        else:
            vis_np = img_np

        vis = Image.fromarray((vis_np * 255).astype(np.uint8))
        draw = ImageDraw.Draw(vis)

        # Ground truth outline (gt is only defined in the bbox branch above,
        # which is guarded by the same condition).
        if example["box_type"] == "bbox":
            draw.rectangle(
                [int(gt[0] * W), int(gt[1] * H), int(gt[2] * W), int(gt[3] * H)],
                outline="lime",
                width=3
            )
        elif example["box_type"] == "polygon":
            poly = [(x, y) for x, y in zip(example["box_coordinates"][0::2], example["box_coordinates"][1::2])]
            draw.polygon(poly, outline="lime", width=3)

        # ROI used for the crop
        draw.rectangle([rx1, ry1, rx2, ry2], outline="yellow", width=2)

        # Predicted point, drawn as a filled red circle of radius r
        fx, fy = int(final_px * W), int(final_py * H)
        r = 6
        draw.ellipse([fx - r, fy - r, fx + r, fy + r], fill="red")

        vis.save(f"vis_debug/{example['id']}_final.png")

        results.append(ele)

    return results


In [None]:
def get_metric(list_of_examples):
    """Aggregate per-example ``hit_top1`` flags into per-task accuracies.

    Each known task type gets the mean of ``hit_top1`` over its examples, or
    ``None`` when it has no examples. Examples with any other ``task_type``
    are ignored. ``"Overall"`` is the unweighted mean of the available
    per-task accuracies, excluding ``"Refusal"`` (``None`` if there are none).

    Returns:
        Dict keyed by the display task names plus ``"Overall"``.
    """
    raw_to_display = (
        ("text_matching", "Text_Matching"),
        ("element_recognition", "Element_Recognition"),
        ("layout_understanding", "Layout_Understanding"),
        ("fine_grained_manipulation", "Fine_grained_Manipulation"),
        ("refusal", "Refusal"),
    )

    summary = {}
    for raw_label, display_label in raw_to_display:
        hits = [
            sample.get("hit_top1", 0)
            for sample in list_of_examples
            if sample.get("task_type") == raw_label
        ]
        summary[display_label] = sum(hits) / len(hits) if hits else None

    # Macro-average over the tasks that have a score; Refusal is left out.
    scored = [
        accuracy
        for label, accuracy in summary.items()
        if label != "Refusal" and accuracy is not None
    ]
    if scored:
        summary["Overall"] = sum(scored) / len(scored)
    else:
        summary["Overall"] = None

    return summary


In [None]:
# Map a non-positive pixel budget to None before handing it to evaluate().
resize_to_pixels = RESIZE_TO_PIXELS if RESIZE_TO_PIXELS > 0 else None

assert os.path.isfile(DATA_FN), f"DATA_FN does not exist or is not a file: {DATA_FN}"
assert DATA_FN.endswith(".json"), f"DATA_FN must be a json file, got {DATA_FN}"

# Two-level cache: existing metrics short-circuit everything; existing
# predictions skip the (expensive) model run and only recompute metrics.
if os.path.exists(METRIC_PATH):
    print(f"Metrics already exist at {METRIC_PATH}")

else:
    if os.path.exists(PRED_PATH):
        print(f"Loading predictions from {PRED_PATH}")
        with open(PRED_PATH, "r") as f:
            results = json.load(f)

    else:
        print(f"Evaluating {MODEL_NAME_OR_PATH} ...")
        results = evaluate(
            model_name_or_path=MODEL_NAME_OR_PATH,
            model_type=MODEL_TYPE,
            data_fn=DATA_FN,
            image_dir=IMAGE_DIR,
            use_placeholder=USE_PLACEHOLDER,
            topk=TOPK,
            resize_to_pixels=resize_to_pixels
        )

        # Sanity-check the result schema before persisting it. The explicit
        # non-empty check gives a clear message instead of an IndexError below.
        assert isinstance(results, list)
        assert results, "evaluate() returned no results"
        assert "task_type" in results[0]
        assert "hit_top1" in results[0]

        print(
            "Detected tasks:",
            sorted(set(r["task_type"] for r in results))
        )

        with open(PRED_PATH, "w") as f:
            json.dump(results, f, indent=2)

        print(f"Saved {len(results)} predictions to {PRED_PATH}")

    metric_info = get_metric(results)

    with open(METRIC_PATH, "w") as f:
        json.dump(metric_info, f, indent=2)

    print("===== FINAL METRICS (OSWorld-G) =====")
    for k, v in metric_info.items():
        # Tasks with no examples have no score.
        if v is None:
            print(f"{k}: N/A")
        else:
            print(f"{k}: {v:.4f}")

    print(f"Saved metric to {METRIC_PATH}")
