In [1]:
import json
import os
import sys
import re

In [None]:
def extract_information_vrpart2(output_text, x_factor, y_factor):
    """
        extract bounding boxes, points, and various text fields from model output.
        parses VRPart2 output format.
        <think> ... </think> <target> ... </target> <object_hint> ... </object_hint> <first_answer> ... </first_answer> <criticism> ... </criticism> <answer> ... </answer>
        this returns empty strings and lists if it does not find the expected tags in the output.
    """
    # Extract think tag
    think_pattern = r'<think>([^<]+)</think>'
    think_match = re.search(think_pattern, output_text)
    think_text = think_match.group(1).strip() if think_match else ""
    
    # Extract decide tag
    decide_pattern = r'<target>([^<]+)</target>'
    decide_match = re.search(decide_pattern, output_text)
    decide_text = decide_match.group(1).strip() if decide_match else ""

    # Extract object_hint tag
    object_hint_pattern = r'<object_hint>\s*(.*?)\s*</object_hint>'
    object_hint_match = re.search(object_hint_pattern, output_text, re.DOTALL)
    object_hint_text = object_hint_match.group(1).strip() if object_hint_match else ""
    
    # Extract first_answer tag
    first_answer_pattern = r'<first_answer>\s*(.*?)\s*</first_answer>'
    first_answer_match = re.search(first_answer_pattern, output_text, re.DOTALL)
    first_answer_text = first_answer_match.group(1).strip() if first_answer_match else ""
    
    # Extract criticism tag
    criticism_pattern = r'<criticism>([^<]+)</criticism>'
    criticism_match = re.search(criticism_pattern, output_text)
    criticism_text = criticism_match.group(1).strip() if criticism_match else ""
    
    # Extract final_answer and parse bbox/points
    final_answer_pattern = r'<answer>\s*(.*?)\s*</answer>'
    final_answer_match = re.search(final_answer_pattern, output_text, re.DOTALL) 
    final_answer_text = final_answer_match.group(1).strip() if final_answer_match else ""

    output_text_parsed = {
        "think": think_text,
        "decide": decide_text,
        "first_answer": first_answer_text,
        "criticism": criticism_text,
        "final_answer": final_answer_text
    }
    
    pred_bboxes = []
    pred_points = []
    
    if final_answer_match:
        data = json.loads(final_answer_match.group(1))
        pred_bboxes = [[
            int(item['bbox_2d'][0] * x_factor + 0.5),
            int(item['bbox_2d'][1] * y_factor + 0.5),
            int(item['bbox_2d'][2] * x_factor + 0.5),
            int(item['bbox_2d'][3] * y_factor + 0.5)
        ] for item in data]
        pred_points = [[
            int(item['point_2d'][0] * x_factor + 0.5),
            int(item['point_2d'][1] * y_factor + 0.5)
        ] for item in data]

    return pred_bboxes, pred_points, output_text_parsed


In [3]:
test_string = """<think>Finding "fork's tines" means identifying the points where a fork's tines meet the handle. In this context, "fork's tines" refers to the points where the tines begin on the fork.</think>
<target>part</target>
<object_hint>
[{"bbox_2d": [0,96,838,840], "point_2d": [360,670]}]
</object_hint>
<first_answer>
[{"bbox_2d": [160,612,417,788], "point_2d": [373,708]}]
</first_answer>
<criticism>The box is already tight and correctly placed. ADJUSTMENT: YES</criticism>
<answer>
[{"bbox_2d": [160, 579, 443, 804], "point_2d": [373,708]}]
</answer>
"""

In [4]:
extract_information_vrpart2(test_string, 1.0, 1.0)

([[160, 579, 443, 804]],
 [[373, 708]],
 {'think': 'Finding "fork\'s tines" means identifying the points where a fork\'s tines meet the handle. In this context, "fork\'s tines" refers to the points where the tines begin on the fork.',
  'decide': 'part',
  'first_answer': '[{"bbox_2d": [160,612,417,788], "point_2d": [373,708]}]',
  'criticism': 'The box is already tight and correctly placed. ADJUSTMENT: YES',
  'final_answer': '[{"bbox_2d": [160, 579, 443, 804], "point_2d": [373,708]}]'})

In [1]:
# run inference

In [None]:
import os
from PIL import Image
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

# Load the model
model_path = "/data/VLMGroundingProject/ModelData/SegZero/visionreasoner_workdir/ip_vrpretrained_partreward2/global_step_224/actor/huggingface"
print("Loading model...")
reasoning_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)
reasoning_model.eval()

In [4]:
# Set your paths and parameters
image_path = "/data/VLMGroundingProject/Datasets/InstructPart/test/images/4773046926_15b78b68d5_o-television-screen.jpg"
query_text = "television's screen"  
prompt_template_path = "/home/ksmehrab/AttentionGrounding/ModelPlaygrounds/SegZero/EvaluationScripts/Prompts/vrpart2_prompt.txt"
resize_size = 840
max_response_length = 2000

# Load prompt template
with open(prompt_template_path, 'r') as f:
    QUESTION_TEMPLATE = f.read()

processor = AutoProcessor.from_pretrained(model_path)

# Load and prepare image
image = Image.open(image_path).convert("RGB")
resized_image = image.resize((resize_size, resize_size), Image.BILINEAR)

# Prepare message
message = [{
    "role": "user",
    "content": [
        {
            "type": "image", 
            "image": resized_image
        },
        {   
            "type": "text",
            "text": QUESTION_TEMPLATE.format(Question=query_text.lower().strip("."))
        }
    ]
}]

# Prepare inputs
text = processor.apply_chat_template(message, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info([message])

inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
).to("cuda")

# Generate output
print("Generating response...")
with torch.inference_mode():
    generated_ids = reasoning_model.generate(
        **inputs, 
        use_cache=True, 
        max_new_tokens=max_response_length, 
        do_sample=False
    )
    generated_ids_trimmed = [
        out_ids[len(in_ids):] 
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, 
        skip_special_tokens=False, 
        clean_up_tokenization_spaces=False
    )[0]

# Print the output
print("\n" + "="*50)
print("Model Output:")
print("="*50)
print(output_text)
print("="*50)

Generating response...

Model Output:
<think>In this image, the television's screen is the part of the television that displays the image. It is a rectangular area within the overall television set.</think>
<target>part</target>
<object_hint>[{"bbox_2d": [158,146,760,840], "point_2d": [350,230]}]</object_hint>
<first_answer>[{"bbox_2d": [248,209,539,602], "point_2d": [390,425]}]</first_answer>
<criticism>The box is already tight and correctly placed. ADJUSTMENT: NO</criticism>
<answer>[{"bbox_2d": [248,209,539,602], "point_2d": [390,425]}]</answer><|im_end|>
