In [1]:
# load instruct part data - images and corresponding masks

In [2]:
# changes from segzero dataset creation:
# 1. qwen3 outputs all boxes in 1000x1000 resolution, so the ground truth "resize_size" needs to be 1000x1000
# 2. qwen3 vision encoder actually expects image dimensions to be a multiple of 32, so we will resize images to 1024x1024
# 3. we will use the sam3 boxes as baseline boxes; instead of storing the boxes themselves, their average IoU against the ground truth is stored in a dataset column "baseline_iou"
# 4. we will add the grounding dino object boxes as object_hint_boxes in a dataset column "object_hint_boxes"
## for the sam3 and grounding dino boxes, we will use the same resize logic as the images -> the boxes need to be in a 1000x1000 grid
## if all 4 components of a box are 0, we will set that box to None instead of [0,0,0,0]. This needs to be handled during training

In [3]:
# format in segzero format 
from datasets import Dataset, DatasetDict, Image, Features, Value
from huggingface_hub import create_repo
import os
import json
from tqdm import tqdm
import glob
from tqdm import tqdm
from PIL import Image as PILImage
import cv2

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def scale_box_coordinates(bbox_2d, x_factor, y_factor):
    """
    Rescale a box and convert its coordinates to integers.

    bbox_2d: [x1, y1, x2, y2]
    x_factor: multiplier applied to x1 and x2
    y_factor: multiplier applied to y1 and y2

    Returns a new list [x1, y1, x2, y2] of ints. Each value is computed as
    int(v * factor + 0.5), i.e. round-half-up for non-negative coordinates.
    """
    # x and y coordinates alternate, so the factors alternate as well
    per_coordinate_factor = (x_factor, y_factor, x_factor, y_factor)
    return [
        int(bbox_2d[pos] * per_coordinate_factor[pos] + 0.5)
        for pos in range(4)
    ]

def scale_point_coordinates(point_2d, x_factor, y_factor):
    """
    Rescale a point and convert its coordinates to integers.

    point_2d: [x, y]
    x_factor: multiplier applied to x
    y_factor: multiplier applied to y

    Returns a new list [x, y] of ints. Each value is computed as
    int(v * factor + 0.5), i.e. round-half-up for non-negative coordinates.
    """
    new_x = int(point_2d[0] * x_factor + 0.5)
    new_y = int(point_2d[1] * y_factor + 0.5)
    return [new_x, new_y]

def create_local_dataset(train_data, output_dir, image_resize):
    """
    Build a Hugging Face DatasetDict with a single 'train' split and save it to disk.

    train_data: dict of equally long columns. The 'image' column holds image
        file paths; the remaining columns must match the Features declared below.
    output_dir: directory passed to DatasetDict.save_to_disk.
    image_resize: edge length in pixels; every image is resized to
        image_resize x image_resize (the aspect ratio is NOT preserved).

    Returns the created DatasetDict.
    Raises FileNotFoundError if an image path cannot be read by OpenCV.
    """
    def process_split(split_data, image_resize):
        """Return a shallow copy of split_data whose 'image' paths are replaced by resized RGB arrays."""
        processed_data = split_data.copy()
        images = []
        for img_path in split_data['image']:
            img = cv2.imread(img_path)
            # cv2.imread does not raise on a missing/corrupt file, it returns None.
            # Fail here with the offending path instead of a cryptic cvtColor error.
            if img is None:
                raise FileNotFoundError(f"could not read image: {img_path}")
            # OpenCV loads images as BGR; convert to RGB before storing
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            img = cv2.resize(img, (image_resize, image_resize), interpolation=cv2.INTER_AREA)
            images.append(img)

        processed_data['image'] = images
        return processed_data

    dataset = DatasetDict({
        'train': Dataset.from_dict(
            process_split(train_data, image_resize),
            features=Features({
                'id': Value('string'),
                'problem': Value('string'),
                'solution': Value('string'),
                'image': Image(),
                'img_height': Value('int64'),
                'img_width': Value('int64'),
                'object_part': Value('bool'),
                'object_hint_boxes': Value('string'),
                'baseline_iou': Value('float32'),
            })
        )
    })

    dataset.save_to_disk(output_dir)
    print(f"saved to: {output_dir}")

    return dataset

In [5]:
# the following json file has been created in create_instructpart_for_segzero.ipynb 
data_path = "/data/VLMGroundingProject/Datasets/InstructPart/train1800/instructpart_train_metadata_for_rl.json"
# use a context manager so the file handle is closed right after parsing
with open(data_path, 'r') as metadata_file:
    data = json.load(metadata_file)

# images are resized to image_resize x image_resize; box/point coordinates are
# expressed on a prediction_grid_size x prediction_grid_size grid
image_resize = 1024
prediction_grid_size = 1000

# per-sample columns of the dataset, filled by a later cell
id_list = []
problem_list = []
solution_list = []
image_list = []
img_height_list = []
img_width_list = []
# create an object_part_list. add True for all entries since this is instructpart dataset
object_part_list = [True] * len(data)
object_boxes_list = []
baseline_iou_list = []

In [6]:
# read object_hint data 
object_boxes_data_path = "/data/VLMGroundingProject/Datasets/InstructPart/train1800/groundingdino_object_boxes_on_instrucpart_train.json"
# use a context manager so the file handle is closed right after parsing
with open(object_boxes_data_path, 'r') as object_boxes_file:
    object_boxes_data = json.load(object_boxes_file)
# create mapping from image_name to object boxes
# a single all-zero box is the "nothing detected" placeholder and is stored as None
image_name_to_object_boxes = {}
for item in object_boxes_data:
    if item['boxes'] != [[0, 0, 0, 0]]:
        image_name_to_object_boxes[item['image_filename']] = item['boxes']
    else:
        print(f"Warning: No valid boxes found for {item['image_filename']}")
        image_name_to_object_boxes[item['image_filename']] = None



In [7]:
# read sam3 baseline boxes data 
# the predictions are split over two files; context managers close the handles after parsing
sam3_boxes_data_path0 = "/data/VLMGroundingProject/Datasets/InstructPart/train1800/sam3_bboxes_0.json"
with open(sam3_boxes_data_path0, 'r') as sam3_file0:
    sam3_boxes_data0 = json.load(sam3_file0)
sam3_boxes_data_path1 = "/data/VLMGroundingProject/Datasets/InstructPart/train1800/sam3_bboxes_1.json"
with open(sam3_boxes_data_path1, 'r') as sam3_file1:
    sam3_boxes_data1 = json.load(sam3_file1)
sam3_boxes_data = sam3_boxes_data0 + sam3_boxes_data1

# create mapping from image_name to sam3 boxes (None when sam3 produced no usable box)
# NOTE(review): the original check only recognised a flat [0, 0, 0, 0] placeholder.
# A nested [[0, 0, 0, 0]] (the form used by the object-hint boxes), an empty list
# or a null entry is treated as "no prediction" as well — confirm against the
# actual layout of the sam3 json files.
no_box = [0, 0, 0, 0]
image_name_to_sam3_boxes = {}
for item in sam3_boxes_data:
    pred_bboxes = item['pred_bboxes']
    has_valid_prediction = bool(pred_bboxes) and pred_bboxes != no_box and pred_bboxes != [no_box]
    if has_valid_prediction:
        image_name_to_sam3_boxes[item['image_name']] = pred_bboxes
    else:
        print(f"Warning: No valid SAM3 boxes found for {item['image_name']}")
        image_name_to_sam3_boxes[item['image_name']] = None



In [None]:
# compute baseline iou (sam3 boxes vs gt boxes) and add it to the dataset right here so we don't have to do this during training 

import numpy as np
from scipy.optimize import linear_sum_assignment

def batch_iou(boxes1, boxes2):
    """
    Pairwise IoU between two sets of [x1, y1, x2, y2] boxes.

    boxes1: array with M rows, boxes2: array with N rows (4 columns each).
    Returns an (M, N) array whose entry [i, j] is the IoU of boxes1[i] and boxes2[j].
    Widths and heights are computed as (max - min + 1), i.e. both box edges are
    counted as inside the box.
    """
    # columns of boxes1 stay (M,1); columns of boxes2 are turned into (1,N)
    # so that every binary operation below broadcasts to (M,N)
    left1, top1, right1, bottom1 = np.split(boxes1, 4, axis=1)
    left2, top2, right2, bottom2 = (
        np.transpose(column) for column in np.split(boxes2, 4, axis=1)
    )

    # overlap extent per axis, clamped at 0 for disjoint boxes
    overlap_w = np.maximum(0, np.minimum(right1, right2) - np.maximum(left1, left2) + 1)
    overlap_h = np.maximum(0, np.minimum(bottom1, bottom2) - np.maximum(top1, top2) + 1)
    intersection = overlap_w * overlap_h

    area1 = (right1 - left1 + 1) * (bottom1 - top1 + 1)  # (M,1)
    area2 = (right2 - left2 + 1) * (bottom2 - top2 + 1)  # (1,N)
    union = area1 + area2 - intersection

    # the lower bound on the union guards against division by zero
    return intersection / np.clip(union, a_min=1e-9, a_max=None)

def compute_hungarian_match_and_average_iou(pred_bboxes: np.ndarray, gt_bboxes: np.ndarray) -> float:
    """
    Match predicted and ground-truth boxes one-to-one and return the mean IoU of the matches.

    pred_bboxes: M predicted boxes, each [x1, y1, x2, y2]
    gt_bboxes: N ground-truth boxes, each [x1, y1, x2, y2]

    The assignment maximises the total IoU (Hungarian matching on 1 - IoU). The mean
    is taken over the matched pairs only, so surplus predictions or surplus
    ground-truth boxes do not lower the score.

    Returns 1.0 if both inputs are empty, 0.0 if exactly one of them is empty, and
    0.0 (after printing the error) if anything goes wrong during the computation.
    The result is always a np.float64 — a float subclass that also provides
    .item() — so every code path hands the caller the same type.
    """
    try:
        M, N = len(pred_bboxes), len(gt_bboxes)
        if M == 0 and N == 0:
            # no objects queried and none predicted, so it is correct: full reward
            return np.float64(1.0)
        if M == 0 or N == 0:
            # no initial prediction or no target, no iou
            return np.float64(0.0)

        iou_matrix = batch_iou(pred_bboxes, gt_bboxes)  # (M,N)

        cost_matrix = 1.0 - iou_matrix  # hungarian match on iou
        row_ind, col_ind = linear_sum_assignment(cost_matrix)

        # Compute average IoU over matched pairs
        avg_iou = 0.0
        if len(row_ind) > 0:
            avg_iou = iou_matrix[row_ind, col_ind].mean()
    except Exception as e:
        # best effort: a malformed sample scores 0 instead of aborting the whole run
        print("Caught error in compute_hungarian_match_and_average_iou:", e)
        avg_iou = 0.0

    return np.float64(avg_iou)

# map every image to the average IoU of its sam3 boxes against the ground-truth boxes
# NOTE(review): both box sets are used as stored, without rescaling — this assumes the
# sam3 boxes and the ground-truth boxes share the same coordinate system; confirm.
image_name_to_baseline_avg_iou = {}
for item in tqdm(data):
    image_name = item['image_name']
    gt_boxes = item['bboxes']
    sam3_boxes = image_name_to_sam3_boxes[image_name]
    if sam3_boxes is None:
        # sam3 produced no usable box for this image
        image_name_to_baseline_avg_iou[image_name] = 0.0
        continue

    avg_iou = compute_hungarian_match_and_average_iou(np.array(sam3_boxes), np.array(gt_boxes))
    # float() accepts both numpy scalars and plain Python floats; the helper returns
    # a plain float on its empty-input / error paths, where .item() would raise
    image_name_to_baseline_avg_iou[image_name] = float(avg_iou)

1800it [00:00, 25702.58it/s]


In [9]:
# Layout of one entry of `data` (as written by create_instructpart_for_segzero.ipynb):
# data_entry = {
#     "image_path": image_path,
#     "image_name": image_name,
#     "image_id": image_id,
#     "object_name": object_name,
#     "part_name": part_name,
#     "bboxes": bboxes,
#     "midpoints": midpoints,
#     "image_width": image_width,
#     "image_height": image_height,
#     "num_segments": len(bboxes)
# }

# Start from empty columns every time this cell runs; appending to lists created in
# an earlier cell would duplicate every sample when the cell is executed twice.
id_list = []
problem_list = []
solution_list = []
image_list = []
img_height_list = []
img_width_list = []
object_boxes_list = []
baseline_iou_list = []

for item in tqdm(data):
    id_list.append(item['image_id'])

    # construct problem string
    # e.g., "cabinet's handle" or "bus' wheel"
    if item['object_name'].strip().endswith('s'):
        problem = f"{item['object_name']}' {item['part_name']}"
    else:
        problem = f"{item['object_name']}'s {item['part_name']}"
    problem_list.append(problem)

    image_list.append(item['image_path'])

    # the image is only read here to obtain its original size
    image = cv2.imread(item['image_path'])
    # cv2.imread returns None instead of raising when the file cannot be read
    if image is None:
        raise FileNotFoundError(f"could not read image: {item['image_path']}")
    height, width = image.shape[:2]

    img_height_list.append(height)
    img_width_list.append(width)

    # factors that map original pixel coordinates onto the prediction grid
    x_factor = prediction_grid_size / width
    y_factor = prediction_grid_size / height
    solution = []

    # TODO: Does it have to be bboxes_2d and points_2d? How is Qwen_2_5 trained? What about Qwen3
    # the scale methods will scale the coordinates to prediction grid size. for qwen3, this will be 1000x1000
    for box_idx, gt_box in enumerate(item['bboxes']):
        solution.append({
            "bbox_2d": scale_box_coordinates(gt_box, x_factor, y_factor), # [x1, y1, x2, y2]
            "point_2d": scale_point_coordinates(item['midpoints'][box_idx], x_factor, y_factor) # [x, y]
        })
    solution_list.append(json.dumps(solution))

    unscaled_object_boxes = image_name_to_object_boxes[item['image_name']]
    if unscaled_object_boxes is not None:
        # scale the boxes to the prediction grid size. for qwen3, this will be 1000x1000
        scaled_object_boxes = [
            scale_box_coordinates(box, x_factor, y_factor)
            for box in unscaled_object_boxes
        ]
        object_boxes_list.append(json.dumps(scaled_object_boxes))
    else:
        # no object box for this image: store None (not an empty list);
        # this has to be handled during training
        object_boxes_list.append(None)

    # the sam3 baseline boxes themselves are not stored in the dataset,
    # only their precomputed average IoU against the ground truth
    baseline_iou = image_name_to_baseline_avg_iou[item['image_name']]
    baseline_iou_list.append(baseline_iou)


train_data = {
    'id': id_list,
    'problem': problem_list,
    'solution': solution_list,
    'image': image_list,
    'img_height': img_height_list,
    'img_width': img_width_list,
    'object_part': object_part_list,
    'object_hint_boxes': object_boxes_list,
    'baseline_iou': baseline_iou_list
}

1800it [00:07, 228.00it/s]


In [10]:
# Build the HF dataset: create_local_dataset resizes every image to
# image_resize x image_resize and saves the result to the given directory.
dataset = create_local_dataset(
    train_data,
    "/data/VLMGroundingProject/Datasets/InstructPart/train1800/instructpart_train_dataset_for_qwen3",
    image_resize,
)

Saving the dataset (4/4 shards): 100%|██████████| 1800/1800 [00:04<00:00, 361.86 examples/s]

saved to: /data/VLMGroundingProject/Datasets/InstructPart/train1800/instructpart_train_dataset_for_qwen3



