In [2]:
import ast

import numpy as np
import pandas as pd
import torch

In [36]:

import math
from typing import List, Dict, Tuple, Union

BBox = Union[Dict[str, float], Dict[str, int], Tuple[float, float, float, float], List[float]]

def _coerce_bbox(b: BBox) -> Tuple[float, float, float, float]:
    if isinstance(b, (list, tuple)) and len(b) == 4:
        return float(b[0]), float(b[1]), float(b[2]), float(b[3])
    if isinstance(b, dict):
        keys = {k.lower(): k for k in b.keys()}
        def g(k):
            for cand in (k, k.replace('main', 'min')):  # tolerate 'ymain' -> 'ymin'
                if cand in keys:
                    return float(b[keys[cand]])
            raise KeyError(f"Missing key {k} in bbox dict: {b}")
        return g('xmin'), g('ymin'), g('xmax'), g('ymax')
    raise TypeError(f"Unsupported bbox format: {type(b)}")

def bbox_patch_binary_masks(
    image_w: int,
    image_h: int,
    bboxes: Union[BBox, List[BBox]],
    patch_w: int = 14,
    patch_h: int = None,
    *,
    inclusive_xymax: bool = False
) -> List[List[int]]:
    """
    For a W×H image split into patch_w×patch_h patches (row-major),
    return, for each bbox, a flat 0/1 list of length (num_rows*num_cols)
    marking patches that intersect the bbox.

    Args:
        image_w, image_h: image size in pixels (e.g., 336, 336)
        bboxes: single bbox or list of bboxes. Each bbox can be:
                - dict with keys xmin,ymin,xmax,ymax (case-insensitive; 'ymain' tolerated)
                - list/tuple [xmin, ymin, xmax, ymax]
        patch_w, patch_h: patch size in pixels (default 14×14). If patch_h is None, uses patch_w.
        inclusive_xymax: treat (xmax,ymax) as inclusive if True; exclusive if False.

    Returns:
        List of binary masks (one per bbox). Each mask is a list[int] of length num_rows*num_cols.
        Indexing is row-major: idx = row * num_cols + col.
    """
    if patch_h is None:
        patch_h = patch_w

    num_cols = math.ceil(image_w / patch_w)
    num_rows = math.ceil(image_h / patch_h)
    total_patches = num_rows * num_cols

    # Normalize to list of bboxes
    if isinstance(bboxes, list) and not (len(bboxes) == 4 and all(isinstance(v, (int, float)) for v in bboxes)):
        bbox_list = bboxes
    else:
        bbox_list = [bboxes]

    masks: List[List[int]] = []

    for b in bbox_list:
        x0, y0, x1, y1 = _coerce_bbox(b)
        if inclusive_xymax:
            x1 += 1.0
            y1 += 1.0

        # Clamp to image bounds
        x0 = max(0.0, min(x0, float(image_w)))
        y0 = max(0.0, min(y0, float(image_h)))
        x1 = max(0.0, min(x1, float(image_w)))
        y1 = max(0.0, min(y1, float(image_h)))

        # Degenerate -> all zeros
        if x1 <= x0 or y1 <= y0:
            masks.append([0] * total_patches)
            continue

        # Candidate patch span
        col_start = int(math.floor(x0 / patch_w))
        col_end   = int(math.ceil (x1 / patch_w) - 1)
        row_start = int(math.floor(y0 / patch_h))
        row_end   = int(math.ceil (y1 / patch_h) - 1)

        col_start = max(0, min(col_start, num_cols - 1))
        col_end   = max(0, min(col_end,   num_cols - 1))
        row_start = max(0, min(row_start, num_rows - 1))
        row_end   = max(0, min(row_end,   num_rows - 1))

        mask = [0] * total_patches

        # Mark intersecting patches
        for r in range(row_start, row_end + 1):
            for c in range(col_start, col_end + 1):
                patch_x0 = c * patch_w
                patch_y0 = r * patch_h
                patch_x1 = min(patch_x0 + patch_w, image_w)
                patch_y1 = min(patch_y0 + patch_h, image_h)

                inter_w = max(0.0, min(x1, patch_x1) - max(x0, patch_x0))
                inter_h = max(0.0, min(y1, patch_y1) - max(y0, patch_y0))
                if inter_w > 0.0 and inter_h > 0.0:
                    idx = r * num_cols + c
                    mask[idx] = 1

        masks.append(mask)

    return masks



def combine_mask_tensor(masks: List[List[int]], *, out_dtype=torch.uint64) -> torch.Tensor:
    if len(masks) == 0:
        return torch.empty((0,), dtype=out_dtype)
    mask_tensor = torch.as_tensor(masks, dtype=out_dtype)
    
    if mask_tensor.numel() == 0:
        # Empty input -> empty output
        return torch.empty((mask_tensor.shape[-1] if mask_tensor.ndim == 2 else 0,), 
                           dtype=out_dtype, device=mask_tensor.device)

    # Normalize to bool
    mask_bool = (mask_tensor != 0) if mask_tensor.dtype != torch.bool else mask_tensor
    # Row-wise OR -> single row
    combined_bool = torch.any(mask_bool, dim=0)
    return combined_bool.to(out_dtype)


The history saving thread hit an unexpected error (OperationalError('database or disk is full')).History will not be written to the database.


In [15]:
# Load the bounding-box annotation CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path — consider deriving it
# from a configurable data directory so the notebook runs elsewhere.
data = pd.read_csv("/Data2/Arun-UAV/NLP/vision_halu/evidence_head_train_datasets/coco_long_captions/coco_img_des_10k_bb_annot.csv")

In [17]:
# Preview the first two rows to check the column layout.
data.head(2)

Unnamed: 0,image_id,question,answer,target_words,image_path
0,COCO_train2014_000000557315.jpg,Please describe this image in detail.,This outdoor scene captures a black bear in wh...,"[{'word': 'black', 'class': 'attribute', 'bbox...",/Data2/Arun-UAV/NLP/vision_halu/train_datasets...
1,COCO_train2014_000000106639.jpg,Please describe this image in detail.,"The image presents an inviting indoor scene, l...","[{'word': 'dining', 'class': 'attribute', 'bbo...",/Data2/Arun-UAV/NLP/vision_halu/train_datasets...


In [50]:
# target_words is stored as the text of a Python literal (a list of dicts).
# Parse it with ast.literal_eval rather than eval so that file content can
# never execute arbitrary code.
ast.literal_eval(data["target_words"].iloc[100])[3]["bbox"]

[{'xmin': 174, 'ymin': 308, 'xmax': 234, 'ymax': 458},
 {'xmin': 107, 'ymin': 347, 'xmax': 164, 'ymax': 487},
 {'xmin': 276, 'ymin': 268, 'xmax': 303, 'ymax': 305},
 {'xmin': 291, 'ymin': 278, 'xmax': 324, 'ymax': 308}]

In [51]:

# Parse the stored literal safely (ast.literal_eval instead of eval), then take
# the bbox list of the 4th target word of row 100.
boxes = ast.literal_eval(data["target_words"].iloc[100])[3]["bbox"]
# NOTE(review): the boxes printed for this row reach y=487, beyond the 336x336
# size passed here — presumably they are in original-image pixels and need
# rescaling first; otherwise they are clamped (or zeroed when fully outside).
masks = bbox_patch_binary_masks(bboxes=boxes, image_w=336, image_h=336, patch_w=14)
# OR all per-box masks into one patch mask for the word.
combined_from_lists = combine_mask_tensor(masks)
