diff --git a/supervision/detection/core.py b/supervision/detection/core.py index bda2e7de3..e5e298bbb 100644 --- a/supervision/detection/core.py +++ b/supervision/detection/core.py @@ -47,6 +47,7 @@ from_moondream, from_paligemma, from_qwen_2_5_vl, + from_qwen_3_vl, validate_vlm_parameters, ) from supervision.geometry.core import Position @@ -951,6 +952,36 @@ def from_lmm(cls, lmm: LMM | str, result: str | dict, **kwargs: Any) -> Detectio # array([0, 1]) ``` + !!! example "Qwen3-VL" + + ```python + import supervision as sv + + qwen_3_vl_result = \"\"\"```json + [ + {"bbox_2d": [139, 768, 315, 954], "label": "cat"}, + {"bbox_2d": [366, 679, 536, 849], "label": "dog"} + ] + ```\"\"\" + detections = sv.Detections.from_lmm( + sv.LMM.QWEN_3_VL, + qwen_3_vl_result, + resolution_wh=(1000, 1000), + classes=['cat', 'dog'], + ) + detections.xyxy + # array([[139., 768., 315., 954.], [366., 679., 536., 849.]]) + + detections.class_id + # array([0, 1]) + + detections.data + # {'class_name': array(['cat', 'dog'], dtype=' Detectio | PaliGemma | `PALIGEMMA` | detection | `resolution_wh` | `classes` | | PaliGemma 2 | `PALIGEMMA` | detection | `resolution_wh` | `classes` | | Qwen2.5-VL | `QWEN_2_5_VL` | detection | `resolution_wh`, `input_wh` | `classes` | + | Qwen3-VL | `QWEN_3_VL` | detection | `resolution_wh` | `classes` | | Google Gemini 2.0 | `GOOGLE_GEMINI_2_0` | detection | `resolution_wh` | `classes` | | Google Gemini 2.5 | `GOOGLE_GEMINI_2_5` | detection, segmentation | `resolution_wh` | `classes` | | Moondream | `MOONDREAM` | detection | `resolution_wh` | | @@ -1328,6 +1360,36 @@ def from_vlm(cls, vlm: VLM | str, result: str | dict, **kwargs: Any) -> Detectio # array([0, 1]) ``` + !!! 
example "Qwen3-VL" + + ```python + import supervision as sv + + qwen_3_vl_result = \"\"\"```json + [ + {"bbox_2d": [139, 768, 315, 954], "label": "cat"}, + {"bbox_2d": [366, 679, 536, 849], "label": "dog"} + ] + ```\"\"\" + detections = sv.Detections.from_vlm( + sv.VLM.QWEN_3_VL, + qwen_3_vl_result, + resolution_wh=(1000, 1000), + classes=['cat', 'dog'], + ) + detections.xyxy + # array([[139., 768., 315., 954.], [366., 679., 536., 849.]]) + + detections.class_id + # array([0, 1]) + + detections.data + # {'class_name': array(['cat', 'dog'], dtype=' Detectio if vlm == VLM.QWEN_2_5_VL: xyxy, class_id, class_name = from_qwen_2_5_vl(result, **kwargs) data = {CLASS_NAME_DATA_FIELD: class_name} - return cls(xyxy=xyxy, class_id=class_id, data=data) + confidence = np.ones(len(xyxy), dtype=float) + return cls(xyxy=xyxy, class_id=class_id, confidence=confidence, data=data) + + if vlm == VLM.QWEN_3_VL: + xyxy, class_id, class_name = from_qwen_3_vl(result, **kwargs) + data = {CLASS_NAME_DATA_FIELD: class_name} + confidence = np.ones(len(xyxy), dtype=float) + return cls(xyxy=xyxy, class_id=class_id, confidence=confidence, data=data) if vlm == VLM.DEEPSEEK_VL_2: xyxy, class_id, class_name = from_deepseek_vl_2(result, **kwargs) diff --git a/supervision/detection/vlm.py b/supervision/detection/vlm.py index 2f9b60ddb..97988c9f0 100644 --- a/supervision/detection/vlm.py +++ b/supervision/detection/vlm.py @@ -1,5 +1,6 @@ from __future__ import annotations +import ast import base64 import io import json @@ -27,7 +28,8 @@ class LMM(Enum): Attributes: PALIGEMMA: Google's PaliGemma vision-language model. FLORENCE_2: Microsoft's Florence-2 vision-language model. - QWEN_2_5_VL: Qwen2.5-VL open vision-language model from Alibaba. + QWEN_2_5_VL: Qwen2.5-VL open vision-language model from Alibaba. + QWEN_3_VL: Qwen3-VL open vision-language model from Alibaba. GOOGLE_GEMINI_2_0: Google Gemini 2.0 vision-language model. GOOGLE_GEMINI_2_5: Google Gemini 2.5 vision-language model. 
MOONDREAM: The Moondream vision-language model. @@ -36,6 +38,7 @@ class LMM(Enum): PALIGEMMA = "paligemma" FLORENCE_2 = "florence_2" QWEN_2_5_VL = "qwen_2_5_vl" + QWEN_3_VL = "qwen_3_vl" DEEPSEEK_VL_2 = "deepseek_vl_2" GOOGLE_GEMINI_2_0 = "gemini_2_0" GOOGLE_GEMINI_2_5 = "gemini_2_5" @@ -69,6 +72,7 @@ class VLM(Enum): PALIGEMMA: Google's PaliGemma vision-language model. FLORENCE_2: Microsoft's Florence-2 vision-language model. QWEN_2_5_VL: Qwen2.5-VL open vision-language model from Alibaba. + QWEN_3_VL: Qwen3-VL open vision-language model from Alibaba. GOOGLE_GEMINI_2_0: Google Gemini 2.0 vision-language model. GOOGLE_GEMINI_2_5: Google Gemini 2.5 vision-language model. MOONDREAM: The Moondream vision-language model. @@ -77,6 +81,7 @@ class VLM(Enum): PALIGEMMA = "paligemma" FLORENCE_2 = "florence_2" QWEN_2_5_VL = "qwen_2_5_vl" + QWEN_3_VL = "qwen_3_vl" DEEPSEEK_VL_2 = "deepseek_vl_2" GOOGLE_GEMINI_2_0 = "gemini_2_0" GOOGLE_GEMINI_2_5 = "gemini_2_5" @@ -106,6 +111,7 @@ def from_value(cls, value: VLM | str) -> VLM: VLM.PALIGEMMA: str, VLM.FLORENCE_2: dict, VLM.QWEN_2_5_VL: str, + VLM.QWEN_3_VL: str, VLM.DEEPSEEK_VL_2: str, VLM.GOOGLE_GEMINI_2_0: str, VLM.GOOGLE_GEMINI_2_5: str, @@ -116,6 +122,7 @@ def from_value(cls, value: VLM | str) -> VLM: VLM.PALIGEMMA: ["resolution_wh"], VLM.FLORENCE_2: ["resolution_wh"], VLM.QWEN_2_5_VL: ["input_wh", "resolution_wh"], + VLM.QWEN_3_VL: ["resolution_wh"], VLM.DEEPSEEK_VL_2: ["resolution_wh"], VLM.GOOGLE_GEMINI_2_0: ["resolution_wh"], VLM.GOOGLE_GEMINI_2_5: ["resolution_wh"], @@ -126,6 +133,7 @@ def from_value(cls, value: VLM | str) -> VLM: VLM.PALIGEMMA: ["resolution_wh", "classes"], VLM.FLORENCE_2: ["resolution_wh"], VLM.QWEN_2_5_VL: ["input_wh", "resolution_wh", "classes"], + VLM.QWEN_3_VL: ["resolution_wh", "classes"], VLM.DEEPSEEK_VL_2: ["resolution_wh", "classes"], VLM.GOOGLE_GEMINI_2_0: ["resolution_wh", "classes"], VLM.GOOGLE_GEMINI_2_5: ["resolution_wh", "classes"], @@ -235,6 +243,51 @@ def from_paligemma( return xyxy, 
class_id, class_name +def recover_truncated_qwen_2_5_vl_response(text: str) -> Any | None: + """ + Attempt to recover and parse a truncated or malformed JSON snippet from Qwen-2.5-VL + output. + + This utility extracts a JSON-like portion from a string that may be truncated or + malformed, cleans trailing commas, and attempts to parse it into a Python object. + + Args: + text (str): Raw text containing the JSON snippet possibly truncated or + incomplete. + + Returns: + Parsed Python object (usually list) if recovery and parsing succeed; + otherwise `None`. + """ + try: + first_bracket = text.find("[") + if first_bracket == -1: + return None + snippet = text[first_bracket:] + + last_brace = snippet.rfind("}") + if last_brace == -1: + return None + + snippet = snippet[: last_brace + 1] + + prefix_end = snippet.find("[") + if prefix_end == -1: + return None + + prefix = snippet[: prefix_end + 1] + body = snippet[prefix_end + 1 :].rstrip() + + if body.endswith(","): + body = body[:-1].rstrip() + + repaired = prefix + body + "]" + + return json.loads(repaired) + except Exception: + return None + + def from_qwen_2_5_vl( result: str, input_wh: tuple[int, int], @@ -242,7 +295,7 @@ def from_qwen_2_5_vl( classes: list[str] | None = None, ) -> tuple[np.ndarray, np.ndarray | None, np.ndarray]: """ - Parse and scale bounding boxes from Qwen-2.5-VL style JSON output. + Parse and rescale bounding boxes and class labels from Qwen-2.5-VL JSON output. The JSON is expected to be enclosed in triple backticks with the format: ```json @@ -253,38 +306,52 @@ def from_qwen_2_5_vl( ``` Args: - result: String containing the JSON snippet enclosed by triple backticks. - input_wh: (input_width, input_height) describing the original bounding box - scale. - resolution_wh: (output_width, output_height) to which we rescale the boxes. - classes: Optional list of valid class names. If provided, returned boxes/labels - are filtered to only those classes found here. 
+ result (str): String containing Qwen-2.5-VL JSON bounding box and label data. + input_wh (tuple[int, int]): Width and height of the coordinate space where boxes + are normalized. + resolution_wh (tuple[int, int]): Target width and height to scale bounding + boxes. + classes (list[str] or None): Optional list of valid class names to filter + results. If provided, only boxes with labels in this list are returned. Returns: - xyxy (np.ndarray): An array of shape `(n, 4)` containing - the bounding boxes coordinates in format `[x1, y1, x2, y2]` - class_id (Optional[np.ndarray]): An array of shape `(n,)` containing - the class indices for each bounding box (or None if `classes` is not - provided) - class_name (np.ndarray): An array of shape `(n,)` containing - the class labels for each bounding box + xyxy (np.ndarray): Array of shape `(N, 4)` with rescaled bounding boxes in + `(x_min, y_min, x_max, y_max)` format. + class_id (np.ndarray or None): Array of shape `(N,)` with indices of classes, + or `None` if no filtering applied. + class_name (np.ndarray): Array of shape `(N,)` with class names as strings. 
def from_qwen_3_vl(
    result: str,
    resolution_wh: tuple[int, int],
    classes: list[str] | None = None,
) -> tuple[np.ndarray, np.ndarray | None, np.ndarray]:
    """
    Parse Qwen-3-VL style JSON output and scale its boxes to `resolution_wh`.

    Parsing is delegated to `from_qwen_2_5_vl`, with the input coordinate space
    fixed to a 1000x1000 grid, so callers only provide the target resolution.

    Args:
        result (str): String containing the Qwen-3-VL JSON output.
        resolution_wh (tuple[int, int]): Target resolution `(width, height)`
            the bounding boxes are scaled to.
        classes (list[str] or None): Optional list of valid class names,
            forwarded unchanged to `from_qwen_2_5_vl`.

    Returns:
        xyxy (np.ndarray): Bounding boxes of shape `(N, 4)` in
            `(x_min, y_min, x_max, y_max)` format, scaled to `resolution_wh`.
        class_id (np.ndarray or None): Class indices exactly as returned by
            `from_qwen_2_5_vl`.
        class_name (np.ndarray): Class names as strings.
    """
    # Coordinate space handed to the Qwen-2.5-VL parser as `input_wh`.
    qwen_3_vl_input_wh = (1000, 1000)

    parsed = from_qwen_2_5_vl(
        result=result,
        input_wh=qwen_3_vl_input_wh,
        resolution_wh=resolution_wh,
        classes=classes,
    )
    return parsed
test_from_paligemma( (640, 640), (1280, -100), None, - None, - ), # negative resolution height -> ValueError + None, # invalid resolution_wh + ), ], ) def test_from_qwen_2_5_vl(