From 2edcbdc4872f83abcb7992f350afe338c9325958 Mon Sep 17 00:00:00 2001 From: SkalskiP Date: Sat, 15 Nov 2025 21:27:55 +0100 Subject: [PATCH 01/12] initial commit with `from_qwen_3_vl` added, `from_qwen_2_5_vl` improved --- supervision/detection/core.py | 1 + supervision/detection/vlm.py | 137 ++++++++++++++++++++++++++++------ test/detection/test_vlm.py | 45 ++++++++--- 3 files changed, 151 insertions(+), 32 deletions(-) diff --git a/supervision/detection/core.py b/supervision/detection/core.py index bda2e7de3..66610b998 100644 --- a/supervision/detection/core.py +++ b/supervision/detection/core.py @@ -1211,6 +1211,7 @@ def from_vlm(cls, vlm: VLM | str, result: str | dict, **kwargs: Any) -> Detectio | PaliGemma | `PALIGEMMA` | detection | `resolution_wh` | `classes` | | PaliGemma 2 | `PALIGEMMA` | detection | `resolution_wh` | `classes` | | Qwen2.5-VL | `QWEN_2_5_VL` | detection | `resolution_wh`, `input_wh` | `classes` | + | Qwen3-VL | `QWEN_3_VL` | detection | `resolution_wh`, | `classes` | | Google Gemini 2.0 | `GOOGLE_GEMINI_2_0` | detection | `resolution_wh` | `classes` | | Google Gemini 2.5 | `GOOGLE_GEMINI_2_5` | detection, segmentation | `resolution_wh` | `classes` | | Moondream | `MOONDREAM` | detection | `resolution_wh` | | diff --git a/supervision/detection/vlm.py b/supervision/detection/vlm.py index 2f9b60ddb..9892e40de 100644 --- a/supervision/detection/vlm.py +++ b/supervision/detection/vlm.py @@ -1,5 +1,6 @@ from __future__ import annotations +import ast import base64 import io import json @@ -27,7 +28,8 @@ class LMM(Enum): Attributes: PALIGEMMA: Google's PaliGemma vision-language model. FLORENCE_2: Microsoft's Florence-2 vision-language model. - QWEN_2_5_VL: Qwen2.5-VL open vision-language model from Alibaba. + QWEN_2_5_VL: Qwen2.5-VL open vision-language model from Alibaba.\ + QWEN_3_VL: Qwen3-VL open vision-language model from Alibaba. GOOGLE_GEMINI_2_0: Google Gemini 2.0 vision-language model. GOOGLE_GEMINI_2_5: Google Gemini 2.5 vision-language model. MOONDREAM: The Moondream vision-language model. @@ -36,6 +38,7 @@ class LMM(Enum): PALIGEMMA = "paligemma" FLORENCE_2 = "florence_2" QWEN_2_5_VL = "qwen_2_5_vl" + QWEN_3_VL = "qwen_3_vl" DEEPSEEK_VL_2 = "deepseek_vl_2" GOOGLE_GEMINI_2_0 = "gemini_2_0" GOOGLE_GEMINI_2_5 = "gemini_2_5" @@ -69,6 +72,7 @@ class VLM(Enum): PALIGEMMA: Google's PaliGemma vision-language model. FLORENCE_2: Microsoft's Florence-2 vision-language model. QWEN_2_5_VL: Qwen2.5-VL open vision-language model from Alibaba. + QWEN_3_VL: Qwen3-VL open vision-language model from Alibaba. GOOGLE_GEMINI_2_0: Google Gemini 2.0 vision-language model. GOOGLE_GEMINI_2_5: Google Gemini 2.5 vision-language model. MOONDREAM: The Moondream vision-language model. @@ -77,6 +81,7 @@ class VLM(Enum): PALIGEMMA = "paligemma" FLORENCE_2 = "florence_2" QWEN_2_5_VL = "qwen_2_5_vl" + QWEN_3_VL = "qwen_3_vl" DEEPSEEK_VL_2 = "deepseek_vl_2" GOOGLE_GEMINI_2_0 = "gemini_2_0" GOOGLE_GEMINI_2_5 = "gemini_2_5" @@ -106,6 +111,7 @@ def from_value(cls, value: VLM | str) -> VLM: VLM.PALIGEMMA: str, VLM.FLORENCE_2: dict, VLM.QWEN_2_5_VL: str, + VLM.QWEN_3_VL: str, VLM.DEEPSEEK_VL_2: str, VLM.GOOGLE_GEMINI_2_0: str, VLM.GOOGLE_GEMINI_2_5: str, @@ -116,6 +122,7 @@ def from_value(cls, value: VLM | str) -> VLM: VLM.PALIGEMMA: ["resolution_wh"], VLM.FLORENCE_2: ["resolution_wh"], VLM.QWEN_2_5_VL: ["input_wh", "resolution_wh"], + VLM.QWEN_3_VL: ["resolution_wh"], VLM.DEEPSEEK_VL_2: ["resolution_wh"], VLM.GOOGLE_GEMINI_2_0: ["resolution_wh"], VLM.GOOGLE_GEMINI_2_5: ["resolution_wh"], @@ -126,6 +133,7 @@ def from_value(cls, value: VLM | str) -> VLM: VLM.PALIGEMMA: ["resolution_wh", "classes"], VLM.FLORENCE_2: ["resolution_wh"], VLM.QWEN_2_5_VL: ["input_wh", "resolution_wh", "classes"], + VLM.QWEN_3_VL: ["resolution_wh", "classes"], VLM.DEEPSEEK_VL_2: ["resolution_wh", "classes"], VLM.GOOGLE_GEMINI_2_0: ["resolution_wh", "classes"], VLM.GOOGLE_GEMINI_2_5: ["resolution_wh", "classes"], @@ -235,6 +243,51 @@ def from_paligemma( return xyxy, class_id, class_name +def recover_truncated_qwen_2_5_vl_response(text: str) -> Any | None: + """ + Attempt to recover and parse a truncated or malformed JSON snippet from Qwen-2.5-VL + output. + + This utility extracts a JSON-like portion from a string that may be truncated or + malformed, cleans trailing commas, and attempts to parse it into a Python object. + + Args: + text (str): Raw text containing the JSON snippet possibly truncated or + incomplete. + + Returns: + Parsed Python object (usually list) if recovery and parsing succeed; + otherwise `None`. + """ + try: + first_bracket = text.find("[") + if first_bracket == -1: + return None + snippet = text[first_bracket:] + + last_brace = snippet.rfind("}") + if last_brace == -1: + return None + + snippet = snippet[: last_brace + 1] + + prefix_end = snippet.find("[") + if prefix_end == -1: + return None + + prefix = snippet[: prefix_end + 1] + body = snippet[prefix_end + 1 :].rstrip() + + if body.endswith(","): + body = body[:-1].rstrip() + + repaired = prefix + body + "]" + + return json.loads(repaired) + except Exception: + return None + + def from_qwen_2_5_vl( result: str, input_wh: tuple[int, int], @@ -242,7 +295,7 @@ def from_qwen_2_5_vl( classes: list[str] | None = None, ) -> tuple[np.ndarray, np.ndarray | None, np.ndarray]: """ - Parse and scale bounding boxes from Qwen-2.5-VL style JSON output. + Parse and rescale bounding boxes and class labels from Qwen-2.5-VL JSON output. The JSON is expected to be enclosed in triple backticks with the format: ```json @@ -253,37 +306,47 @@ def from_qwen_2_5_vl( ``` Args: - result: String containing the JSON snippet enclosed by triple backticks. - input_wh: (input_width, input_height) describing the original bounding box - scale. - resolution_wh: (output_width, output_height) to which we rescale the boxes. - classes: Optional list of valid class names. If provided, returned boxes/labels - are filtered to only those classes found here. + result (str): String containing Qwen-2.5-VL JSON bounding box and label data. + input_wh (tuple[int, int]): Width and height of the coordinate space where boxes + are normalized. + resolution_wh (tuple[int, int]): Target width and height to scale bounding + boxes. + classes (list[str] or None): Optional list of valid class names to filter + results. If provided, only boxes with labels in this list are returned. Returns: - xyxy (np.ndarray): An array of shape `(n, 4)` containing - the bounding boxes coordinates in format `[x1, y1, x2, y2]` - class_id (Optional[np.ndarray]): An array of shape `(n,)` containing - the class indices for each bounding box (or None if `classes` is not - provided) - class_name (np.ndarray): An array of shape `(n,)` containing - the class labels for each bounding box + xyxy (np.ndarray): Array of shape `(N, 4)` with rescaled bounding boxes in + `(x_min, y_min, x_max, y_max)` format. + class_id (np.ndarray or None): Array of shape `(N,)` with indices of classes, + or `None` if no filtering applied. + class_name (np.ndarray): Array of shape `(N,)` with class names as strings. """ in_w, in_h = validate_resolution(input_wh) out_w, out_h = validate_resolution(resolution_wh) - pattern = re.compile(r"```json\s*(.*?)\s*```", re.DOTALL) - - match = pattern.search(result) - if not match: - return np.empty((0, 4)), None, np.empty((0,), dtype=str) + text = result.strip() + text = re.sub(r"^```(json)?", "", text, flags=re.IGNORECASE).strip() + text = re.sub(r"```$", "", text).strip() - json_snippet = match.group(1) + start = text.find("[") + end = text.rfind("]") + if start != -1 and end != -1 and end > start: + text = text[start: end + 1].strip() try: - data = json.loads(json_snippet) + data = json.loads(text) except json.JSONDecodeError: + repaired = recover_truncated_qwen_2_5_vl_response(text) + if repaired is not None: + data = repaired + else: + try: + data = ast.literal_eval(text) + except (ValueError, SyntaxError, TypeError): + return np.empty((0, 4)), None, np.empty((0,), dtype=str) + + if not isinstance(data, list): return np.empty((0, 4)), None, np.empty((0,), dtype=str) boxes_list = [] @@ -315,6 +378,36 @@ def from_qwen_2_5_vl( return xyxy, class_id, class_name +def from_qwen_3_vl( + result: str, + resolution_wh: tuple[int, int], + classes: list[str] | None = None, +) -> tuple[np.ndarray, np.ndarray | None, np.ndarray]: + """ + Parse and scale bounding boxes from Qwen-3-VL style JSON output. + + Args: + result (str): String containing the Qwen-3-VL JSON output. + resolution_wh (tuple[int, int]): Target resolution `(width, height)` to + scale bounding boxes. + classes (list[str] or None): Optional list of valid classes to filter + results. + + Returns: + xyxy (np.ndarray): Array of bounding boxes with shape `(N, 4)` in + `(x_min, y_min, x_max, y_max)` format scaled to `resolution_wh`. + class_id (np.ndarray or None): Array of class indices for each box, or + None if no filtering by classes. + class_name (np.ndarray): Array of class names as strings. + """ + return from_qwen_2_5_vl( + result=result, + input_wh=(1000, 1000), + resolution_wh=resolution_wh, + classes=classes + ) + + def from_deepseek_vl_2( result: str, resolution_wh: tuple[int, int], classes: list[str] | None = None ) -> tuple[np.ndarray, np.ndarray | None, np.ndarray]: diff --git a/test/detection/test_vlm.py b/test/detection/test_vlm.py index 8a8240e98..e93b66f3d 100644 --- a/test/detection/test_vlm.py +++ b/test/detection/test_vlm.py @@ -320,18 +320,43 @@ def test_from_paligemma( np.array(["dog"], dtype=str), ), ), # out-of-bounds box +( + does_not_raise(), + """[ + {'bbox_2d': [10, 20, 110, 120], 'label': 'cat'} + ]""", + (640, 640), + (1280, 720), + None, + ( + np.array([[20.0, 22.5, 220.0, 135.0]]), + None, + np.array(["cat"], dtype=str), + ), + ), # python-style list, single quotes, no fences ( - pytest.raises(ValueError), + does_not_raise(), """```json [ - {"bbox_2d": [10, 20, 110, 120], "label": "cat"} - ] - ```""", - (0, 640), - (1280, 720), + {"bbox_2d": [0, 0, 64, 64], "label": "dog"}, + {"bbox_2d": [10, 20, 110, 120], "label": "cat"}, + {"bbox_2d": [30, 40, 130, 140], "label": + """, + (640, 640), + (640, 640), None, - None, # won't be compared because we expect an exception - ), # zero input width -> ValueError + ( + np.array( + [ + [0.0, 0.0, 64.0, 64.0], + [10.0, 20.0, 110.0, 120.0], + ], + dtype=float, + ), + None, + np.array(["dog", "cat"], dtype=str), + ), + ), # truncated response, last object unfinished, previous ones recovered ( pytest.raises(ValueError), """```json @@ -342,8 +367,8 @@ def test_from_paligemma( (640, 640), (1280, -100), None, - None, - ), # negative resolution height -> ValueError + None, # invalid resolution_wh + ), ], ) def test_from_qwen_2_5_vl( From e68c908077bb8bf901478bc087458fd218abb69f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:29:32 +0000 Subject: [PATCH 02/12] =?UTF-8?q?fix(pre=5Fcommit):=20=F0=9F=8E=A8=20auto?= =?UTF-8?q?=20format=20pre-commit=20hooks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- supervision/detection/vlm.py | 4 ++-- test/detection/test_vlm.py | 20 ++++++++++---------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/supervision/detection/vlm.py b/supervision/detection/vlm.py index 9892e40de..371827668 100644 --- a/supervision/detection/vlm.py +++ b/supervision/detection/vlm.py @@ -332,7 +332,7 @@ def from_qwen_2_5_vl( start = text.find("[") end = text.rfind("]") if start != -1 and end != -1 and end > start: - text = text[start: end + 1].strip() + text = text[start : end + 1].strip() try: data = json.loads(text) @@ -404,7 +404,7 @@ def from_qwen_3_vl( result=result, input_wh=(1000, 1000), resolution_wh=resolution_wh, - classes=classes + classes=classes, ) diff --git a/test/detection/test_vlm.py b/test/detection/test_vlm.py index e93b66f3d..7bfc23131 100644 --- a/test/detection/test_vlm.py +++ b/test/detection/test_vlm.py @@ -320,7 +320,7 @@ def test_from_paligemma( np.array(["dog"], dtype=str), ), ), # out-of-bounds box -( + ( does_not_raise(), """[ {'bbox_2d': [10, 20, 110, 120], 'label': 'cat'} @@ -346,15 +346,15 @@ def test_from_paligemma( (640, 640), None, ( - np.array( - [ - [0.0, 0.0, 64.0, 64.0], - [10.0, 20.0, 110.0, 120.0], - ], - dtype=float, - ), - None, - np.array(["dog", "cat"], dtype=str), + np.array( + [ + [0.0, 0.0, 64.0, 64.0], + [10.0, 20.0, 110.0, 120.0], + ], + dtype=float, + ), + None, + np.array(["dog", "cat"], dtype=str), ), ), # truncated response, last object unfinished, previous ones recovered ( From 7301156e4613039415e99a5415dbffe842a20441 Mon Sep 17 00:00:00 2001 From: SkalskiP Date: Sat, 15 Nov 2025 21:36:54 +0100 Subject: [PATCH 03/12] plug Qwen3-VL into `sv.Detections.from_vlm` --- supervision/detection/core.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/supervision/detection/core.py b/supervision/detection/core.py index 66610b998..903134f82 100644 --- a/supervision/detection/core.py +++ b/supervision/detection/core.py @@ -1559,6 +1559,11 @@ def from_vlm(cls, vlm: VLM | str, result: str | dict, **kwargs: Any) -> Detectio data = {CLASS_NAME_DATA_FIELD: class_name} return cls(xyxy=xyxy, class_id=class_id, data=data) + if vlm == VLM.QWEN_3_VL: + xyxy, class_id, class_name = from_qwen_2_5_vl(result, **kwargs) + data = {CLASS_NAME_DATA_FIELD: class_name} + return cls(xyxy=xyxy, class_id=class_id, data=data) + if vlm == VLM.DEEPSEEK_VL_2: xyxy, class_id, class_name = from_deepseek_vl_2(result, **kwargs) data = {CLASS_NAME_DATA_FIELD: class_name} From 9ab9650763205bac575bee3ecdfe3d4ff5231a11 Mon Sep 17 00:00:00 2001 From: SkalskiP Date: Sat, 15 Nov 2025 21:39:23 +0100 Subject: [PATCH 04/12] plug Qwen3-VL into `sv.Detections.from_vlm` --- supervision/detection/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/supervision/detection/core.py b/supervision/detection/core.py index 903134f82..464855a3d 100644 --- a/supervision/detection/core.py +++ b/supervision/detection/core.py @@ -47,7 +47,7 @@ from_moondream, from_paligemma, from_qwen_2_5_vl, - validate_vlm_parameters, + validate_vlm_parameters, from_qwen_3_vl, ) from supervision.geometry.core import Position from supervision.utils.internal import deprecated, get_instance_variables @@ -1560,7 +1560,7 @@ def from_vlm(cls, vlm: VLM | str, result: str | dict, **kwargs: Any) -> Detectio return cls(xyxy=xyxy, class_id=class_id, data=data) if vlm == VLM.QWEN_3_VL: - xyxy, class_id, class_name = from_qwen_2_5_vl(result, **kwargs) + xyxy, class_id, class_name = from_qwen_3_vl(result, **kwargs) data = {CLASS_NAME_DATA_FIELD: class_name} return cls(xyxy=xyxy, class_id=class_id, data=data) From 31db2ded112634d4661099f8da707a06f064c4c8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:40:01 +0000 Subject: [PATCH 05/12] =?UTF-8?q?fix(pre=5Fcommit):=20=F0=9F=8E=A8=20auto?= =?UTF-8?q?=20format=20pre-commit=20hooks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- supervision/detection/core.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/supervision/detection/core.py b/supervision/detection/core.py index 464855a3d..1fe602ea1 100644 --- a/supervision/detection/core.py +++ b/supervision/detection/core.py @@ -47,7 +47,8 @@ from_moondream, from_paligemma, from_qwen_2_5_vl, - validate_vlm_parameters, from_qwen_3_vl, + from_qwen_3_vl, + validate_vlm_parameters, ) from supervision.geometry.core import Position from supervision.utils.internal import deprecated, get_instance_variables From 0422a66eaa0c81c73eb766058b0230041ae42fc1 Mon Sep 17 00:00:00 2001 From: SkalskiP Date: Sat, 15 Nov 2025 22:05:46 +0100 Subject: [PATCH 06/12] plug Qwen3-VL into `sv.Detections.from_vlm` --- supervision/detection/core.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/supervision/detection/core.py b/supervision/detection/core.py index 464855a3d..2f350be1f 100644 --- a/supervision/detection/core.py +++ b/supervision/detection/core.py @@ -1557,12 +1557,14 @@ def from_vlm(cls, vlm: VLM | str, result: str | dict, **kwargs: Any) -> Detectio if vlm == VLM.QWEN_2_5_VL: xyxy, class_id, class_name = from_qwen_2_5_vl(result, **kwargs) data = {CLASS_NAME_DATA_FIELD: class_name} - return cls(xyxy=xyxy, class_id=class_id, data=data) + confidence = np.ones(len(class_id), dtype=float) + return cls(xyxy=xyxy, class_id=class_id, confidence=confidence, data=data) if vlm == VLM.QWEN_3_VL: xyxy, class_id, class_name = from_qwen_3_vl(result, **kwargs) data = {CLASS_NAME_DATA_FIELD: class_name} - return cls(xyxy=xyxy, class_id=class_id, data=data) + confidence = np.ones(len(class_id), dtype=float) + return cls(xyxy=xyxy, class_id=class_id, confidence=confidence, data=data) if vlm == VLM.DEEPSEEK_VL_2: xyxy, class_id, class_name = from_deepseek_vl_2(result, **kwargs) From e29e3832fb3b18ce38190bc3a34466a4a66afb59 Mon Sep 17 00:00:00 2001 From: SkalskiP Date: Sat, 15 Nov 2025 22:19:28 +0100 Subject: [PATCH 07/12] plug Qwen3-VL into `sv.Detections.from_vlm` --- supervision/detection/vlm.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/supervision/detection/vlm.py b/supervision/detection/vlm.py index 371827668..388820f5b 100644 --- a/supervision/detection/vlm.py +++ b/supervision/detection/vlm.py @@ -344,10 +344,19 @@ def from_qwen_2_5_vl( try: data = ast.literal_eval(text) except (ValueError, SyntaxError, TypeError): - return np.empty((0, 4)), None, np.empty((0,), dtype=str) + return ( + np.empty((0, 4)), + np.empty((0,), dtype=int), + np.empty((0,), dtype=str) + ) if not isinstance(data, list): - return np.empty((0, 4)), None, np.empty((0,), dtype=str) + return ( + np.empty((0, 4)), + np.empty((0,), dtype=int), + np.empty((0,), dtype=str) + ) + boxes_list = [] labels_list = [] @@ -359,7 +368,12 @@ def from_qwen_2_5_vl( labels_list.append(item["label"]) if not boxes_list: - return np.empty((0, 4)), None, np.empty((0,), dtype=str) + return ( + np.empty((0, 4)), + np.empty((0,), dtype=int), + np.empty((0,), dtype=str) + ) + xyxy = np.array(boxes_list, dtype=float) class_name = np.array(labels_list, dtype=str) From 9bd8f71f26dfba13f80f32b10b4ae40da728aef5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 15 Nov 2025 21:19:51 +0000 Subject: [PATCH 08/12] =?UTF-8?q?fix(pre=5Fcommit):=20=F0=9F=8E=A8=20auto?= =?UTF-8?q?=20format=20pre-commit=20hooks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- supervision/detection/vlm.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/supervision/detection/vlm.py b/supervision/detection/vlm.py index 388820f5b..97988c9f0 100644 --- a/supervision/detection/vlm.py +++ b/supervision/detection/vlm.py @@ -347,16 +347,11 @@ def from_qwen_2_5_vl( return ( np.empty((0, 4)), np.empty((0,), dtype=int), - np.empty((0,), dtype=str) + np.empty((0,), dtype=str), ) if not isinstance(data, list): - return ( - np.empty((0, 4)), - np.empty((0,), dtype=int), - np.empty((0,), dtype=str) - ) - + return (np.empty((0, 4)), np.empty((0,), dtype=int), np.empty((0,), dtype=str)) boxes_list = [] labels_list = [] @@ -368,12 +363,7 @@ def from_qwen_2_5_vl( labels_list.append(item["label"]) if not boxes_list: - return ( - np.empty((0, 4)), - np.empty((0,), dtype=int), - np.empty((0,), dtype=str) - ) - + return (np.empty((0, 4)), np.empty((0,), dtype=int), np.empty((0,), dtype=str)) xyxy = np.array(boxes_list, dtype=float) class_name = np.array(labels_list, dtype=str) From c35a35cefdf8b586bbe325d16d63403ebbb0d6f2 Mon Sep 17 00:00:00 2001 From: SkalskiP Date: Sat, 15 Nov 2025 22:27:11 +0100 Subject: [PATCH 09/12] plug Qwen3-VL into `sv.Detections.from_vlm` --- supervision/detection/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/supervision/detection/core.py b/supervision/detection/core.py index e67f906f5..ef88fbfec 100644 --- a/supervision/detection/core.py +++ b/supervision/detection/core.py @@ -1558,13 +1558,13 @@ def from_vlm(cls, vlm: VLM | str, result: str | dict, **kwargs: Any) -> Detectio if vlm == VLM.QWEN_2_5_VL: xyxy, class_id, class_name = from_qwen_2_5_vl(result, **kwargs) data = {CLASS_NAME_DATA_FIELD: class_name} - confidence = np.ones(len(class_id), dtype=float) + confidence = np.ones(len(xyxy), dtype=float) return cls(xyxy=xyxy, class_id=class_id, confidence=confidence, data=data) if vlm == VLM.QWEN_3_VL: xyxy, class_id, class_name = from_qwen_3_vl(result, **kwargs) data = {CLASS_NAME_DATA_FIELD: class_name} - confidence = np.ones(len(class_id), dtype=float) + confidence = np.ones(len(xyxy), dtype=float) return cls(xyxy=xyxy, class_id=class_id, confidence=confidence, data=data) if vlm == VLM.DEEPSEEK_VL_2: From 5af13c0f8c71442acb90a053b65249810ca1de4c Mon Sep 17 00:00:00 2001 From: SkalskiP Date: Sat, 15 Nov 2025 22:48:20 +0100 Subject: [PATCH 10/12] add Qwen3-VL prompting example --- supervision/detection/core.py | 60 +++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/supervision/detection/core.py b/supervision/detection/core.py index ef88fbfec..153dbe47a 100644 --- a/supervision/detection/core.py +++ b/supervision/detection/core.py @@ -951,6 +951,36 @@ def from_lmm(cls, lmm: LMM | str, result: str | dict, **kwargs: Any) -> Detectio detections.class_id # array([0, 1]) ``` + + !!! example "Qwen3-VL" + + ```python + import supervision as sv + + qwen_3_vl_result = \"\"\"```json + [ + {"bbox_2d": [139, 768, 315, 954], "label": "cat"}, + {"bbox_2d": [366, 679, 536, 849], "label": "dog"} + ] + ```\"\"\" + detections = sv.Detections.from_lmm( + sv.LMM.QWEN_3_VL, + qwen_3_vl_result, + resolution_wh=(1000, 1000), + classes=['cat', 'dog'], + ) + detections.xyxy + # array([[139., 768., 315., 954.], [366., 679., 536., 849.]]) + + detections.class_id + # array([0, 1]) + + detections.data + # {'class_name': array(['cat', 'dog'], dtype=' Detectio detections.class_id # array([0, 1]) ``` + + !!! example "Qwen3-VL" + + ```python + import supervision as sv + + qwen_3_vl_result = \"\"\"```json + [ + {"bbox_2d": [139, 768, 315, 954], "label": "cat"}, + {"bbox_2d": [366, 679, 536, 849], "label": "dog"} + ] + ```\"\"\" + detections = sv.Detections.from_vlm( + sv.VLM.QWEN_3_VL, + qwen_3_vl_result, + resolution_wh=(1000, 1000), + classes=['cat', 'dog'], + ) + detections.xyxy + # array([[139., 768., 315., 954.], [366., 679., 536., 849.]]) + + detections.class_id + # array([0, 1]) + + detections.data + # {'class_name': array(['cat', 'dog'], dtype=' Date: Sat, 15 Nov 2025 21:48:42 +0000 Subject: [PATCH 11/12] =?UTF-8?q?fix(pre=5Fcommit):=20=F0=9F=8E=A8=20auto?= =?UTF-8?q?=20format=20pre-commit=20hooks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- supervision/detection/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/supervision/detection/core.py b/supervision/detection/core.py index 153dbe47a..e5e298bbb 100644 --- a/supervision/detection/core.py +++ b/supervision/detection/core.py @@ -951,7 +951,7 @@ def from_lmm(cls, lmm: LMM | str, result: str | dict, **kwargs: Any) -> Detectio detections.class_id # array([0, 1]) ``` - + !!! example "Qwen3-VL" ```python @@ -1359,7 +1359,7 @@ def from_vlm(cls, vlm: VLM | str, result: str | dict, **kwargs: Any) -> Detectio detections.class_id # array([0, 1]) ``` - + !!! example "Qwen3-VL" ```python From a99015ac2da717477346fd825ba11cbed934b11f Mon Sep 17 00:00:00 2001 From: SkalskiP Date: Sat, 15 Nov 2025 22:51:22 +0100 Subject: [PATCH 12/12] more Qwen2.5-VL tests --- test/detection/test_vlm.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/test/detection/test_vlm.py b/test/detection/test_vlm.py index 7bfc23131..9a0195f78 100644 --- a/test/detection/test_vlm.py +++ b/test/detection/test_vlm.py @@ -357,6 +357,18 @@ def test_from_paligemma( np.array(["dog", "cat"], dtype=str), ), ), # truncated response, last object unfinished, previous ones recovered + ( + pytest.raises(ValueError), + """```json + [ + {"bbox_2d": [10, 20, 110, 120], "label": "cat"} + ] + ```""", + (0, 640), + (1280, 720), + None, + None, # invalid input_wh + ), ( pytest.raises(ValueError), """```json