Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 70 additions & 1 deletion supervision/detection/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
from_moondream,
from_paligemma,
from_qwen_2_5_vl,
from_qwen_3_vl,
validate_vlm_parameters,
)
from supervision.geometry.core import Position
Expand Down Expand Up @@ -951,6 +952,36 @@ def from_lmm(cls, lmm: LMM | str, result: str | dict, **kwargs: Any) -> Detectio
# array([0, 1])
```

!!! example "Qwen3-VL"

```python
import supervision as sv

qwen_3_vl_result = \"\"\"```json
[
{"bbox_2d": [139, 768, 315, 954], "label": "cat"},
{"bbox_2d": [366, 679, 536, 849], "label": "dog"}
]
```\"\"\"
detections = sv.Detections.from_lmm(
sv.LMM.QWEN_3_VL,
qwen_3_vl_result,
resolution_wh=(1000, 1000),
classes=['cat', 'dog'],
)
detections.xyxy
# array([[139., 768., 315., 954.], [366., 679., 536., 849.]])

detections.class_id
# array([0, 1])

detections.data
# {'class_name': array(['cat', 'dog'], dtype='<U10')}

detections.class_id
# array([0, 1])
```

!!! example "Gemini 2.0"
```python
import supervision as sv
Expand Down Expand Up @@ -1211,6 +1242,7 @@ def from_vlm(cls, vlm: VLM | str, result: str | dict, **kwargs: Any) -> Detectio
| PaliGemma | `PALIGEMMA` | detection | `resolution_wh` | `classes` |
| PaliGemma 2 | `PALIGEMMA` | detection | `resolution_wh` | `classes` |
| Qwen2.5-VL | `QWEN_2_5_VL` | detection | `resolution_wh`, `input_wh` | `classes` |
| Qwen3-VL | `QWEN_3_VL` | detection | `resolution_wh` | `classes` |
| Google Gemini 2.0 | `GOOGLE_GEMINI_2_0` | detection | `resolution_wh` | `classes` |
| Google Gemini 2.5 | `GOOGLE_GEMINI_2_5` | detection, segmentation | `resolution_wh` | `classes` |
| Moondream | `MOONDREAM` | detection | `resolution_wh` | |
Expand Down Expand Up @@ -1328,6 +1360,36 @@ def from_vlm(cls, vlm: VLM | str, result: str | dict, **kwargs: Any) -> Detectio
# array([0, 1])
```

!!! example "Qwen3-VL"

```python
import supervision as sv

qwen_3_vl_result = \"\"\"```json
[
{"bbox_2d": [139, 768, 315, 954], "label": "cat"},
{"bbox_2d": [366, 679, 536, 849], "label": "dog"}
]
```\"\"\"
detections = sv.Detections.from_vlm(
sv.VLM.QWEN_3_VL,
qwen_3_vl_result,
resolution_wh=(1000, 1000),
classes=['cat', 'dog'],
)
detections.xyxy
# array([[139., 768., 315., 954.], [366., 679., 536., 849.]])

detections.class_id
# array([0, 1])

detections.data
# {'class_name': array(['cat', 'dog'], dtype='<U10')}

detections.class_id
# array([0, 1])
```

!!! example "Gemini 2.0"
```python
import supervision as sv
Expand Down Expand Up @@ -1556,7 +1618,14 @@ def from_vlm(cls, vlm: VLM | str, result: str | dict, **kwargs: Any) -> Detectio
if vlm == VLM.QWEN_2_5_VL:
xyxy, class_id, class_name = from_qwen_2_5_vl(result, **kwargs)
data = {CLASS_NAME_DATA_FIELD: class_name}
return cls(xyxy=xyxy, class_id=class_id, data=data)
confidence = np.ones(len(xyxy), dtype=float)
return cls(xyxy=xyxy, class_id=class_id, confidence=confidence, data=data)

if vlm == VLM.QWEN_3_VL:
xyxy, class_id, class_name = from_qwen_3_vl(result, **kwargs)
data = {CLASS_NAME_DATA_FIELD: class_name}
confidence = np.ones(len(xyxy), dtype=float)
return cls(xyxy=xyxy, class_id=class_id, confidence=confidence, data=data)

if vlm == VLM.DEEPSEEK_VL_2:
xyxy, class_id, class_name = from_deepseek_vl_2(result, **kwargs)
Expand Down
145 changes: 121 additions & 24 deletions supervision/detection/vlm.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import ast
import base64
import io
import json
Expand Down Expand Up @@ -27,7 +28,8 @@ class LMM(Enum):
Attributes:
PALIGEMMA: Google's PaliGemma vision-language model.
FLORENCE_2: Microsoft's Florence-2 vision-language model.
QWEN_2_5_VL: Qwen2.5-VL open vision-language model from Alibaba.
QWEN_2_5_VL: Qwen2.5-VL open vision-language model from Alibaba.
QWEN_3_VL: Qwen3-VL open vision-language model from Alibaba.
GOOGLE_GEMINI_2_0: Google Gemini 2.0 vision-language model.
GOOGLE_GEMINI_2_5: Google Gemini 2.5 vision-language model.
MOONDREAM: The Moondream vision-language model.
Expand All @@ -36,6 +38,7 @@ class LMM(Enum):
PALIGEMMA = "paligemma"
FLORENCE_2 = "florence_2"
QWEN_2_5_VL = "qwen_2_5_vl"
QWEN_3_VL = "qwen_3_vl"
DEEPSEEK_VL_2 = "deepseek_vl_2"
GOOGLE_GEMINI_2_0 = "gemini_2_0"
GOOGLE_GEMINI_2_5 = "gemini_2_5"
Expand Down Expand Up @@ -69,6 +72,7 @@ class VLM(Enum):
PALIGEMMA: Google's PaliGemma vision-language model.
FLORENCE_2: Microsoft's Florence-2 vision-language model.
QWEN_2_5_VL: Qwen2.5-VL open vision-language model from Alibaba.
QWEN_3_VL: Qwen3-VL open vision-language model from Alibaba.
GOOGLE_GEMINI_2_0: Google Gemini 2.0 vision-language model.
GOOGLE_GEMINI_2_5: Google Gemini 2.5 vision-language model.
MOONDREAM: The Moondream vision-language model.
Expand All @@ -77,6 +81,7 @@ class VLM(Enum):
PALIGEMMA = "paligemma"
FLORENCE_2 = "florence_2"
QWEN_2_5_VL = "qwen_2_5_vl"
QWEN_3_VL = "qwen_3_vl"
DEEPSEEK_VL_2 = "deepseek_vl_2"
GOOGLE_GEMINI_2_0 = "gemini_2_0"
GOOGLE_GEMINI_2_5 = "gemini_2_5"
Expand Down Expand Up @@ -106,6 +111,7 @@ def from_value(cls, value: VLM | str) -> VLM:
VLM.PALIGEMMA: str,
VLM.FLORENCE_2: dict,
VLM.QWEN_2_5_VL: str,
VLM.QWEN_3_VL: str,
VLM.DEEPSEEK_VL_2: str,
VLM.GOOGLE_GEMINI_2_0: str,
VLM.GOOGLE_GEMINI_2_5: str,
Expand All @@ -116,6 +122,7 @@ def from_value(cls, value: VLM | str) -> VLM:
VLM.PALIGEMMA: ["resolution_wh"],
VLM.FLORENCE_2: ["resolution_wh"],
VLM.QWEN_2_5_VL: ["input_wh", "resolution_wh"],
VLM.QWEN_3_VL: ["resolution_wh"],
VLM.DEEPSEEK_VL_2: ["resolution_wh"],
VLM.GOOGLE_GEMINI_2_0: ["resolution_wh"],
VLM.GOOGLE_GEMINI_2_5: ["resolution_wh"],
Expand All @@ -126,6 +133,7 @@ def from_value(cls, value: VLM | str) -> VLM:
VLM.PALIGEMMA: ["resolution_wh", "classes"],
VLM.FLORENCE_2: ["resolution_wh"],
VLM.QWEN_2_5_VL: ["input_wh", "resolution_wh", "classes"],
VLM.QWEN_3_VL: ["resolution_wh", "classes"],
VLM.DEEPSEEK_VL_2: ["resolution_wh", "classes"],
VLM.GOOGLE_GEMINI_2_0: ["resolution_wh", "classes"],
VLM.GOOGLE_GEMINI_2_5: ["resolution_wh", "classes"],
Expand Down Expand Up @@ -235,14 +243,59 @@ def from_paligemma(
return xyxy, class_id, class_name


def recover_truncated_qwen_2_5_vl_response(text: str) -> Any | None:
    """
    Attempt to recover and parse a truncated JSON list from Qwen-2.5-VL output.

    The text is searched for the first `[` and for the last `}` that follows it.
    Everything between them (inclusive) is kept, a closing `]` is appended, and
    the result is handed to `json.loads`. Any incomplete trailing element after
    the last `}` (for example an object cut off mid-way by a token limit) is
    therefore discarded, while the complete objects before it are preserved.

    Args:
        text (str): Raw text containing the JSON snippet, possibly truncated or
            surrounded by extra content such as a code fence.

    Returns:
        The parsed list if recovery and parsing succeed; otherwise `None`.
        `None` is returned when `text` is not a string, when it contains no `[`,
        when no `}` follows the first `[`, or when the repaired snippet is still
        not valid JSON.
    """
    # Recovery is best-effort: unusable input yields None instead of raising.
    if not isinstance(text, str):
        return None

    list_start = text.find("[")
    if list_start == -1:
        return None

    # Only a closing brace located after the opening bracket can terminate a
    # complete object inside the list.
    last_brace = text.rfind("}", list_start)
    if last_brace == -1:
        return None

    # The slice ends exactly on "}", so there is never a dangling comma or
    # trailing whitespace left to clean up before closing the list.
    repaired = text[list_start : last_brace + 1] + "]"

    try:
        return json.loads(repaired)
    except (ValueError, RecursionError):
        # json.JSONDecodeError subclasses ValueError; RecursionError covers
        # pathologically nested input.
        return None


def from_qwen_2_5_vl(
result: str,
input_wh: tuple[int, int],
resolution_wh: tuple[int, int],
classes: list[str] | None = None,
) -> tuple[np.ndarray, np.ndarray | None, np.ndarray]:
"""
Parse and scale bounding boxes from Qwen-2.5-VL style JSON output.
Parse and rescale bounding boxes and class labels from Qwen-2.5-VL JSON output.

The JSON is expected to be enclosed in triple backticks with the format:
```json
Expand All @@ -253,38 +306,52 @@ def from_qwen_2_5_vl(
```

Args:
result: String containing the JSON snippet enclosed by triple backticks.
input_wh: (input_width, input_height) describing the original bounding box
scale.
resolution_wh: (output_width, output_height) to which we rescale the boxes.
classes: Optional list of valid class names. If provided, returned boxes/labels
are filtered to only those classes found here.
result (str): String containing Qwen-2.5-VL JSON bounding box and label data.
input_wh (tuple[int, int]): Width and height of the coordinate space where boxes
are normalized.
resolution_wh (tuple[int, int]): Target width and height to scale bounding
boxes.
classes (list[str] or None): Optional list of valid class names to filter
results. If provided, only boxes with labels in this list are returned.

Returns:
xyxy (np.ndarray): An array of shape `(n, 4)` containing
the bounding boxes coordinates in format `[x1, y1, x2, y2]`
class_id (Optional[np.ndarray]): An array of shape `(n,)` containing
the class indices for each bounding box (or None if `classes` is not
provided)
class_name (np.ndarray): An array of shape `(n,)` containing
the class labels for each bounding box
xyxy (np.ndarray): Array of shape `(N, 4)` with rescaled bounding boxes in
`(x_min, y_min, x_max, y_max)` format.
class_id (np.ndarray or None): Array of shape `(N,)` with indices of classes,
or `None` if no filtering applied.
class_name (np.ndarray): Array of shape `(N,)` with class names as strings.
"""

in_w, in_h = validate_resolution(input_wh)
out_w, out_h = validate_resolution(resolution_wh)

pattern = re.compile(r"```json\s*(.*?)\s*```", re.DOTALL)

match = pattern.search(result)
if not match:
return np.empty((0, 4)), None, np.empty((0,), dtype=str)
text = result.strip()
text = re.sub(r"^```(json)?", "", text, flags=re.IGNORECASE).strip()
text = re.sub(r"```$", "", text).strip()

json_snippet = match.group(1)
start = text.find("[")
end = text.rfind("]")
if start != -1 and end != -1 and end > start:
text = text[start : end + 1].strip()

try:
data = json.loads(json_snippet)
data = json.loads(text)
except json.JSONDecodeError:
return np.empty((0, 4)), None, np.empty((0,), dtype=str)
repaired = recover_truncated_qwen_2_5_vl_response(text)
if repaired is not None:
data = repaired
else:
try:
data = ast.literal_eval(text)
except (ValueError, SyntaxError, TypeError):
return (
np.empty((0, 4)),
np.empty((0,), dtype=int),
np.empty((0,), dtype=str),
)

if not isinstance(data, list):
return (np.empty((0, 4)), np.empty((0,), dtype=int), np.empty((0,), dtype=str))

boxes_list = []
labels_list = []
Expand All @@ -296,7 +363,7 @@ def from_qwen_2_5_vl(
labels_list.append(item["label"])

if not boxes_list:
return np.empty((0, 4)), None, np.empty((0,), dtype=str)
return (np.empty((0, 4)), np.empty((0,), dtype=int), np.empty((0,), dtype=str))

xyxy = np.array(boxes_list, dtype=float)
class_name = np.array(labels_list, dtype=str)
Expand All @@ -315,6 +382,36 @@ def from_qwen_2_5_vl(
return xyxy, class_id, class_name


def from_qwen_3_vl(
    result: str,
    resolution_wh: tuple[int, int],
    classes: list[str] | None = None,
) -> tuple[np.ndarray, np.ndarray | None, np.ndarray]:
    """
    Convert Qwen-3-VL JSON detection output into boxes, class ids and class names.

    This is a thin adapter over `from_qwen_2_5_vl`: the response is parsed by
    that function with the source coordinate space fixed to `(1000, 1000)`, so
    callers only need to supply the target resolution.

    Args:
        result (str): Raw Qwen-3-VL response text holding the JSON detections.
        resolution_wh (tuple[int, int]): Target `(width, height)` the boxes are
            rescaled to.
        classes (list[str] or None): Optional list of class names forwarded to
            `from_qwen_2_5_vl` for filtering.

    Returns:
        xyxy (np.ndarray): Bounding boxes of shape `(N, 4)` in
            `(x_min, y_min, x_max, y_max)` format, scaled to `resolution_wh`.
        class_id (np.ndarray or None): Class indices for each box, exactly as
            produced by `from_qwen_2_5_vl`.
        class_name (np.ndarray): Class names for each box as strings.
    """
    # Coordinate space handed to the Qwen-2.5-VL parser as `input_wh`.
    normalized_wh = (1000, 1000)
    parsed = from_qwen_2_5_vl(
        result=result,
        input_wh=normalized_wh,
        resolution_wh=resolution_wh,
        classes=classes,
    )
    return parsed


def from_deepseek_vl_2(
result: str, resolution_wh: tuple[int, int], classes: list[str] | None = None
) -> tuple[np.ndarray, np.ndarray | None, np.ndarray]:
Expand Down
Loading
Loading