# Single-view 3D reconstruction with SAM-3D Objects

This notebook requires a self-hosted inference server with a 32GB+ VRAM GPU. See the README for the recommended setup.

In [None]:
# Enable autoreload (mode 2) so edits to imported modules are picked up live.
%load_ext autoreload
%autoreload 2
# Install the notebook's dependencies listed in requirements.txt.
%pip install -r requirements.txt

Set up the notebook to point at your inference server instance and use your API key to download model weights.

In [None]:
import os

# Base URL of the inference server instance this notebook talks to.
API_URL = "http://localhost:9001"
# Prefer the ROBOFLOW_API_KEY environment variable so the key never has to be
# saved inside the notebook; otherwise replace the placeholder below.
API_KEY = os.environ.get("ROBOFLOW_API_KEY", "YOUR_API_KEY")

# Model that produces the 2D instance masks.
SEGMENTATION_MODEL_ID = "rfdetr-seg-preview"
# Model that produces the 3D reconstructions.
SAM3_3D_MODEL_ID = "sam3-3d-objects"

Set input data and output directory for logging the annotated image and 3D view.

In [None]:
from supervision.assets import VideoAssets, download_assets

# Sample clip used as the input; swap in VideoAssets.MILK_BOTTLING_PLANT
# to try a different scene.
INPUT_VIDEO_PATH = download_assets(VideoAssets.VEHICLES)

# Directory that receives the annotated image and the Rerun recording.
OUTPUT_DIR = "sam-3d-detect"

In [None]:
import os
import shutil

# Every run starts from an empty output directory: remove whatever a previous
# run left behind, then create the directory again.
had_previous_output = os.path.exists(OUTPUT_DIR)
if had_previous_output:
    shutil.rmtree(OUTPUT_DIR)
os.makedirs(OUTPUT_DIR)

Step 1: Load the first frame of the input video as the input image and make sure it looks how we expect.

In [None]:
import supervision as sv

# Only the first frame of the clip is used as the input image. Giving next() a
# default turns a clip that yields no frames into an explicit error instead of
# a bare StopIteration with no message.
image = next(sv.get_video_frames_generator(INPUT_VIDEO_PATH), None)
if image is None:
    raise RuntimeError(f"No frames could be read from {INPUT_VIDEO_PATH!r}")
sv.plot_image(image)

Step 2: Generate 2D object masks by running an instance segmentation model like RF-DETR Seg.

In [None]:
from inference_sdk import InferenceHTTPClient
import time

# Client used for the requests sent to the inference server.
client = InferenceHTTPClient(api_url=API_URL, api_key=API_KEY)

# Time the full round trip of the segmentation request.
seg_started_at = time.perf_counter()
seg_result = client.infer(image, model_id=SEGMENTATION_MODEL_ID)
seg_elapsed_sec = time.perf_counter() - seg_started_at

print(f"{SEGMENTATION_MODEL_ID} inference took {seg_elapsed_sec:.2f} sec")

Let's take a look at the detections to check if they make sense.

In [None]:
import numpy as np

detections = sv.Detections.from_inference(seg_result)

# Drop detections whose confidence is 0.5 or lower.
is_confident = detections.confidence > 0.5
detections = detections[is_confident]

# Label every remaining detection with its index and class name.
labels = []
for idx, name in enumerate(detections.data["class_name"]):
    labels.append(f"#{idx} ({name})")

mask_annotator = sv.MaskAnnotator()
label_annotator = sv.LabelAnnotator()
# Draw the masks on a copy so `image` stays untouched, then add the labels on top.
annotated = label_annotator.annotate(
    scene=mask_annotator.annotate(scene=image.copy(), detections=detections),
    detections=detections,
    labels=labels,
)

sv.plot_image(annotated)

# Persist the annotated frame next to the other outputs.
with sv.ImageSink(target_dir_path=OUTPUT_DIR) as sink:
    sink.save_image(annotated, "annotated.png")

Step 3: Pass the input image and object masks to SAM-3D to generate 3D reconstructions of each object. 

This will take a few minutes the first time as the model weights need to be downloaded to the server. Subsequent inference calls can take anywhere from seconds to minutes depending on the number of objects and the inference configuration.

In [None]:
def _polygon_area(polygon):
    """Return the area enclosed by an (N, 2) polygon using the shoelace formula."""
    # Cast to float so the coordinate products cannot overflow an integer dtype.
    x, y = np.asarray(polygon, dtype=float).T
    return 0.5 * abs(np.dot(x, np.roll(y, -1)) - np.dot(y, np.roll(x, -1)))


def _mask_to_flat_polygon(mask, index):
    """Convert one binary mask into a flat [x1 y1 x2 y2 ... xN yN] list.

    A mask can yield several polygons (for example when it has disconnected
    parts); the one with the largest area is used to represent the object
    rather than whichever happens to come first.

    Raises:
        ValueError: if no polygon can be extracted from the mask.
    """
    polygons = sv.mask_to_polygons(mask)
    if not polygons:
        raise ValueError(f"Detection #{index} has a mask with no usable polygon outline")
    largest = max(polygons, key=_polygon_area)
    return np.array(largest).flatten().tolist()


if detections.mask is None:
    raise ValueError("The detections carry no masks; SAM-3D needs one mask per object")

# flatten polygons to the expected [x1 y1 x2 y2 ... xN yN] format
mask_input = [
    _mask_to_flat_polygon(mask, index)
    for index, mask in enumerate(detections.mask)
]

start = time.perf_counter()
sam3_3d_result = client.sam3_3d_infer(
    inference_input=image,
    mask_input=mask_input,
    model_id=SAM3_3D_MODEL_ID,
    # 'Fast' SAM-3D config
    output_meshes=False,
    output_scene=False,
    with_mesh_postprocess=False,
    with_texture_baking=False,
    use_distillations=True,
)
print(f"SAM-3D inference took {(time.perf_counter() - start):.2f} sec")

# Attach the per-object 3D results to the detections so they can be indexed together.
detections.data["sam3_3d"] = sam3_3d_result["objects"]

Step 4: Transform the 3D objects into a common global frame using their layout metadata, and draw them in [Rerun.io](https://rerun.io).

When `output_scene=True`, SAM-3D outputs a combined 3D asset containing all 3D objects in a common frame. The code below uses the same Y-up frame convention to draw the objects, so it's consistent with what SAM-3D provides natively.

Rerun will log to disk at `OUTPUT_DIR/rerun_log.rrd`. You can then visualize this file in the notebook or with the standalone Rerun viewer: `rerun OUTPUT_DIR/rerun_log.rrd`.

In [None]:
from base64 import b64decode
from io import BytesIO

import torch
from pytorch3d.io import IO
from pytorch3d.transforms.rotation_conversions import quaternion_to_matrix

import rerun as rr

# Keep every 100th point (1% of the cloud) to speed up rendering.
POINT_STRIDE = 100

rr.init("sam-3d-detect")
rr.save(os.path.join(OUTPUT_DIR, "rerun_log.rrd"))
rr.log("/", rr.ViewCoordinates.RIGHT_HAND_Y_UP, rr.TransformAxes3D(0.5), static=True)

rr.set_time("tick", sequence=0)

rr.log("/camera/image", rr.Image(annotated, color_model="bgr"))

# Coordinate transforms used in make_scene_glb
z_to_y_up = torch.tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]], dtype=torch.float)
y_to_z_up = torch.tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]], dtype=torch.float)
R_view = torch.tensor([[-1, 0, 0], [0, 0, -1], [0, -1, 0]], dtype=torch.float)

for i in range(len(detections)):
    det = detections[i]
    obj_id = f"#{i}"
    if "sam3_3d" not in det.data:
        print(f"No 3D data available for {obj_id}")
        continue
    obj_sam3_3d = det.data["sam3_3d"][0]

    obj_ply = IO().load_pointcloud(BytesIO(b64decode(obj_sam3_3d["gaussian_ply"])))
    obj_pts_all = obj_ply.points_list()[0]

    # Measure the bounding box on the full cloud, before subsampling, so the
    # extremes are not lost. The box is centred on the points' extent: with
    # only `sizes` given it would sit at the local origin and would not
    # enclose a cloud that is off-centre.
    obj_box_min = obj_pts_all.amin(dim=0)
    obj_box_size = obj_pts_all.amax(dim=0) - obj_box_min
    obj_box_center = obj_box_min + obj_box_size / 2

    obj_pts = obj_pts_all[::POINT_STRIDE, :]
    obj_rgb = sv.annotators.utils.resolve_color(sv.ColorPalette.DEFAULT, detections, i).as_rgb()

    metadata = obj_sam3_3d["metadata"]
    t = torch.tensor(metadata["translation"], dtype=torch.float)
    R = quaternion_to_matrix(torch.tensor(metadata["rotation"], dtype=torch.float))
    s = torch.tensor(metadata["scale"], dtype=torch.float)
    # 1. Z-up -> Y-up coordinate conversion (row-vector convention throughout SAM3D)
    # 2. PyTorch3D quaternion_to_matrix is column-vector (R @ v), but SAM3D uses it
    #    row-vector (v @ R), so pass R.T to Rerun's column-vector mat3x3
    # 3. R_view: global scene correction from make_scene_glb, applied in world space
    t = t @ z_to_y_up @ R_view
    R = R_view @ y_to_z_up @ R.T @ z_to_y_up

    # The box and the object's transform share one entity; the points are
    # logged on a child entity path beneath it.
    rr.log(
        f"objects/{obj_id}",
        rr.Boxes3D(centers=obj_box_center, sizes=obj_box_size, colors=obj_rgb, labels=obj_id),
        rr.Transform3D(translation=t, mat3x3=R, scale=s),
    )
    rr.log(
        f"objects/{obj_id}/pts",
        rr.Points3D(positions=obj_pts, colors=obj_rgb),
    )

In [None]:
# You can also use the standalone viewer app
# rerun [OUTPUT_DIR]/rerun_log.rrd
# Show the Rerun viewer in the notebook output.
rr.notebook_show()
# Load the recording file written under OUTPUT_DIR.
# NOTE(review): presumably needed because rr.save() redirected the recording to
# disk, so the inline viewer has no data until the file is read back — confirm
# against the Rerun documentation before reordering these two calls.
rr.log_file_from_path(os.path.join(OUTPUT_DIR, "rerun_log.rrd"))