In [2]:
import torch
import cv2
import numpy as np
from PIL import Image
import sys
sys.path.append('Depth-Anything-V2')
from depth_anything_v2.dpt import DepthAnythingV2

xFormers not available
xFormers not available


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
video_input_path = "vid.mp4"
video_output_path = "out.mp4"
checkpoint_path = "depth_anything_v2_vits.pth"
resize_shortest = 518  # recommended by Depth Anything V2

# --- Load model ---
model = DepthAnythingV2(**{'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]})
model.load_state_dict(torch.load(checkpoint_path, map_location=device))
model.to(device)
model.eval()

# --- Open video ---
cap = cv2.VideoCapture(video_input_path)
fps = cap.get(cv2.CAP_PROP_FPS)
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(video_output_path, fourcc, fps, (frame_width, frame_height))

Using device: cpu


In [4]:
# export to ONNX
dummy_input = torch.randn(1, 3, resize_shortest, resize_shortest).to(device)
torch.onnx.export(model, dummy_input, "depth_anything_v2.onnx", opset_version=11)

W1101 12:59:59.710000 65130 torch/onnx/_internal/exporter/_compat.py:114] Setting ONNX exporter to use operator set version 18 because the requested opset_version 11 is a lower version than we have implementations for. Automatic version conversion will be performed, which may not be successful at converting to the requested version. If version conversion is unsuccessful, the opset version of the exported model will be kept at 18. Please consider setting opset_version >=18 to leverage latest ONNX features


[torch.onnx] Obtain model graph for `DepthAnythingV2([...]` with `torch.export.export(..., strict=False)`...
[torch.onnx] Obtain model graph for `DepthAnythingV2([...]` with `torch.export.export(..., strict=False)`... ✅
[torch.onnx] Run decomposition...


The model version conversion is not supported by the onnxscript version converter and fallback is enabled. The model will be converted using the onnx C API (target version: 11).
Failed to convert the model to the target version 11 using the ONNX C API. The model was not modified
Traceback (most recent call last):
  File "/home/max/Documents/VisualComputing/exercises/project/ar-placement/depth_tracking/.venv/lib/python3.12/site-packages/onnxscript/version_converter/__init__.py", line 127, in call
    converted_proto = _c_api_utils.call_onnx_api(
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/max/Documents/VisualComputing/exercises/project/ar-placement/depth_tracking/.venv/lib/python3.12/site-packages/onnxscript/version_converter/_c_api_utils.py", line 65, in call_onnx_api
    result = func(proto)
             ^^^^^^^^^^^
  File "/home/max/Documents/VisualComputing/exercises/project/ar-placement/depth_tracking/.venv/lib/python3.12/site-packages/onnxscript/version_convert

[torch.onnx] Run decomposition... ✅
[torch.onnx] Translate the graph into ONNX...
[torch.onnx] Translate the graph into ONNX... ✅
Applied 5 of general pattern rewrite rules.


ONNXProgram(
    model=
        <
            ir_version=10,
            opset_imports={'': 18},
            producer_name='pytorch',
            producer_version='2.9.0+cu128',
            domain=None,
            model_version=None,
        >
        graph(
            name=main_graph,
            inputs=(
                %"x"<FLOAT,[1,3,518,518]>
            ),
            outputs=(
                %"squeeze"<FLOAT,[1,518,518]>
            ),
            initializers=(
                %"pretrained.patch_embed.proj.bias"<FLOAT,[384]>{TorchTensor(...)},
                %"pretrained.blocks.0.norm1.weight"<FLOAT,[384]>{TorchTensor(...)},
                %"pretrained.blocks.0.norm1.bias"<FLOAT,[384]>{TorchTensor(...)},
                %"pretrained.blocks.0.attn.proj.bias"<FLOAT,[384]>{TorchTensor(...)},
                %"pretrained.blocks.0.ls1.gamma"<FLOAT,[384]>{TorchTensor(...)},
                %"pretrained.blocks.0.norm2.weight"<FLOAT,[384]>{TorchTensor(...)},
                %"pret

In [11]:
while True:
    ret, frame_bgr = cap.read()
    if not ret:
        break
    
    # Convert BGR -> RGB
    frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
    frame_pil = Image.fromarray(frame_rgb)
    
    # Resize keeping aspect ratio
    h, w = frame_pil.height, frame_pil.width
    scale = resize_shortest / min(h, w)
    new_w, new_h = int(w * scale), int(h * scale)
    
    # Round to nearest multiple of 14
    new_w = (new_w // 14) * 14
    new_h = (new_h // 14) * 14
    
    frame_resized = frame_pil.resize((new_w, new_h))
    
    # To tensor
    frame_np = np.array(frame_resized).astype(np.float32) / 255.0
    input_tensor = torch.from_numpy(frame_np).permute(2,0,1).unsqueeze(0).to(device)
    
    # Inference
    with torch.no_grad():
        depth_output = model(input_tensor)
    
    # Postprocess depth map
    depth_map = depth_output.squeeze().cpu().numpy()
    depth_vis = cv2.resize(depth_map, (frame_width, frame_height))  # resize back to original
    depth_vis = cv2.normalize(depth_vis, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
    depth_colored = cv2.applyColorMap(depth_vis, cv2.COLORMAP_PLASMA)
    
    # Optional: overlay original video
    overlay = cv2.addWeighted(frame_bgr, 0.6, depth_colored, 0.4, 0)
    
    # Write frame to output video
    out.write(overlay)

# --- Release resources ---
cap.release()
out.release()