In [1]:

import torch
import cv2
import numpy as np
import time
from fvcore.nn import FlopCountAnalysis

# --- Benchmark configuration ---
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

VIDEO_PATH = "test.mov"   # input clip opened with cv2.VideoCapture
RESOLUTION = 384          # frames are resized to RESOLUTION x RESOLUTION
WARMUP_FRAMES = 20        # leading frames that are run but not timed
MEASURE_FRAMES = 200      # frames timed after the warm-up phase

In [16]:
# Fetch the DPT_Hybrid MiDaS network from PyTorch Hub, move it to the target
# device and switch it to evaluation mode.
model = torch.hub.load("intel-isl/MiDaS", "DPT_Hybrid").to(DEVICE).eval()

# Count every parameter element; the size estimate assumes 4 bytes each.
total_params = 0
for weight in model.parameters():
    total_params += weight.numel()
model_size_mb = (total_params * 4) / (1024**2)  # FP32

print(f"Parameters: {total_params/1e6:.2f} M")
print(f"Model Size (FP32): {model_size_mb:.2f} MB")

# Random single-image batch at the benchmark resolution, used by the profiler cell.
dummy_input = torch.randn((1, 3, RESOLUTION, RESOLUTION)).to(DEVICE)


Using cache found in /home/RUS_CIP/st189432/.cache/torch/hub/intel-isl_MiDaS_master
  model = create_fn(


Parameters: 123.15 M
Model Size (FP32): 469.77 MB


In [17]:
# Profile one forward pass and rank the operators by estimated FLOPs.

# Only request CUDA tracing when the model actually runs on the GPU.
profiler_activities = [torch.profiler.ProfilerActivity.CPU]
if DEVICE == "cuda":
    profiler_activities.append(torch.profiler.ProfilerActivity.CUDA)

with torch.profiler.profile(
    activities=profiler_activities,
    with_flops=True,
) as prof:
    # Inference only: disable autograd so the profiled pass matches the
    # no_grad() forward passes used by the benchmark cells.
    with torch.no_grad():
        model(dummy_input)

print(prof.key_averages().table(sort_by="flops", row_limit=20))


-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  Total MFLOPs  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                           aten::conv2d         0.54%     193.167us        11.09%       3.955ms      48.829us       0.000us         0.00%      11.985ms     147.964us            81    142223.081  
                                            aten::addmm         3.05%       1.086ms         4.48%       1.596ms      31.929us      12.151ms        39.1

Self CPU – time spent on the CPU inside the operator itself, excluding the operators it calls.  
CPU total – time spent in the operator, including its child operator calls.

In [None]:
# Latency benchmark: run the model on consecutive video frames, discard the
# first WARMUP_FRAMES iterations and time the following MEASURE_FRAMES ones.
cap = cv2.VideoCapture(VIDEO_PATH)
if not cap.isOpened():
    raise RuntimeError(f"Could not open video file: {VIDEO_PATH}")

latencies = []

# Guard the reset like the other CUDA calls in this cell so it also runs
# when DEVICE is "cpu".
if DEVICE == "cuda":
    torch.cuda.reset_peak_memory_stats()

frame_count = 0
total_frames = WARMUP_FRAMES + MEASURE_FRAMES

try:
    while frame_count < total_frames:
        ret, frame = cap.read()
        if not ret:
            # End of video (or read error) before the frame budget was reached.
            break

        # Pre-processing happens outside the timed region.
        frame = cv2.resize(frame, (RESOLUTION, RESOLUTION))
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame = frame / 255.0
        frame = torch.from_numpy(frame).permute(2,0,1).float().unsqueeze(0).to(DEVICE)

        # Drain pending GPU work so the timer covers only the forward pass.
        if DEVICE == "cuda":
            torch.cuda.synchronize()

        # perf_counter() is monotonic and high-resolution; time.time() can
        # jump when the system clock is adjusted.
        start = time.perf_counter()
        with torch.no_grad():
            output = model(frame)
        # CUDA kernels launch asynchronously: wait for them before stopping the clock.
        if DEVICE == "cuda":
            torch.cuda.synchronize()
        end = time.perf_counter()

        if frame_count >= WARMUP_FRAMES:
            latencies.append((end - start) * 1000)

        frame_count += 1
finally:
    # Always release the capture handle, even if inference raised.
    cap.release()

latencies = np.array(latencies)
if latencies.size == 0:
    # Happens when the clip has no more than WARMUP_FRAMES readable frames.
    print(
        f"No frames were measured: read {frame_count} frame(s), "
        f"but {WARMUP_FRAMES} are used for warm-up."
    )
else:
    print(f"Mean Latency: {latencies.mean():.2f} ms")
    print(f"P50 Latency: {np.percentile(latencies,50):.2f} ms")
    print(f"P95 Latency: {np.percentile(latencies,95):.2f} ms")
    print(f"FPS: {1000.0/latencies.mean():.2f}")

if DEVICE == "cuda":
    peak_mem = torch.cuda.max_memory_allocated() / (1024**2)
    print(f"Peak GPU Memory: {peak_mem:.2f} MB")

Mean Latency: 37.26 ms
P50 Latency: 37.02 ms
P95 Latency: 39.25 ms
FPS: 26.84
Peak GPU Memory: 1860.74 MB


In [None]:
# Depth-video export: run the model on every frame of VIDEO_PATH and write a
# colour-mapped depth visualisation, with a per-frame FPS overlay, to OUTPUT_PATH.
OUTPUT_PATH = "depth_output.mp4"
FALLBACK_FPS = 30.0  # used when the container does not report a frame rate

cap = cv2.VideoCapture(VIDEO_PATH)
if not cap.isOpened():
    raise RuntimeError(f"Could not open video file: {VIDEO_PATH}")

# CAP_PROP_FPS is 0 when the frame rate is unknown, which is not a valid
# frame rate for the writer.
orig_fps = cap.get(cv2.CAP_PROP_FPS)
if not orig_fps or orig_fps <= 0:
    orig_fps = FALLBACK_FPS

fourcc = cv2.VideoWriter_fourcc(*"mp4v")
writer = cv2.VideoWriter(
    OUTPUT_PATH,
    fourcc,
    orig_fps,
    (RESOLUTION, RESOLUTION)
)
if not writer.isOpened():
    cap.release()
    raise RuntimeError(f"Could not open video writer for: {OUTPUT_PATH}")

print("Starting inference...")

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        frame_resized = cv2.resize(frame, (RESOLUTION, RESOLUTION))
        rgb = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB)
        # Scale to [0, 1], then normalise with mean 0.5 / std 0.5 as the MiDaS
        # hub `dpt_transform` does for DPT models.
        rgb = (rgb / 255.0 - 0.5) / 0.5

        input_tensor = torch.from_numpy(rgb).permute(2,0,1).float().unsqueeze(0).to(DEVICE)

        # Drain pending GPU work so the timer covers only the forward pass.
        if DEVICE == "cuda":
            torch.cuda.synchronize()

        start = time.perf_counter()
        with torch.no_grad():
            depth = model(input_tensor)
        # CUDA kernels launch asynchronously: wait for them before stopping the clock.
        if DEVICE == "cuda":
            torch.cuda.synchronize()
        end = time.perf_counter()

        latency_ms = (end - start) * 1000
        # Guard against a zero interval on very coarse clocks.
        fps = 1000.0 / max(latency_ms, 1e-6)

        # Min-max normalise the prediction to 0..255 for display; the epsilon
        # avoids a division by zero on a constant depth map.
        depth = depth.squeeze().cpu().numpy()
        depth_min = depth.min()
        depth_max = depth.max()
        depth_vis = (depth - depth_min) / (depth_max - depth_min + 1e-8)
        depth_vis = (depth_vis * 255).astype(np.uint8)

        depth_color = cv2.applyColorMap(depth_vis, cv2.COLORMAP_INFERNO)

        # VideoWriter drops frames whose size differs from the size it was
        # opened with, so force the declared output size.
        if depth_color.shape[:2] != (RESOLUTION, RESOLUTION):
            depth_color = cv2.resize(depth_color, (RESOLUTION, RESOLUTION))

        # Overlay FPS
        cv2.putText(
            depth_color,
            f"FPS: {fps:.2f}",
            (10, 30),
            cv2.FONT_HERSHEY_SIMPLEX,
            1,
            (255, 255, 255),
            2
        )

        writer.write(depth_color)
finally:
    # Release both handles even if inference fails, so the output file is
    # finalised and the input is closed.
    cap.release()
    writer.release()

print("Saved:", OUTPUT_PATH)

Starting inference...
Saved: depth_output.mp4
