In [None]:
import torch
import time
import numpy as np
import cv2
# uv pip install torch==2.10.0 torchvision==0.25.0 triton==3.6.0 --index-url https://download.pytorch.org/whl/cu128
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)
# from .autonotebook import tqdm as notebook_tqdm
torch.backends.cudnn.benchmark = True
import torch._dynamo
# torch._dynamo.config.suppress_errors = True

def compile_model_once(device="cuda", precision="fp16",
                       compile_mode="reduce-overhead", fullgraph=False):

    model = torch.hub.load("intel-isl/MiDaS", "DPT_Hybrid")
    model = model.to(device).eval()

    if precision == "fp16":
        model = model.half()
        
    if hasattr(model, 'backbone') and hasattr(model.backbone, 'patch_embed'):
        stem_layer = model.backbone.patch_embed.backbone.stem
    # This tells the compiler: "When you hit this function, stop compiling, run it normally, and resume compilation afterward."
# torch._dynamo.disable(model.backbone.patch_embed.backbone.stem.forward)
    #decorate the forward pass of the stem to be skipped by Dynamo
        stem_layer.forward = torch._dynamo.disable(stem_layer.forward)
        print("Optimization disabled for ResNet stem to prevent LoweringException.")
        

    compiled_model = torch.compile(
        model,
        backend="aot_eager",
        mode=compile_mode,
        dynamic=True
    )

    return compiled_model

In [None]:
def align_input_dtype(model, input_tensor):
    model_dtype = next(model.parameters()).dtype
    return input_tensor.to(dtype=model_dtype)

def preprocess_frame(frame, resolution, device):
    frame_resized = cv2.resize(frame, (resolution, resolution))
    rgb = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB)
    rgb = rgb / 255.0

    tensor = torch.from_numpy(rgb).permute(2,0,1).float().unsqueeze(0)
    return tensor.to(device)
def infer_image(compiled_model,
                image_path,
                transform_fn,
                device="cuda",
                warmup_runs=5,
                measure_runs=30,
                output_path="depth_output.png"):

    frame = cv2.imread(image_path)
    if frame is None:
        raise ValueError("Invalid image path")

    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    # input_tensor = transform_fn(img_rgb).to(device)
    input_tensor = preprocess_frame(frame, 384, device)
    input_tensor = align_input_dtype(compiled_model, input_tensor)

    # Warmup
    for _ in range(warmup_runs):
        with torch.no_grad():
            _ = compiled_model(input_tensor)

    torch.cuda.synchronize()
    torch.cuda.reset_peak_memory_stats()

    latencies = []

    for _ in range(measure_runs):

        torch.cuda.synchronize()
        start = time.time()

        with torch.no_grad():
            depth = compiled_model(input_tensor)

        torch.cuda.synchronize()
        end = time.time()

        latencies.append((end - start) * 1000)

    latencies = np.array(latencies)

    mean_latency = latencies.mean()
    fps = 1000.0 / mean_latency
    peak_mem = torch.cuda.max_memory_allocated() / (1024**2)

    # Save output
    depth_np = depth.squeeze().detach().cpu().numpy()
    depth_norm = (depth_np - depth_np.min()) / (depth_np.max() - depth_np.min() + 1e-6)
    depth_uint8 = (depth_norm * 255).astype(np.uint8)
    depth_color = cv2.applyColorMap(depth_uint8, cv2.COLORMAP_INFERNO)

    cv2.putText(depth_color,
                f"FPS: {fps:.2f}",
                (10, 30),
                cv2.FONT_HERSHEY_SIMPLEX,
                1,
                (255, 255, 255),
                2)

    cv2.imwrite(output_path, depth_color)

    print("------ IMAGE INFERENCE ------")
    print(f"Mean Latency: {mean_latency:.2f} ms")
    print(f"P95 Latency: {np.percentile(latencies,95):.2f} ms")
    print(f"FPS: {fps:.2f}")
    print(f"Peak Memory: {peak_mem:.2f} MB")

    return {
        "MeanLatency_ms": float(mean_latency),
        "FPS": float(fps),
        "PeakMemory_MB": float(peak_mem)
    }

In [12]:
def depth_to_colormap(depth_tensor):
    depth = depth_tensor.squeeze().detach().cpu().numpy()
    dmin, dmax = float(depth.min()), float(depth.max())
    depth_norm = (depth - dmin) / (dmax - dmin + 1e-6)
    depth_uint8 = (depth_norm * 255).astype(np.uint8)
    depth_color = cv2.applyColorMap(depth_uint8, cv2.COLORMAP_INFERNO)
    return depth_color


def run_inference(
    compiled_model,
    source,
    transform_fn,
    device="cuda",
    warmup_runs=5,
    measure_runs=30,
    save=False,
    output_path=None
):
    """
    Runs inference on compiled model.

    Parameters:
        compiled_model : torch.compile wrapped model
        source         : image path or video path
        transform_fn   : MiDaS transform function
        warmup_runs    : warmup iterations
        measure_runs   : measured iterations
    """

    assert compiled_model is not None, "Model must be pre-compiled"
    compiled_model.eval()

    is_video = source.lower().endswith((".mp4", ".avi", ".mov", ".mkv"))

    latencies = []
    outputs = []

    if is_video:
        cap = cv2.VideoCapture(source)
        orig_fps = cap.get(cv2.CAP_PROP_FPS)

        if save:
            fourcc = cv2.VideoWriter_fourcc(*"mp4v")
            writer = cv2.VideoWriter(
                output_path,
                fourcc,
                orig_fps if orig_fps > 0 else 30,
                (int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
                 int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))
            )
    else:
        frame = cv2.imread(source)
        if frame is None:
            raise ValueError("Invalid image path")

        img_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        input_tensor = transform_fn(img_rgb).to(device)

    # -----------------------------
    # Warmup phase
    # -----------------------------
    if not is_video:
        for _ in range(warmup_runs):
            with torch.no_grad():
                _ = compiled_model(input_tensor)
    else:
        count = 0
        while count < warmup_runs:
            ret, frame = cap.read()
            if not ret:
                break
            img_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            input_tensor = transform_fn(img_rgb).to(device)
            if next(compiled_model.parameters()).dtype == torch.float16:
                input_tensor = input_tensor.half()
            with torch.no_grad():
                _ = compiled_model(input_tensor)
            count += 1

    torch.cuda.synchronize()

    # -----------------------------
    # Measurement phase
    # -----------------------------
    torch.cuda.reset_peak_memory_stats()

    if not is_video:
        for _ in range(measure_runs):

            torch.cuda.synchronize()
            start = time.time()

            with torch.no_grad():
                output = compiled_model(input_tensor)

            torch.cuda.synchronize()
            end = time.time()

            latencies.append((end - start) * 1000)
            outputs.append(output.detach().cpu())

            if save:
                depth_color = depth_to_colormap(output)
                fps = 1000.0 / latencies[-1]
                depth_color = overlay_fps(depth_color, fps)
                cv2.imwrite(output_path, depth_color)

    else:
        count = 0
        while count < measure_runs:
            ret, frame = cap.read()
            if not ret:
                break

            img_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            input_tensor = transform_fn(img_rgb).to(device)

            torch.cuda.synchronize()
            start = time.time()

            with torch.no_grad():
                output = compiled_model(input_tensor)

            torch.cuda.synchronize()
            end = time.time()

            latency_ms = (end - start) * 1000
            latencies.append(latency_ms)

            if save:
                depth_color = depth_to_colormap(output)
                fps = 1000.0 / latency_ms
                depth_color = overlay_fps(depth_color, fps)
                writer.write(depth_color)

            count += 1

        cap.release()
        if save:
            writer.release()

    latencies = np.array(latencies)

    results = {
        "MeanLatency_ms": float(latencies.mean()),
        "P50_ms": float(np.percentile(latencies, 50)),
        "P95_ms": float(np.percentile(latencies, 95)),
        "FPS": float(1000.0 / latencies.mean()),
        "PeakMemory_MB": float(torch.cuda.max_memory_allocated() / (1024**2))
    }

    return results, outputs

In [13]:
def infer_video(compiled_model,
                video_path,
                transform_fn,
                device="cuda",
                warmup_frames=5,
                measure_frames=100,
                output_path="depth_video_output.mp4"):

    cap = cv2.VideoCapture(video_path)
    orig_fps = cap.get(cv2.CAP_PROP_FPS)

    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    writer = cv2.VideoWriter(output_path, fourcc,
                             orig_fps if orig_fps > 0 else 30,
                             (width, height))

    frame_count = 0
    latencies = []

    torch.cuda.reset_peak_memory_stats()

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        img_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        input_tensor = transform_fn(img_rgb).to(device)
        input_tensor = align_input_dtype(compiled_model, input_tensor)

        if frame_count < warmup_frames:
            with torch.no_grad():
                _ = compiled_model(input_tensor)
            frame_count += 1
            continue

        torch.cuda.synchronize()
        start = time.time()

        with torch.no_grad():
            depth = compiled_model(input_tensor)

        torch.cuda.synchronize()
        end = time.time()

        latency_ms = (end - start) * 1000
        latencies.append(latency_ms)

        # Visualization
        depth_np = depth.squeeze().detach().cpu().numpy()
        depth_norm = (depth_np - depth_np.min()) / (depth_np.max() - depth_np.min() + 1e-6)
        depth_uint8 = (depth_norm * 255).astype(np.uint8)
        depth_color = cv2.applyColorMap(depth_uint8, cv2.COLORMAP_INFERNO)

        fps = 1000.0 / latency_ms

        cv2.putText(depth_color,
                    f"FPS: {fps:.2f}",
                    (10, 30),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    1,
                    (255, 255, 255),
                    2)

        writer.write(depth_color)

        frame_count += 1
        if frame_count >= warmup_frames + measure_frames:
            break

    cap.release()
    writer.release()

    latencies = np.array(latencies)
    mean_latency = latencies.mean()
    fps = 1000.0 / mean_latency
    peak_mem = torch.cuda.max_memory_allocated() / (1024**2)

    print("------ VIDEO INFERENCE ------")
    print(f"Mean Latency: {mean_latency:.2f} ms")
    print(f"P95 Latency: {np.percentile(latencies,95):.2f} ms")
    print(f"FPS: {fps:.2f}")
    print(f"Peak Memory: {peak_mem:.2f} MB")

    return {
        "MeanLatency_ms": float(mean_latency),
        "FPS": float(fps),
        "PeakMemory_MB": float(peak_mem)
    }

In [None]:
compiled_model = compile_model_once(
    device="cuda"
)
midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
results_img = infer_image(
    compiled_model,
    "Images/people.jpg",
    midas_transforms.dpt_transform
)

Using cache found in /home/RUS_CIP/st189432/.cache/torch/hub/intel-isl_MiDaS_master
Using cache found in /home/RUS_CIP/st189432/.cache/torch/hub/intel-isl_MiDaS_master
W0220 14:58:04.777000 2457723 torch/_dynamo/backends/common.py:53] [3/2_1] aot_autograd-based backend ignoring extra kwargs {'mode': 'reduce-overhead'}
W0220 14:58:25.433000 2457723 torch/_dynamo/backends/common.py:53] [4/3] aot_autograd-based backend ignoring extra kwargs {'mode': 'reduce-overhead'}
W0220 14:59:00.341000 2457723 torch/_dynamo/backends/common.py:53] [4/3_1] aot_autograd-based backend ignoring extra kwargs {'mode': 'reduce-overhead'}
W0220 14:59:35.867000 2457723 torch/_dynamo/backends/common.py:53] [5/0] aot_autograd-based backend ignoring extra kwargs {'mode': 'reduce-overhead'}
W0220 14:59:37.023000 2457723 torch/_dynamo/backends/common.py:53] [6/0] aot_autograd-based backend ignoring extra kwargs {'mode': 'reduce-overhead'}
W0220 14:59:38.381000 2457723 torch/_dynamo/backends/common.py:53] [7/0] aot_a

------ IMAGE INFERENCE ------
Mean Latency: 25.99 ms
P95 Latency: 26.77 ms
FPS: 38.48
Peak Memory: 1127.28 MB


In [17]:
results_img = infer_image(
    compiled_model,
    "Images/people.jpg",
    midas_transforms.dpt_transform
)

------ IMAGE INFERENCE ------
Mean Latency: 25.86 ms
P95 Latency: 28.81 ms
FPS: 38.68
Peak Memory: 1127.28 MB


In [None]:
compiled_model = compile_model_once(
    device="cuda",
    precision="fp16"
)
midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
results_img = infer_image(
    compiled_model,
    "Images/people.jpg",
    midas_transforms.dpt_transform
)

## Compilation error due to custom padding logic in MiDas architecture

```
InductorError: LoweringException: AssertionError: 
  target: aten.convolution.default
  args[0]: TensorBox(StorageBox(
    ComputedBuffer(name='buf3', layout=FixedLayout('cuda:0', torch.float32, size=[1, 3, s53 + Max(0, -s53 + 2*CeilToInt(IntTrueDiv(s53, 2)) + 5), s0 + Max(0, -s0 + 2*CeilToInt(IntTrueDiv(s0, 2)) + 5)], stride=[3*(s0 + Max(0, -s0 + 2*CeilToInt(IntTrueDiv(s0, 2)) + 5))*(s53 + Max(0, -s53 + 2*CeilToInt(IntTrueDiv(s53, 2)) + 5)), (s0 + Max(0, -s0 + 2*CeilToInt(IntTrueDiv(s0, 2)) + 5))*(s53 + Max(0, -s53 + 2*CeilToInt(IntTrueDiv(s53, 2)) + 5)), s0 + Max(0, -s0 + 2*CeilToInt(IntTrueDiv(s0, 2)) + 5), 1]), data=Pointwise(
      'cuda',
      torch.float32,
      def inner_fn(index):
          _, i1, i2, i3 = index
          tmp0 = ops.index_expr(i2 - ps0, torch.int64)
          tmp1 = ops.index_expr(0, torch.int64)
          tmp2 = tmp0 >= tmp1
          tmp3 = ops.index_expr(i2 - ps0, torch.int64)
          tmp4 = ops.index_expr(s53, torch.int64)
          tmp5 = tmp3 < tmp4
          tmp6 = ops.index_expr(i3 - ps1, torch.int64)
          tmp7 = ops.index_expr(0, torch.int64)
          tmp8 = tmp6 >= tmp7
          tmp9 = ops.index_expr(i3 - ps1, torch.int64)
          tmp10 = ops.index_expr(s0, torch.int64)
          tmp11 = tmp9 < tmp10
          tmp12 = tmp2 & tmp5
          tmp13 = tmp12 & tmp8
          tmp14 = tmp13 & tmp11
          tmp15 = ops.load(arg2_1, i1 + -3 * ps1 + 3 * i3 + 3 * s0 * (i2 - ps0))
          tmp16 = ops.masked(tmp14, tmp15, 0.0)
          return tmp16
      ,
      ranges=[1, 3, s53 + Max(0, -s53 + 2*CeilToInt(IntTrueDiv(s53, 2)) + 5), s0 + Max(0, -s0 + 2*CeilToInt(IntTrueDiv(s0, 2)) + 5)],
      origin_node=None,
      origins=OrderedSet([convolution, constant_pad_nd, view_5, mul...,
```

In [13]:

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
RESOLUTION = 384
WARMUP_FRAMES = 20
MEASURE_FRAMES = 200
dummy_input = torch.randn(1, 3, RESOLUTION, RESOLUTION).to(DEVICE)
with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA, # Only include if CUDA is available
    ],
    schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=2),
) as prof:
    for i in range(10):
        with torch.no_grad():
            dummy_input = align_input_dtype(compiled_model, dummy_input)
            compiled_model(dummy_input)
        prof.step()
print("Profiler run complete. Printing summary...")
print("-" * 50)
print(prof.key_averages().table(sort_by="flops", row_limit=20))

  super().capture_end()
  super().capture_end()
  super().capture_end()
STAGE:2026-02-20 11:16:33 2354873:2354873 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2026-02-20 11:16:33 2354873:2354873 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2026-02-20 11:16:33 2354873:2354873 ActivityProfilerController.cpp:324] Completed Stage: Post Processing
STAGE:2026-02-20 11:16:33 2354873:2354873 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2026-02-20 11:16:34 2354873:2354873 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2026-02-20 11:16:34 2354873:2354873 ActivityProfilerController.cpp:324] Completed Stage: Post Processing


Profiler run complete. Printing summary...
--------------------------------------------------
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
fmha_cutlassF_f32_aligned_64x64_rf_sm75(PyTorchMemEf...         0.00%       0.000us         0.00%       0.000us       0.000us      10.265ms         9.11%      10.265ms     244.405us            42  
                                          ProfilerStep*         0.45%     525.000us        60.39%      70.089ms  