In [2]:
import torch
import time
import numpy as np
import cv2
# uv pip install torch==2.10.0 torchvision==0.25.0 triton==3.6.0 --index-url https://download.pytorch.org/whl/cu128
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)
# from .autonotebook import tqdm as notebook_tqdm
torch.backends.cudnn.benchmark = True
import torch._dynamo
# torch._dynamo.config.suppress_errors = True

def compile_model_once(model=None, precision="fp16",backend="inductor",
                       compile_mode="reduce-overhead", fullgraph=False):

    if precision == "fp16":
        model = model.half()
        
    if hasattr(model, 'backbone') and hasattr(model.backbone, 'patch_embed'):
        stem_layer = model.backbone.patch_embed.backbone.stem
    # This tells the compiler: "When you hit this function, stop compiling, run it normally, and resume compilation afterward."
# torch._dynamo.disable(model.backbone.patch_embed.backbone.stem.forward)
    #decorate the forward pass of the stem to be skipped by Dynamo
        stem_layer.forward = torch._dynamo.disable(stem_layer.forward)
        print("Optimization disabled for ResNet stem to prevent LoweringException.")
        

    compiled_model = torch.compile(
        model,
        backend=backend, #aot_eager
        mode=compile_mode,
        dynamic=False
    )

    return compiled_model

In [3]:
def align_input_dtype(model, input_tensor):
    model_dtype = next(model.parameters()).dtype
    return input_tensor.to(dtype=model_dtype)

def preprocess_frame(frame, resolution, device):
    frame_resized = cv2.resize(frame, (resolution, resolution))
    rgb = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB)
    rgb = rgb / 255.0

    tensor = torch.from_numpy(rgb).permute(2,0,1).float().unsqueeze(0)
    return tensor.to(device)
def infer_image(compiled_model,
                image_path,
                transform_fn,
                device="cuda",
                warmup_runs=5,
                measure_runs=30,
                output_path="depth_output.png"):

    frame = cv2.imread(image_path)
    if frame is None:
        raise ValueError("Invalid image path")

    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    # input_tensor = transform_fn(img_rgb).to(device)
    input_tensor = preprocess_frame(frame, 384, device)
    input_tensor = align_input_dtype(compiled_model, input_tensor)

    # Warmup
    for _ in range(warmup_runs):
        with torch.no_grad():
            _ = compiled_model(input_tensor)

    torch.cuda.synchronize()
    torch.cuda.reset_peak_memory_stats()

    latencies = []

    for _ in range(measure_runs):

        torch.cuda.synchronize()
        start = time.time()

        with torch.no_grad():
            depth = compiled_model(input_tensor)

        torch.cuda.synchronize()
        end = time.time()

        latencies.append((end - start) * 1000)

    latencies = np.array(latencies)

    mean_latency = latencies.mean()
    fps = 1000.0 / mean_latency
    peak_mem = torch.cuda.max_memory_allocated() / (1024**2)

    # Save output
    depth_np = depth.squeeze().detach().cpu().numpy()
    depth_norm = (depth_np - depth_np.min()) / (depth_np.max() - depth_np.min() + 1e-6)
    depth_uint8 = (depth_norm * 255).astype(np.uint8)
    depth_color = cv2.applyColorMap(depth_uint8, cv2.COLORMAP_INFERNO)

    cv2.putText(depth_color,
                f"FPS: {fps:.2f}",
                (10, 30),
                cv2.FONT_HERSHEY_SIMPLEX,
                1,
                (255, 255, 255),
                2)

    cv2.imwrite(output_path, depth_color)

    print("------ IMAGE INFERENCE ------")
    print(f"Mean Latency: {mean_latency:.2f} ms")
    print(f"P95 Latency: {np.percentile(latencies,95):.2f} ms")
    print(f"FPS: {fps:.2f}")
    print(f"Peak Memory: {peak_mem:.2f} MB")

    return {
        "MeanLatency_ms": float(mean_latency),
        "FPS": float(fps),
        "PeakMemory_MB": float(peak_mem)
    }

In [4]:
def depth_to_colormap(depth_tensor):
    depth = depth_tensor.squeeze().detach().cpu().numpy()
    dmin, dmax = float(depth.min()), float(depth.max())
    depth_norm = (depth - dmin) / (dmax - dmin + 1e-6)
    depth_uint8 = (depth_norm * 255).astype(np.uint8)
    depth_color = cv2.applyColorMap(depth_uint8, cv2.COLORMAP_INFERNO)
    return depth_color

In [5]:
def infer_video(compiled_model,
                video_path,
                transform_fn,
                device="cuda",
                warmup_frames=5,
                measure_frames=100,
                output_path="depth_video_output.mp4"):

    cap = cv2.VideoCapture(video_path)
    orig_fps = cap.get(cv2.CAP_PROP_FPS)

    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    writer = cv2.VideoWriter(output_path, fourcc,
                             orig_fps if orig_fps > 0 else 30,
                             (384, 384))

    frame_count = 0
    latencies = []

    torch.cuda.reset_peak_memory_stats()

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        frame = cv2.resize(frame, (384, 384))
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame = frame / 255.0
        frame = torch.from_numpy(frame).permute(2,0,1).float().unsqueeze(0).to(device)
        if next(compiled_model.parameters()).dtype == torch.float16:
                frame = frame.half()
        if frame_count < warmup_frames:
            with torch.no_grad():
                _ = compiled_model(frame)
            frame_count += 1
            continue

        torch.cuda.synchronize()
        start = time.time()

        with torch.no_grad():
            depth = compiled_model(frame)

        torch.cuda.synchronize()
        end = time.time()

        latency_ms = (end - start) * 1000
        latencies.append(latency_ms)

        # Visualization
        depth_np = depth.squeeze().detach().cpu().numpy()
        depth_norm = (depth_np - depth_np.min()) / (depth_np.max() - depth_np.min() + 1e-6)
        depth_uint8 = (depth_norm * 255).astype(np.uint8)
        depth_color = cv2.applyColorMap(depth_uint8, cv2.COLORMAP_INFERNO)

        fps = 1000.0 / latency_ms

        cv2.putText(depth_color,
                    f"FPS: {fps:.2f}",
                    (10, 30),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    1,
                    (255, 255, 255),
                    2)

        writer.write(depth_color)

        frame_count += 1
        if frame_count >= warmup_frames + measure_frames:
            break

    cap.release()
    writer.release()

    latencies = np.array(latencies)
    mean_latency = latencies.mean()
    fps = 1000.0 / mean_latency
    peak_mem = torch.cuda.max_memory_allocated() / (1024**2)

    print("------ VIDEO INFERENCE ------")
    print(f"Mean Latency: {mean_latency:.2f} ms")
    print(f"P95 Latency: {np.percentile(latencies,95):.2f} ms")
    print(f"FPS: {fps:.2f}")
    print(f"Peak Memory: {peak_mem:.2f} MB")

    return {
        "MeanLatency_ms": float(mean_latency),
        "FPS": float(fps),
        "PeakMemory_MB": float(peak_mem)
    }

In [9]:
model = torch.hub.load("intel-isl/MiDaS", "DPT_Hybrid")
model = model.to("cuda").eval()
midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
compiled_model = compile_model_once(
    model = model,
    precision="fp16",
    backend="inductor", #eager, aot-eager
    compile_mode = "reduce-overhead" #max-autotune
)

results_img = infer_image(
    compiled_model,
    "Images/people.jpg",
    midas_transforms.dpt_transform,
    output_path="depth_compile_f16.png"
)

Using cache found in /home/RUS_CIP/st189432/.cache/torch/hub/intel-isl_MiDaS_master
Using cache found in /home/RUS_CIP/st189432/.cache/torch/hub/intel-isl_MiDaS_master


------ IMAGE INFERENCE ------
Mean Latency: 9.31 ms
P95 Latency: 9.65 ms
FPS: 107.41
Peak Memory: 269.17 MB


In [None]:
# results_img = infer_video(
#     compiled_model,
#     "/home/RUS_CIP/st189432/MasterThesis/ddacs/Monocular-Depth-ViT-Optimization/test.mov",
#     midas_transforms.dpt_transform,
#     output_path="depth_compile_f16.avi"
# )

------ VIDEO INFERENCE ------
Mean Latency: 10.05 ms
P95 Latency: 10.41 ms
FPS: 99.48
Peak Memory: 919.28 MB


## Compilation error due to custom padding logic in MiDas architecture

```
InductorError: LoweringException: AssertionError: 
  target: aten.convolution.default
  args[0]: TensorBox(StorageBox(
    ComputedBuffer(name='buf3', layout=FixedLayout('cuda:0', torch.float32, size=[1, 3, s53 + Max(0, -s53 + 2*CeilToInt(IntTrueDiv(s53, 2)) + 5), s0 + Max(0, -s0 + 2*CeilToInt(IntTrueDiv(s0, 2)) + 5)], stride=[3*(s0 + Max(0, -s0 + 2*CeilToInt(IntTrueDiv(s0, 2)) + 5))*(s53 + Max(0, -s53 + 2*CeilToInt(IntTrueDiv(s53, 2)) + 5)), (s0 + Max(0, -s0 + 2*CeilToInt(IntTrueDiv(s0, 2)) + 5))*(s53 + Max(0, -s53 + 2*CeilToInt(IntTrueDiv(s53, 2)) + 5)), s0 + Max(0, -s0 + 2*CeilToInt(IntTrueDiv(s0, 2)) + 5), 1]), data=Pointwise(
      'cuda',
      torch.float32,
      def inner_fn(index):
          _, i1, i2, i3 = index
          tmp0 = ops.index_expr(i2 - ps0, torch.int64)
          tmp1 = ops.index_expr(0, torch.int64)
          tmp2 = tmp0 >= tmp1
          tmp3 = ops.index_expr(i2 - ps0, torch.int64)
          tmp4 = ops.index_expr(s53, torch.int64)
          tmp5 = tmp3 < tmp4
          tmp6 = ops.index_expr(i3 - ps1, torch.int64)
          tmp7 = ops.index_expr(0, torch.int64)
          tmp8 = tmp6 >= tmp7
          tmp9 = ops.index_expr(i3 - ps1, torch.int64)
          tmp10 = ops.index_expr(s0, torch.int64)
          tmp11 = tmp9 < tmp10
          tmp12 = tmp2 & tmp5
          tmp13 = tmp12 & tmp8
          tmp14 = tmp13 & tmp11
          tmp15 = ops.load(arg2_1, i1 + -3 * ps1 + 3 * i3 + 3 * s0 * (i2 - ps0))
          tmp16 = ops.masked(tmp14, tmp15, 0.0)
          return tmp16
      ,
      ranges=[1, 3, s53 + Max(0, -s53 + 2*CeilToInt(IntTrueDiv(s53, 2)) + 5), s0 + Max(0, -s0 + 2*CeilToInt(IntTrueDiv(s0, 2)) + 5)],
      origin_node=None,
      origins=OrderedSet([convolution, constant_pad_nd, view_5, mul...,
```

In [10]:

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
RESOLUTION = 384
WARMUP_FRAMES = 20
MEASURE_FRAMES = 200
dummy_input = torch.randn(1, 3, RESOLUTION, RESOLUTION).to(DEVICE)
with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA, # Only include if CUDA is available
    ],
    schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=2),
) as prof:
    for i in range(10):
        with torch.no_grad():
            dummy_input = align_input_dtype(compiled_model, dummy_input)
            compiled_model(dummy_input)
        prof.step()
print("Profiler run complete. Printing summary...")
print("-" * 50)
print(prof.key_averages().table(sort_by="flops", row_limit=20))

Profiler run complete. Printing summary...
--------------------------------------------------
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                          ProfilerStep*         0.43%     184.005us         9.56%       4.054ms       1.351ms       0.000us         0.00%      27.022ms       9.007ms             3  
                                               aten::to         0.01%       2.946us         0.01%       2.946us  