In [None]:
import torch
import cv2
import numpy as np
import time

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
VIDEO_PATH = "test.mov"
RESOLUTION = 384
WARMUP_FRAMES = 20
MEASURE_FRAMES = 200

model_32 = torch.hub.load("intel-isl/MiDaS", "DPT_Hybrid")
model_base = torch.hub.load("intel-isl/MiDaS", "DPT_Hybrid")
dummy_input = torch.randn(1, 3, RESOLUTION, RESOLUTION).to(DEVICE)

In [7]:
def prepare_model(model, device="cuda", precision="fp32"):
    model = model.to(device)

    if precision == "fp16":
        model = model.half()

    return model.eval()

def build_infer_fn(model, device="cuda", precision="fp32"):

    if precision == "fp32":
        def infer(input_tensor):
            with torch.no_grad():
                return model(input_tensor)

    elif precision == "amp":
        def infer(input_tensor):
            with torch.no_grad():
                with torch.amp.autocast('cuda',dtype=torch.float16):
                    return model(input_tensor)

    elif precision == "fp16":
        def infer(input_tensor):
            with torch.no_grad():
                return model(input_tensor.half())

    else:
        raise ValueError("Unsupported precision")

    return infer

def timed_inference(infer_fn, input_tensor, device="cuda"):

    if device == "cuda":
        torch.cuda.synchronize()

    start = time.time()
    output = infer_fn(input_tensor)
    if device == "cuda":
        torch.cuda.synchronize()
    end = time.time()

    latency_ms = (end - start) * 1000
    return output.float(), latency_ms

def compute_error_metrics(ref, pred):
    error = torch.abs(ref - pred)

    mae = error.mean().item()
    max_err = error.max().item()
    rel_err = (error / (ref.abs() + 1e-6)).mean().item()

    return {
        "MAE": mae,
        "MaxError": max_err,
        "RelativeError": rel_err
    }
    
def depth_to_colormap(depth_tensor):
    depth = depth_tensor.squeeze().cpu().numpy()
    depth_norm = (depth - depth.min()) / (depth.max() - depth.min() + 1e-6)
    depth_uint8 = (depth_norm * 255).astype(np.uint8)
    depth_color = cv2.applyColorMap(depth_uint8, cv2.COLORMAP_INFERNO)
    return depth_color

def overlay_fps(image, fps):
    cv2.putText(
        image,
        f"FPS: {fps:.2f}",
        (10, 30),
        cv2.FONT_HERSHEY_SIMPLEX,
        1,
        (255, 255, 255),
        2
    )
    return image

def preprocess_frame(frame, resolution, device):
    frame_resized = cv2.resize(frame, (resolution, resolution))
    rgb = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB)
    rgb = rgb / 255.0

    tensor = torch.from_numpy(rgb).permute(2,0,1).float().unsqueeze(0)
    return tensor.to(device)

def infer_image(model,
                image_path,
                resolution=384,
                device="cuda",
                precision="fp32",
                warmup_runs=5,
                measure_runs=10,
                output_path=None,
                save=False):
    latencies = []
    model = prepare_model(model, device, precision)
    infer_fn = build_infer_fn(model, device, precision)

    frame = cv2.imread(image_path)
    if frame is None:
        raise ValueError("Invalid image path")

    input_tensor = preprocess_frame(frame, resolution, device)

    if device == "cuda":
        torch.cuda.reset_peak_memory_stats()

    for _ in range(warmup_runs):
        _ = timed_inference(infer_fn, input_tensor, device)

    for _ in range(measure_runs):
        depth, latency_ms = timed_inference(infer_fn, input_tensor, device)
        latencies.append(latency_ms)
    latencies = np.array(latencies)

    results = {
        "MeanLatency_ms": latencies.mean(),
        "P50_ms": np.percentile(latencies, 50),
        "P95_ms": np.percentile(latencies, 95),
        "FPS": 1000.0 / latencies.mean()
    }

    if save:
        depth_color = depth_to_colormap(depth)
        fps = results["FPS"]
        depth_color = overlay_fps(depth_color, fps)
        cv2.imwrite(output_path, depth_color)

    return results, depth.detach().cpu()

def infer_video(model,
                video_path,
                resolution=384,
                device="cuda",
                precision="fp32",
                warmup_frames=5,
                measure_frames=50,
                output_path=None,
                save=False):

    model = prepare_model(model, device, precision)
    infer_fn = build_infer_fn(model, device, precision)

    cap = cv2.VideoCapture(video_path)
    orig_fps = cap.get(cv2.CAP_PROP_FPS)

    if save:
        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        writer = cv2.VideoWriter(
            output_path,
            fourcc,
            orig_fps if orig_fps > 0 else 30,
            (resolution, resolution)
        )

    latencies = []
    frame_count = 0

    if device == "cuda":
        torch.cuda.reset_peak_memory_stats()

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        input_tensor = preprocess_frame(frame, resolution, device)

        depth, latency_ms = timed_inference(infer_fn, input_tensor, device)

        if frame_count >= warmup_frames:
            latencies.append(latency_ms)

            if save:
                depth_color = depth_to_colormap(depth)
                fps = 1000.0 / latency_ms
                depth_color = overlay_fps(depth_color, fps)
                writer.write(depth_color)

        frame_count += 1

        if frame_count >= (warmup_frames + measure_frames):
            break

    cap.release()
    if save:
        writer.release()

    latencies = np.array(latencies)

    results = {
        "MeanLatency_ms": latencies.mean(),
        "P50_ms": np.percentile(latencies, 50),
        "P95_ms": np.percentile(latencies, 95),
        "FPS": 1000.0 / latencies.mean()
    }

    return results

def run_inference(model,
                  source,
                  resolution=384,
                  device="cuda",
                  precision="fp32",
                  output_path=None,
                  save=False):

    if source.lower().endswith((".mp4", ".avi", ".mov", ".mkv")):
        return infer_video(
            model,
            source,
            resolution=resolution,
            device=device,
            precision=precision,
            output_path=output_path,
            save=save
        )
    else:
        return infer_image(
            model,
            source,
            resolution=resolution,
            device=device,
            precision=precision,
            output_path=output_path,
            save=save
        )


In [8]:
fp32_results, fp32_outputs = run_inference(
    model_32,
    "people.jpg",
    output_path="depth_image.png",
    precision="fp32",
    save=True
)
print(fp32_results)

{'MeanLatency_ms': np.float64(33.47959518432617), 'P50_ms': np.float64(33.13398361206055), 'P95_ms': np.float64(34.45504903793335), 'FPS': np.float64(29.868939409045204)}


In [9]:
fp16_results, fp16_outputs = run_inference(
    model_base,
    "people.jpg",
    output_path="depth_image_16.png",
    precision="fp16",
    save=True
)
print(fp16_results)

{'MeanLatency_ms': np.float64(15.43724536895752), 'P50_ms': np.float64(15.418410301208496), 'P95_ms': np.float64(15.588390827178955), 'FPS': np.float64(64.77839641072767)}


In [11]:
amp_results, amp_outputs = run_inference(
    model_base,
    "people.jpg",
    output_path="depth_image_amp.png",
    precision="amp",
    save=True
)
print(amp_results)

{'MeanLatency_ms': np.float64(22.405457496643066), 'P50_ms': np.float64(22.082209587097168), 'P95_ms': np.float64(25.071918964385986), 'FPS': np.float64(44.631983084916996)}


In [13]:
error = compute_error_metrics(fp32_outputs, fp16_outputs)
print(error)

{'MAE': 0.596645712852478, 'MaxError': 5.41015625, 'RelativeError': 0.0006504552438855171}


In [14]:
error = compute_error_metrics(fp32_outputs, amp_outputs)
print(error)

{'MAE': 0.596645712852478, 'MaxError': 5.41015625, 'RelativeError': 0.0006504552438855171}
