In [None]:
import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)
from IPython.display import display
import torch
from PIL import Image
import coremltools as ct
from huggingface_hub import hf_hub_download
import tempfile
import matplotlib
import numpy as np
from torch import nn

## Patch CoreMLTools

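coremltools has no built-in converter for PyTorch's `upsample_bicubic2d` op, so we register our own (it approximates the bicubic upsampling with a bilinear resize).
The patch was written with the help of the following references: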
- https://github.com/pytorch/pytorch/blob/ac5f565fa7010bd77b9e779415e8709d347234b6/aten/src/ATen/native/UpSample.cpp#L10
- https://github.com/pytorch/pytorch/blob/ac5f565fa7010bd77b9e779415e8709d347234b6/aten/src/ATen/native/UpSampleBicubic2d.cpp#L284
- https://apple.github.io/coremltools/docs-guides/source/composite-operators.html#using-composite-ops-with-pytorch-conversion
- https://github.com/huggingface/exporters/blob/7a545974275c7af167a2fa4e16c4574359f2acec/src/exporters/coreml/models.py#L530


In [None]:
from coremltools.converters.mil.frontend.torch.torch_op_registry import register_torch_op
from coremltools.converters.mil.frontend.torch.ops import _get_inputs
from coremltools.converters.mil import Builder as mb

@register_torch_op
def upsample_bicubic2d(context, node):
    """Convert PyTorch's ``upsample_bicubic2d`` op, approximated by a bilinear resize.

    The target spatial size is computed from the op's scale factors (4th
    input) multiplied by the input's height and width (dims 2 and 3) and
    truncated to ``int``. When no static scale factors are available, the
    explicit output size (2nd input) is used instead.

    Args:
        context: Conversion context; the resulting MIL op is added to it.
        node: Torch node being converted; its name is reused for the result.

    Raises:
        ValueError: If neither static scale factors nor a static output size
            can be read from the node's inputs.
    """
    inputs = _get_inputs(context, node)

    x = inputs[0]
    # Input layout assumed from the `upsample_bicubic2d.vec` schema:
    # (input, output_size, align_corners, scale_factors) -- TODO confirm for
    # other overloads of the op.
    output_size = inputs[1] if len(inputs) > 1 else None
    scale_factors = inputs[3] if len(inputs) > 3 else None

    def _static_value(var):
        # Compile-time value of a converter input; None when absent/unknown.
        # Plain Python values (e.g. a list of Vars) are returned unchanged.
        return None if var is None else getattr(var, "val", var)

    scales = _static_value(scale_factors)
    if scales is not None:
        target_height = int(scales[0] * x.shape[2])
        target_width = int(scales[1] * x.shape[3])
    else:
        sizes = _static_value(output_size)
        dims = [] if sizes is None else [_static_value(s) for s in list(sizes)[:2]]
        if len(dims) != 2 or any(d is None for d in dims):
            raise ValueError(
                f"upsample_bicubic2d ({node.name}): need static scale factors "
                "or a static output size to compute the target size"
            )
        target_height, target_width = int(dims[0]), int(dims[1])

    # NOTE(review): bicubic is approximated by bilinear here, and the op's
    # align_corners flag (3rd input) is ignored in favour of resize_bilinear's
    # default sampling mode -- confirm this is accurate enough for the model.
    y = mb.resize_bilinear(
        x=x,
        target_size_height=target_height,
        target_size_width=target_width,
        name=node.name,
    )
    context.add(y)

## Import the DepthAnythingV2 model

The `depth_anything_v2` folder was copied directly from the official Hugging Face Space: https://huggingface.co/spaces/depth-anything/Depth-Anything-V2/tree/main/depth_anything_v2

For the import below to work, copy (or symlink) that folder into the same directory as this notebook.

In [None]:
from depth_anything_v2.dpt import DepthAnythingV2

In [None]:
# Load the sample picture and force a 3-channel RGB layout: the depth test cell
# further down reverses the last (channel) axis, which assumes exactly three
# colour channels (a grayscale/CMYK/RGBA file would otherwise break or mislead).
image = Image.open("cat_dog.jpg").convert("RGB")
image

## Setup
This part I ported from the huggingface DepthAnythingV2 repo: https://huggingface.co/spaces/depth-anything/Depth-Anything-V2/blob/main/app.py

Load the model and put it in `eval` mode.

Change the `encoder` to the one of your choice.

In [None]:
# Prefer Apple's Metal backend when available, otherwise run on the CPU.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# Constructor keyword arguments for DepthAnythingV2, keyed by encoder variant.
model_configs = {
    'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
    'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
    'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
    'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}
}

# Encoder variant -> name suffix used in the Hugging Face repo id (and in the
# exported .mlpackage file name at the end of the notebook).
encoder2name = {
    'vits': 'Small',
    'vitb': 'Base',
    'vitl': 'Large',
    'vitg': 'Giant', # UNAVAILABLE AS OF TODAY
}

encoder = 'vits'
model_name = encoder2name[encoder]
model = DepthAnythingV2(**model_configs[encoder])

# Download (or reuse the cached copy of) the checkpoint for the chosen encoder.
filepath = hf_hub_download(
    repo_id=f"depth-anything/Depth-Anything-V2-{model_name}",
    filename=f"depth_anything_v2_{encoder}.pth",
    repo_type="model",
)

# weights_only=True restricts unpickling to tensors/primitive containers, so a
# tampered checkpoint cannot execute arbitrary code while being loaded.
state_dict = torch.load(filepath, map_location="cpu", weights_only=True)
model.load_state_dict(state_dict)
model = model.to(device).eval()

Util function to predict depth without grad:

In [None]:
def predict_depth(image):
    with torch.no_grad():
        return model.infer_image(image)

Test that what we loaded works:

In [None]:
cmap = matplotlib.colormaps.get_cmap('Spectral_r')
input_image = np.array(image)
h, w = input_image.shape[:2]

# The channel axis is reversed before inference (RGB -> BGR for an RGB image).
depth = predict_depth(input_image[:, :, ::-1])

# Raw prediction cast to 16-bit integers and written to a temporary PNG.
raw_depth = Image.fromarray(depth.astype('uint16'))
tmp_raw_depth = tempfile.NamedTemporaryFile(suffix='.png', delete=False)
# Only the path is needed: close the handle right away so no descriptor leaks
# and so the file can be reopened by name on every platform.
tmp_raw_depth.close()
raw_depth.save(tmp_raw_depth.name)

# Min-max normalise to 0..255. A perfectly flat depth map has zero range, which
# would divide by zero; in that case divide by 1 so the result is all zeros.
depth_range = depth.max() - depth.min()
depth = (depth - depth.min()) / (depth_range if depth_range > 0 else 1.0) * 255.0
depth = depth.astype(np.uint8)

# Colour-mapped visualisation: drop the alpha channel and rescale to 8-bit RGB.
colored_depth = (cmap(depth)[:, :, :3] * 255).astype(np.uint8)
colored_depth_image = Image.fromarray(colored_depth)

# Grayscale visualisation, also written to a temporary PNG.
gray_depth = Image.fromarray(depth)
tmp_gray_depth = tempfile.NamedTemporaryFile(suffix='.png', delete=False)
tmp_gray_depth.close()
gray_depth.save(tmp_gray_depth.name)

In [None]:
# Show the grayscale depth map first, then the colour-mapped version.
display(gray_depth, colored_depth_image)

## Patch the model output

CoreMLTools wants tensors of rank 4 as output. However, DepthAnythingV2 returns a depth map of shape (1, H, W).
Hence, we wrap the model in a small post-processing step that adds the missing dimension.

In [None]:
class DepthModelWrapper(nn.Module):
    """Wrap a model so that its output gains a singleton axis at position 1.

    The wrapped module is stored as ``self.model``; ``forward`` calls it and
    returns the result with one extra dimension inserted at index 1.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, x):
        depth_map = self.model(x)
        # Insert the extra axis so a rank-3 result becomes rank 4.
        return torch.unsqueeze(depth_map, dim=1)

In [None]:
# Wrap the loaded model so its output gets the extra dimension added by
# DepthModelWrapper.forward (unsqueeze at index 1).
model_wrapper = DepthModelWrapper(model)

## Finally convert

Notice the shape of `input_tensor`: its width is 518 because that's the default input size to DepthAnythingV2.
The height can be any other value as long as it's a multiple of 14.
For simplicity, we make the input have an aspect ratio of 1.
Consequently, the output will have the same aspect ratio.

In [None]:
# Example input used for tracing: one 3-channel 518x518 tensor of random values,
# moved to the same device the model was placed on.
input_tensor = torch.randn(1, 3, 518, 518).to(device)
# Record the wrapper's forward pass on the example input; the traced module is
# what gets handed to coremltools in the conversion cell.
traceable_model = torch.jit.trace(model_wrapper, input_tensor)


In [None]:
# Set the image scale and bias for input image preprocessing
# Assuming ImageType applies `scale * pixel + bias` (TODO confirm against the
# coremltools docs), these values fold "divide by 255, subtract the per-channel
# mean, divide by the per-channel std" into the model's image input.
# `scale` is a single scalar, so it uses 0.226 -- the average of the three
# per-channel std values (0.229, 0.224, 0.225) that appear in `bias`.
scale = 1 / (0.226 * 255.0)
# Per-channel -mean/std for the R, G and B channels.
bias = [-0.485 / 0.229, -0.456 / 0.224, -0.406 / 0.225]

# Define the input and output types for the CoreML model
input_name = "input"
output_name = "output"
# The image input reuses the shape of the tensor the model was traced with.
input_type = ct.ImageType(name=input_name, shape=input_tensor.shape, scale=scale, bias=bias,
                          color_layout=ct.colorlayout.RGB)
# The depth map is exposed as a single-channel float16 image.
output_type = ct.ImageType(name=output_name, color_layout=ct.colorlayout.GRAYSCALE_FLOAT16)

# Convert the PyTorch model to CoreML
# If it gets stuck at this step: https://github.com/ggerganov/whisper.cpp/issues/773#issuecomment-1563324684
mlmodel = ct.convert(
    traceable_model,
    inputs=[input_type],
    outputs=[output_type],
    minimum_deployment_target=ct.target.iOS16,
)

# Save next to the notebook; the file name carries the selected model size.
mlmodel.save(f"DepthAnythingV2{model_name}.mlpackage")