<a href="https://colab.research.google.com/github/softmurata/colab_notebooks/blob/main/multimodality/imagebind.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Installation

In [None]:
!git clone https://github.com/facebookresearch/ImageBind.git
!pip install git+https://github.com/facebookresearch/pytorchvideo.git@28fe037d212663c6a24f373b94cc5d478c8c1a1d timm==0.6.7 ftfy regex einops fvcore decord==0.6.0

Data Checker

In [None]:
# Work from the cloned repository root: the next cell opens the sample files
# through paths relative to it (".assets/...").
%cd /content/ImageBind

In [None]:
import os

import IPython
from PIL import Image

# Name of the sample to inspect; it selects the "<text>_image.jpg",
# "<text>_depth.jpg" and "<text>_audio.wav" files under .assets/.
text = 'bird'
display(Image.open(f".assets/{text}_image.jpg"))

# The depth map is written by the depth-estimation cells further down in this
# notebook, so on a fresh run it may not exist yet. Skip the preview instead
# of aborting the whole cell with FileNotFoundError.
depth_preview_path = f".assets/{text}_depth.jpg"
if os.path.exists(depth_preview_path):
    display(Image.open(depth_preview_path))
else:
    print(f"{depth_preview_path} not found - run the depth estimation cells below to create it.")

# Last expression of the cell, so the audio player is rendered as its output.
IPython.display.Audio(f".assets/{text}_audio.wav")

Inference

In [None]:
# Make sure the working directory is the repository root before running
# inference: the next cell uses relative ".assets/..." paths and imports the
# repository's modules (presumably resolved from the current directory).
%cd /content/ImageBind

In [None]:
import torch

# The repository is cloned without a pinned revision. Upstream checkouts that
# ship the code inside an `imagebind` package no longer expose root-level
# `data` / `models` modules, so try the packaged layout first and fall back to
# the root-level modules only when the package itself is absent.
try:
    from imagebind import data
    from imagebind.models import imagebind_model
    from imagebind.models.imagebind_model import ModalityType
except ModuleNotFoundError as exc:
    if exc.name != "imagebind":
        # A dependency of the package is missing; do not hide that error.
        raise
    import data
    from models import imagebind_model
    from models.imagebind_model import ModalityType

# Three matching samples per modality (dog, car, bird), in the same order.
text_list=["A dog.", "A car", "A bird"]
image_paths=[".assets/dog_image.jpg", ".assets/car_image.jpg", ".assets/bird_image.jpg"]
audio_paths=[".assets/dog_audio.wav", ".assets/car_audio.wav", ".assets/bird_audio.wav"]

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Instantiate model
model = imagebind_model.imagebind_huge(pretrained=True)
model.eval()
model.to(device)

# Load data
inputs = {
    ModalityType.TEXT: data.load_and_transform_text(text_list, device),
    ModalityType.VISION: data.load_and_transform_vision_data(image_paths, device),
    ModalityType.AUDIO: data.load_and_transform_audio_data(audio_paths, device),
}

# Inference only: no gradients are needed.
with torch.no_grad():
    embeddings = model(inputs)

# For each modality pair, print the softmax (over the last dimension) of the
# pairwise dot products between the two sets of embeddings.
# Assumes each embedding tensor is (num_samples, dim) - TODO confirm.
modality_pairs = (
    ("Vision x Text: ", ModalityType.VISION, ModalityType.TEXT),
    ("Audio x Text: ", ModalityType.AUDIO, ModalityType.TEXT),
    ("Vision x Audio: ", ModalityType.VISION, ModalityType.AUDIO),
)
for label, row_modality, col_modality in modality_pairs:
    print(
        label,
        torch.softmax(embeddings[row_modality] @ embeddings[col_modality].T, dim=-1),
    )


Advanced Application plan

In [None]:
!pip install transformers

In [1]:
# depth estimator
# https://huggingface.co/spaces/nielsr/dpt-depth-estimation/blob/main/app.py
from transformers import DPTFeatureExtractor, DPTForDepthEstimation
import torch
import numpy as np
from PIL import Image

In [None]:
# The preprocessor and the depth network are loaded from the same checkpoint.
# NOTE: this rebinds `model`, the name the Inference section used for the
# ImageBind network.
checkpoint = "Intel/dpt-large"
feature_extractor = DPTFeatureExtractor.from_pretrained(checkpoint)
model = DPTForDepthEstimation.from_pretrained(checkpoint)

In [3]:
# Sample to run depth estimation on; `text` is reused when the resulting depth
# map is saved as "<text>_depth.jpg".
text = "bird"
image_path = f"/content/ImageBind/.assets/{text}_image.jpg"
image = Image.open(image_path)

In [4]:
# Convert the PIL image into the tensor inputs passed to the DPT model.
encoding = feature_extractor(image, return_tensors="pt")

# Inference only: run the forward pass without tracking gradients.
with torch.no_grad():
    outputs = model(**encoding)
predicted_depth = outputs.predicted_depth

# Resize the prediction to the input image's resolution. PIL reports `size`
# as (width, height), hence the reversal to (height, width).
target_size = image.size[::-1]
upsampled = torch.nn.functional.interpolate(
    predicted_depth.unsqueeze(1),  # presumably (batch, H, W) -> (batch, 1, H, W)
    size=target_size,
    mode="bicubic",
    align_corners=False,
)
prediction = upsampled.squeeze()

# Rescale so the largest value maps to 255, then convert to an 8-bit image.
output = prediction.cpu().numpy()
formatted = (output * 255 / np.max(output)).astype('uint8')
img = Image.fromarray(formatted)

In [None]:
# Preview the depth map inline, then store it alongside the sample assets
# under the "<text>_depth.jpg" name.
depth_out_path = f"/content/ImageBind/.assets/{text}_depth.jpg"
display(img)
img.save(depth_out_path)

Cross modality

In [None]:
# Return to the repository root: the cross-modality cells below use relative
# ".assets/..." paths and import the repository's modules.
%cd /content/ImageBind

In [2]:
from torchvision import transforms
from PIL import Image
def load_and_transform_depth_data(depth_paths, device):
    """Load depth-map image files into one batched tensor on ``device``.

    Every file is opened as a single-channel ("L") image, resized with
    bicubic interpolation (``Resize(224)``), center-cropped to 224x224 and
    converted with ``ToTensor``. No normalization step is applied afterwards.

    Args:
        depth_paths: Iterable of paths to depth-map images, or ``None``.
        device: Device each per-image tensor is moved to before stacking.

    Returns:
        ``None`` when ``depth_paths`` is ``None``; otherwise the per-image
        tensors stacked along a new leading dimension, in input order.
    """
    if depth_paths is None:
        return None

    # The pipeline is the same for every image, so build it once instead of
    # once per loop iteration.
    data_transform = transforms.Compose(
        [
            transforms.Resize(
                224, interpolation=transforms.InterpolationMode.BICUBIC
            ),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
        ]
    )

    depth_outputs = []
    for depth_path in depth_paths:
        # convert() is called while the file is still open, inside the
        # `with` block.
        with open(depth_path, "rb") as fopen:
            image = Image.open(fopen).convert("L")

        depth_outputs.append(data_transform(image).to(device))
    return torch.stack(depth_outputs, dim=0)

In [3]:
import os

import torch

# The repository is cloned without a pinned revision. Upstream checkouts that
# ship the code inside an `imagebind` package no longer expose root-level
# `data` / `models` modules, so try the packaged layout first and fall back to
# the root-level modules only when the package itself is absent.
try:
    from imagebind import data
    from imagebind.models import imagebind_model
    from imagebind.models.imagebind_model import ModalityType
except ModuleNotFoundError as exc:
    if exc.name != "imagebind":
        # A dependency of the package is missing; do not hide that error.
        raise
    import data
    from models import imagebind_model
    from models.imagebind_model import ModalityType

# Matching samples per modality (dog, car, bird), in the same order.
text_list=["A dog.", "A car", "A bird"]
image_paths=[".assets/dog_image.jpg", ".assets/car_image.jpg", ".assets/bird_image.jpg"]
audio_paths=[".assets/dog_audio.wav", ".assets/car_audio.wav", ".assets/bird_audio.wav"]
depth_paths = [".assets/dog_depth.jpg", ".assets/car_depth.jpg", ".assets/bird_depth.jpg"]

# The depth-estimation cells above write a single "<text>_depth.jpg" per run,
# so some of these files may not exist yet. Check before loading the large
# model so the failure is immediate and says what to do.
missing_depth = [path for path in depth_paths if not os.path.exists(path)]
if missing_depth:
    raise FileNotFoundError(
        f"Missing depth maps: {missing_depth}. Run the depth estimation cells "
        "once for each sample name before running this cell."
    )

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Instantiate model (re-created here because `model` was rebound to the DPT
# depth estimator in the depth-estimation section).
model = imagebind_model.imagebind_huge(pretrained=True)
model.eval()
model.to(device)

# Load data
inputs = {
    ModalityType.TEXT: data.load_and_transform_text(text_list, device),
    ModalityType.VISION: data.load_and_transform_vision_data(image_paths, device),
    ModalityType.AUDIO: data.load_and_transform_audio_data(audio_paths, device),
    ModalityType.DEPTH: load_and_transform_depth_data(depth_paths, device),
}

# Inference only: no gradients are needed.
with torch.no_grad():
    embeddings = model(inputs)

# For each modality pair, print the softmax (over the last dimension) of the
# pairwise dot products between the two sets of embeddings.
# Assumes each embedding tensor is (num_samples, dim) - TODO confirm.
modality_pairs = (
    ("Vision x Depth: ", ModalityType.VISION, ModalityType.DEPTH),
    ("Text x Depth: ", ModalityType.TEXT, ModalityType.DEPTH),
    ("Depth x Audio: ", ModalityType.DEPTH, ModalityType.AUDIO),
)
for label, row_modality, col_modality in modality_pairs:
    print(
        label,
        torch.softmax(embeddings[row_modality] @ embeddings[col_modality].T, dim=-1),
    )



Vision x Depth:  tensor([[0.3444, 0.3040, 0.3516],
        [0.3451, 0.2363, 0.4186],
        [0.3517, 0.3634, 0.2849]], device='cuda:0')
Text x Depth:  tensor([[9.5571e-01, 4.4270e-02, 1.5210e-05],
        [5.6266e-01, 4.3734e-01, 9.7014e-10],
        [4.6230e-06, 1.0000e+00, 7.2704e-15]], device='cuda:0')
Depth x Audio:  tensor([[1.9618e-01, 1.4769e-02, 7.8905e-01],
        [1.5248e-02, 4.6171e-03, 9.8014e-01],
        [1.5896e-04, 1.8075e-02, 9.8177e-01]], device='cuda:0')
