<a href="https://colab.research.google.com/github/pouyan6/ireal/blob/main/iReal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip uninstall clip -y
!pip install git+https://github.com/openai/CLIP.git

In [None]:
!pip install ftfy regex tqdm

# Import libs

In [9]:
import torch
import clip
from PIL import Image

# Load Model

In [10]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# Load two images

In [24]:
img1 = preprocess(Image.open("house-1.jpg")).unsqueeze(0).to(device)
img2 = preprocess(Image.open("house-2.jpeg")).unsqueeze(0).to(device)
# img2 = preprocess(Image.open("house-3.png")).unsqueeze(0).to(device)


# Get embeddings

In [25]:
with torch.no_grad():
    emb1 = model.encode_image(img1)
    emb2 = model.encode_image(img2)

# Normalize & compute cosine similarity


In [26]:
similarity = torch.cosine_similarity(emb1, emb2)
print("Similarity score:", similarity.item())

Similarity score: 0.8495361804962158


Example with DINOv2 (PyTorch)

In [28]:
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from io import BytesIO

In [29]:
# Load DINOv2
model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vits14')
model.eval()


Downloading: "https://github.com/facebookresearch/dinov2/zipball/main" to /root/.cache/torch/hub/main.zip




Downloading: "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_pretrain.pth" to /root/.cache/torch/hub/checkpoints/dinov2_vits14_pretrain.pth


100%|██████████| 84.2M/84.2M [00:00<00:00, 251MB/s]


DinoVisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 384, kernel_size=(14, 14), stride=(14, 14))
    (norm): Identity()
  )
  (blocks): ModuleList(
    (0-11): 12 x NestedTensorBlock(
      (norm1): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (attn): MemEffAttention(
        (qkv): Linear(in_features=384, out_features=1152, bias=True)
        (proj): Linear(in_features=384, out_features=384, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): LayerScale()
      (drop_path1): Identity()
      (norm2): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=384, out_features=1536, bias=True)
        (act): GELU(approximate='none')
        (fc2): Linear(in_features=1536, out_features=384, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
      (ls2): LayerScale()
      (drop_path2): Identity()
    )
  )
  (norm): LayerNorm((384,), eps=1e-06, elementwise_affi

In [30]:
# Preprocessing
transform = T.Compose([
    T.Resize(256),
    T.CenterCrop(224),
    T.ToTensor(),
    T.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
])

In [31]:
def get_embedding(img_path):
    img = Image.open(img_path).convert("RGB")
    img = transform(img).unsqueeze(0)
    with torch.no_grad():
        emb = model(img)
    return emb

In [41]:
emb1 = get_embedding("house-1.jpg")
emb2 = get_embedding("house-2.jpeg")

In [42]:
similarity = torch.cosine_similarity(emb1, emb2).item()
print("Similarity score:", similarity)

Similarity score: 0.7294092178344727
