In [111]:
!pip install open-clip-torch torch torchvision pillow



In [112]:
# Background notes on CLIP (kept as comments: a bare string literal at the end
# of a cell is echoed back as the cell's output).
#
# CLIP mainly consists of two encoders:
#   - An image encoder (based on a Vision Transformer or CNN)
#   - A text encoder (based on a Transformer)
# Each of these encoders is responsible for converting images and text into
# vector embeddings, respectively.
# CLIP is not an encoder-decoder model; instead, it can be seen as a Siamese
# encoder model that embeds both images and text into the same vector space.

In [113]:
import open_clip
import torch
from PIL import Image
import requests
from torchvision import transforms

In [114]:
# 1. Model loading
# The architecture name and pretrained-weights tag are stated once so the
# tokenizer is guaranteed to be requested for the same architecture as the model.
MODEL_NAME = "ViT-B-32"
PRETRAINED_TAG = "laion2b_s34b_b79k"

# create_model_and_transforms returns (model, train transform, eval transform);
# only the eval transform is needed for inference, so the train one is discarded.
model, _, preprocess = open_clip.create_model_and_transforms(MODEL_NAME, pretrained=PRETRAINED_TAG)

# open_clip hands the model back in training mode; switch to eval mode so any
# train-only layers (dropout, batch norm, stochastic depth) behave deterministically.
model.eval()

tokenizer = open_clip.get_tokenizer(MODEL_NAME)

In [115]:
# Device selection (priority: MPS > CUDA > CPU)

In [116]:
# Choose the compute backend in priority order: Apple MPS first, then CUDA,
# and fall back to the CPU when neither accelerator is available.
device = (
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

In [117]:
# Move the model's parameters and buffers to the selected device.
# The trailing semicolon suppresses the very long module repr that the bare
# expression would otherwise print as this cell's output.
model.to(device);

In [118]:
# Report which backend was selected so the run is easy to audit.
print(f"Using device: {device}")

Using device: mps


In [119]:
# Preprocessing and embedding

In [120]:
from PIL import Image
import requests

In [121]:
# Path to the input image (relative, so it is resolved against the current working directory).
image_path = "11_data/fubao.jpg"

In [122]:
# Open the image and convert it to RGB to make it compatible with the CLIP model.
# The context manager closes the underlying file as soon as the converted copy
# exists, even if the conversion raises.
with Image.open(image_path) as source_image:
    image = source_image.convert("RGB")

In [123]:
# Run the model's preprocessing transform on the image, then add a leading
# batch dimension and move the result to the selected device.
image_tensor = preprocess(image)
image_input = image_tensor.unsqueeze(0).to(device)

In [124]:
# Tokenize the text prompts with the CLIP tokenizer => text_tokens

In [125]:
# Candidate captions that will be scored against the image.
texts = [
    "A cute baby panda",
    "A sleeping cat",
    "A playful puppy",
    "A panda eating bamboo"
]

In [126]:
# Tokenize all captions in a single call and move the result to the selected device.
text_tokens = tokenizer(texts).to(device)

In [127]:
# Embedding

In [128]:
# Extract embeddings from the Siamese encoder model (gradient computation OFF):
# the image goes through the image encoder and the captions through the text encoder.
with torch.no_grad():
    image_features = model.encode_image(image_input)
    text_features = model.encode_text(text_tokens)

In [129]:
# L2-normalize every embedding:
# - compute the length (norm) of each vector along the last dimension;
# - keepdim=True retains that dimension with size 1, so the norms broadcast
#   against the original tensor;
# - divide the original tensor by these length values, vector by vector.
image_features = image_features.div(image_features.norm(dim=-1, keepdim=True))
text_features = text_features.div(text_features.norm(dim=-1, keepdim=True))

In [130]:
# Check the shape of the image embedding tensor.
print("Image features shape:", image_features.shape)

Image features shape: torch.Size([1, 512])


In [131]:
# Check the shape of the text embedding tensor (one row per caption in `texts`).
print("Text features shape:", text_features.shape)  

Text features shape: torch.Size([4, 512])


In [132]:
# Peek at the first five components of the image embedding.
print("Image embedding sample:", image_features[0, :5])

Image embedding sample: tensor([ 0.0027, -0.0375, -0.1141, -0.0503, -0.0033], device='mps:0')


In [133]:
# Peek at the first five components of the first caption's embedding.
print("Text embedding sample (1st):", text_features[0, :5])

Text embedding sample (1st): tensor([-0.0042,  0.0061,  0.0118, -0.0102,  0.0170], device='mps:0')


In [134]:
# Dot-product similarity between image and text embeddings, scaled and passed through softmax

In [135]:
# Image-text dot products, scaled by 100 and turned into a probability
# distribution over the captions with a softmax along the last dimension.
similarity = torch.softmax((100.0 * image_features) @ text_features.T, dim=-1)

In [136]:
# Print each caption with its softmax score for the single image (row 0), numbered from 1.
for rank, (caption, prob) in enumerate(zip(texts, similarity[0]), start=1):
    print(f'{rank}. "{caption}" → similarity score: {prob.item():.4f}')

1. "A cute baby panda" → similarity score: 0.9997
2. "A sleeping cat" → similarity score: 0.0000
3. "A playful puppy" → similarity score: 0.0000
4. "A panda eating bamboo" → similarity score: 0.0003
