In [12]:
from transformers import CLIPTokenizer, CLIPModel
import torch

# Choose the compute device up front: GPU when CUDA is available, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer and weights are loaded from the same pretrained CLIP checkpoint.
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)

# Put the model in evaluation mode. This call is the cell's last expression,
# so the notebook displays the returned module's repr as the cell output.
model.eval()

CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPSdpaAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e

In [2]:
def get_clip_text_embedding(text: str) -> torch.Tensor:
    """Embed a single string with the CLIP text encoder.

    Args:
        text: The string to embed. Input that tokenizes to more tokens than
            the tokenizer's configured maximum length is truncated rather
            than causing a failure inside the model.

    Returns:
        The text feature vector for ``text`` with the batch axis removed,
        located on ``device``. For the checkpoint loaded in this notebook
        the shape is (512,). No normalization is applied here.
    """
    # Truncate over-long prompts: the text model's position-embedding table
    # has a fixed number of rows (77 for this checkpoint), so a longer token
    # sequence would make the forward pass raise.
    inputs = tokenizer(text, return_tensors="pt", truncation=True).to(device)

    # Inference only, so gradient tracking is disabled.
    with torch.no_grad():
        text_features = model.get_text_features(**inputs)

    # A single string yields a batch of one; return that row.
    return text_features[0]

def get_clip_distance(text1: str, text2: str) -> float:
    """Return the squared Euclidean distance between two texts' embeddings.

    Each string is embedded with ``get_clip_text_embedding``; the result is
    the sum of the squared element-wise differences, converted to a Python
    float. This function applies neither normalization nor a square root.

    Args:
        text1: First string to compare.
        text2: Second string to compare.

    Returns:
        The squared L2 distance between the two embedding vectors.
    """
    # Embed in argument order: text1 first, then text2.
    first, second = (get_clip_text_embedding(t) for t in (text1, text2))

    delta = first - second
    return delta.pow(2).sum().item()


In [3]:
# Sample prompts: a short phrase and a longer sentence that starts with it.
text1 = "Lorem ipsum"
text2 = "Lorem ipsum dolor sit amet, consectetur adipiscing elit..."

# Embed the short prompt and report the shape of the resulting vector.
embedding = get_clip_text_embedding(text1)
print("CLIP text embedding shape:", embedding.shape)

# Print every component on its own line; tolist() converts the whole vector
# to Python floats in one step instead of calling .item() per element.
for i, value in enumerate(embedding.tolist()):
    print(f"Embedding {i}: {value}")


CLIP text embedding shape: torch.Size([512])
Embedding 0: -0.05216789245605469
Embedding 1: 0.36954179406166077
Embedding 2: 0.025117069482803345
Embedding 3: 0.25797364115715027
Embedding 4: 0.1369258612394333
Embedding 5: 0.06664882600307465
Embedding 6: -0.10717491805553436
Embedding 7: -1.0406571626663208
Embedding 8: -0.013779710978269577
Embedding 9: -0.2968878149986267
Embedding 10: 0.2702043950557709
Embedding 11: 0.04655654728412628
Embedding 12: 0.018182925879955292
Embedding 13: -0.5079228281974792
Embedding 14: 0.009670615196228027
Embedding 15: -0.1947697252035141
Embedding 16: 0.41427332162857056
Embedding 17: 0.0358353853225708
Embedding 18: 0.2160024642944336
Embedding 19: 0.21833528578281403
Embedding 20: 0.2473505586385727
Embedding 21: -0.4153980016708374
Embedding 22: -0.07173170149326324
Embedding 23: 0.10334262996912003
Embedding 24: -0.4194077253341675
Embedding 25: 0.4170600175857544
Embedding 26: 0.09457990527153015
Embedding 27: -0.06524960696697235
Embedding 

In [11]:
# Prompt pairs to compare, listed in the order their distances are printed.
for left, right in (
    ("dog", "dog"),
    ("dog", "dogs"),
    ("cat", "dog"),
    ("human", "dog"),
    ("black dog", "dog"),
    ("black dog", "black human"),
    ("dog on", "on dog"),
):
    print("CLIP distance:", get_clip_distance(left, right))


CLIP distance: 0.0
CLIP distance: 18.57891845703125
CLIP distance: 20.26671600341797
CLIP distance: 32.514976501464844
CLIP distance: 32.71125793457031
CLIP distance: 26.89969253540039
CLIP distance: 8.504020690917969
