In [2]:
from transformers import BertTokenizer, BertModel
import torch
import torch.nn.functional as F

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

def get_bert_text_embedding(text: str, norm: bool = False) -> torch.Tensor:
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device) #, max_length=64

    with torch.no_grad():
        outputs = model(**inputs)
        embedding = outputs.last_hidden_state[:, 0, :]  # shape: (1, 768)

    if norm:
        embedding = F.normalize(embedding, p=2, dim=-1)

    return embedding.squeeze(0)  # shape: (768,)


In [3]:
embedding = get_bert_text_embedding("A man riding a horse.")  # shape: (768,)
print(embedding.min().item(), embedding.max().item())
normalized_embedding = F.normalize(embedding, p=2, dim=-1)
print(normalized_embedding.min().item(), normalized_embedding.max().item())

-7.142841339111328 3.9890592098236084
-0.46136611700057983 0.2576589286327362


In [12]:
from transformers import CLIPTokenizer, CLIPModel
import torch

# Load CLIP tokenizer and model
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPSdpaAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e

In [7]:
def get_clip_text_embedding(text: str, norm: bool = False) -> torch.Tensor:
    inputs = tokenizer(text, return_tensors="pt").to(device)

    with torch.no_grad():
        text_features = model.get_text_features(**inputs)

    if norm:
        text_features = F.normalize(text_features, p=2, dim=-1)
    return text_features[0]  # Shape: (512,)

def get_text_embedding(text: str, emb_type: str = "bert", norm: bool = False) -> torch.Tensor:
    if emb_type == "bert":
        return get_bert_text_embedding(text, norm=norm)
    elif emb_type == "clip":
        return get_clip_text_embedding(text, norm=norm)
    else:
        raise ValueError("Invalid type. Use 'bert' or 'clip'.")

def get_distance(text1: str, text2: str, emb_type: str = "bert", norm: bool = False) -> float:
    emb1 = get_text_embedding(text1, emb_type=emb_type, norm=norm)
    emb2 = get_text_embedding(text2, emb_type=emb_type, norm=norm)

    diff = emb1 - emb2
    return torch.sum(diff ** 2).item()


In [5]:
text1 = "Lorem ipsum"
text2 = "Lorem ipsum dolor sit amet, consectetur adipiscing elit..."

# embedding = get_clip_text_embedding(text1)
embedding = get_bert_text_embedding(text1)
print("CLIP text embedding shape:", embedding.shape)

for i in range(embedding.shape[0]):
    print(f"Embedding {i}: {embedding[i].item()}")


CLIP text embedding shape: torch.Size([768])
Embedding 0: -0.4526381194591522
Embedding 1: 0.05799601972103119
Embedding 2: -0.0746992826461792
Embedding 3: -0.1379043012857437
Embedding 4: -0.07813472300767899
Embedding 5: -0.1462736874818802
Embedding 6: 0.08453729748725891
Embedding 7: 0.2688385844230652
Embedding 8: -0.08219554275274277
Embedding 9: -0.06635184586048126
Embedding 10: -0.01631523296236992
Embedding 11: 0.09536563605070114
Embedding 12: 0.1274995654821396
Embedding 13: 0.4383492171764374
Embedding 14: -0.11381445825099945
Embedding 15: -0.038340672850608826
Embedding 16: -0.22322000563144684
Embedding 17: 0.46132972836494446
Embedding 18: 0.22511440515518188
Embedding 19: -0.041060805320739746
Embedding 20: 0.1893252730369568
Embedding 21: -0.2919433116912842
Embedding 22: -0.24283313751220703
Embedding 23: -0.36146506667137146
Embedding 24: -0.016134928911924362
Embedding 25: 0.0747588723897934
Embedding 26: -0.10291728377342224
Embedding 27: -0.2002193033695221
Emb

In [15]:

print("CLIP distance:", get_distance("dog", "dog", norm=True))
print("CLIP distance:", get_distance("dog", "dog"))
print("CLIP distance:", get_distance("dog", "dog "))
print("CLIP distance:", get_distance("dog", "dog-"))
print("CLIP distance:", get_distance("dog", "dogs"))
print("CLIP distance:", get_distance("dog", "dogs", norm=True))
print("CLIP distance:", get_distance("cat", "dog"))
print("CLIP distance:", get_distance("human", "dog"))
print("CLIP distance:", get_distance("black dog", "dog"))
print("CLIP distance:", get_distance("black dog", "black human"))
print("CLIP distance:", get_distance("dog on", "on dog"))
print("CLIP distance:", get_distance("dog", ""))
print("CLIP distance:", get_distance("dog", "In the image (476.4KB jpg by CREATOR_3D), a white Dacia Logan car is depicted in a frontal view, showcasing its crossover attributes with an elevated ground clearance, fog lights, and a prominent front grille displaying a logo. The headlights are large and round, and the bumper is equipped with fog lights and a central air intake, contributing to the car's sleek and modern design."))
print()
print("CLIP distance:", get_distance("Dacia Logan MCV Fiskal 2016 ( 476.4KB jpg by CREATOR_3D )", "In the image (476.4KB jpg by CREATOR_3D), a white Dacia Logan car is depicted in a frontal view, showcasing its crossover attributes with an elevated ground clearance, fog lights, and a prominent front grille displaying a logo. The headlights are large and round, and the bumper is equipped with fog lights and a central air intake, contributing to the car's sleek and modern design."))
print("CLIP distance:", get_distance("This image showcases a white Dacia Logan car, positioned in a frontal view. The car is a crossover type, evident from its elevated ground clearance and the presence of fog lights. The front grille is prominent, featuring a logo in the center. The headlights are large and round, and the bumper is equipped with fog lights and a central air intake. The car's overall design is sleek and modern.", "In the image (476.4KB jpg by CREATOR_3D), a white Dacia Logan car is depicted in a frontal view, showcasing its crossover attributes with an elevated ground clearance, fog lights, and a prominent front grille displaying a logo. The headlights are large and round, and the bumper is equipped with fog lights and a central air intake, contributing to the car's sleek and modern design."))
print("CLIP distance:", get_distance("Dacia Logan MCV Fiskal 2016 ( 476.4KB jpg by CREATOR_3D )", "This image showcases a white Dacia Logan car, positioned in a frontal view. The car is a crossover type, evident from its elevated ground clearance and the presence of fog lights. The front grille is prominent, featuring a logo in the center. The headlights are large and round, and the bumper is equipped with fog lights and a central air intake. The car's overall design is sleek and modern."))


CLIP distance: 0.0
CLIP distance: 0.0
CLIP distance: 0.0
CLIP distance: 62.013084411621094
CLIP distance: 18.201637268066406
CLIP distance: 0.08603137731552124
CLIP distance: 13.775386810302734
CLIP distance: 34.64408874511719
CLIP distance: 13.760682106018066
CLIP distance: 37.55180358886719
CLIP distance: 16.45562744140625
CLIP distance: 282.0098571777344
CLIP distance: 117.21015930175781

CLIP distance: 102.71747589111328
CLIP distance: 58.14107894897461
CLIP distance: 147.71607971191406


In [None]:
import os
from tqdm.notebook import tqdm
import numpy as np
# Convert all captions to embeddings and save them

caption_dir = r"C:\!project-dataset\captions"
caption_emb_dir = r"C:\!project-dataset\captions_emb"
if not os.path.exists(caption_emb_dir):
    os.makedirs(caption_emb_dir)

for filename in tqdm(os.listdir(caption_dir)):
    if filename.endswith(".txt"):
        filepath = os.path.join(caption_dir, filename)
        with open(filepath, "r", encoding="utf-8") as f:
            caption = f.read().strip()
        caption_emb = get_text_embedding(caption, emb_type="bert", norm=False)
        caption_emb = caption_emb.cpu().numpy()
        caption_emb = caption_emb.astype("float16")
        caption_emb_path = os.path.join(caption_emb_dir, filename.replace(".txt", ".npy"))
        with open(caption_emb_path, "wb") as f:
            np.save(f, caption_emb)

  0%|          | 0/176582 [00:00<?, ?it/s]