<a href="https://colab.research.google.com/github/ramyahramzy/Colab/blob/main/Image_and_description_similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# If you're in Colab, run this cell first

In [None]:

!pip -q install git+https://github.com/openai/CLIP.git ftfy regex tqdm

import torch
import clip
from PIL import Image
import requests
from io import BytesIO
import torch.nn.functional as F

  Preparing metadata (setup.py) ... done
     44.8/44.8 kB 2.1 MB/s eta 0:00:00
     363.4/363.4 MB 3.7 MB/s eta 0:00:00
     13.8/13.8 MB 52.0 MB/s eta 0:00:00
     24.6/24.6 MB 11.8 MB/s eta 0:00:00
     883.7/883.7 kB 23.3 MB/s eta 0:00:00
     664.8/664.8 MB 2.2 MB/s eta 0:00:00
     211.5/211.5 MB 6.2 MB/s eta 0:00:00
     56.3/56.3 MB 12.7 MB/s eta 0:00:00



# -----------------------------
# 1) Load CLIP and pick a device
# -----------------------------

In [None]:
# Run on the GPU when one is visible to PyTorch; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# clip.load returns the model together with the image preprocessing callable
# that is applied to PIL images before they are encoded.
model, preprocess = clip.load("ViT-B/32", device=device)

100%|███████████████████████████████████████| 338M/338M [00:03<00:00, 95.8MiB/s]


# -----------------------------
# 2) Choose an image
#    (Use any URL you like)
# -----------------------------

In [None]:
# Download the image that will be scored against the captions.
# - timeout: without it, requests waits forever on a stalled connection and the
#   cell never finishes.
# - raise_for_status(): surfaces HTTP errors (404, 403, ...) right here instead
#   of handing an error page to PIL, which would fail with an unrelated
#   "cannot identify image file" message.
image_url = "https://images.unsplash.com/photo-1518791841217-8f162f1e1131"  # a cat photo
response = requests.get(image_url, timeout=30)
response.raise_for_status()

# Decode from memory and force 3-channel RGB regardless of the source mode
# (e.g. RGBA, grayscale, palette).
img = Image.open(BytesIO(response.content)).convert("RGB")


# -----------------------------
# 3) Create captions
# -----------------------------

In [None]:

# Candidate text descriptions to score against the image. Two of them describe
# a cat/kitten and two describe unrelated scenes, so the ranking printed at the
# end makes it easy to see whether matching captions score higher.
captions = [
    "a close-up photo of a cat",
    "a red sports car on a race track",
    "a small kitten sitting on a couch",
    "a bowl of fresh salad with tomatoes",
]

# -----------------------------
# 4) Preprocess & Encode
# -----------------------------

In [None]:

# Inference only: disable autograd so no gradient state is tracked.
with torch.no_grad():
    # Prepare both model inputs first.
    # unsqueeze(0) adds the leading batch dimension for the single image.
    image_input = preprocess(img).unsqueeze(0).to(device)
    text_tokens = clip.tokenize(captions).to(device)

    # Encode each modality, then L2-normalize along the feature axis so that
    # a plain dot product between an image row and a text row is their
    # cosine similarity.
    image_features = F.normalize(model.encode_image(image_input), dim=-1)
    text_features = F.normalize(model.encode_text(text_tokens), dim=-1)

In [None]:
# Display the normalized caption embeddings (one row per entry in `captions`).
text_features

tensor([[ 0.0191,  0.0050, -0.0293,  ..., -0.0389, -0.0538, -0.0029],
        [ 0.0028, -0.0289, -0.0119,  ...,  0.0458, -0.0463,  0.0140],
        [ 0.0042, -0.0402, -0.0526,  ..., -0.0016, -0.0627,  0.0095],
        [-0.0659, -0.0025,  0.0366,  ...,  0.0042, -0.0410, -0.0184]])

In [None]:
# -----------------------------
# 5) Cosine similarity
# -----------------------------
# Both feature tensors were L2-normalized when they were encoded, so the
# matrix product of the image row with the transposed caption rows is already
# the cosine similarity. squeeze(0) drops the single-image batch axis, leaving
# cosine_sim[i] = similarity between the image and captions[i] as a Python float.
cosine_sim = torch.matmul(image_features, text_features.T).squeeze(0).tolist()

In [None]:
# -----------------------------
# 6) Show results
# -----------------------------
print("Image URL:", image_url)
print("\nSimilarities (higher is more similar; max = 1.0):\n")

# Pair every caption with its score and list the best match first.
ranked = sorted(zip(captions, cosine_sim), key=lambda pair: pair[1], reverse=True)
for cap, sim in ranked:
    print(f"{sim:.4f}  |  {cap}")


Image URL: https://images.unsplash.com/photo-1518791841217-8f162f1e1131

Similarities (higher is more similar; max = 1.0):

0.2694  |  a small kitten sitting on a couch
0.2672  |  a close-up photo of a cat
0.1727  |  a bowl of fresh salad with tomatoes
0.1107  |  a red sports car on a race track
