## Question 1: CLIP

### Part 1

In [1]:
!pip install torch torchvision pillow
!pip install ftfy regex tqdm
!pip install git+https://github.com/openai/CLIP.git

Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch)
  Downloading nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-nvjitlink-cu12==12.4.127 (from torch)
  Downloading nvidia_nvjitlink_cu12-12.4.127-py3-n

### Part 2

In [2]:
import clip
import torch
from PIL import Image

# Pick the compute device: use the GPU when PyTorch reports CUDA is available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the "ViT-B/32" CLIP checkpoint onto the chosen device. The call returns
# the model together with the `preprocess` callable applied to images below.
model, preprocess = clip.load("ViT-B/32", device=device)

100%|████████████████████████████████████████| 338M/338M [00:03<00:00, 101MiB/s]


### Part 3

In [4]:
# Score one image against a list of candidate captions with the CLIP model
# loaded earlier (`model`, `preprocess`, `device` come from the previous cell).

# Open the sample image, preprocess it, add a leading batch dimension and
# move it to the selected device.
pil_image = Image.open("/kaggle/input/cv-hw3-q1/sample_image.jpg")
image = preprocess(pil_image).unsqueeze(0).to(device)

# Candidate captions to compare against the image.
texts = [
    "A man holding a large black dog",
    "A dog being held by its owner",
    "A bookshelf filled with collection of encyclopedias, including World Book (A-Z)",
    "A man posing with his dog at home",
    "A living room with wooden floors, bookshelves and a man holding a dog",
    "A dog with floppy ears being carried",
    "A man in formal attire holding a large dog",
    "A room with wooden floors, bookshelves, and a closed door",
    "A man in a white shirt and gray pants holding a large dog",
    "A man with neatly groomed beard and short dark hair standing in a room",
]
text_tokens = clip.tokenize(texts).to(device)

# Encode both modalities without tracking gradients; cast to float32.
with torch.no_grad():
    image_features = model.encode_image(image).float()
    text_features = model.encode_text(text_tokens).float()

# Scale every embedding to unit length so that a dot product is a cosine
# similarity.
image_norm = image_features.norm(dim=-1, keepdim=True)
text_norm = text_features.norm(dim=-1, keepdim=True)
image_features = image_features / image_norm
text_features = text_features / text_norm

# (texts x dim) @ (dim x images): one row per caption, one column per image.
text_matrix = text_features.cpu().numpy()
image_matrix = image_features.cpu().numpy()
similarity = text_matrix @ image_matrix.T

print("Similarity Scores (cosine similarity):")
for idx, caption in enumerate(texts):
    # Each row holds a single value because only one image was encoded.
    print(f"{caption:80s} --> {similarity[idx].item():.4f}")

Similarity Scores (cosine similarity):
A man holding a large black dog                                                  --> 0.3005
A dog being held by its owner                                                    --> 0.2868
A bookshelf filled with collection of encyclopedias, including World Book (A-Z)  --> 0.1656
A man posing with his dog at home                                                --> 0.3351
A living room with wooden floors, bookshelves and a man holding a dog            --> 0.2677
A dog with floppy ears being carried                                             --> 0.2878
A man in formal attire holding a large dog                                       --> 0.3014
A room with wooden floors, bookshelves, and a closed door                        --> 0.1871
A man in a white shirt and gray pants holding a large dog                        --> 0.3141
A man with neatly groomed beard and short dark hair standing in a room           --> 0.2329


### Part 4

In [5]:
!pip install torch torchvision pillow
!pip install open_clip_torch

Collecting open_clip_torch
  Downloading open_clip_torch-2.32.0-py3-none-any.whl.metadata (31 kB)
Downloading open_clip_torch-2.32.0-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: open_clip_torch
Successfully installed open_clip_torch-2.32.0


### Part 5

In [6]:
from PIL import Image
import torch
import torch.nn.functional as F
from open_clip import create_model_from_pretrained, get_tokenizer

# Hub identifier shared by the pretrained weights and the matching tokenizer,
# kept in one place so the two cannot drift apart.
CLIPS_MODEL_ID = 'hf-hub:UCSC-VLAA/ViT-L-14-CLIPS-Recap-DataComp-1B'

# Load the pretrained model and its image preprocessing callable.
# NOTE: this rebinds `model` and `preprocess`, which previously referred to
# the OpenAI CLIP model loaded earlier in the notebook.
model, preprocess = create_model_from_pretrained(CLIPS_MODEL_ID)

# Put the model in inference mode explicitly before computing embeddings.
# NOTE(review): open_clip's README says created models are in train mode by
# default; confirm whether this loader already switches to eval. The call is
# harmless either way.
model.eval()

tokenizer = get_tokenizer(CLIPS_MODEL_ID)

open_clip_pytorch_model.bin:   0%|          | 0.00/1.66G [00:00<?, ?B/s]

open_clip_config.json:   0%|          | 0.00/943 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

### Part 6

In [7]:
# Score one image against a list of candidate captions with the open_clip
# model loaded in the previous cell (`model`, `preprocess`, `tokenizer`).

# Open the sample image, preprocess it and add a leading batch dimension.
# The PIL image gets its own name so `image` is only ever the model input.
pil_image = Image.open("/kaggle/input/cv-hw3-q1/sample_image.jpg")
image = preprocess(pil_image).unsqueeze(0)

# Candidate captions to compare against the image.
texts = [
    "A man holding a large black dog",
    "A dog being held by its owner",
    "A bookshelf filled with collection of encyclopedias, including World Book (A-Z)",
    "A man posing with his dog at home",
    "A living room with wooden floors, bookshelves and a man holding a dog",
    "A dog with floppy ears being carried",
    "A man in formal attire holding a large dog",
    "A room with wooden floors, bookshelves, and a closed door",
    "A man in a white shirt and gray pants holding a large dog",
    "A man with neatly groomed beard and short dark hair standing in a room",
]
# Tokenize using the context length the model itself reports.
text = tokenizer(texts, context_length=model.context_length)

# `torch.cuda.amp.autocast()` is deprecated and emits a FutureWarning;
# `torch.amp.autocast("cuda")` is the replacement the warning recommends.
# NOTE(review): neither the model nor the inputs are moved to a GPU in this
# cell, so CUDA autocast presumably has no effect here - confirm whether the
# model was meant to be moved to a device.
with torch.no_grad(), torch.amp.autocast("cuda"):
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)

    # Unit-normalise so the matrix product below is a cosine similarity.
    image_features = F.normalize(image_features, dim=-1)
    text_features = F.normalize(text_features, dim=-1)

    # (images x dim) @ (dim x texts): one row per image, one column per caption.
    similarity = image_features @ text_features.T

print("Similarity Scores (cosine similarity):")
# Only one image was encoded, so row 0 holds the score for every caption.
for txt, score in zip(texts, similarity[0]):
    print(f"{txt:80s} --> {score.item():.4f}")

  with torch.no_grad(), torch.cuda.amp.autocast():


Similarity Scores (cosine similarity):
A man holding a large black dog                                                  --> 0.1773
A dog being held by its owner                                                    --> 0.1561
A bookshelf filled with collection of encyclopedias, including World Book (A-Z)  --> 0.0137
A man posing with his dog at home                                                --> 0.1913
A living room with wooden floors, bookshelves and a man holding a dog            --> 0.1613
A dog with floppy ears being carried                                             --> 0.1300
A man in formal attire holding a large dog                                       --> 0.1924
A room with wooden floors, bookshelves, and a closed door                        --> 0.0366
A man in a white shirt and gray pants holding a large dog                        --> 0.2103
A man with neatly groomed beard and short dark hair standing in a room           --> 0.1018
