## **Installation of Libraries & Loading CLIP Model**

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.22.1-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 8.4 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 48.8 MB/s 
Collecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 44.9 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.22.1


In [None]:
#Import all the necessary libraries
import torch
import requests
import numpy as np
import pandas as pd
from io import BytesIO
from PIL import Image as PILIMAGE
from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer

#Selecting device based on availability of GPUs
device = "cuda" if torch.cuda.is_available() else "cpu"

#Defining model, processor and tokenizer
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

Downloading:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/605M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/316 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/568 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/862k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/525k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/389 [00:00<?, ?B/s]

ftfy or spacy is not installed using BERT BasicTokenizer instead of ftfy.


## **Data Processing of the MSCOCO Dataset**

In [None]:
###### Extracting all "TEST" captions and image name from the COCO dataset ###################

# Import the module
import json
from PIL import Image

images_test = []
# Opening JSON file
with open('/content/dataset_coco.json') as json_file:
  # Convert JSON string to dictionary
  data_dict = json.load(json_file)
  #print(type(data_dict))
  images = data_dict['images']
  #print(images[0])
  for i in range(0,len(images)):
    if images[i]['filepath'] =='val2014' and images[i]['split']=='test':
      images_test.append(images[i])

print(len(images_test))

## Smaller Dictionary with just filename and sentences
test_images = {}

for image in images_test:
  test_images[image['filename']]= image['sentences']

## Smaller Dictionary with just image_name as key and sentence_Ids as value
test_images_sent_ids = {}

for image in images_test:
  test_images_sent_ids[image['filename']]= image['sentids']

## Smaller Dictionary with just sentence Id as key and Image name as value
test_sent_image_ids = {}
for item in test_images_sent_ids.items():
  caption_sentences_ids = item[1]
  for cid in caption_sentences_ids:
    test_sent_image_ids[cid] = item[0]

5000


In [None]:
test_image_names = list(test_images.keys())

In [None]:
### Making a list of all captions ####

captions = []
sent_ids = []

for item in test_images.items():
  sent_list = item[1]
  for sent_dict_element in sent_list:
    captions.append(sent_dict_element['raw'])
    sent_ids.append(sent_dict_element['sentid'])


### **Encoding All the Captions using CLIP Model**

In [None]:
### Making a list of all captions ####

captions = []
sent_ids = []

for item in test_images.items():
  sent_list = item[1]
  for sent_dict_element in sent_list:
    captions.append(sent_dict_element['raw'])
    sent_ids.append(sent_dict_element['sentid'])

In [None]:
with torch.no_grad():
        # Encode and normalize the description using CLIP
  chunk_size = 300
  array_list = []
  for i in range(0,len(captions),chunk_size):

    inputs = processor(captions[i:i+chunk_size], images=None, return_tensors="pt", padding=True)
    text_encoded =  model.get_text_features(**inputs).detach().numpy()
    array_list.append(text_encoded)



In [None]:
text_embeddings = np.vstack(array_list)

In [None]:
np.savetxt('text_features.csv', text_embeddings, delimiter=",")

### **Encoding all the Images using the CLIP Model**

In [None]:
## Encoding Images
test_image_names = list(test_images.keys())
url_base = "http://images.cocodataset.org/val2014/"
array_image_list = []
chunk = 200
for i in range(0,len(test_image_names),chunk):
  image_batch = []
  batch_image_name = test_image_names[i:i+chunk]
  for image_name in batch_image_name:
    url = url_base + image_name
    image = Image.open(requests.get(url, stream=True).raw)
    image_batch.append(image)

  inputs_image = processor(text=None, images=image_batch, return_tensors="pt", padding=True)
  image_encode = model.get_image_features(**inputs_image).detach().numpy()
  array_image_list.append(image_encode)
  #print(i)


KeyboardInterrupt: ignored

In [None]:
image_embeddings = np.vstack(array_image_list)

In [None]:
np.savetxt('image_features.csv', image_embeddings, delimiter=",")

## **Text to Image Retrieval**

In [None]:
## Loading text features from CSV
from numpy import genfromtxt
text_embeddings = genfromtxt('text_features.csv', delimiter=',')
image_embeddings = genfromtxt('image_features.csv', delimiter=',')

In [None]:
### Image to Text Retrieval

def T2I(image_embeddings,text_embeddings,test_sent_image_ids,test_image_names):
  # Normalizing
  norm = np.linalg.norm(image_embeddings)
  norm_image_emb = image_embeddings / norm

  norm_text = np.linalg.norm(text_embeddings)
  norm_text_emb = text_embeddings / norm_text

  ## To check Recall@1, Recall@5 and Recall@10
  Recall_1 = []
  Recall_5 = []
  Recall_10 = []

  test_sentence_ids = list(test_sent_image_ids.keys())
  print(len(test_sentence_ids))
  ## Retrieve Caption for every image
  for i in range(0,len(test_sentence_ids)):

    text_vector = norm_text_emb[i,:].reshape(1,512)
    # Finding Cosine similarity
    similarities = list((text_vector @ norm_image_emb.T).squeeze(0))
    if i % 1000 == 0:
      print(i)
   ## image name
    sentence_test_id = test_sentence_ids[i]
    #print(sentence_test_id)
    #print(image_test_name)
    ## Getting retrived top 10 sentences Ids
    ret_idxs = []
    for i in range(10):
      idx = sorted(zip(similarities, range(image_embeddings.shape[0])), key=lambda x: x[0], reverse=True)[i][1]
      ret_idxs.append(test_image_names[idx])
    #print(ret_idxs)
    found_match = False

    if ret_idxs[0] == test_sent_image_ids[sentence_test_id]:
      Recall_1.append(1)
      Recall_5.append(1)
      Recall_10.append(1)
      found_match = True
      #print('yo')
    else:
      for i in range(0,5):
        if found_match != True:
          if ret_idxs[i] == test_sent_image_ids[sentence_test_id]:
            Recall_1.append(0)
            Recall_5.append(1)
            Recall_10.append(1)
            found_match = True

      if found_match == False:
        for i in range(5,10):
          if found_match != True:
            if ret_idxs[i] == test_sent_image_ids[sentence_test_id]:
              Recall_1.append(0)
              Recall_5.append(0)
              Recall_10.append(1)
              found_match = True
      if found_match == False:
        Recall_1.append(0)
        Recall_5.append(0)
        Recall_10.append(0)

  #print(Recall_1)
  #print(Recall_10)
  #print(Recall_5)
  return (Recall_1,Recall_5,Recall_10)

In [None]:
T2I_R1, T2I_R5, T2I_R10 = T2I(image_embeddings,text_embeddings,test_sent_image_ids,test_image_names)

23000
24000
25000


In [None]:

# Saving the values for each retrieval on different recall in a csv
# This helped me in error analysis to classify which caption got image in Recall 1 and which in Recall 10
dict = {'name': list(test_sent_image_ids.keys()), 'Recall@1': T2I_R1 , 'Recall@5': T2I_R5, 'Recall@10': T2I_R10}

df = pd.DataFrame(dict)

df.to_csv('T2I.csv')

In [None]:
#Output of the Recall Values
T2I_Recall_1 = (sum(T2I_R1)/len(T2I_R1))*100
T2I_Recall_5 = (sum(T2I_R5)/len(T2I_R5))*100
T2I_Recall_10 = (sum(T2I_R10)/len(T2I_R10))*100

In [None]:
(T2I_Recall_1, T2I_Recall_5,T2I_Recall_10)

(25.621751299480206, 49.316273490603756, 60.723710515793684)

## **Image to Text Retrieval**

In [None]:
### Image to Text Retrieval

def I2T(image_embeddings,text_embeddings,test_images_sent_ids):
  # Normalizing
  norm = np.linalg.norm(image_embeddings)
  norm_image_emb = image_embeddings / norm

  norm_text = np.linalg.norm(text_embeddings)
  norm_text_emb = text_embeddings / norm_text

  ## To check Recall@1, Recall@5 and Recall@10
  Recall_1 = []
  Recall_5 = []
  Recall_10 = []

  image_names = list(test_images_sent_ids.keys())

  ## Retrieve Caption for every image
  for i in range(0,5000):

    image_vector = norm_image_emb[i,:].reshape(1,512)
    # Finding Cosine similarity
    similarities = list((image_vector @ norm_text_emb.T).squeeze(0))
    if i % 100 == 0:
      print(i)
   ## image name
    image_test_name = image_names[i]
    #print(image_test_name)
    ## Getting retrived top 10 sentences Ids
    ret_idxs = []
    for i in range(10):
      idx = sorted(zip(similarities, range(text_embeddings.shape[0])), key=lambda x: x[0], reverse=True)[i][1]
      ret_idxs.append(sent_ids[idx])

    found_match = False

    if ret_idxs[0] in test_images_sent_ids[image_test_name]:
      Recall_1.append(1)
      Recall_5.append(1)
      Recall_10.append(1)
      found_match = True
      #print('yo')
    else:
      for i in range(0,5):
        if found_match != True:
          if ret_idxs[i] in test_images_sent_ids[image_test_name]:
            Recall_1.append(0)
            Recall_5.append(1)
            Recall_10.append(1)
            found_match = True

      if found_match == False:
        for i in range(5,10):
          if found_match != True:
            if ret_idxs[i] in test_images_sent_ids[image_test_name]:
              Recall_1.append(0)
              Recall_5.append(0)
              Recall_10.append(1)
              found_match = True
      if found_match == False:
        Recall_1.append(0)
        Recall_5.append(0)
        Recall_10.append(0)

  #print(Recall_1)
  #print(Recall_10)
  #print(Recall_5)
  return (Recall_1,Recall_5,Recall_10)

In [None]:
R1,R5,R10 =I2T(image_embeddings,text_embeddings,test_images_sent_ids)

4400
4500
4600
4700
4800
4900


In [None]:
# dictionary of lists
dict = {'name': list(test_images_sent_ids.keys()), 'Recall@1': R1 , 'Recall@5': R5, 'Recall@10': R10}

df = pd.DataFrame(dict)

df.to_csv('I2T.csv')

In [None]:
I2T_Recall_1 = (sum(R1)/len(R1))*100
I2T_Recall_5 = (sum(R5)/len(R5))*100
I2T_Recall_10 = (sum(R10)/len(R10))*100

In [None]:
print((I2T_Recall_1,I2T_Recall_5,I2T_Recall_10))

(8.34, 19.88, 32.22)
