In [None]:
# Environment setup.
# The conda line only takes effect on hosts where conda is installed; elsewhere it
# fails with "command not found" and a pre-installed PyTorch has to be present.
! conda install --yes -c pytorch pytorch=1.7.1 torchvision cudatoolkit=11.0
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install ftfy regex tqdm
%pip install git+https://github.com/openai/CLIP.git
%pip install transformers

/bin/bash: line 1: conda: command not found
Collecting ftfy
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: ftfy
Successfully installed ftfy-6.1.1
Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-x9t58sr3
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-x9t58sr3
  Resolved https://github.com/openai/CLIP.git to commit a1d071733d7111c9c014f024669f959182114e33
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369497 sha256=d4c207313136f7af05fd3bca2b542da9ab0e99a196bba17a9f9560f525189d58
  Stored in directory: /tmp/pip-ephem-wheel-cache-d3a905c

# First Task

In [None]:
import os
import clip
import torch
import pandas as pd

# Run CLIP on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# clip.load hands back the model together with the image transform used before encoding.
model, preprocess = clip.load('ViT-B/32', device=device)

# Document corpus searched by best_docu (its `text` and `doc_id` columns are used later on).
csv_path = '/content/document.csv'
df = pd.read_csv(filepath_or_buffer=csv_path)

def best_docu(text_query, batch_size=64):
  """Return the position of the document in `df` that best matches a text query.

  The query and every entry of `df.text` are embedded with the CLIP text
  encoder and compared by cosine similarity.

  Args:
    text_query: Query string.
    batch_size: Number of documents tokenized and encoded per forward pass.

  Returns:
    int: 0-based row position in `df` of the most similar document.

  Raises:
    ValueError: If `df` holds no documents.
  """
  documents = df.text.values.tolist()
  if not documents:
    raise ValueError("df contains no documents to search.")

  # Inference only: no autograd graph is needed for the query or the documents.
  with torch.no_grad():
    # All CLIP models use a context length of 77 tokens, so longer inputs are truncated.
    # Increasing the context length would presumably require retraining the model.
    # The tokens must live on the same device as the model, otherwise encode_text fails on GPU.
    q_tokens = clip.tokenize(text_query, context_length=77, truncate=True).to(device)
    q_feature = model.encode_text(q_tokens)
    # Normalize once (not once per document) so the dot product below is a cosine similarity.
    q_feature = q_feature / q_feature.norm(dim=-1, keepdim=True)

    scores = []
    for start in range(0, len(documents), batch_size):
      batch = documents[start:start + batch_size]
      text_input = clip.tokenize(batch, context_length=77, truncate=True).to(device)

      # Calculate features
      text_features = model.encode_text(text_input)
      text_features = text_features / text_features.norm(dim=-1, keepdim=True)

      # One similarity score per document in the batch.
      scores.append((text_features @ q_feature.T).squeeze(-1))

  index = torch.argmax(torch.cat(scores)).item()

  return index

100%|███████████████████████████████████████| 338M/338M [00:05<00:00, 62.6MiB/s]


# Second Task

In [None]:
from PIL import Image
import os
from torch.utils.data import DataLoader
from tqdm import tqdm
import numpy

# Specify the folder path containing the images
folder_path = '/content/Images'

# Each entry is a (preprocessed image tensor, file name) pair.
imgs = []
# Check if the folder exists
if os.path.exists(folder_path):
    # Sort the listing: os.listdir returns entries in arbitrary order, and the
    # positions in `imgs` are what the image indices used later refer to.
    file_list = sorted(os.listdir(folder_path))

    # Loop through the files and read each image
    for file_name in file_list:
        # Skip anything that does not look like an image file.
        if not file_name.lower().endswith(('.jpg', '.jpeg', '.png', '.bmp', '.gif')):
            continue

        # Construct the full path to the image file
        image_path = os.path.join(folder_path, file_name)

        # The context manager closes the file handle once the image has been preprocessed.
        with Image.open(image_path) as image:
            imgs.append((preprocess(image).to(device), file_name))
else:
    print(f"The folder '{folder_path}' does not exist.")


def get_features(dataset, batch_size=100):
    """Encode every image in `dataset` with the CLIP image encoder.

    Args:
        dataset: Sequence of (image_tensor, label) pairs; the second item of
            each pair is ignored here.
        batch_size: Number of images encoded per forward pass.

    Returns:
        torch.Tensor: Image features concatenated along dim 0, in dataset order.

    Raises:
        ValueError: If `dataset` yields no batches (for example, it is empty).
    """
    all_features = []

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        for images, _ in tqdm(DataLoader(dataset, batch_size=batch_size, shuffle=False)):
            all_features.append(model.encode_image(images.to(device)))

    # torch.cat rejects an empty list with an opaque message; fail with a clear one instead.
    if not all_features:
        raise ValueError("dataset is empty; there are no images to encode.")

    return torch.cat(all_features)


def best_images(text_query, top_k=20):
  """Rank the loaded images against a text query using CLIP.

  Args:
    text_query: Query string.
    top_k: Maximum number of image indices to return.

  Returns:
    torch.Tensor: Indices into `imgs`, most similar first. Holds `top_k`
    entries, or fewer when fewer images are loaded.
  """
  # NOTE: image features are recomputed on every call.
  features = get_features(imgs)

  # Inference only; the tokens must be on the same device as the model.
  # truncate=True keeps over-long queries within CLIP's fixed context length.
  with torch.no_grad():
    q_tokens = clip.tokenize(text_query, truncate=True).to(device)
    q_feature = model.encode_text(q_tokens)

  # Normalize both sides so the dot product is a cosine similarity.
  q_feature = q_feature / q_feature.norm(dim=-1, keepdim=True)
  features = features / features.norm(dim=-1, keepdim=True)

  # Only the ranking is returned, so the raw similarities are ranked directly:
  # a softmax would not change the order and can underflow in half precision.
  similarity = q_feature @ features.T

  # Pick the most similar images for the query, capped at the number available.
  k = min(top_k, similarity.shape[-1])
  values, indices = similarity[0].topk(k)
  return indices

## Output file function

In [None]:
import csv
import os

def write_prediction_csv(output_file, predictions):
    """Append prediction rows to a CSV file, writing the header row only once.

    Args:
        output_file: Path of the CSV file; it is created if it does not exist.
        predictions: Iterable of dicts with the keys 'qid', 'doc_id' and 'img_id'.

    Raises:
        ValueError: If a prediction contains a key other than the three
            columns (raised by csv.DictWriter).
    """
    # Define the column names
    fieldnames = ['qid', 'doc_id', 'img_id']

    # The file is opened in append mode, so the header belongs only at the top of a
    # new or empty file; writing it on every call would repeat it between the rows.
    needs_header = not os.path.exists(output_file) or os.path.getsize(output_file) == 0

    with open(output_file, mode='a', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        if needs_header:
            writer.writeheader()

        # Write the prediction data
        writer.writerows(predictions)



## Inference

In [None]:
text_query = 'confront your friend about the abuse.'

index = best_docu(text_query)  # returns the row position of the best document
indices = best_images(text_query)  # returns the indices of the top 20 images, best first

# Writing the file
predictions = [
    {
        'qid': text_query,
        # best_docu returns a row position, so look it up positionally rather than by label.
        'doc_id': df.doc_id.iloc[index],
        # Keep the file names of the five best-ranked images.
        'img_id': [imgs[image_ID][1] for image_ID in indices[:5].tolist()],
    },
]
write_prediction_csv('predictions.csv', predictions)


100%|██████████| 3/3 [00:37<00:00, 12.35s/it]
