In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

import cv2
import skimage
import IPython.display
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np
from glob import glob

from collections import OrderedDict
import torch
import gc

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Install CLIP library

In [None]:
# Install ftfy and CLIP from local source trees under ../input/openaiclipweights
# (local paths are passed to pip, not package names).
!pip install ../input/openaiclipweights/python-ftfy-master/python-ftfy-master
!pip install ../input/openaiclipweights/clip/CLIP
# Copy the BPE vocabulary text file into the installed clip package directory.
# NOTE(review): the destination is hard-coded to /opt/conda with Python 3.7 —
# confirm it matches the site-packages directory of the running kernel.
!cp ../input/openaiclipweights/CLIP-main/CLIP-main/clip/bpe_simple_vocab_16e6.txt /opt/conda/lib/python3.7/site-packages/clip/.
# Compress the vocabulary; -k keeps the original .txt next to the new .txt.gz.
# Presumably the clip tokenizer reads the .gz variant — TODO confirm.
!gzip -k /opt/conda/lib/python3.7/site-packages/clip/bpe_simple_vocab_16e6.txt
# List the package directory so the copied/compressed files can be checked by eye.
!ls /opt/conda/lib/python3.7/site-packages/clip/.

In [None]:
import torch
import clip
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader, Dataset

# Report the installed PyTorch version for reproducibility.
print(f"Torch version: {torch.__version__}")

# Download some images from the Open Images collection.

In [None]:
!wget https://farm8.staticflickr.com/6036/6426668771_b5b915e46c_o.jpg
!wget https://c6.staticflickr.com/8/7457/10806045045_02d3dbdcee_o.jpg
!wget https://c1.staticflickr.com/4/3267/2888764405_0a0a608604_o.jpg
!wget https://farm8.staticflickr.com/4028/4294212194_a49663b2b9_o.jpg
!wget https://c5.staticflickr.com/9/8173/8019508216_6540c8686a_o.jpg
!wget https://farm3.staticflickr.com/1146/1357102390_943c5cb999_o.jpg

In [None]:
# Collect every JPEG in the working directory. glob() order depends on the
# file system, so sort to process (and print) the images in a stable order.
files = sorted(glob('*.jpg'))
print(files)

# List pretrained CLIP models available

In [None]:
# Ask the installed clip package which pretrained models it lists; the bare
# expression is the last line of the cell, so the notebook displays its value.
clip.available_models()

# List pretrained weights available

In [None]:
# List the files in the local models directory (path is relative to the
# notebook's working directory); the checkpoint loaded below lives here.
!ls ../input/openaiclipweights/clip/CLIP/models/

# Load CLIP Vision Transformer based model

In [None]:
# Location of the ViT-B-32 checkpoint file inside the local models directory.
CLIP_WEIGHTS_PATH = "../input/openaiclipweights/clip/CLIP/models/ViT-B-32.pt"

# clip.load returns the network together with its image preprocessing callable.
model, preprocess = clip.load(CLIP_WEIGHTS_PATH)

# Move the network to the GPU, then switch it to evaluation mode. eval() stays
# the last expression so the cell still displays the model.
model = model.cuda()
model.eval()

# For each image we will query with the following sentences and see what CLIP predicts.
# You can add custom sentences here.

In [None]:
# Text prompts scored against every image. Order is preserved in the result
# table's index, so append new prompts at the end.
QUERIES = [
    "a dog",
    "a cat",
    "an elephant",  # was "a elephant": fix the article so the prompt is the intended phrase
    "a zebra",
    "a sleeping dog",
    "a sleeping cat",
    "a giraffe",
    "a poodle",
    "animal inside a car",
    "animal outside a car",
    "a sofa",
    "some animals",
    "santa claus",
    "ipod",
    "two mugs",
    "three mugs",
    "blue sky",
]

# Score images vs queries using clip model

In [None]:
# Score every image in `files` against every prompt in `QUERIES` with CLIP.
#
# Image and text embeddings are both divided by their L2 norm, so the matrix
# product below is the cosine similarity between the image and each prompt.
with torch.no_grad():
    # Follow whatever device the model lives on instead of hard-coding CUDA.
    device = next(model.parameters()).device

    # The text embeddings do not depend on the image: tokenize and encode all
    # prompts once, as one batch, rather than once per (image, prompt) pair.
    texts = clip.tokenize(QUERIES).to(device)
    text_embeddings = model.encode_text(texts)
    text_embeddings /= text_embeddings.norm(dim=-1, keepdim=True)

    for file in files:
        print(file)
        # Load image from file; the context manager closes the file handle
        # once the RGB copy has been made.
        with Image.open(file) as raw_image:
            img = raw_image.convert("RGB")

        # Just show image in the notebook
        plt.imshow(cv2.resize(np.array(img), (256, 256)))
        plt.show()

        # Preprocess image using clip and add the batch dimension
        img = preprocess(img).unsqueeze(0).to(device)

        # Get Image embeddings
        image_embeddings = model.encode_image(img)
        image_embeddings /= image_embeddings.norm(dim=-1, keepdim=True)

        # Dot product between the single image and all text embeddings:
        # (1, d) @ (d, n_queries) -> (1, n_queries). Drop the batch axis and
        # convert to plain Python floats (avoids float() on a NumPy array).
        score = (image_embeddings @ text_embeddings.T).squeeze(0).cpu().tolist()

        print( pd.DataFrame({'query': QUERIES, 'score': score}).sort_values('score', ascending=False) )
        print('')
        print('-------------------------')
        print('')
