In [None]:
!pip install ftfy regex tqdm
!pip install git+https://github.com/openai/CLIP.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ftfy
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 1.9 MB/s 
Installing collected packages: ftfy
Successfully installed ftfy-6.1.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-7y_bcq_m
  Running command git clone -q https://github.com/openai/CLIP.git /tmp/pip-req-build-7y_bcq_m
Building wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369409 sha256=d051709c82cff0c671d9918263dc30752cb56888a0390445483e91959dd794c8
  Stored in directory: /tmp/pip-ephem-wheel-cache-5uako29o/wheels/fd/b9/c3/5b4470e35ed76e174bff77c92f91da82098d5e35fd5bc8cdac
Successfully

In [None]:
import torch
import clip
import numpy as np
from PIL import Image

In [None]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

cuda


### Set up class feature vectors

In [None]:
# Class names and prompt templates, keyed as '<dataset>_classes' and
# '<dataset>_templates'. Each template has one '{}' slot that is filled with a
# class name when the prompts are built.
# NOTE(review): the 'a example of ...' wording is kept exactly as written;
# editing a template changes the text that gets embedded.
classes_templates = {
    'cifar10_classes': [
        'airplane', 'automobile', 'bird', 'cat', 'deer',
        'dog', 'frog', 'horse', 'ship', 'truck',
    ],
    'cifar10_templates': [
        'a photo of a {}.',
        'a blurry photo of a {}.',
        'a black and white photo of a {}.',
        'a low contrast photo of a {}.',
        'a high contrast photo of a {}.',
        'a bad photo of a {}.',
        'a good photo of a {}.',
        'a photo of a small {}.',
        'a photo of a big {}.',
        'a photo of the {}.',
        'a blurry photo of the {}.',
        'a black and white photo of the {}.',
        'a low contrast photo of the {}.',
        'a high contrast photo of the {}.',
        'a bad photo of the {}.',
        'a good photo of the {}.',
        'a photo of the small {}.',
        'a photo of the big {}.',
    ],
    'kinetics_templates': [
        'a photo of {}.',
        'a photo of a person {}.',
        'a photo of a person using {}.',
        'a photo of a person doing {}.',
        'a photo of a person during {}.',
        'a photo of a person performing {}.',
        'a photo of a person practicing {}.',
        'a video of {}.',
        'a video of a person {}.',
        'a video of a person using {}.',
        'a video of a person doing {}.',
        'a video of a person during {}.',
        'a video of a person performing {}.',
        'a video of a person practicing {}.',
        'a example of {}.',
        'a example of a person {}.',
        'a example of a person using {}.',
        'a example of a person doing {}.',
        'a example of a person during {}.',
        'a example of a person performing {}.',
        'a example of a person practicing {}.',
        'a demonstration of {}.',
        'a demonstration of a person {}.',
        'a demonstration of a person using {}.',
        'a demonstration of a person doing {}.',
        'a demonstration of a person during {}.',
        'a demonstration of a person performing {}.',
        'a demonstration of a person practicing {}.',
    ],
    'kinetics_classes': ['air drumming', 'chasing', 'head stand', 'tackling', 'yoga'],
}

In [None]:
# Build every prompt string: for each dataset, each class name is substituted
# into each of that dataset's templates (dataset-major, then class, then
# template order).
all_classes = [
    template.format(class_name)
    for dataset_name in ('cifar10', 'kinetics')
    for class_name in classes_templates[f'{dataset_name}_classes']
    for template in classes_templates[f'{dataset_name}_templates']
]

In [None]:
len(all_classes)  # number of prompts: one per (class, template) pair across both datasets

320

In [None]:
# Load the "ViT-B/32" CLIP checkpoint onto `device`. clip.load returns the
# model together with the image transform to apply to inputs before encoding.
model, preprocess = clip.load(name="ViT-B/32", device=device)

100%|████████████████████████████████████████| 338M/338M [00:01<00:00, 263MiB/s]


In [None]:
# Tokenize every prompt and move the token tensor to the model's device.
text = clip.tokenize(all_classes).to(device)

# Encode the prompts once; gradients are not needed for inference.
with torch.no_grad():
    text_features = model.encode_text(text)

In [None]:
# Sanity check: expect one row per prompt, i.e. (len(all_classes), feature_dim).
text_features.shape

torch.Size([320, 512])

### Load data and preprocess

In [None]:
!curl https://images.pexels.com/photos/104827/cat-pet-animal-domestic-104827.jpeg > cat.png

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100 1906k  100 1906k    0     0  13.6M      0 --:--:-- --:--:-- --:--:-- 13.6M


In [None]:
IMG_PATH = 'cat.png'

# Open the file in a context manager so the underlying handle is closed
# deterministically once the tensor has been produced.
with Image.open(IMG_PATH) as pil_image:
    # Apply the CLIP image transform, add a leading batch axis with
    # unsqueeze(0), and move the result to the model's device.
    image = preprocess(pil_image).unsqueeze(0).to(device)

In [None]:
# Sanity check: a single preprocessed image with the leading batch axis added by unsqueeze(0).
image.shape

torch.Size([1, 3, 224, 224])

In [None]:
# Batch size mimics the number of frames in an 8-minute video sampled at 1 fps.
BATCH_SIZE = 8 * 60
# How many of the highest-probability prompts to report per frame.
TOP_N = 5

# Repeat the single image along the batch axis only; the remaining axes keep
# their size (repeat factor 1).
tile_dims = (BATCH_SIZE, 1, 1, 1)
inp = torch.tile(image, tile_dims)

In [None]:
# Sanity check: the leading dimension should equal BATCH_SIZE after tiling.
inp.shape

torch.Size([480, 3, 224, 224])

### Extract features and calculate similarity

In [None]:
with torch.no_grad():
    # Encode only the images. The prompt embeddings already exist in
    # `text_features`, so calling model(inp, text) would needlessly re-run the
    # text encoder over every prompt.
    image_features = model.encode_image(inp)

    # Same math as CLIP's forward pass: L2-normalise both sets of features,
    # then scale the cosine similarities by the learned temperature.
    image_features = image_features / image_features.norm(dim=1, keepdim=True)
    text_features_normed = text_features / text_features.norm(dim=1, keepdim=True)
    logits_per_image = model.logit_scale.exp() * image_features @ text_features_normed.t()

    # Softmax over the prompt axis gives a distribution over prompts per image.
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()

    # argsort is ascending: keep the last TOP_N columns, then reverse them so
    # each row lists prompt indices from most to least probable.
    top_n_class_ids = np.argsort(probs, axis=1)[:, -TOP_N:][:, ::-1]

In [None]:
top_n_class_ids  # one row per input image: indices into all_classes of the TOP_N prompts, highest probability first

array([[54, 61, 63, 60, 70],
       [54, 61, 63, 60, 70],
       [54, 61, 63, 60, 70],
       ...,
       [54, 61, 63, 60, 70],
       [54, 61, 63, 60, 70],
       [54, 61, 63, 60, 70]])

In [None]:
# Translate the first image's top-N prompt indices back into prompt strings.
first_image_top_ids = top_n_class_ids[0]
[all_classes[prompt_idx] for prompt_idx in first_image_top_ids]

['a photo of a cat.',
 'a photo of a small cat.',
 'a photo of the cat.',
 'a good photo of a cat.',
 'a photo of the small cat.']