<a href="https://colab.research.google.com/github/shpotes/tensorflowers/blob/clip/notebooks/CLIPTest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import sys

if 'google.colab' in sys.modules:
  !pip install transformers datasets -qq

In [2]:
from datasets import load_dataset
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
from tqdm import tqdm

In [3]:
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

Downloading:   0%|          | 0.00/3.89k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/577M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/316 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/842k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/512k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/389 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/568 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.42M [00:00<?, ?B/s]

ftfy or spacy is not installed using BERT BasicTokenizer instead of ftfy.


In [4]:
ds = load_dataset("shpotes/tfcol", split="validation")

Downloading:   0%|          | 0.00/4.08k [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset tf_col/default to /root/.cache/huggingface/datasets/shpotes___tf_col)/default/1.0.0/0c616218d5e0a194334e0ed0adacd86ab9b315ec6b03a8b388dece024753def2...


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/687M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/469k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/225M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/155k [00:00<?, ?B/s]

  0%|          | 0/2 [00:00<?, ?it/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset tf_col downloaded and prepared to /root/.cache/huggingface/datasets/shpotes___tf_col)/default/1.0.0/0c616218d5e0a194334e0ed0adacd86ab9b315ec6b03a8b388dece024753def2. Subsequent calls will reuse this data.


In [5]:
def int2str(x):
  _int2str = ds.features["labels"].feature.int2str
  if isinstance(x, int):
    return _int2str(x)
  elif hasattr(x, '__iter__'):
    return [_int2str(i) for i in x]
  raise TypeError

def batch(iterable, n=1):
  l = len(iterable)
  num = range(l)
  for ndx in range(0, l, n):
    yield num[ndx:min(ndx + n, l)], iterable[ndx:min(ndx + n, l)]

In [6]:
images = [Image.open(img).convert("RGB") for img in ds["image"]]

prompt_seeds = [
    "clothing store",
    "liquor store",
    "barber shop",
    "electronic store",
    "coffee store",
    "furniture store",
    "hot dog cart", # puesto movil 
    "ERROR OSJDFADOIAJSOIDJAMS", # electrodomesticos
    "butcher shop",
    "bar",
    "pet shop",
    "store",
    "pharmacy",
    "sport store",
    "car shop",
    "shoe shop",
    "supermarket",
    "hotel"
]

prompts = [f"an image of a {seed}" for seed in prompt_seeds]

model = model.cuda()

In [7]:
score = [0 for _ in range(len(prompts))]

In [8]:
for image_idx, image_batch in tqdm(batch(images, 8)):
  input = processor(
      text=prompts, 
      images=image_batch,
      return_tensors="pt", 
      padding=True,
  )
  
  input = {k: v.cuda() for k, v in input.items()}

  outputs = model(**input)
  logits_per_image = outputs.logits_per_image
  probs = logits_per_image.softmax(dim=1)

  for img_idx, prob_idx in zip(image_idx, range(8)):
    topk = set(torch.topk(probs[prob_idx], 5).indices.cpu().tolist())
    labels = set(ds["labels"][img_idx])


    for i in (topk & labels):
      score[i] += 1

83it [00:26,  3.16it/s]


In [9]:
final_score = [local / len(ds) for local in score]

[0.057750759878419454,
 0.00911854103343465,
 0.02127659574468085,
 0.0243161094224924,
 0.0060790273556231,
 0.04559270516717325,
 0.00303951367781155,
 0.0,
 0.0182370820668693,
 0.0,
 0.019756838905775075,
 0.02127659574468085,
 0.00303951367781155,
 0.0060790273556231,
 0.004559270516717325,
 0.0,
 0.0121580547112462,
 0.0]

In [21]:
torch.rand(18)

tensor([0.0270, 0.5197, 0.2802, 0.8014, 0.1947, 0.9885, 0.5138, 0.8334, 0.5969,
        0.6727, 0.7295, 0.0855, 0.3730, 0.4220, 0.9219, 0.0513, 0.7940, 0.3188])

In [28]:
boots = [0 for _ in range(100)]

for k in range(100):
  random_baseline = [0 for _ in range(len(prompts))]

  for labels in ds["labels"]:
    topk = set(torch.topk(torch.rand(18), 5).indices.tolist())
    labels = set(labels)

    for i in (topk & labels):
        random_baseline[i] += 1

  random_baseline = [local / len(ds) for local in random_baseline]
  boots[k] = sum([x < y for x, y in zip(final_score, random_baseline)]) / len(prompts)

In [31]:
sum(boots) / 100

0.5816666666666673

In [None]:
# TODO: compute el per prompt boots score!