# Imports

In [9]:
import os
import sys
import inspect
import glob

import torch
from PIL import Image
import torchvision.transforms as transforms

sys.path.insert(0,'..')
import process_section
import clipscore

from torchmetrics.multimodal import CLIPScore

import time

# Existing clipscore implementation

In [35]:
# num_workers is set to 0 to work around a local issue

def process_images(image_dir, model, device, batch_size=64, num_workers=0):
    """Extract image features for every ``*.jpg`` exactly one directory below ``image_dir``.

    Args:
        image_dir: Root directory; files matching ``{image_dir}/*/*.jpg`` are used.
        model: Model object forwarded unchanged to ``clipscore.extract_all_images``.
        device: Device forwarded unchanged to ``clipscore.extract_all_images``.
        batch_size: Batch size forwarded to the extractor (default 64, the
            value that was previously hard-coded).
        num_workers: Worker count forwarded to the extractor (default 0, the
            value that was previously hard-coded).

    Returns:
        Whatever ``clipscore.extract_all_images`` returns (a dictionary,
        according to the original author's note).
    """
    # glob makes no ordering guarantee (it depends on the file system), so sort
    # the matches to feed the extractor in a reproducible order.
    image_paths = sorted(glob.glob(f'{image_dir}/*/*.jpg'))

    image_feats_lookup = clipscore.extract_all_images(
        image_paths, model, device, batch_size=batch_size, num_workers=num_workers)

    return image_feats_lookup

In [36]:
# Obtain the model and device from the local clipscore module, then build the
# image-feature lookup for the images under 'imgdir'. Later cells reuse
# `model`, `device` and `image_feats_lookup`.
model, device = clipscore.get_clip_mdl()
image_feats_lookup = process_images('imgdir', model, device)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.59s/it]


In [58]:
def gen_clip_score(image_set, prompts, image_feats_lookup, model, device):
    """Score every prompt against every image and return a nested mapping.

    Each prompt is wrapped as an ``('s1', prompt)`` segment and handed to
    ``process_section.calc_clip`` together with the full image set
    (``num_pics`` is set to the number of images).

    Returns:
        ``{prompt: {image: score}}`` where ``score`` is ``result[i][j]`` for
        the i-th prompt and j-th image of the ``calc_clip`` result.
    """
    segments = []
    for prompt in prompts:
        segments.append(('s1', prompt))

    scores = process_section.calc_clip(
        segments, image_set, image_feats_lookup, model, device,
        num_pics=len(image_set))

    # Re-key the positional score grid by prompt text and image identifier.
    table = {}
    for row, prompt in enumerate(prompts):
        per_image = {}
        for col, image in enumerate(image_set):
            per_image[image] = scores[row][col]
        table[prompt] = per_image
    return table

In [59]:
# Score two images against four prompts with the existing implementation and
# time the call. perf_counter() is a monotonic, high-resolution clock meant for
# measuring intervals, so the reading cannot be skewed by system clock changes.
start = time.perf_counter()
result = gen_clip_score(
    ['./imgdir/inner/cat.jpg', './imgdir/inner/ski.jpg'],
    ['This is a cat', 'A cute animal', 'A dog', 'An airplane'],
    image_feats_lookup, model, device)
end = time.perf_counter()

print('Result =', result)
print('Time used =', end - start)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:08<00:00,  8.30s/it]

Result = {'This is a cat': {'./imgdir/inner/cat.jpg': 0.65607315, './imgdir/inner/ski.jpg': 0.41508007}, 'A cute animal': {'./imgdir/inner/cat.jpg': 0.634635, './imgdir/inner/ski.jpg': 0.46036527}, 'A dog': {'./imgdir/inner/cat.jpg': 0.5394762, './imgdir/inner/ski.jpg': 0.4733926}, 'An airplane': {'./imgdir/inner/cat.jpg': 0.43243903, './imgdir/inner/ski.jpg': 0.4799783}}
Time used = 8.317385196685791





# TorchMetrics implementation

In [43]:
# Build the TorchMetrics CLIPScore metric using the
# "openai/clip-vit-base-patch32" checkpoint name.
metric = CLIPScore(model_name_or_path="openai/clip-vit-base-patch32")

In [45]:
def img2tensor(imgname):
    """Load the image file ``imgname`` and convert it with ``PILToTensor``.

    Args:
        imgname: Path (or other value accepted by ``PIL.Image.open``) of the
            image to load.

    Returns:
        The result of torchvision's ``PILToTensor`` transform applied to the
        opened image (presumably a uint8 tensor shaped (C, H, W) — confirm
        against the torchvision docs).
    """
    # Open the image in a context manager so the underlying file handle is
    # released as soon as the conversion is done, instead of lingering until
    # garbage collection. The conversion happens inside the block, while the
    # file is still open.
    with Image.open(imgname) as image:
        return transforms.PILToTensor()(image)

In [77]:
# Score a single image/prompt pair with the TorchMetrics metric.
# NOTE(review): the value shown below (26.6762) is on a different scale than
# the 0.656 reported by the existing implementation for the same image and
# prompt — presumably the two implementations apply different scaling factors
# to the cosine similarity; confirm before comparing the numbers directly.
score = metric(img2tensor('./imgdir/inner/cat.jpg'), "This is a cat")

In [78]:
# Display the score as a tensor detached from the autograd graph.
score.detach()

tensor(26.6762)