In [1]:
%pip install torchmetrics
%pip install torch
%pip install transformers>=4.10.0
%pip install torchmetrics[multimodal]

Collecting torchmetrics
  Downloading torchmetrics-1.2.0-py3-none-any.whl (805 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m805.2/805.2 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.9.0-py3-none-any.whl (23 kB)
Installing collected packages: lightning-utilities, torchmetrics
Successfully installed lightning-utilities-0.9.0 torchmetrics-1.2.0
Collecting piq<=0.8.0 (from torchmetrics[multimodal])
  Downloading piq-0.8.0-py3-none-any.whl (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.9/106.9 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: piq
Successfully installed piq-0.8.0


# Image-Text CLIP Score

In [2]:
from torchmetrics.multimodal.clip_score import CLIPScore
from functools import partial
import torch

def calculate_clip_score_text(images, prompts):
    """Compute the CLIP score between a batch of images and their prompt(s).

    Args:
        images: NumPy array with the channel axis last; the permute below
            turns (batch, height, width, channels) into
            (batch, channels, height, width). Floating-point arrays are scaled
            by 255 before the uint8 cast (assumes values in [0, 1] — TODO
            confirm with callers). Integer arrays are cast to uint8 without
            scaling (assumes they already hold 0-255 pixel values).
        prompts: Caption text forwarded unchanged to the metric.

    Returns:
        The metric value as a Python float rounded to 4 decimal places.
    """
    if images.dtype.kind in "ui":
        # Integer pixel data (e.g. what an image reader returns) must not be
        # multiplied by 255: the product wraps around in uint8 and corrupts
        # the image before it reaches the model.
        images_int = images.astype("uint8")
    else:
        images_int = (images * 255).astype("uint8")

    # NOTE: a new metric object is constructed on every call.
    metric = CLIPScore(model_name_or_path="openai/clip-vit-base-patch16")
    channels_first = torch.from_numpy(images_int).permute(0, 3, 1, 2)
    clip_score_res = metric(channels_first, prompts).detach()
    return round(float(clip_score_res), 4)


In [None]:
# Use the explicit v2 API: the top-level `imageio.imread` is deprecated and
# emits a warning, while `imageio.v2.imread` keeps the same behaviour.
import imageio.v2 as iio

# Read the image from disk, then prepend a leading axis so the single image
# forms a batch of one.
img = iio.imread("/content/doge_riding_bicycle_3.png")
img = img[None]

In [None]:
# Score the one-image batch against its caption and report the rounded value.
sd_clip_score = calculate_clip_score_text(
    images=img,
    prompts="a dog riding a bicycle",
)
print(f"CLIP score: {sd_clip_score}")

In [6]:
from transformers import (
    CLIPTokenizer,
    CLIPTextModelWithProjection,
    CLIPVisionModelWithProjection,
    CLIPImageProcessor,
)
# Run on the GPU when torch can see one; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Every component is loaded from the same checkpoint id.
clip_id = "openai/clip-vit-large-patch14"

# Pre-processing helpers.
tokenizer = CLIPTokenizer.from_pretrained(clip_id)
image_processor = CLIPImageProcessor.from_pretrained(clip_id)

# Encoders, moved onto the selected device after loading.
text_encoder = CLIPTextModelWithProjection.from_pretrained(clip_id)
text_encoder = text_encoder.to(device)
image_encoder = CLIPVisionModelWithProjection.from_pretrained(clip_id)
image_encoder = image_encoder.to(device)

Downloading (…)rocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

# Image-Image CLIP Directional Similarity (with prompts)

In [9]:
import torch.nn as nn
import torch.nn.functional as F
import numpy as np


class DirectionalSimilarity(nn.Module):
    """Cosine similarity between an image-space edit and a text-space edit.

    Both images and both captions are embedded and L2-normalised; the result
    is the cosine similarity between ``image_two - image_one`` and
    ``caption_two - caption_one`` in embedding space.

    Inputs are moved to the device of the encoder that consumes them, so the
    module does not depend on any global ``device`` variable. Gradients are
    not disabled here; wrap calls in ``torch.no_grad()`` for pure evaluation.

    Args:
        tokenizer: Callable that accepts ``text``, ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors``, exposes ``model_max_length``
            and returns an object with an ``input_ids`` tensor.
        text_encoder: Module called with ``input_ids=...`` whose output has a
            ``text_embeds`` attribute.
        image_processor: Callable that accepts an image and ``return_tensors``
            and returns a mapping containing ``"pixel_values"``.
        image_encoder: Module called with ``pixel_values=...`` whose output has
            an ``image_embeds`` attribute.
    """

    def __init__(self, tokenizer, text_encoder, image_processor, image_encoder):
        super().__init__()
        self.tokenizer = tokenizer
        self.text_encoder = text_encoder
        self.image_processor = image_processor
        self.image_encoder = image_encoder

    @staticmethod
    def _module_device(module):
        """Return the device of ``module``'s first parameter (CPU if it has none)."""
        first_param = next(module.parameters(), None)
        if first_param is None:
            return torch.device("cpu")
        return first_param.device

    def preprocess_image(self, image):
        """Run the image processor and place the pixel values on the image encoder's device."""
        image = self.image_processor(image, return_tensors="pt")["pixel_values"]
        return {"pixel_values": image.to(self._module_device(self.image_encoder))}

    def tokenize_text(self, text):
        """Tokenize ``text`` (padded/truncated to the tokenizer's max length) on the text encoder's device."""
        inputs = self.tokenizer(
            text,
            max_length=self.tokenizer.model_max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {"input_ids": inputs.input_ids.to(self._module_device(self.text_encoder))}

    def encode_image(self, image):
        """Return the image embedding, normalised to unit length along dim 1."""
        preprocessed_image = self.preprocess_image(image)
        image_features = self.image_encoder(**preprocessed_image).image_embeds
        image_features = image_features / image_features.norm(dim=1, keepdim=True)
        return image_features

    def encode_text(self, text):
        """Return the text embedding, normalised to unit length along dim 1."""
        tokenized_text = self.tokenize_text(text)
        text_features = self.text_encoder(**tokenized_text).text_embeds
        text_features = text_features / text_features.norm(dim=1, keepdim=True)
        return text_features

    def compute_directional_similarity(self, img_feat_one, img_feat_two, text_feat_one, text_feat_two):
        """Cosine similarity between the image-feature delta and the text-feature delta."""
        sim_direction = F.cosine_similarity(img_feat_two - img_feat_one, text_feat_two - text_feat_one)
        return sim_direction

    def forward(self, image_one, image_two, caption_one, caption_two):
        """Embed both images and both captions and return their directional similarity.

        Args:
            image_one: Image before the edit, in a form the image processor accepts.
            image_two: Image after the edit.
            caption_one: Caption describing ``image_one``.
            caption_two: Caption describing ``image_two``.

        Returns:
            Tensor of cosine similarities, one value per row of the embeddings.
        """
        img_feat_one = self.encode_image(image_one)
        img_feat_two = self.encode_image(image_two)
        text_feat_one = self.encode_text(caption_one)
        text_feat_two = self.encode_text(caption_two)
        directional_similarity = self.compute_directional_similarity(
            img_feat_one, img_feat_two, text_feat_one, text_feat_two
        )
        return directional_similarity

In [11]:
dir_similarity = DirectionalSimilarity(tokenizer, text_encoder, image_processor, image_encoder)
scores = []

# Form fields. The paths get their own names so that `original_image` /
# `edited_image` always refer to the decoded images, never to a path string.
original_image_path = "/content/doge_riding_bicycle_1.png" #@param {type:"string"}
original_caption = "a photo of doge riding bicycle" #@param {type:"string"}
edited_image_path = "/content/doge_riding_bicycle_2.png" #@param {type:"string"}
modified_caption = "a photo of dog riding bicycle" #@param {type:"string"}

# Read both images from disk.
original_image = iio.imread(original_image_path)
edited_image = iio.imread(edited_image_path)

# Evaluation only: skip autograd bookkeeping while running the encoders.
with torch.no_grad():
    similarity_score = dir_similarity(original_image, edited_image, original_caption, modified_caption)
scores.append(float(similarity_score.detach().cpu()))

# The printed value depends on the images and captions supplied above.
print(f"CLIP directional similarity: {np.mean(scores)}")

  original_image = iio.imread(original_image)
  edited_image = iio.imread(edited_image)


CLIP directional similarity: -0.07524948567152023
