In [None]:
!pip install -r requirements.txt

In [2]:
from matplotlib import pyplot as plt
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForQuestionAnswering
import pandas as pd
import torchvision.transforms as transforms
from tqdm.auto import tqdm
import os
from torchmetrics.functional.multimodal import clip_score
import clip
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch
from torchmetrics.image.fid import FrechetInceptionDistance
from transformers import ViltProcessor, ViltForQuestionAnswering

# Data

In [3]:
# Folder (relative to this notebook) holding one sub-folder of images per prompt;
# the same name is reused for the evaluation output folder and file name.
base_path = 'CoM'
# CSV file with a 'prompt' column listing the prompts to evaluate.
dataset_path = '../../prompts/our prompts/test_two_objects_prompts.csv'

In [4]:
# For every prompt, re-save the generated image found in ./<base_path>/<prompt>/
# under the fixed name 'image.png', which is the path every scoring cell reads.
df = pd.read_csv(dataset_path)
for i in tqdm(range(len(df))):
    prompt = df['prompt'][i]
    prompt_dir = os.path.join('.', base_path, prompt)
    target_path = os.path.join(prompt_dir, 'image.png')
    # os.listdir() returns entries in arbitrary order; sorting makes the choice
    # reproducible when a folder holds several images (the last readable one wins,
    # as in the unsorted version).
    for file_name in sorted(os.listdir(prompt_dir)):
        source_path = os.path.join(prompt_dir, file_name)
        # Never use a previously written image.png as its own source, and skip
        # anything that is not a regular file.
        if file_name == 'image.png' or not os.path.isfile(source_path):
            continue
        try:
            with Image.open(source_path) as img:
                img.save(target_path)
        except (OSError, ValueError):
            # Best effort: entries that are not readable images are skipped.
            # Unlike a bare `except`, this still lets KeyboardInterrupt stop the loop.
            continue

  0%|          | 0/2485 [00:00<?, ?it/s]

# BLIP VQA

In [5]:
# Load the BLIP VQA checkpoint. The device is chosen with a CPU fallback (the same
# expression the CLIP cells use) instead of assuming that CUDA is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
blip_vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
blip_vqa_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(device)
def get_score_prompt_and_image_blip_vqa(prompt, image_addres):
    """Score an image by asking BLIP VQA whether each of the two prompt objects is present.

    The objects are taken from whitespace-separated words 1 and 4 of ``prompt``
    (this presumably matches prompts shaped like "a X and a Y" -- verify against
    the dataset). For each object the question "Is there any <object> in the
    picture?" is asked and counts as a hit when the decoded answer is "yes".

    Args:
        prompt: Text prompt containing at least five words.
        image_addres: Path of the image file to evaluate.

    Returns:
        0.0, 0.5 or 1.0 -- the fraction of the two objects answered with "yes".

    Raises:
        IndexError: If ``prompt`` has fewer than five words.
    """
    # Read the pixels inside a context manager so the file handle is released
    # immediately; convert() returns an independent in-memory copy.
    with Image.open(image_addres) as image_file:
        image = image_file.convert("RGB")

    words = prompt.split()
    questions = [
        f"Is there any {words[1]} in the picture?",
        f"Is there any {words[4]} in the picture?",
    ]
    print(f'{questions[0]}  {questions[1]}')

    # Follow the model's actual device rather than assuming "cuda".
    model_device = blip_vqa_model.device
    hits = 0
    for question in questions:
        inputs = blip_vqa_processor(image, question, return_tensors="pt").to(model_device)
        answer_ids = blip_vqa_model.generate(**inputs)[0]
        answer = blip_vqa_processor.decode(answer_ids, skip_special_tokens=True)
        # Normalise whitespace/case so a stray space or capital does not turn a "yes" into a miss.
        if answer.strip().lower() == "yes":
            hits += 1
    return hits / 2

# Score every prompt with BLIP VQA.
# "soft" keeps the raw fraction of detected objects; "harsh" is 1 only when the
# score equals 1 (both objects detected), otherwise 0.
df = pd.read_csv(dataset_path)
blip_vqa_scores_soft, blip_vqa_scores_harsh = [], []
for prompt in tqdm(df['prompt'].tolist()):
  image_file = f'./{base_path}/{prompt}/image.png'
  score = get_score_prompt_and_image_blip_vqa(prompt=prompt, image_addres=image_file)
  blip_vqa_scores_soft.append(score)
  blip_vqa_scores_harsh.append(int(score == 1))

  0%|          | 0/2485 [00:00<?, ?it/s]

Is there any airplane in the picture?  Is there any apple in the picture?




Is there any airplane in the picture?  Is there any backpack in the picture?
Is there any airplane in the picture?  Is there any banana in the picture?
Is there any airplane in the picture?  Is there any bear in the picture?
Is there any airplane in the picture?  Is there any bed in the picture?
Is there any airplane in the picture?  Is there any bench in the picture?
Is there any airplane in the picture?  Is there any bicycle in the picture?
Is there any airplane in the picture?  Is there any bird in the picture?
Is there any airplane in the picture?  Is there any blender in the picture?
Is there any airplane in the picture?  Is there any boat in the picture?
Is there any airplane in the picture?  Is there any book in the picture?
Is there any airplane in the picture?  Is there any bottle in the picture?
Is there any airplane in the picture?  Is there any bowl in the picture?
Is there any airplane in the picture?  Is there any broccoli in the picture?
Is there any airplane in the pict

# CLIP score

In [6]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
  device = "cuda"
# Load the CLIP ViT-B/32 model together with its image preprocessing callable.
clip_model, clip_preprocess = clip.load("ViT-B/32", device=device)
def image_encoder(image):
  """Return the CLIP image embedding of ``image`` as a float32 tensor.

  The image is run through ``clip_preprocess``, given a leading batch dimension,
  moved to ``device`` and encoded with gradients disabled.
  """
  batch = clip_preprocess(image).unsqueeze(0).to(device)
  with torch.no_grad():
    return clip_model.encode_image(batch).float()
def text_encoder(text: str):
  """Return the CLIP text embedding of ``text`` as a float32 tensor.

  The string is tokenized as a one-item batch, moved to ``device`` and encoded
  with gradients disabled.
  """
  tokens = clip.tokenize([text]).to(device)
  with torch.no_grad():
    return clip_model.encode_text(tokens).float()
def cos_sim(tens1, tens2):
  """Cosine similarity (along dim 1) of two tensors, returned as a Python float.

  ``.item()`` requires the result to hold exactly one element, so both inputs
  are expected to be one-row batches.
  """
  return torch.cosine_similarity(tens1, tens2, dim=1).item()
def sim_text_image(text, image):
  """Cosine similarity between the CLIP embeddings of ``text`` and ``image``."""
  text_embedding = text_encoder(text)
  image_embedding = image_encoder(image)
  return cos_sim(text_embedding, image_embedding)

In [7]:
# Reload the prompt table; its 'prompt' column drives the CLIP scoring loop in this cell.
df = pd.read_csv(dataset_path)
def get_score_prompt_and_image_clip(prompt, image_addres):
  """Return the CLIP text-image cosine similarity between ``prompt`` and an image file.

  Args:
    prompt: Text to compare against the image.
    image_addres: Path of the image file to evaluate.

  Returns:
    The similarity computed by ``sim_text_image`` as a Python float.
  """
  # The context manager guarantees the file handle is released even if scoring raises.
  with Image.open(image_addres) as image:
    return sim_text_image(prompt, image)

# One CLIP similarity per prompt, in the same order as the rows of df.
clip_scores = []
for prompt in tqdm(df['prompt'].tolist()):
  image_file = f'./{base_path}/{prompt}/image.png'
  clip_score_value = get_score_prompt_and_image_clip(prompt=prompt, image_addres=image_file)
  clip_scores.append(clip_score_value)

  0%|          | 0/2485 [00:00<?, ?it/s]

# BLIP captioning

In [8]:
# Choose the device once, before loading anything, so the BLIP captioning model
# gets the same CPU fallback as CLIP instead of a hard-coded "cuda".
device = "cuda" if torch.cuda.is_available() else "cpu"

blip_captioning_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_captioning_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

# CLIP (ViT-B/32) is used below to embed the generated caption and the prompt.
clip_model, clip_preprocess = clip.load("ViT-B/32", device=device)
def text_encoder(text: str):
  """Encode ``text`` with ``clip_model.encode_text`` and return a float32 tensor.

  NOTE(review): a function with the same name and behaviour is also defined in
  the CLIP score section; consider keeping a single definition.
  """
  token_batch = clip.tokenize([text]).to(device)  # one-item batch on the model's device
  with torch.no_grad():
    return clip_model.encode_text(token_batch).float()
def cos_sim_text_text(text_1, text_2):
  """Cosine similarity between the CLIP text embeddings of two strings, as a float."""
  embedding_1 = text_encoder(text_1)
  embedding_2 = text_encoder(text_2)
  return torch.cosine_similarity(embedding_1, embedding_2).item()


In [9]:
def get_score_prompt_and_image_captioning(prompt, image_addres):
  """Caption an image with BLIP and compare the caption with the prompt.

  Args:
    prompt: Text prompt the image is supposed to depict.
    image_addres: Path of the image file to evaluate.

  Returns:
    The value of ``cos_sim_text_text(caption, prompt)``, where ``caption`` is the
    decoded output of unconditional BLIP captioning for the image.
  """
  # Read the pixels inside a context manager so the file handle is released
  # immediately; convert() returns an independent in-memory copy.
  with Image.open(image_addres) as image_file:
    image = image_file.convert("RGB")

  # Unconditional image captioning; inputs follow the model's actual device
  # rather than assuming "cuda".
  inputs = blip_captioning_processor(image, return_tensors="pt").to(blip_captioning_model.device)

  out = blip_captioning_model.generate(**inputs)
  caption = blip_captioning_processor.decode(out[0], skip_special_tokens=True)
  return cos_sim_text_text(caption, prompt)


# One caption-vs-prompt similarity per prompt, in the same order as the rows of df.
df = pd.read_csv(dataset_path)
blip_captioning_scores = []
for prompt in tqdm(df['prompt'].tolist()):
  image_file = f'./{base_path}/{prompt}/image.png'
  caption_score = get_score_prompt_and_image_captioning(prompt=prompt, image_addres=image_file)
  blip_captioning_scores.append(caption_score)

  0%|          | 0/2485 [00:00<?, ?it/s]



# TIFA score

In [10]:
!git clone https://github.com/Yushi-Hu/tifa

fatal: destination path 'tifa' already exists and is not an empty directory.


In [11]:
# Imported here rather than in the top import cell: presumably the `tifa` package
# is the repository cloned into ./tifa by the previous cell, so it only becomes
# importable after that cell has run.
from tifa.tifascore import VQAModel
# VQA model queried by get_score_prompt_and_image_TIFA below.
tifa_model = VQAModel("mplug-large")
def get_score_prompt_and_image_TIFA(prompt, image_addres):
  """Fraction of the two prompt objects that the TIFA VQA model reports as present.

  The objects are whitespace-separated words 1 and 4 of ``prompt``. For each one
  a yes/no multiple-choice question is asked about the image at ``image_addres``;
  the result is 0.0, 0.5 or 1.0. Raises IndexError when the prompt has fewer
  than five words.
  """
  words = prompt.split()
  questions = [f"Is there any {words[position]} in the picture?" for position in (1, 4)]
  print(f'{questions[0]}  {questions[1]}')
  hits = 0
  for question in questions:
    result = tifa_model.multiple_choice_vqa(image_addres, question, choices=['yes', 'no'])
    if result['multiple_choice_answer'] == "yes":
      hits += 1
  return hits / 2

# Score every prompt with the TIFA VQA model.
# "soft" keeps the raw fraction of detected objects; "harsh" is 1 only when the
# score equals 1 (both objects detected), otherwise 0.
df = pd.read_csv(dataset_path)
tifa_scores_soft, tifa_scores_harsh = [], []
for prompt in tqdm(df['prompt'].tolist()):
  image_file = f'./{base_path}/{prompt}/image.png'
  score = get_score_prompt_and_image_TIFA(prompt=prompt, image_addres=image_file)
  tifa_scores_soft.append(score)
  tifa_scores_harsh.append(int(score == 1))

2024-02-23 16:04:29,539 - modelscope - INFO - PyTorch version 2.1.0 Found.
2024-02-23 16:04:29,546 - modelscope - INFO - Loading ast index from C:\Users\user01\.cache\modelscope\ast_indexer
2024-02-23 16:04:30,217 - modelscope - INFO - Loading done! Current index file version is 1.4.2, with md5 975378c766c1709e9bd5d225e817233e and a total number of 842 components indexed


No module named 'tensorflow'
Loading mplug-large...


2024-02-23 16:04:44,517 - modelscope - INFO - Model revision not specified, use the latest revision: v1.0.0
INFO:modelscope:Model revision not specified, use the latest revision: v1.0.0
2024-02-23 16:04:45,308 - modelscope - INFO - initiate model from C:\Users\user01\.cache\modelscope\hub\damo\mplug_visual-question-answering_coco_large_en
INFO:modelscope:initiate model from C:\Users\user01\.cache\modelscope\hub\damo\mplug_visual-question-answering_coco_large_en
2024-02-23 16:04:45,310 - modelscope - INFO - initiate model from location C:\Users\user01\.cache\modelscope\hub\damo\mplug_visual-question-answering_coco_large_en.
INFO:modelscope:initiate model from location C:\Users\user01\.cache\modelscope\hub\damo\mplug_visual-question-answering_coco_large_en.
2024-02-23 16:04:45,316 - modelscope - INFO - initialize model from C:\Users\user01\.cache\modelscope\hub\damo\mplug_visual-question-answering_coco_large_en
INFO:modelscope:initialize model from C:\Users\user01\.cache\modelscope\hub\d

load checkpoint from C:\Users\user01\.cache\modelscope\hub\damo\mplug_visual-question-answering_coco_large_en\pytorch_model.bin
<All keys matched successfully>
Finish loading mplug-large
Using SBERT on GPU


  0%|          | 0/2485 [00:00<?, ?it/s]

Is there any airplane in the picture?  Is there any apple in the picture?




Is there any airplane in the picture?  Is there any backpack in the picture?
Is there any airplane in the picture?  Is there any banana in the picture?
Is there any airplane in the picture?  Is there any bear in the picture?
Is there any airplane in the picture?  Is there any bed in the picture?
Is there any airplane in the picture?  Is there any bench in the picture?
Is there any airplane in the picture?  Is there any bicycle in the picture?
Is there any airplane in the picture?  Is there any bird in the picture?
Is there any airplane in the picture?  Is there any blender in the picture?
Is there any airplane in the picture?  Is there any boat in the picture?
Is there any airplane in the picture?  Is there any book in the picture?
Is there any airplane in the picture?  Is there any bottle in the picture?
Is there any airplane in the picture?  Is there any bowl in the picture?
Is there any airplane in the picture?  Is there any broccoli in the picture?
Is there any airplane in the pict

# VILT

In [12]:
# Load the ViLT VQA checkpoint. The device is chosen with a CPU fallback (the same
# expression the CLIP cells use) instead of assuming that CUDA is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
vilt_processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
vilt_model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa").to(device)

def get_score_prompt_and_image_vilt(prompt, image_addres):
  """Average ViLT "yes" probability for the two objects named in the prompt.

  The objects are whitespace-separated words 1 and 4 of ``prompt``. For each one
  the question "Is there any <object> in the picture?" is asked and the score is
  the softmax probability of "yes" restricted to the two answers "yes" and "no".

  Args:
    prompt: Text prompt containing at least five words.
    image_addres: Path of the image file to evaluate.

  Returns:
    Mean of the two yes-probabilities, a float in [0, 1].

  Raises:
    IndexError: If ``prompt`` has fewer than five words.
  """
  # split() (like the other scorers) instead of split(" ") so repeated spaces
  # do not shift the word positions.
  words = prompt.split()
  entities = (words[1], words[4])

  # Read the pixels inside a context manager so the file handle is released;
  # convert("RGB") yields an independent copy with three channels.
  with Image.open(image_addres) as image_file:
    image = image_file.convert("RGB")

  # Look the answer indices up by label instead of relying on bare magic numbers;
  # 3 and 9 (the previously hard-coded values) remain the fallback.
  label2id = vilt_model.config.label2id or {}
  yes_id = label2id.get("yes", 3)
  no_id = label2id.get("no", 9)

  yes_scores = []
  for entity in entities:
    question = f"Is there any {entity} in the picture?"
    inputs = vilt_processor(image, question, return_tensors="pt").to(vilt_model.device)
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
      logits = vilt_model(**inputs).logits[0]
    # Two-way softmax == exp(yes) / (exp(yes) + exp(no)), but without overflow
    # for large logits.
    yes_probability = torch.softmax(logits[[yes_id, no_id]], dim=0)[0].item()
    yes_scores.append(yes_probability)
  return sum(yes_scores) / 2


# One ViLT score per prompt, in the same order as the rows of df.
df = pd.read_csv(dataset_path)
vilt_scores = []
for prompt in tqdm(df['prompt'].tolist()):
  image_file = f'./{base_path}/{prompt}/image.png'
  vilt_score = get_score_prompt_and_image_vilt(prompt=prompt, image_addres=image_file)
  vilt_scores.append(vilt_score)

  0%|          | 0/2485 [00:00<?, ?it/s]

# Save results

In [13]:
# Create ./evaluations/<base_path>/ (including the parent folder) in one call;
# exist_ok=True makes the cell safe to re-run and removes the repeated
# check-then-mkdir pairs.
output_dir = './evaluations/' + base_path + '/'
os.makedirs(output_dir, exist_ok=True)

# Attach every metric as a column next to its prompt. Each list was filled in
# row order of the same CSV, so it lines up with the freshly loaded frame.
df = pd.read_csv(dataset_path)
df['blip vqa score soft'] = blip_vqa_scores_soft
df['blip vqa score harsh'] = blip_vqa_scores_harsh
df['clip score'] = clip_scores
df['blip captioning score'] = blip_captioning_scores
df['tifa score soft'] = tifa_scores_soft
df['tifa score harsh'] = tifa_scores_harsh
df['vilt score'] = vilt_scores

df.to_csv(output_dir + 'evaluations_' + base_path + '.csv')