In [None]:
# We use the CLIP score to further select the questions that have stronger probing ability.

In [None]:
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import torch
import torchvision.transforms as T
from torchvision.transforms.functional import crop
import numpy as np
import os
from tqdm import tqdm
from pathlib import Path

from infer_utili.data_utili import get_data, read_json, save_json

In [None]:
# Preferred CUDA device index for running CLIP.
gpu_id = 3
# Only target the requested GPU when it actually exists on this machine;
# otherwise fall back to the CPU so that moving the model to DEVICE does not
# fail on hosts with fewer GPUs (or none at all).
if torch.cuda.is_available() and gpu_id < torch.cuda.device_count():
    DEVICE = torch.device(f'cuda:{gpu_id}')
else:
    DEVICE = torch.device('cpu')

In [None]:
# Load the CLIP processor and model from the same checkpoint directory,
# switch the model to inference mode, and move it to the selected device.
clip_processor = CLIPProcessor.from_pretrained("openai_clip-vit-large-patch14-336")
clip_model = CLIPModel.from_pretrained("openai_clip-vit-large-patch14-336")
clip_model = clip_model.eval()
clip_model.to(DEVICE)

In [None]:
def clip_similarity(model, processor, img_patch, text):
    """Return the CLIP cosine similarity between one image patch and one text.

    Args:
        model: CLIP model. It is called with the processor output as keyword
            arguments, must expose a ``device`` attribute, and its result
            must provide ``image_embeds`` and ``text_embeds``.
        processor: CLIP processor used to tokenize ``text`` and preprocess
            ``img_patch`` into PyTorch tensors.
        img_patch: A single image (patch) to score.
        text: A single text string to compare the patch against.

    Returns:
        float: Cosine similarity between the image and text embeddings.
        ``.item()`` is used, so exactly one image and one text are expected.
    """
    # CLIP's text encoder has a fixed context length (77 tokens for the
    # OpenAI checkpoints). Long free-form descriptions exceed it and make the
    # forward pass fail, so the text is truncated to the tokenizer's maximum.
    inputs = processor(
        text=text,
        images=img_patch,
        return_tensors="pt",
        padding=True,
        truncation=True,
    ).to(model.device)

    # Get embeddings (no gradients needed for scoring).
    with torch.no_grad():
        outputs = model(**inputs)
        image_embeds = outputs.image_embeds
        text_embeds = outputs.text_embeds

    # Cosine similarity = dot product of the L2-normalised embeddings.
    image_embeds = image_embeds / image_embeds.norm(dim=-1, keepdim=True)
    text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True)
    similarity = (image_embeds @ text_embeds.T).item()

    return similarity

In [None]:
# Experiment switches: each option tuple lists the supported values, and the
# trailing index selects the one used for this run.
target_model = ('llava_v1_5_7b', 'llama_adapter_v2', 'MiniGPT4')[0]
filter_apply = ('noFilter', 'withFilter')[0]
used_dataset = ('img_Flickr', 'img_dalle')[0]
print(f'{filter_apply} {used_dataset} {target_model}')

In [None]:
# Load the selected dataset; get_data returns the dataset together with the
# number of samples that the scoring loop iterates over.
data_bundle = get_data(used_dataset)
dataset, dataset_length = data_bundle
# Show the dataset size as the cell output.
len(dataset)

In [None]:
# Load the confuser results for the chosen filter/dataset combination and
# print a quick summary: number of records, the keys of the first record, and
# how many SAM entries the first record holds.
confuser_add = f'ObjColor_exp/confuser_res/{filter_apply}/{used_dataset}_by_gpt-4o-mini_mode/res.json'
confuser_res = read_json(confuser_add)
print(len(confuser_res))
first_record = confuser_res[0]
print(first_record.keys())
print(len(first_record['sam_result']))

In [None]:
# Map each supported target-model name to its single-sample inference function.
# NOTE(review): the three *_oneSample_inference functions are not imported in
# the visible cells — confirm where they come from before a fresh-kernel run.
INFERENCE_FUNCS_sample = dict(
    llava_v1_5_7b=llava_oneSample_inference,
    MiniGPT4=minigpt4_oneSample_inference,
    llama_adapter_v2=llama_adapter_oneSample_inference,
)

# Prompt sent to the target model to obtain an image description.
prompt = 'Describe this image in detail.'

In [None]:
# For every image, score each detected object (one image patch per object)
# against the target model's description with CLIP, and store the score in
# that object's sam_result entry.
# NOTE(review): inference_func, model_dict and args are not defined in the
# visible cells — presumably inference_func = INFERENCE_FUNCS_sample[target_model];
# confirm they are set up before this cell runs on a fresh kernel.
for sample_id in tqdm(range(dataset_length)):

    obj_info = confuser_res[sample_id]['sam_result']
    # Nothing to score for this sample (None or no detected objects): skip it
    # before running the target-model inference, whose output would go unused.
    if not obj_info:
        continue

    image = dataset[sample_id]['image']
    # Get the image description from the target model.
    desc_text = inference_func(model_dict, prompt, image, args)[0]

    for obj_sam_res in obj_info:
        # Cut out the image patch covered by this object's bounding box.
        # NOTE(review): PIL's Image.crop expects (left, upper, right, lower);
        # if 'bbox' is stored as (x, y, w, h) it must be converted — confirm.
        img_patch = image.crop(obj_sam_res['bbox'])
        obj_sam_res['clip_score'] = clip_similarity(
            clip_model, clip_processor, img_patch, desc_text
        )