In [1]:
import pandas as pd
import random
import os
import json
import torch
import clip
import requests
from PIL import Image
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
import argparse

device = "cuda" if torch.cuda.is_available() else "cpu"

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import re
string = 'Generate a caption for this image in 50 words[/INST] "A romantic dinner for two, with a beautiful blonde woman engrossed in her tablet, while her companion looks on, surrounded by a table set for a delightful meal."'
# string.split("[/INST]")[-1].strip().split('"')[1]
re.findall(r'"(.*?)"', string)[0]

['A romantic dinner for two, with a beautiful blonde woman engrossed in her tablet, while her companion looks on, surrounded by a table set for a delightful meal.']

In [2]:
def main(input_dir, survey_dir, num_sample_images, user_name):

    if not os.path.exists(survey_dir):
        os.makedirs(survey_dir)

    # if os.path.exists(os.path.join(survey_dir, f'{user_name}_survey.csv')):
    #     print(f"Survey file for {user_name} already exists. Please delete the file and try again.")
    #     return None

    # Get the images from the input file
    input_df = pd.read_csv(input_dir)

    # Use images that do not have any captions
    input_df = input_df[input_df["caption_A"].isnull() & input_df["caption_B"].isnull()]

    # Sample the images
    input_df = input_df.sample(num_sample_images).reset_index(drop=True)
    results_df = input_df.copy()

    # Get the models and processors
    llava_model, llava_processor = get_model_processor("llava")
    # clip_model, clip_processor = get_model_processor("clip")

    for image_url in input_df.image_url.values:

        # Load the image
        image = Image.open(requests.get(image_url, stream=True).raw)

        # Generate Captions:
        # WCOT: without chain-of-thought/without any conditioned prompt
        # COT: with chain-of-thought/with a pre-determined conditioned prompt
        wcot_caption, cot_caption = generate_caption_from_image(
            llava_model, llava_processor, image
        )
        print(f"WCOT Caption: {wcot_caption}")
        print(f"COT Caption: {cot_caption}")

        # Process the generated captions
        process_wcot_caption = process_caption(wcot_caption)
        process_cot_caption = process_caption(cot_caption)
        print(f"Processed WCOT Caption: {process_wcot_caption}")
        print(f"Processed COT Caption: {process_cot_caption}")

        valid_outcomes = [process_wcot_caption, cot_caption]

        # Evaluate which caption is better according to LLM
        # clip_pred = get_predicted_labels_clip(
        #     clip_model, clip_processor, image, valid_outcomes
        # )

        # Evaluate which caption is better according to LLM
        llm_pred = get_predicted_labels_llm(
            llava_model, llava_processor, image, valid_outcomes, max_new_tokens=50
        )

        # Evaluate which caption is better according to a human
        # human_pred = get_human_pref_caption(image, valid_outcomes)

        results_df.loc[-1] = [
            image_url,
            process_wcot_caption,
            process_cot_caption,
            " ",
            llm_pred,
            " ",
        ]
        results_df.index = results_df.index + 1
        results_df = results_df.sort_index()

    # Save the results
    results_df.to_csv(
        os.path.join(user_survey_dir, f"{user_name}_survey.csv")
    )


def get_images(input_file, output_file, num_images=10):
    objects = pd.read_csv(input_file)["image_url"]

    random_numbers = random.sample(range(0, len(objects)), num_images)
    images = [objects[i] for i in random_numbers]

    output_file = output_file.append(pd.DataFram([images], columns=["image_url"]))

    return output_file


def get_model_processor(model_name):
    if model_name == "llava":
        model_pref = "llava-hf/llava-v1.6-mistral-7b-hf"
        processor = LlavaNextProcessor.from_pretrained(model_pref)

        model = LlavaNextForConditionalGeneration.from_pretrained(
            model_pref, torch_dtype=torch.float16, low_cpu_mem_usage=True
        )
        model.to(device)
    elif model_name == "clip":
        model, processor = clip.load("ViT-B/32", device=device)

    print(f"Model: {model_name} loaded successfully!")

    return model, processor


def generate_caption_from_image(model, processor, image, max_new_tokens=50):

    # Generate prompt
    wcot_prompt = "[INST] <image>\nGenerate a caption for this image in 50 words[/INST]"
    cot_prompt = "[INST] <image>\nGenerate a caption for this image, and the description should include the number of objects in the image without explicitly mentioning it in 50 words[/INST]"

    # Process prompt
    wcot_inputs = processor(wcot_prompt, image, return_tensors="pt").to(device)
    cot_inputs = processor(cot_prompt, image, return_tensors="pt").to(device)

    # autoregressively complete prompt
    wcot_output = model.generate(**wcot_inputs, max_new_tokens=max_new_tokens)
    cot_output = model.generate(**cot_inputs, max_new_tokens=max_new_tokens)

    # Decode output
    wcot_final_output = processor.decode(wcot_output[0], skip_special_tokens=True)
    cot_final_output = processor.decode(cot_output[0], skip_special_tokens=True)
    
    print(wcot_final_output)

    # Process the output
    wcot_final_output = (
        wcot_final_output.split("[/INST]")[-1].strip().split('"')[1]
    )  # remove prompt
    cot_final_output = (
        cot_final_output.split("[/INST]")[-1].strip().split('"')[1]
    )  # remove prompt

    return wcot_final_output, cot_final_output


def process_caption(caption):
    # first_split = caption.split("[/INST]")[-1].strip() # remove prompt
    second_split = caption.split(". ")  # split into sentences

    # remove last sentence if it doesn't end with a period
    if not second_split[-1].endswith("."):
        second_split.pop()

    final_string = ". ".join(second_split)  # join sentences
    return final_string


def get_predicted_labels_clip(model, preprocess, image, captions):

    img = preprocess(image).unsqueeze(0).to(device)
    text = clip.tokenize(captions).to(device)

    with torch.no_grad():
        logits_per_image, _ = model(img, text)
        probs = logits_per_image.softmax(dim=-1).cpu().numpy()

    final_pred = probs[0].argmax()
    print("Label probs:", final_pred)

    # if final_pred == 0:
    #     print("WCOT", wcot_caption)
    # else:
    #     print("COT", cot_caption)

    if final_pred == 0:
        return "A"
    else:
        return "B"


def get_predicted_labels_llm(model, processor, image, captions, max_new_tokens=50):

    # Generate prompt
    prompt = f"[INST] <image>\nWhich caption is a better choice for the given image: (A) {captions[0]} or (B) {captions[1]}? Give only the option letter to me.[/INST]"

    # Process prompt
    inputs = processor(prompt, image, return_tensors="pt").to(device)

    # autoregressively complete prompt
    output = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # Decode output
    final_output = processor.decode(output[0], skip_special_tokens=True)

    final_output = final_output.split("[/INST]")[-1].strip()  # remove prompt

    if final_output == "A":
        # print(f"Selected caption: WCOT: {captions[0]}")
        return 0
    else:
        # print(f"Selected caption: COT: {captions[1]}")
        return 1


def get_human_pref_caption(image, captions):
    image.show()
    print("There are 2 captions. Which one do you prefer from the ones?")
    
    # Shuffle the captions to avoid bias
    random.shuffle(captions)
    
    print(f"Caption (A): {captions[0]}")
    print(f"Caption (B): {captions[1]}")

    possible_inputs = ["A", "B"]
    for attempts in range(3):
        human_input = input(f"Attempt: {attempts+1}. Enter either option 'A' or 'B'")
        if human_input in possible_inputs:
            break
        else:
            print("Invalid input. Please try again!")

    return human_input


In [3]:
input_dir = 'images/visual_genome_available_images.csv'
survey_dir = 'surveys'
num_sample_images = 2
user_name = 'test_user'

main(input_dir, survey_dir, num_sample_images, user_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████████████| 4/4 [00:06<00:00,  1.55s/it]


Model: llava loaded successfully!


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


IndexError: list index out of range

: 

In [None]:
! python image_caption_prompt_analysis.py --input_dir images/visual_genome_available_images.csv --survey_dir surveys --num_sample_images 2 --user_name test

In [3]:
# if __name__ == "__main__":
parser = argparse.ArgumentParser(
    description="Generate captions for the given image and evaluate the captions using LLM and CLIP models."
)
# parser.add_argument("--image_url", type=str, default='https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg', help="URL of the image")
parser.add_argument(
    "--input_dir",
    type=str,
    required=False,
    help="File containing the URLs of the images to be evaluated",
)
parser.add_argument(
    "--survey_dir",
    type=str,
    required=True,
    help="CSV file containing all the survey results conducted",
)
parser.add_argument(
    "--num_sample_images",
    type=int,
    default=10,
    help="Number of images to be sampled for the survey",
)
parser.add_argument(
    "--user_name", type=str, required=True, help="Name of the survey taker."
)
args = parser.parse_args()

main(args)

usage: ipykernel_launcher.py [-h] [--input_dir INPUT_DIR] --survey_dir
                             SURVEY_DIR
                             [--num_sample_images NUM_SAMPLE_IMAGES]
                             --user_name USER_NAME
ipykernel_launcher.py: error: the following arguments are required: --survey_dir, --user_name


SystemExit: 2

In [1]:
import pandas as pd

In [2]:
file = pd.read_csv('results/generated_captions.csv')

In [3]:
print(file.shape)
file.head()

(500, 8)


Unnamed: 0.1,Unnamed: 0,image_url,caption_A,caption_B,human_output,llm_output,clip_small_output,clip_large_output
0,0,https://cs.stanford.edu/people/rak248/VG_100K_...,"In the tranquil expanse of a grassy field, a g...","In the tranquil expanse of a grassy field, a g...",,0.0,0.0,1.0
1,1,https://cs.stanford.edu/people/rak248/VG_100K_...,A cozy kitchen scene with a white refrigerator...,The image captures a cozy kitchen scene. Domin...,,0.0,1.0,1.0
2,2,https://cs.stanford.edu/people/rak248/VG_100K_...,"The image captures a well-organized workspace,...",The image captures a well-organized workspace....,,0.0,1.0,1.0
3,3,https://cs.stanford.edu/people/rak248/VG_100K_...,"The image captures a serene urban scene, where...",The image captures a serene urban scene. A bla...,,0.0,1.0,1.0
4,4,https://cs.stanford.edu/people/rak248/VG_100K_...,Elegant Living Room: A Symphony of Comfort and...,The image captures a warm and inviting living ...,,1.0,1.0,0.0


In [4]:
# convert the last 3 columns to integers
file['clip_small_output'] = file['clip_small_output'].apply(lambda x: int(x))
file['clip_large_output'] = file['clip_large_output'].apply(lambda x: int(x))
file['llm_output'] = file['llm_output'].apply(lambda x: int(x))

In [5]:
file.drop(columns=['Unnamed: 0'], inplace=True)
file.head()

Unnamed: 0,image_url,caption_A,caption_B,human_output,llm_output,clip_small_output,clip_large_output
0,https://cs.stanford.edu/people/rak248/VG_100K_...,"In the tranquil expanse of a grassy field, a g...","In the tranquil expanse of a grassy field, a g...",,0,0,1
1,https://cs.stanford.edu/people/rak248/VG_100K_...,A cozy kitchen scene with a white refrigerator...,The image captures a cozy kitchen scene. Domin...,,0,1,1
2,https://cs.stanford.edu/people/rak248/VG_100K_...,"The image captures a well-organized workspace,...",The image captures a well-organized workspace....,,0,1,1
3,https://cs.stanford.edu/people/rak248/VG_100K_...,"The image captures a serene urban scene, where...",The image captures a serene urban scene. A bla...,,0,1,1
4,https://cs.stanford.edu/people/rak248/VG_100K_...,Elegant Living Room: A Symphony of Comfort and...,The image captures a warm and inviting living ...,,1,1,0


In [6]:
# treat the llm_output as the ground truth and calculate the accuracy of the clip_small_output and clip_large_output
file['clip_small_correct'] = file['clip_small_output'] == file['llm_output']
file['clip_large_correct'] = file['clip_large_output'] == file['llm_output']

clip_small_accuracy = file['clip_small_correct'].sum() / file.shape[0]
clip_large_accuracy = file['clip_large_correct'].sum() / file.shape[0]

print(f"CLIP Small Accuracy: {clip_small_accuracy}")
print(f"CLIP Large Accuracy: {clip_large_accuracy}")

CLIP Small Accuracy: 0.574
CLIP Large Accuracy: 0.518
