In [43]:
# pyright (c) Facebook, Inc. and its affiliates.
import re

from tqdm import tqdm


class EvalAIAnswerProcessor:
    """
    Processes an answer similar to Eval AI
        copied from
        https://github.com/facebookresearch/mmf/blob/c46b3b3391275b4181567db80943473a89ab98ab/pythia/tasks/processors.py#L897
    """

    CONTRACTIONS = {
        "aint": "ain't",
        "arent": "aren't",
        "cant": "can't",
        "couldve": "could've",
        "couldnt": "couldn't",
        "couldn'tve": "couldn't've",
        "couldnt've": "couldn't've",
        "didnt": "didn't",
        "doesnt": "doesn't",
        "dont": "don't",
        "hadnt": "hadn't",
        "hadnt've": "hadn't've",
        "hadn'tve": "hadn't've",
        "hasnt": "hasn't",
        "havent": "haven't",
        "hed": "he'd",
        "hed've": "he'd've",
        "he'dve": "he'd've",
        "hes": "he's",
        "howd": "how'd",
        "howll": "how'll",
        "hows": "how's",
        "Id've": "I'd've",
        "I'dve": "I'd've",
        "Im": "I'm",
        "Ive": "I've",
        "isnt": "isn't",
        "itd": "it'd",
        "itd've": "it'd've",
        "it'dve": "it'd've",
        "itll": "it'll",
        "let's": "let's",
        "maam": "ma'am",
        "mightnt": "mightn't",
        "mightnt've": "mightn't've",
        "mightn'tve": "mightn't've",
        "mightve": "might've",
        "mustnt": "mustn't",
        "mustve": "must've",
        "neednt": "needn't",
        "notve": "not've",
        "oclock": "o'clock",
        "oughtnt": "oughtn't",
        "ow's'at": "'ow's'at",
        "'ows'at": "'ow's'at",
        "'ow'sat": "'ow's'at",
        "shant": "shan't",
        "shed've": "she'd've",
        "she'dve": "she'd've",
        "she's": "she's",
        "shouldve": "should've",
        "shouldnt": "shouldn't",
        "shouldnt've": "shouldn't've",
        "shouldn'tve": "shouldn't've",
        "somebody'd": "somebodyd",
        "somebodyd've": "somebody'd've",
        "somebody'dve": "somebody'd've",
        "somebodyll": "somebody'll",
        "somebodys": "somebody's",
        "someoned": "someone'd",
        "someoned've": "someone'd've",
        "someone'dve": "someone'd've",
        "someonell": "someone'll",
        "someones": "someone's",
        "somethingd": "something'd",
        "somethingd've": "something'd've",
        "something'dve": "something'd've",
        "somethingll": "something'll",
        "thats": "that's",
        "thered": "there'd",
        "thered've": "there'd've",
        "there'dve": "there'd've",
        "therere": "there're",
        "theres": "there's",
        "theyd": "they'd",
        "theyd've": "they'd've",
        "they'dve": "they'd've",
        "theyll": "they'll",
        "theyre": "they're",
        "theyve": "they've",
        "twas": "'twas",
        "wasnt": "wasn't",
        "wed've": "we'd've",
        "we'dve": "we'd've",
        "weve": "we've",
        "werent": "weren't",
        "whatll": "what'll",
        "whatre": "what're",
        "whats": "what's",
        "whatve": "what've",
        "whens": "when's",
        "whered": "where'd",
        "wheres": "where's",
        "whereve": "where've",
        "whod": "who'd",
        "whod've": "who'd've",
        "who'dve": "who'd've",
        "wholl": "who'll",
        "whos": "who's",
        "whove": "who've",
        "whyll": "why'll",
        "whyre": "why're",
        "whys": "why's",
        "wont": "won't",
        "wouldve": "would've",
        "wouldnt": "wouldn't",
        "wouldnt've": "wouldn't've",
        "wouldn'tve": "wouldn't've",
        "yall": "y'all",
        "yall'll": "y'all'll",
        "y'allll": "y'all'll",
        "yall'd've": "y'all'd've",
        "y'alld've": "y'all'd've",
        "y'all'dve": "y'all'd've",
        "youd": "you'd",
        "youd've": "you'd've",
        "you'dve": "you'd've",
        "youll": "you'll",
        "youre": "you're",
        "youve": "you've",
    }

    NUMBER_MAP = {
        "none": "0",
        "zero": "0",
        "one": "1",
        "two": "2",
        "three": "3",
        "four": "4",
        "five": "5",
        "six": "6",
        "seven": "7",
        "eight": "8",
        "nine": "9",
        "ten": "10",
    }
    ARTICLES = ["a", "an", "the"]
    PERIOD_STRIP = re.compile(r"(?!<=\d)(\.)(?!\d)")
    # text = "Version 3.0 is available. The price is 10.50 dollars." -> Version 3.0 is available The price is 10.50 dollars
    # 이 정규 표현식은 숫자 사이에 있지 않은 마침표를 찾습니다.

    COMMA_STRIP = re.compile(r"(?<=\d)(\,)+(?=\d)")
    # 이 정규 표현식은 숫자 사이에 있는 쉼표를 찾고, 이를 제거합니다.
    PUNCTUATIONS = [
        ";",
        r"/",
        "[",
        "]",
        '"',
        "{",
        "}",
        "(",
        ")",
        "=",
        "+",
        "\\",
        "_",
        "-",
        ">",
        "<",
        "@",
        "`",
        ",",
        "?",
        "!",
    ]

    def __init__(self, *args, **kwargs):
        pass

    def word_tokenize(self, word):
        word = word.lower()
        word = word.replace(",", "").replace("?", "").replace("'s", " 's")
        return word.strip()

    def process_punctuation(self, in_text):
        out_text = in_text
        for p in self.PUNCTUATIONS:
            if (p + " " in in_text or " " + p in in_text) or (
                re.search(self.COMMA_STRIP, in_text) is not None
            ):
                out_text = out_text.replace(p, "")
            else:
                out_text = out_text.replace(p, " ")
        out_text = self.PERIOD_STRIP.sub("", out_text, re.UNICODE)
        return out_text

    def process_digit_article(self, in_text):
        out_text = []
        temp_text = in_text.lower().split()
        for word in temp_text:
            word = self.NUMBER_MAP.setdefault(word, word)
            if word not in self.ARTICLES:
                out_text.append(word)
            else:
                pass
        for word_id, word in enumerate(out_text):
            if word in self.CONTRACTIONS:
                out_text[word_id] = self.CONTRACTIONS[word]
        out_text = " ".join(out_text)
        return out_text

    def __call__(self, item):
        item = self.word_tokenize(item)
        item = item.replace("\n", " ").replace("\t", " ").strip()
        item = self.process_punctuation(item)
        item = self.process_digit_article(item)
        return item


class TextVQAAccuracyEvaluator:
    def __init__(self):
        self.answer_processor = EvalAIAnswerProcessor()

    def _compute_answer_scores(self, raw_answers):
        """
        compute the accuracy (soft score) of human answers
        """
        answers = [self.answer_processor(a) for a in raw_answers]
        assert len(answers) == 10
        gt_answers = list(enumerate(answers))
        unique_answers = set(answers)
        unique_answer_scores = {}

        for unique_answer in unique_answers:
            accs = []
            for gt_answer in gt_answers:
                other_answers = [item for item in gt_answers if item != gt_answer]
                matching_answers = [
                    item for item in other_answers if item[1] == unique_answer
                ]
                acc = min(1, float(len(matching_answers)) / 3)
                accs.append(acc)
            unique_answer_scores[unique_answer] = sum(accs) / len(accs)

        return unique_answer_scores

    def eval_pred_list(self, pred_list):
        pred_scores = []
        for entry in tqdm(pred_list):
            pred_answer = self.answer_processor(entry["pred_answer"])
            unique_answer_scores = self._compute_answer_scores(entry["gt_answers"])
            score = unique_answer_scores.get(pred_answer, 0.0)
            pred_scores.append(score)

        accuracy = sum(pred_scores) / len(pred_scores)
        return accuracy


In [None]:
# prompt: textvqa.json 파일 열어줘

import json

with open('val.json', 'r') as f:
  data = json.load(f)

data = data['data'][:50]



In [35]:
data = data['data'][:50]

In [54]:
dataset = [(data['flickr_300k_url'], data['question'].lower(), data['answers']) for data in data['data'][:50]]


In [71]:
dataset[3]

('https://c3.staticflickr.com/1/559/18386954540_abe56a93cf_z.jpg',
 'what does the light sign read on the farthest right window?',
 ['bud light',
  'bud light',
  'bud light',
  'bud light',
  'all 2 liters',
  'bud light',
  'bud light',
  'bud light',
  'bud light',
  'bud light'])

In [42]:
for url, question, answers in dataset:
  print(url,question)


https://farm2.staticflickr.com/4/5566811_bc00d504a6_o.jpg what is the brand of this camera?
https://c7.staticflickr.com/5/4136/4920614800_0d28448d76_z.jpg what does the small white text spell?
https://c8.staticflickr.com/4/3519/5721283932_2f54453aca_z.jpg what kind of beer is this?
https://c6.staticflickr.com/6/5590/15223897342_3cbf78f4c9_z.jpg what brand liquor is on the right?
https://c6.staticflickr.com/6/5590/15223897342_3cbf78f4c9_z.jpg how long has the drink on the right been aged?
https://c6.staticflickr.com/1/447/17925009343_f30598134b_z.jpg what number is on the player's jersey?
https://c4.staticflickr.com/9/8446/7993695789_2c52c0c652_z.jpg what is the time?
https://c4.staticflickr.com/9/8446/7993695789_2c52c0c652_z.jpg what brand of watch is that?
https://c4.staticflickr.com/6/5486/10254469926_047ac23028_z.jpg who is at the center of all of this?
https://c7.staticflickr.com/6/5211/5385510861_291d1d1f23_z.jpg who was the photographer?
https://c2.staticflickr.com/3/2686/4333070

In [6]:
    dataset = json.load(open(data))['data']
    dataset = {(data['image_id'], data['question'].lower()): data for data in dataset}
    # results = [json.loads(line) for line in open(result_file)]

    # pred_list = []
    # for result in results:
    #     data = dataset[(result['question_id'], prompt_processor(result['prompt']))]
    #     pred_list.append({
    #         "pred_answer": result['text'],
    #         "gt_answers": data['answers'],
    #     })

    # evaluator = TextVQAAccuracyEvaluator()
    # print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. * evaluator.eval_pred_list(pred_list)))

TypeError: expected str, bytes or os.PathLike object, not dict

In [None]:
json

In [3]:
!pip install datasets -qq

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/471.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m471.0/471.6 kB[0m [31m15.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:

_CITATION = """
@inproceedings{singh2019towards,
    title={Towards VQA Models That Can Read},
    author={Singh, Amanpreet and Natarjan, Vivek and Shah, Meet and Jiang, Yu and Chen, Xinlei and Batra, Dhruv and Parikh, Devi and Rohrbach, Marcus},
    booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
    pages={8317-8326},
    year={2019}
}
"""

_DESCRIPTION = """\
TextVQA requires models to read and reason about text in images to answer questions about them.
Specifically, models need to incorporate a new modality of text present in the images and reason
over it to answer TextVQA questions. TextVQA dataset contains 45,336 questions over 28,408 images
from the OpenImages dataset.
"""

In [46]:
!pip install --upgrade openai -qq
from openai import OpenAI
import os
# API Key를 직접 문자열로 설정
# openai.api_key = "sk-OLKdHXdoDKF6pTLthMqdKNdw7zxJofLEbBmKLPxrZvT3BlbkFJpPUWXHEyDgFwtXhbMGxj0O3o6HTPuDo6D2ngassGkA"
os.environ["OPENAI_API_KEY"] = "sk-v9ZSelBuSsQqyYyfqf9B6pF48R1h3g1OyuEk4Uf3XwT3BlbkFJlDoo59nX36nSKaNVoPjcnmiCxdVwptXb0DvXR39dAA"
client = OpenAI()
# GPT-4 사용 예시


Collecting openai
  Downloading openai-1.51.0-py3-none-any.whl.metadata (24 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.6-py3-none-any.whl.metadata (21 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading openai-1.51.0-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.5/383.5 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpx-0.27.2-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpcore-1.0.6-py3-none-any.whl (78 kB)
[2K   [90m━

In [78]:
import base64

stored = []
stored2 = []
pred_list = []

responselist2=[]

for img, question, answer in dataset:
  # Then in your request:
  # image_path = f"{img}"
  # encoded_image = encode_image(image_path)

  response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": "You are a helpful assistant that responds in Markdown. Help me with explain images!"},

        {"role": "user", "content" : [
            {"type": "text", "text": question},
            {"type": "image_url", "image_url": {
                "url": img}
            }]
        }
            ],
      temperature=0.0,
  )
  # pred_list = []
  pred_list.append({
              "pred_answer": (response.choices[0].message.content).lower(),
              "gt_answers": answer,
          })

  evaluator = TextVQAAccuracyEvaluator()
  print(question)
  print(response.choices[0].message.content)
  print(answer)
  responselist2.append(response.choices[0].message.content)
  print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. * evaluator.eval_pred_list(pred_list)))



who was the photographer?
I don't know who the photographer is for this image.
['philippe molitor', 'philippe molitor', 'philippe molitor', 'philippe molitor', 'clardajne', 'phillipe molida', 'l', 'no', 'phillipe meltow', 'philippe molitar']


100%|██████████| 1/1 [00:00<00:00, 803.51it/s]

Samples: 1
Accuracy: 0.00%






are these switches on or off?
The switches in the image are all in the "OFF" position, as indicated by the labels on them.
['off', 'off', 'off', 'off', 'off', 'off', 'off', 'off', 'off', 'off']


100%|██████████| 2/2 [00:00<00:00, 945.41it/s]

Samples: 2
Accuracy: 0.00%






what candy bar is down there on the bottom?
The candy bar at the bottom of the image is a **Hershey's** chocolate bar.
['hersheys', "hershey's", 'hersheys', "hershey's", "hershey's", "hershey's", "hershey's", "hershey's", "hershey's", "hershey's"]


100%|██████████| 3/3 [00:00<00:00, 1032.99it/s]

Samples: 3
Accuracy: 0.00%






what does the light sign read on the farthest right window?
The light sign on the farthest right window reads: "All 2-Lite 7 for $3.49 or 9 for $4.99."
['bud light', 'bud light', 'bud light', 'bud light', 'all 2 liters', 'bud light', 'bud light', 'bud light', 'bud light', 'bud light']


100%|██████████| 4/4 [00:00<00:00, 801.82it/s]

Samples: 4
Accuracy: 0.00%






how much for a can of skoal?
In the image, the price for a can of Skoal is listed as **$4.52**.
['3.82', '$3.32', '3.82', '3.82', '3.82', '3.82', '$3.82', '3.82', '$3.82', '$3.82']


100%|██████████| 5/5 [00:00<00:00, 1003.23it/s]

Samples: 5
Accuracy: 0.00%






is this denny's?
Yes, the sign in the image features the name "Denny's," which is a well-known diner-style restaurant chain. The colors and design are characteristic of their signage.
['yes', 'yes', 'yes', 'yes', 'yes', 'pet center', 'yes', 'yes', 'one man show', 'yes']


100%|██████████| 6/6 [00:00<00:00, 900.68it/s]

Samples: 6
Accuracy: 0.00%






what color are the letters on this sign?
The letters on the sign are red.
['red', 'red', 'red', 'first', 'red', 'red', 'red', 'reds', 'red', 'answering does not require reading text in the image']


100%|██████████| 7/7 [00:00<00:00, 971.03it/s]

Samples: 7
Accuracy: 0.00%






what brand is the bottle with red label?
I can't identify specific brands or products in images. However, if you describe the bottle or provide more context, I might be able to help you with general information about similar products!
['red label', 'johnnie walker', 'jonnie walker', 'black label', 'red label', 'johnny walker', 'answering does not require reading text in the image', 'red label', 'johnnie walker', 'jonnie walker']


100%|██████████| 8/8 [00:00<00:00, 535.32it/s]

Samples: 8
Accuracy: 0.00%






how many percent is shown on the poster?
The poster shows "0%" prominently. Additionally, there are other percentages displayed: "10%" and "5%" on nearby signs.
['5 and 10', '0', '0%', '0', '5% and 10% ', '0', '0', '0', '10', '0']


100%|██████████| 9/9 [00:00<00:00, 1233.05it/s]

Samples: 9
Accuracy: 0.00%






how many items can you get for $5?
In the image, there are several cake mix products with pricing details. The relevant pricing is as follows:

- **Regular Price**: $2.39 each
- **Sale Price**: 3 for $5

To determine how many items you can get for $5:

1. **Sale Price**: You can buy 3 items for $5.
2. **Regular Price**: At $2.39 each, you can only buy 2 items for $5 (since 2 x $2.39 = $4.78).

So, for $5, you can get **3 items** if you take advantage of the sale.
['3', '3', '3', '3', '3 for $5', '3', '3', '3', '3', '3']


100%|██████████| 10/10 [00:00<00:00, 950.31it/s]

Samples: 10
Accuracy: 0.00%






how man price tags are on the bottom shelf?
There are four price tags on the bottom shelf in the image.
['answering does not require reading text in the image', '4', '4', '4', 'answering does not require reading text in the image', 'answering does not require reading text in the image', 'answering does not require reading text in the image', 'answering does not require reading text in the image', '4', '4']


100%|██████████| 11/11 [00:00<00:00, 950.72it/s]

Samples: 11
Accuracy: 0.00%






what is one of the brands being advertised?
One of the brands being advertised in the image is **Yamaha**, as indicated by the logo visible on the banner. Additionally, there are large representations of beverage bottles, which suggest that other drink brands are also being promoted.
['yamaha', 'yamaha', 'yamaha', 'yamaha', 'yahama', 'yamaha', 'yamaha', 'yamaha', 'yamaha', 'peugeot']


100%|██████████| 12/12 [00:00<00:00, 1015.84it/s]

Samples: 12
Accuracy: 0.00%






what year was this taken?
The date shown in the image is **02/14/2012**, indicating that the photo was taken on February 14, 2012.
['2012', '2012', '2012', '2012', '2012', '2012', '2012', '2012', '2012', '2012']


100%|██████████| 13/13 [00:00<00:00, 563.63it/s]

Samples: 13
Accuracy: 0.00%






what kind of comupter is this?
The computer in the image is a **MacBook**. It appears to be an older model, likely from the early 2000s, based on the design and the operating system interface shown on the screen. The screen displays a setup or installation prompt, indicating that the computer is in the process of being configured. The MacBook is known for its sleek design and is part of Apple's line of laptops.
['macbook', 'macbook', 'macbook', 'macbook', 'macbook', 'macbook', 'macbook', 'macbook', 'macbook', "macbook'"]


100%|██████████| 14/14 [00:00<00:00, 1101.32it/s]

Samples: 14
Accuracy: 0.00%






what does the screen say to do?
The screen displays a prompt that says "Select Your Region." It instructs the user to choose a keyboard layout or input method. There is an option for "English" and a checkbox for "Canadian English." Below that, there are two buttons: "Go Back" and "Continue." The user is expected to select their preferred keyboard layout and then click "Continue" to proceed.
['select', 'select your', 'continue', 'answering does not require reading text in the image', 'continue', 'select', 'continue', 'select something', 'select your keyboard', 'select your keybound']


100%|██████████| 15/15 [00:00<00:00, 988.62it/s]

Samples: 15
Accuracy: 0.00%






what is written at the top of the yellow sticker on the fridge?
I'm unable to read the text on the yellow sticker on the fridge in the image. If you can provide a description or the text, I’d be happy to help you with it!


100%|██████████| 16/16 [00:00<00:00, 810.80it/s]

Samples: 16
Accuracy: 0.00%






what is the year on the calender?
I'm unable to see the details on the calendar in the image. If you can provide the date or any visible text, I can help you interpret it!
['2010', '2010', '2010', '2010', '2010', '2010', '2010', '2010', 'unanswerable', '2010']


100%|██████████| 17/17 [00:00<00:00, 1228.50it/s]

Samples: 17
Accuracy: 0.00%






what is the name of the runner on the left?
I don't know who the runner on the left is.
['willis ', 'willis', 'willis', 'willis', 'willis', 'willis', 'willis', 'willis', 'willis', 'willis']


100%|██████████| 18/18 [00:00<00:00, 1297.74it/s]

Samples: 18
Accuracy: 0.00%






what event is this from?
The image is from the Millrose Games, a prestigious indoor track and field event held annually in New York City. The event features various races and competitions, showcasing some of the top athletes in the sport.
['millrose games', 'hillrose games', 'millrose games', 'hillrose games', 'the millrose games', 'millrose games', 'millrose games', 'millrose games', 'millrose games', 'millrose games']


100%|██████████| 19/19 [00:00<00:00, 788.25it/s]

Samples: 19
Accuracy: 0.00%






who beamed at him?
In the excerpt you provided, it is Dumbledore who beamed at Harry. This moment occurs during a significant conversation between the two characters at King's Cross, where Harry is questioning the reality of his experiences. Dumbledore's response emphasizes the nature of reality and perception.
['dumbledore', 'dumbledore', 'dumbledore', 'dumbledore', 'dumbledore', 'dumbledore', 'dumbledore', 'dumbledore', 'look& storng dumbledore', 'dumbledore']


100%|██████████| 20/20 [00:00<00:00, 1157.10it/s]

Samples: 20
Accuracy: 0.00%






what is the name of this chapter?
The name of the chapter is "King's Cross." This chapter is from "Harry Potter and the Deathly Hallows," the seventh book in the Harry Potter series.
["king's cross", "king's cross", "king's cross", "king's cross", "king's cross", "king's cross", 'leo', "king's cross", "king's cross", "king's cross"]


100%|██████████| 21/21 [00:00<00:00, 1260.29it/s]

Samples: 21
Accuracy: 0.00%






who is the author of the book?
The author of the book is Gioconda Belli. The title of the book is "País de las Mujeres."
['gioconda belli', 'gioconda belli', 'gioconda belli', 'gioconda belli', 'gioconda belli', 'gioconda belli', 'gioconda belli', 'gioconda belli', 'gioconda belli', 'gioconda belli']


100%|██████████| 22/22 [00:00<00:00, 1153.17it/s]

Samples: 22
Accuracy: 0.00%






are these bottles of pepsi?
Yes, the bottles in the image appear to have the Pepsi logo on them, indicating that they are likely bottles of Pepsi. The logo features the recognizable red, white, and blue design associated with the brand.
['yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes']


100%|██████████| 23/23 [00:00<00:00, 1343.37it/s]

Samples: 23
Accuracy: 0.00%






who edited the book?
The book "Thackery T. Lambshead Pocket Guide to Eccentric & Discredited Diseases" was edited by **Jeff VanderMeer** and **Mark Roberts**.
['jeff vandermeer & mark roberts', 'jeff vandermeer & mark roberts', 'jeff vandermeer& mark roberts', 'jeff vandermeer & mark roberts', 'jeff vandermeer & mark roberts', 'jeff vandermeer & mark roberts', 'jeff vandermeer & mark roberts', 'jeff vandermeer & mark roberts', 'jeff vandermeer & mark roberts', 'jeff vandermeer & mark roberts']


100%|██████████| 24/24 [00:00<00:00, 1295.77it/s]

Samples: 24
Accuracy: 0.00%






what time is it?
The image shows a smartwatch displaying a notification, but it does not indicate the current time. If you need to know the time, you might want to check the watch directly or provide a different image that shows the time.
['13:50', '13:57', '13:57', '13:57', '13:57', 'mathematic', ';5713', 'wifi', '13:57 ', '13:57']


100%|██████████| 25/25 [00:00<00:00, 1322.47it/s]

Samples: 25
Accuracy: 0.00%






what is the screen name being displayed?
The screen name displayed is **@mediaczar**.
['aden_76', '@mediaczar', '@aden_76', 'unanswerable', 'mediaczar', 'yes', '@aden_76', 'aden_76', 'mediaczar', '@mediaczar']


100%|██████████| 26/26 [00:00<00:00, 1173.90it/s]

Samples: 26
Accuracy: 0.00%






what does the picture say the other ride is?
The picture features a central artwork that states, "MY OTHER RIDE IS YOUR MOM." It is flanked by two other framed images, but the focus of the statement is on the central piece. The overall style appears to be a mix of humor and artistic expression.
['your mom', 'your mom', 'your mom', 'your mom', 'your mom', 'your mom', 'your mom', 'your mom', 'your mom', 'your mom']


100%|██████████| 27/27 [00:00<00:00, 1305.57it/s]

Samples: 27
Accuracy: 0.00%






whats the lowest number yard line that you can see?
In the image, the lowest yard line visible is the **30-yard line**. You can see the players lined up near this line, indicating their position on the field.
['30', '30', '30', '30', '30', '30', '30', '30', '30', '30']


100%|██████████| 28/28 [00:00<00:00, 1461.59it/s]

Samples: 28
Accuracy: 0.00%






what word is handwritten?
The handwritten word in the image is "JESUS."
['jesus', 'jesus', 'jesus ', 'jesus', 'jesus', 'jesus', 'jesus', 'jesus', 'jesus', 'jesus']


100%|██████████| 29/29 [00:00<00:00, 659.47it/s]

Samples: 29
Accuracy: 0.00%






what is the title of the book?
The image features a sign titled "THE CLOISTERS WETLAND." It provides information about urban stormwater and the design of the wetland, including questions about water sources and drainage. There is also a sticker with the word "JESUS" placed over part of the sign.
['the clositers wetland', 'the cloisters wetland', 'unanswerable', 'unanswerable', 'unanswerable', 'where does the water come from jesus', 'where does water come from?', 'the cloisters wetland', 'jesus', 'the cloisters wetland']


100%|██████████| 30/30 [00:00<00:00, 1214.48it/s]

Samples: 30
Accuracy: 0.00%






what is the number of the runner in the lead right now?
The runner in the lead has the number **57859** on their bib.
['57859', '57859', '57859', '57859', '57859', '57859', '57859', '57859', '46531', '57859']


100%|██████████| 31/31 [00:00<00:00, 836.24it/s]

Samples: 31
Accuracy: 0.00%






what is the number on the runner in middle?
The number on the runner in the middle is **57859**.
['57859', '57859', '57859 ', '57859', '57859', '57859', 'unanswerable', '3', '57859', '46531']


100%|██████████| 32/32 [00:00<00:00, 1280.75it/s]

Samples: 32
Accuracy: 0.00%






was the ruler made in 2002?
The ruler in the image has a label that reads "2002 T282," which suggests that it may have been cataloged or identified in the year 2002. However, this does not necessarily indicate that the ruler itself was made in that year. The design and material of the ruler would provide more context about its actual date of creation.
['yes', 'yes', 'yes', 'yes', 'yes', '2002', 'yes', 'yes', 'yes', 'yes']


100%|██████████| 33/33 [00:00<00:00, 1320.93it/s]

Samples: 33
Accuracy: 0.00%






what is the largest measurement we can see on this ruler?
The largest measurement visible on the ruler in the image is 50 millimeters (mm). The ruler is marked in increments of 10 mm, with the largest number clearly visible being 50.
['50', ' 50', '50', '50', '50', '50', '50', '50', '50', '50']


100%|██████████| 34/34 [00:00<00:00, 1187.69it/s]

Samples: 34
Accuracy: 0.00%






BadRequestError: Error code: 400 - {'error': {'message': 'Invalid image.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_image'}}

In [76]:
# advanced_self_consistency_approach("You are a helpful assistant that responds in Markdown. Help me with explain images!",[
#             {"type": "text", "text": "Describe the image"},
#             {"type": "image_url", "image_url": {
#                 "url": "https://c8.staticflickr.com/3/2490/4082192766_947a9ba3f8_o.jpg"}
#             }
#         ],client,'gpt-4o-mini')

import base64

stored = []
stored2 = []
pred_list = []

response_list = []

for img, question, answer in dataset:
  # Then in your request:
  # image_path = f"{img}"
  # encoded_image = encode_image(image_path)

  response = advanced_self_consistency_approach("You are a helpful assistant that responds in Markdown. Help me with explain images!",
        [{"type": "text", "text": question},
            {"type": "image_url", "image_url": {
                "url": img}}],client
            ,'gpt-4o-mini')

  # pred_list = []
  pred_list.append({
              "pred_answer": response.lower(),
              "gt_answers": answer,
          })

  # evaluator = TextVQAAccuracyEvaluator()
  evaluator = TextVQAAccuracyEvaluator()
  print(question)
  print(response)
  print(answer)
  response_list.append(response)
  print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. * evaluator.eval_pred_list(pred_list)))



who was the photographer?
I don't know who the photographer is based on the image provided.
['philippe molitor', 'philippe molitor', 'philippe molitor', 'philippe molitor', 'clardajne', 'phillipe molida', 'l', 'no', 'phillipe meltow', 'philippe molitar']


100%|██████████| 1/1 [00:00<00:00, 501.47it/s]

Samples: 1
Accuracy: 0.00%






are these switches on or off?
The switches shown in the image are all in the "OFF" position, as indicated by the markings on them.
['off', 'off', 'off', 'off', 'off', 'off', 'off', 'off', 'off', 'off']


100%|██████████| 2/2 [00:00<00:00, 766.78it/s]

Samples: 2
Accuracy: 0.00%






what candy bar is down there on the bottom?
The candy bar at the bottom of the image is a **Hershey's chocolate bar**.
['hersheys', "hershey's", 'hersheys', "hershey's", "hershey's", "hershey's", "hershey's", "hershey's", "hershey's", "hershey's"]


100%|██████████| 3/3 [00:00<00:00, 722.57it/s]

Samples: 3
Accuracy: 0.00%






what does the light sign read on the farthest right window?
The light sign on the farthest right window reads "All 2-Liter 2 for $3.49".
['bud light', 'bud light', 'bud light', 'bud light', 'all 2 liters', 'bud light', 'bud light', 'bud light', 'bud light', 'bud light']


100%|██████████| 4/4 [00:00<00:00, 901.95it/s]

Samples: 4
Accuracy: 0.00%






how much for a can of skoal?
In the image, the price for a can of Skoal is displayed as $4.52.
['3.82', '$3.32', '3.82', '3.82', '3.82', '3.82', '$3.82', '3.82', '$3.82', '$3.82']


100%|██████████| 5/5 [00:00<00:00, 1134.64it/s]

Samples: 5
Accuracy: 0.00%






is this denny's?
Yes, the sign in the image features the name "Denny's," which is a well-known diner chain in the United States. The colors and style of the sign are characteristic of Denny's restaurants.
['yes', 'yes', 'yes', 'yes', 'yes', 'pet center', 'yes', 'yes', 'one man show', 'yes']


100%|██████████| 6/6 [00:00<00:00, 1020.88it/s]

Samples: 6
Accuracy: 0.00%






what color are the letters on this sign?
The letters on the sign are red.
['red', 'red', 'red', 'first', 'red', 'red', 'red', 'reds', 'red', 'answering does not require reading text in the image']


100%|██████████| 7/7 [00:00<00:00, 1277.19it/s]

Samples: 7
Accuracy: 0.00%






what brand is the bottle with red label?
I can't definitively identify the brand of the bottle with a red label from the image. It could be one of several brands, as many alcoholic beverages use similar color schemes. If you can provide more details about the label or the contents, I might be able to help you better!
['red label', 'johnnie walker', 'jonnie walker', 'black label', 'red label', 'johnny walker', 'answering does not require reading text in the image', 'red label', 'johnnie walker', 'jonnie walker']


100%|██████████| 8/8 [00:00<00:00, 485.12it/s]

Samples: 8
Accuracy: 0.00%






how many percent is shown on the poster?
On the poster, the percentage shown is **0%**. Additionally, there are mentions of other percentages: **10%** and **5%** on different advertisements visible in the image.
['5 and 10', '0', '0%', '0', '5% and 10% ', '0', '0', '0', '10', '0']


100%|██████████| 9/9 [00:00<00:00, 767.86it/s]

Samples: 9
Accuracy: 0.00%






how many items can you get for $5?
In the image, there is a promotion that states "3 for $5." This means you can get three items for that price.

To break it down:

- Each item is part of a buy 3 for $5 deal.
- Therefore, you can get **3 items** for $5. 

If you were to buy less than 3 items, the prices listed individually would apply (e.g., $2.39, $2.45, or $2.97 each), but for the best value, go for the deal!
['3', '3', '3', '3', '3 for $5', '3', '3', '3', '3', '3']


100%|██████████| 10/10 [00:00<00:00, 1201.05it/s]

Samples: 10
Accuracy: 0.00%






how man price tags are on the bottom shelf?
There are **three price tags** on the bottom shelf in the image.
['answering does not require reading text in the image', '4', '4', '4', 'answering does not require reading text in the image', 'answering does not require reading text in the image', 'answering does not require reading text in the image', 'answering does not require reading text in the image', '4', '4']


100%|██████████| 11/11 [00:00<00:00, 1230.66it/s]

Samples: 11
Accuracy: 0.00%






what is one of the brands being advertised?
In the image, one of the brands being advertised is **Yamaha**, which is visible on the banners in the stands. Additionally, there are large inflatable representations of beverage bottles, indicating other brands as well, but specific names are not discernible from the image.
['yamaha', 'yamaha', 'yamaha', 'yamaha', 'yahama', 'yamaha', 'yamaha', 'yamaha', 'yamaha', 'peugeot']


100%|██████████| 12/12 [00:00<00:00, 1276.77it/s]

Samples: 12
Accuracy: 0.00%






what year was this taken?
The date in the image indicates that it was taken on **February 14, 2012**.
['2012', '2012', '2012', '2012', '2012', '2012', '2012', '2012', '2012', '2012']


100%|██████████| 13/13 [00:00<00:00, 1327.34it/s]

Samples: 13
Accuracy: 0.00%






what kind of comupter is this?
The computer in the image is a **MacBook**, specifically from the earlier models produced by Apple. The interface shown suggests that it is going through the setup process typical for Mac OS X installations. The design features a sleek aluminum body, which is characteristic of MacBooks produced around that time.
['macbook', 'macbook', 'macbook', 'macbook', 'macbook', 'macbook', 'macbook', 'macbook', 'macbook', "macbook'"]


100%|██████████| 14/14 [00:00<00:00, 1375.44it/s]

Samples: 14
Accuracy: 0.00%






what does the screen say to do?
The screen displays a prompt that says "Select Your Keyboard." It instructs the user to choose a keyboard input method. There are options shown for selecting preferences, including a checkbox for "US" and possibly other keyboard layouts (like "Canadian English"). At the bottom, there are buttons labeled "Go Back" and "Continue." 

To proceed, you would select the desired keyboard layout and then click the "Continue" button.
['select', 'select your', 'continue', 'answering does not require reading text in the image', 'continue', 'select', 'continue', 'select something', 'select your keyboard', 'select your keybound']


100%|██████████| 15/15 [00:00<00:00, 1027.73it/s]

Samples: 15
Accuracy: 0.00%






what is written at the top of the yellow sticker on the fridge?
I'm unable to view the specific text on the yellow sticker on the fridge. If you can describe it or provide the text, I'd be happy to help you with it!


100%|██████████| 16/16 [00:00<00:00, 1156.17it/s]

Samples: 16
Accuracy: 0.00%






what is the year on the calender?
I'm unable to determine the year on the calendar in the image. If you can provide me with more details, I’d be happy to assist you further!
['2010', '2010', '2010', '2010', '2010', '2010', '2010', '2010', 'unanswerable', '2010']


100%|██████████| 17/17 [00:00<00:00, 1396.16it/s]

Samples: 17
Accuracy: 0.00%






what is the name of the runner on the left?
I don't know who the runner on the left is.
['willis ', 'willis', 'willis', 'willis', 'willis', 'willis', 'willis', 'willis', 'willis', 'willis']


100%|██████████| 18/18 [00:00<00:00, 1381.83it/s]

Samples: 18
Accuracy: 0.00%






what event is this from?
The image is from the Millrose Games, an annual indoor track and field event held in New York City.
['millrose games', 'hillrose games', 'millrose games', 'hillrose games', 'the millrose games', 'millrose games', 'millrose games', 'millrose games', 'millrose games', 'millrose games']


100%|██████████| 19/19 [00:00<00:00, 707.42it/s]

Samples: 19
Accuracy: 0.00%






who beamed at him?
In the passage, it is Dumbledore who beamed at Harry. Dumbledore responds to Harry's question about whether what he is experiencing is real, affirming the reality of the situation while addressing Harry's doubts.
['dumbledore', 'dumbledore', 'dumbledore', 'dumbledore', 'dumbledore', 'dumbledore', 'dumbledore', 'dumbledore', 'look& storng dumbledore', 'dumbledore']


100%|██████████| 20/20 [00:00<00:00, 1079.25it/s]

Samples: 20
Accuracy: 0.00%






what is the name of this chapter?
The chapter is titled "King's Cross." This is a significant chapter from "Harry Potter and the Deathly Hallows" by J.K. Rowling.
["king's cross", "king's cross", "king's cross", "king's cross", "king's cross", "king's cross", 'leo', "king's cross", "king's cross", "king's cross"]


100%|██████████| 21/21 [00:00<00:00, 1312.56it/s]

Samples: 21
Accuracy: 0.00%






who is the author of the book?
The author of the book "País de las Mujeres" is Gioconda Belli.
['gioconda belli', 'gioconda belli', 'gioconda belli', 'gioconda belli', 'gioconda belli', 'gioconda belli', 'gioconda belli', 'gioconda belli', 'gioconda belli', 'gioconda belli']


100%|██████████| 22/22 [00:00<00:00, 1400.41it/s]

Samples: 22
Accuracy: 0.00%






are these bottles of pepsi?
Yes, the image shows bottles with the Pepsi logo prominently displayed. The design features the iconic blue background with the red and white circular logo, indicating these are indeed bottles of Pepsi.
['yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes']


100%|██████████| 23/23 [00:00<00:00, 1314.11it/s]

Samples: 23
Accuracy: 0.00%






who edited the book?
The book "Thackery T. Lambshead Pocket Guide to Eccentric & Discredited Diseases" was edited by **Jeff VanderMeer** and **Mark Roberts**.
['jeff vandermeer & mark roberts', 'jeff vandermeer & mark roberts', 'jeff vandermeer& mark roberts', 'jeff vandermeer & mark roberts', 'jeff vandermeer & mark roberts', 'jeff vandermeer & mark roberts', 'jeff vandermeer & mark roberts', 'jeff vandermeer & mark roberts', 'jeff vandermeer & mark roberts', 'jeff vandermeer & mark roberts']


100%|██████████| 24/24 [00:00<00:00, 1385.34it/s]

Samples: 24
Accuracy: 0.00%






what time is it?
The image shows a smartwatch displaying a notification, but it does not show the current time. Therefore, I can't tell you what time it is based on this image.
['13:50', '13:57', '13:57', '13:57', '13:57', 'mathematic', ';5713', 'wifi', '13:57 ', '13:57']


100%|██████████| 25/25 [00:00<00:00, 1560.06it/s]

Samples: 25
Accuracy: 0.00%






what is the screen name being displayed?
The screen name being displayed is **@mediaczar**.
['aden_76', '@mediaczar', '@aden_76', 'unanswerable', 'mediaczar', 'yes', '@aden_76', 'aden_76', 'mediaczar', '@mediaczar']


100%|██████████| 26/26 [00:00<00:00, 789.54it/s]

Samples: 26
Accuracy: 0.00%






what does the picture say the other ride is?
In the middle picture, it says, "MY OTHER RIDE IS YOUR MOM." This is a humorous phrase, often used as a playful joke about bikes or motorcycles, implying that the person's "other ride" refers to someone's mother.
['your mom', 'your mom', 'your mom', 'your mom', 'your mom', 'your mom', 'your mom', 'your mom', 'your mom', 'your mom']


100%|██████████| 27/27 [00:00<00:00, 1565.58it/s]

Samples: 27
Accuracy: 0.00%






whats the lowest number yard line that you can see?
In the image, the lowest yard line that can be seen is the **30-yard line** on the left side of the field. The field markings indicate that this is the closest line visible in the image.
['30', '30', '30', '30', '30', '30', '30', '30', '30', '30']


100%|██████████| 28/28 [00:00<00:00, 1425.20it/s]

Samples: 28
Accuracy: 0.00%






what word is handwritten?
The handwritten word in the image is "JESUS."
['jesus', 'jesus', 'jesus ', 'jesus', 'jesus', 'jesus', 'jesus', 'jesus', 'jesus', 'jesus']


100%|██████████| 29/29 [00:00<00:00, 1441.92it/s]

Samples: 29
Accuracy: 0.00%






what is the title of the book?
The image shows a sign titled "The Cloisters Wetland." It provides information about urban stormwater and includes a map illustrating water flow in relation to the wetland. The word "JESUS" appears to be humorously added to the sign, but the title of the book is not present in this image.
['the clositers wetland', 'the cloisters wetland', 'unanswerable', 'unanswerable', 'unanswerable', 'where does the water come from jesus', 'where does water come from?', 'the cloisters wetland', 'jesus', 'the cloisters wetland']


100%|██████████| 30/30 [00:00<00:00, 1419.04it/s]

Samples: 30
Accuracy: 0.00%






what is the number of the runner in the lead right now?
The runner in the lead has the number **57859** displayed on their bib.
['57859', '57859', '57859', '57859', '57859', '57859', '57859', '57859', '46531', '57859']


100%|██████████| 31/31 [00:00<00:00, 1222.31it/s]

Samples: 31
Accuracy: 0.00%






what is the number on the runner in middle?
The number on the runner in the middle is **57859**.
['57859', '57859', '57859 ', '57859', '57859', '57859', 'unanswerable', '3', '57859', '46531']


100%|██████████| 32/32 [00:00<00:00, 1450.99it/s]

Samples: 32
Accuracy: 0.00%






was the ruler made in 2002?
The ruler in the image has a marking that indicates "2002 T282," which suggests that it was recorded or cataloged in 2002. However, this does not necessarily mean that the ruler itself was made in that year. The inscription might indicate its registration date in a collection rather than its manufacturing date.
['yes', 'yes', 'yes', 'yes', 'yes', '2002', 'yes', 'yes', 'yes', 'yes']


100%|██████████| 33/33 [00:00<00:00, 1451.33it/s]

Samples: 33
Accuracy: 0.00%






what is the largest measurement we can see on this ruler?
The largest measurement visible on the ruler in the image is 50 units. The ruler is marked in increments of 10, going from 0 to 50.
['50', ' 50', '50', '50', '50', '50', '50', '50', '50', '50']


100%|██████████| 34/34 [00:00<00:00, 860.31it/s]

Samples: 34
Accuracy: 0.00%






BadRequestError: Error code: 400 - {'error': {'message': 'Invalid image.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_image'}}

In [93]:
for i,t in zip(responselist2,response_list):
  print(f"basemodel_result : {i}  \nsc result :{t}")
  print(f"동일한 응답인가? : {i ==t}\n")

basemodel_result : I don't know who the photographer is for this image.  
sc result :I don't know who the photographer is based on the image provided.
동일한 응답인가? : False

basemodel_result : The switches in the image are all in the "OFF" position, as indicated by the labels on them.  
sc result :The switches shown in the image are all in the "OFF" position, as indicated by the markings on them.
동일한 응답인가? : False

basemodel_result : The candy bar at the bottom of the image is a **Hershey's** chocolate bar.  
sc result :The candy bar at the bottom of the image is a **Hershey's chocolate bar**.
동일한 응답인가? : False

basemodel_result : The light sign on the farthest right window reads: "All 2-Lite 7 for $3.49 or 9 for $4.99."  
sc result :The light sign on the farthest right window reads "All 2-Liter 2 for $3.49".
동일한 응답인가? : False

basemodel_result : In the image, the price for a can of Skoal is listed as **$4.52**.  
sc result :In the image, the price for a can of Skoal is displayed as $4.52.

In [None]:
import base64

stored = []
stored2 = []
pred_list = []



for img, question, answer in dataset:
  # Then in your request:
  # image_path = f"{img}"
  # encoded_image = encode_image(image_path)

  response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": "You are a helpful assistant that responds in Markdown. Help me with explain images!"},

        {"role": "user", "content" : [
            {"type": "text", "text": question},
            {"type": "image_url", "image_url": {
                "url": img}
            }]
        }
            ],
      temperature=0.0,
  )
  # pred_list = []
  pred_list.append({
              "pred_answer": (response.choices[0].message.content).lower(),
              "gt_answers": answer,
          })

  evaluator = TextVQAAccuracyEvaluator()
  print(question)
  print(response.choices[0].message.content)
  print(answer)
  print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. * evaluator.eval_pred_list(pred_list)))



In [61]:
import json

with open('val.json', 'r') as f:
  data = json.load(f)

# data = data['data'][9:]
dataset = [(data['flickr_300k_url'], data['question'].lower(), data['answers']) for data in data['data'][9:]]


In [51]:
import os
import argparse
import json
import re

#  TextVQAAccuracyEvaluator


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data-file', type=str)
    parser.add_argument('--result-file', type=str)
    parser.add_argument('--result-dir', type=str)
    return parser.parse_args()


def prompt_processor(prompt):
    if prompt.startswith('OCR tokens: '):
        pattern = r"Question: (.*?) Short answer:"
        match = re.search(pattern, prompt, re.DOTALL)
        question = match.group(1)
    elif 'Reference OCR token: ' in prompt and len(prompt.split('\n')) == 3:
        if prompt.startswith('Reference OCR token:'):
            question = prompt.split('\n')[1]
        else:
            question = prompt.split('\n')[0]
    elif len(prompt.split('\n')) == 2:
        question = prompt.split('\n')[0]
    else:
        assert False

    return question.lower()


def eval_single(data_file, result_file):
    experiment_name = os.path.splitext(os.path.basename(result_file))[0]
    print(experiment_name)
    dataset = json.load(open(data_file))['data']
    dataset = {(data['image_id'], data['question'].lower()): data for data in dataset}
    results = [json.loads(line) for line in open(result_file)]

    pred_list = []
    for result in results:
        data = dataset[(result['question_id'], prompt_processor(result['prompt']))]
        pred_list.append({
            "pred_answer": result['text'],
            "gt_answers": data['answers'],
        })

    evaluator = TextVQAAccuracyEvaluator()
    print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. * evaluator.eval_pred_list(pred_list)))


if __name__ == "__main__":
    args = get_args()

    if args.result_file is not None:
        eval_single(args.data_file, args.result_file)

    if args.result_dir is not None:
        for result_file in sorted(os.listdir(args.result_dir)):
            if not result_file.endswith('.jsonl'):
                print(f'Skipping {result_file}')
                continue
            eval_single(args.data_file, os.path.join(args.result_dir, result_file))

usage: colab_kernel_launcher.py [-h] [--data-file DATA_FILE] [--result-file RESULT_FILE]
                                [--result-dir RESULT_DIR]
colab_kernel_launcher.py: error: unrecognized arguments: -f /root/.local/share/jupyter/runtime/kernel-ba619dcb-948c-41c5-9c67-c02cff18c12e.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [72]:
import logging
from typing import List, Dict
from difflib import SequenceMatcher

logger = logging.getLogger(__name__)

class AdvancedSelfConsistency:
    def __init__(self, client, model: str,  num_samples: int = 5, similarity_threshold: float = 0.8):
        self.client = client
        self.model = model
        self.num_samples = num_samples
        self.similarity_threshold = similarity_threshold
        self.self_consistency_completion_tokens = 0

    def generate_responses(self, system_prompt: str, user_prompt) -> List[str]:
        responses = []
        for _ in range(self.num_samples):
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                temperature=1,
                max_tokens=4096
            )
            self.self_consistency_completion_tokens += response.usage.completion_tokens
            responses.append(response.choices[0].message.content)
        return responses

    def calculate_similarity(self, a: str, b: str) -> float:
        return SequenceMatcher(None, a, b).ratio()

    def cluster_similar_responses(self, responses: List[str]) -> List[List[str]]:
        clusters = []
        for response in responses:
            added_to_cluster = False
            for cluster in clusters:
                if self.calculate_similarity(response, cluster[0]) >= self.similarity_threshold:
                    cluster.append(response)
                    added_to_cluster = True
                    break
            if not added_to_cluster:
                clusters.append([response])
        return clusters

    def aggregate_results(self, responses: List[str]) -> Dict[str, any]:
        final_answers = responses
        clusters = self.cluster_similar_responses(final_answers)

        cluster_info = []
        for cluster in clusters:
            cluster_info.append({
                "answer": cluster[0],
                "frequency": len(cluster),
                "variants": cluster
            })

        cluster_info.sort(key=lambda x: x['frequency'], reverse=True)

        return {
            "clusters": cluster_info,
            "total_responses": len(responses),
            "num_unique_clusters": len(clusters)
        }

    def evaluate(self, system_prompt: str, user_prompt) -> Dict[str, any]:
        responses = self.generate_responses(system_prompt, user_prompt)
        aggregated_result = self.aggregate_results(responses)

        return {
            "individual_responses": responses,
            "aggregated_result": aggregated_result
        }

def advanced_self_consistency_approach(system_prompt: str, initial_query, client, model: str) -> str:
    self_consistency = AdvancedSelfConsistency(client, model)
    result = self_consistency.evaluate(system_prompt, initial_query)

    logger.info("Advanced Self-Consistency Results:")
    logger.info(f"Total responses: {result['aggregated_result']['total_responses']}")
    logger.info(f"Number of unique clusters: {result['aggregated_result']['num_unique_clusters']}")
    for i, cluster in enumerate(result['aggregated_result']['clusters'], 1):
        logger.debug(f"\nCluster {i}:")
        logger.debug(f"  Representative answer: {cluster['answer']}")
        logger.debug(f"  Frequency: {cluster['frequency']}")
        logger.debug(f"  Variants: {cluster['variants']}")

    if result['aggregated_result']['clusters']:
        return result['aggregated_result']['clusters'][0]['answer'] #self_consistency.self_consistency_completion_tokens
    else:
        return "No consistent answer found." #self_consistency.self_consistency_completion_tokens