In [8]:
import ast
import copy
import json
import os
import re

# Build the batch request file: one chat-completion request per (video, expression, QA)
# entry of eval_result.json, asking the judge model to grade the predicted answer.

#[VideoRefer-7B, VideoLLaMA3, Qwen2.5-VL-7B-Instruct, Qwen2.5-VL-3B-Instruct, Osprey-7B, GPT-4o-low-8frames]
# (presumably the result folders this notebook can be pointed at; the last path
# component of result_root below is one of them)
result_root = "/PATH/TO/results/RefVideoQA/GPT-4o-low-8frames"

# Read the key from the environment so it is never saved inside the notebook;
# falls back to the empty string when the variable is not set.
API_KEY = os.environ.get("AZURE_OPENAI_API_KEY", "")

# Use a context manager so the file handle is closed deterministically.
with open(os.path.join(result_root, "eval_result.json"), encoding="utf-8") as f:
    eval_result = json.load(f)

# The system prompt does not depend on the sample, so build it once.
# NOTE(review): the prompt text is kept byte-identical on purpose so that scores stay
# comparable with earlier runs. Be aware that several fragments are concatenated
# without a separating space/newline ("...JSON format.Your task...", "...task:------##INSTRUCTIONS: ").
SYSTEM_PROMPT = (
    "You are an intelligent chatbot designed for evaluating the correctness of generative outputs for question-answer pairs. "
    "Your response should be in JSON format."
    "Your task is to compare the predicted answer with the correct answer and determine if they match meaningfully. Here's how you can accomplish the task:"
    "------"
    "##INSTRUCTIONS: "
    "- Focus on the meaningful match between the predicted answer and the correct answer.\n"
    "- Consider synonyms or paraphrases as valid matches.\n"
    "- Evaluate the correctness of the prediction compared to the answer."
)

requests = []

for vid, expressions in eval_result.items():
    for exp_id, qa_pairs in expressions.items():
        for qa_id, sample in qa_pairs.items():
            # custom_id is later split on "____" to map each response back to its sample.
            task_id = f"{vid}____{exp_id}____{qa_id}"

            question = sample["question"]
            gt_answer = sample["gt_answer"]
            pred_answer = sample["pred_answer"]

            # NOTE(review): the example at the end shows a fractional score (4.8) although
            # the instruction asks for an INTEGER; left untouched to keep the prompt stable.
            user_prompt = (
                "Please evaluate the following video-based question-answer pair:\n\n"
                f"Question: {question}\n"
                f"Correct Answer: {gt_answer}\n"
                f"Predicted Answer: {pred_answer}\n\n"
                "Provide your evaluation only as a yes/no and score where the score is an integer value between 0 and 5, with 5 indicating the highest meaningful match. "
                "Please generate the response in the form of a Python dictionary string with keys 'pred' and 'score', where value of 'pred' is  a string of 'yes' or 'no' and value of 'score' is in INTEGER, not STRING."
                "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
                "For example, your response should look like this: {'pred': 'yes', 'score': 4.8}."
            )

            requests.append(
                {
                    "custom_id": task_id,
                    "method": "POST",
                    "url": "/chat/completions",
                    "body": {
                        "model": "gpt-4o-batch",
                        "response_format": {"type": "json_object"},
                        "messages": [
                            {"role": "system", "content": SYSTEM_PROMPT},
                            {"role": "user", "content": user_prompt},
                        ],
                    },
                }
            )

# One JSON document per line (JSONL).
requests_path = os.path.join(result_root, "requests.jsonl")
with open(requests_path, "w", encoding="utf-8") as f:
    for request in requests:
        f.write(json.dumps(request) + "\n")

In [None]:
push_command = f'curl --location "https://runway.devops.xiaohongshu.com/openai/azure/upload_batch_file?api-version=2024-10-01-preview" ' \
          f'--header "api-key: {API_KEY}" ' \
          f'--form "purpose=batch" ' \
          f'--form "file=@{requests_path}"'


push_result = !{push_command}


json_str = ''.join(push_result[-9:])
file_id = eval(json_str)['id']
file_id

In [None]:
submit_command = f'''curl --location "https://runway.devops.xiaohongshu.com/openai/azure/batches/create?api-version=2024-10-01-preview" \
          --header "api-key: {API_KEY}" \
          --header "Content-Type: application/json" \
          --data '{{"input_file_id": "{file_id}", "endpoint": "/chat/completions", "completion_window": "24h"}}' '''

submit_result = !{submit_command}


response_text = ''.join(submit_result)
match = re.search(r'"id":\s*"([^"]+)"', response_text)
batch_id = match.group(1)
print(batch_id)

In [None]:
query_command = f'''curl --location "https://runway.devops.xiaohongshu.com/openai/azure/batches/{batch_id}?api-version=2024-10-01-preview" \
                --header "api-key: {API_KEY}"'''

query_result = !{query_command}

match = re.search(r'"output_file_id":\s*"([^"]*)"', ''.join(query_result))
output_file_id = match.group(1)

query_result, output_file_id

In [None]:
save_path = os.path.join(result_root, 'outputs.jsonl')
pull_command = f'''curl "https://runway.devops.xiaohongshu.com/openai/azure/out_batch_file/{output_file_id}?api-version=2024-10-01-preview" \
             -H "api-key: {API_KEY}"  > {save_path}'''
!{pull_command}

In [None]:
from tqdm import tqdm


def _parse_judgement(content):
    """Turn the judge model's reply text into a dict without executing it.

    Tries JSON first and falls back to a Python literal (the prompt asks for a
    "Python dictionary string"). Raises an exception if the text cannot be parsed
    or does not describe a dict.
    """
    try:
        parsed = json.loads(content)
    except (TypeError, ValueError):
        parsed = ast.literal_eval(content)
    if not isinstance(parsed, dict):
        raise ValueError(f"expected a dict, got {type(parsed).__name__}")
    return parsed


# Load the batch output: one JSON document per line; unparseable lines are skipped.
output = []
skipped_lines = 0
with open(save_path, "r", encoding="utf-8") as file:
    for line in file:
        if not line.strip():
            continue
        try:
            output.append(json.loads(line))
        except json.JSONDecodeError:
            skipped_lines += 1
if skipped_lines:
    print("Skipped unparseable lines:", skipped_lines)


# Deep copy so that attaching 'score'/'pred' does not also modify eval_result
# (a shallow copy shares the nested per-sample dicts).
eval_result_gpt = copy.deepcopy(eval_result)


# Calculate average score and accuracy
score_sum = 0
count = 0
yes_count = 0
no_count = 0
error_count = 0

for sample in tqdm(output):

    # Entries without the expected response structure are counted as errors
    # instead of aborting the whole loop.
    try:
        message = sample["response"]["body"]['choices'][0]['message']
    except (KeyError, IndexError, TypeError):
        error_count += 1
        continue
    if 'content' not in message:
        continue

    # Validate everything first, then update counters and results, so a failure
    # half-way through never leaves a sample partially counted.
    try:
        vid, exp_id, qa_id = sample['custom_id'].split('____')
        record = eval_result_gpt[vid][exp_id][qa_id]
        result = _parse_judgement(message['content'])

        score = None
        if 'score' in result:
            # Clamp to the 0..5 range requested in the prompt; int() truncates fractions.
            score = max(min(int(result['score']), 5), 0)

        pred = None
        verdict = None
        if 'pred' in result:
            pred = result['pred']
            verdict = pred.lower()
    except Exception:
        error_count += 1
        continue

    # Computing score
    if score is not None:
        count += 1
        record['score'] = score
        score_sum += score

    # Computing accuracy
    if pred is not None:
        record['pred'] = pred
        if "yes" in verdict:
            yes_count += 1
        elif "no" in verdict:
            no_count += 1


# Guard against empty results so the summary is still printed and the file still written.
average_score = score_sum / count if count else 0.0
judged = yes_count + no_count
accuracy = yes_count / judged if judged else 0.0
print("Yes count:", yes_count)
print("No count:", no_count)
print("Error count:", error_count)
print("Accuracy:", accuracy)
print("Average score:", average_score)

# eval_result_gpt= {
#     "Yes count": yes_count,
#     "No count": no_count,
#     "Accuracy": accuracy,
#     "Average score": average_score,
#     **eval_result_gpt
# }
with open(os.path.join(result_root, "eval_result_gpt.json"), "w", encoding="utf-8") as file:
    json.dump(eval_result_gpt, file, indent=2, ensure_ascii=False)
