In [2]:
import re
import os
import glob
import numpy as np
import pandas as pd


infer_dir = "../results/inference"

# task_name = "okvqa"
# ds_split = "train"

# task_name = "mmmu"
# ds_split = "validation"

task_name = "mmmu_pro"
ds_split = "test"


model_names = ["llava-v1.6-vicuna-7b-hf", "llava-v1.6-vicuna-13b-hf",
               "Qwen2.5-VL-7B-Instruct", "InternVL2-8B",
               "deepseek-vl2-tiny", "deepseek-vl2-small"]


def extract_letter(text):
    match = re.search(r"\((\w)\)", text)
    return match.group(1) if match else ""

error_list = []
for mn in model_names:
    data_path = os.path.join(infer_dir, task_name, ds_split, f"{mn}_output.csv")
    data_df = pd.read_csv(data_path)

    arr_path = os.path.join(infer_dir, task_name, ds_split, f"{mn}_prob.npy")
    prob_arr = np.load(arr_path)

    start_chr = 'A'
    choices = []
    for i in range(prob_arr.shape[1]):
        choices.append(start_chr)
        start_chr = chr(ord(start_chr) + 1)

    prob_pred = []
    for i in np.argmax(prob_arr, axis=1):
        prob_pred.append(choices[i])
    prob_pred = np.array(prob_pred, dtype=str)

    generated_outputs = data_df["generated_outputs"].values

    extracted_outputs = []
    for output in generated_outputs:
        pred_txt = str(output)[:10].strip()
        if "\n" in pred_txt:
            pred_txt = pred_txt.split("\n")[1]
        if "(" in pred_txt or ")" in pred_txt:
            pred_txt = extract_letter(pred_txt)
        extracted_outputs.append(pred_txt[:1].upper())
    extracted_outputs = np.array(extracted_outputs)

    labels = data_df["answer"].values.astype(str) 
    if task_name == "mmmu_pro" and "llava" not in mn:
        extracted_outputs = np.delete(extracted_outputs, (1017), axis=0)
        prob_pred = np.delete(prob_pred, (1017), axis=0)
        labels = np.delete(labels, (1017), axis=0)

    errors = labels == extracted_outputs.astype(str)
    error_list.append(errors.astype(int))
    acc = np.mean(errors)

    print(prob_arr.shape)
    # print(mn, acc, np.mean(data_df["answer"].values.astype(str) == prob_pred))
    


(1591, 9)
(1591, 9)
(1592, 9)
(1592, 9)
(1592, 9)
(1592, 9)


In [18]:
import torch

exp_dict = torch.load("../results/ensemble/exp_result.pth")
error_arr = np.array(error_list)

logits = exp_dict["logits"]
ens_preds = logits[:, -9:].argmax(axis=1)
ens_err = (ens_preds == exp_dict["labels"]).astype(int)

In [16]:
import pandas as pd
import numpy as np

split = "test"
data_path = f"../results/inference/ocr/{split}"

model_names = ["llava-v1.6-vicuna-7b-hf", "llava-v1.6-vicuna-13b-hf", 
               "Qwen2.5-VL-7B-Instruct", "InternVL2-8B", 
               "deepseek-vl2-tiny", "deepseek-vl2-small"]

model_outputs = []
answers = []
questions = []
for mn in model_names:
    data_df = pd.read_csv(f"{data_path}/{mn}_output.csv", index_col=0)
    model_outputs.append(data_df["generated_outputs"].values)
    if len(answers) == 0:
        answers = data_df["answer"].values
        questions = data_df["question"].values
model_outputs = np.array(model_outputs).T


In [17]:
model_outputs.shape, answers.shape, questions.shape

((3001, 6), (3001,), (3001,))

In [18]:
import re
import nltk
from nltk.metrics.scores import f_measure


def calc_metric(labels, pred_arr):
    blue_scores, em_scores, F1 = [], [], []
    for i in range(len(labels)):
        pred = str(pred_arr[i]).strip().split(" ")
        blue_scores.append(nltk.translate.bleu_score.
                           sentence_bleu([labels[i].split(" ")], pred,
                                         weights=(1, 0, 0, 0)))

        reference_set = set(labels[i].split(" "))
        test_set = set(pred)
        F1.append(f_measure(reference_set, test_set))

        pred = " ".join(pred)
        ans = re.sub(r"[()]", "", labels[i]).split(" ")

        txt_wo_prn = re.sub(r'\([^)]*\)', '', labels[i])
        txt_with_prn = ans
        if txt_wo_prn == pred or txt_with_prn == pred:
            em_scores.append(1)
        else:
            em_scores.append(0)

    return np.mean(blue_scores), np.mean(em_scores), np.mean(F1)


In [19]:
scores = {}
for i, mn in enumerate(model_names):
    blue_sc, em_sc, f1_sc = calc_metric(answers, model_outputs[:, i])
    scores[mn] = {"bleu_score": blue_sc, "em_score": em_sc, "f1_sc": f1_sc}


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [20]:
scores

{'llava-v1.6-vicuna-7b-hf': {'bleu_score': 0.7807903437741843,
  'em_score': 0.6771076307897368,
  'f1_sc': 0.7960851736144892},
 'llava-v1.6-vicuna-13b-hf': {'bleu_score': 0.7893527761555877,
  'em_score': 0.6917694101966011,
  'f1_sc': 0.8044668447086526},
 'Qwen2.5-VL-7B-Instruct': {'bleu_score': 0.8539193058784806,
  'em_score': 0.7517494168610463,
  'f1_sc': 0.8679192066008536},
 'InternVL2-8B': {'bleu_score': 0.5236647506317215,
  'em_score': 0.4555148283905365,
  'f1_sc': 0.5709381156177536},
 'deepseek-vl2-tiny': {'bleu_score': 0.546012094714602,
  'em_score': 0.4171942685771409,
  'f1_sc': 0.5725242791702768},
 'deepseek-vl2-small': {'bleu_score': 0.383092169394673,
  'em_score': 0.29323558813728756,
  'f1_sc': 0.4171195235151174}}

In [21]:
for key, values in scores.items():
    print(f"{key}\t{values['bleu_score']*100:.3f}\t{values['em_score']*100:.3f}\t{values['f1_sc']*100:.3f}")

llava-v1.6-vicuna-7b-hf	78.079	67.711	79.609
llava-v1.6-vicuna-13b-hf	78.935	69.177	80.447
Qwen2.5-VL-7B-Instruct	85.392	75.175	86.792
InternVL2-8B	52.366	45.551	57.094
deepseek-vl2-tiny	54.601	41.719	57.252
deepseek-vl2-small	38.309	29.324	41.712


In [None]:
from torch.utils.data import Dataset, DataLoader
import numpy as np
import torch


class MyDataset(Dataset):
    def __init__(self, tokenized_inputs, labels, global_attention_tokens=None, negative_inputs=None):
        self.tokenized_inputs = tokenized_inputs
        self.labels = labels
        self.global_attention_tokens = global_attention_tokens

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # input_ids = self.tokenized_inputs['input_ids'][idx]
        # attention_mask = self.tokenized_inputs['attention_mask'][idx]
        input_ids = torch.tensor(self.tokenized_inputs[idx].ids)
        attention_mask = torch.tensor(self.tokenized_inputs[idx].attention_mask)
        global_attentions = []
        start = False
        for i in input_ids:
            if start:
                if i == 50266:
                    start = False
                global_attentions.append(1)
            else:
                if i == 50265:
                    start = True
                global_attentions.append(0)
        global_attentions = torch.tensor(global_attentions)        # token_type_ids = self.tokenized_inputs['token_type_ids'][idx]
        label = self.labels[idx]
        return_dict = {'input_ids': input_ids,
                       "labels": label,
                       'attention_mask': attention_mask,
                       'global_attention_mask': global_attentions}

        return return_dict

