In [14]:
import os
import re
import glob
import pandas as pd
import itertools

def get_group(filename):
    """Return the first ``grid_<N>x<M>`` tag found in ``filename``.

    Args:
        filename: File name (or any string) to scan.

    Returns:
        The matched substring such as ``'grid_2x2'``, or ``'orig'`` when the
        name contains no grid tag.
    """
    grid_tag = re.search(r'grid_\d+x\d+', filename)
    return grid_tag.group() if grid_tag else 'orig'

def get_patch(filename):
    """Return the first ``row<N>_col<M>`` tag found in ``filename``.

    Args:
        filename: File name (or any string) to scan.

    Returns:
        The matched substring such as ``'row0_col1'``, or ``'orig'`` when the
        name contains no patch tag.
    """
    patch_tag = re.search(r'row\d+_col\d+', filename)
    return patch_tag.group() if patch_tag else 'orig'

# RealWorldQA: for every non-empty combination of grid configurations, compute
# the fraction of questions that at least one result file in the combination
# marked as a hit, and write one summary CSV per model.
base_dir = '/home/srikapan/workspace/research/multiblock/VLMEvalKit/outputs'
dataset_name = 'RealWorldQA'
model_names = ['llava_hf_v1.5_7b', 'InternVL2-1B', 'Idefics3-8B-Llama3', 'Phi-3.5-Vision']

for model_name in model_names:
    # glob returns paths in arbitrary order; sorting keeps the group order, and
    # therefore the '-'-joined Configuration labels, stable between runs.
    file_paths = sorted(
        glob.glob(f'{base_dir}/{model_name}/{model_name}_{dataset_name}_*openai_result.xlsx')
    )

    # Map each grid tag ('grid_<N>x<M>' or 'orig') to the result files carrying it.
    groups = {}
    for path in file_paths:
        file_name = os.path.basename(path)
        groups.setdefault(get_group(file_name), []).append(file_name)

    # Read every workbook exactly once. The combination loop below would
    # otherwise re-read each file once per combination it participates in.
    hits_by_file = {
        file_name: pd.read_excel(f'{base_dir}/{model_name}/{file_name}').hit.tolist()
        for file_names in groups.values()
        for file_name in file_names
    }

    # All non-empty subsets of the grid tags, smallest subsets first.
    all_combinations = []
    for r in range(1, len(groups) + 1):
        all_combinations.extend(itertools.combinations(groups, r))

    final_dict = {}
    for gp_element in all_combinations:
        hit_lists = [
            hits_by_file[file_name]
            for group in gp_element
            for file_name in groups[group]
        ]
        # NOTE(review): zip pairs rows by position and silently truncates to the
        # shortest file, so this assumes every workbook lists the same questions
        # in the same order -- confirm against the evaluation outputs.
        solved = [int(any(values)) for values in zip(*hit_lists)]
        # A file with no rows leaves nothing to average; report NaN instead of
        # raising ZeroDivisionError.
        final_dict['-'.join(gp_element)] = sum(solved) / len(solved) if solved else float('nan')

    df = pd.DataFrame(list(final_dict.items()), columns=['Configuration', 'Value'])
    # NOTE(review): the path does not include dataset_name, and the COCO cell of
    # this notebook writes to the same '<model>_summary.csv', so whichever cell
    # runs last overwrites the other's summary.
    df.to_csv(f'{base_dir}/{model_name}_summary.csv', index=False)

In [15]:
import os
import re
import ast
import glob
import pandas as pd
import itertools
from pycocoevalcap.bleu.bleu import Bleu

# Module-level BLEU scorer shared by every get_score call in this cell.
# n=4 is presumably the maximum n-gram order (BLEU-1..BLEU-4) -- confirm
# against the pycocoevalcap documentation.
bleu_scorer = Bleu(n=4)

def get_score(reference_captions, generated_captions):
    """Score one generated caption against its references with ``bleu_scorer``.

    Args:
        reference_captions: String holding a Python literal (parsed with
            ``ast.literal_eval``); presumably a list of reference caption
            strings -- confirm against the 'answer' column it is fed from.
        generated_captions: The single generated caption to evaluate.

    Returns:
        The arithmetic mean of the scores returned by
        ``bleu_scorer.compute_score`` (one per n-gram order, presumably
        BLEU-1..BLEU-4 given the scorer is built with ``n=4``).
    """
    # The scorer takes dicts keyed by sample id; a single dummy id is enough
    # because each call scores exactly one caption.
    references = {"0": ast.literal_eval(reference_captions)}
    hypotheses = {"0": [generated_captions]}

    # The second return value (per-sample scores) is not needed here.
    bleu_by_order, _ = bleu_scorer.compute_score(references, hypotheses)
    return sum(bleu_by_order) / len(bleu_by_order)

def get_group(filename):
    """Extract the ``grid_<N>x<M>`` tag from ``filename``.

    Same contract as the ``get_group`` defined in the RealWorldQA cell of this
    notebook; this definition rebinds the name when the cell runs.

    Args:
        filename: File name (or any string) to scan.

    Returns:
        The first matching substring, e.g. ``'grid_2x2'``; ``'orig'`` if the
        name carries no grid tag.
    """
    found = re.search(r'grid_\d+x\d+', filename)
    if found is None:
        return 'orig'
    return found.group()

def get_patch(filename):
    """Extract the ``row<N>_col<M>`` tag from ``filename``.

    Same contract as the ``get_patch`` defined in the RealWorldQA cell of this
    notebook; this definition rebinds the name when the cell runs.

    Args:
        filename: File name (or any string) to scan.

    Returns:
        The first matching substring, e.g. ``'row0_col1'``; ``'orig'`` if the
        name carries no patch tag.
    """
    found = re.search(r'row\d+_col\d+', filename)
    if found is None:
        return 'orig'
    return found.group()


# COCO captions: for every non-empty combination of grid configurations, take
# the best per-sample BLEU score among the combination's result files, sum those
# maxima over all samples, and write one summary CSV per model.
base_dir = '/home/srikapan/workspace/research/multiblock/VLMEvalKit/outputs'
dataset_name = 'COCO_VAL_sample'
model_names = ['llava_hf_v1.5_7b', 'InternVL2-1B', 'Idefics3-8B-Llama3', 'Phi-3.5-Vision']

for model_name in model_names:
    # glob returns paths in arbitrary order; sorting keeps the group order, and
    # therefore the '-'-joined Configuration labels, stable between runs.
    file_paths = sorted(
        glob.glob(f'{base_dir}/{model_name}/{model_name}_{dataset_name}_*.xlsx')
    )

    # Map each grid tag ('grid_<N>x<M>' or 'orig') to the result files carrying it.
    groups = {}
    for path in file_paths:
        file_name = os.path.basename(path)
        groups.setdefault(get_group(file_name), []).append(file_name)

    # Read and score every workbook exactly once. Scoring inside the
    # combination loop would repeat the BLEU computation (and the scorer's
    # console output) once per combination a file participates in.
    scores_by_file = {}
    for file_names in groups.values():
        for file_name in file_names:
            predictions = pd.read_excel(f'{base_dir}/{model_name}/{file_name}')
            scores_by_file[file_name] = [
                get_score(answer, prediction)
                for answer, prediction in zip(predictions['answer'], predictions['prediction'])
            ]

    # All non-empty subsets of the grid tags, smallest subsets first.
    all_combinations = []
    for r in range(1, len(groups) + 1):
        all_combinations.extend(itertools.combinations(groups, r))

    final_dict = {}
    for gp_element in all_combinations:
        score_lists = [
            scores_by_file[file_name]
            for group in gp_element
            for file_name in groups[group]
        ]
        # NOTE(review): zip pairs rows by position and silently truncates to the
        # shortest file, so this assumes every workbook lists the same samples
        # in the same order -- confirm against the evaluation outputs.
        best_per_sample = [max(values) for values in zip(*score_lists)]
        # NOTE(review): this is a sum over samples, whereas the RealWorldQA cell
        # reports a mean; values are therefore not comparable across datasets of
        # different size -- confirm the sum is intended.
        final_dict['-'.join(gp_element)] = sum(best_per_sample)

    df = pd.DataFrame(list(final_dict.items()), columns=['Configuration', 'Value'])
    # NOTE(review): the path does not include dataset_name, and the RealWorldQA
    # cell of this notebook writes to the same '<model>_summary.csv', so this
    # write replaces that cell's summary.
    df.to_csv(f'{base_dir}/{model_name}_summary.csv', index=False)

{'testlen': 129, 'reflen': 14, 'guess': [129, 128, 127, 126], 'correct': [16, 4, 2, 1]}
ratio: 9.21428571362755
{'testlen': 73, 'reflen': 14, 'guess': [73, 72, 71, 70], 'correct': [15, 8, 3, 0]}
ratio: 5.214285713913266
{'testlen': 95, 'reflen': 11, 'guess': [95, 94, 93, 92], 'correct': [12, 5, 3, 2]}
ratio: 8.636363635578512
{'testlen': 62, 'reflen': 11, 'guess': [62, 61, 60, 59], 'correct': [8, 2, 0, 0]}
ratio: 5.6363636358512395
{'testlen': 76, 'reflen': 11, 'guess': [76, 75, 74, 73], 'correct': [10, 4, 0, 0]}
ratio: 6.90909090846281
{'testlen': 94, 'reflen': 14, 'guess': [94, 93, 92, 91], 'correct': [15, 4, 0, 0]}
ratio: 6.714285713806122
{'testlen': 65, 'reflen': 14, 'guess': [65, 64, 63, 62], 'correct': [11, 3, 1, 0]}
ratio: 4.64285714252551
{'testlen': 111, 'reflen': 11, 'guess': [111, 110, 109, 108], 'correct': [15, 7, 3, 2]}
ratio: 10.090909089991735
{'testlen': 56, 'reflen': 11, 'guess': [56, 55, 54, 53], 'correct': [10, 2, 0, 0]}
ratio: 5.090909090446281
{'testlen': 69, 'ref

{'testlen': 57, 'reflen': 14, 'guess': [57, 56, 55, 54], 'correct': [7, 0, 0, 0]}
ratio: 4.071428571137755
{'testlen': 77, 'reflen': 14, 'guess': [77, 76, 75, 74], 'correct': [7, 0, 0, 0]}
ratio: 5.499999999607143
{'testlen': 59, 'reflen': 11, 'guess': [59, 58, 57, 56], 'correct': [6, 0, 0, 0]}
ratio: 5.3636363631487605
{'testlen': 74, 'reflen': 11, 'guess': [74, 73, 72, 71], 'correct': [9, 2, 0, 0]}
ratio: 6.727272726661157
{'testlen': 68, 'reflen': 11, 'guess': [68, 67, 66, 65], 'correct': [6, 0, 0, 0]}
ratio: 6.181818181256198
{'testlen': 82, 'reflen': 14, 'guess': [82, 81, 80, 79], 'correct': [7, 0, 0, 0]}
ratio: 5.85714285672449
{'testlen': 1647, 'reflen': 14, 'guess': [1647, 1646, 1645, 1644], 'correct': [3, 0, 0, 0]}
ratio: 117.64285713445408
{'testlen': 51, 'reflen': 11, 'guess': [51, 50, 49, 48], 'correct': [6, 1, 0, 0]}
ratio: 4.636363635942149
{'testlen': 72, 'reflen': 11, 'guess': [72, 71, 70, 69], 'correct': [10, 3, 0, 0]}
ratio: 6.5454545448595045
{'testlen': 75, 'reflen'