In [1]:
def check_NA(response):
    """Return True when `response` contains a known "cannot answer" phrase.

    The check is a plain, case-sensitive substring search: the function
    returns True as soon as any of the phrases below occurs anywhere in
    `response`, and False otherwise.
    """
    refusal_phrases = (
        'sorry',
        'does not have the necessary detail',
        'cannot accurately identify',
        'Sorry',
        "can't provide",
        "cannot provide",
        "does not provide",
        "cannot assist",
        "not possible",
        "unable",
        'unknown',
        'not clear',
        'not determined',
        'not known',
        'not provided',
        'not mentioned',
        'not given',
        'not specified',
        'may not be visible',
        'not visible',
        'unfortunately',
        'not clearly visible',
    )
    return any(phrase in response for phrase in refusal_phrases)

def remove_verbose(response, organism='fish'):
    """Strip known boilerplate lead-in phrases from a model response.

    Each phrase below is removed wherever it occurs in `response` (matching
    is case-sensitive), in the listed order, and the result is returned with
    leading/trailing whitespace trimmed.

    Args:
        response: Raw text produced by the model.
        organism: Organism word interpolated into the phrase templates.

    Returns:
        The cleaned, whitespace-stripped response.
    """
    # Order matters only in that it mirrors the sequence of replacements.
    lead_ins = (
        'The answer is: ',
        f'The {organism} in the image is a ',
        f'The {organism} in the image is called',
        f'The {organism} in the photo is a',
        f'The species and scientific name of the {organism} in the image are',
        'The species in the image is',
        f'The species of the {organism} in the image is',
        f'The scientific name of the {organism} in the image is',
        f'The species of the {organism} in the picture is',
        f'The species of the {organism} in the photo is',
        f'The species of the {organism} in this picture is',
        f'The {organism} in the photo is called',
    )

    cleaned = response
    for phrase in lead_ins:
        cleaned = cleaned.replace(phrase, '')
    return cleaned.strip()




def classification_eval_selection(data, model_name, organism='fish', task_type='direct'):
    """Score multiple-choice ("selection") classification responses.

    Every item in `data` is a mapping with the keys 'target-class'
    (ground-truth scientific name), 'output' (raw model response) and
    'option-gt' (ground-truth option letter). Each item lands in exactly one
    bucket, decided in this order:

    1. The cleaned response is a single character, or one of
       'a)', 'b)', 'c)', 'd)' (case-insensitive): correct if its letter
       equals the ground-truth option, otherwise incorrect.
    2. The ground-truth scientific name occurs in the response
       (case-insensitive): correct.
    3. `check_NA` flags the lower-cased response: NA.
    4. Anything else: incorrect.

    Items whose 'target-class' is NaN are skipped and do not count at all.

    Args:
        data: Sequence of result records as described above.
        model_name: Stored under 'Model' in both returned dicts.
        organism: Forwarded to `remove_verbose` for boilerplate stripping.
        task_type: Stored under 'Question-type' in both returned dicts.

    Returns:
        `(out_dict, percnt_out_dict)`: raw counts, and the same counts as
        percentages of all scored items (all zeros when nothing was scored).
        'correct_ge_count' is kept for table compatibility but is never
        incremented here.
    """
    count_keys = ('correct_count', 'correct_ge_count', 'NA_count', 'incorrect_count')
    option_answers = ('a)', 'b)', 'c)', 'd)')

    out_dict = {
        'Model': model_name,
        'Question-type': task_type,
        'correct_count': 0,
        'correct_ge_count': 0,  # only genus correct, partial (currently unused)
        'NA_count': 0,
        'incorrect_count': 0,
    }

    for data_item in data:
        gt_sci_name = data_item['target-class']
        # NaN is the only value that is not equal to itself: skip records
        # with a missing ground-truth name.
        if gt_sci_name != gt_sci_name:
            continue

        response = remove_verbose(data_item['output'], organism=organism)
        answer = response.lower()
        gt_option = data_item['option-gt']

        # Bare option letter ("A") or letter plus parenthesis ("A)"): compare
        # the letter itself against the ground-truth option.
        if len(answer) == 1 or answer in option_answers:
            if gt_option.lower() == answer[0]:
                out_dict['correct_count'] += 1
            else:
                out_dict['incorrect_count'] += 1
            continue

        # Free-text answer that names the right species.
        if gt_sci_name.lower() in answer:
            out_dict['correct_count'] += 1
            continue

        # Refusal / "cannot tell" style answer.
        if check_NA(answer):
            out_dict['NA_count'] += 1
            continue

        out_dict['incorrect_count'] += 1

    total = sum(out_dict[key] for key in count_keys)

    percnt_out_dict = {
        'Model': model_name,
        'Question-type': task_type,
    }
    for key in count_keys:
        # Guard against division by zero when no record was scored.
        percnt_out_dict[key] = out_dict[key] * 100 / total if total != 0 else 0

    print(f'Evaluation completed for {model_name}.')

    return out_dict, percnt_out_dict


In [2]:
def write_evaluation(dataset, task_type, task, organism, start_dir, model_list=None):
    """Evaluate every model's result chunks and append the scores to a table file.

    For each model, the ten chunk files
    `{start_dir}/{dataset}/{task}/{task_type}/{task}_{model}_{task_type}_num_{N}_chunk_{0..9}.jsonl`
    are loaded with `read_file` (missing files contribute no records), scored
    with the evaluator matching `task_type`, and the percentage dict is
    appended as one line to
    `./tables/classification_{dataset}_{task_type}_{organism}.jsonl`.
    Because the table file is opened in append mode, re-running adds
    duplicate rows.

    Args:
        dataset: Dataset directory name, e.g. 'fish-easy'.
        task_type: 'direct' or 'selection'.
        task: Task name used in the directory and file names.
        organism: 'fish', 'bird' or 'butterfly'.
        start_dir: Root directory of the result files.
        model_list: Models to evaluate; defaults to `MODEL_NAMES`.

    Raises:
        ValueError: If `organism` or `task_type` is not one of the supported
            values (previously this surfaced as an unbound-variable error).
    """
    model_list = MODEL_NAMES if model_list is None else model_list

    # Value of the `num_<N>` token in result file names, per organism.
    num_by_organism = {'fish': 20, 'bird': 20, 'butterfly': 20}
    if organism not in num_by_organism:
        raise ValueError(f'Unsupported organism: {organism!r}')
    num_token = num_by_organism[organism]

    if task_type not in ('direct', 'selection'):
        raise ValueError(f'Unsupported task_type: {task_type!r}')

    for model_name in model_list:
        datalist = []

        for chunkid in range(10):
            RESULT_FILE_TEMPT = f'{start_dir}/{dataset}/{task}/{task_type}/{task}_{model_name}_{task_type}_num_{num_token}_chunk_{chunkid}.jsonl'
            datalist += read_file(RESULT_FILE_TEMPT)

        print(f'Starting Evaluations for model {model_name} on {len(datalist)} results.')

        # The evaluator is looked up lazily so that only the one actually
        # needed has to be defined.
        if task_type == 'direct':
            _, percent_result_dict = classification_eval_direct(data=datalist, model_name=model_name, task_type=task_type, organism=organism)
        else:
            _, percent_result_dict = classification_eval_selection(data=datalist, model_name=model_name, task_type=task_type, organism=organism)

        # Context manager guarantees the file is closed even if write() fails.
        with jsonlines.open(f'./tables/classification_{dataset}_{task_type}_{organism}.jsonl', mode='a') as writer:
            writer.write(percent_result_dict)

In [3]:
import os
import json
import jsonlines

def read_file(filepath):
    """Load a JSON-lines file into a list, tolerating bad lines.

    Args:
        filepath: Path of the `.jsonl` file to read (decoded as UTF-8).

    Returns:
        A list with one decoded JSON value per valid line, in file order.
        Blank lines and lines that are not valid JSON are skipped silently.
        If the file does not exist, a "not found" message is printed and an
        empty list is returned.
    """
    if not os.path.exists(filepath):
        print(f'{filepath} not found.')
        return []

    datalist = []

    with open(filepath, "r", encoding='utf-8') as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            try:
                # A partially written / corrupted response line must not
                # abort the whole load; only JSON errors are tolerated.
                record = json.loads(line)
            except json.JSONDecodeError:
                continue
            datalist.append(record)

    return datalist

In [4]:
# Model identifiers; write_evaluation() falls back to this list when no
# explicit model_list is passed, and the names are interpolated into the
# result file names.
MODEL_NAMES = ['gpt-4v',
               'gpt-4o',
               'llava-v1.5-7b',
               'llava-v1.5-13b',
               'cogvlm-chat',
               'blip-flan-xl',
               'blip-flan-xxl',
               'minigpt4-vicuna-7B',
               'minigpt4-vicuna-13B',
               'instruct-flant5xl',
               'instruct-flant5xxl',
               'instruct-vicuna7b',
               'instruct-vicuna13b']

# Dataset names; the evaluation cells below hard-code one of these per run.
DATASETS = ['fish-easy', 'fish-medium', 'bird-easy', 'bird-medium', 'butterfly-easy', 'butterfly-medium', 'butterfly-hard']
# Root directory and sub-directory of the result files; only the first entry
# of each list is used to build start_dir.
RESULT_DIRS = ['/projects/ml4science/VLM4Bio/']
EVAL_TYPE = ['easy_medium_hard_results']
# Question styles; index 0 is 'direct', index 1 is 'selection'.
TASK_TYPE= ['direct', 'selection']
# Directory under which write_evaluation() looks for per-dataset results.
start_dir = os.path.join(RESULT_DIRS[0], EVAL_TYPE[0])

## Fish

In [5]:
task = 'classification'
task_type = TASK_TYPE[1]  # 0: direct, 1: selection

# Score both fish splits, one write_evaluation() run per dataset.
for dataset in ('fish-easy', 'fish-medium'):
    # First known organism whose name occurs in the dataset name, else None.
    organism = next(
        (candidate for candidate in ('fish', 'bird', 'butterfly') if candidate in dataset),
        None,
    )

    print(f'Writing evalutations for {dataset} dataset and {task} task and {organism} organism')

    write_evaluation(dataset=dataset, task_type=task_type, task=task, start_dir=start_dir, organism=organism)

Writing evalutations for fish-easy dataset and classification task and fish organism
Starting Evaluations for model gpt-4v on 200 results.
Evaluation completed for gpt-4v.
Starting Evaluations for model gpt-4o on 200 results.
Evaluation completed for gpt-4o.
Starting Evaluations for model llava-v1.5-7b on 200 results.
Evaluation completed for llava-v1.5-7b.
Starting Evaluations for model llava-v1.5-13b on 200 results.
Evaluation completed for llava-v1.5-13b.
Starting Evaluations for model cogvlm-chat on 200 results.
Evaluation completed for cogvlm-chat.
Starting Evaluations for model blip-flan-xl on 200 results.
Evaluation completed for blip-flan-xl.
Starting Evaluations for model blip-flan-xxl on 200 results.
Evaluation completed for blip-flan-xxl.
Starting Evaluations for model minigpt4-vicuna-7B on 200 results.
Evaluation completed for minigpt4-vicuna-7B.
Starting Evaluations for model minigpt4-vicuna-13B on 200 results.
Evaluation completed for minigpt4-vicuna-13B.
Starting Evaluat

## Bird

In [6]:
task = 'classification'
task_type = TASK_TYPE[1]  # 0: direct, 1: selection

# Score both bird splits, one write_evaluation() run per dataset.
for dataset in ('bird-easy', 'bird-medium'):
    # First known organism whose name occurs in the dataset name, else None.
    organism = next(
        (candidate for candidate in ('fish', 'bird', 'butterfly') if candidate in dataset),
        None,
    )

    print(f'Writing evalutations for {dataset} dataset and {task} task and {organism} organism')

    write_evaluation(dataset=dataset, task_type=task_type, task=task, start_dir=start_dir, organism=organism)

Writing evalutations for bird-easy dataset and classification task and bird organism
Starting Evaluations for model gpt-4v on 200 results.
Evaluation completed for gpt-4v.
Starting Evaluations for model gpt-4o on 200 results.
Evaluation completed for gpt-4o.
Starting Evaluations for model llava-v1.5-7b on 200 results.
Evaluation completed for llava-v1.5-7b.
Starting Evaluations for model llava-v1.5-13b on 200 results.
Evaluation completed for llava-v1.5-13b.
Starting Evaluations for model cogvlm-chat on 200 results.
Evaluation completed for cogvlm-chat.
Starting Evaluations for model blip-flan-xl on 200 results.
Evaluation completed for blip-flan-xl.
Starting Evaluations for model blip-flan-xxl on 200 results.
Evaluation completed for blip-flan-xxl.
/projects/ml4science/VLM4Bio/easy_medium_hard_results/bird-easy/classification/selection/classification_minigpt4-vicuna-7B_selection_num_20_chunk_0.jsonl not found.
/projects/ml4science/VLM4Bio/easy_medium_hard_results/bird-easy/classificat

Starting Evaluations for model instruct-vicuna7b on 200 results.
Evaluation completed for instruct-vicuna7b.
/projects/ml4science/VLM4Bio/easy_medium_hard_results/bird-medium/classification/selection/classification_instruct-vicuna13b_selection_num_20_chunk_0.jsonl not found.
/projects/ml4science/VLM4Bio/easy_medium_hard_results/bird-medium/classification/selection/classification_instruct-vicuna13b_selection_num_20_chunk_1.jsonl not found.
/projects/ml4science/VLM4Bio/easy_medium_hard_results/bird-medium/classification/selection/classification_instruct-vicuna13b_selection_num_20_chunk_2.jsonl not found.
/projects/ml4science/VLM4Bio/easy_medium_hard_results/bird-medium/classification/selection/classification_instruct-vicuna13b_selection_num_20_chunk_3.jsonl not found.
/projects/ml4science/VLM4Bio/easy_medium_hard_results/bird-medium/classification/selection/classification_instruct-vicuna13b_selection_num_20_chunk_4.jsonl not found.
/projects/ml4science/VLM4Bio/easy_medium_hard_results/bi

## Butterfly

In [7]:
task = 'classification'
task_type = TASK_TYPE[1]  # 0: direct, 1: selection

# Score all three butterfly splits, one write_evaluation() run per dataset.
for dataset in ('butterfly-easy', 'butterfly-medium', 'butterfly-hard'):
    # First known organism whose name occurs in the dataset name, else None.
    organism = next(
        (candidate for candidate in ('fish', 'bird', 'butterfly') if candidate in dataset),
        None,
    )

    print(f'Writing evalutations for {dataset} dataset and {task} task and {organism} organism')

    write_evaluation(dataset=dataset, task_type=task_type, task=task, start_dir=start_dir, organism=organism)

Writing evalutations for butterfly-easy dataset and classification task and butterfly organism
Starting Evaluations for model gpt-4v on 200 results.
Evaluation completed for gpt-4v.
Starting Evaluations for model gpt-4o on 200 results.
Evaluation completed for gpt-4o.
Starting Evaluations for model llava-v1.5-7b on 200 results.
Evaluation completed for llava-v1.5-7b.
Starting Evaluations for model llava-v1.5-13b on 200 results.
Evaluation completed for llava-v1.5-13b.
Starting Evaluations for model cogvlm-chat on 200 results.
Evaluation completed for cogvlm-chat.
Starting Evaluations for model blip-flan-xl on 200 results.
Evaluation completed for blip-flan-xl.
Starting Evaluations for model blip-flan-xxl on 200 results.
Evaluation completed for blip-flan-xxl.
Starting Evaluations for model minigpt4-vicuna-7B on 200 results.
Evaluation completed for minigpt4-vicuna-7B.
Starting Evaluations for model minigpt4-vicuna-13B on 200 results.
Evaluation completed for minigpt4-vicuna-13B.
Starti

/projects/ml4science/VLM4Bio/easy_medium_hard_results/butterfly-hard/classification/selection/classification_instruct-flant5xxl_selection_num_20_chunk_0.jsonl not found.
/projects/ml4science/VLM4Bio/easy_medium_hard_results/butterfly-hard/classification/selection/classification_instruct-flant5xxl_selection_num_20_chunk_1.jsonl not found.
/projects/ml4science/VLM4Bio/easy_medium_hard_results/butterfly-hard/classification/selection/classification_instruct-flant5xxl_selection_num_20_chunk_2.jsonl not found.
/projects/ml4science/VLM4Bio/easy_medium_hard_results/butterfly-hard/classification/selection/classification_instruct-flant5xxl_selection_num_20_chunk_3.jsonl not found.
/projects/ml4science/VLM4Bio/easy_medium_hard_results/butterfly-hard/classification/selection/classification_instruct-flant5xxl_selection_num_20_chunk_4.jsonl not found.
/projects/ml4science/VLM4Bio/easy_medium_hard_results/butterfly-hard/classification/selection/classification_instruct-flant5xxl_selection_num_20_chunk

In [12]:
import pandas as pd
import json

# Subset of models; not referenced by classification_tables itself.
PROMPT_MODELS = ['gpt-4v', 'llava-v1.5-7b', 'llava-v1.5-13b', 'cogvlm-chat', 'blip-flan-xl', 'blip-flan-xxl']
# Default set of models whose rows are kept when building a table.
MODEL_NAMES = ['gpt-4v',
               'gpt-4o',
               'llava-v1.5-7b',
               'llava-v1.5-13b',
               'cogvlm-chat',
               'blip-flan-xl',
               'blip-flan-xxl',
               'minigpt4-vicuna-7B',
               'minigpt4-vicuna-13B',
               'instruct-flant5xl',
               'instruct-flant5xxl',
               'instruct-vicuna7b',
               'instruct-vicuna13b']
# PROMPT_MODELS = MODEL_NAMES

def classification_tables(filename, model_lists=MODEL_NAMES, full_display=False):
    """Build a results table from a JSON-lines score file.

    Each non-blank line of `filename` must be a JSON object with the keys
    'Model', 'Question-type', 'correct_count', 'correct_ge_count',
    'NA_count' and 'incorrect_count'. Rows whose 'Model' is not in
    `model_lists` are dropped; the four numeric values are formatted as
    strings with two decimals.

    Args:
        filename: Path of the UTF-8 encoded `.jsonl` score file.
        model_lists: Container of model names to keep.
        full_display: NOTE: despite its name, the complete table is shown
            via `display` when this is False (the default); pass True to
            suppress that output.

    Returns:
        A one-column DataFrame holding only 'correct_count'.
    """
    columns = ['Models', 'Question-type', 'correct_count',
               'correct_ge_count', 'NA_count', 'incorrect_count']
    metric_keys = columns[2:]

    rows = []
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            data = json.loads(line)
            if data['Model'] not in model_lists:
                continue
            row = {'Models': data['Model'], 'Question-type': data['Question-type']}
            for key in metric_keys:
                row[key] = '{:.2f}'.format(data[key])
            rows.append(row)

    # Passing `columns` keeps the column set/order stable even with no rows.
    df = pd.DataFrame(rows, columns=columns)

    if not full_display:
        display(df)

    return df[['correct_count']]

In [13]:
# For each fish split: print a banner, build the table from its score file,
# then display the returned correct_count column.
for banner, table_path in (
    ("\t\t\t\t\t###### Fish Easy ######", './tables/classification_fish-easy_selection_fish.jsonl'),
    ("\t\t\t\t\t###### Fish Medium ######", './tables/classification_fish-medium_selection_fish.jsonl'),
):
    print(banner)
    df_eval = classification_tables(table_path, model_lists=MODEL_NAMES)
    display(df_eval)


					###### Fish Easy ######


Unnamed: 0,Models,Question-type,correct_count,correct_ge_count,NA_count,incorrect_count
0,gpt-4v,selection,44.5,0.0,39.0,16.5
1,gpt-4o,selection,37.5,0.0,42.5,20.0
2,llava-v1.5-7b,selection,47.5,0.0,0.0,52.5
3,llava-v1.5-13b,selection,46.0,0.0,0.0,54.0
4,cogvlm-chat,selection,24.0,0.0,0.0,76.0
5,blip-flan-xl,selection,34.0,0.0,0.0,66.0
6,blip-flan-xxl,selection,27.5,0.0,0.0,72.5
7,minigpt4-vicuna-7B,selection,29.0,0.0,1.0,70.0
8,minigpt4-vicuna-13B,selection,19.5,0.0,0.0,80.5
9,instruct-flant5xl,selection,32.0,0.0,0.0,68.0


Unnamed: 0,correct_count
0,44.5
1,37.5
2,47.5
3,46.0
4,24.0
5,34.0
6,27.5
7,29.0
8,19.5
9,32.0


					###### Fish Medium ######


Unnamed: 0,Models,Question-type,correct_count,correct_ge_count,NA_count,incorrect_count
0,gpt-4v,selection,3.5,0.0,86.0,10.5
1,gpt-4o,selection,5.5,0.0,82.0,12.5
2,llava-v1.5-7b,selection,30.0,0.0,0.0,70.0
3,llava-v1.5-13b,selection,28.5,0.0,0.0,71.5
4,cogvlm-chat,selection,27.0,0.0,0.0,73.0
5,blip-flan-xl,selection,26.0,0.0,0.0,74.0
6,blip-flan-xxl,selection,23.0,0.0,0.0,77.0
7,minigpt4-vicuna-7B,selection,26.5,0.0,4.0,69.5
8,minigpt4-vicuna-13B,selection,25.0,0.0,1.5,73.5
9,instruct-flant5xl,selection,28.5,0.0,0.0,71.5


Unnamed: 0,correct_count
0,3.5
1,5.5
2,30.0
3,28.5
4,27.0
5,26.0
6,23.0
7,26.5
8,25.0
9,28.5


In [14]:
# For each bird split: print a banner, build the table from its score file,
# then display the returned correct_count column.
for banner, table_path in (
    ("\t\t\t\t\t###### Bird Easy ######", './tables/classification_bird-easy_selection_bird.jsonl'),
    ("\t\t\t\t\t###### Bird Medium ######", './tables/classification_bird-medium_selection_bird.jsonl'),
):
    print(banner)
    df_eval = classification_tables(table_path, model_lists=MODEL_NAMES)
    display(df_eval)


					###### Bird Easy ######


Unnamed: 0,Models,Question-type,correct_count,correct_ge_count,NA_count,incorrect_count
0,gpt-4v,selection,73.5,0.0,16.5,10.0
1,gpt-4o,selection,68.0,0.0,20.5,11.5
2,llava-v1.5-7b,selection,53.5,0.0,0.0,46.5
3,llava-v1.5-13b,selection,50.0,0.0,0.0,50.0
4,cogvlm-chat,selection,38.5,0.0,0.0,61.5
5,blip-flan-xl,selection,34.5,0.0,0.0,65.5
6,blip-flan-xxl,selection,36.0,0.0,0.0,64.0
7,minigpt4-vicuna-7B,selection,0.0,0.0,0.0,0.0
8,minigpt4-vicuna-13B,selection,0.0,0.0,0.0,0.0
9,instruct-flant5xl,selection,41.0,0.0,0.0,59.0


Unnamed: 0,correct_count
0,73.5
1,68.0
2,53.5
3,50.0
4,38.5
5,34.5
6,36.0
7,0.0
8,0.0
9,41.0


					###### Bird Medium ######


Unnamed: 0,Models,Question-type,correct_count,correct_ge_count,NA_count,incorrect_count
0,gpt-4v,selection,41.0,0.0,42.5,16.5
1,gpt-4o,selection,40.5,0.0,40.0,19.5
2,llava-v1.5-7b,selection,30.5,0.0,0.0,69.5
3,llava-v1.5-13b,selection,37.0,0.0,0.0,63.0
4,cogvlm-chat,selection,30.0,0.0,0.0,70.0
5,blip-flan-xl,selection,25.5,0.0,0.0,74.5
6,blip-flan-xxl,selection,21.0,0.0,0.0,79.0
7,minigpt4-vicuna-7B,selection,0.0,0.0,0.0,0.0
8,minigpt4-vicuna-13B,selection,0.0,0.0,0.0,0.0
9,instruct-flant5xl,selection,27.0,0.0,0.0,73.0


Unnamed: 0,correct_count
0,41.0
1,40.5
2,30.5
3,37.0
4,30.0
5,25.5
6,21.0
7,0.0
8,0.0
9,27.0


In [15]:
# For each butterfly split: print a banner, build the table from its score
# file, then display the returned correct_count column.
for banner, table_path in (
    ("\t\t\t\t\t###### Butterfly Easy ######", './tables/classification_butterfly-easy_selection_butterfly.jsonl'),
    ("\t\t\t\t\t###### Butterfly Medium ######", './tables/classification_butterfly-medium_selection_butterfly.jsonl'),
    ("\t\t\t\t\t###### Butterfly Hard ######", './tables/classification_butterfly-hard_selection_butterfly.jsonl'),
):
    print(banner)
    df_eval = classification_tables(table_path, model_lists=MODEL_NAMES)
    display(df_eval)

					###### Butterfly Easy ######


Unnamed: 0,Models,Question-type,correct_count,correct_ge_count,NA_count,incorrect_count
0,gpt-4v,selection,18.5,0.0,62.0,19.5
1,gpt-4o,selection,17.5,0.0,61.5,21.0
2,llava-v1.5-7b,selection,19.0,0.0,0.0,81.0
3,llava-v1.5-13b,selection,20.5,0.0,0.0,79.5
4,cogvlm-chat,selection,24.5,0.0,0.0,75.5
5,blip-flan-xl,selection,30.0,0.0,0.0,70.0
6,blip-flan-xxl,selection,25.0,0.0,0.0,75.0
7,minigpt4-vicuna-7B,selection,34.5,0.0,0.5,65.0
8,minigpt4-vicuna-13B,selection,26.0,0.0,1.0,73.0
9,instruct-flant5xl,selection,24.5,0.0,0.0,75.5


Unnamed: 0,correct_count
0,18.5
1,17.5
2,19.0
3,20.5
4,24.5
5,30.0
6,25.0
7,34.5
8,26.0
9,24.5


					###### Butterfly Medium ######


Unnamed: 0,Models,Question-type,correct_count,correct_ge_count,NA_count,incorrect_count
0,gpt-4v,selection,5.5,0.0,81.0,13.5
1,gpt-4o,selection,7.0,0.0,77.5,15.5
2,llava-v1.5-7b,selection,29.5,0.0,0.0,70.5
3,llava-v1.5-13b,selection,29.0,0.0,0.0,71.0
4,cogvlm-chat,selection,29.5,0.0,0.0,70.5
5,blip-flan-xl,selection,20.0,0.0,0.0,80.0
6,blip-flan-xxl,selection,25.5,0.0,0.0,74.5
7,minigpt4-vicuna-7B,selection,33.0,0.0,1.5,65.5
8,minigpt4-vicuna-13B,selection,25.0,0.0,0.5,74.5
9,instruct-flant5xl,selection,27.5,0.0,0.0,72.5


Unnamed: 0,correct_count
0,5.5
1,7.0
2,29.5
3,29.0
4,29.5
5,20.0
6,25.5
7,33.0
8,25.0
9,27.5


					###### Butterfly Hard ######


Unnamed: 0,Models,Question-type,correct_count,correct_ge_count,NA_count,incorrect_count
0,gpt-4v,selection,2.0,0.0,93.5,4.5
1,gpt-4o,selection,1.5,0.0,93.0,5.5
2,llava-v1.5-7b,selection,22.0,0.0,0.0,78.0
3,llava-v1.5-13b,selection,21.0,0.0,0.0,79.0
4,cogvlm-chat,selection,32.0,0.0,0.0,68.0
5,blip-flan-xl,selection,26.5,0.0,0.0,73.5
6,blip-flan-xxl,selection,23.33,0.0,0.0,76.67
7,minigpt4-vicuna-7B,selection,0.0,0.0,0.0,0.0
8,minigpt4-vicuna-13B,selection,0.0,0.0,0.0,0.0
9,instruct-flant5xl,selection,0.0,0.0,0.0,0.0


Unnamed: 0,correct_count
0,2.0
1,1.5
2,22.0
3,21.0
4,32.0
5,26.5
6,23.33
7,0.0
8,0.0
9,0.0
