In [1]:
def check_NA(response):
    """Tell whether ``response`` contains a refusal / "cannot answer" phrase.

    The check is a plain, case-sensitive substring search against a fixed
    list of markers, so callers that want case-insensitive matching must
    lower-case the text themselves before calling.

    Args:
        response: Text of the model answer.

    Returns:
        ``True`` if at least one marker occurs in ``response``, else ``False``.
    """
    refusal_markers = (
        'sorry',
        'does not have the necessary detail',
        'cannot accurately identify',
        'Sorry',
        "can't provide",
        "cannot provide",
        "does not provide",
        "cannot assist",
        "not possible",
        "unable",
        'unknown',
        'not clear',
        'not determined',
        'not known',
        'not provided',
        'not mentioned',
        'not given',
        'not specified',
        'may not be visible',
        'not visible',
        'unfortunately',
        'not clearly visible',
    )
    return any(marker in response for marker in refusal_markers)

def remove_verbose(response, organism='fish'):
    """Strip known boilerplate lead-in phrases from a model answer.

    Each phrase is removed wherever it occurs (exact, case-sensitive match),
    one phrase after the other in the order listed below, and the result is
    finally trimmed of surrounding whitespace.

    Args:
        response: Raw text produced by the model.
        organism: Organism word substituted into the phrases
            (for example ``'fish'``).

    Returns:
        The answer text with the boilerplate removed.
    """
    boilerplate = (
        'The answer is: ',
        f'The {organism} in the image is a ',
        f'The {organism} in the image is called',
        f'The {organism} in the photo is a',
        f'The species and scientific name of the {organism} in the image are',
        f'The species in the image is',
        f'The species of the {organism} in the image is',
        f'The scientific name of the {organism} in the image is',
        f'The species of the {organism} in the picture is',
        f'The species of the {organism} in the photo is',
        f'The species of the {organism} in this picture is',
        f'The {organism} in the photo is called',
    )

    cleaned = response
    for phrase in boilerplate:
        cleaned = cleaned.replace(phrase, '')
    return cleaned.strip()




def classification_eval_selection(data, model_name, organism='fish', task_type='direct'):
    """Score multiple-choice ("selection") classification answers.

    Every item of ``data`` is read through the keys ``'target-class'``
    (ground-truth scientific name), ``'output'`` (model answer) and
    ``'option-gt'`` (ground-truth option letter).  After boilerplate removal
    the answer is classified by the first rule that applies:

    1. a single character is compared, case-insensitively, with the
       ground-truth option letter -> correct / incorrect;
    2. ``'a)'`` .. ``'d)'`` is compared through its first character
       -> correct / incorrect;
    3. the answer contains the full scientific name -> correct;
    4. the answer contains the first word of the scientific name (the genus)
       -> partially correct (``correct_ge_count``);
    5. the answer contains a refusal phrase (see ``check_NA``) -> NA;
    6. otherwise -> incorrect.

    Items whose ground truth is NaN are skipped and do not count at all.

    Args:
        data: Iterable of result dictionaries as described above.
        model_name: Name stored under ``'Model'`` in both result dicts.
        organism: Organism word forwarded to ``remove_verbose``.
        task_type: Label stored under ``'Question-type'`` in both result dicts.

    Returns:
        A tuple ``(out_dict, percnt_out_dict)``: the raw counts and the same
        counts as percentages of all scored items (all ``0`` when nothing
        was scored).
    """
    count_keys = ('correct_count', 'correct_ge_count', 'NA_count', 'incorrect_count')

    out_dict = {
        'Model': model_name,
        'Question-type': task_type,
        'correct_count': 0,
        'correct_ge_count' : 0, # only genus correct, partial
        'NA_count' : 0,
        'incorrect_count': 0,
    }

    for data_item in data:
        gt_sci_name = data_item['target-class']
        # NaN is the only value that differs from itself: skip items that
        # carry no ground truth.
        if gt_sci_name != gt_sci_name:
            continue
        gt_genus = gt_sci_name.split(' ')[0]

        response = remove_verbose(data_item['output'], organism=organism)
        gt_option = data_item['option-gt']

        # Bare option letter, e.g. "B".
        if len(response) == 1:
            if gt_option.lower() == response.lower():
                out_dict['correct_count'] += 1
            else:
                out_dict['incorrect_count'] += 1
            continue

        # Option letter followed by a parenthesis, e.g. "B)".
        if response.lower() in ['a)', 'b)', 'c)', 'd)']:
            if gt_option.lower() == response.lower()[0]:
                out_dict['correct_count'] += 1
            else:
                out_dict['incorrect_count'] += 1
            continue

        lowered = response.lower()

        # correct
        if gt_sci_name.lower() in lowered:
            out_dict['correct_count'] += 1
            continue

        # partial, correct genus only
        if gt_genus.lower() in lowered:
            out_dict['correct_ge_count'] += 1
            continue

        if check_NA(lowered):
            out_dict['NA_count'] += 1
            continue

        out_dict['incorrect_count'] += 1

    total = sum(out_dict[key] for key in count_keys)

    percnt_out_dict = {
        'Model': model_name,
        'Question-type': task_type,
    }
    for key in count_keys:
        percnt_out_dict[key] = out_dict[key]*100/total if total!=0 else 0

    print(f'Evaluation completed for {model_name}.')

    return out_dict, percnt_out_dict


In [2]:
import pandas as pd
import json

# Models kept by default when classification_tables() filters the rows of a
# result file (it is that function's default `model_lists` argument).
PROMPT_MODELS = ['gpt-4v', 'gpt-4o', 'llava-v1.5-7b', 'llava-v1.5-13b', 'cogvlm-chat', 'blip-flan-xl', 'blip-flan-xxl']
# Alternative, longer model roster, currently disabled.
# NOTE(review): presumably meant for result files that also contain the
# minigpt4 / instruct-* models - confirm before re-enabling.
# MODEL_NAMES = ['gpt-4v',
#                'gpt-4o',
#                'llava-v1.5-7b',
#                'llava-v1.5-13b',
#                'cogvlm-chat',
#                'blip-flan-xl',
#                'blip-flan-xxl',
#                'minigpt4-vicuna-7B',
#                'minigpt4-vicuna-13B',
#                'instruct-flant5xl',
#                'instruct-flant5xxl',
#                'instruct-vicuna7b',
#                'instruct-vicuna13b']
# PROMPT_MODELS = MODEL_NAMES

def classification_tables(filename, model_lists=PROMPT_MODELS, full_display=False):
    """Load a JSON-lines result table and return its ``correct_count`` column.

    Each non-blank line of ``filename`` must be a JSON object with the keys
    ``'Model'``, ``'Question-type'``, ``'correct_count'``,
    ``'correct_ge_count'``, ``'NA_count'`` and ``'incorrect_count'``.  Rows
    whose model is not in ``model_lists`` are dropped; the four numeric
    values are rendered as strings with two decimals.

    Args:
        filename: Path of the JSON-lines file to read (decoded as UTF-8).
        model_lists: Collection of model names to keep.
        full_display: Despite its name, the complete table is shown through
            ``display`` when this flag is *false*; pass a true value to
            suppress that output.

    Returns:
        A one-column ``DataFrame`` holding the formatted ``correct_count``
        value of every kept row.
    """
    metric_keys = ('correct_count', 'correct_ge_count', 'NA_count', 'incorrect_count')

    # Column name -> list of cell values, in the final column order.
    columns = {'Models': [], 'Question-type': []}
    for key in metric_keys:
        columns[key] = []

    # Read the JSON lines file line by line
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            # A blank (e.g. trailing) line is not a JSON document; skip it
            # instead of letting json.loads raise.
            if not line.strip():
                continue
            record = json.loads(line)
            if record['Model'] not in model_lists:
                continue
            columns['Models'].append(record['Model'])
            columns['Question-type'].append(record['Question-type'])
            for key in metric_keys:
                columns[key].append('{:.2f}'.format(record[key]))

    # Create a DataFrame
    df = pd.DataFrame(columns)

    # Display the DataFrame
    if not full_display:
        display(df)

    return df[['correct_count']]

In [3]:
def write_evaluation(dataset, task_type, task, organism, start_dir, prompting_technique, model_list = None):
    """Evaluate every model's result chunks and append the scores to a table file.

    For each model, the ten chunk files
    ``{start_dir}/{dataset}/{prompting_technique}/{task}/{task_type}/``
    ``{task}_{model}_{task_type}_num_{N}_chunk_{0..9}.jsonl`` are read with
    ``read_file`` (missing files contribute nothing), scored with the
    evaluator matching ``task_type``, and the percentage dictionary is
    appended as one line to
    ``./tables/classification_{dataset}_{prompting_technique}_{task_type}_{organism}.jsonl``.

    Args:
        dataset: Dataset directory name, e.g. ``'fish-prompting'``.
        task_type: ``'direct'`` or ``'selection'``.
        task: Task name used in the directory and file names.
        organism: ``'fish'``, ``'bird'`` or ``'butterfly'``.
        start_dir: Root directory of the result files.
        prompting_technique: Prompting-technique directory name.
        model_list: Models to evaluate; ``MODEL_NAMES`` when ``None``.

    Raises:
        ValueError: If ``organism`` or ``task_type`` is not one of the
            supported values (previously this surfaced as an
            ``UnboundLocalError`` part-way through the run).
    """
    model_list = MODEL_NAMES if model_list is None else model_list

    # Value of the "num_<N>" part of the chunk file names, per organism.
    num_by_organism = {'fish': 50, 'bird': 50, 'butterfly': 50}
    if organism not in num_by_organism:
        raise ValueError(f'Unsupported organism: {organism!r}')
    if task_type not in ('direct', 'selection'):
        raise ValueError(f'Unsupported task_type: {task_type!r}')
    some_value = num_by_organism[organism]

    for model_name in model_list:
        datalist = []

        for chunkid in range(10):
            RESULT_FILE_TEMPT = f'{start_dir}/{dataset}/{prompting_technique}/{task}/{task_type}/{task}_{model_name}_{task_type}_num_{some_value}_chunk_{chunkid}.jsonl'
            datalist+=read_file(RESULT_FILE_TEMPT)

        print(f'Starting Evaluations for model {model_name} on {len(datalist)} results.')

        # The evaluator is looked up only on the branch that needs it, so the
        # other one does not have to be defined in the session.
        if task_type == 'direct':
            _, percent_result_dict = classification_eval_direct(data=datalist, model_name=model_name, task_type=task_type, organism=organism)
        else:
            _, percent_result_dict = classification_eval_selection(data=datalist, model_name=model_name, task_type=task_type, organism=organism)

        # Append mode: one line per model is added on every run.  The context
        # manager closes the file even if writing fails.
        with jsonlines.open(f'./tables/classification_{dataset}_{prompting_technique}_{task_type}_{organism}.jsonl', mode='a') as writer:
            writer.write(percent_result_dict)

In [4]:
import os
import json
import jsonlines

def read_file(filepath):
    """Read a JSON-lines file into a list, skipping lines that are not valid JSON.

    Args:
        filepath: Path of the file to read (decoded as UTF-8).

    Returns:
        The decoded value of every parseable line, in file order.  An empty
        list is returned (and a notice printed) when the path does not exist.
    """
    if not os.path.exists(filepath):
        print(f'{filepath} not found.')
        return []

    datalist = []

    with open(filepath, "r", encoding='utf-8') as fh:
        for line in fh:
            if not line.strip():
                continue
            try:
                # A response that was written incorrectly may leave a broken
                # line behind; such lines are dropped.  json.JSONDecodeError
                # is a ValueError, so interrupts and unrelated errors are no
                # longer swallowed.
                dict_ = json.loads(line)
            except ValueError:
                continue
            datalist.append(dict_)

    return datalist

In [5]:
# Models evaluated by write_evaluation() when no explicit model_list is given.
MODEL_NAMES = ['gpt-4v',
               'gpt-4o',
               'llava-v1.5-7b',
               'llava-v1.5-13b',
               'cogvlm-chat',
               'blip-flan-xl',
               'blip-flan-xxl']

# Dataset directory names; the evaluation cells assign one of these to `dataset`.
DATASETS = ['fish-prompting', 'bird-prompting', 'butterfly-prompting']
# Root directory and sub-directory that are joined into `start_dir` below.
RESULT_DIRS = ['/projects/ml4science/VLM4Bio/']
EVAL_TYPE = ['prompting_results']
# Question types; the evaluation cells select one by index (0: direct, 1: selection).
TASK_TYPE= ['direct', 'selection']
# Directory passed to write_evaluation() as the root of the result files.
start_dir = os.path.join(RESULT_DIRS[0], EVAL_TYPE[0])

## Fish

In [6]:
# Score the fish selection (multiple-choice) results once per prompting
# technique and append each model's percentages to the matching table file.
task = 'classification'

# DATASETS = ['fish-prompting', 'bird-prompting', 'butterfly-prompting']
dataset = 'fish-prompting'

# The organism is the first known organism word found in the dataset name.
organism = next((name for name in ('fish', 'bird', 'butterfly') if name in dataset), None)

task_type = TASK_TYPE[1] # 0: direct, 1: selection

# (banner text, prompting-technique directory) in the order they are run.
for section_title, prompting_technique in (
    ('Contextual', 'contextual'),
    ('Dense-caption', 'dense-caption'),
    ('CoT', 'cot'),
    ('No-prompting', 'no-prompting'),
):
    print(f'######### {section_title} ###########')
    print(f'Writing evalutations for {dataset} dataset and {task} task and {organism} organism')
    write_evaluation(dataset=dataset, task_type=task_type, task=task, start_dir=start_dir, organism=organism, prompting_technique=prompting_technique)


######### Contextual ###########
Writing evalutations for fish-prompting dataset and classification task and fish organism
Starting Evaluations for model gpt-4v on 500 results.
Evaluation completed for gpt-4v.
Starting Evaluations for model gpt-4o on 500 results.
Evaluation completed for gpt-4o.
Starting Evaluations for model llava-v1.5-7b on 500 results.
Evaluation completed for llava-v1.5-7b.
Starting Evaluations for model llava-v1.5-13b on 500 results.
Evaluation completed for llava-v1.5-13b.
Starting Evaluations for model cogvlm-chat on 500 results.
Evaluation completed for cogvlm-chat.
Starting Evaluations for model blip-flan-xl on 500 results.
Evaluation completed for blip-flan-xl.
Starting Evaluations for model blip-flan-xxl on 500 results.
Evaluation completed for blip-flan-xxl.
######### Dense-caption ###########
Writing evalutations for fish-prompting dataset and classification task and fish organism
Starting Evaluations for model gpt-4v on 500 results.
Evaluation completed f

In [7]:
# Show the fish selection tables, one per prompting technique.
# (heading text, technique part of the table file name) in display order.
for heading, technique in (
    ('No-Prompting', 'no-prompting'),
    ('Contextual', 'contextual'),
    ('Dense-Caption', 'dense-caption'),
    ('CoT', 'cot'),
):
    print(f"\t\t\t\t\t###### {heading} ######")
    df_eval = classification_tables(f'./tables/classification_fish-prompting_{technique}_selection_fish.jsonl', model_lists=PROMPT_MODELS)
    display(df_eval)

					###### No-Prompting ######


Unnamed: 0,Models,Question-type,correct_count,correct_ge_count,NA_count,incorrect_count
0,gpt-4v,selection,34.4,5.6,45.2,14.8
1,gpt-4o,selection,79.0,12.6,0.0,8.4
2,llava-v1.5-7b,selection,41.6,8.8,0.0,49.6
3,llava-v1.5-13b,selection,35.4,8.2,0.0,56.4
4,cogvlm-chat,selection,31.0,11.8,0.0,57.2
5,blip-flan-xl,selection,28.6,0.0,0.0,71.4
6,blip-flan-xxl,selection,22.6,0.2,0.0,77.2


Unnamed: 0,correct_count
0,34.4
1,79.0
2,41.6
3,35.4
4,31.0
5,28.6
6,22.6


					###### Contextual ######


Unnamed: 0,Models,Question-type,correct_count,correct_ge_count,NA_count,incorrect_count
0,gpt-4v,selection,30.0,3.2,59.2,7.6
1,gpt-4o,selection,77.2,14.4,0.0,8.4
2,llava-v1.5-7b,selection,40.2,10.8,0.0,49.0
3,llava-v1.5-13b,selection,35.6,10.2,0.0,54.2
4,cogvlm-chat,selection,25.6,14.2,0.0,60.2
5,blip-flan-xl,selection,27.2,0.0,0.0,72.8
6,blip-flan-xxl,selection,26.6,0.0,0.0,73.4


Unnamed: 0,correct_count
0,30.0
1,77.2
2,40.2
3,35.6
4,25.6
5,27.2
6,26.6


					###### Dense-Caption ######


Unnamed: 0,Models,Question-type,correct_count,correct_ge_count,NA_count,incorrect_count
0,gpt-4v,selection,18.8,0.8,79.4,1.0
1,gpt-4o,selection,78.6,13.6,0.0,7.8
2,llava-v1.5-7b,selection,26.0,8.8,0.0,65.2
3,llava-v1.5-13b,selection,27.6,11.2,0.0,61.2
4,cogvlm-chat,selection,32.0,12.2,0.0,55.8
5,blip-flan-xl,selection,28.4,0.0,0.0,71.6
6,blip-flan-xxl,selection,29.8,0.0,0.0,70.2


Unnamed: 0,correct_count
0,18.8
1,78.6
2,26.0
3,27.6
4,32.0
5,28.4
6,29.8


					###### CoT ######


Unnamed: 0,Models,Question-type,correct_count,correct_ge_count,NA_count,incorrect_count
0,gpt-4v,selection,42.6,4.0,46.4,7.0
1,gpt-4o,selection,86.0,8.0,0.0,6.0
2,llava-v1.5-7b,selection,41.4,11.8,0.0,46.8
3,llava-v1.5-13b,selection,34.8,8.8,0.0,56.4
4,cogvlm-chat,selection,26.8,11.4,0.8,61.0
5,blip-flan-xl,selection,29.2,1.6,0.0,69.2
6,blip-flan-xxl,selection,24.6,0.0,0.0,75.4


Unnamed: 0,correct_count
0,42.6
1,86.0
2,41.4
3,34.8
4,26.8
5,29.2
6,24.6


## Bird

In [8]:
# Score the bird selection (multiple-choice) results once per prompting
# technique and append each model's percentages to the matching table file.
task = 'classification'

# DATASETS = ['fish-prompting', 'bird-prompting', 'butterfly-prompting']
dataset = 'bird-prompting'

# The organism is the first known organism word found in the dataset name.
organism = next((name for name in ('fish', 'bird', 'butterfly') if name in dataset), None)

task_type = TASK_TYPE[1] # 0: direct, 1: selection

# (banner text, prompting-technique directory) in the order they are run.
for section_title, prompting_technique in (
    ('Contextual', 'contextual'),
    ('Dense-caption', 'dense-caption'),
    ('CoT', 'cot'),
    ('No-prompting', 'no-prompting'),
):
    print(f'######### {section_title} ###########')
    print(f'Writing evalutations for {dataset} dataset and {task} task and {organism} organism')
    write_evaluation(dataset=dataset, task_type=task_type, task=task, start_dir=start_dir, organism=organism, prompting_technique=prompting_technique)


######### Contextual ###########
Writing evalutations for bird-prompting dataset and classification task and bird organism
Starting Evaluations for model gpt-4v on 500 results.
Evaluation completed for gpt-4v.
Starting Evaluations for model gpt-4o on 500 results.
Evaluation completed for gpt-4o.
Starting Evaluations for model llava-v1.5-7b on 500 results.
Evaluation completed for llava-v1.5-7b.
Starting Evaluations for model llava-v1.5-13b on 500 results.
Evaluation completed for llava-v1.5-13b.
Starting Evaluations for model cogvlm-chat on 500 results.
Evaluation completed for cogvlm-chat.
Starting Evaluations for model blip-flan-xl on 500 results.
Evaluation completed for blip-flan-xl.
Starting Evaluations for model blip-flan-xxl on 500 results.
Evaluation completed for blip-flan-xxl.
######### Dense-caption ###########
Writing evalutations for bird-prompting dataset and classification task and bird organism
Starting Evaluations for model gpt-4v on 500 results.
Evaluation completed f

In [10]:
# Show the bird selection tables, one per prompting technique.
# (heading text, technique part of the table file name) in display order.
for heading, technique in (
    ('No-Prompting', 'no-prompting'),
    ('Contextual', 'contextual'),
    ('Dense-Caption', 'dense-caption'),
    ('CoT', 'cot'),
):
    print(f"\t\t\t\t\t###### {heading} ######")
    df_eval = classification_tables(f'./tables/classification_bird-prompting_{technique}_selection_bird.jsonl', model_lists=PROMPT_MODELS)
    display(df_eval)

					###### No-Prompting ######


Unnamed: 0,Models,Question-type,correct_count,correct_ge_count,NA_count,incorrect_count
0,gpt-4v,selection,78.8,1.4,16.8,3.0
1,gpt-4o,selection,97.6,1.2,0.0,1.2
2,llava-v1.5-7b,selection,44.2,2.6,0.0,53.2
3,llava-v1.5-13b,selection,49.8,2.6,0.0,47.6
4,cogvlm-chat,selection,45.4,2.6,0.0,52.0
5,blip-flan-xl,selection,35.6,0.0,0.0,64.4
6,blip-flan-xxl,selection,35.8,0.0,0.0,64.2


Unnamed: 0,correct_count
0,78.8
1,97.6
2,44.2
3,49.8
4,45.4
5,35.6
6,35.8


					###### Contextual ######


Unnamed: 0,Models,Question-type,correct_count,correct_ge_count,NA_count,incorrect_count
0,gpt-4v,selection,78.6,2.8,14.6,4.0
1,gpt-4o,selection,98.6,0.6,0.2,0.6
2,llava-v1.5-7b,selection,44.0,1.8,0.0,54.2
3,llava-v1.5-13b,selection,52.0,3.8,0.0,44.2
4,cogvlm-chat,selection,49.4,2.8,0.0,47.8
5,blip-flan-xl,selection,35.6,0.0,0.0,64.4
6,blip-flan-xxl,selection,30.4,0.2,0.0,69.4


Unnamed: 0,correct_count
0,78.6
1,98.6
2,44.0
3,52.0
4,49.4
5,35.6
6,30.4


					###### Dense-Caption ######


Unnamed: 0,Models,Question-type,correct_count,correct_ge_count,NA_count,incorrect_count
0,gpt-4v,selection,87.4,1.0,8.4,3.2
1,gpt-4o,selection,97.0,1.2,0.4,1.4
2,llava-v1.5-7b,selection,33.4,1.8,0.0,64.8
3,llava-v1.5-13b,selection,41.0,2.2,0.0,56.8
4,cogvlm-chat,selection,44.0,2.0,0.0,54.0
5,blip-flan-xl,selection,25.6,0.0,0.0,74.4
6,blip-flan-xxl,selection,22.8,0.0,0.0,77.2


Unnamed: 0,correct_count
0,87.4
1,97.0
2,33.4
3,41.0
4,44.0
5,25.6
6,22.8


					###### CoT ######


Unnamed: 0,Models,Question-type,correct_count,correct_ge_count,NA_count,incorrect_count
0,gpt-4v,selection,62.6,0.6,36.2,0.6
1,gpt-4o,selection,98.6,0.4,0.4,0.6
2,llava-v1.5-7b,selection,37.4,2.2,0.0,60.4
3,llava-v1.5-13b,selection,47.8,3.0,0.0,49.2
4,cogvlm-chat,selection,42.2,2.6,0.2,55.0
5,blip-flan-xl,selection,30.6,0.6,0.0,68.8
6,blip-flan-xxl,selection,31.0,0.0,0.0,69.0


Unnamed: 0,correct_count
0,62.6
1,98.6
2,37.4
3,47.8
4,42.2
5,30.6
6,31.0


## Butterfly

In [11]:
# Score the butterfly selection (multiple-choice) results once per prompting
# technique and append each model's percentages to the matching table file.
task = 'classification'

# DATASETS = ['fish-prompting', 'bird-prompting', 'butterfly-prompting']
dataset = 'butterfly-prompting'

# The organism is the first known organism word found in the dataset name.
organism = next((name for name in ('fish', 'bird', 'butterfly') if name in dataset), None)

task_type = TASK_TYPE[1] # 0: direct, 1: selection

# (banner text, prompting-technique directory) in the order they are run.
for section_title, prompting_technique in (
    ('Contextual', 'contextual'),
    ('Dense-caption', 'dense-caption'),
    ('CoT', 'cot'),
    ('No-prompting', 'no-prompting'),
):
    print(f'######### {section_title} ###########')
    print(f'Writing evalutations for {dataset} dataset and {task} task and {organism} organism')
    write_evaluation(dataset=dataset, task_type=task_type, task=task, start_dir=start_dir, organism=organism, prompting_technique=prompting_technique)


######### Contextual ###########
Writing evalutations for butterfly-prompting dataset and classification task and butterfly organism
Starting Evaluations for model gpt-4v on 500 results.
Evaluation completed for gpt-4v.
Starting Evaluations for model gpt-4o on 500 results.
Evaluation completed for gpt-4o.
Starting Evaluations for model llava-v1.5-7b on 500 results.
Evaluation completed for llava-v1.5-7b.
Starting Evaluations for model llava-v1.5-13b on 500 results.
Evaluation completed for llava-v1.5-13b.
Starting Evaluations for model cogvlm-chat on 500 results.
Evaluation completed for cogvlm-chat.
Starting Evaluations for model blip-flan-xl on 500 results.
Evaluation completed for blip-flan-xl.
Starting Evaluations for model blip-flan-xxl on 500 results.
Evaluation completed for blip-flan-xxl.
######### Dense-caption ###########
Writing evalutations for butterfly-prompting dataset and classification task and butterfly organism
Starting Evaluations for model gpt-4v on 500 results.
Ev

In [13]:
# Show the butterfly selection tables, one per prompting technique.
# (heading text, technique part of the table file name) in display order.
for heading, technique in (
    ('No-Prompting', 'no-prompting'),
    ('Contextual', 'contextual'),
    ('Dense-Caption', 'dense-caption'),
    ('CoT', 'cot'),
):
    print(f"\t\t\t\t\t###### {heading} ######")
    df_eval = classification_tables(f'./tables/classification_butterfly-prompting_{technique}_selection_butterfly.jsonl', model_lists=PROMPT_MODELS)
    display(df_eval)

					###### No-Prompting ######


Unnamed: 0,Models,Question-type,correct_count,correct_ge_count,NA_count,incorrect_count
0,gpt-4v,selection,13.2,12.6,65.6,8.6
1,gpt-4o,selection,56.4,24.0,0.2,19.4
2,llava-v1.5-7b,selection,27.2,29.4,0.0,43.4
3,llava-v1.5-13b,selection,26.8,30.0,0.0,43.2
4,cogvlm-chat,selection,25.6,34.8,0.0,39.6
5,blip-flan-xl,selection,24.4,0.0,0.0,75.6
6,blip-flan-xxl,selection,21.2,0.0,0.0,78.8


Unnamed: 0,correct_count
0,13.2
1,56.4
2,27.2
3,26.8
4,25.6
5,24.4
6,21.2


					###### Contextual ######


Unnamed: 0,Models,Question-type,correct_count,correct_ge_count,NA_count,incorrect_count
0,gpt-4v,selection,9.2,7.6,77.6,5.6
1,gpt-4o,selection,56.2,24.8,0.4,18.6
2,llava-v1.5-7b,selection,26.0,31.4,0.0,42.6
3,llava-v1.5-13b,selection,24.6,29.2,0.0,46.2
4,cogvlm-chat,selection,27.2,31.0,0.0,41.8
5,blip-flan-xl,selection,23.6,0.0,0.0,76.4
6,blip-flan-xxl,selection,24.6,0.0,0.0,75.4


Unnamed: 0,correct_count
0,9.2
1,56.2
2,26.0
3,24.6
4,27.2
5,23.6
6,24.6


					###### Dense-Caption ######


Unnamed: 0,Models,Question-type,correct_count,correct_ge_count,NA_count,incorrect_count
0,gpt-4v,selection,49.6,14.2,24.6,11.6
1,gpt-4o,selection,63.2,21.2,0.0,15.6
2,llava-v1.5-7b,selection,25.2,26.4,0.0,48.4
3,llava-v1.5-13b,selection,23.8,25.8,0.0,50.4
4,cogvlm-chat,selection,27.0,31.0,0.0,42.0
5,blip-flan-xl,selection,23.2,0.0,0.0,76.8
6,blip-flan-xxl,selection,23.2,0.0,0.0,76.8


Unnamed: 0,correct_count
0,49.6
1,63.2
2,25.2
3,23.8
4,27.0
5,23.2
6,23.2


					###### CoT ######


Unnamed: 0,Models,Question-type,correct_count,correct_ge_count,NA_count,incorrect_count
0,gpt-4v,selection,63.6,11.8,16.4,8.2
1,gpt-4o,selection,74.6,16.2,0.2,9.0
2,llava-v1.5-7b,selection,21.4,34.8,0.0,43.8
3,llava-v1.5-13b,selection,23.2,28.2,0.0,48.6
4,cogvlm-chat,selection,34.6,30.0,0.0,35.4
5,blip-flan-xl,selection,37.2,16.8,0.0,46.0
6,blip-flan-xxl,selection,23.6,0.2,0.0,76.2


Unnamed: 0,correct_count
0,63.6
1,74.6
2,21.4
3,23.2
4,34.6
5,37.2
6,23.6
