In [16]:
def check_NA(response):
    """Report whether ``response`` contains a refusal / "cannot tell" phrase.

    Every marker below is looked up with a plain, case-sensitive ``in`` test,
    so callers that want case-insensitive matching should lower-case the text
    before passing it in.

    Returns:
        ``True`` if at least one marker occurs in ``response``, else ``False``.
    """
    markers = (
        'sorry',
        'does not have the necessary detail',
        'cannot accurately identify',
        'Sorry',
        "can't provide",
        "cannot provide",
        "does not provide",
        "cannot assist",
        "not possible",
        "unable",
        'unknown',
        'not clear',
        'not determined',
        'not known',
        'not provided',
        'not mentioned',
        'not given',
        'not specified',
        'may not be visible',
        'not visible',
        'unfortunately',
        'not clearly visible',
    )
    for marker in markers:
        if marker in response:
            return True
    return False

def remove_verbose(response, organism='fish'):
    """Strip known boilerplate lead-in phrases from a model response.

    The phrases are removed one after another, in the order listed, wherever
    they occur in the text (matching is exact and case-sensitive). The result
    is returned with surrounding whitespace trimmed.

    Args:
        response: Raw response text.
        organism: Organism word substituted into the phrase templates.

    Returns:
        The response with every listed phrase removed and whitespace stripped.
    """
    boilerplate = (
        'The answer is: ',
        f'The {organism} in the image is a ',
        f'The {organism} in the image is called',
        f'The {organism} in the photo is a',
        f'The species and scientific name of the {organism} in the image are',
        'The species in the image is',
        f'The species of the {organism} in the image is',
        f'The scientific name of the {organism} in the image is',
        f'The species of the {organism} in the picture is',
        f'The species of the {organism} in the photo is',
        f'The species of the {organism} in this picture is',
        f'The {organism} in the photo is called',
    )

    cleaned = response
    for phrase in boilerplate:
        cleaned = cleaned.replace(phrase, '')
    return cleaned.strip()

def classification_eval_direct(data, model_name, organism='fish', task_type='direct'):
    """Score free-form ("direct") classification responses.

    For every item, the ground-truth name is read from ``'target-class'`` and
    the model response from ``'output'``. After ``remove_verbose`` cleans the
    response, the item is placed in the first matching bucket:

    1. ``correct_count``    - full ground-truth name occurs in the response;
    2. ``correct_ge_count`` - only the genus (first space-separated token of
       the ground-truth name) occurs in the response;
    3. ``NA_count``         - ``check_NA`` flags the response;
    4. ``incorrect_count``  - none of the above.

    All comparisons are done on lower-cased text.

    Args:
        data: Iterable of result records (mappings with the keys above).
        model_name: Stored under ``'Model'`` in both returned dicts.
        organism: Forwarded to ``remove_verbose``.
        task_type: Stored under ``'Question-type'`` in both returned dicts.

    Returns:
        ``(out_dict, percnt_out_dict)``: raw bucket counts, and the same
        buckets as percentages of the number of scored items (all ``0`` when
        nothing was scored).
    """
    out_dict = {
        'Model': model_name,
        'Question-type': task_type,
        'correct_count': 0,
        'correct_ge_count': 0,  # only genus correct, partial
        'NA_count': 0,
        'incorrect_count': 0,
    }

    for data_item in data:
        gt_sci_name = data_item['target-class']
        # ``x != x`` holds only for NaN-like values; such items are skipped
        # and therefore do not contribute to the total either.
        if gt_sci_name != gt_sci_name:
            continue
        gt_genus = gt_sci_name.split(' ')[0]

        # NOTE: the former ``gt_species`` extraction was removed. Its value was
        # never used, and its guard (``len(gt_sci_name) == 2``) tested the
        # character count, so a two-character name without a space raised
        # IndexError.

        response = remove_verbose(data_item['output'], organism=organism)
        response_lower = response.lower()

        if gt_sci_name.lower() in response_lower:
            out_dict['correct_count'] += 1
        elif gt_genus.lower() in response_lower:
            out_dict['correct_ge_count'] += 1
        elif check_NA(response_lower):
            out_dict['NA_count'] += 1
        else:
            out_dict['incorrect_count'] += 1

    count_keys = ('correct_count', 'correct_ge_count', 'NA_count', 'incorrect_count')
    total = sum(out_dict[key] for key in count_keys)

    percnt_out_dict = {
        'Model': model_name,
        'Question-type': task_type,
    }
    for key in count_keys:
        # Avoid dividing by zero when no item was scored.
        percnt_out_dict[key] = out_dict[key] * 100 / total if total != 0 else 0

    print(f'Evaluation completed for {model_name}.')

    return out_dict, percnt_out_dict


def classification_eval_selection(data, model_name, organism='fish', task_type='direct'):
    """Score multiple-choice ("selection") classification responses.

    For every item, the ground-truth name is read from ``'target-class'``, the
    ground-truth option from ``'option-gt'`` and the model response from
    ``'output'``. After ``remove_verbose`` cleans the response:

    * a one-character response, or one of ``a)``/``b)``/``c)``/``d)``
      (case-insensitive), is compared with the lower-cased ground-truth option
      and counted as ``correct_count`` or ``incorrect_count``;
    * any other response falls back to name matching, first bucket wins:
      full name -> ``correct_count``, genus only (first space-separated token)
      -> ``correct_ge_count``, ``check_NA`` hit -> ``NA_count``, otherwise
      ``incorrect_count``.

    Args:
        data: Iterable of result records (mappings with the keys above).
        model_name: Stored under ``'Model'`` in both returned dicts.
        organism: Forwarded to ``remove_verbose``.
        task_type: Stored under ``'Question-type'`` in both returned dicts.
            The default is ``'direct'``; it is kept as-is so existing callers
            are unaffected.

    Returns:
        ``(out_dict, percnt_out_dict)``: raw bucket counts, and the same
        buckets as percentages of the number of scored items (all ``0`` when
        nothing was scored).
    """
    out_dict = {
        'Model': model_name,
        'Question-type': task_type,
        'correct_count': 0,
        'correct_ge_count': 0,  # only genus correct, partial
        'NA_count': 0,
        'incorrect_count': 0,
    }

    for data_item in data:
        gt_sci_name = data_item['target-class']
        # ``x != x`` holds only for NaN-like values; such items are skipped
        # and therefore do not contribute to the total either.
        if gt_sci_name != gt_sci_name:
            continue
        gt_genus = gt_sci_name.split(' ')[0]

        # NOTE: the former ``gt_species`` extraction was removed. Its value was
        # never used, and its guard (``len(gt_sci_name) == 2``) tested the
        # character count, so a two-character name without a space raised
        # IndexError.

        response = remove_verbose(data_item['output'], organism=organism)
        response_lower = response.lower()

        gt_option = data_item['option-gt']

        # Responses that consist of just an option marker are graded on the
        # option letter alone.
        if len(response) == 1:
            chosen_option = response_lower
        elif response_lower in ['a)', 'b)', 'c)', 'd)']:
            chosen_option = response_lower[0]
        else:
            chosen_option = None

        if chosen_option is not None:
            if gt_option.lower() == chosen_option:
                out_dict['correct_count'] += 1
            else:
                out_dict['incorrect_count'] += 1
            continue

        # Longer responses: fall back to matching the scientific name.
        if gt_sci_name.lower() in response_lower:
            out_dict['correct_count'] += 1
        elif gt_genus.lower() in response_lower:
            out_dict['correct_ge_count'] += 1
        elif check_NA(response_lower):
            out_dict['NA_count'] += 1
        else:
            out_dict['incorrect_count'] += 1

    count_keys = ('correct_count', 'correct_ge_count', 'NA_count', 'incorrect_count')
    total = sum(out_dict[key] for key in count_keys)

    percnt_out_dict = {
        'Model': model_name,
        'Question-type': task_type,
    }
    for key in count_keys:
        # Avoid dividing by zero when no item was scored.
        percnt_out_dict[key] = out_dict[key] * 100 / total if total != 0 else 0

    print(f'Evaluation completed for {model_name}.')

    return out_dict, percnt_out_dict


In [23]:
def write_evaluation(dataset, task_type, task, organism, start_dir, model_list = None):
    """Evaluate each model's chunked result files and append the percentages.

    For every model, the ten chunk files
    ``{start_dir}/{dataset}/{task}/{task_type}/{task}_{model}_{task_type}_num_{N}_chunk_{i}.jsonl``
    are loaded with ``read_file`` (missing files contribute nothing), scored
    with the evaluator matching ``task_type``, and the percentage dict is
    appended as one line to ``./tables/classification_{task_type}_{organism}.jsonl``.
    Because the table file is opened in append mode, running this again adds
    further rows rather than replacing earlier ones.

    Args:
        dataset: Dataset directory name under ``start_dir``.
        task_type: ``'direct'`` or ``'selection'``.
        task: Task name used in the directory and file names.
        organism: ``'fish'``, ``'bird'`` or ``'butterfly'``.
        start_dir: Root directory of the result files.
        model_list: Models to evaluate; defaults to ``MODEL_NAMES``.

    Raises:
        ValueError: If ``organism`` or ``task_type`` is not supported.
            Previously this surfaced later as an unbound-variable error.
    """
    # organism -> (``num_`` value of chunks 0-8, ``num_`` value of chunk 9),
    # as embedded in the result file names.
    chunk_nums = {
        'fish': (1034, 1041),
        'bird': (1109, 1111),
        'butterfly': (1001, 1004),
    }
    evaluators = {
        'direct': classification_eval_direct,
        'selection': classification_eval_selection,
    }

    if organism not in chunk_nums:
        raise ValueError(
            f'Unsupported organism {organism!r}; expected one of {sorted(chunk_nums)}.'
        )
    if task_type not in evaluators:
        raise ValueError(
            f'Unsupported task_type {task_type!r}; expected one of {sorted(evaluators)}.'
        )

    model_list = MODEL_NAMES if model_list is None else model_list

    regular_num, last_num = chunk_nums[organism]
    evaluate = evaluators[task_type]
    num_chunks = 10
    last_chunk = num_chunks - 1

    for model_name in model_list:
        datalist = []

        for chunkid in range(num_chunks):
            some_value = regular_num if chunkid != last_chunk else last_num
            result_file = f'{start_dir}/{dataset}/{task}/{task_type}/{task}_{model_name}_{task_type}_num_{some_value}_chunk_{chunkid}.jsonl'
            datalist += read_file(result_file)

        print(f'Starting Evaluations for model {model_name} on {len(datalist)} results.')

        _, percent_result_dict = evaluate(data=datalist, model_name=model_name, task_type=task_type, organism=organism)

        # The context manager closes the writer even if write() raises.
        with jsonlines.open(f'./tables/classification_{task_type}_{organism}.jsonl', mode='a') as writer:
            writer.write(percent_result_dict)

In [24]:
import os
import json
import jsonlines

def read_file(filepath):
    """Load a JSON-lines file, skipping blank and malformed lines.

    Args:
        filepath: Path of the ``.jsonl`` file to read (decoded as UTF-8).

    Returns:
        A list with one decoded JSON value per valid line, in file order.
        If the file does not exist, a notice is printed and ``[]`` is returned.
    """
    if not os.path.exists(filepath):
        print(f'{filepath} not found.')
        return []

    datalist = []

    with open(filepath, "r", encoding='utf-8') as fh:
        # Iterate lazily instead of materialising every line with readlines().
        for line in fh:
            line = line.strip()
            if not line:
                continue
            try:
                # A partially written response can leave an invalid line
                # behind; such lines are dropped rather than aborting the load.
                record = json.loads(line)
            except json.JSONDecodeError:
                continue
            datalist.append(record)

    return datalist

In [25]:
# Model identifiers as they appear in the result file names.
MODEL_NAMES = [
    'gpt-4v',
    'llava-v1.5-7b',
    'llava-v1.5-13b',
    'cogvlm-chat',
    'blip-flan-xl',
    'blip-flan-xxl',
    'minigpt4-vicuna-7B',
    'minigpt4-vicuna-13B',
    'instruct-flant5xl',
    'instruct-flant5xxl',
    'instruct-vicuna7b',
    'instruct-vicuna13b',
]

# Dataset directory names, result root(s), result sub-directory and the
# supported question types.
DATASETS = ['fish-10k', 'bird', 'butterfly-10k']
RESULT_DIRS = ['/projects/ml4science/VLM4Bio/']
EVAL_TYPE = ['results']
TASK_TYPE = ['direct', 'selection']

# Directory under which the per-dataset result folders are looked up.
start_dir = os.path.join(RESULT_DIRS[0], EVAL_TYPE[0])

In [26]:
task = 'classification'

###### FOR Butterfly-10k ########
dataset = DATASETS[2] # 0: fish, 1: bird, 2: butterfly
# write_evaluation() only recognises the bare organism names 'fish', 'bird'
# and 'butterfly', so drop any '-10k' style suffix from the dataset name.
# (The previous expression mapped only 'butterfly-10k' and left 'fish-10k'
# unchanged.)
organism = dataset.split('-')[0]

print(f'Writing evalutations for {dataset} dataset and {task} task and {organism} organism')

Writing evalutations for butterfly-10k dataset and classification task and butterfly organism


In [27]:
# Score the free-form ("direct") question results and append them to the table file.
task_type = TASK_TYPE[0]  # 0: direct, 1: selection
write_evaluation(
    dataset=dataset,
    task=task,
    task_type=task_type,
    organism=organism,
    start_dir=start_dir,
)

Starting Evaluations for model gpt-4v on 7635 results.
Evaluation completed for gpt-4v.
Starting Evaluations for model llava-v1.5-7b on 10013 results.
Evaluation completed for llava-v1.5-7b.
Starting Evaluations for model llava-v1.5-13b on 10013 results.
Evaluation completed for llava-v1.5-13b.
Starting Evaluations for model cogvlm-chat on 10013 results.
Evaluation completed for cogvlm-chat.
Starting Evaluations for model blip-flan-xl on 10013 results.
Evaluation completed for blip-flan-xl.
Starting Evaluations for model blip-flan-xxl on 10013 results.
Evaluation completed for blip-flan-xxl.
Starting Evaluations for model minigpt4-vicuna-7B on 10013 results.
Evaluation completed for minigpt4-vicuna-7B.
Starting Evaluations for model minigpt4-vicuna-13B on 10013 results.
Evaluation completed for minigpt4-vicuna-13B.
Starting Evaluations for model instruct-flant5xl on 10013 results.
Evaluation completed for instruct-flant5xl.
Starting Evaluations for model instruct-flant5xxl on 10013 results.

In [28]:
# Score the multiple-choice ("selection") question results and append them to the table file.
task_type = TASK_TYPE[1]  # 0: direct, 1: selection
write_evaluation(
    dataset=dataset,
    task=task,
    task_type=task_type,
    organism=organism,
    start_dir=start_dir,
)

Starting Evaluations for model gpt-4v on 10013 results.
Evaluation completed for gpt-4v.
Starting Evaluations for model llava-v1.5-7b on 10013 results.
Evaluation completed for llava-v1.5-7b.
Starting Evaluations for model llava-v1.5-13b on 10013 results.
Evaluation completed for llava-v1.5-13b.
Starting Evaluations for model cogvlm-chat on 10013 results.
Evaluation completed for cogvlm-chat.
Starting Evaluations for model blip-flan-xl on 10013 results.
Evaluation completed for blip-flan-xl.
Starting Evaluations for model blip-flan-xxl on 10013 results.
Evaluation completed for blip-flan-xxl.
Starting Evaluations for model minigpt4-vicuna-7B on 10013 results.
Evaluation completed for minigpt4-vicuna-7B.
Starting Evaluations for model minigpt4-vicuna-13B on 10013 results.
Evaluation completed for minigpt4-vicuna-13B.
Starting Evaluations for model instruct-flant5xl on 10013 results.
Evaluation completed for instruct-flant5xl.
Starting Evaluations for model instruct-flant5xxl on 10013 results.

In [29]:
import pandas as pd
import json

# Subset of models listed separately from the full set below.
PROMPT_MODELS = [
    'gpt-4v',
    'llava-v1.5-7b',
    'llava-v1.5-13b',
    'cogvlm-chat',
    'blip-flan-xl',
    'blip-flan-xxl',
]

# Every model identifier that may appear in the table files.
MODEL_NAMES = [
    'gpt-4v',
    'llava-v1.5-7b',
    'llava-v1.5-13b',
    'cogvlm-chat',
    'blip-flan-xl',
    'blip-flan-xxl',
    'minigpt4-vicuna-7B',
    'minigpt4-vicuna-13B',
    'instruct-flant5xl',
    'instruct-flant5xxl',
    'instruct-vicuna7b',
    'instruct-vicuna13b',
]
# Uncomment to make PROMPT_MODELS cover every model:
# PROMPT_MODELS = MODEL_NAMES

def classification_tables(filename, model_lists=MODEL_NAMES, full_display=False):
    """Load a percentage table from a JSON-lines file into a DataFrame.

    Each non-blank line must be a JSON object with the keys ``'Model'``,
    ``'Question-type'``, ``'correct_count'``, ``'correct_ge_count'``,
    ``'NA_count'`` and ``'incorrect_count'``. Rows whose model is not in
    ``model_lists`` are dropped; the four numeric fields are stored as strings
    formatted with two decimals.

    Args:
        filename: Path of the ``.jsonl`` table file (read as UTF-8, the
            encoding ``read_file`` uses for the result files).
        model_lists: Models to keep.
        full_display: When falsy (the default) the complete table is shown
            through ``display``; when truthy nothing is shown. ``display`` is
            not defined in this file - presumably the notebook built-in.

    Returns:
        A DataFrame holding only the ``'correct_count'`` column.
    """
    columns = {
        'Models': [],
        'Question-type': [],
        'correct_count': [],
        'correct_ge_count': [],
        'NA_count': [],
        'incorrect_count': [],
    }
    percent_keys = ('correct_count', 'correct_ge_count', 'NA_count', 'incorrect_count')

    # Read the JSON lines file line by line
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            # A stray blank line would otherwise make json.loads raise.
            if not line.strip():
                continue
            data = json.loads(line)
            if data['Model'] not in model_lists:
                continue
            columns['Models'].append(data['Model'])
            columns['Question-type'].append(data['Question-type'])
            for key in percent_keys:
                columns[key].append('{:.2f}'.format(data[key]))

    # Dict insertion order fixes the column order of the table.
    df = pd.DataFrame(columns)

    # Display the DataFrame
    if not full_display:
        display(df)

    return df[['correct_count']]

In [30]:
print("\t\t\t\t\t###### Butterfly ######")

# Direct-question table: the call shows the full table, then the returned
# 'correct_count' column is displayed on its own.
df_eval = classification_tables(
    './tables/classification_direct_butterfly.jsonl',
    model_lists=MODEL_NAMES,
)
display(df_eval)

# Same for the selection-question table.
df_eval = classification_tables(
    './tables/classification_selection_butterfly.jsonl',
    model_lists=MODEL_NAMES,
)
display(df_eval)

					###### Butterfly ######


Unnamed: 0,Models,Question-type,correct_count,correct_ge_count,NA_count,incorrect_count
0,gpt-4v,direct,0.04,0.09,98.17,1.7
1,llava-v1.5-7b,direct,0.05,0.25,0.0,99.7
2,llava-v1.5-13b,direct,0.0,0.05,0.0,99.95
3,cogvlm-chat,direct,0.01,0.27,0.63,99.09
4,blip-flan-xl,direct,0.0,0.0,0.0,100.0
5,blip-flan-xxl,direct,0.0,0.0,0.0,100.0
6,minigpt4-vicuna-7B,direct,0.07,0.9,13.13,85.9
7,minigpt4-vicuna-13B,direct,0.01,0.06,1.86,98.07
8,instruct-flant5xl,direct,0.0,0.0,0.0,100.0
9,instruct-flant5xxl,direct,0.0,0.0,0.0,100.0


Unnamed: 0,correct_count
0,0.04
1,0.05
2,0.0
3,0.01
4,0.0
5,0.0
6,0.07
7,0.01
8,0.0
9,0.0


Unnamed: 0,Models,Question-type,correct_count,correct_ge_count,NA_count,incorrect_count
0,gpt-4v,selection,28.91,5.75,54.85,10.49
1,llava-v1.5-7b,selection,50.24,24.31,0.0,25.45
2,llava-v1.5-13b,selection,44.58,19.63,0.0,35.78
3,cogvlm-chat,selection,36.45,28.24,0.0,35.3
4,blip-flan-xl,selection,25.14,0.0,0.0,74.86
5,blip-flan-xxl,selection,28.88,0.0,0.0,71.12
6,minigpt4-vicuna-7B,selection,33.06,20.32,1.04,45.58
7,minigpt4-vicuna-13B,selection,28.9,22.26,0.2,48.64
8,instruct-flant5xl,selection,25.28,14.07,0.0,60.65
9,instruct-flant5xxl,selection,36.67,30.73,0.0,32.6


Unnamed: 0,correct_count
0,28.91
1,50.24
2,44.58
3,36.45
4,25.14
5,28.88
6,33.06
7,28.9
8,25.28
9,36.67
