In [1]:
import os
import json
import ast

In [2]:
def format_percentage(percentage, decimals=1):
    """Format a fraction as a percentage string without the percent sign.

    Args:
        percentage: Fraction to format (e.g. ``0.5`` for fifty percent); it is
            multiplied by 100 before formatting.
        decimals: Number of digits after the decimal point. Defaults to 1,
            which is what this function has always produced.

    Returns:
        The scaled value as a fixed-point string, e.g. ``"50.0"``.
    """
    return f"{percentage * 100:.{decimals}f}"

In [16]:
def evaluate_model_performance(folders=None, data_dir="data/evaluation_small/"):
    """Score majority-voted model outputs against the ground-truth grids.

    For every ``*.json`` file listed in the last result folder, the candidate
    outputs stored under the ``"output"`` key of that file in *each* folder are
    pooled, parsed, filtered to well-formed rectangular grids, and reduced to a
    single grid by majority voting (first on grid shape, then cell by cell).
    The voted grid is compared with ``obj["test"][0]["output"]`` of the task
    file ``<data_dir>/<task_name>.json``, where ``task_name`` is the result
    file name up to ``"_out"``.

    Args:
        folders: Result folders whose outputs are pooled. Defaults to the
            gpt-3.5-turbo-16k and gpt-4 ``no_replace_comma`` folders.
        data_dir: Directory holding the ground-truth task files.

    Returns:
        A tuple ``(correct_dimensions, correct_predictions, correct_cells,
        total_predictions, total_cells, all_correct)``. ``total_cells`` and
        ``correct_cells`` only cover tasks whose voted grid has the same shape
        as the ground truth. ``all_correct`` is always an empty list; it is
        kept only so existing callers can keep unpacking six values.

    Raises:
        FileNotFoundError: If a task file, or a result file in one of the
            folders, does not exist.
    """
    if folders is None:
        folders = [
            "results/evaluation_small/no_replace_comma/gpt-3.5-turbo-16k",
            "results/evaluation_small/no_replace_comma/gpt-4",
        ]
    folders = list(folders)

    correct_dimensions = correct_predictions = correct_cells = 0
    total_predictions = total_cells = 0
    all_correct = []

    if not folders:
        return correct_dimensions, correct_predictions, correct_cells, total_predictions, total_cells, all_correct

    # The task list is taken from the last folder; sorting only makes the
    # processing order reproducible and does not affect the counters.
    result_files = sorted(
        name for name in os.listdir(folders[-1])
        if os.path.splitext(name)[1] == ".json"
    )

    for result_file in result_files:
        task_name = os.path.basename(result_file).split("_out")[0]
        true_task_path = os.path.join(data_dir, task_name + ".json")
        with open(true_task_path, "r") as f:
            ground_truth = json.load(f)["test"][0]["output"]

        # Pool the raw candidate strings of all models for this task.
        raw_outputs = []
        for folder in folders:
            with open(os.path.join(folder, result_file), "r") as f:
                raw_outputs += json.load(f)["output"]

        total_predictions += 1

        candidates = _parse_candidate_grids(raw_outputs)
        if not candidates:
            # Nothing usable was produced: the task counts as a failed
            # prediction instead of crashing the whole evaluation.
            continue

        final_output = _majority_vote_grid(candidates)

        same_shape = len(final_output) == len(ground_truth) and all(
            len(pred_row) == len(gt_row)
            for pred_row, gt_row in zip(final_output, ground_truth)
        )
        if not same_shape:
            # A grid of the wrong shape can never be a correct prediction,
            # even if the cells it does have match a prefix of the truth.
            continue

        correct_dimensions += 1
        task_cells = 0
        task_matches = 0
        for pred_row, gt_row in zip(final_output, ground_truth):
            for pred_cell, gt_cell in zip(pred_row, gt_row):
                task_cells += 1
                if pred_cell == gt_cell:
                    task_matches += 1
        total_cells += task_cells
        correct_cells += task_matches

        if task_matches == task_cells:
            correct_predictions += 1

    return correct_dimensions, correct_predictions, correct_cells, total_predictions, total_cells, all_correct


def _parse_candidate_grids(raw_outputs):
    """Parse raw model outputs and keep only non-empty rectangular grids of list rows."""
    grids = []
    for raw in raw_outputs:
        try:
            grid = ast.literal_eval(raw)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a Python literal at all; ignore this candidate.
            continue
        # "[1, 2], [3, 4]" parses to a tuple of rows, so tuples are accepted
        # at the top level; scalars, dicts and empty sequences are not grids.
        if not isinstance(grid, (list, tuple)) or not grid:
            continue
        if not all(isinstance(row, list) for row in grid):
            continue
        width = len(grid[0])
        if any(len(row) != width for row in grid):
            continue
        grids.append(grid)
    return grids


def _majority_vote_grid(grids):
    """Return a new grid voted cell by cell among the grids that share the most common shape."""
    shapes = [(len(grid), len(grid[0])) for grid in grids]
    # max(..., key=list.count) keeps the first-seen value on ties and also
    # works for unhashable cell values.
    majority_shape = max(shapes, key=shapes.count)
    voters = [grid for grid, shape in zip(grids, shapes) if shape == majority_shape]

    n_rows, n_cols = majority_shape
    voted = []
    for i in range(n_rows):
        voted_row = []
        for j in range(n_cols):
            votes = [grid[i][j] for grid in voters]
            voted_row.append(max(votes, key=votes.count))
        voted.append(voted_row)
    return voted

dim_correct, acc_correct, c_acc_correct, total, c_acc_total, all_correct = evaluate_model_performance()

# Guard every ratio: c_acc_total is 0 when no task had the right dimensions,
# and total is 0 when no result files were found. Report 0.0 in those cases
# instead of raising ZeroDivisionError.
cell_accuracy = c_acc_correct / c_acc_total if c_acc_total else 0.0
total_accuracy = acc_correct / total if total else 0.0
dim_accuracy = dim_correct / total if total else 0.0

print(f"ANALYZED {total} TASKS")
print(f"CORRECT {acc_correct} TASKS")
print(f"ACCURACY: {total_accuracy}")
print(f"DIMENSION ACCURACY: {dim_accuracy}")
print(f"CELL ACCURACY: {cell_accuracy}")
# One row for a LaTeX table: dimension acc. & cell acc. & #correct & task acc.
print(f"{format_percentage(dim_accuracy)} & {format_percentage(cell_accuracy)} & {acc_correct} & {format_percentage(total_accuracy)} \\\\")

#print("accuracy",  sum(accuracies), "/", len(accuracies))

path results/evaluation_small/no_replace_comma/gpt-3.5-turbo-16k/f0afb749_out.json
path results/evaluation_small/no_replace_comma/gpt-4/f0afb749_out.json
path results/evaluation_small/no_replace_comma/gpt-3.5-turbo-16k/34b99a2b_out.json
path results/evaluation_small/no_replace_comma/gpt-4/34b99a2b_out.json
path results/evaluation_small/no_replace_comma/gpt-3.5-turbo-16k/a8610ef7_out.json
path results/evaluation_small/no_replace_comma/gpt-4/a8610ef7_out.json
path results/evaluation_small/no_replace_comma/gpt-3.5-turbo-16k/4cd1b7b2_out.json
path results/evaluation_small/no_replace_comma/gpt-4/4cd1b7b2_out.json
path results/evaluation_small/no_replace_comma/gpt-3.5-turbo-16k/ed98d772_out.json
path results/evaluation_small/no_replace_comma/gpt-4/ed98d772_out.json
path results/evaluation_small/no_replace_comma/gpt-3.5-turbo-16k/7953d61e_out.json
path results/evaluation_small/no_replace_comma/gpt-4/7953d61e_out.json
path results/evaluation_small/no_replace_comma/gpt-3.5-turbo-16k/a59b95c0_ou