In [91]:
import os
import json
import ast
import numpy as np
import matplotlib.pyplot as plt
import matplotlib


In [92]:
def format_percentage(percentage):
    """Formats percentage with two decimal points"""
    return f"{percentage * 100:.1f}"

In [122]:
def is_correct(ground_truth, final_output):
    correct = True
    for i in range(len(final_output)):
        for j in range(len(final_output[0])):
            try: 
                if final_output[i][j] != ground_truth[i][j]:
                    correct = False
            except: 
                correct = False
    return correct

In [123]:
# Use LaTeX for rendering text
matplotlib.rcParams.update({
    'font.family': 'serif',
    'text.usetex': True,
})

def plot_success_matrix(success_matrix, task_names):
    fig, ax = plt.subplots(figsize=(20, 10))

    # Transpose the success_matrix
    success_matrix = success_matrix.T

    # Create a color map where unsuccessful runs are shown in light color
    cmap = plt.cm.get_cmap('RdYlBu')

    # Display the heatmap with the chosen color map
    cax = ax.matshow(success_matrix, cmap=cmap)

    # Set up axes
    ax.set_xlabel('Task Name')
    ax.set_ylabel('Run Index')

    # Configure ticks
    # Use task names for x-ticks
    ax.set_xticks(np.arange(len(task_names)))
    ax.set_xticklabels(task_names, rotation=90, fontsize=8)
    ax.set_yticks(np.arange(success_matrix.shape[0]))

    # Show grid
    ax.set_axisbelow(True)
    ax.xaxis.set_tick_params(width=0.5)
    ax.yaxis.set_tick_params(width=0.5)

    # Show the plot
    plt.tight_layout()
    plt.savefig('plot.png', dpi=300)


In [126]:
def evaluate_model_performance():
    folder = "results/evaluation_small/no_replace_comma/gpt-3.5-turbo-16k"
    #folder = "results/evaluation_small/no_replace_comma/gpt-4"


    result_files = os.listdir(folder)
    result_files = [os.path.join(folder, file) for file in result_files if os.path.splitext(file)[1] == ".json"]
    accuracies = []
    task_names = []
    
    # We assume there are 5 runs
    num_runs = 5
    num_tasks = len(result_files)
    success_matrix = np.zeros((num_tasks, num_runs))

    correct_dimensions, correct_predictions, correct_cells, total_predictions, total_cells, all_correct = 0, 0, 0, 0, 0, []
    for i, result_file in enumerate(result_files):

        task_name = result_file.split("/")[-1].split("_out")[0]
        true_task_path = os.path.join("data/evaluation_small/", task_name + ".json")
        task_names.append(task_name)

        ground_truth = None 

        with open(true_task_path, "r") as f:
            obj = json.loads(f.read())
            ground_truth = obj["test"][0]["output"]

        with open(result_file, "r") as f:
            results = json.loads(f.read())["output"]

            considered_results = []
            dimensions = []
            #results = [results[0]]
            # for each result filter that all row dimensions are the same
            for run_idx in range(num_runs):
                result = results[run_idx]
                try:
                    result =  ast.literal_eval(result)
                except:
                    continue
                d1 = len(result)
                if not isinstance(result[0], list): 
                    continue

                d2 = len(result[0])
                invalid = False 

                for row in result: 

                    if len(row) != d2: 
                        invalid = True 

                if not invalid: 
                    considered_results.append(result)
                    dimensions.append((d1, d2))


            # perform majority voting on the number of dimensions
            majority_dimension = max(dimensions,key=dimensions.count)
            # filter to lists with majority dimension
            considered_results = [result for result in considered_results if len(result) == majority_dimension[0] and len(result[0]) == majority_dimension[1]]
            final_output = considered_results[0]
            # majority voting on final output -> we perform character by character voting
            for i in range(len(final_output)):
                for j in range(len(final_output[0])):

                    char_counter = []
                    for elm_list in considered_results:
                        char_counter.append(elm_list[i][j])

                    max_character =  max(char_counter,key=char_counter.count)
                    final_output[i][j] = max_character

            
            if len(final_output) == len(ground_truth) and all(
                        len(pred_row) == len(gt_row) for pred_row, gt_row in zip(final_output, ground_truth)):
                    correct_dimensions += 1
                    for pred_row, gt_row in zip(final_output, ground_truth):
                        for pred_cell, gt_cell in zip(pred_row, gt_row):
                            total_cells += 1
                            if pred_cell == gt_cell:
                                correct_cells += 1
                                
            correct = True
            for i in range(len(final_output)):
                for j in range(len(final_output[0])):
                    try: 
                        if final_output[i][j] != ground_truth[i][j]:
                            correct = False
                    except: 
                        correct = False
                        
            total_predictions += 1
            if correct: 
                correct_predictions += 1
                for idx in range(num_runs):
                    current_pred = results[idx]
                    if str(current_pred) == str(current_pred):
                        success_matrix[i, idx] = 1
                accuracies.append(1)
            else: 
                accuracies.append(0)
    return correct_dimensions, correct_predictions, correct_cells, total_predictions, total_cells, all_correct, success_matrix, task_names

dim_correct, acc_correct, c_acc_correct, total, c_acc_total, all_correct, success_matrix, task_names = evaluate_model_performance()
cell_accuracy = c_acc_correct/c_acc_total
total_accuracy = acc_correct/total
dim_accuracy = dim_correct/total

print(f"ANALYZED {total} TASKS")
print(f"CORRECT {acc_correct} TASKS")
print(f"ACCURACY: {total_accuracy}")
print(f"DIMENSION ACCURACY: {dim_accuracy}")
print(f"CELL ACCURACY: {cell_accuracy}")
print(f"{format_percentage(dim_accuracy)} & {format_percentage(cell_accuracy)} & {acc_correct} & {format_percentage(total_accuracy)} \\\\")

print(success_matrix)

# Filter task names that have at least one correct classification
filtered_task_names = [name for idx, name in enumerate(task_names) if success_matrix[idx].any()]
print(filtered_task_names)
# Filter success_matrix for tasks with at least one correct classification
filtered_success_matrix = success_matrix[[idx for idx, _ in enumerate(task_names) if success_matrix[idx].any()]]

# Plot the matrix
plot_success_matrix(filtered_success_matrix, filtered_task_names)

ANALYZED 56 TASKS
CORRECT 10 TASKS
ACCURACY: 0.17857142857142858
DIMENSION ACCURACY: 0.875
CELL ACCURACY: 0.7101735015772871
87.5 & 71.0 & 10 & 17.9 \\
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [1. 1. 1. 1. 1.]
 [0. 0. 0. 0. 0.]
 [1. 1. 1. 1. 1.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [