In [None]:
import os
import json
from collections import defaultdict
from tasks import *
import numpy as np

# --- Evaluation configuration -------------------------------------------------
# Directory holding the raw LLM responses; the alternatives named by the
# original author are 'np_hard', 'p_easy' and 'p_hard'.
# NOTE(review): `difficulty` presumably has to match the suffix of `path` — confirm.
path = 'final_results/np_easy'
difficulty = 'easy'
problem_num = 500
dataset_loc = 'dataset'

files = os.listdir(path)

# merged_responses[task][problem_index][llm] -> response loaded from JSON.
merged_responses = {}

for fname in files:
    # File names are expected to look like '<llm>_<task>...'; anything without
    # an underscore is skipped.
    name_parts = fname.split('_')
    if len(name_parts) < 2:
        continue
    llm, task = name_parts[:2]

    with open(f'{path}/{fname}', 'r') as fh:
        response_dict = json.load(fh)

    # One defaultdict per task, shared by every model file for that task.
    per_problem = merged_responses.setdefault(task, defaultdict(dict))
    for i in range(problem_num):
        # The JSON is keyed by the problem index as a string, then by model name.
        per_problem[i][llm] = response_dict[str(i)][llm]

task_list = list(merged_responses)

In [None]:
# Score every model response against the task's reference answer.
# The scoring process may take a few minutes.
# To evaluate only specific tasks and models, override the lists first:
# task_list = ['GED', 'MCP', 'MCS', 'TSP']
# model_list = ['deepseek', 'llama', 'llama8b', 'gpt', 'gpt4', 'claude', 'mixtral']

# score[task_name][problem_index] -> {'gt': reference answer, <llm>: check_solution result}
score = {}
for task_name in task_list:
    # Task classes are looked up by name ('<task_name>_Task') in the notebook's globals.
    task = globals()[task_name + '_Task'](dataset_loc)
    task.load_dataset(difficulty)
    score[task_name] = defaultdict(dict)

    # On the hard split these three tasks use the approximate answer as the reference.
    use_approx_gt = task_name in ['GED', 'TSP', 'MCS'] and difficulty == 'hard'

    for i in range(0, problem_num):
        score[task_name][i]['gt'] = task.problem_set[i]['exact_answer']
        if use_approx_gt:
            score[task_name][i]['gt'] = task.problem_set[i]['approx_answer']

        for llm, response in merged_responses[task_name][i].items():
            if llm == 'problem':
                # Not a model entry; nothing to score.
                continue
            if response is None:
                # Missing response: score it as an empty string and report where it happened.
                response = ''
                print(i, llm, task_name)
            score[task_name][i][llm] = task.check_solution(i, response)
# json.dump(score, open('score.json', 'w'))

In [None]:
# Aggregate per-task, per-model metrics from `score` and `merged_responses`.
#
# For every (task, model) pair this collects, per problem:
#   feasible : score > 0
#   hallu    : score == -2
#   len      : length of the raw response (a missing/None response counts as 0)
#   acc      : score at least as good as the reference ('gt'), direction depends on task
#   rank / top1 / top3 / MRR : rank of this model among all models on that problem;
#              problems on which every model has a negative score are skipped for
#              these four metrics only.
# `results` receives one tuple per (task, model):
#   (task, model, avg_top1, avg_top3, avg_MRR, avg_feasible, avg_hallu, avg_acc, avg_len)
metrics = defaultdict(dict)
less_is_better = ['GED', 'TSP', 'MVC', 'Distance']
results = []
for task in task_list:
    print(task)
    # Models are taken from the first problem's score entry; 'gt' is the reference, not a model.
    model_list = list(score[task][0].keys())
    model_list.remove('gt')
    for model in model_list:
        stats = {'rank':[], 'feasible':[], 'MRR':[], 'hallu':[], 'acc': [],'top1':[], 'top3':[], 'len': []}
        metrics[task][model] = stats
        for i in range(0, problem_num):
            own = score[task][i][model]
            gt = score[task][i]['gt']

            stats['feasible'].append(own > 0)
            stats['hallu'].append(own == -2)

            # Index by the task being aggregated here. The previous code used
            # `task_name`, a loop variable leaked from the scoring cell, which
            # always pointed at the last scored task.
            response = merged_responses[task][i][model]
            # The scoring cell tolerates None responses, so do the same here.
            stats['len'].append(len(response) if response is not None else 0)

            if task in ['GED', 'TSP', 'MVC']:
                # Minimisation tasks: a non-negative score no larger than the reference.
                acc = 0 <= own and own <= gt
            elif task in ['MCP', 'MCS', 'MIC']:
                # Maximisation tasks: score at least the reference.
                acc = own >= gt
            else:
                acc = own == gt
            stats['acc'].append(acc)

            # Rank = 1 + number of models with a strictly better, non-negative score.
            rank = 1
            error_knt = 0
            for model2 in model_list:
                other = score[task][i][model2]
                if other < 0:
                    error_knt += 1
                if task in less_is_better:
                    if own > other and other >= 0:
                        rank += 1
                else:
                    if own < other and other >= 0:
                        rank += 1
            if own < 0:
                # A negative score always ranks last.
                rank = len(model_list)
            if error_knt == len(model_list):
                # Every model has a negative score on this problem: no ranking signal.
                continue
            stats['rank'].append(rank)
            stats['top1'].append(rank==1)
            stats['top3'].append(rank<=3)
            stats['MRR'].append(1/rank)

        # Rank-based averages are over ranked problems only; the others over all problems.
        avg_rank = np.mean(stats['rank'])
        avg_feasible = sum(stats['feasible']) / problem_num
        avg_MRR = np.mean(stats['MRR'])
        avg_hallu = sum(stats['hallu']) / problem_num
        avg_acc = sum(stats['acc']) / problem_num
        avg_top1 = np.mean(stats['top1'])
        avg_top3 = np.mean(stats['top3'])
        avg_len = np.mean(stats['len'])
        results.append((task, model, avg_top1, avg_top3, avg_MRR, avg_feasible, avg_hallu, avg_acc, avg_len))

In [None]:
# Sort the per-task results by MRR (best first) and print them.
# Each entry of `results` is a tuple laid out as
#   (task, model, avg_top1, avg_top3, avg_MRR, avg_feasible, avg_hallu, avg_acc, avg_len)
# so MRR lives at index 4 (index 5 is the feasible rate).
sorted_results = defaultdict(list)

for task in task_list:
    task_results = [result for result in results if result[0] == task]
    sorted_results[task] = sorted(task_results, key=lambda x: x[4], reverse=True)  # Sort by MRR

# Print sorted results for each task
for task, task_results in sorted_results.items():
    print(f"\nTask: {task}")
    for result in task_results:
        print(f"Model: {result[1]}, Top1: {result[2]:.3f}, Top3: {result[3]:.3f}, MRR: {result[4]:.3f}, Feasible: {result[5]:.3f}, Hallucination: {result[6]:.3f}, Accuracy: {result[7]:.3f}, len: {result[8]:.3f}")

# Collect, for each model, the list of per-task averages of every metric.
model_metrics = defaultdict(lambda: defaultdict(list))

for result in results:
    task, model, avg_top1, avg_top3, avg_MRR, avg_feasible, avg_hallu, avg_acc, avg_len = result
    model_metrics[model]['MRR'].append(avg_MRR)
    model_metrics[model]['top1'].append(avg_top1)
    model_metrics[model]['top3'].append(avg_top3)
    model_metrics[model]['feasible'].append(avg_feasible)
    model_metrics[model]['hallu'].append(avg_hallu)
    model_metrics[model]['acc'].append(avg_acc)
    model_metrics[model]['len'].append(avg_len)

# Unweighted mean over tasks for each metric, then order models by mean MRR.
# Local names deliberately avoid `metrics` so the per-task `metrics` table
# built by the aggregation cell is not overwritten.
average_metrics_performance = {
    model: {metric: sum(values) / len(values) for metric, values in per_metric.items()}
    for model, per_metric in model_metrics.items()
}
sorted_average_metrics = sorted(average_metrics_performance.items(), key=lambda x: x[1]['MRR'], reverse=True)

# Print the sorted average metrics for each model
print("\nAverage Performance Across All Tasks:")
for model, avg in sorted_average_metrics:
    print(f"Model: {model}, Average MRR: {avg['MRR']:.3f}, Average Top1: {avg['top1']:.3f}, Feasible: {avg['feasible']:.3f}, Hallucination: {avg['hallu']:.3f}, Accuracy: {avg['acc']:.3f}, len: {avg['len']:.3f}")