In [1]:
import sys
sys.path.append('..')
import os
import json
from collections import defaultdict
from tasks import *
import numpy as np

In [2]:
difficulty = 'hard'  # easy: small graphs; hard: large graphs
path = '../results/'+difficulty
files = os.listdir(path)
merged_responses = {}
problem_num = 500
dataset_loc = '../dataset'

for f in files:
    if len(f.split('_')) < 2:
        continue
    llm, task = f.split('_')[0], f.split('_')[1]
    with open(f'{path}/{f}', 'r') as file:
        response_dict = json.load(file)
    for i in range(0, problem_num):
        if task not in merged_responses:
            merged_responses[task] = defaultdict(dict)
        merged_responses[task][i][llm] = response_dict[str(i)][llm]
task_list = list(merged_responses.keys())

In [3]:
# The scoring process may take a few minutes.
# Refer to the following lines if want to evaluate on specific tasks and models.
# task_list = ['Connected','Diameter','Distance','Neighbor']
# model_list = 'gpt4,gpt4mini,claude,glm,llama,llama8b,deepseek,qwen72b,mixtral,gemma'.split(',')

score = {}
for task_name in task_list:
    task= globals()[task_name + '_Task'](dataset_loc)
    task.load_dataset(difficulty)
    score[task_name] = defaultdict(dict)
    for i in range(0, problem_num):
        score[task_name][i]['gt'] = task.problem_set[i]['exact_answer']
        if score[task_name][i]['gt'] is None:
            score[task_name][i]['gt'] = task.problem_set[i]['approx_answer']
        for llm in merged_responses[task_name][i].keys():
            if llm == 'problem':
                continue
            r = merged_responses[task_name][i][llm]
            if r is None:
                r = ''
                print(i, llm, task_name)
            score[task_name][i][llm] = task.check_solution(i, r)
# json.dump(score, open('score.json', 'w'))

invalid literal for int() with base 10: ''
invalid literal for int() with base 10: ''


In [7]:
metrics = defaultdict(dict)
less_is_better = ['GED', 'TSP', 'MVC', 'Distance']
results = []
for task in task_list:
    model_list = 'gpt4,gpt4mini,claude,glm,llama,llama8b,deepseek,qwen72b,mixtral,gemma,doubao,dsR1'.split(',')
    # model_list = ['qwen2', 'qwen2SFT', 'gpt4', 'gpt4coder','deepseek','deepseekcoder']  # for Table 2 in the paper
    for model in model_list:
        metrics[task][model] = {'rank':[], 'feasible':[], 'MRR':[], 'hallu':[], 'acc': [],'top1':[], 'top3':[], 'len': []}
        for i in range(0, problem_num):
            metrics[task][model]['feasible'].append(score[task][i][model]>0)
            metrics[task][model]['hallu'].append(score[task][i][model]==-2)
            metrics[task][model]['len'].append(len(merged_responses[task_name][i][model]))
            if task in ['GED', 'TSP', 'MVC']:
                acc = 0 <= score[task][i][model] and score[task][i][model] <= score[task][i]['gt']
            elif task in ['MCP', 'MCS', 'MIC']:
                acc = score[task][i][model] >= score[task][i]['gt']
            else:
                acc = score[task][i][model] == score[task][i]['gt']
            
            metrics[task][model]['acc'].append(acc)
            
            rank = 1
            error_knt = 0
            for model2 in model_list:
                if score[task][i][model2] < 0:
                    error_knt += 1
                if task in less_is_better:
                    if score[task][i][model] > score[task][i][model2] and score[task][i][model2] >= 0:
                        rank += 1
                else:
                    if score[task][i][model] < score[task][i][model2] and score[task][i][model2] >= 0:
                        rank += 1
            if score[task][i][model] < 0:
                rank = len(model_list)
            if error_knt == len(model_list):
                continue
            metrics[task][model]['rank'].append(rank)
            metrics[task][model]['top1'].append(rank==1)
            metrics[task][model]['top3'].append(rank<=3)
            metrics[task][model]['MRR'].append(1/rank)
        avg_rank = np.mean(metrics[task][model]['rank'])
        avg_feasible = sum(metrics[task][model]['feasible']) / problem_num
        avg_MRR = np.mean(metrics[task][model]['MRR'])
        avg_hallu = sum(metrics[task][model]['hallu']) / problem_num
        avg_acc = sum(metrics[task][model]['acc']) / problem_num
        avg_top1 = np.mean(metrics[task][model]['top1'])
        avg_top3 = np.mean(metrics[task][model]['top3'])
        avg_len = np.mean(metrics[task][model]['len'])
        results.append((task, model, avg_top1, avg_top3, avg_MRR, avg_feasible, avg_hallu, avg_acc, avg_len))

In [8]:
# Sorting the results by MRR for each task
sorted_results = defaultdict(list)

for task in task_list:
    task_results = [result for result in results if result[0] == task]
    sorted_results[task] = sorted(task_results, key=lambda x: x[5], reverse=True)  # Sort by MRR

# Print sorted results for each task
for task, task_results in sorted_results.items():
    print(f"\nTask: {task}")
    for result in task_results:
        print(f"Model: {result[1]}, Top1: {result[2]:.3f}, Top3: {result[3]:.3f}, MRR: {result[4]:.3f}, Feasible: {result[5]:.3f}, Hallucination: {result[6]:.3f}, Accuracy: {result[7]:.3f}, len: {result[8]:.3f}")

# Calculate average MRR performance across all tasks for each model
model_metrics = defaultdict(lambda: defaultdict(list))

# Aggregate metrics for each model across tasks
for result in results:
    task, model, avg_top1, avg_top3, avg_MRR, avg_feasible, avg_hallu, avg_acc, avg_len = result
    model_metrics[model]['MRR'].append(avg_MRR)
    model_metrics[model]['top1'].append(avg_top1)
    model_metrics[model]['top3'].append(avg_top3)
    model_metrics[model]['feasible'].append(avg_feasible)
    model_metrics[model]['hallu'].append(avg_hallu)
    model_metrics[model]['acc'].append(avg_acc)
    model_metrics[model]['len'].append(avg_len)
    
# Compute average metrics for each model and sort models by their average MRR
average_metrics_performance = {model: {metric: sum(values) / len(values) for metric, values in metrics.items()} for model, metrics in model_metrics.items()}
sorted_average_metrics = sorted(average_metrics_performance.items(), key=lambda x: x[1]['MRR'], reverse=True)

# Print the sorted average metrics for each model
print("\nAverage Performance Across All Tasks:")
for model, metrics in sorted_average_metrics:
    print(f"Model: {model}, Average MRR: {metrics['MRR']:.3f}, Average Top1: {metrics['top1']:.3f}, Feasible: {metrics['feasible']:.3f}, Hallucination: {metrics['hallu']:.3f}, Accuracy: {metrics['acc']:.3f}, len: {metrics['len']:.3f}")

# average for P tasks and NP tasks separately
P_tasks = ['Connected','Diameter','Distance','Neighbor']
NP_tasks = ['GED','TSP','MCP','MCS','MIC','MVC']
P_results = []
NP_results = []
for result in results:
    task, model, avg_top1, avg_top3, avg_MRR, avg_feasible, avg_hallu, avg_acc, avg_len = result
    if task in P_tasks:
        P_results.append(result)
    else:
        NP_results.append(result)     


Task: Connected
Model: dsR1, Top1: 1.000, Top3: 1.000, MRR: 1.000, Feasible: 1.000, Hallucination: 0.000, Accuracy: 0.998, len: 25514.280
Model: qwen72b, Top1: 0.284, Top3: 0.418, MRR: 0.425, Feasible: 0.718, Hallucination: 0.128, Accuracy: 0.284, len: 2415.626
Model: gpt4mini, Top1: 0.246, Top3: 0.344, MRR: 0.374, Feasible: 0.714, Hallucination: 0.266, Accuracy: 0.246, len: 227.756
Model: glm, Top1: 0.518, Top3: 0.602, MRR: 0.594, Feasible: 0.662, Hallucination: 0.338, Accuracy: 0.516, len: 114.640
Model: claude, Top1: 0.518, Top3: 0.592, MRR: 0.592, Feasible: 0.658, Hallucination: 0.342, Accuracy: 0.518, len: 559.132
Model: gpt4, Top1: 0.404, Top3: 0.474, MRR: 0.489, Feasible: 0.598, Hallucination: 0.394, Accuracy: 0.404, len: 522.306
Model: doubao, Top1: 0.394, Top3: 0.468, MRR: 0.476, Feasible: 0.546, Hallucination: 0.454, Accuracy: 0.394, len: 199.636
Model: llama, Top1: 0.264, Top3: 0.334, MRR: 0.367, Feasible: 0.544, Hallucination: 0.456, Accuracy: 0.264, len: 70.968
Model: mix

In [10]:
# Calculate average metrics for P tasks and NP tasks separately
P_metrics = defaultdict(lambda: defaultdict(list))
NP_metrics = defaultdict(lambda: defaultdict(list))

# Aggregate metrics for P and NP tasks
for result in P_results:
    task, model, avg_top1, avg_top3, avg_MRR, avg_feasible, avg_hallu, avg_acc, avg_len = result
    P_metrics[model]['MRR'].append(avg_MRR)
    P_metrics[model]['top1'].append(avg_top1)
    P_metrics[model]['top3'].append(avg_top3)
    P_metrics[model]['feasible'].append(avg_feasible)
    P_metrics[model]['hallu'].append(avg_hallu)
    P_metrics[model]['acc'].append(avg_acc)
    P_metrics[model]['len'].append(avg_len)

for result in NP_results:
    task, model, avg_top1, avg_top3, avg_MRR, avg_feasible, avg_hallu, avg_acc, avg_len = result
    NP_metrics[model]['MRR'].append(avg_MRR)
    NP_metrics[model]['top1'].append(avg_top1)
    NP_metrics[model]['top3'].append(avg_top3)
    NP_metrics[model]['feasible'].append(avg_feasible)
    NP_metrics[model]['hallu'].append(avg_hallu)
    NP_metrics[model]['acc'].append(avg_acc)
    NP_metrics[model]['len'].append(avg_len)

# Compute average metrics for P and NP tasks
average_P_metrics = {model: {metric: sum(values) / len(values) for metric, values in metrics.items()} for model, metrics in P_metrics.items()}
average_NP_metrics = {model: {metric: sum(values) / len(values) for metric, values in metrics.items()} for model, metrics in NP_metrics.items()}

# Sort models by their average MRR for P and NP tasks
sorted_P_metrics = sorted(average_P_metrics.items(), key=lambda x: x[1]['MRR'], reverse=True)
sorted_NP_metrics = sorted(average_NP_metrics.items(), key=lambda x: x[1]['MRR'], reverse=True)


print(f"difficulty: {difficulty}, {'small' if difficulty == 'easy' else 'large'} graphs")
# Print the sorted average metrics for P tasks
print("\nAverage Performance for P Tasks:")
for model, metrics in sorted_P_metrics:
    print(f"Model: {model}, Average MRR: {metrics['MRR']:.3f}, Average Top1: {metrics['top1']:.3f}, Average Top3: {metrics['top3']:.3f}, Feasible: {metrics['feasible']:.3f}, Hallucination: {metrics['hallu']:.3f}, Accuracy: {metrics['acc']:.3f}")

# Print the sorted average metrics for NP tasks
print("\nAverage Performance for NP Tasks:")
for model, metrics in sorted_NP_metrics:
    print(f"Model: {model}, Average MRR: {metrics['MRR']:.3f}, Average Top1: {metrics['top1']:.3f}, Average Top3: {metrics['top3']:.3f}, Feasible: {metrics['feasible']:.3f}, Hallucination: {metrics['hallu']:.3f}, Accuracy: {metrics['acc']:.3f}")


difficulty: hard, large graphs

Average Performance for P Tasks:
Model: dsR1, Average MRR: 0.926, Average Top1: 0.917, Average Top3: 0.923, Feasible: 0.906, Hallucination: 0.092, Accuracy: 0.877
Model: claude, Average MRR: 0.665, Average Top1: 0.605, Average Top3: 0.668, Feasible: 0.705, Hallucination: 0.295, Accuracy: 0.587
Model: doubao, Average MRR: 0.614, Average Top1: 0.545, Average Top3: 0.617, Feasible: 0.659, Hallucination: 0.340, Accuracy: 0.532
Model: glm, Average MRR: 0.542, Average Top1: 0.469, Average Top3: 0.537, Feasible: 0.570, Hallucination: 0.420, Accuracy: 0.457
Model: gpt4, Average MRR: 0.526, Average Top1: 0.450, Average Top3: 0.512, Feasible: 0.589, Hallucination: 0.373, Accuracy: 0.435
Model: qwen72b, Average MRR: 0.476, Average Top1: 0.399, Average Top3: 0.441, Feasible: 0.557, Hallucination: 0.363, Accuracy: 0.399
Model: gpt4mini, Average MRR: 0.455, Average Top1: 0.373, Average Top3: 0.422, Feasible: 0.569, Hallucination: 0.407, Accuracy: 0.366
Model: llama, A

In [None]:
# difficulty: easy, small graphs

# Average Performance for P Tasks:
# Model: dsR1, Average MRR: 0.982, Average Top1: 0.979, Average Top3: 0.981, Feasible: 0.984, Hallucination: 0.014, Accuracy: 0.976
# Model: claude, Average MRR: 0.848, Average Top1: 0.824, Average Top3: 0.842, Feasible: 0.865, Hallucination: 0.134, Accuracy: 0.822
# Model: doubao, Average MRR: 0.821, Average Top1: 0.794, Average Top3: 0.810, Feasible: 0.848, Hallucination: 0.150, Accuracy: 0.792
# Model: gpt4, Average MRR: 0.805, Average Top1: 0.772, Average Top3: 0.795, Feasible: 0.846, Hallucination: 0.134, Accuracy: 0.769
# Model: glm, Average MRR: 0.764, Average Top1: 0.730, Average Top3: 0.750, Feasible: 0.788, Hallucination: 0.186, Accuracy: 0.727
# Model: gpt4mini, Average MRR: 0.733, Average Top1: 0.692, Average Top3: 0.713, Feasible: 0.828, Hallucination: 0.161, Accuracy: 0.689
# Model: llama, Average MRR: 0.660, Average Top1: 0.615, Average Top3: 0.634, Feasible: 0.719, Hallucination: 0.273, Accuracy: 0.612
# Model: qwen72b, Average MRR: 0.643, Average Top1: 0.590, Average Top3: 0.604, Feasible: 0.748, Hallucination: 0.117, Accuracy: 0.590
# Model: deepseek, Average MRR: 0.572, Average Top1: 0.515, Average Top3: 0.537, Feasible: 0.642, Hallucination: 0.350, Accuracy: 0.514
# Model: mixtral, Average MRR: 0.440, Average Top1: 0.379, Average Top3: 0.386, Feasible: 0.513, Hallucination: 0.464, Accuracy: 0.378
# Model: llama8b, Average MRR: 0.355, Average Top1: 0.286, Average Top3: 0.294, Feasible: 0.445, Hallucination: 0.539, Accuracy: 0.285
# Model: gemma, Average MRR: 0.321, Average Top1: 0.252, Average Top3: 0.257, Feasible: 0.386, Hallucination: 0.614, Accuracy: 0.252

# Average Performance for NP Tasks:
# Model: dsR1, Average MRR: 0.961, Average Top1: 0.949, Average Top3: 0.968, Feasible: 0.972, Hallucination: 0.027, Accuracy: 0.877
# Model: claude, Average MRR: 0.596, Average Top1: 0.507, Average Top3: 0.606, Feasible: 0.744, Hallucination: 0.255, Accuracy: 0.478
# Model: doubao, Average MRR: 0.592, Average Top1: 0.495, Average Top3: 0.614, Feasible: 0.777, Hallucination: 0.217, Accuracy: 0.467
# Model: gpt4, Average MRR: 0.587, Average Top1: 0.495, Average Top3: 0.595, Feasible: 0.770, Hallucination: 0.202, Accuracy: 0.473
# Model: glm, Average MRR: 0.536, Average Top1: 0.437, Average Top3: 0.533, Feasible: 0.788, Hallucination: 0.204, Accuracy: 0.413
# Model: gpt4mini, Average MRR: 0.510, Average Top1: 0.414, Average Top3: 0.502, Feasible: 0.724, Hallucination: 0.255, Accuracy: 0.392
# Model: llama, Average MRR: 0.487, Average Top1: 0.391, Average Top3: 0.463, Feasible: 0.753, Hallucination: 0.245, Accuracy: 0.368
# Model: deepseek, Average MRR: 0.457, Average Top1: 0.351, Average Top3: 0.439, Feasible: 0.741, Hallucination: 0.245, Accuracy: 0.337
# Model: qwen72b, Average MRR: 0.334, Average Top1: 0.226, Average Top3: 0.295, Feasible: 0.651, Hallucination: 0.270, Accuracy: 0.206
# Model: llama8b, Average MRR: 0.305, Average Top1: 0.208, Average Top3: 0.254, Feasible: 0.579, Hallucination: 0.417, Accuracy: 0.202
# Model: mixtral, Average MRR: 0.263, Average Top1: 0.162, Average Top3: 0.201, Feasible: 0.593, Hallucination: 0.374, Accuracy: 0.156
# Model: gemma, Average MRR: 0.244, Average Top1: 0.135, Average Top3: 0.185, Feasible: 0.571, Hallucination: 0.426, Accuracy: 0.129

# difficulty: hard, large graphs

# Average Performance for P Tasks:
# Model: dsR1, Average MRR: 0.926, Average Top1: 0.917, Average Top3: 0.923, Feasible: 0.906, Hallucination: 0.092, Accuracy: 0.877
# Model: claude, Average MRR: 0.665, Average Top1: 0.605, Average Top3: 0.668, Feasible: 0.705, Hallucination: 0.295, Accuracy: 0.587
# Model: doubao, Average MRR: 0.614, Average Top1: 0.545, Average Top3: 0.617, Feasible: 0.659, Hallucination: 0.340, Accuracy: 0.532
# Model: glm, Average MRR: 0.542, Average Top1: 0.469, Average Top3: 0.537, Feasible: 0.570, Hallucination: 0.420, Accuracy: 0.457
# Model: gpt4, Average MRR: 0.526, Average Top1: 0.450, Average Top3: 0.512, Feasible: 0.589, Hallucination: 0.373, Accuracy: 0.435
# Model: qwen72b, Average MRR: 0.476, Average Top1: 0.399, Average Top3: 0.441, Feasible: 0.557, Hallucination: 0.363, Accuracy: 0.399
# Model: gpt4mini, Average MRR: 0.455, Average Top1: 0.373, Average Top3: 0.422, Feasible: 0.569, Hallucination: 0.407, Accuracy: 0.366
# Model: llama, Average MRR: 0.413, Average Top1: 0.325, Average Top3: 0.386, Feasible: 0.493, Hallucination: 0.490, Accuracy: 0.316
# Model: deepseek, Average MRR: 0.341, Average Top1: 0.253, Average Top3: 0.302, Feasible: 0.379, Hallucination: 0.611, Accuracy: 0.247
# Model: mixtral, Average MRR: 0.232, Average Top1: 0.142, Average Top3: 0.170, Feasible: 0.315, Hallucination: 0.672, Accuracy: 0.138
# Model: llama8b, Average MRR: 0.182, Average Top1: 0.097, Average Top3: 0.111, Feasible: 0.189, Hallucination: 0.782, Accuracy: 0.094
# Model: gemma, Average MRR: 0.172, Average Top1: 0.093, Average Top3: 0.095, Feasible: 0.161, Hallucination: 0.840, Accuracy: 0.092

# Average Performance for NP Tasks:
# Model: dsR1, Average MRR: 0.800, Average Top1: 0.726, Average Top3: 0.847, Feasible: 0.898, Hallucination: 0.102, Accuracy: 0.431
# Model: claude, Average MRR: 0.336, Average Top1: 0.176, Average Top3: 0.398, Feasible: 0.519, Hallucination: 0.470, Accuracy: 0.072
# Model: gpt4, Average MRR: 0.288, Average Top1: 0.121, Average Top3: 0.330, Feasible: 0.534, Hallucination: 0.436, Accuracy: 0.063
# Model: doubao, Average MRR: 0.283, Average Top1: 0.120, Average Top3: 0.320, Feasible: 0.493, Hallucination: 0.493, Accuracy: 0.052
# Model: glm, Average MRR: 0.248, Average Top1: 0.092, Average Top3: 0.250, Feasible: 0.505, Hallucination: 0.491, Accuracy: 0.048
# Model: llama, Average MRR: 0.221, Average Top1: 0.090, Average Top3: 0.187, Feasible: 0.450, Hallucination: 0.550, Accuracy: 0.047
# Model: deepseek, Average MRR: 0.215, Average Top1: 0.081, Average Top3: 0.187, Feasible: 0.409, Hallucination: 0.581, Accuracy: 0.031
# Model: gpt4mini, Average MRR: 0.197, Average Top1: 0.065, Average Top3: 0.166, Feasible: 0.397, Hallucination: 0.554, Accuracy: 0.033
# Model: gemma, Average MRR: 0.165, Average Top1: 0.032, Average Top3: 0.144, Feasible: 0.312, Hallucination: 0.688, Accuracy: 0.009
# Model: llama8b, Average MRR: 0.153, Average Top1: 0.045, Average Top3: 0.101, Feasible: 0.241, Hallucination: 0.755, Accuracy: 0.019
# Model: qwen72b, Average MRR: 0.149, Average Top1: 0.034, Average Top3: 0.101, Feasible: 0.214, Hallucination: 0.421, Accuracy: 0.007
# Model: mixtral, Average MRR: 0.142, Average Top1: 0.027, Average Top3: 0.086, Feasible: 0.294, Hallucination: 0.646, Accuracy: 0.016