In [None]:
import os, sys
sys.path.append('..')
os.chdir('..')
import json
from collections import defaultdict
from tasks import *
import networkx as nx 
import numpy as np

In [1]:
# --- Run configuration -------------------------------------------------------
path = 'final_results/np_hard'  # 'np_hard' or 'p_easy' or 'p_hard'
difficulty = path.rsplit('_', 1)[-1]  # text after the last underscore of the path
dataset_loc = 'dataset'
problem_num = 500

# --- Merge every result file into merged_responses[task][problem_index][llm] --
files = os.listdir(path)
merged_responses = {}

for filename in files:
    name_parts = filename.split('_')
    if len(name_parts) < 2:
        # The name has no underscore, so it cannot be split into "<llm>_<task>...".
        continue
    llm, task = name_parts[0], name_parts[1]

    with open(f'{path}/{filename}', 'r') as fh:
        response_dict = json.load(fh)

    # One defaultdict per task, shared by every LLM file that reports on it.
    per_problem = merged_responses.setdefault(task, defaultdict(dict))
    for i in range(problem_num):
        per_problem[i][llm] = response_dict[str(i)][llm]

task_list = list(merged_responses)

In [3]:
# Compare every LLM's TSP answers against each approximate-solver baseline and
# print, per baseline method, the fraction of problems on which each model is
# worse than / equal to / better than that baseline.
task_name = 'TSP'
less_is_better = ['GED', 'TSP', 'MVC', 'Distance']  # not read in this cell; kept in case other cells use it

# The task object and its dataset do not depend on the baseline method, so they
# are built once here rather than once per method.
task = globals()[task_name + '_Task'](dataset_loc)
task.load_dataset(difficulty)

for method in ['chris', 'greedy', 'random']:
    # score[task_name][i] maps 'gt' -> baseline value and each llm -> its checked score.
    score = {task_name: defaultdict(dict)}
    for i in range(problem_num):
        # The baseline ('gt') is element 0 of approx_solver's result for this
        # method (presumably the tour length -- confirm against the task class).
        # The exact answer was previously written to 'gt' first and overwritten
        # on the very next line; that dead store has been dropped.
        score[task_name][i]['gt'] = task.approx_solver(task.problem_set[i]['graph'], method=method)[0]
        for llm, r in merged_responses[task_name][i].items():
            if llm == 'problem':
                continue
            if r is None:
                # Missing response: report it and score an empty answer instead.
                r = ''
                print(i, llm, task_name)
            score[task_name][i][llm] = task.check_solution(i, r)

    metrics = defaultdict(dict)
    results = []
    # Every key of the first problem except the baseline is a model name.
    model_list = [name for name in score[task_name][0] if name != 'gt']
    for model in model_list:
        outcome = {'worse': [], 'equal': [], 'better': []}
        metrics[task_name][model] = outcome
        for i in range(problem_num):
            llm_score = score[task_name][i][model]
            baseline = score[task_name][i]['gt']
            # A negative score always counts as worse (presumably check_solution's
            # marker for an invalid answer -- confirm).
            outcome['worse'].append(llm_score < 0 or llm_score > baseline)
            outcome['equal'].append(llm_score == baseline)
            outcome['better'].append(0 <= llm_score < baseline)
        avg_worse = sum(outcome['worse']) / problem_num
        avg_equal = sum(outcome['equal']) / problem_num
        avg_better = sum(outcome['better']) / problem_num
        results.append((task_name, model, avg_worse, avg_equal, avg_better))

    # Rank models by how rarely they lose to the baseline (ascending 'worse').
    sorted_results = defaultdict(list)
    sorted_results[task_name] = sorted(results, key=lambda row: row[2])

    # Print sorted results for each task
    for name, rows in sorted_results.items():
        print(f"\nTask: {name}, method: {method}")
        for row in rows:
            print(f"Model: {row[1]}, worse: {row[2]:.3f}, equal: {row[3]:.3f}, better: {row[4]:.3f}")


Task: TSP, method: chris
Model: gpt4, worse: 0.942, equal: 0.004, better: 0.054
Model: deepseek, worse: 0.978, equal: 0.000, better: 0.022
Model: claude, worse: 0.994, equal: 0.000, better: 0.006
Model: llama, worse: 0.994, equal: 0.000, better: 0.006
Model: gpt, worse: 0.998, equal: 0.000, better: 0.002
Model: qwen7b, worse: 1.000, equal: 0.000, better: 0.000
Model: qwen, worse: 1.000, equal: 0.000, better: 0.000
Model: llama8b, worse: 1.000, equal: 0.000, better: 0.000
Model: mixtral, worse: 1.000, equal: 0.000, better: 0.000
Model: gemma, worse: 1.000, equal: 0.000, better: 0.000

Task: TSP, method: greedy
Model: gpt4, worse: 0.880, equal: 0.028, better: 0.092
Model: deepseek, worse: 0.954, equal: 0.002, better: 0.044
Model: llama, worse: 0.986, equal: 0.000, better: 0.014
Model: claude, worse: 0.990, equal: 0.000, better: 0.010
Model: gpt, worse: 0.996, equal: 0.000, better: 0.004
Model: llama8b, worse: 0.998, equal: 0.000, better: 0.002
Model: qwen7b, worse: 1.000, equal: 0.000, 

In [6]:
# Run the classic (exact) solver over the dataset in parallel.
# NOTE(review): the chdir below is relative and not idempotent. The first cell
# of this notebook also runs os.chdir('..'), so executing both cells in one
# kernel (or re-running this cell) moves the working directory up again. The
# FileNotFoundError for 'dataset/GED_hard.pkl' recorded under this cell may be
# a symptom of that -- confirm the intended working directory.
import os, sys
os.chdir('..')
sys.path.append('..')  # relative entry: it is interpreted against the current working directory
import multiprocessing as mp
from tasks import *
from openai import OpenAI
import networkx as nx
import random
import numpy as np

import argparse
import fast_tsp
import time
from functools import partial

def process_problem(problem, result_dict):
    """Solve one problem exactly and record the outcome under its id.

    Args:
        problem: Mapping with a 'graph' entry (unpacked as the positional
            arguments of the solver) and an 'id' entry.
        result_dict: Mutable mapping that receives
            ``{'exact_answer': ..., 'path': ...}`` keyed by the problem id.

    Note:
        Reads the module-level ``task`` object; it must exist in the process
        that runs this function.
    """
    # The solver's result is unpacked as an (answer, path) pair.
    exact_answer, path = task.exact_solver(*problem['graph'])
    print(f"Processed problem {problem['id']}")
    solved = {'exact_answer': exact_answer, 'path': path}
    result_dict[problem['id']] = solved

def save_results(task, result_dict):
    """Copy solver results into ``task.problem_set`` and persist the dataset.

    Args:
        task: Object exposing ``problem_set`` (a list of dicts that each have
            an 'id' entry) and ``save_dataset(difficulty)``.
        result_dict: Mapping from problem id to a dict with 'exact_answer'
            and 'path' entries.

    Raises:
        KeyError: If a result refers to an id that is not in
            ``task.problem_set``.
    """
    # Build the id -> index lookup once instead of scanning problem_set for
    # every result. setdefault keeps the first occurrence of a duplicated id,
    # which is the entry a front-to-back scan would have found.
    index_by_id = {}
    for idx, prob in enumerate(task.problem_set):
        index_by_id.setdefault(prob['id'], idx)

    for problem_id, result in result_dict.items():
        if problem_id not in index_by_id:
            raise KeyError(f"No problem with id {problem_id!r} in task.problem_set")
        entry = task.problem_set[index_by_id[problem_id]]
        entry['exact_answer'] = result['exact_answer']
        entry['path'] = result['path']

    task.save_dataset('hard')
    print("Results saved.")

if __name__ == '__main__':
    # Solve the first problems of the GED "hard" dataset in parallel and
    # checkpoint the partial results to disk while the workers are running.
    task = GED_Task('dataset')
    task.load_dataset('hard')

    max_problems = 500        # only the first 500 problems are solved
    poll_seconds = 10         # how often the job status is checked
    checkpoint_seconds = 600  # save partial results every 10 minutes

    # Determine the number of CPU cores to use
    num_cores = mp.cpu_count()

    # Create a manager for shared objects
    manager = mp.Manager()
    result_dict = manager.dict()

    # Create a pool of worker processes
    pool = mp.Pool(processes=num_cores)
    try:
        # Keep the AsyncResult handles so completion and failures can be
        # observed through the public API instead of the private pool._cache.
        jobs = [
            pool.apply_async(process_problem, args=(problem, result_dict))
            for problem in task.problem_set[:max_problems]
        ]
        # No further work will be submitted.
        pool.close()

        # Monitor the jobs and save results every checkpoint_seconds
        start_time = time.time()
        while not all(job.ready() for job in jobs):
            time.sleep(poll_seconds)
            if time.time() - start_time >= checkpoint_seconds:
                save_results(task, result_dict)
                start_time = time.time()

        # Wait for the worker processes to exit
        pool.join()
    except BaseException:
        # Interrupted or failed while monitoring: do not leave workers running.
        pool.terminate()
        pool.join()
        raise

    # Worker exceptions used to vanish silently; report how many jobs failed
    # while still saving whatever was solved.
    failed = sum(1 for job in jobs if not job.successful())
    if failed:
        print(f"Warning: {failed} of {len(jobs)} problems raised an exception in a worker.")

    # Final save
    save_results(task, result_dict)

    print("Processing complete.")

FileNotFoundError: [Errno 2] No such file or directory: 'dataset/GED_hard.pkl'