In [1]:
# Read the resulting conversations
# Print the input code from the user
# Print the ground truth
# Print the LLMcoder completions

# Comment on the results, analyze them, and list failure modes
# -> Summarize later (using GPT?)

# Store?
# comments.json for our comments and analysis
# /data/LLMcoder-Eval/eval_curated/comments....json
# {
#     "<results_id>": "<comment>",
#     ...
# }

In [2]:
import json
import os

from llmcoder.utils import get_data_dir

In [3]:
# Directory of the curated evaluation set, as returned by the project helper.
# Every file in this notebook is read from or written below this directory.
DATA_DIR = get_data_dir('LLMcoder-Eval/eval_curated')
print(DATA_DIR)

/home/psaegert/Projects/23ws-LLMcoder/data/LLMcoder-Eval/eval_curated


In [4]:
# Three results for now:
# 1) plain GPT-3.5-Turbo
# 2) GPT-3.5-Turbo + custom system prompt
# 3) Fine-Tuned GPT-3.5-Turbo

In [5]:
# File names, relative to DATA_DIR, of the two inputs of this analysis:
# - GROUND_TRUTH_FILE: JSON Lines file with the reference conversations
# - RESULTS_FILE: JSON file with the completions to be reviewed
GROUND_TRUTH_FILE = "conversations.jsonl"
RESULTS_FILE = "results_baseline_eval-curated_2023-12-01_15-16-45.json"

In [6]:
# Load the inputs and ground truth from the ground truth file.
# The file is in JSON Lines format: one conversation object per line.
with open(os.path.join(DATA_DIR, GROUND_TRUTH_FILE), "r", encoding="utf-8") as f:
    # Iterate over the file directly instead of materializing it with readlines(),
    # and skip blank lines (e.g. a trailing newline) that json.loads would reject.
    ground_truth_file = [json.loads(line) for line in f if line.strip()]

# Extract the input code and ground truth from the ground truth file
# The ground truth file is a list of conversations
# The input is the message with "role": "user"
# The ground truth is the message with "role": "assistant"
# Messages with any other role are ignored.

inputs = []
ground_truths = []
for conversation in ground_truth_file:
    for message in conversation['messages']:
        if message["role"] == "user":
            inputs.append(message["content"])
        elif message["role"] == "assistant":
            ground_truths.append(message["content"])

print(f'Inputs: {len(inputs)}')
print(f'Ground truth: {len(ground_truths)}')

Inputs: 90
Ground truth: 90


In [7]:
# Load the results: a JSON object mapping each result id to its conversation.
# The encoding is given explicitly so that non-ASCII characters in the completions
# are decoded the same way regardless of the platform's default locale.
with open(os.path.join(DATA_DIR, RESULTS_FILE), 'r', encoding='utf-8') as f:
    results = json.load(f)

In [8]:
# Create a unified dictionary of inputs, ground truth and results
# The keys are the results ids
# The values are dictionaries with keys:
# - input
# - ground_truth
# - completion (content of the last message of the result conversation)
#
# The three sources are paired purely by position, so they must have the same
# length and order. zip() would silently drop the surplus entries, so warn instead.
if not (len(inputs) == len(ground_truths) == len(results)):
    print(
        f'Warning: length mismatch (inputs: {len(inputs)}, '
        f'ground truth: {len(ground_truths)}, results: {len(results)}); '
        'surplus entries are ignored'
    )

results_dict = {}
# `user_input` rather than `input`, so the builtin input() is not shadowed
# in the notebook namespace after this cell has run.
for user_input, ground_truth, (result_key, result_value) in zip(inputs, ground_truths, results.items()):
    results_dict[result_key] = {
        'input': user_input,
        'ground_truth': ground_truth,
        'completion': result_value['messages'][-1]["content"]
    }

In [9]:
def add_comment(results_dict, result_id, comment, force=False):
    """Attach a comment to one entry of ``results_dict``.

    Parameters
    ----------
    results_dict : dict
        Mapping from result id to a dictionary describing that result.
        The entry for ``result_id`` is modified in place.
    result_id : hashable
        Key of the entry to comment on. Must exist in ``results_dict``.
    comment : str
        The comment to store under the entry's ``'comment'`` key.
    force : bool, optional
        If True, overwrite an existing non-empty comment. If False (default),
        an existing non-empty comment is kept and a warning is printed.

    Raises
    ------
    KeyError
        If ``result_id`` is not a key of ``results_dict``.
    """
    # Entries may not have a 'comment' key yet; treat a missing key like an
    # empty comment instead of raising KeyError on the first call.
    existing_comment = results_dict[result_id].get('comment', '')

    if existing_comment != '' and not force:
        print(f'Warning: comment already exists for {result_id}:\n {existing_comment}')
    else:
        results_dict[result_id]['comment'] = comment

In [10]:
# Write the input, ground truth and completion of every result to
# <DATA_DIR>/comparison/result_<key>/{input,ground_truth,completion}.py
comparison_dir = os.path.join(DATA_DIR, "comparison")

for key, value in results_dict.items():
    # One subfolder per result; makedirs also creates the parent "comparison"
    # folder, so no separate call is needed for it.
    result_dir = os.path.join(comparison_dir, f"result_{key}")
    os.makedirs(result_dir, exist_ok=True)

    # These files are regenerated (overwritten) on every run.
    # The explicit encoding keeps non-ASCII code intact on any platform.
    for field in ("input", "ground_truth", "completion"):
        with open(os.path.join(result_dir, f"{field}.py"), "w", encoding="utf-8") as f:
            f.write(value[field])

    # Add an empty comment file if it doesn't exist.
    # An existing comment file is left untouched.
    comment_path = os.path.join(result_dir, "comment.txt")
    if not os.path.exists(comment_path):
        with open(comment_path, "w", encoding="utf-8") as f:
            f.write("")