In [1]:
# GitHub Copilot completions are stored in
# level_<N>/eval/level_<N>__github_copilot/<RUN>/completions/completion*.txt

# For every run, create level_<N>/eval/level_<N>__github_copilot/<RUN>/results.json with
# "0": {
#         "messages": [
#             {
#                 "role": "user",
#                 "content": "<USER_CODE>"
#             },
#             {
#                 "role": "assistant",
#                 "content": "<COMPLETION>"
#             }
#         ],
#         "analyzer_results": [
#             <ANALYZER_RESULTS>
#         ],
#         "log": "",
#         "time": -1,
#         "n_tokens_generated": <NUMBER_OF_COMPLETION_TOKENS>
#     },
#
# for each completion<K>.txt of the run and the matching user code in level_<N>/pairs/pair<K>/input.txt (stored under the 0-indexed key K - 1)

In [2]:
import json
import os
import re
from tqdm import tqdm
import datetime

from llmcoder.utils import get_data_dir
from llmcoder.analyze import MypyAnalyzer, GPTScoreAnalyzer

import tiktoken

### Format Results

In [3]:
# Dataset levels to process; each one is addressed below as the 'level_<N>' directory of LLMcoder-Eval.
levels = [0, 1, 2]

# Tokenizer used further down to count the tokens of every completion ("n_tokens_generated").
enc = tiktoken.get_encoding('p50k_base')

In [4]:
# Collect, per level and per run, every Copilot completion together with the
# user input of the pair it belongs to.
# Both dictionaries are laid out as <dict>['level_<N>'][<run>]['<0-indexed pair id>'].
user_inputs = {f'level_{level}': {} for level in levels}
completions = {f'level_{level}': {} for level in levels}

# The dot is escaped and the whole file name must match, so stray files such as
# "completion3.txt.bak" or "completions.zip" are skipped instead of crashing
# on a failed match.
completion_pattern = re.compile(r'completion(\d+)\.txt')

for level in levels:
    level_key = f'level_{level}'
    eval_parts = ('LLMcoder-Eval', level_key, 'eval', f'{level_key}__github_copilot')

    for run in os.listdir(get_data_dir(*eval_parts)):
        completions_dir = get_data_dir(*eval_parts, run, 'completions')
        if not os.path.isdir(completions_dir):
            # Not a run directory (e.g. a stray file next to the runs).
            continue

        run_completions = {}
        run_user_inputs = {}

        for file_name in os.listdir(completions_dir):
            name_match = completion_pattern.fullmatch(file_name)
            if name_match is None:  # Skip other files (results.json for example)
                continue
            # File and pair names are 1-indexed, the result keys are 0-indexed.
            pair_index = int(name_match.group(1)) - 1

            with open(get_data_dir(*eval_parts, run, 'completions', file_name), encoding='utf-8') as f:
                run_completions[str(pair_index)] = f.read()

            with open(get_data_dir('LLMcoder-Eval', level_key, 'pairs', f'pair{pair_index + 1}', 'input.txt'), encoding='utf-8') as f:
                run_user_inputs[str(pair_index)] = f.read()

        # Sort the completions and the user inputs by id (numerically, so that "10" follows "9").
        completions[level_key][run] = dict(sorted(run_completions.items(), key=lambda item: int(item[0])))
        user_inputs[level_key][run] = dict(sorted(run_user_inputs.items(), key=lambda item: int(item[0])))

In [5]:
# Build the results.json payload of every run: one entry per pair holding the
# conversation (user input + Copilot completion), the analyzer verdicts and the
# number of tokens in the completion.
json_results = {f'level_{level}': {} for level in levels}

for level in levels:
    level_key = f'level_{level}'

    for run, run_completions in completions[level_key].items():
        # Register the dict before filling it so that partial results survive
        # an exception raised by one of the analyzers.
        run_results = json_results[level_key][run] = {}

        # `pair_id` is already the string key; the name also avoids shadowing
        # the built-in `id` in the kernel namespace.
        for pair_id, user_input in tqdm(user_inputs[level_key][run].items()):
            completion = run_completions[pair_id]

            run_results[pair_id] = {
                "messages": [
                    {
                        "role": "user",
                        "content": user_input
                    },
                    {
                        "role": "assistant",
                        "content": completion
                    }
                ],
                "analyzer_results": [
                    {
                        # A fresh analyzer instance is created for every sample, as before.
                        "mypy_analyzer_v1": MypyAnalyzer().analyze(input=user_input, completion=completion),
                        "gpt_score_analyzer_v1": GPTScoreAnalyzer(verbose=True).analyze(input=user_input, completion=completion)
                    }
                ],
                "log": "",
                "time": -1,  # not measured for these completions
                "n_tokens_generated": len(enc.encode(completion))
            }

100%|██████████| 4/4 [00:06<00:00,  1.52s/it]
100%|██████████| 9/9 [00:23<00:00,  2.65s/it]
100%|██████████| 11/11 [00:19<00:00,  1.75s/it]


In [6]:
# Save results: write one results.json into every run directory.
# NOTE(review): `current_date` is not referenced by any visible cell; kept in case a later cell uses it.
current_date = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
for level in levels:
    level_key = f'level_{level}'
    eval_parts = ('LLMcoder-Eval', level_key, 'eval', f'{level_key}__github_copilot')

    for run in completions[level_key]:
        os.makedirs(get_data_dir(*eval_parts, run), exist_ok=True)
        # ensure_ascii=False emits non-ASCII characters verbatim, so the file
        # must be opened as UTF-8 explicitly; the platform default encoding
        # may be unable to represent them.
        with open(get_data_dir(*eval_parts, run, 'results.json'), 'w', encoding='utf-8') as f:
            json.dump(json_results[level_key][run], f, indent=4, ensure_ascii=False)

### Metrics

In [14]:
from llmcoder.eval.evaluate import Metrics
from dynaconf import Dynaconf

# Levels whose results are scored in this section.
# NOTE(review): repeats the earlier definition, presumably so the Metrics cells can run on a fresh kernel — confirm.
levels = [0, 1, 2]

In [15]:
def _copilot_metrics_config(level: int) -> Dynaconf:
    """Create the Dynaconf settings object for the Copilot results of one level.

    Only the dataset name and the settings file name depend on ``level``;
    every other keyword argument is the same for all levels.
    """
    return Dynaconf(
        environments=True,
        analyzers=["mypy_analyzer_v1", "signature_analyzer_v1", "gpt_score_analyzer_v1"],
        model_first="ft:gpt-3.5-turbo-1106:personal::8LCi9Q0d",
        model_feedback="ft:gpt-3.5-turbo-1106:personal::8LCi9Q0d",
        feedback_variant="coworker",
        system_prompt="2023-11-15_GPT-Builder.txt",
        dataset=f"LLMcoder-Eval/level_{level}",
        max_iter=5,
        log_conversation=True,
        scores=[
            "extrinsic.levenshtein_distance_score",
            "extrinsic.bleu_score",
            "extrinsic.trf_similarity_score",
            "extrinsic.sequence_matcher_score",
            "extrinsic.gpt_reviewer_score",
            "intrinsic.loops_required_score",
            "intrinsic.tokens_used_score",
            "intrinsic.agility_score",
            "intrinsic.time_score",
            "intrinsic.all_analyzers_passed_score",
            "intrinsic.total_tokens_generated_score",
        ],
        n_choices=3,
        n_procs=3,
        backtracking=True,
        settings_file=[f'level_{level}__github_copilot.yaml'],
    )


# One settings object per level, in the order given by `levels`.
configs = [_copilot_metrics_config(level) for level in levels]

In [16]:
# Run the metrics evaluation once per level configuration.
# NOTE(review): store=True presumably persists the computed scores and index=None
# presumably selects every stored result — confirm against Metrics.run.
for config in configs:
    metrics = Metrics(config)
    metrics.run(store=True, index=None, verbose=True)

Set up metrics with configurations:
	- level_0__github_copilot.yaml
Analyzing results for configuration: level_0__github_copilot.yaml


Analysis: total_tokens_generated_score: 100%|██████████| 4/4 [00:20<00:00,  5.10s/it]


Set up metrics with configurations:
	- level_1__github_copilot.yaml
Analyzing results for configuration: level_1__github_copilot.yaml


Analysis: total_tokens_generated_score: 100%|██████████| 9/9 [00:43<00:00,  4.88s/it]


Set up metrics with configurations:
	- level_2__github_copilot.yaml
Analyzing results for configuration: level_2__github_copilot.yaml


Analysis: total_tokens_generated_score: 100%|██████████| 11/11 [00:59<00:00,  5.44s/it]
