In [77]:
import os
import pandas as pd

from dynaconf import Dynaconf

from llmcoder.utils import get_config_dir, get_data_dir

In [78]:
# Get all configs
config_dir = get_config_dir()
config_file_list = sorted(file for file in os.listdir(config_dir) if file.endswith(".yaml"))
config_list = [Dynaconf(settings_file=os.path.join(config_dir, config_name)) for config_name in config_file_list]
config_name_list = [os.path.splitext(config_name)[0] for config_name in config_file_list]

for config_name in config_name_list:
    print(config_name)

level_0__ft__mypy__5_steps__1_choice
level_0__ft__mypy_signature_5_steps__1_choice
level_0__ft__mypy_signature_5_steps__3_choices
level_0__ft__mypy_signature_gptscore_5_steps__3_choices
level_0__ft__no_analyzers__1_step
level_0__gpt-3.5__no_analyzers__1_step
level_1__ft__mypy__5_steps__1_choice
level_1__ft__mypy_signature_5_steps__1_choice
level_1__ft__mypy_signature_5_steps__3_choices
level_1__ft__mypy_signature_gptscore_5_steps__3_choices
level_1__ft__no_analyzers__1_step
level_1__gpt-3.5__no_analyzers__1_step
level_2__ft__mypy__5_steps__1_choice
level_2__ft__mypy_signature_5_steps__1_choice
level_2__ft__mypy_signature_5_steps__3_choices
level_2__ft__mypy_signature_gptscore_5_steps__3_choices
level_2__ft__no_analyzers__1_step
level_2__gpt-3.5__no_analyzers__1_step


In [79]:
# /data/name/of/dataset/eval/<config_name>/<run_id>/metrics.csv

In [80]:
# Load all metrics.csv files into a dict
metrics_dict = {}

for config_name, config in zip(config_name_list, config_list):
    dataset = config.get('dataset')
    path_to_eval = os.path.join(get_data_dir(dataset), 'eval', config_name)

    metrics_dict[config_name] = {}

    for run_id in os.listdir(path_to_eval):
        path_to_metrics = os.path.join(path_to_eval, run_id, 'metrics.csv')
        if os.path.exists(path_to_metrics):
            with open(path_to_metrics, 'r') as f:
                metrics_dict[config_name][run_id] = pd.read_csv(f, index_col=0)

In [81]:
# Dimensions: (config, run, example, metric)

In [82]:
metrics_dict['level_0__ft__mypy__5_steps__1_choice']['2024-01-13_16-46-25']

Unnamed: 0,levenshtein_distance_score,bleu_score,trf_similarity_score,sequence_matcher_score,gpt_reviewer_score,loops_required_score,tokens_used_score,agility_score,time_score
0,0,1.0,1.0,1.0,0.0,0,299,0.0,0.510669
1,298,0.111787,0.813752,0.456835,0.0,0,1075,0.0,1.627634
2,255,0.296616,0.693067,0.41953,-4.0,5,3955,0.0,18.539025
3,93,0.381036,0.855461,0.686869,-2.0,0,481,0.0,0.694365


## Average across runs

In [83]:
from functools import reduce

In [84]:
# Create two new dataframes for the mean and std of each entry in the dataframe stored in metrics_dict['<config_name>']['<run_id>'] across all runs
# The index of the new dataframe should be the same as the index of the original dataframe

for k, v in metrics_dict.items():
    # mean
    metrics_dict[k]['mean'] = reduce(lambda a, b: a.add(b, fill_value=0), metrics_dict[k].values()) / len(metrics_dict[k])

    # std
    metrics_dict[k]['std'] = reduce(lambda a, b: a.add(b, fill_value=0), [((metrics_dict[k][run_id] - metrics_dict[k]['mean']) ** 2) for run_id in metrics_dict[k]]) / len(metrics_dict[k])

In [85]:
metrics_dict['level_0__ft__mypy__5_steps__1_choice']['mean']

Unnamed: 0,levenshtein_distance_score,bleu_score,trf_similarity_score,sequence_matcher_score,gpt_reviewer_score,loops_required_score,tokens_used_score,agility_score,time_score
0,0.0,1.0,1.0,1.0,0.0,0.0,299.0,0.0,0.514487
1,298.5,0.109268,0.800504,0.44103,0.5,2.5,3332.0,0.0,20.665795
2,312.0,0.148308,0.380148,0.217517,-7.0,3.0,2701.0,0.5,14.006182
3,93.0,0.381036,0.855461,0.686869,-2.0,0.0,481.0,0.0,0.731782


In [86]:
metrics_dict['level_0__ft__mypy__5_steps__1_choice']['std']

Unnamed: 0,levenshtein_distance_score,bleu_score,trf_similarity_score,sequence_matcher_score,gpt_reviewer_score,loops_required_score,tokens_used_score,agility_score,time_score
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1e-05
1,0.166667,4e-06,0.000117,0.000167,0.166667,4.166667,3396033.0,0.0,241.634399
2,2166.0,0.014664,0.065279,0.027206,6.0,2.666667,1048344.0,0.166667,13.69778
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000933
