## Imports

In [1]:
%load_ext autoreload
%autoreload 2

import os
import pandas as pd
import pickle

from eval import DATA

# Benchmark datasets exposed by the project's `eval` module.
mbpp = DATA["mbpp"]  # splits: train, validation, and test
humaneval = DATA["openai_humaneval"]  # split: test only

# Let wide result frames render up to 100 columns before pandas elides them.
pd.set_option("display.max_columns", 100)

## Helpers

In [2]:
# Directory (relative to the working directory) holding the result CSVs that are
# loaded and combined into a single DataFrame below.
RESULTS_PATH = "./results/"

def save_pickle(object, to):
    """Serialize `object` with pickle and write it to the file path `to`.

    The destination is opened in binary write mode, so an existing file at
    that path is overwritten. Returns None.
    """
    with open(to, mode="wb") as sink:
        pickle.dump(object, sink)
    
def load_pickle(from_):
    """Read the file at path `from_` and return the object unpickled from it.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    produced by a trusted source (e.g. written by `pickle.dump` locally).
    """
    with open(from_, mode="rb") as source:
        restored = pickle.load(source)
    return restored


## Load and Combine Results

In [3]:
# Collect every results CSV under RESULTS_PATH.
# os.listdir() returns names in arbitrary order, so sort them to keep the row
# order of `df` (and therefore the head/tail displayed below) reproducible.
result_file_names = sorted(
    file_name
    for file_name in os.listdir(RESULTS_PATH)
    if file_name.endswith(".csv")
)

# Stack all result files into one frame. "Unnamed: 0" is dropped because it is
# not an analysis column (presumably the index written out with each CSV —
# TODO confirm against the code that produces the results).
df = pd.concat([
    pd.read_csv(os.path.join(RESULTS_PATH, file_name))
    for file_name in result_file_names
]).drop(columns="Unnamed: 0")

# Sanity check: number of rows per (dataset, split, model) combination.
print(df.value_counts(["dataset", "split", "model"]))  # Counts
df

dataset           split       model                
mbpp              test        canonical_solution       500
                              claude_3_haiku_0_shot    500
                  train       canonical_solution       374
openai_humaneval  test        canonical_solution       164
                              claude_3_haiku_0_shot    164
mbpp              validation  canonical_solution        90
Name: count, dtype: int64


Unnamed: 0,dataset,split,task_id,model,code,result,avg_test_time,passed_tests,compiled,loc,lloc,sloc,comments,multi,blank,single_comments,CC,h1,h2,N1,N2,vocabulary,length,calculated_length,volume,difficulty,effort,time,bugs,MI
0,mbpp,test,0,canonical_solution,# Write a python function to remove first and ...,passed,0.000058,True,True,11,12,10,1,0,0,1,5,4,10,9,16,14,25,41.219281,95.183873,3.2,304.588394,16.921577,0.031728,79.749780
1,mbpp,test,1,canonical_solution,# Write a function to sort a given matrix in a...,passed,0.000129,True,True,4,3,3,1,0,0,1,1,0,0,0,0,0,0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,100.000000
2,mbpp,test,2,canonical_solution,# Write a function to count the most common wo...,passed,0.000197,True,True,6,5,5,1,0,0,1,1,0,0,0,0,0,0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,100.000000
3,mbpp,test,3,canonical_solution,# Write a python function to find the volume o...,passed,0.000033,True,True,3,2,2,1,0,0,1,1,2,6,3,6,8,9,17.509775,27.000000,1.0,27.000000,1.500000,0.009000,100.000000
4,mbpp,test,4,canonical_solution,# Write a function to split a string at lowerc...,passed,0.000057,True,True,4,3,3,1,0,0,1,1,0,0,0,0,0,0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,100.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159,openai_humaneval,test,159,claude_3_haiku_0_shot,"def eat(number_eaten, needed, remaining):\n ...",failed: Error,,False,True,15,5,4,0,9,2,0,1,2,4,2,4,6,6,10.000000,15.509775,1.0,15.509775,0.861654,0.005170,77.260164
160,openai_humaneval,test,160,claude_3_haiku_0_shot,"def do_algebra(operator, operand):\n """"""\n ...",passed,0.000112,True,True,23,8,7,0,10,6,0,4,5,10,5,10,15,15,44.828921,58.603359,2.5,146.508397,8.139355,0.019534,85.425318
161,openai_humaneval,test,161,claude_3_haiku_0_shot,"def solve(s):\n """"""\n Reverses the case ...",passed,0.000084,True,True,14,12,10,0,4,0,0,4,2,2,2,2,4,4,4.000000,8.000000,1.0,8.000000,0.444444,0.002667,97.852090
162,openai_humaneval,test,162,claude_3_haiku_0_shot,import hashlib\n\ndef string_to_md5(text: str)...,passed,0.000064,True,True,12,6,5,0,5,2,0,2,1,1,1,1,2,2,0.000000,2.000000,0.5,1.000000,0.055556,0.000667,100.000000


## Analysis

### Correctness Metrics

In [8]:
# Keys identifying one evaluated configuration.
GROUPING = ["dataset", "split", "model"]

# Both columns are booleans, so their per-group mean is the fraction of
# solutions that compiled / passed their tests.
correctness_columns = ["compiled", "passed_tests"]
df.groupby(GROUPING)[correctness_columns].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,compiled,passed_tests
dataset,split,model,Unnamed: 3_level_1,Unnamed: 4_level_1
mbpp,test,canonical_solution,1.0,1.0
mbpp,test,claude_3_haiku_0_shot,1.0,0.212
mbpp,train,canonical_solution,1.0,1.0
mbpp,validation,canonical_solution,1.0,1.0
openai_humaneval,test,canonical_solution,1.0,1.0
openai_humaneval,test,claude_3_haiku_0_shot,1.0,0.646341


### Efficiency & Simplicity Metrics

In [32]:
# Human-readable labels for the code-metric columns of the results frame,
# keyed by column name. Grouped as: raw line counts, cyclomatic complexity,
# Halstead metrics, and maintainability index.
friendly_metric_names = {
    # Raw line counts
    "loc": "# total lines of code",
    "lloc": "# logical lines of code",
    "sloc": "# source lines of code",
    "comments": "# single-line (#) comment lines",
    "multi": "# multi-line strings lines",
    "blank": "# white-space only lines",
    "single_comments": "# single-line (#) comment-only lines",
    # Complexity (label typo "Cylomatic" corrected)
    "CC": "Cyclomatic Complexity",
    # Halstead metrics
    "h1": "# distinct operators",
    "h2": "# distinct operands",
    "N1": "# total operators",
    "N2": "# total operands",
    "vocabulary": "Vocabulary size (distinct operators + operands)",
    "length": "Program length (total operators + operands)",
    "calculated_length": "Halstead estimated program length",
    "volume": "Halstead volume",
    "difficulty": "Halstead difficulty",
    "effort": "Halstead effort",
    "time": "Halstead time required to program",
    "bugs": "Halstead estimated bugs",
    # Maintainability
    "MI": "Maintainability Index"
}

In [9]:
# Keep only rows from the "test" split, then separate them per benchmark.
is_test_split = df["split"].eq("test")
test_results_df = df.loc[is_test_split]

mbpp_test_results_df = test_results_df.loc[
    test_results_df["dataset"].eq("mbpp")
]
humaneval_test_results_df = test_results_df.loc[
    test_results_df["dataset"].eq("openai_humaneval")
]

In [19]:
model = "claude_3_haiku_0_shot"  # Enter model to analyze

# Find tasks model got correct.
# `@model` lets pandas read the Python variable directly, so the query does not
# break if the model name ever contains a quote character.
humaneval_test_correct_tasks = set(
    humaneval_test_results_df
    .query("model == @model and passed_tests")
    ["task_id"]
)
mbpp_test_correct_tasks = set(
    mbpp_test_results_df
    .query("model == @model and passed_tests")
    ["task_id"]
)

# Filter to only tasks model got correct.
# Rows of *every* model (including canonical_solution) are kept for those
# tasks, so the models can be compared on the same set of problems.
# `isin` always yields a boolean mask (even for an empty frame) and avoids the
# element-wise lambda that shadowed the builtin `id`.
model_correct_humaneval_df = humaneval_test_results_df[
    humaneval_test_results_df["task_id"].isin(humaneval_test_correct_tasks)
]
model_correct_mbpp_df = mbpp_test_results_df[
    mbpp_test_results_df["task_id"].isin(mbpp_test_correct_tasks)
]


#### Means

In [28]:
# Columns from position 9 onward are taken as the code-metric columns
# (positional slice: it relies on the column order of the results frame).
simplicity_metrics = df.columns[9:]
successful_solution_metrics = ["avg_test_time", *simplicity_metrics]


def _mean_metrics_by_model(correct_df):
    """Average each analyzed metric per model; metrics as rows, models as columns."""
    per_model = correct_df.groupby("model")[successful_solution_metrics]
    return per_model.mean().transpose()


humaneval_means = _mean_metrics_by_model(model_correct_humaneval_df)
mbpp_means = _mean_metrics_by_model(model_correct_mbpp_df)

print("HumanEval:")
print(humaneval_means)
print("")
print("MBPP:")
mbpp_means

HumanEval:
model              canonical_solution  claude_3_haiku_0_shot
avg_test_time                0.000630               0.000507
loc                         20.886792              16.339623
lloc                         8.839623               7.830189
sloc                         7.735849               6.933962
comments                     0.018868               0.056604
multi                       10.188679               7.047170
blank                        2.943396               2.254717
single_comments              0.018868               0.103774
CC                           3.575472               3.462264
h1                           2.679245               2.915094
h2                           5.981132               5.509434
N1                           4.122642               3.839623
N2                           8.018868               7.254717
vocabulary                   8.660377               8.424528
length                      12.141509              11.094340
calculated_le

model,canonical_solution,claude_3_haiku_0_shot
avg_test_time,0.049483,0.044969
loc,9.754717,12.235849
lloc,8.783019,7.792453
sloc,8.698113,7.283019
comments,1.0,0.084906
multi,0.0,2.990566
blank,0.056604,1.867925
single_comments,1.0,0.09434
CC,3.339623,2.962264
h1,2.660377,2.490566


#### Percent Change

(Final metric shown in paper)

In [31]:
def _relative_change(means):
    """Fractional change of `model`'s mean versus the canonical solution's mean.

    A metric whose canonical mean is 0 divides by zero and shows up as inf
    (or NaN when both means are 0).
    """
    baseline = means["canonical_solution"]
    return (means[model] - baseline) / baseline


humaneval_percent_deltas = _relative_change(humaneval_means)
mbpp_percent_deltas = _relative_change(mbpp_means)

# Report as percentages: round the fraction to 3 decimals, then scale by 100.
print("HumanEval:")
print(humaneval_percent_deltas.round(3) * 100)
print("")
print("MBPP:")
print(mbpp_percent_deltas.round(3) * 100)

HumanEval:
avg_test_time        -19.5
loc                  -21.8
lloc                 -11.4
sloc                 -10.4
comments             200.0
multi                -30.8
blank                -23.4
single_comments      450.0
CC                    -3.2
h1                     8.8
h2                    -7.9
N1                    -6.9
N2                    -9.5
vocabulary            -2.7
length                -8.6
calculated_length     -5.1
volume                -9.8
difficulty             6.1
effort                -4.5
time                  -4.5
bugs                  -9.8
MI                     0.4
dtype: float64

MBPP:
avg_test_time          -9.1
loc                    25.4
lloc                  -11.3
sloc                  -16.3
comments              -91.5
multi                   inf
blank                3200.0
single_comments       -90.6
CC                    -11.3
h1                     -6.4
h2                     -7.1
N1                    -10.2
N2                    -10.0
vocabular