## Imports

In [117]:
%load_ext autoreload
%autoreload 2

import os
import pandas as pd
import pickle

from eval import DATA

# Let wide result tables render up to 100 columns before pandas truncates them.
pd.set_option("display.max_columns", 100)

# Benchmark datasets looked up by name in `DATA` (imported from `eval`).
mbpp = DATA["mbpp"]  # train, validation, and test
humaneval = DATA["openai_humaneval"]  # test only

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Helpers

In [118]:
# Directory that is scanned for result CSV files when building `df`.
RESULTS_PATH = "./results/"

def save_pickle(object, to):
    """Pickle `object` and write it to the file at path `to`.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten. Returns None.
    """
    with open(to, "wb") as sink:
        pickle.Pickler(sink).dump(object)
    
def load_pickle(from_):
    """Open the file at path `from_` in binary mode and return the unpickled object.

    NOTE: unpickling can execute arbitrary code, so only load files you trust.
    """
    with open(from_, "rb") as source:
        loaded = pickle.Unpickler(source).load()
    return loaded


## Load and Combine Results

In [119]:
# Collect every CSV in RESULTS_PATH. os.listdir returns entries in arbitrary
# order, so the names are sorted to make the row order of `df` reproducible.
result_file_names = sorted(
    file_name
    for file_name in os.listdir(RESULTS_PATH)
    if file_name.endswith(".csv")
)

# Stack all result files into one frame. Each file keeps its own row labels
# (no ignore_index): later cells pair Claude rows with canonical rows by label.
# "Unnamed: 0" is dropped — presumably the row index saved when the CSVs were
# written; confirm against the code that produces them.
df = pd.concat([
    pd.read_csv(os.path.join(RESULTS_PATH, file_name))
    for file_name in result_file_names
]).drop(columns="Unnamed: 0")
df

Unnamed: 0,dataset,split,task_id,model,code,result,avg_test_time,passed_tests,compiled,loc,lloc,sloc,comments,multi,blank,single_comments,CC,h1,h2,N1,N2,vocabulary,length,calculated_length,volume,difficulty,effort,time,bugs,MI
0,mbpp,test,0,canonical_solution,# Write a python function to remove first and ...,passed,0.000058,True,True,11,12,10,1,0,0,1,5,4,10,9,16,14,25,41.219281,95.183873,3.2,304.588394,16.921577,0.031728,79.749780
1,mbpp,test,1,canonical_solution,# Write a function to sort a given matrix in a...,passed,0.000129,True,True,4,3,3,1,0,0,1,1,0,0,0,0,0,0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,100.000000
2,mbpp,test,2,canonical_solution,# Write a function to count the most common wo...,passed,0.000197,True,True,6,5,5,1,0,0,1,1,0,0,0,0,0,0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,100.000000
3,mbpp,test,3,canonical_solution,# Write a python function to find the volume o...,passed,0.000033,True,True,3,2,2,1,0,0,1,1,2,6,3,6,8,9,17.509775,27.000000,1.0,27.000000,1.500000,0.009000,100.000000
4,mbpp,test,4,canonical_solution,# Write a function to split a string at lowerc...,passed,0.000057,True,True,4,3,3,1,0,0,1,1,0,0,0,0,0,0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,100.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159,openai_humaneval,test,159,claude_3_haiku_0_shot,"def eat(number_eaten, needed, remaining):\n ...",failed: Error,,False,True,15,5,4,0,9,2,0,1,2,4,2,4,6,6,10.000000,15.509775,1.0,15.509775,0.861654,0.005170,77.260164
160,openai_humaneval,test,160,claude_3_haiku_0_shot,"def do_algebra(operator, operand):\n """"""\n ...",passed,0.000112,True,True,23,8,7,0,10,6,0,4,5,10,5,10,15,15,44.828921,58.603359,2.5,146.508397,8.139355,0.019534,85.425318
161,openai_humaneval,test,161,claude_3_haiku_0_shot,"def solve(s):\n """"""\n Reverses the case ...",passed,0.000084,True,True,14,12,10,0,4,0,0,4,2,2,2,2,4,4,4.000000,8.000000,1.0,8.000000,0.444444,0.002667,97.852090
162,openai_humaneval,test,162,claude_3_haiku_0_shot,import hashlib\n\ndef string_to_md5(text: str)...,passed,0.000064,True,True,12,6,5,0,5,2,0,2,1,1,1,1,2,2,0.000000,2.000000,0.5,1.000000,0.055556,0.000667,100.000000


In [120]:
# Row counts per (dataset, split, model) combination: shows which runs were loaded.
df.value_counts(["dataset", "split", "model"])

dataset           split       model                
mbpp              test        canonical_solution       500
                              claude_3_haiku_0_shot    500
                  train       canonical_solution       374
openai_humaneval  test        canonical_solution       164
                              claude_3_haiku_0_shot    164
mbpp              validation  canonical_solution        90
Name: count, dtype: int64

In [121]:
# Columns available in the combined results frame.
df.columns

Index(['dataset', 'split', 'task_id', 'model', 'code', 'result',
       'avg_test_time', 'passed_tests', 'compiled', 'loc', 'lloc', 'sloc',
       'comments', 'multi', 'blank', 'single_comments', 'CC', 'h1', 'h2', 'N1',
       'N2', 'vocabulary', 'length', 'calculated_length', 'volume',
       'difficulty', 'effort', 'time', 'bugs', 'MI'],
      dtype='object')

## Analysis

In [122]:
# Columns that together identify one evaluated run; reused by later cells.
GROUPING = ["dataset", "split", "model"]

# Share of solutions per run that compiled and that passed their tests
# (the mean of a True/False column is the fraction of True values).
outcome_columns = ["compiled", "passed_tests"]
df[GROUPING + outcome_columns].groupby(GROUPING).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,compiled,passed_tests
dataset,split,model,Unnamed: 3_level_1,Unnamed: 4_level_1
mbpp,test,canonical_solution,1.0,1.0
mbpp,test,claude_3_haiku_0_shot,1.0,0.212
mbpp,train,canonical_solution,1.0,1.0
mbpp,validation,canonical_solution,1.0,1.0
openai_humaneval,test,canonical_solution,1.0,1.0
openai_humaneval,test,claude_3_haiku_0_shot,1.0,0.646341


In [140]:
def _claude_passed_task_ids(results):
    """Return the set of task ids in `results` whose Claude solution passed its tests."""
    passed_rows = results.query("model == 'claude_3_haiku_0_shot' and passed_tests")
    return set(passed_rows["task_id"])


# Restrict to the test split, then separate the two benchmarks.
test_results_df = df[df["split"] == "test"]
mbpp_test_results_df = test_results_df[test_results_df["dataset"] == "mbpp"]
humaneval_test_results_df = test_results_df[
    test_results_df["dataset"] == "openai_humaneval"
]

# Task ids that Claude solved correctly, per benchmark.
mbpp_test_correct_tasks = _claude_passed_task_ids(mbpp_test_results_df)
humaneval_test_correct_tasks = _claude_passed_task_ids(humaneval_test_results_df)

In [142]:

# For each benchmark keep every test-split row (canonical and Claude alike)
# whose task Claude solved correctly. `Series.isin` is a vectorized membership
# test; it replaces a per-row Python lambda that also shadowed the builtin `id`.
claude_correct_mbpp_df = mbpp_test_results_df[
    mbpp_test_results_df["task_id"].isin(mbpp_test_correct_tasks)
]
claude_correct_humaneval_df = humaneval_test_results_df[
    humaneval_test_results_df["task_id"].isin(humaneval_test_correct_tasks)
]

# Both benchmarks stacked; the counts below are a sanity check that each
# dataset has as many canonical rows as Claude rows.
claude_correct_tasks_df = pd.concat([claude_correct_mbpp_df, claude_correct_humaneval_df])
claude_correct_tasks_df.value_counts(GROUPING)

dataset           split  model                
mbpp              test   canonical_solution       106
                         claude_3_haiku_0_shot    106
openai_humaneval  test   canonical_solution       106
                         claude_3_haiku_0_shot    106
Name: count, dtype: int64

In [143]:
# The simplicity metrics are every column from "loc" onwards. The start is
# looked up by name rather than by a hard-coded position, so the selection
# does not silently shift if columns are added or removed upstream.
first_metric_position = df.columns.get_loc("loc")
simplicity_metrics = df.columns[first_metric_position:]
successful_solution_metrics = ["avg_test_time"] + list(simplicity_metrics)

# Mean of each metric per (dataset, split, model), over the tasks Claude solved.
means = (claude_correct_tasks_df
    .groupby(GROUPING)
    # Select relevant columns to analyze
    [successful_solution_metrics]
    .mean())

means

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,avg_test_time,loc,lloc,sloc,comments,multi,blank,single_comments,CC,h1,h2,N1,N2,vocabulary,length,calculated_length,volume,difficulty,effort,time,bugs,MI
dataset,split,model,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
mbpp,test,canonical_solution,0.049483,9.754717,8.783019,8.698113,1.0,0.0,0.056604,1.0,3.339623,2.660377,5.811321,4.622642,9.0,8.471698,13.622642,24.628718,53.319253,2.076706,254.391406,14.132856,0.017773,89.552736
mbpp,test,claude_3_haiku_0_shot,0.044969,12.235849,7.792453,7.283019,0.084906,2.990566,1.867925,0.09434,2.962264,2.490566,5.396226,4.150943,8.103774,7.886792,12.254717,22.322801,47.697408,1.872666,225.818862,12.545492,0.015899,79.616909
openai_humaneval,test,canonical_solution,0.00063,20.886792,8.839623,7.735849,0.018868,10.188679,2.943396,0.018868,3.575472,2.679245,5.981132,4.122642,8.018868,8.660377,12.141509,22.884071,43.571574,1.85572,133.000919,7.38894,0.014524,84.143378
openai_humaneval,test,claude_3_haiku_0_shot,0.000507,16.339623,7.830189,6.933962,0.056604,7.04717,2.254717,0.103774,3.462264,2.915094,5.509434,3.839623,7.254717,8.424528,11.09434,21.714565,39.290925,1.969576,126.996514,7.055362,0.013097,84.493031


In [145]:
# Standard deviation of each analysed metric per (dataset, split, model),
# computed over the tasks Claude solved correctly.
grouped_correct_tasks = claude_correct_tasks_df.groupby(GROUPING)
std_devs = grouped_correct_tasks[successful_solution_metrics].agg("std")

std_devs

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,avg_test_time,loc,lloc,sloc,comments,multi,blank,single_comments,CC,h1,h2,N1,N2,vocabulary,length,calculated_length,volume,difficulty,effort,time,bugs,MI
dataset,split,model,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
mbpp,test,canonical_solution,0.508611,6.680193,6.297399,6.665004,0.0,0.0,0.333243,0.0,2.046539,2.409848,6.016031,5.422094,10.635072,8.247606,16.052564,32.975497,77.225089,2.206504,559.596915,31.088717,0.025742,11.749034
mbpp,test,claude_3_haiku_0_shot,0.462167,8.420555,5.647953,6.018629,0.554083,4.074299,1.831343,0.561093,1.706707,2.330932,5.820619,5.280318,10.428833,7.97415,15.706269,31.641543,75.287975,2.128387,527.102562,29.283476,0.025096,15.766219
openai_humaneval,test,canonical_solution,0.003223,8.726883,4.826784,5.105183,0.194257,4.656001,1.672353,0.194257,2.088281,1.764988,4.472096,3.359997,6.611651,5.950409,9.95747,21.949414,43.129557,1.557187,206.645463,11.480303,0.014377,14.113865
openai_humaneval,test,claude_3_haiku_0_shot,0.00249,7.640602,4.532363,4.565437,0.333243,4.605136,1.580002,0.38863,1.9817,2.042957,3.971914,2.966919,5.783493,5.801392,8.733655,20.089768,37.011889,1.668091,177.000415,9.833356,0.012337,13.93289


In [159]:
# Difference in mean metric value, Claude minus canonical, per benchmark.
# Rows of `means` are picked by their (dataset, split, model) label instead of
# by position, so the result does not depend on the row order of the groupby.
CANONICAL_MODEL = "canonical_solution"
CLAUDE_MODEL = "claude_3_haiku_0_shot"

mbpp_deltas = (
    means.loc[("mbpp", "test", CLAUDE_MODEL)]
    - means.loc[("mbpp", "test", CANONICAL_MODEL)]
)
humaneval_deltas = (
    means.loc[("openai_humaneval", "test", CLAUDE_MODEL)]
    - means.loc[("openai_humaneval", "test", CANONICAL_MODEL)]
)

# One column per benchmark, one row per metric.
deltas = pd.DataFrame(
    dict(
        mbpp = mbpp_deltas,
        openai_humaneval = humaneval_deltas
    )
)
deltas

Unnamed: 0,mbpp,openai_humaneval
avg_test_time,-0.004515,-0.000123
loc,2.481132,-4.54717
lloc,-0.990566,-1.009434
sloc,-1.415094,-0.801887
comments,-0.915094,0.037736
multi,2.990566,-3.141509
blank,1.811321,-0.688679
single_comments,-0.90566,0.084906
CC,-0.377358,-0.113208
h1,-0.169811,0.235849


In [238]:
# % Deltas (for table in paper)
# Relative change of each mean metric, (Claude - canonical) / canonical, shown
# as a percentage. A canonical mean of 0 produces inf.
mbpp_canonical_means, mbpp_claude_means = means.iloc[0], means.iloc[1]
humaneval_canonical_means, humaneval_claude_means = means.iloc[2], means.iloc[3]

relative_deltas = pd.DataFrame({
    "mbpp": (mbpp_claude_means - mbpp_canonical_means) / mbpp_canonical_means,
    "openai_humaneval": (
        (humaneval_claude_means - humaneval_canonical_means)
        / humaneval_canonical_means
    ),
})
relative_deltas.round(3) * 100

Unnamed: 0,mbpp,openai_humaneval
avg_test_time,-9.1,-19.5
loc,25.4,-21.8
lloc,-11.3,-11.4
sloc,-16.3,-10.4
comments,-91.5,200.0
multi,inf,-30.8
blank,3200.0,-23.4
single_comments,-90.6,450.0
CC,-11.3,-3.2
h1,-6.4,8.8


In [161]:
# http://www.stat.ucla.edu/~cochran/stat10/winter/lectures/lect21.html
# z-score of the difference between the Claude and canonical means of each
# metric: delta / sqrt(s_a^2 / n_a + s_b^2 / n_b).
#
# Step 1: per (dataset, split, model) group, variance divided by group size.
# The transposes put the group labels on the columns so the division lines
# them up with the index of the value_counts Series.
squared_standard_errors = ((std_devs.T)**2 / claude_correct_tasks_df.value_counts(GROUPING)).T
# Step 2: sum the squared errors of all groups within a dataset, then take the
# square root. NOTE(review): this assumes each dataset contributes exactly the
# two groups being compared (canonical and Claude) — confirm if more runs are added.
standard_errors = (squared_standard_errors.groupby("dataset").sum())**0.5
# Step 3: divide the mean differences by the combined standard error
# (transposed so that metrics are rows and datasets are columns, like `deltas`).
z_scores = deltas / standard_errors.T
z_scores

Unnamed: 0,mbpp,openai_humaneval
avg_test_time,-0.067636,-0.310973
loc,2.376589,-4.036201
lloc,-1.205623,-1.569621
sloc,-1.622357,-1.205456
comments,-17.003718,1.00722
multi,7.55707,-4.938959
blank,10.018549,-3.081853
single_comments,-16.618194,2.011982
CC,-1.457948,-0.404857
h1,-0.521464,0.899409


In [198]:
# Per-task metric differences, Claude minus canonical, for tasks Claude solved.
claude_metric_rows = (claude_correct_tasks_df
        .query("model == 'claude_3_haiku_0_shot'")
        [successful_solution_metrics])
canonical_metric_rows = (claude_correct_tasks_df
        .query("model == 'canonical_solution'")
        [successful_solution_metrics])

# The subtraction pairs rows by index label, and labels can repeat across
# datasets because each result file keeps its own row numbers. It is therefore
# only a task-by-task comparison when both frames carry identical indexes in
# the same order; fail loudly instead of producing misaligned rows.
assert claude_metric_rows.index.equals(canonical_metric_rows.index), (
    "Claude and canonical rows are not aligned task-by-task"
)

all_deltas = claude_metric_rows - canonical_metric_rows

all_deltas

Unnamed: 0,avg_test_time,loc,lloc,sloc,comments,multi,blank,single_comments,CC,h1,h2,N1,N2,vocabulary,length,calculated_length,volume,difficulty,effort,time,bugs,MI
1,2.784585e-05,7,1,-1,-1,7,2,-1,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7,-4.435423e-06,-14,-17,-16,-1,0,3,-1,2,-1,-4,-3,-6,-5,-9,-20.954618,-40.003064,-0.900000,-121.683745,-6.760208,-0.013334,0.651482
11,1.174533e-07,-6,-2,-2,-1,0,-3,-1,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-16.246946
13,1.953400e-06,-5,-4,-4,-1,0,0,-1,0,-3,-5,-4,-8,-8,-12,-30.138965,-53.022614,-2.333333,-212.931520,-11.829529,-0.017674,-8.540982
15,1.895260e-05,-6,-5,-5,-1,0,0,-1,-1,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-8.626777
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156,4.197402e-05,-8,-2,-4,0,-5,1,0,0,2,2,3,4,4,7,16.729056,34.225562,1.700000,235.163906,13.064661,0.011409,-1.516471
160,-1.063950e-05,-7,2,2,0,-10,1,0,2,4,6,3,6,10,9,36.828921,44.671790,2.000000,139.542613,7.752367,0.014891,35.324041
161,-5.711580e-06,-12,-5,-5,0,-6,-1,0,-1,-1,-4,-2,-5,-5,-7,-16.264663,-26.869175,-0.750000,-53.021056,-2.945614,-0.008956,7.139965
162,5.916840e-07,2,2,2,0,0,0,0,0,1,1,1,1,2,2,0.000000,2.000000,0.500000,1.000000,0.055556,0.000667,0.000000


In [202]:
# Per-task relative change: each delta divided by the canonical value.
# A canonical value of 0 gives inf (or NaN when the delta is 0 as well).
canonical_metric_values = (
    claude_correct_tasks_df
    .query("model == 'canonical_solution'")
    [successful_solution_metrics]
)
percent_deltas = all_deltas.div(canonical_metric_values)

# Put the dataset of each task, taken from the Claude rows, in the first column.
claude_datasets = (
    claude_correct_tasks_df
    .query("model == 'claude_3_haiku_0_shot'")
    ["dataset"]
)
percent_deltas.insert(0, "dataset", claude_datasets)
percent_deltas

Unnamed: 0,dataset,avg_test_time,loc,lloc,sloc,comments,multi,blank,single_comments,CC,h1,h2,N1,N2,vocabulary,length,calculated_length,volume,difficulty,effort,time,bugs,MI
1,mbpp,0.215597,1.750000,0.333333,-0.333333,-1.0,inf,inf,-1.0,0.00,,,,,,,,,,,,,0.000000
7,mbpp,-0.101367,-0.518519,-0.629630,-0.615385,-1.0,,inf,-1.0,1.00,-0.250000,-0.400000,-0.500000,-0.500000,-0.357143,-0.500000,-0.508369,-0.583710,-0.375000,-0.739819,-0.739819,-0.583710,0.009752
11,mbpp,0.002356,-0.461538,-0.222222,-0.222222,-1.0,,-1.000000,-1.0,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.180617
13,mbpp,0.067436,-0.500000,-0.444444,-0.444444,-1.0,,,-1.0,0.00,-0.600000,-0.555556,-0.666667,-0.666667,-0.571429,-0.666667,-0.750866,-0.773687,-0.700000,-0.932106,-0.932106,-0.773687,-0.100852
15,mbpp,0.194224,-0.750000,-0.714286,-0.714286,-1.0,,,-1.0,-0.25,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.089014
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156,openai_humaneval,0.296498,-0.307692,-0.142857,-0.266667,,-0.555556,0.500000,,0.00,0.500000,0.250000,0.600000,0.400000,0.333333,0.466667,0.522783,0.636465,0.680000,1.749262,1.749262,0.636465,-0.016537
160,openai_humaneval,-0.087055,-0.233333,0.333333,0.400000,,-0.500000,0.200000,,1.00,4.000000,1.500000,1.500000,1.500000,2.000000,1.500000,4.603615,3.206515,4.000000,20.032577,20.032577,3.206515,0.705053
161,openai_humaneval,-0.063847,-0.461538,-0.294118,-0.333333,,-0.600000,-1.000000,,-0.20,-0.333333,-0.666667,-0.500000,-0.714286,-0.555556,-0.636364,-0.802612,-0.770571,-0.428571,-0.868898,-0.868898,-0.770571,0.078710
162,openai_humaneval,0.009343,0.200000,0.500000,0.666667,,0.000000,0.000000,,0.00,inf,inf,inf,inf,inf,inf,,inf,inf,inf,inf,inf,0.000000


In [217]:
# Metrics for which an increase is the hoped-for direction. The list is defined
# here because this cell comes before the later cell that also defines it;
# without it a fresh top-to-bottom run fails with NameError.
should_increase = ["comments", "multi", "single_comments", "MI"]
percent_deltas[should_increase]

Unnamed: 0,comments,multi,single_comments,MI
1,-1.0,inf,-1.0,0.000000
7,-1.0,,-1.0,0.009752
11,-1.0,,-1.0,-0.180617
13,-1.0,,-1.0,-0.100852
15,-1.0,,-1.0,-0.089014
...,...,...,...,...
156,,-0.555556,,-0.016537
160,,-0.500000,,0.705053
161,,-0.600000,,0.078710
162,,0.000000,,0.000000


In [232]:
# Metrics for which an increase is the hoped-for direction.
should_increase = ["comments", "multi", "single_comments", "MI"]

# `all_deltas` holds metric columns only, so grouping it by "dataset" used to
# depend on a column inserted by code that is no longer run (hidden kernel
# state). Attach the dataset label to a copy instead: this works on a fresh
# kernel and can be re-run, because `all_deltas` itself is never modified.
deltas_with_dataset = all_deltas.assign(
    dataset=claude_correct_tasks_df
        .query("model == 'claude_3_haiku_0_shot'")
        ["dataset"]
)

# Fraction of tasks, per dataset, where Claude's value is above the canonical one.
print("% Increased:")
print(
    (deltas_with_dataset[should_increase] > 0)
    .groupby(deltas_with_dataset["dataset"])
    .mean()
    .T)
print("")
# Mean relative increase over the tasks that did increase. An entry is inf
# when any contributing task has a canonical value of 0.
print("Avg. % Increase when Increased:")
pd.DataFrame([
    dict(
        dataset = dataset,
        **{
            metric:
                percent_deltas
                .query(f"dataset == '{dataset}' and {metric} > 0")
                [metric]
                .mean()
            for metric in should_increase
        }
    )
    for dataset in ["mbpp", "openai_humaneval"]
]).T

% Increased:
dataset              mbpp  openai_humaneval
comments         0.028302          0.028302
multi            0.386792          0.330189
single_comments  0.028302          0.066038
MI               0.188679          0.386792

Avg. % Increase when Increased:


Unnamed: 0,0,1
dataset,mbpp,openai_humaneval
comments,2.0,inf
multi,inf,0.483506
single_comments,2.0,inf
MI,0.088988,0.217426


In [233]:
# Every analysed metric that is not in `should_increase` is treated as one
# where a decrease is the hoped-for direction.
should_decrease = [
    metric for metric in successful_solution_metrics
    if metric not in should_increase
]

# `all_deltas` holds metric columns only, so grouping it by "dataset" used to
# depend on a column added outside this cell (hidden kernel state). Attach the
# dataset label to a copy so the cell runs on a fresh kernel and is re-runnable.
deltas_with_dataset = all_deltas.assign(
    dataset=claude_correct_tasks_df
        .query("model == 'claude_3_haiku_0_shot'")
        ["dataset"]
)

# Fraction of tasks, per dataset, where Claude's value is below the canonical one.
print("% Decreased:")
print(
    (deltas_with_dataset[should_decrease] < 0)
    .groupby(deltas_with_dataset["dataset"])
    .mean()
    .T)
print("")
# Mean relative decrease over the tasks that did decrease.
print("Avg. % Decrease when Decreased:")
pd.DataFrame([
    dict(
        dataset = dataset,
        **{
            metric:
                percent_deltas
                .query(f"dataset == '{dataset}' and {metric} < 0")
                [metric]
                .mean()
            for metric in should_decrease
        }
    )
    for dataset in ["mbpp", "openai_humaneval"]
]).T

% Decreased:
dataset                mbpp  openai_humaneval
avg_test_time      0.566038          0.632075
loc                0.396226          0.688679
lloc               0.481132          0.433962
sloc               0.594340          0.386792
blank              0.018868          0.471698
CC                 0.320755          0.264151
h1                 0.235849          0.198113
h2                 0.311321          0.367925
N1                 0.301887          0.311321
N2                 0.320755          0.349057
vocabulary         0.320755          0.339623
length             0.320755          0.349057
calculated_length  0.330189          0.358491
volume             0.349057          0.377358
difficulty         0.283019          0.254717
effort             0.349057          0.349057
time               0.349057          0.349057
bugs               0.349057          0.377358

Avg. % Decrease when Decreased:


Unnamed: 0,0,1
dataset,mbpp,openai_humaneval
avg_test_time,-0.108976,-0.152113
loc,-0.369325,-0.349066
lloc,-0.349835,-0.348159
sloc,-0.370846,-0.394434
blank,-1.0,-0.625786
CC,-0.383147,-0.33835
h1,-0.487429,-0.471429
h2,-0.41347,-0.387362
N1,-0.461511,-0.417352


In [None]:
# Human-readable labels for the metric columns, keyed by column name.
# Fixes the spelling of "Cyclomatic" in the "CC" label.
friendly_metric_names = {
    "loc": "# total lines of code",
    "lloc": "# logical lines of code",
    "sloc": "# source lines of code",
    "comments": "# single-line (#) comment lines",
    "multi": "# multi-line strings lines",
    "blank": "# white-space only lines",
    "single_comments": "# single-line (#) comment-only lines",
    "CC": "Cyclomatic Complexity",
    "h1": "# distinct operators",
    "h2": "# distinct operands",
    "N1": "# total operators",
    "N2": "# total operands",
    "vocabulary": "Vocabulary size (distinct operators + operands)",
    "length": "Program length (total operators + operands)",
    "calculated_length": "Halstead estimated program length",
    "volume": "Halstead volume",
    "difficulty": "Halstead difficulty",
    "effort": "Halstead effort",
    "time": "Halstead time required to program",
    "bugs": "Halstead estimated bugs",
    "MI": "Maintainability Index"
}

### Correctness Failure Investigation

In [58]:
# Rebuild each HumanEval reference program by appending the canonical solution
# body to its prompt.
canonical_solutions = []
for task in humaneval["test"]:
    canonical_solutions.append(task["prompt"] + task["canonical_solution"])

# Show the reference program at position 32.
print(canonical_solutions[32])

import math


def poly(xs: list, x: float):
    """
    Evaluates polynomial with coefficients xs at point x.
    return xs[0] + xs[1] * x + xs[1] * x^2 + .... xs[n] * x^n
    """
    return sum([coeff * math.pow(x, i) for i, coeff in enumerate(xs)])


def find_zero(xs: list):
    """ xs are coefficients of a polynomial.
    find_zero find x such that poly(x) = 0.
    find_zero returns only only zero point, even if there are many.
    Moreover, find_zero only takes list xs having even number of coefficients
    and largest non zero coefficient as it guarantees
    a solution.
    >>> round(find_zero([1, 2]), 2) # f(x) = 1 + 2x
    -0.5
    >>> round(find_zero([-6, 11, -6, 1]), 2) # (x - 1) * (x - 2) * (x - 3) = -6 + 11x - 6x^2 + x^3
    1.0
    """
    begin, end = -1., 1.
    while poly(xs, begin) * poly(xs, end) > 0:
        begin *= 2.0
        end *= 2.0
    while end - begin > 1e-10:
        center = (begin + end) / 2.0
        if poly(xs, center) * poly(xs, begin) > 0:
          

In [53]:
# print(humaneval["test"][9]["test"])

In [124]:
# Every Claude solution, from either dataset, that did not pass its tests.
failed_claude_rows = df.query("model == 'claude_3_haiku_0_shot' and not passed_tests")
failed_claude_rows.loc[:, ["dataset", "task_id", "result", "code"]]

Unnamed: 0,dataset,task_id,result,code
0,mbpp,0,failed: name 'remove_Occ' is not defined,"def remove_occurrences(string, char):\n res..."
2,mbpp,2,failed: name 'count_common' is not defined,from collections import Counter\n\ndef count_m...
3,mbpp,3,failed: name 'find_Volume' is not defined,"def calculate_triangular_prism_volume(length, ..."
4,mbpp,4,failed: name 'split_lowerstring' is not defined,"def split_at_lowercase(text):\n """"""\n Sp..."
5,mbpp,5,failed: name 'text_lowercase_underscore' is no...,import re\n\ndef find_lowercase_underscore_seq...
...,...,...,...,...
153,openai_humaneval,153,failed: name 'Strongest_Extension' is not defined,"def strongest_extension(class_name, extensions..."
154,openai_humaneval,154,failed: name 'cycpattern_check' is not defined,"def cyclic_pattern_check(string_a, string_b):\..."
157,openai_humaneval,157,failed: name 'right_angle_triangle' is not def...,"def is_right_angle_triangle(a, b, c):\n """"""..."
158,openai_humaneval,158,failed: t1,"def find_max(words):\n """"""\n Find the wo..."


model == 'claude_3_haiku_0_shot'and dataset == 'mbpp'and not passed_tests


In [133]:
# Claude's failing solutions on MBPP only.
mbpp_failure_filter = " and ".join([
    "model == 'claude_3_haiku_0_shot'",
    "dataset == 'mbpp'",
    "not passed_tests",
])
df.query(mbpp_failure_filter)[["dataset", "task_id", "result", "code"]]

Unnamed: 0,dataset,task_id,result,code
0,mbpp,0,failed: name 'remove_Occ' is not defined,"def remove_occurrences(string, char):\n res..."
2,mbpp,2,failed: name 'count_common' is not defined,from collections import Counter\n\ndef count_m...
3,mbpp,3,failed: name 'find_Volume' is not defined,"def calculate_triangular_prism_volume(length, ..."
4,mbpp,4,failed: name 'split_lowerstring' is not defined,"def split_at_lowercase(text):\n """"""\n Sp..."
5,mbpp,5,failed: name 'text_lowercase_underscore' is no...,import re\n\ndef find_lowercase_underscore_seq...
...,...,...,...,...
493,mbpp,493,failed: name 'sum_Of_Series' is not defined,def sum_of_cubes(n):\n return (n * (n + 1) ...
494,mbpp,494,failed: name 're_order' is not defined,"def move_zeroes_to_end(arr):\n """"""\n Mov..."
497,mbpp,497,failed: name 'same_order' is not defined,"def check_common_order(list1, list2):\n """"""..."
498,mbpp,498,failed: name 'average_Odd' is not defined,def average_odd(n):\n if n % 2 == 0:\n ...


In [134]:
# Print one "task_id: result" line for every MBPP task whose Claude solution failed.
failed_mbpp_rows = df.query(
    "model == 'claude_3_haiku_0_shot'"
    " and dataset == 'mbpp'"
    " and not passed_tests"
)
for task_id, result in zip(failed_mbpp_rows["task_id"], failed_mbpp_rows["result"]):
    print(str(task_id) + ": " + result)

0: failed: name 'remove_Occ' is not defined
2: failed: name 'count_common' is not defined
3: failed: name 'find_Volume' is not defined
4: failed: name 'split_lowerstring' is not defined
5: failed: name 'text_lowercase_underscore' is not defined
6: failed: name 'square_perimeter' is not defined
8: failed: name 'test_duplicate' is not defined
9: failed: 
10: failed: name 'multiples_of_num' is not defined
12: failed: name 'maximum_Sum' is not defined
14: failed: name 'find_Product' is not defined
16: failed: name 'remove' is not defined
17: failed: name 'binomial_Coeff' is not defined
18: failed: name 'get_Odd_Occurrence' is not defined
19: failed: name 'count_Substring_With_Equal_Ends' is not defined
20: failed: name 'func' is not defined
21: failed: name 'max_Prime_Factors' is not defined
22: failed: name 'decimal_To_Binary' is not defined
24: failed: name 'find_rect_num' is not defined
25: failed: name 'find_Nth_Digit' is not defined
27: failed: name 'div_even_odd' is not defined
28: f

In [45]:
# For every failing Claude solution print the failure message and the generated
# code, followed by an 80-character separator line.
failed_claude_rows = df.query("model == 'claude_3_haiku_0_shot' and not passed_tests")
separator = "-" * 80
for task_id, result, code in zip(
    failed_claude_rows["task_id"],
    failed_claude_rows["result"],
    failed_claude_rows["code"],
):
    print(str(task_id) + ": " + result)
    print(code)
    print(separator)

4: failed: name 'mean_absolute_deviation' is not defined
from typing import List

def calculate_mean_absolute_deviation(numbers: List[float]) -> float:
    """
    Calculates the Mean Absolute Deviation (MAD) for a given list of numbers.

    MAD is the average absolute difference between each element and the mean of the dataset.

    Args:
        numbers (List[float]): The input list of numbers.

    Returns:
        float: The Mean Absolute Deviation.
    """
    mean = sum(numbers) / len(numbers)
    deviations = [abs(x - mean) for x in numbers]
    return sum(deviations) / len(numbers)
--------------------------------------------------------------------------------
7: failed: name 'filter_by_substring' is not defined
from typing import List

def filter_strings_by_substring(strings: List[str], substring: str) -> List[str]:
    """
    Filter a list of strings to only include those that contain the given substring.

    Args:
        strings (List[str]): The list of strings to filte