## Imports

In [None]:
%load_ext autoreload
%autoreload 2

import anthropic
from dotenv import load_dotenv
import pandas as pd
import pickle
import re
import time

# Expects this file and eval.py to be in the same folder:
from eval import DATA, evaluate, bulk_evaluate

# Expects ANTHROPIC_API_KEY to be provided in .env file in user's home directory
load_dotenv()

mbpp = DATA["mbpp"]  # train, validation, and test
humaneval = DATA["openai_humaneval"]  # test only

## Helpers

In [2]:

RESULTS_PATH = "./results/"

def save_pickle(object, to):
    with open(to, "wb") as f:
        pickle.dump(object, f)
    
def load_pickle(from_):
    with open(from_, "rb") as f:
        return pickle.load(f)

def get_canonical_solutions(dataset, split):
    assert dataset in ("mbpp", "openai_humaneval")

    if dataset == "mbpp":
        mbpp = DATA["mbpp"]
        return [
            "# " + task["text"] + "\n" + task["code"] for task in mbpp[split]
        ]
    else:  # humaneval
        humaneval = DATA["openai_humaneval"]
        return [
            task["prompt"] + task["canonical_solution"] for task in humaneval["test"]
        ]

def get_expected_function_names(dataset, split):
    assert dataset in ("mbpp", "openai_humaneval")

    if dataset == "mbpp":
        mbpp = DATA["mbpp"]
        return [
            re.findall(
                "assert\s*\(?\w+\s*\(",
                task["test_list"][0]
            )[-1].replace("assert", "").replace("(", "").strip()
            for task in mbpp[split]
        ]
    else:  # humaneval
        humaneval = DATA["openai_humaneval"]
        return [
            task["entry_point"]
            for task in humaneval["test"]
        ]

def check_results_for_errors(results, result_ids):
    print(len(results))
    errors = [i for i, result in zip(result_ids, results) if result == "ERROR"]
    print(len(errors))
    return errors

## Prompts

In [3]:
BASIC_SYSTEM_PROMPT = "Refactor the given Python program to a more readable, efficient, and maintainable one. You can assume that the given program is semantically correct. Do not change the external behavior of the program, and keep the syntactic and semantic correctness. Python programs should be in a code block. Do not explain anything in natural language."
CLAUDE_PROMPT = (BASIC_SYSTEM_PROMPT + "\n\nPut your response in a markdown code block. Respond with only the code block. Don't explain the changes made.")

# ~18% tests passed
v1_prompt = CLAUDE_PROMPT

# ~28% tests passed
v2_prompt = """
Refactor the following Python program to a more readable, efficient, and maintainable one:
 - The given program is correct but needs improvement
 - DO NOT change the name of the program
 - DO NOT change the input or output behavior of the program (e.g. number of inputs / outputs, input / output types, etc.)
 - Put your response in a markdown code block
 - Respond with only the code block
 - Don't explain the changes made

Following the above rules is CRITICALLY IMPORTANT to ensure the program continues to work as expected for users!
""".strip()

# ~71% tests passed
v3_prompt = """
Refactor the following Python program to a more readable, efficient, and maintainable one:
 - The given program is correct but needs improvement
 - DO NOT change the name of the program
 - DO NOT change the input or output behavior of the program (e.g. number of inputs / outputs, input / output types, etc.)
 - Put your response in a markdown code block
 - Respond with only the code block
 - Don't explain the changes made

Refactor only according to these guidelines. Again, do not change the name of the function, including changing capitalization.
""".strip()

# 75.55555555555555% tests passed
v4_prompt = """
Refactor the following Python program to a more readable, efficient, and maintainable one:
 - The given program is correct but needs improvement
 - DO NOT change the name of the program
 - DO NOT change the input or output behavior of the program (e.g. number of inputs / outputs, input / output types, etc.)
 - Put your response in a markdown code block
 - Respond with only the code block
 - Don't explain the changes made

Again, do not change the name of the function in any way. The function name should remain "{function_name}".
""".strip()

# 71.11111111111111% tests passed
v5_prompt = """
Refactor the following Python program to a more readable, efficient, and maintainable one. The given program is correct but needs improvement. DO NOT change the name of the program. DO NOT change the input or output behavior of the program (e.g. number of inputs / outputs, input / output types, etc.). Put your response in a markdown code block. Respond with only the code block. Don't explain the changes made.

Again, do not change the name of the function in any way. The function name should remain `{function_name}`.
""".strip()

# 74.44444444444445% tests passed
v6_prompt = """
Refactor the following Python program to a more readable, efficient, and maintainable one:
 - The given program is correct but needs improvement
 - DO NOT change the name of the program
 - DO NOT change the input or output behavior of the program (e.g. number of inputs / outputs, input / output types, etc.)
 - Put your response in a markdown code block
 - Respond with only the code block
 - Don't explain the changes made

Again, do not change the name of the function in any way. The function name should remain `{function_name}`.

Also, don't forget to import any packages you use (e.g. `os`, `re`, `sys`)!
""".strip()

# 74.44444444444445% tests passed
v7_prompt = """
Refactor the following Python program to a more readable, efficient, and maintainable one:
 - The given program is correct but needs improvement
 - DO NOT change the name of the program
 - DO NOT change the input or output behavior of the program (e.g. number of inputs / outputs, input / output types, etc.)
 - Put your response in a markdown code block
 - Respond with only the code block
 - Don't explain the changes made
 - Don't forget to import any packages you use (e.g. `os`, `re`, `sys`)

Again, do not change the name of the function in any way. The function name should remain "{function_name}".
""".strip()

# 74.44444444444445% tests passed
v8_prompt = """
Refactor the following Python program to a more readable, efficient, and maintainable one:
 - The given program is correct but needs improvement
 - DO NOT change the name of the program
 - DO NOT change the input or output behavior of the program (e.g. number of inputs / outputs, input / output types, etc.)
 - Put your response in a markdown code block
 - Respond with only the code block
 - Don't explain the changes made
 - If you use any packages (e.g. `os`, `re`, `sys`), don't forget to import them

Again, be careful not to change the function name! The function name should remain `{function_name}`.
""".strip()

current_prompt = v8_prompt
current_prompt_name = "v4_prompt"
print(current_prompt)

Refactor the following Python program to a more readable, efficient, and maintainable one:
 - The given program is correct but needs improvement
 - DO NOT change the name of the program
 - DO NOT change the input or output behavior of the program (e.g. number of inputs / outputs, input / output types, etc.)
 - Put your response in a markdown code block
 - Respond with only the code block
 - Don't explain the changes made
 - If you use any packages (e.g. `os`, `re`, `sys`), don't forget to import them

Again, be careful not to change the function name! The function name should remain `{function_name}`.


## Generation Helpers

In [44]:
def refactor(code, model="claude-3-haiku-20240307", system_prompt=CLAUDE_PROMPT, max_tokens=1000, temperature=1e-8):
    """
    temperature = Amount of randomness injected into the response. Ranges from
        0.0 to 1.0. Use temperature closer to 0.0 for analytical / multiple
        choice, and closer to 1.0 for creative and generative tasks. Anthropic
        API defaults to 1.0 [1]. Here we default to 1e-8 or very close to 0.0,
        but not exactly 0.0. This is we observed some randomness even a 0.0.
        Anthropic acknowledges this in their documentation [1]. See [2] for a
        similar dicussion for the GPT API. We use the very low, but not zero
        temperature based on that discussion, the empirical results shown there,
        and our own testing.
        Sources:
         1. https://docs.anthropic.com/claude/reference/complete_post#:~:text=to%20stop%20generating.-,temperature,-number
         2. https://community.openai.com/t/why-the-api-output-is-inconsistent-even-after-the-temperature-is-set-to-0/329541
    """
    client = anthropic.Anthropic()
    message = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
        system=system_prompt,
        messages=[{
            "role": "user",
            "content": [{"type": "text", "text": code}]
        }]
    )
    return message.content[0].text.strip()

def extract_from_code_block(raw_generation):
    assert raw_generation.startswith("```")
    assert raw_generation.endswith("```")

    # Simply remove first and last line
    return "\n".join(raw_generation.split("\n")[1:-1])

def select_refactoring_rules(code, seconds_between_generations=2):
    system_prompt = (
        "You are an expert programmer. Given the code below, pick the most suitable refactoring rule from the list of rules copied below.\n"
        "You MUST follow these requirements: \n"
        "1) Only pick one rule and 2) Only output the number of the rule you have chosen\n"
        "List of Rules:\n"
        "1. Use a formatted string.\n"
        "2. Use a built-in (i.e., radians) function.\n"
        "3. Use a logical operator instead of a nested if.\n"
        "4. Use a for-loop instead of a while-loop.\n"
        "5. Use list comprehension instead of a for-loop.\n"
        "6. Use the map function instead of list comprehension."
        "7. Use a throwaway variable.\n"
        "8. Use the enumerate function instead of the range function.\n"
        "9. Use the zip function instead of the range function.\n"
        "10. Use a ternary operator instead of an if-branch.\n"
        " 11. Merge repeated ifs.\n"
        " 12. Merge dictionary assignments.\n"
        " 13. Remove unnecessary calls to dict.items().\n"
        " 14. Remove str() from calls to print().\n"
        " 15. Flatten nested try.\n"
        " 16. Convert any to in."
    )
    output_texts = []
    for i, code_block in enumerate(code):
        print(i)
        if i > 0:
            # To avoid hitting the Claude API rate limit:
            time.sleep(seconds_between_generations)
        
        raw_generation = refactor(
            code_block,
            system_prompt=system_prompt
        )
        output_texts.append(raw_generation)

    return output_texts

def extract_rule(output):
    # Finding the first number in the specified range
    found_numbers = re.findall(r'\b(?:1[0-6]|[1-9])\b', output)
    return int(found_numbers[0]) if found_numbers else None

RULES = {
    1: "Use a formatted string",
    2: "Use a built-in (i.e., radians) function",
    3: "Use a logical operator instead of a nested if",
    4: "Use a for-loop instead of a while-loop",
    5: "Use list comprehension instead of a for-loop",
    6: "Use the map function instead of list comprehension",
    7: "Use a throwaway variable",
    8: "Use the enumerate function instead of the range function",
    9: "Use the zip function instead of the range function",
    10: "Use a ternary operator instead of an if-branch",
    11: "Merge repeated ifs",
    12: "Merge dictionary assignments",
    13: "Remove unnecessary calls to dict.items()",
    14: "Remove str() from calls to print()",
    15: "Flatten nested try",
    16: "Convert any to in."
}

def generate_output_with_rule(code, rules, function_names, seconds_between_generations=2):
    output_texts = []
    for i, (code_block, rule, function_name) in enumerate(zip(code, rules, function_names)):
        print(i)
        if i > 0:
            # To avoid hitting the Claude API rate limit:
            time.sleep(seconds_between_generations)
        
        rule_prompt = (
            f"""
            You are an expert programmer. Please refactor the following Python program to a more readable, efficient, and maintainable one:
            - The given program is correct but needs improvement
            - MAKE SURE TO follow these given refactoring rules: {RULES[rule]}
            - DO NOT change the name of the program
            - DO NOT change the input or output behavior of the program (e.g., number of inputs / outputs, input / output types, etc.)
            - Put your response in a markdown code block
            - Respond with only the code block
            - Don't explain the changes made

            Again, do not change the name of the function in any way. The function name should remain "{function_name}".
            """.strip()
        )
        
        raw_generation = refactor(
            code_block,
            system_prompt=rule_prompt
        )
        result = extract_from_code_block(raw_generation)
        output_texts.append(result)

    return output_texts


## Zero-Shot

### HumanEval Generation

In [5]:
canonical_solutions = [
    task["prompt"] + task["canonical_solution"] for task in humaneval["test"]
]

In [11]:
# WARNING: Running this cell will delete all your generations / evals. Be sure
#   you want to do this!
# all_generations = []
# all_evals = []

In [None]:

# WARNNG: Very slow step:
assert len(all_generations) == len(all_evals)

i = len(all_evals)
while i < len(canonical_solutions):
    print(i)

    # Call Claude API to refactor the code:
    raw_generation = refactor(canonical_solutions[i])
    generation = extract_from_code_block(raw_generation)
    all_generations.append(generation)

    # To avoid hitting the Claude API rate limit, buy time by doing the eval
    # immediately after generation:
    all_evals.append(evaluate(
        dataset="openai_humaneval",
        split="test",
        task_id=i,
        code=all_generations[i]
    ))
    i += 1

    # Every 10, Save to pickle just in case
    if i % 10 == 0:
        save_pickle(all_generations, RESULTS_PATH + "openai_humaneval_test_claude_3_haiku_0_shot_in_progress_generations.pkl")
        save_pickle(all_evals, RESULTS_PATH + "openai_humaneval_test_claude_3_haiku_0_shot_in_progress_evals.pkl")


In [69]:
df = pd.DataFrame(all_evals)
df.insert(3, "model", "claude_3_haiku_0_shot")
df.insert(4, "code", all_generations)
df

Unnamed: 0,dataset,split,task_id,model,code,result,avg_test_time,passed_tests,compiled,loc,...,N2,vocabulary,length,calculated_length,volume,difficulty,effort,time,bugs,MI
0,openai_humaneval,test,0,claude_3_haiku_0_shot,from typing import List\n\ndef has_close_eleme...,passed,0.000200,True,True,18,...,6,9,9,20.264663,28.529325,1.50,42.793988,2.377444,0.009510,92.948976
1,openai_humaneval,test,1,claude_3_haiku_0_shot,from typing import List\n\ndef separate_paren_...,passed,0.000097,True,True,33,...,5,6,8,10.000000,20.679700,1.25,25.849625,1.436090,0.006893,91.833863
2,openai_humaneval,test,2,claude_3_haiku_0_shot,def truncate_number(number: float) -> float:\n...,passed,0.000065,True,True,15,...,2,3,3,2.000000,4.754888,0.50,2.377444,0.132080,0.001585,55.567861
3,openai_humaneval,test,3,claude_3_haiku_0_shot,from typing import List\n\ndef below_zero(oper...,passed,0.000135,True,True,11,...,4,6,6,10.000000,15.509775,1.00,15.509775,0.861654,0.005170,71.694386
4,openai_humaneval,test,4,claude_3_haiku_0_shot,from typing import List\n\ndef calculate_mean_...,failed: name 'mean_absolute_deviation' is not ...,,False,True,17,...,6,8,9,17.509775,27.000000,1.00,27.000000,1.500000,0.009000,87.277819
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159,openai_humaneval,test,159,claude_3_haiku_0_shot,"def eat(number_eaten, needed, remaining):\n ...",failed: Error,,False,True,15,...,4,6,6,10.000000,15.509775,1.00,15.509775,0.861654,0.005170,77.260164
160,openai_humaneval,test,160,claude_3_haiku_0_shot,"def do_algebra(operator, operand):\n """"""\n ...",passed,0.000112,True,True,23,...,10,15,15,44.828921,58.603359,2.50,146.508397,8.139355,0.019534,85.425318
161,openai_humaneval,test,161,claude_3_haiku_0_shot,"def solve(s):\n """"""\n Reverses the case ...",passed,0.000084,True,True,14,...,2,4,4,4.000000,8.000000,1.00,8.000000,0.444444,0.002667,97.852090
162,openai_humaneval,test,162,claude_3_haiku_0_shot,import hashlib\n\ndef string_to_md5(text: str)...,passed,0.000064,True,True,12,...,1,2,2,0.000000,2.000000,0.50,1.000000,0.055556,0.000667,100.000000


In [71]:
# Finally, when everything looks good save the data. Will overwrite previously
# saved data at this location. Be sure you want to do this!
# df.to_csv(RESULTS_PATH + "openai_humaneval_test_claude_3_haiku_0_shot.csv")

### MBPP Generation

#### Test

In [4]:
canonical_solutions = [
    "# " + task["text"] + "\n" + task["code"] for task in mbpp["test"]
]

In [13]:
# WARNING: Running this cell will delete all your generations / evals. Be sure
#   you want to do this!
# all_generations = []
# all_evals = []

In [None]:
# WARNNG: Very slow step:
assert len(all_generations) == len(all_evals)

i = len(all_evals)
while i < len(canonical_solutions):
    print(i)

    # Call Claude API to refactor the code:
    raw_generation = refactor(canonical_solutions[i])
    generation = extract_from_code_block(raw_generation)
    all_generations.append(generation)

    # To avoid hitting the Claude API rate limit, buy time by doing the eval
    # immediately after generation:
    all_evals.append(evaluate(
        dataset="mbpp",
        split="test",
        task_id=i,
        code=all_generations[i]
    ))
    i += 1

    # Every 10, Save to pickle just in case
    if i % 10 == 0:
        save_pickle(all_generations, RESULTS_PATH + "mbpp_test_claude_3_haiku_0_shot_in_progress_generations.pkl")
        save_pickle(all_evals, RESULTS_PATH + "mbpp_test_claude_3_haiku_0_shot_in_progress_evals.pkl")


In [37]:
df = pd.DataFrame(all_evals)
df.insert(3, "model", "claude_3_haiku_0_shot")
df.insert(4, "code", all_generations)
df

Unnamed: 0,dataset,split,task_id,model,code,result,compiled,passed_tests,avg_test_time,loc,...,N2,vocabulary,length,calculated_length,volume,difficulty,effort,time,bugs,MI
0,mbpp,test,0,claude_3_haiku_0_shot,"def remove_occurrences(string, char):\n res...",failed: name 'remove_Occ' is not defined,True,False,,17,...,12,10,18,23.509775,59.794706,4.000000,239.178823,13.287712,0.019932,61.616668
1,mbpp,test,1,claude_3_haiku_0_shot,"def sort_matrix(matrix):\n """"""\n Sorts a...",passed,True,True,0.000157,11,...,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,100.000000
2,mbpp,test,2,claude_3_haiku_0_shot,from collections import Counter\n\ndef count_m...,failed: name 'count_common' is not defined,True,False,,14,...,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,100.000000
3,mbpp,test,3,claude_3_haiku_0_shot,"def calculate_triangular_prism_volume(length, ...",failed: name 'find_Volume' is not defined,True,False,,13,...,6,8,9,17.509775,27.000000,1.000000,27.000000,1.500000,0.009000,51.650873
4,mbpp,test,4,claude_3_haiku_0_shot,"def split_at_lowercase(text):\n """"""\n Sp...",failed: name 'split_lowerstring' is not defined,True,False,,11,...,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,100.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,mbpp,test,495,claude_3_haiku_0_shot,"def permutation_coefficient(n, k):\n """"""\n ...",passed,True,True,0.000060,27,...,28,18,42,58.529325,175.136850,7.000000,1225.957950,68.108775,0.058379,87.544796
496,mbpp,test,496,claude_3_haiku_0_shot,"def remove_words(original_list, words_to_remov...",passed,True,True,0.000085,2,...,2,3,3,2.000000,4.754888,0.500000,2.377444,0.132080,0.001585,88.288489
497,mbpp,test,497,claude_3_haiku_0_shot,"def check_common_order(list1, list2):\n """"""...",failed: name 'same_order' is not defined,True,False,,13,...,6,7,9,13.609640,25.266194,1.200000,30.319433,1.684413,0.008422,69.363638
498,mbpp,test,498,claude_3_haiku_0_shot,def average_odd(n):\n if n % 2 == 0:\n ...,failed: name 'average_Odd' is not defined,True,False,,13,...,14,13,21,35.161259,77.709234,6.000000,466.255404,25.903078,0.025903,64.545413


In [38]:
# Finally, when everything looks good save the data. Will overwrite previously
# saved data at this location. Be sure you want to do this!
# df.to_csv(RESULTS_PATH + "mbpp_test_claude_3_haiku_0_shot.csv")

#### Validate

In [19]:
dataset = "mbpp"
split = "validation"

canonical_solutions = get_canonical_solutions(dataset, split)
function_names = get_expected_function_names(dataset, split)
assert len(canonical_solutions) == len(function_names)

print("n =", len(canonical_solutions))
print(current_prompt.format(function_name=function_names[0]))

n = 90
Refactor the following Python program to a more readable, efficient, and maintainable one:
 - The given program is correct but needs improvement
 - DO NOT change the name of the program
 - DO NOT change the input or output behavior of the program (e.g. number of inputs / outputs, input / output types, etc.)
 - Put your response in a markdown code block
 - Respond with only the code block
 - Don't explain the changes made
 - If you use any packages (e.g. `os`, `re`, `sys`), don't forget to import them

Again, be careful not to change the function name! The function name should remain `find_Min_Sum`.


In [207]:
# i = 35
# raw_generation = refactor(
#     canonical_solutions[i],
#     system_prompt=current_prompt.format(function_name=function_names[i])
# )
# generation = extract_from_code_block(raw_generation)
# print(generation)

def last_occurence_char(string, char):
    try:
        return string.rindex(char) + 1
    except ValueError:
        return None


In [209]:
# WARNING: Running this cell will delete all your generations. Be sure you want to do this!
all_generations = []

In [212]:
# WARNNG: Slow step:
i = len(all_generations)
while i < len(canonical_solutions):
    print(i)

    # Call Claude API to refactor the code:
    raw_generation = refactor(
        canonical_solutions[i],
        system_prompt=current_prompt.format(function_name=function_names[i])
    )
    generation = extract_from_code_block(raw_generation)
    all_generations.append(generation)
    i += 1

    # Every 10, Save to pickle just in case
    if i % 10 == 0:
        save_pickle(all_generations, RESULTS_PATH + "mbpp_validation_claude_3_haiku_0_shot_in_progress_generations.pkl")


71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89


In [213]:
execution_ids_backup = []
execution_results_backup = []

results = bulk_evaluate(
    dataset="mbpp",
    split="validation",
    code=all_generations,
    execution_ids_backup=execution_ids_backup,
    execution_results_backup=execution_results_backup
)


Sending 90 jobs for execution...
90 jobs sent
Fetching 90 job results...
Fetching 1/90...
Fetching 2/90...
Fetching 3/90...
Fetching 4/90...
Fetching 5/90...
Fetching 6/90...
Fetching 7/90...
Fetching 8/90...
Fetching 9/90...
Fetching 10/90...
Fetching 11/90...
Fetching 12/90...
Fetching 13/90...
Fetching 14/90...
Fetching 15/90...
Fetching 16/90...
Fetching 17/90...
Fetching 18/90...
Fetching 19/90...
Fetching 20/90...
Fetching 21/90...
Fetching 22/90...
Fetching 23/90...
Fetching 24/90...
Fetching 25/90...
Fetching 26/90...
Fetching 27/90...
Fetching 28/90...
Fetching 29/90...
Fetching 30/90...
Fetching 31/90...
Fetching 32/90...
Fetching 33/90...
Fetching 34/90...
Fetching 35/90...
Fetching 36/90...
Fetching 37/90...
Fetching 38/90...
Fetching 39/90...
Fetching 40/90...
Fetching 41/90...
Fetching 42/90...
Fetching 43/90...
Fetching 44/90...
Fetching 45/90...
Fetching 46/90...
Fetching 47/90...
Fetching 48/90...
Fetching 49/90...
Fetching 50/90...
Fetching 51/90...
Fetching 52/90...


In [214]:
df = pd.DataFrame(results)
df.insert(3, "model", f"claude_3_haiku_0_shot_{current_prompt_name}")
df.insert(4, "code", all_generations)
df

Unnamed: 0,dataset,split,task_id,model,code,result,compiled,passed_tests,avg_test_time,loc,...,N2,vocabulary,length,calculated_length,volume,difficulty,effort,time,bugs,MI
0,mbpp,validation,0,claude_3_haiku_0_shot_v8_prompt,def find_Min_Sum(num):\n factors = []\n ...,passed,True,True,0.000028,9,...,12,13,18,35.161259,66.607915,5.142857,342.554991,19.030833,0.022203,65.877825
1,mbpp,validation,1,claude_3_haiku_0_shot_v8_prompt,"def flatten(nested_tuple):\n """"""Flatten a n...",passed,True,True,0.000125,14,...,2,3,3,2.000000,4.754888,0.500000,2.377444,0.132080,0.001585,70.421123
2,mbpp,validation,2,claude_3_haiku_0_shot_v8_prompt,"def add_str(test_tuple, string_to_add):\n r...",passed,True,True,0.000089,6,...,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,100.000000
3,mbpp,validation,3,claude_3_haiku_0_shot_v8_prompt,def sum_elements(tuples):\n return sum(sum(...,failed: 'int' object is not iterable,True,False,,2,...,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,100.000000
4,mbpp,validation,4,claude_3_haiku_0_shot_v8_prompt,"def modular_sum(arr, n, m):\n if n > m:\n ...",failed:,True,False,,17,...,10,11,15,27.651484,51.891474,2.857143,148.261355,8.236742,0.017297,62.047759
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,mbpp,validation,85,claude_3_haiku_0_shot_v8_prompt,def tuple_size(tuples):\n total_size = 0\n ...,failed: name 'sys' is not defined,True,False,,5,...,2,3,3,2.000000,4.754888,0.500000,2.377444,0.132080,0.001585,79.742343
86,mbpp,validation,86,claude_3_haiku_0_shot_v8_prompt,"def find_kth(arr1, arr2, m, n, k):\n if m >...",passed,True,True,0.000097,22,...,34,24,51,89.138353,233.833088,7.000000,1636.831613,90.935090,0.077944,55.226892
87,mbpp,validation,87,claude_3_haiku_0_shot_v8_prompt,"def armstrong_number(num):\n """"""\n Check...",passed,True,True,0.000028,32,...,18,15,27,44.039100,105.486046,6.000000,632.916276,35.162015,0.035162,87.127054
88,mbpp,validation,88,claude_3_haiku_0_shot_v8_prompt,def sum_average(n):\n total = n * (n + 1) /...,passed,True,True,0.000034,4,...,8,10,12,23.509775,39.863137,2.666667,106.301699,5.905650,0.013288,75.524960


In [215]:
df["passed_tests"].mean()

0.7444444444444445

In [216]:
# Finally, when everything looks good save the data. Will overwrite previously
# saved data at this location. Be sure you want to do this!
df.to_csv(RESULTS_PATH + f"mbpp_validation_claude_3_haiku_0_shot_{current_prompt_name}.csv")

In [217]:
df.query("not passed_tests")

Unnamed: 0,dataset,split,task_id,model,code,result,compiled,passed_tests,avg_test_time,loc,...,N2,vocabulary,length,calculated_length,volume,difficulty,effort,time,bugs,MI
3,mbpp,validation,3,claude_3_haiku_0_shot_v8_prompt,def sum_elements(tuples):\n return sum(sum(...,failed: 'int' object is not iterable,True,False,,2,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,100.0
4,mbpp,validation,4,claude_3_haiku_0_shot_v8_prompt,"def modular_sum(arr, n, m):\n if n > m:\n ...",failed:,True,False,,17,...,10,11,15,27.651484,51.891474,2.857143,148.261355,8.236742,0.017297,62.047759
12,mbpp,validation,12,claude_3_haiku_0_shot_v8_prompt,def check_string(input_str):\n requirements...,failed:,True,False,,13,...,13,17,20,56.105716,81.749257,2.0,163.498514,9.083251,0.02725,66.694293
16,mbpp,validation,16,claude_3_haiku_0_shot_v8_prompt,"def get_pairs_count(arr, target_sum):\n cou...",failed: get_pairs_count() takes 2 positional a...,True,False,,11,...,6,9,9,20.264663,28.529325,1.5,42.793988,2.377444,0.00951,68.590709
22,mbpp,validation,22,claude_3_haiku_0_shot_v8_prompt,"def remove_datatype(test_tuple, data_type):\n ...",failed:,True,False,,2,...,1,2,2,0.0,2.0,0.5,1.0,0.055556,0.000667,90.922018
23,mbpp,validation,23,claude_3_haiku_0_shot_v8_prompt,"def search_literal(pattern, text):\n try:\n...",failed: name 're' is not defined,True,False,,9,...,4,2,8,0.0,8.0,2.0,16.0,0.888889,0.002667,72.457231
24,mbpp,validation,24,claude_3_haiku_0_shot_v8_prompt,"def topbottom_surfacearea(radius):\n """"""\n ...",failed:,True,False,,11,...,4,6,6,10.0,15.509775,1.0,15.509775,0.861654,0.00517,61.515063
29,mbpp,validation,29,claude_3_haiku_0_shot_v8_prompt,"def find_Diff(arr):\n """"""\n Find the dif...",failed: find_Diff() takes 1 positional argumen...,True,False,,18,...,4,6,6,10.0,15.509775,1.0,15.509775,0.861654,0.00517,97.336698
34,mbpp,validation,34,claude_3_haiku_0_shot_v8_prompt,def toggle_F_and_L_bits(n):\n if n == 1:\n ...,failed:,True,False,,6,...,14,14,21,39.509775,79.954453,5.25,419.76088,23.320049,0.026651,98.431625
36,mbpp,validation,36,claude_3_haiku_0_shot_v8_prompt,def Total_Hamming_Distance(n):\n total_dist...,failed:,True,False,,6,...,10,8,15,17.509775,45.0,1.666667,75.0,4.166667,0.015,71.046112


In [218]:
task_id = 89
print(canonical_solutions[task_id])
print("-"*40)
print(all_generations[task_id])

# Write a python function to check whether the given number is even or not using bitwise operator.
def is_Even(n) : 
    if (n^1 == n+1) :
        return True; 
    else :
        return False; 
----------------------------------------
def is_even(num: int) -> bool:
    """
    Checks whether the given number is even or not using bitwise operator.

    Args:
        num (int): The number to be checked.

    Returns:
        bool: True if the number is even, False otherwise.
    """
    return (num & 1) == 0


#### Train

In [20]:
dataset = "mbpp"
split = "train"

canonical_solutions = get_canonical_solutions(dataset, split)
function_names = get_expected_function_names(dataset, split)
assert len(canonical_solutions) == len(function_names)

print("n =", len(canonical_solutions))
print(current_prompt.format(function_name=function_names[0]))

n = 374
Refactor the following Python program to a more readable, efficient, and maintainable one:
 - The given program is correct but needs improvement
 - DO NOT change the name of the program
 - DO NOT change the input or output behavior of the program (e.g. number of inputs / outputs, input / output types, etc.)
 - Put your response in a markdown code block
 - Respond with only the code block
 - Don't explain the changes made
 - If you use any packages (e.g. `os`, `re`, `sys`), don't forget to import them

Again, be careful not to change the function name! The function name should remain `max_chain_length`.


In [21]:
print(canonical_solutions[370])
print("---")
print(function_names[370])

# Write a function to find the maximum number of segments of lengths a, b and c that can be formed from n.
def maximum_segments(n, a, b, c) : 
	dp = [-1] * (n + 10) 
	dp[0] = 0
	for i in range(0, n) : 
		if (dp[i] != -1) : 
			if(i + a <= n ): 
				dp[i + a] = max(dp[i] + 1, 
							dp[i + a]) 
			if(i + b <= n ): 
				dp[i + b] = max(dp[i] + 1, 
							dp[i + b]) 
			if(i + c <= n ): 
				dp[i + c] = max(dp[i] + 1, 
							dp[i + c]) 
	return dp[n]
---
maximum_segments


In [23]:
# WARNING: Running this cell will delete all your generations. Be sure you want to do this!
all_generations = []

In [24]:
# WARNNG: Slow step:
i = len(all_generations)
while i < len(canonical_solutions):
    print(i)

    # Call Claude API to refactor the code:
    raw_generation = refactor(
        canonical_solutions[i],
        system_prompt=current_prompt.format(function_name=function_names[i])
    )
    generation = extract_from_code_block(raw_generation)
    all_generations.append(generation)
    i += 1

    # Every 10, Save to pickle just in case
    if i % 10 == 0:
        save_pickle(all_generations, RESULTS_PATH + f"{dataset}_{split}_claude_3_haiku_0_shot_in_progress_generations.pkl")


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [None]:
execution_ids_backup = []
execution_results_backup = []

results = bulk_evaluate(
    dataset=dataset,
    split=split,
    code=all_generations,
    execution_ids_backup=execution_ids_backup,
    execution_results_backup=execution_results_backup
)


In [53]:
df = pd.DataFrame(results)
df.insert(3, "model", f"claude_3_haiku_0_shot_{current_prompt_name}")
df.insert(4, "code", all_generations)
df

Unnamed: 0,dataset,split,task_id,model,code,result,avg_test_time,passed_tests,compiled,loc,...,N2,vocabulary,length,calculated_length,volume,difficulty,effort,time,bugs,MI
0,mbpp,train,0,claude_3_haiku_0_shot_v4_prompt,"class Pair:\n def __init__(self, a, b):\n ...",passed,0.000109,True,True,18,...,12,16,18,49.663388,72.000000,2.727273,196.363636,10.909091,0.024000,61.051817
1,mbpp,train,1,claude_3_haiku_0_shot_v4_prompt,def first_repeated_char(string):\n char_cou...,passed,0.000030,True,True,7,...,2,3,3,2.000000,4.754888,0.500000,2.377444,0.132080,0.001585,76.420208
2,mbpp,train,2,claude_3_haiku_0_shot_v4_prompt,"def get_ludic(n):\n ludics = list(range(1, ...",failed:,,False,True,11,...,14,11,21,28.754888,72.648064,2.625000,190.701168,10.594509,0.024216,63.847273
3,mbpp,train,3,claude_3_haiku_0_shot_v4_prompt,"def reverse_words(input_string):\n """"""\n ...",passed,0.000032,True,True,14,...,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,100.000000
4,mbpp,train,4,claude_3_haiku_0_shot_v4_prompt,def is_prime(num):\n if num < 2:\n r...,failed: name 'prime_num' is not defined,,False,True,7,...,10,13,15,35.609640,55.506596,3.125000,173.458112,9.636562,0.018502,68.813126
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
369,mbpp,train,369,claude_3_haiku_0_shot_v4_prompt,"def min_of_two(a, b):\n return min(a, b)",passed,0.000030,True,True,2,...,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,100.000000
370,mbpp,train,370,claude_3_haiku_0_shot_v4_prompt,"def maximum_segments(n, a, b, c):\n dp = [0...",failed:,,False,True,7,...,30,19,45,66.603359,191.156738,4.000000,764.626952,42.479275,0.063719,68.105859
371,mbpp,train,371,claude_3_haiku_0_shot_v4_prompt,"def concatenate_nested(test_tup1, test_tup2):\...",failed:,,False,True,2,...,2,3,3,2.000000,4.754888,0.500000,2.377444,0.132080,0.001585,88.557495
372,mbpp,train,372,claude_3_haiku_0_shot_v4_prompt,"def left_rotate(string, rotation_count):\n ...",passed,0.000031,True,True,2,...,2,3,3,2.000000,4.754888,0.500000,2.377444,0.132080,0.001585,84.716246


In [54]:
df["passed_tests"].mean()

0.7513368983957219

In [55]:
# Finally, when everything looks good save the data. Will overwrite previously
# saved data at this location. Be sure you want to do this!
df.to_csv(RESULTS_PATH + f"{dataset}_{split}_claude_3_haiku_0_shot_{current_prompt_name}.csv")

## Few-Shot

### HumanEval

In [30]:
dataset = "openai_humaneval"
split = "test"
model = "claude_3_haiku_few_shot"

canonical_solutions = get_canonical_solutions(dataset, split)
function_names = get_expected_function_names(dataset, split)

In [9]:
print("Expected =", len(canonical_solutions))
selected_refactoring_rules = select_refactoring_rules(canonical_solutions)

Expected = 164
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163


In [16]:
extraced_rules = list(map(extract_rule, selected_refactoring_rules))

In [29]:
all_generations = generate_output_with_rule(canonical_solutions, extraced_rules, function_names)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163


In [33]:
execution_ids_backup = []
execution_results_backup = []

results = bulk_evaluate(
    dataset=dataset,
    split=split,
    code=all_generations,
    execution_ids_backup=execution_ids_backup,
    execution_results_backup=execution_results_backup
)


Sending 164 jobs for execution...
164 jobs sent
Fetching 164 job results...
Fetching 1/164...
Fetching 2/164...
Fetching 3/164...
Fetching 4/164...
Fetching 5/164...
Fetching 6/164...
Fetching 7/164...
Fetching 8/164...
Fetching 9/164...
Fetching 10/164...
Fetching 11/164...
Fetching 12/164...
Fetching 13/164...
Fetching 14/164...
Fetching 15/164...
Fetching 16/164...
Fetching 17/164...
Fetching 18/164...
Fetching 19/164...
Fetching 20/164...
Fetching 21/164...
Fetching 22/164...
Fetching 23/164...
Fetching 24/164...
Fetching 25/164...
Fetching 26/164...
Fetching 27/164...
Fetching 28/164...
Fetching 29/164...
Fetching 30/164...
Fetching 31/164...
Fetching 32/164...
Fetching 33/164...
Fetching 34/164...
Fetching 35/164...
Fetching 36/164...
Fetching 37/164...
Fetching 38/164...
Fetching 39/164...
Fetching 40/164...
Fetching 41/164...
Fetching 42/164...
Fetching 43/164...
Fetching 44/164...
Fetching 45/164...
Fetching 46/164...
Fetching 47/164...
Fetching 48/164...
Fetching 49/164...
Fe

In [34]:
df = pd.DataFrame(results)
df.insert(3, "model", model)
df.insert(4, "code", all_generations)
df

Unnamed: 0,dataset,split,task_id,model,code,result,avg_test_time,passed_tests,compiled,loc,...,N2,vocabulary,length,calculated_length,volume,difficulty,effort,time,bugs,MI
0,openai_humaneval,test,0,claude_3_haiku_few_shot,from typing import List\n\ndef has_close_eleme...,passed,0.000185,True,True,15,...,6,9,9,20.264663,28.529325,1.500000,42.793988,2.377444,0.009510,95.214357
1,openai_humaneval,test,1,claude_3_haiku_few_shot,from typing import List\n\ndef separate_paren_...,passed,0.000088,True,True,26,...,10,9,15,20.264663,47.548875,2.500000,118.872188,6.604010,0.015850,89.398652
2,openai_humaneval,test,2,claude_3_haiku_few_shot,import math\n\ndef truncate_number(number: flo...,passed,0.000063,True,True,13,...,2,3,3,2.000000,4.754888,0.500000,2.377444,0.132080,0.001585,74.980907
3,openai_humaneval,test,3,claude_3_haiku_few_shot,from typing import List\n\ndef below_zero(oper...,failed:,,False,True,5,...,4,6,6,10.000000,15.509775,1.000000,15.509775,0.861654,0.005170,78.261044
4,openai_humaneval,test,4,claude_3_haiku_few_shot,from typing import List\n\n\ndef mean_absolute...,passed,0.000092,True,True,17,...,8,10,12,24.406372,39.863137,1.714286,68.336807,3.796489,0.013288,92.200729
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159,openai_humaneval,test,159,claude_3_haiku_few_shot,"def eat(number, need, remaining):\n """"""\n ...",passed,0.000120,True,True,30,...,4,6,6,10.000000,15.509775,1.000000,15.509775,0.861654,0.005170,100.000000
160,openai_humaneval,test,160,claude_3_haiku_few_shot,"def do_algebra(operator, operand):\n """"""\n ...",passed,0.000114,True,True,29,...,2,3,3,2.000000,4.754888,0.500000,2.377444,0.132080,0.001585,53.370235
161,openai_humaneval,test,161,claude_3_haiku_few_shot,"def solve(s):\n """"""You are given a string s...",passed,0.000081,True,True,22,...,2,4,4,4.000000,8.000000,1.000000,8.000000,0.444444,0.002667,95.885515
162,openai_humaneval,test,162,claude_3_haiku_few_shot,"def string_to_md5(text):\n """"""\n Given a...",failed: name 'hashlib' is not defined,,False,True,8,...,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,100.000000


In [35]:
df["compiled"].mean(), df["passed_tests"].mean()

(1.0, 0.7987804878048781)

In [36]:
# Finally, when everything looks good save the data. Will overwrite previously
# saved data at this location. Be sure you want to do this!
df.to_csv(RESULTS_PATH + f"{dataset}_{split}_{model}.csv")

### MBPP

#### Test

In [45]:
dataset = "mbpp"
split = "test"
model = "claude_3_haiku_few_shot"

canonical_solutions = get_canonical_solutions(dataset, split)
function_names = get_expected_function_names(dataset, split)
print("Expected =", len(canonical_solutions))

Expected = 500


In [46]:
selected_refactoring_rules = select_refactoring_rules(canonical_solutions)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [47]:
extraced_rules = list(map(extract_rule, selected_refactoring_rules))

In [48]:
all_generations = generate_output_with_rule(canonical_solutions, extraced_rules, function_names)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [49]:
execution_ids_backup = []
execution_results_backup = []

results = bulk_evaluate(
    dataset=dataset,
    split=split,
    code=all_generations,
    execution_ids_backup=execution_ids_backup,
    execution_results_backup=execution_results_backup
)


Sending 500 jobs for execution...
500 jobs sent
Fetching 500 job results...
Fetching 1/500...
Fetching 2/500...
Fetching 3/500...
Fetching 4/500...
Fetching 5/500...
Fetching 6/500...
Fetching 7/500...
Fetching 8/500...
Fetching 9/500...
Fetching 10/500...
Fetching 11/500...
Fetching 12/500...
Fetching 13/500...
Fetching 14/500...
Fetching 15/500...
Fetching 16/500...
Fetching 17/500...
Fetching 18/500...
Fetching 19/500...
Fetching 20/500...
Fetching 21/500...
Fetching 22/500...
Fetching 23/500...
Fetching 24/500...
Fetching 25/500...
Fetching 26/500...
Fetching 27/500...
Fetching 28/500...
Fetching 29/500...
Fetching 30/500...
Fetching 31/500...
Fetching 32/500...
Fetching 33/500...
Fetching 34/500...
Fetching 35/500...
Fetching 36/500...
Fetching 37/500...
Fetching 38/500...
Fetching 39/500...
Fetching 40/500...
Fetching 41/500...
Fetching 42/500...
Fetching 43/500...
Fetching 44/500...
Fetching 45/500...
Fetching 46/500...
Fetching 47/500...
Fetching 48/500...
Fetching 49/500...
Fe

In [50]:
df = pd.DataFrame(results)
df.insert(3, "model", model)
df.insert(4, "code", all_generations)
df

Unnamed: 0,dataset,split,task_id,model,code,result,avg_test_time,passed_tests,compiled,loc,...,N2,vocabulary,length,calculated_length,volume,difficulty,effort,time,bugs,MI
0,mbpp,test,0,claude_3_haiku_few_shot,"def remove_Occ(s, ch):\n result = list(s)\n...",failed: list index out of range,,False,True,12,...,3,5,5,6.754888,11.609640,1.000000,11.609640,0.644980,0.003870,68.464892
1,mbpp,test,1,claude_3_haiku_few_shot,def sort_matrix(matrix):\n return sorted(ma...,passed,0.000129,True,True,2,...,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,100.000000
2,mbpp,test,2,claude_3_haiku_few_shot,def count_common(words):\n word_counts = Co...,failed: name 'Counter' is not defined,,False,True,3,...,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,100.000000
3,mbpp,test,3,claude_3_haiku_few_shot,"def find_Volume(length, base, height):\n vo...",failed:,,False,True,3,...,6,8,9,17.509775,27.000000,1.000000,27.000000,1.500000,0.009000,79.435163
4,mbpp,test,4,claude_3_haiku_few_shot,def split_lowerstring(text):\n return [matc...,failed: name 're' is not defined,,False,True,2,...,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,100.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,mbpp,test,495,claude_3_haiku_few_shot,"def permutation_coefficient(n, k):\n P = [[...",passed,0.000076,True,True,11,...,24,15,36,44.828921,140.648061,6.000000,843.888369,46.882687,0.046883,61.300316
496,mbpp,test,496,claude_3_haiku_few_shot,"def remove_words(list1, removewords):\n ret...",passed,0.000086,True,True,2,...,2,3,3,2.000000,4.754888,0.500000,2.377444,0.132080,0.001585,88.288489
497,mbpp,test,497,claude_3_haiku_few_shot,"def same_order(l1, l2):\n common_elements =...",passed,0.000075,True,True,5,...,8,9,12,20.264663,38.039100,2.000000,76.078200,4.226567,0.012680,73.015386
498,mbpp,test,498,claude_3_haiku_few_shot,def average_Odd(n):\n if n % 2 == 0:\n ...,passed,0.000027,True,True,11,...,11,13,17,35.609640,62.907475,3.437500,216.244446,12.013580,0.020969,66.186143


In [51]:
df["compiled"].mean(), df["passed_tests"].mean()

(1.0, 0.752)

In [52]:
# Finally, when everything looks good save the data. Will overwrite previously
# saved data at this location. Be sure you want to do this!
df.to_csv(RESULTS_PATH + f"{dataset}_{split}_{model}.csv")