## Imports

In [1]:
%load_ext autoreload
%autoreload 2

import anthropic
from dotenv import load_dotenv
from multiprocess import Pool
import pandas as pd
import pickle

# Expects this file and eval.py to be in the same folder:
from eval import DATA, evaluate, bulk_evaluate

# Pull environment variables from a .env file. ANTHROPIC_API_KEY is expected to
# be provided this way (.env file in the user's home directory).
# NOTE(review): load_dotenv() is called without a path, so it relies on
# python-dotenv's default lookup -- confirm that it actually reaches that file.
load_dotenv()

# Benchmark datasets exposed by eval.py, looked up by name.
humaneval = DATA["openai_humaneval"]  # test only
mbpp = DATA["mbpp"]  # train, validation, and test

## Helpers

In [2]:

# Folder that receives the in-progress pickles (and the CSV exports in the
# commented-out save cells). Keep the trailing slash: file names are appended to
# this value with plain string concatenation.
RESULTS_PATH = "./results/"

def save_pickle(object, to):
    """Serialize ``object`` to the file at path ``to`` using pickle.

    Missing parent directories of ``to`` are created first, so a checkpoint
    written in the middle of a long run does not fail merely because the
    results folder has not been created yet.

    Args:
        object: Any picklable Python object. (The name shadows the builtin but
            is kept so existing keyword callers keep working.)
        to: Destination file path; an existing file is overwritten.
    """
    import os  # local import keeps this helper self-contained

    parent_dir = os.path.dirname(to)
    if parent_dir:  # empty when ``to`` is a bare file name
        os.makedirs(parent_dir, exist_ok=True)

    with open(to, "wb") as f:
        pickle.dump(object, f)
    
def load_pickle(from_):
    """Read the file at path ``from_`` and return the object pickled in it.

    NOTE: unpickling can execute arbitrary code, so only point this at files
    you produced yourself (e.g. the checkpoints written by this notebook).
    """
    with open(from_, mode="rb") as source:
        restored = pickle.load(source)
    return restored

def check_results_for_errors(results, result_ids):
    """Report and collect the ids whose result is the literal string "ERROR".

    Prints the number of results first, then the number of errored entries.

    Args:
        results: Sized collection of result values.
        result_ids: Ids paired positionally with ``results``.

    Returns:
        List of the ids whose paired result equals "ERROR".
    """
    print(len(results))

    errors = []
    for result_id, outcome in zip(result_ids, results):
        if outcome == "ERROR":
            errors.append(result_id)

    print(len(errors))
    return errors


## Generation Helpers

In [7]:
# Base refactoring instruction, written one sentence per literal for
# readability; the adjacent literals concatenate into a single string.
BASIC_SYSTEM_PROMPT = (
    "Refactor the given Python program to a more readable, efficient, and maintainable one. "
    "You can assume that the given program is semantically correct. "
    "Do not change the external behavior of the program, and keep the syntactic and semantic correctness. "
    "Python programs should be in a code block. "
    "Do not explain anything in natural language."
)

# Prompt actually sent to Claude: the base instruction, a blank line, and an
# extra reminder to answer with nothing but a markdown code block.
CLAUDE_PROMPT = "\n\n".join([
    BASIC_SYSTEM_PROMPT,
    "Put your response in a markdown code block. Respond with only the code block. Don't explain the changes made.",
])

def refactor(code, model="claude-3-haiku-20240307", system_prompt=CLAUDE_PROMPT, max_tokens=1000, temperature=0):
    """Ask a Claude model to refactor ``code`` and return its raw text reply.

    A new Anthropic client is created on every call.

    Args:
        code: Source code sent as the single user message.
        model: Anthropic model identifier.
        system_prompt: System prompt carrying the refactoring instructions.
        max_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature passed to the API.

    Returns:
        The text of the first content block of the response, stripped of
        leading and trailing whitespace.
    """
    user_turn = {
        "role": "user",
        "content": [{"type": "text", "text": code}],
    }
    request = dict(
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
        system=system_prompt,
        messages=[user_turn],
    )

    response = anthropic.Anthropic().messages.create(**request)
    first_block = response.content[0]
    return first_block.text.strip()

def extract_from_code_block(raw_generation):
    """Return the code inside a response that consists of one fenced block.

    The response must start with a ``` fence line (optionally carrying a
    language tag) and end with a ``` fence; the first and last lines are
    dropped and everything in between is returned unchanged.

    Args:
        raw_generation: Model response, already stripped of outer whitespace.

    Returns:
        The lines between the opening and closing fence, joined with "\n".

    Raises:
        AssertionError: If the response does not start or end with a fence, or
            if both fences sit on the same line (previously that case silently
            produced an empty string and the code was lost).
    """
    fence = "```"
    lines = raw_generation.split("\n")

    problem = None
    if not raw_generation.startswith(fence):
        problem = "response does not start with a ``` fence"
    elif not raw_generation.endswith(fence):
        problem = "response does not end with a ``` fence"
    elif len(lines) < 2:
        problem = "opening and closing fence are on the same line"

    if problem is not None:
        # Raised explicitly rather than via ``assert`` so the check survives
        # ``python -O`` and the failure shows what the model actually returned.
        raise AssertionError(f"{problem}: {raw_generation[:80]!r}")

    # Simply remove first and last line
    return "\n".join(lines[1:-1])


## HumanEval Generation

In [10]:
# One complete reference program per HumanEval test task: the task's "prompt"
# text immediately followed by its "canonical_solution".
canonical_solutions = [
    problem["prompt"] + problem["canonical_solution"]
    for problem in humaneval["test"]
]

In [11]:
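# WARNING: Uncommenting and running this cell will delete all your generations /
#   evals. Be sure you want to do this! (On a fresh kernel it has to be run once
#   so that all_generations and all_evals exist before the loop below.)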
# all_generations = []
# all_evals = []

In [None]:

# WARNING: Very slow step (one Claude API call plus one evaluation per task).
# Resumes after the last completed task, so it can be re-run after an
# interruption. all_generations / all_evals must already exist in the kernel
# (see the reset cell above) and are shared with the MBPP section, so they must
# hold HumanEval results only.
assert len(all_generations) == len(all_evals)

generations_checkpoint = RESULTS_PATH + "openai_humaneval_test_claude_3_haiku_0_shot_in_progress_generations.pkl"
evals_checkpoint = RESULTS_PATH + "openai_humaneval_test_claude_3_haiku_0_shot_in_progress_evals.pkl"

i = len(all_evals)
while i < len(canonical_solutions):
    print(i)

    # Call Claude API to refactor the code:
    raw_generation = refactor(canonical_solutions[i])
    generation = extract_from_code_block(raw_generation)

    # To avoid hitting the Claude API rate limit, buy time by doing the eval
    # immediately after generation:
    evaluation = evaluate(
        dataset="openai_humaneval",
        split="test",
        task_id=i,
        code=generation,
    )

    # Record the pair only once both steps succeeded; if evaluate() raises, the
    # two lists keep equal lengths and the assert above still allows a resume.
    all_generations.append(generation)
    all_evals.append(evaluation)
    i += 1

    # Every 10, Save to pickle just in case -- and also after the final task,
    # so a trailing partial batch is checkpointed too.
    if i % 10 == 0 or i == len(canonical_solutions):
        save_pickle(all_generations, generations_checkpoint)
        save_pickle(all_evals, evals_checkpoint)


In [69]:
# Turn the per-task evaluation records into one table, then add two columns at
# fixed positions: a constant model label and the refactored code that was
# evaluated for each task.
df = pd.DataFrame(all_evals)
df.insert(loc=3, column="model", value="claude_3_haiku_0_shot")
df.insert(loc=4, column="code", value=all_generations)
df

Unnamed: 0,dataset,split,task_id,model,code,result,avg_test_time,passed_tests,compiled,loc,...,N2,vocabulary,length,calculated_length,volume,difficulty,effort,time,bugs,MI
0,openai_humaneval,test,0,claude_3_haiku_0_shot,from typing import List\n\ndef has_close_eleme...,passed,0.000200,True,True,18,...,6,9,9,20.264663,28.529325,1.50,42.793988,2.377444,0.009510,92.948976
1,openai_humaneval,test,1,claude_3_haiku_0_shot,from typing import List\n\ndef separate_paren_...,passed,0.000097,True,True,33,...,5,6,8,10.000000,20.679700,1.25,25.849625,1.436090,0.006893,91.833863
2,openai_humaneval,test,2,claude_3_haiku_0_shot,def truncate_number(number: float) -> float:\n...,passed,0.000065,True,True,15,...,2,3,3,2.000000,4.754888,0.50,2.377444,0.132080,0.001585,55.567861
3,openai_humaneval,test,3,claude_3_haiku_0_shot,from typing import List\n\ndef below_zero(oper...,passed,0.000135,True,True,11,...,4,6,6,10.000000,15.509775,1.00,15.509775,0.861654,0.005170,71.694386
4,openai_humaneval,test,4,claude_3_haiku_0_shot,from typing import List\n\ndef calculate_mean_...,failed: name 'mean_absolute_deviation' is not ...,,False,True,17,...,6,8,9,17.509775,27.000000,1.00,27.000000,1.500000,0.009000,87.277819
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159,openai_humaneval,test,159,claude_3_haiku_0_shot,"def eat(number_eaten, needed, remaining):\n ...",failed: Error,,False,True,15,...,4,6,6,10.000000,15.509775,1.00,15.509775,0.861654,0.005170,77.260164
160,openai_humaneval,test,160,claude_3_haiku_0_shot,"def do_algebra(operator, operand):\n """"""\n ...",passed,0.000112,True,True,23,...,10,15,15,44.828921,58.603359,2.50,146.508397,8.139355,0.019534,85.425318
161,openai_humaneval,test,161,claude_3_haiku_0_shot,"def solve(s):\n """"""\n Reverses the case ...",passed,0.000084,True,True,14,...,2,4,4,4.000000,8.000000,1.00,8.000000,0.444444,0.002667,97.852090
162,openai_humaneval,test,162,claude_3_haiku_0_shot,import hashlib\n\ndef string_to_md5(text: str)...,passed,0.000064,True,True,12,...,1,2,2,0.000000,2.000000,0.50,1.000000,0.055556,0.000667,100.000000


In [71]:
# Finally, when everything looks good save the data. Will overwrite previously
# saved data at this location. Be sure you want to do this!
# df.to_csv(RESULTS_PATH + "openai_humaneval_test_claude_3_haiku_0_shot.csv")

## MBPP Generation

In [12]:
# One reference program per MBPP test task: the task's "text" as a leading
# "# " comment line, followed by its "code".
# NOTE: this rebinds canonical_solutions, which the HumanEval section also uses.
canonical_solutions = [
    "".join(("# ", problem["text"], "\n", problem["code"]))
    for problem in mbpp["test"]
]

In [13]:
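# WARNING: Uncommenting and running this cell will delete all your generations /
#   evals. Be sure you want to do this! (It is required when switching here from
#   the HumanEval section, which fills the same two lists.)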
# all_generations = []
# all_evals = []

In [None]:
# WARNING: Very slow step (one Claude API call plus one evaluation per task).
# Resumes after the last completed task, so it can be re-run after an
# interruption. all_generations / all_evals must already exist in the kernel
# (see the reset cell above) and are shared with the HumanEval section, so they
# must hold MBPP results only.
assert len(all_generations) == len(all_evals)

generations_checkpoint = RESULTS_PATH + "mbpp_test_claude_3_haiku_0_shot_in_progress_generations.pkl"
evals_checkpoint = RESULTS_PATH + "mbpp_test_claude_3_haiku_0_shot_in_progress_evals.pkl"

i = len(all_evals)
while i < len(canonical_solutions):
    print(i)

    # Call Claude API to refactor the code:
    raw_generation = refactor(canonical_solutions[i])
    generation = extract_from_code_block(raw_generation)

    # To avoid hitting the Claude API rate limit, buy time by doing the eval
    # immediately after generation:
    evaluation = evaluate(
        dataset="mbpp",
        split="test",
        task_id=i,
        code=generation,
    )

    # Record the pair only once both steps succeeded; if evaluate() raises, the
    # two lists keep equal lengths and the assert above still allows a resume.
    all_generations.append(generation)
    all_evals.append(evaluation)
    i += 1

    # Every 10, Save to pickle just in case -- and also after the final task,
    # so a trailing partial batch is checkpointed too.
    if i % 10 == 0 or i == len(canonical_solutions):
        save_pickle(all_generations, generations_checkpoint)
        save_pickle(all_evals, evals_checkpoint)


In [37]:
# Turn the per-task evaluation records into one table, then add two columns at
# fixed positions: a constant model label and the refactored code that was
# evaluated for each task.
df = pd.DataFrame(all_evals)
df.insert(loc=3, column="model", value="claude_3_haiku_0_shot")
df.insert(loc=4, column="code", value=all_generations)
df

Unnamed: 0,dataset,split,task_id,model,code,result,compiled,passed_tests,avg_test_time,loc,...,N2,vocabulary,length,calculated_length,volume,difficulty,effort,time,bugs,MI
0,mbpp,test,0,claude_3_haiku_0_shot,"def remove_occurrences(string, char):\n res...",failed: name 'remove_Occ' is not defined,True,False,,17,...,12,10,18,23.509775,59.794706,4.000000,239.178823,13.287712,0.019932,61.616668
1,mbpp,test,1,claude_3_haiku_0_shot,"def sort_matrix(matrix):\n """"""\n Sorts a...",passed,True,True,0.000157,11,...,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,100.000000
2,mbpp,test,2,claude_3_haiku_0_shot,from collections import Counter\n\ndef count_m...,failed: name 'count_common' is not defined,True,False,,14,...,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,100.000000
3,mbpp,test,3,claude_3_haiku_0_shot,"def calculate_triangular_prism_volume(length, ...",failed: name 'find_Volume' is not defined,True,False,,13,...,6,8,9,17.509775,27.000000,1.000000,27.000000,1.500000,0.009000,51.650873
4,mbpp,test,4,claude_3_haiku_0_shot,"def split_at_lowercase(text):\n """"""\n Sp...",failed: name 'split_lowerstring' is not defined,True,False,,11,...,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,100.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,mbpp,test,495,claude_3_haiku_0_shot,"def permutation_coefficient(n, k):\n """"""\n ...",passed,True,True,0.000060,27,...,28,18,42,58.529325,175.136850,7.000000,1225.957950,68.108775,0.058379,87.544796
496,mbpp,test,496,claude_3_haiku_0_shot,"def remove_words(original_list, words_to_remov...",passed,True,True,0.000085,2,...,2,3,3,2.000000,4.754888,0.500000,2.377444,0.132080,0.001585,88.288489
497,mbpp,test,497,claude_3_haiku_0_shot,"def check_common_order(list1, list2):\n """"""...",failed: name 'same_order' is not defined,True,False,,13,...,6,7,9,13.609640,25.266194,1.200000,30.319433,1.684413,0.008422,69.363638
498,mbpp,test,498,claude_3_haiku_0_shot,def average_odd(n):\n if n % 2 == 0:\n ...,failed: name 'average_Odd' is not defined,True,False,,13,...,14,13,21,35.161259,77.709234,6.000000,466.255404,25.903078,0.025903,64.545413


In [38]:
# Finally, when everything looks good save the data. Will overwrite previously
# saved data at this location. Be sure you want to do this!
# df.to_csv(RESULTS_PATH + "mbpp_test_claude_3_haiku_0_shot.csv")