## Imports

In [1]:
%load_ext autoreload
%autoreload 2

from multiprocess import Pool
import pandas as pd
import pickle

# Expects this file and eval.py to be in the same folder:
from eval import DATA, evaluate, bulk_evaluate

# Dataset handles used by the cells below; each is indexed by split name.
mbpp = DATA["mbpp"]  # splits used here: "train", "validation", and "test"
humaneval = DATA["openai_humaneval"]  # only the "test" split is used here

## Helpers

In [3]:
# Folder prefix for every saved result file. File names are appended with "+",
# so the trailing slash is required.
RESULTS_PATH = "./results/"

def save_pickle(object, to):
    """Pickle ``object`` to the file at ``to``, overwriting any existing file.

    Missing parent directories of ``to`` are created first, so saving into a
    results folder that does not exist yet no longer fails with
    ``FileNotFoundError``.

    Args:
        object: Any picklable Python object. (The name shadows the builtin; it
            is kept so existing keyword callers keep working.)
        to: Destination file path.
    """
    import os  # Local import: the notebook's import cell does not import os.

    parent = os.path.dirname(to)
    if parent:  # An empty dirname means "current directory": nothing to create.
        os.makedirs(parent, exist_ok=True)
    with open(to, "wb") as f:
        pickle.dump(object, f)
    
def load_pickle(from_):
    """Read the file at ``from_`` and return the object pickled inside it.

    Only load files you trust: unpickling can execute arbitrary code.
    """
    with open(from_, mode="rb") as handle:
        payload = pickle.load(handle)
    return payload

def check_results_for_errors(results, result_ids):
    """Report which results are the sentinel string "ERROR".

    Prints the total number of results, then the number of errored ones.
    Results and IDs are paired positionally with ``zip``, so if the two
    differ in length the surplus items of the longer one are ignored.

    Args:
        results: Iterable of evaluation results.
        result_ids: Iterable of identifiers, one per result.

    Returns:
        The IDs whose paired result equals "ERROR", in input order.
    """
    print(len(results))
    failed_ids = []
    for task_id, outcome in zip(result_ids, results):
        if outcome == "ERROR":
            failed_ids.append(task_id)
    print(len(failed_ids))
    return failed_ids


## HumanEval

In [None]:
# Build one complete program per HumanEval test task: the task's prompt
# immediately followed by its canonical solution.
canonical_solutions = [
    task["prompt"] + task["canonical_solution"] for task in humaneval["test"]
]
print(len(canonical_solutions))

# Evaluate every program. bulk_evaluate comes from eval.py; num_processes
# presumably sets the number of worker processes -- confirm in eval.py.
results = bulk_evaluate(
    dataset="openai_humaneval",
    split="test",
    code=canonical_solutions,
    num_processes=10
)
print(len(results))
# Persist the raw results before any post-processing.
save_pickle(results, RESULTS_PATH + "openai_humaneval_test_results_1.pkl")

# Result IDs are simply the positions in `results`.
errors = check_results_for_errors(results, result_ids=range(len(results)))
errors  # If non-empty (e.g. from timeouts), some evaluations may need a re-run

In [55]:
# Assumes `errors` from the evaluation cell was empty, i.e. no entry of
# `results` is the "ERROR" sentinel.
df = pd.DataFrame(results)
# Add two columns at positions 3 and 4 (after the first three columns): a
# constant "model" label and the evaluated source code for each row.
df.insert(3, "model", "canonical_solution")
df.insert(4, "code", canonical_solutions)
df  # Displayed as the cell output

Unnamed: 0,dataset,split,task_id,model,code,result,avg_test_time,passed_tests,compiled,loc,...,N2,vocabulary,length,calculated_length,volume,difficulty,effort,time,bugs,MI
0,openai_humaneval,test,0,canonical_solution,from typing import List\n\n\ndef has_close_ele...,passed,0.000203,True,True,19,...,6,9,9,20.264663,28.529325,1.500000,42.793988,2.377444,0.009510,95.605923
1,openai_humaneval,test,1,canonical_solution,from typing import List\n\n\ndef separate_pare...,passed,0.000093,True,True,28,...,10,9,15,20.264663,47.548875,2.500000,118.872188,6.604010,0.015850,89.398652
2,openai_humaneval,test,2,canonical_solution,\n\ndef truncate_number(number: float) -> floa...,passed,0.000058,True,True,12,...,2,3,3,2.000000,4.754888,0.500000,2.377444,0.132080,0.001585,65.110353
3,openai_humaneval,test,3,canonical_solution,from typing import List\n\n\ndef below_zero(op...,passed,0.000130,True,True,20,...,4,5,6,6.754888,13.931569,1.333333,18.575425,1.031968,0.004644,96.412688
4,openai_humaneval,test,4,canonical_solution,from typing import List\n\n\ndef mean_absolute...,passed,0.000101,True,True,14,...,6,8,9,17.509775,27.000000,1.000000,27.000000,1.500000,0.009000,80.591775
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159,openai_humaneval,test,159,canonical_solution,"\ndef eat(number, need, remaining):\n """"""\n...",passed,0.000118,True,True,34,...,8,6,12,9.509775,31.019550,4.000000,124.078200,6.893233,0.010340,43.507133
160,openai_humaneval,test,160,canonical_solution,"\ndef do_algebra(operator, operand):\n """"""\...",passed,0.000122,True,True,30,...,4,5,6,8.000000,13.931569,0.500000,6.965784,0.386988,0.004644,50.101277
161,openai_humaneval,test,161,canonical_solution,"\ndef solve(s):\n """"""You are given a string...",passed,0.000089,True,True,26,...,7,9,11,20.264663,34.869175,1.750000,61.021056,3.390059,0.011623,90.712125
162,openai_humaneval,test,162,canonical_solution,"\ndef string_to_md5(text):\n """"""\n Given...",passed,0.000063,True,True,10,...,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,100.000000


In [None]:
# Finally, when everything looks good, save the data. This overwrites any data
# previously saved at this location, so be sure you want to do this!
# df.to_csv(RESULTS_PATH + "openai_humaneval_test_cannonical.csv")

## MBPP Test

In [None]:
# Build one program per MBPP test task: the task description as a leading
# "# " comment line, then the reference code.
# NOTE(review): assumes task["text"] is a single line; a multi-line description
# would leave its later lines uncommented -- confirm against the dataset.
canonical_solutions = [
    "# " + task["text"] + "\n" + task["code"] for task in mbpp["test"]
]
print(len(canonical_solutions))

# Evaluate every program. bulk_evaluate comes from eval.py; num_processes
# presumably sets the number of worker processes -- confirm in eval.py.
results = bulk_evaluate(
    dataset="mbpp",
    split="test",
    code=canonical_solutions,
    num_processes=10
)
print(len(results))
# Persist the raw results before any post-processing.
save_pickle(results, RESULTS_PATH + "mbpp_test_results_1.pkl")

# Result IDs are simply the positions in `results`.
errors = check_results_for_errors(results, result_ids=range(len(results)))
errors  # If non-empty (e.g. from timeouts), some evaluations may need a re-run

In [107]:
# Assumes `errors` from the evaluation cell was empty, i.e. no entry of
# `results` is the "ERROR" sentinel.
df = pd.DataFrame(results)
# Add two columns at positions 3 and 4 (after the first three columns): a
# constant "model" label and the evaluated source code for each row.
df.insert(3, "model", "canonical_solution")
df.insert(4, "code", canonical_solutions)
df  # Displayed as the cell output

Unnamed: 0,dataset,split,task_id,model,code,result,avg_test_time,passed_tests,compiled,loc,...,N2,vocabulary,length,calculated_length,volume,difficulty,effort,time,bugs,MI
0,mbpp,test,0,canonical_solution,# Write a python function to remove first and ...,passed,0.000058,True,True,11,...,16,14,25,41.219281,95.183873,3.200000,304.588394,16.921577,0.031728,79.749780
1,mbpp,test,1,canonical_solution,# Write a function to sort a given matrix in a...,passed,0.000129,True,True,4,...,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,100.000000
2,mbpp,test,2,canonical_solution,# Write a function to count the most common wo...,passed,0.000197,True,True,6,...,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,100.000000
3,mbpp,test,3,canonical_solution,# Write a python function to find the volume o...,passed,0.000033,True,True,3,...,6,8,9,17.509775,27.000000,1.000000,27.000000,1.500000,0.009000,100.000000
4,mbpp,test,4,canonical_solution,# Write a function to split a string at lowerc...,passed,0.000057,True,True,4,...,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,100.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,mbpp,test,495,canonical_solution,# Write a function to calculate the permutatio...,passed,0.000072,True,True,14,...,24,15,36,44.828921,140.648061,6.000000,843.888369,46.882687,0.046883,77.194341
496,mbpp,test,496,canonical_solution,# Write a function to remove specific words fr...,passed,0.000102,True,True,6,...,2,3,3,2.000000,4.754888,0.500000,2.377444,0.132080,0.001585,100.000000
497,mbpp,test,497,canonical_solution,# Write a function to check if the common elem...,passed,0.000079,True,True,6,...,8,9,12,20.264663,38.039100,2.000000,76.078200,4.226567,0.012680,96.396081
498,mbpp,test,498,canonical_solution,# Write a python function to find the average ...,passed,0.000025,True,True,12,...,15,14,23,39.302969,87.569163,7.500000,656.768724,36.487151,0.029190,80.381523


In [108]:
# Finally, when everything looks good, save the data. This overwrites any data
# previously saved at this location, so be sure you want to do this!
# df.to_csv(RESULTS_PATH + "mbpp_test_cannonical.csv")

## MBPP Train

In [None]:
# Build one program per MBPP train task: the task description as a leading
# "# " comment line, then the reference code.
# NOTE(review): assumes task["text"] is a single line; a multi-line description
# would leave its later lines uncommented -- confirm against the dataset.
canonical_solutions = [
    "# " + task["text"] + "\n" + task["code"] for task in mbpp["train"]
]
print(len(canonical_solutions))

# Evaluate every program. bulk_evaluate comes from eval.py; num_processes
# presumably sets the number of worker processes -- confirm in eval.py.
results = bulk_evaluate(
    dataset="mbpp",
    split="train",
    code=canonical_solutions,
    num_processes=10
)
print(len(results))
# Persist the raw results before any post-processing.
save_pickle(results, RESULTS_PATH + "mbpp_train_results_1.pkl")

# Result IDs are simply the positions in `results`.
errors = check_results_for_errors(results, result_ids=range(len(results)))
errors  # If non-empty (e.g. from timeouts), some evaluations may need a re-run

In [117]:
# Assumes `errors` from the evaluation cell was empty, i.e. no entry of
# `results` is the "ERROR" sentinel.
df = pd.DataFrame(results)
# Add two columns at positions 3 and 4 (after the first three columns): a
# constant "model" label and the evaluated source code for each row.
df.insert(3, "model", "canonical_solution")
df.insert(4, "code", canonical_solutions)
df  # Displayed as the cell output

Unnamed: 0,dataset,split,task_id,model,code,result,compiled,passed_tests,avg_test_time,loc,...,N2,vocabulary,length,calculated_length,volume,difficulty,effort,time,bugs,MI
0,mbpp,train,0,canonical_solution,# Write a function to find the longest chain w...,passed,True,True,0.000107,17,...,12,15,18,46.053748,70.324031,2.181818,153.434249,8.524125,0.023441,74.679196
1,mbpp,train,1,canonical_solution,# Write a python function to find the first re...,passed,True,True,0.000029,6,...,4,5,6,6.754888,13.931569,1.333333,18.575425,1.031968,0.004644,99.719576
2,mbpp,train,2,canonical_solution,# Write a function to get a lucid number small...,passed,True,True,0.000084,14,...,14,12,21,32.000000,75.284213,3.500000,263.494744,14.638597,0.025095,77.915787
3,mbpp,train,3,canonical_solution,# Write a function to reverse words in a given...,passed,True,True,0.000032,3,...,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,100.000000
4,mbpp,train,4,canonical_solution,# Write a function to check if the given integ...,passed,True,True,0.000026,10,...,8,10,12,23.509775,39.863137,2.666667,106.301699,5.905650,0.013288,86.066773
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
369,mbpp,train,369,canonical_solution,# Write a function to find minimum of two numb...,passed,True,True,0.000029,5,...,2,3,3,2.000000,4.754888,0.500000,2.377444,0.132080,0.001585,100.000000
370,mbpp,train,370,canonical_solution,# Write a function to find the maximum number ...,passed,True,True,0.000055,16,...,38,22,58,81.096509,258.647034,5.588235,1445.380483,80.298916,0.086216,73.665796
371,mbpp,train,371,canonical_solution,# Write a function to concatenate the given tw...,passed,True,True,0.000063,4,...,2,3,3,2.000000,4.754888,0.500000,2.377444,0.132080,0.001585,100.000000
372,mbpp,train,372,canonical_solution,# Write a python function to left rotate the s...,passed,True,True,0.000031,4,...,2,3,3,2.000000,4.754888,0.500000,2.377444,0.132080,0.001585,100.000000


In [119]:
# Finally, when everything looks good, save the data. This overwrites any data
# previously saved at this location, so be sure you want to do this!
# df.to_csv(RESULTS_PATH + "mbpp_train_cannonical.csv")

## MBPP Validate

In [None]:
# Build one program per MBPP validation task: the task description as a
# leading "# " comment line, then the reference code.
# NOTE(review): assumes task["text"] is a single line; a multi-line description
# would leave its later lines uncommented -- confirm against the dataset.
canonical_solutions = [
    "# " + task["text"] + "\n" + task["code"] for task in mbpp["validation"]
]
print(len(canonical_solutions))

# Evaluate every program. bulk_evaluate comes from eval.py; num_processes
# presumably sets the number of worker processes -- confirm in eval.py.
results = bulk_evaluate(
    dataset="mbpp",
    split="validation",
    code=canonical_solutions,
    num_processes=10
)
print(len(results))
# Persist the raw results before any post-processing.
save_pickle(results, RESULTS_PATH + "mbpp_validation_results_1.pkl")

# Result IDs are simply the positions in `results`.
errors = check_results_for_errors(results, result_ids=range(len(results)))
errors  # If non-empty (e.g. from timeouts), some evaluations may need a re-run

In [124]:
# Assumes `errors` from the evaluation cell was empty, i.e. no entry of
# `results` is the "ERROR" sentinel.
df = pd.DataFrame(results)
# Add two columns at positions 3 and 4 (after the first three columns): a
# constant "model" label and the evaluated source code for each row.
df.insert(3, "model", "canonical_solution")
df.insert(4, "code", canonical_solutions)
df  # Displayed as the cell output

Unnamed: 0,dataset,split,task_id,model,code,result,avg_test_time,passed_tests,compiled,loc,...,N2,vocabulary,length,calculated_length,volume,difficulty,effort,time,bugs,MI
0,mbpp,validation,0,canonical_solution,# Write a python function to find minimum sum ...,passed,0.000034,True,True,11,...,16,13,24,35.161259,88.810553,6.857143,608.986651,33.832592,0.029604,81.956795
1,mbpp,validation,1,canonical_solution,# Write a function to count the element freque...,passed,0.000113,True,True,14,...,4,6,6,10.000000,15.509775,1.000000,15.509775,0.861654,0.005170,82.585354
2,mbpp,validation,2,canonical_solution,# Write a function to convert tuple into list ...,passed,0.000091,True,True,4,...,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,100.000000
3,mbpp,validation,3,canonical_solution,# Write a function to find the summation of tu...,passed,0.000048,True,True,4,...,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,100.000000
4,mbpp,validation,4,canonical_solution,# Write a function to check if there is a subs...,passed,0.000061,True,True,18,...,16,16,24,51.019550,96.000000,2.666667,256.000000,14.222222,0.032000,71.883363
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,mbpp,validation,85,canonical_solution,# Write a function to find the size of the giv...,passed,0.000111,True,True,4,...,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,100.000000
86,mbpp,validation,86,canonical_solution,# Write a function to find kth element from th...,passed,0.000102,True,True,23,...,32,18,48,59.715357,200.156400,6.153846,1231.731693,68.429538,0.066719,66.296343
87,mbpp,validation,87,canonical_solution,# Write a function to check whether the given ...,passed,0.000029,True,True,17,...,18,15,27,44.039100,105.486046,6.000000,632.916276,35.162015,0.035162,73.507300
88,mbpp,validation,88,canonical_solution,# Write a function to find sum and average of ...,passed,0.000038,True,True,7,...,6,6,9,10.000000,23.264663,1.500000,34.896994,1.938722,0.007755,95.075198


In [126]:
# Finally, when everything looks good, save the data. This overwrites any data
# previously saved at this location, so be sure you want to do this!
# df.to_csv(RESULTS_PATH + "mbpp_validation_cannonical.csv")