In [138]:
import json
import random
import math

In [139]:
random.seed(42)
pk = 3

In [140]:
def get_statistical_significance_samples_ids(population_size, confidence_interval, error_margin):
    """
    Get the sample ids for the statistical significance test.
    """
    z_score = 0
    if confidence_interval == 0.95:
        z_score = 1.96
    elif confidence_interval == 0.99:
        z_score = 2.58
    no_top = (z_score ** 2) * 0.25
    no_bottom = error_margin ** 2
    no = no_top / no_bottom
    sample_size = math.ceil(no / (1 + no / population_size))

    # Generate random sample IDs
    sample_ids = random.sample(range(0, population_size), int(sample_size))
    
    return sample_ids

    

In [141]:
def fix_heuristic_list(heuristic_list):
    result = []
    for heuristic in heuristic_list:
        result.append(int(heuristic.replace('H', '')))
    return result

In [142]:

combined_data_path = '../Combined_Datasets/datasets_study_prompts.json'
combined_data = []

In [143]:
TARGET_DATASET = [
    {
    "Source_Path": "../datasets/HumanEval/human-eval-v2-20210705.jsonl",
    "Output_Path": "../results/HumanEval_heuristic_results.json",
    "jsonl": True,
    },
    {
    "Source_Path": "../datasets/mbxp/mbpp_release_v1.jsonl",
    "Output_Path": "../results/mbxp_python_heuristic_results.json",
    "jsonl": True,
    },
    {
    "Source_Path": "../datasets/mbxp/mbjp_release_v1.jsonl",
    "Output_Path": "../results/mbxp_java_heuristic_results.json",
    "jsonl": True,
    },
    {
    "Source_Path": "../datasets/mbxp_humaneval/HumanEval.jsonl",
    "Output_Path": "../results/mbxp_humaneval_python_heuristic_results.json",
    "jsonl": True,
    },
    {
    "Source_Path": "../datasets/mbxp_humaneval/HumanEval_java_v1.1.jsonl",
    "Output_Path": "../results/mbxp_humaneval_java_heuristic_results.json",
    "jsonl": True,
    },
    {
    "Source_Path": "../datasets/mbxp_mathqa/mathqa-test-python_v1.jsonl",
    "Output_Path": "../results/mbxp_mathqa_python_heuristic_results.json",
    "jsonl": True,
    },
    {
    "Source_Path": "../datasets/mbxp_mathqa/mathqa-test-java_v1.jsonl",
    "Output_Path": "../results/mbxp_mathqa_java_heuristic_results.json",
    "jsonl": True,
    },
    # {
    # "Source_Path": "../datasets/ODEX/en_test.jsonl",
    # "Output_Path": "../results/odex_en_heuristic_results.json",
    # "jsonl": True,
    # },
    {
    "Source_Path": "../datasets/pandasNumpyEval/offical_numpy.jsonl",
    "Output_Path": "../results/pandasNumpyEval_numpy_heuristic_results.json",
    "jsonl": True,
    },
    {
    "Source_Path": "../datasets/pandasNumpyEval/offical_pandas.jsonl",
    "Output_Path": "../results/pandasNumpyEval_pandas_heuristic_results.json",
    "jsonl": True,
    },
    # {
    # "Source_Path": "../datasets/CoderEval/CoderEval4Python.json",
    # "Output_Path": "../results/CoderEval4Python__heuristic.json",
    # "jsonl": False,
    # },
    # {
    # "Source_Path": "../datasets/MCoNaLa/test/flores101/es_test_to_en.json",
    # "Output_Path": "../results/MCoNaLa_es_test_to_en__heuristic.json",
    # "jsonl": False,
    # },
    # {
    # "Source_Path": "../datasets/MCoNaLa/test/flores101/ja_test_to_en.json",
    # "Output_Path": "../results/MCoNaLa_ja_test_to_en__heuristic.json",
    # "jsonl": False,
    # },
    # {
    # "Source_Path": "../datasets/MCoNaLa/test/flores101/ru_test_to_en.json",
    # "Output_Path": "../results/MCoNaLa_ru_test_to_en__heuristic.json",
    # "jsonl": False,
    # },
    {
    "Source_Path": "../datasets/TorchDataEval/real_beatnum_eval_v3_human_labelled.jsonl",
    "Output_Path": "../results/real_beatnum_eval_v3_human_labelled_heuristic.json",
    "jsonl": True,
    },
    {
    "Source_Path": "../datasets/TorchDataEval/real_monkey_eval_v3_human_labelled.jsonl",
    "Output_Path": "../results/real_monkey_eval_v3_human_labelled_heuristic.json",
    "jsonl": True,
    },
    {
    "Source_Path": "../datasets/TorchDataEval/real_torchdata_eval_v3_human_labelled.jsonl",
    "Output_Path": "../results/real_torchdata_eval_v3_human_labelled_heuristic.json",
    "jsonl": True,
    },
    {
    "Source_Path": "../datasets/TorchDataEval/real_torchdata_eval_v3_human_labelled_make_sense.jsonl",
    "Output_Path": "../results/real_torchdata_eval_v3_human_labelled_make_sense_heuristic.json",
    "jsonl": True,
    },
    # {
    # "Source_Path": "../datasets/CodeComplex/extend_data.jsonl",
    # "Output_Path": "../results/CodeComplex_extend_data_heuristic.json",
    # "jsonl": True,
    # },
    # {
    # "Source_Path": "../datasets/CodeComplex/new_data.jsonl",
    # "Output_Path": "../results/CodeComplex_new_data_heuristic.json",
    # "jsonl": True,
    # },
    {
    "Source_Path": "../datasets/HumanEval-Infilling/HumanEval-MultiLineInfilling.jsonl",
    "Output_Path": "../results/HumanEval-MultiLineInfilling_heuristic.json",
    "jsonl": True,
    },
    # {
    # "Source_Path": "../datasets/JigsawDataset/PandasEval1.json",
    # "Output_Path": "../results/JigsawDataset_pandas_eval1_heuristic.json",
    # "jsonl": False,
    # },
    # {
    # "Source_Path": "../datasets/JigsawDataset/PandasEval2.json",
    # "Output_Path": "../results/JigsawDataset_pandas_eval2_heuristic.json",
    # "jsonl": False,
    # },
    {
    "Source_Path": "../datasets/MBPP/sanitized-mbpp.json",
    "Output_Path": "../results/sanitized-mbpp_heuristic.json",
    "jsonl": False,
    },
]

In [144]:
for datasets in TARGET_DATASET:
    original_data_path = datasets['Source_Path']
    result_data_path = datasets['Output_Path']
    dataset_name = "/".join(datasets['Source_Path'].split('/')[2:]).split('.')[0]
    print(dataset_name)

    original_data = []
    if datasets['jsonl']:
        with open(original_data_path) as f:
            for line in f:
                original_data.append(json.loads(line))
    else:
        with open(original_data_path) as f:
            original_data = json.load(f)

    print(len(original_data))

    result_data = []
    with open(result_data_path) as f:
        data = json.load(f)
        extra = 0
        for i in range(len(data)):
            if extra > 0:
                extra -= 1
                continue
            new_data = data[i]
            new_comment= data[i]['nl']['comment']
            new_heuristics = data[i]['Heuristic']
            for j in range(i+1, len(data)):
                if data[i]['nl']['id'] == data[j]['nl']['id'] :
                    new_comment = new_comment+"\n"+ data[j]['nl']['comment']
                    new_heuristics = new_heuristics + data[j]['Heuristic']
                    extra += 1
                else:
                    break
            
            new_heuristics = list(set(new_heuristics))
            new_data['nl']['comment'] = new_comment
            new_data['Heuristic'] = new_heuristics
            result_data.append(new_data)

    print(len(result_data))
    assert len(result_data) == len(original_data)
    sample_ids = get_statistical_significance_samples_ids(len(original_data), 0.99, 0.05)
    print(len(sample_ids))

    language = "py"
    if "java" in dataset_name.lower():
        language = "java"
    print(language)
    for i in range(len(original_data)):
        if i not in sample_ids:
            continue
        original = original_data[i]
        result = result_data[i]
        original['nl'] = result['nl']['comment'] 
        combined_data.append({
            "model": "datasets_study.prompt",
            "pk": pk,
            "fields": {
                "source_dataset": dataset_name,
                "prompt_id": result['nl']['id'],
                "content": original,
                "language": language,
                "problems": fix_heuristic_list(result['Heuristic']),
            }

        })
        pk += 1



HumanEval/human-eval-v2-20210705
164
164
132
py
mbxp/mbpp_release_v1
974
974
396
py
mbxp/mbjp_release_v1
966
966
395
py
mbxp_humaneval/HumanEval
164
164
132
py
mbxp_humaneval/HumanEval_java_v1
161
161
130
java
mbxp_mathqa/mathqa-test-python_v1
1883
1883
492
py
mbxp_mathqa/mathqa-test-java_v1
1883
1883
492
java
pandasNumpyEval/offical_numpy
101
101
88
py
pandasNumpyEval/offical_pandas
101
101
88
py
TorchDataEval/real_beatnum_eval_v3_human_labelled
101
101
88
py
TorchDataEval/real_monkey_eval_v3_human_labelled
101
101
88
py
TorchDataEval/real_torchdata_eval_v3_human_labelled
50
50
47
py
TorchDataEval/real_torchdata_eval_v3_human_labelled_make_sense
50
50
47
py
HumanEval-Infilling/HumanEval-MultiLineInfilling
5815
5815
598
py
MBPP/sanitized-mbpp
427
427
261
py


In [145]:
dataset = {
    "Source_Path": "../datasets/ODEX/en_test.jsonl",
    "Output_Path": "../results/odex_en_heuristic_results.json",
    "jsonl": True,
    }

original_data_path = dataset["Source_Path"]
result_data_path = dataset["Output_Path"]
dataset_name = "/".join(dataset["Source_Path"].split("/")[2:]).split(".")[0]
print(dataset_name)

original_data = []
if dataset["jsonl"]:
    with open(original_data_path) as f:
        for line in f:
            original_data.append(json.loads(line))
else:
    with open(original_data_path) as f:
        original_data = json.load(f)

print(len(original_data))

result_data = []
with open(result_data_path) as f:
    data = json.load(f)
    extra = 0
    for i in range(len(data)):
        if extra > 0:
            extra -= 1
            continue
        new_data = data[i]
        new_comment = data[i]["nl"]["comment"]
        new_heuristics = data[i]["Heuristic"]
        # for j in range(i + 1, len(data)):
        #     if data[i]["nl"]["id"] == data[j]["nl"]["id"]:
        #         new_comment = new_comment + "\n" + data[i]["nl"]["comment"]
        #         new_heuristics = new_heuristics + data[j]["Heuristic"]
        #         extra += 1
        #     else:
        #         break
        new_heuristics = list(set(new_heuristics))
        new_data["nl"]["comment"] = new_comment
        new_data["Heuristic"] = new_heuristics
        result_data.append(new_data)

print(len(result_data))
assert len(result_data) == len(original_data)
sample_ids = get_statistical_significance_samples_ids(len(original_data), 0.99, 0.05)
print(len(sample_ids))


for i in range(len(original_data)):
    if i not in sample_ids:
        continue
    original = original_data[i]
    result = result_data[i]
    original["nl"] = result["nl"]["comment"]
    combined_data.append(
        {
            "model": "datasets_study.prompt",
            "pk": pk,
            "fields": {
                "source_dataset": dataset_name,
                "prompt_id": result["nl"]["id"],
                "content": original,
                "language": "java",
                "problems": fix_heuristic_list(result["Heuristic"]),
            },
        }
    )
    pk += 1


ODEX/en_test
439
439
265


In [146]:
dataset = {
    "Source_Path": "../datasets/CoderEval/CoderEval4Python.json",
    "Output_Path": "../results/CoderEval4Python__heuristic.json",
    "jsonl": False,
    }

original_data_path = dataset["Source_Path"]
result_data_path = dataset["Output_Path"]
dataset_name = "/".join(dataset["Source_Path"].split("/")[2:]).split(".")[0]
print(dataset_name)

original_data = []
if dataset["jsonl"]:
    with open(original_data_path) as f:
        for line in f:
            original_data.append(json.loads(line))
else:
    with open(original_data_path) as f:
        original_data = json.load(f)["RECORDS"]

print(len(original_data))

result_data = []
with open(result_data_path) as f:
    data = json.load(f)
    extra = 0
    for i in range(len(data)):
        if extra > 0:
            extra -= 1
            continue
        new_data = data[i]
        new_comment = data[i]["nl"]["comment"]
        new_heuristics = data[i]["Heuristic"]
        for j in range(i + 1, len(data)):
            if data[i]["nl"]["id"] == data[j]["nl"]["id"]:
                new_comment = new_comment + "\n" + data[i]["nl"]["comment"]
                new_heuristics = new_heuristics + data[j]["Heuristic"]
                extra += 1
            else:
                break
        new_heuristics = list(set(new_heuristics))
        new_data["nl"]["comment"] = new_comment
        new_data["Heuristic"] = new_heuristics
        result_data.append(new_data)

print(len(result_data))
assert len(result_data) == len(original_data)
sample_ids = get_statistical_significance_samples_ids(len(original_data), 0.99, 0.05)
print(len(sample_ids))


for i in range(len(original_data)):
    if i not in sample_ids:
        continue
    original = original_data[i]
    result = result_data[i]
    original["nl"] = result["nl"]["comment"]
    combined_data.append(
        {
            "model": "datasets_study.prompt",
            "pk": pk,
            "fields": {
                "source_dataset": dataset_name,
                "prompt_id": result["nl"]["id"],
                "content": original,
                "language": "py",
                "problems": fix_heuristic_list(result["Heuristic"]),
            },
        }
    )
    pk += 1


CoderEval/CoderEval4Python
230
230
171


In [147]:
dataset = { "Source_Path": "../datasets/CodeComplex/extend_data.jsonl",
    "Output_Path": "../results/CodeComplex_extend_data_heuristic.json",
    "jsonl": True,
    }

original_data_path = dataset["Source_Path"]
result_data_path = dataset["Output_Path"]
dataset_name = "/".join(dataset["Source_Path"].split("/")[2:]).split(".")[0]
print(dataset_name)

original_data = []
if dataset["jsonl"]:
    with open(original_data_path) as f:
        for line in f:
            original_data.append(json.loads(line))
else:
    with open(original_data_path) as f:
        original_data = json.load(f)["RECORDS"]

print(len(original_data))

result_data = []
with open(result_data_path) as f:
    data = json.load(f)
    extra = 0
    for i in range(len(data)):
        if extra > 0:
            extra -= 1
            continue
        new_data = data[i]
        new_comment = data[i]["nl"]["comment"]
        new_heuristics = data[i]["Heuristic"]
        for j in range(i + 1, len(data)):
            if data[i]["nl"]["id"] == data[j]["nl"]["id"]:
                new_comment = new_comment + "\n" + data[i]["nl"]["comment"]
                new_heuristics = new_heuristics + data[j]["Heuristic"]
                extra += 1
            else:
                break
        new_heuristics = list(set(new_heuristics))
        new_data["nl"]["comment"] = new_comment
        new_data["Heuristic"] = new_heuristics
        result_data.append(new_data)

print(len(result_data))
# assert len(result_data) == len(original_data)
sample_ids = get_statistical_significance_samples_ids(len(original_data), 0.99, 0.05)
print(len(sample_ids))


for i in range(len(original_data)):
    if i not in sample_ids:
        continue
    original = original_data[i]
    for data in result_data:
        if data["nl"]["id"] == i:
            result = data
            break
    original["nl"] = result["nl"]["comment"]
    combined_data.append(
        {
            "model": "datasets_study.prompt",
            "pk": pk,
            "fields": {
                "source_dataset": dataset_name,
                "prompt_id": result["nl"]["id"],
                "content": original,
                "language": "java",
                "problems": fix_heuristic_list(result["Heuristic"]),
            },
        }
    )
    pk += 1


CodeComplex/extend_data
1003
605
401


In [148]:
dataset = {
    "Source_Path": "../datasets/CodeComplex/new_data.json",
    "Output_Path": "../results/CodeComplex_new_data_heuristic.json",
    "jsonl": False,
    }

original_data_path = dataset["Source_Path"]
result_data_path = dataset["Output_Path"]
dataset_name = "/".join(dataset["Source_Path"].split("/")[2:]).split(".")[0]
print(dataset_name)

original_data = []
if dataset["jsonl"]:
    with open(original_data_path) as f:
        for line in f:
            original_data.append(json.loads(line))
else:
    with open(original_data_path) as f:
        original_data = json.load(f)

print(len(original_data))

result_data = []
with open(result_data_path) as f:
    data = json.load(f)
    extra = 0
    for i in range(len(data)):
        if extra > 0:
            extra -= 1
            continue
        new_data = data[i]
        new_comment = data[i]["nl"]["comment"]
        new_heuristics = data[i]["Heuristic"]
        for j in range(i + 1, len(data)):
            if data[i]["nl"]["id"] == data[j]["nl"]["id"]:
                new_comment = new_comment + "\n" + data[i]["nl"]["comment"]
                new_heuristics = new_heuristics + data[j]["Heuristic"]
                extra += 1
            else:
                break
        new_heuristics = list(set(new_heuristics))
        new_data["nl"]["comment"] = new_comment
        new_data["Heuristic"] = new_heuristics
        result_data.append(new_data)

print(len(result_data))
# assert len(result_data) == len(original_data)
sample_ids = get_statistical_significance_samples_ids(len(original_data), 0.99, 0.05)
print(len(sample_ids))


for i in range(len(original_data)):
    if i not in sample_ids:
        continue
    original = original_data[i]
    for data in result_data:
        if data["nl"]["id"] == i:
            result = data
            break
    original["nl"] = result["nl"]["comment"]
    combined_data.append(
        {
            "model": "datasets_study.prompt",
            "pk": pk,
            "fields": {
                "source_dataset": dataset_name,
                "prompt_id": result["nl"]["id"],
                "content": original,
                "language": "java",
                "problems": fix_heuristic_list(result["Heuristic"]),
            },
        }
    )
    pk += 1


CodeComplex/new_data
4517
2529
581


In [149]:
dataset = {
    "Source_Path": "../datasets/JigsawDataset/PandasEval1.json",
    "Output_Path": "../results/JigsawDataset_pandas_eval1_heuristic.json",
    "jsonl": False,
    }
    # {
    # "Source_Path": "../datasets/JigsawDataset/PandasEval2.json",
    # "Output_Path": "../results/JigsawDataset_pandas_eval2_heuristic.json",
    # "jsonl": False,
    # },
original_data_path = dataset["Source_Path"]
result_data_path = dataset["Output_Path"]
dataset_name = "/".join(dataset["Source_Path"].split("/")[2:]).split(".")[0]
print(dataset_name)

original_data = []
if dataset["jsonl"]:
    with open(original_data_path) as f:
        for line in f:
            original_data.append(json.loads(line))
else:
    with open(original_data_path) as f:
        original_data = json.load(f)

print(len(original_data))

result_data = []
with open(result_data_path) as f:
    data = json.load(f)
    extra = 0
    for i in range(len(data)):
        if extra > 0:
            extra -= 1
            continue
        new_data = data[i]
        new_comment = data[i]["nl"]["comment"]
        new_heuristics = data[i]["Heuristic"]
        for j in range(i + 1, len(data)):
            if data[i]["nl"]["id"] == data[j]["nl"]["id"]:
                new_comment = new_comment + "\n" + data[i]["nl"]["comment"]
                new_heuristics = new_heuristics + data[j]["Heuristic"]
                extra += 1
            else:
                break
        new_heuristics = list(set(new_heuristics))
        new_data["nl"]["comment"] = new_comment
        new_data["Heuristic"] = new_heuristics
        result_data.append(new_data)

print(len(result_data))
assert len(result_data) == len(original_data)
sample_ids = get_statistical_significance_samples_ids(len(original_data), 0.99, 0.05)
print(len(sample_ids))


for i in range(len(original_data)):
    if i not in sample_ids:
        continue

    original = original_data[str(i)]
    result = result_data[i]
    original["nl"] = result["nl"]["comment"]
    combined_data.append(
        {
            "model": "datasets_study.prompt",
            "pk": pk,
            "fields": {
                "source_dataset": dataset_name,
                "prompt_id": result["nl"]["id"],
                "content": original,
                "language": "py",
                "problems": fix_heuristic_list(result["Heuristic"]),
            },
        }
    )
    pk += 1


JigsawDataset/PandasEval1
68
68
62


In [150]:
dataset =  {
    "Source_Path": "../datasets/JigsawDataset/PandasEval2.json",
    "Output_Path": "../results/JigsawDataset_pandas_eval2_heuristic.json",
    "jsonl": False,
    }
original_data_path = dataset["Source_Path"]
result_data_path = dataset["Output_Path"]
dataset_name = "/".join(dataset["Source_Path"].split("/")[2:]).split(".")[0]
print(dataset_name)

original_data = []
if dataset["jsonl"]:
    with open(original_data_path) as f:
        for line in f:
            original_data.append(json.loads(line))
else:
    with open(original_data_path) as f:
        original_data = json.load(f)

print(len(original_data))

result_data = []
with open(result_data_path) as f:
    data = json.load(f)
    extra = 0
    for i in range(len(data)):
        if extra > 0:
            extra -= 1
            continue
        new_data = data[i]
        new_comment = data[i]["nl"]["comment"]
        new_heuristics = data[i]["Heuristic"]
        for j in range(i + 1, len(data)):
            if data[i]["nl"]["id"] == data[j]["nl"]["id"]:
                new_comment = new_comment + "\n" + data[i]["nl"]["comment"]
                new_heuristics = new_heuristics + data[j]["Heuristic"]
                extra += 1
            else:
                break
        new_heuristics = list(set(new_heuristics))
        new_data["nl"]["comment"] = new_comment
        new_data["Heuristic"] = new_heuristics
        result_data.append(new_data)

print(len(result_data))
assert len(result_data) == len(original_data)
sample_ids = get_statistical_significance_samples_ids(len(original_data), 0.99, 0.05)
print(len(sample_ids))


for i in range(len(original_data)):
    if i not in sample_ids:
        continue

    original = original_data[str(i)]
    result = result_data[i]
    original["nl"] = result["nl"]["comment"]
    combined_data.append(
        {
            "model": "datasets_study.prompt",
            "pk": pk,
            "fields": {
                "source_dataset": dataset_name,
                "prompt_id": result["nl"]["id"],
                "content": original,
                "language": "py",
                "problems": fix_heuristic_list(result["Heuristic"]),
            },
        }
    )
    pk += 1


JigsawDataset/PandasEval2
21
21
21


In [151]:
with open(combined_data_path, 'w') as f:
    json.dump(combined_data, f, indent=4)