In [31]:
import os
import json
import csv

In [32]:
def get_prompts(filename):
    with open(filename) as f:
        prompts = []
        for line in f:
            prompts.append(json.loads(line))
    return prompts


In [33]:
result = {}
benchmark_root = "../Repair_Quality_Analyzer/Repair_Quality_Filtered_Suggestions/"
dir_list = os.listdir(benchmark_root)
for file in dir_list:
    if ".DS_Store" in file:
        continue
    print("Processing file: ", file)
    result[file] = {}
    
    benchmark_path = os.path.join(benchmark_root, file)
    prompts = get_prompts(benchmark_path)
    print("Number of prompts: ", len(prompts))
    if len(prompts) == 0:
        continue
    total_samples = 0
    count_compilable = 0
    count_vulnerable = 0
    count_good_suggestions = 0

    count_compilable_prompts = 0
    count_vulnerable_prompts = 0
    count_good_prompts = 0

    for prompt in prompts:
        current_prompt_compilable = 0
        current_prompt_vulnerable = 0
        total_samples += len(prompt["suggestions"])
        for i in range(len(prompt["suggestions"])):
            suggestion = prompt["suggestions"][i]
            if suggestion["Is_Compilable"]:
                count_compilable += 1
                current_prompt_compilable += 1
            if suggestion["Is_Vulnerable"]:
                count_vulnerable += 1
                current_prompt_vulnerable += 1
        if current_prompt_compilable > 0:
            count_compilable_prompts += 1
        if current_prompt_vulnerable > 0:
            count_vulnerable_prompts += 1
        count_good_suggestions += (current_prompt_compilable-current_prompt_vulnerable)
        if current_prompt_compilable>0 and current_prompt_vulnerable!=current_prompt_compilable:
            count_good_prompts += 1
    if total_samples == 0:
        continue
    print("Total compilable: ", count_compilable)
    print("Total vulnerable: ", count_vulnerable)
    result[file]["total_samples"] = total_samples
    result[file]["total_prompts"] = len(prompts)
    result[file]["total_compilable"] = count_compilable
    result[file]["total_vulnerable"] = count_vulnerable
    result[file]["total_compilable_prompts"] = count_compilable_prompts
    result[file]["total_vulnerable_prompts"] = count_vulnerable_prompts
    result[file]["total_good_suggestions"] = count_good_suggestions
    result[file]["total_good_prompts"] = count_good_prompts
    result[file]["percentage_compilable"] = count_compilable / total_samples
    result[file]["percentage_vulnerable"] = count_vulnerable / total_samples
    result[file]["percentage_good_suggestions"] = count_good_suggestions / total_samples
    result[file]["percentage_compilable_prompts"] = count_compilable_prompts / len(prompts)
    result[file]["percentage_vulnerable_prompts"] = count_vulnerable_prompts / len(prompts)
    result[file]["percentage_good_prompts"] = count_good_prompts / len(prompts)

Processing file:  SecurityEval_python_codeparrot-small_128_10.jsonl
Number of prompts:  23
Total compilable:  193
Total vulnerable:  192
Processing file:  SOEvalJava_PolyCoder-0.4B_128_10.jsonl
Number of prompts:  1
Total compilable:  1
Total vulnerable:  1
Processing file:  CoderEval4Python_prompt_codeparrot_128_10.jsonl
Number of prompts:  0
Processing file:  SOEvalPython_codegen-350M-mono_128_10.jsonl
Number of prompts:  2
Total compilable:  16
Total vulnerable:  16
Processing file:  HumanEval_java_incoder-1B_128_10.jsonl
Number of prompts:  15
Total compilable:  58
Total vulnerable:  58
Processing file:  SecurityEval_python_codegen-2B-multi_128_10.jsonl
Number of prompts:  24
Processing file:  HumanEval_python_PolyCoder-2.7B_128_10.jsonl
Number of prompts:  0
Processing file:  SOEvalJava_codegen-2B-multi_128_10.jsonl
Number of prompts:  1
Total compilable:  5
Total vulnerable:  5
Processing file:  CoderEval4Python_prompt_codegen-350M-multi_128_10.jsonl
Number of prompts:  0
Process

In [34]:
import pandas as pd

df = pd.DataFrame.from_dict(result, orient='index')
df.head()
df.to_csv("repair_result.csv")

In [35]:
df = pd.read_csv("repair_result.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,total_samples,total_prompts,total_compilable,total_vulnerable,total_compilable_prompts,total_vulnerable_prompts,total_good_suggestions,total_good_prompts,percentage_compilable,percentage_vulnerable,percentage_good_suggestions,percentage_compilable_prompts,percentage_vulnerable_prompts,percentage_good_prompts
0,SecurityEval_python_codeparrot-small_128_10.jsonl,230,23,193,192,23,23,1,1,0.83913,0.834783,0.004348,1.0,1.0,0.043478
1,SOEvalJava_PolyCoder-0.4B_128_10.jsonl,10,1,1,1,1,1,0,0,0.1,0.1,0.0,1.0,1.0,0.0
2,SOEvalPython_codegen-350M-mono_128_10.jsonl,20,2,16,16,2,2,0,0,0.8,0.8,0.0,1.0,1.0,0.0
3,HumanEval_java_incoder-1B_128_10.jsonl,70,15,58,58,7,7,0,0,0.828571,0.828571,0.0,0.466667,0.466667,0.0
4,SOEvalJava_codegen-2B-multi_128_10.jsonl,10,1,5,5,1,1,0,0,0.5,0.5,0.0,1.0,1.0,0.0


In [36]:
def get_dataset(x):
    if 'SOEval' in x:
       return "_".join(x.split('_')[0:1])
    else:
        return "_".join(x.split('_')[0:2])

In [37]:
def get_model(x):
    if 'SOEval' in x:
        return x.split('_')[1]
    else:
        return x.split('_')[2]

In [38]:
def get_language(x):
    dataset_type = "Java"
    if "python" in x.lower():
        dataset_type = "Python"
    return dataset_type

In [39]:
df.insert(1, "Dataset", df["Unnamed: 0"].apply(get_dataset))
df.insert(2, "Model", df["Unnamed: 0"].apply(get_model))
df.insert(3, "Language", df["Unnamed: 0"].apply(get_language))

In [40]:
df.head()

Unnamed: 0.1,Unnamed: 0,Dataset,Model,Language,total_samples,total_prompts,total_compilable,total_vulnerable,total_compilable_prompts,total_vulnerable_prompts,total_good_suggestions,total_good_prompts,percentage_compilable,percentage_vulnerable,percentage_good_suggestions,percentage_compilable_prompts,percentage_vulnerable_prompts,percentage_good_prompts
0,SecurityEval_python_codeparrot-small_128_10.jsonl,SecurityEval_python,codeparrot-small,Python,230,23,193,192,23,23,1,1,0.83913,0.834783,0.004348,1.0,1.0,0.043478
1,SOEvalJava_PolyCoder-0.4B_128_10.jsonl,SOEvalJava,PolyCoder-0.4B,Java,10,1,1,1,1,1,0,0,0.1,0.1,0.0,1.0,1.0,0.0
2,SOEvalPython_codegen-350M-mono_128_10.jsonl,SOEvalPython,codegen-350M-mono,Python,20,2,16,16,2,2,0,0,0.8,0.8,0.0,1.0,1.0,0.0
3,HumanEval_java_incoder-1B_128_10.jsonl,HumanEval_java,incoder-1B,Java,70,15,58,58,7,7,0,0,0.828571,0.828571,0.0,0.466667,0.466667,0.0
4,SOEvalJava_codegen-2B-multi_128_10.jsonl,SOEvalJava,codegen-2B-multi,Java,10,1,5,5,1,1,0,0,0.5,0.5,0.0,1.0,1.0,0.0


In [41]:
df.to_csv("repair_result.csv", index=False)

In [42]:
df = pd.read_csv("repair_result.csv")

In [43]:
grouped_df = df.groupby(["Model","Language"]).mean()

In [44]:
grouped_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,total_samples,total_prompts,total_compilable,total_vulnerable,total_compilable_prompts,total_vulnerable_prompts,total_good_suggestions,total_good_prompts,percentage_compilable,percentage_vulnerable,percentage_good_suggestions,percentage_compilable_prompts,percentage_vulnerable_prompts,percentage_good_prompts
Model,Language,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
PolyCoder-0.4B,Java,80.0,8.0,44.25,33.75,7.75,7.5,10.5,4.75,0.46,0.350625,0.109375,0.984375,0.96875,0.490625
PolyCoder-0.4B,Python,120.0,12.0,85.5,85.5,12.0,12.0,0.0,0.0,0.775,0.775,0.0,1.0,1.0,0.0
PolyCoder-160M,Java,82.5,8.25,28.5,21.75,7.75,7.0,6.75,4.0,0.338526,0.27203,0.066496,0.955769,0.883761,0.385043
PolyCoder-160M,Python,125.0,12.5,83.5,83.5,12.5,12.5,0.0,0.0,0.659783,0.659783,0.0,1.0,1.0,0.0
PolyCoder-2.7B,Java,80.0,8.333333,60.0,54.666667,8.0,7.666667,5.333333,3.666667,0.790833,0.737778,0.053056,0.962963,0.925926,0.318519


In [45]:
grouped_df.to_csv("grouped_repair_result.csv")