In [31]:
import os
import json
import csv

In [32]:
def get_prompts(filename):
    with open(filename) as f:
        prompts = []
        for line in f:
            prompts.append(json.loads(line))
    return prompts


In [33]:
result = {}
benchmark_root = "../Repair_Quality_Analyzer/Repair_Quality_Filtered_Suggestions_2/"
dir_list = os.listdir(benchmark_root)
for file in dir_list:
    if ".DS_Store" in file:
        continue
    print("Processing file: ", file)
    result[file] = {}
    
    benchmark_path = os.path.join(benchmark_root, file)
    prompts = get_prompts(benchmark_path)
    print("Number of prompts: ", len(prompts))
    if len(prompts) == 0:
        continue
    total_samples = 0
    count_compilable = 0
    count_vulnerable = 0
    count_good_suggestions = 0

    count_compilable_prompts = 0
    count_vulnerable_prompts = 0
    count_good_prompts = 0

    for prompt in prompts:
        current_prompt_compilable = 0
        current_prompt_vulnerable = 0
        total_samples += len(prompt["suggestions"])
        for i in range(len(prompt["suggestions"])):
            suggestion = prompt["suggestions"][i]
            if suggestion["Is_Compilable"]:
                count_compilable += 1
                current_prompt_compilable += 1
            if suggestion["Is_Vulnerable"]:
                count_vulnerable += 1
                current_prompt_vulnerable += 1
        if current_prompt_compilable > 0:
            count_compilable_prompts += 1
        if current_prompt_vulnerable > 0:
            count_vulnerable_prompts += 1
        count_good_suggestions += (current_prompt_compilable-current_prompt_vulnerable)
        if current_prompt_compilable>0 and current_prompt_vulnerable!=current_prompt_compilable:
            count_good_prompts += 1
    if total_samples == 0:
        continue
    print("Total compilable: ", count_compilable)
    print("Total vulnerable: ", count_vulnerable)
    result[file]["total_samples"] = total_samples
    result[file]["total_prompts"] = len(prompts)
    result[file]["total_compilable"] = count_compilable
    result[file]["total_vulnerable"] = count_vulnerable
    result[file]["total_compilable_prompts"] = count_compilable_prompts
    result[file]["total_vulnerable_prompts"] = count_vulnerable_prompts
    result[file]["total_good_suggestions"] = count_good_suggestions
    result[file]["total_good_prompts"] = count_good_prompts
    result[file]["percentage_compilable"] = count_compilable / total_samples
    result[file]["percentage_vulnerable"] = count_vulnerable / total_samples
    result[file]["percentage_good_suggestions"] = count_good_suggestions / total_samples
    result[file]["percentage_compilable_prompts"] = count_compilable_prompts / len(prompts)
    result[file]["percentage_vulnerable_prompts"] = count_vulnerable_prompts / len(prompts)
    result[file]["percentage_good_prompts"] = count_good_prompts / len(prompts)

Processing file:  SecurityEval_python_codeparrot-small_128_10.jsonl
Number of prompts:  14
Total compilable:  105
Total vulnerable:  101
Processing file:  SOEvalJava_PolyCoder-0.4B_128_10.jsonl
Number of prompts:  1
Total compilable:  1
Total vulnerable:  1
Processing file:  CoderEval4Python_prompt_codeparrot_128_10.jsonl
Number of prompts:  0
Processing file:  SOEvalPython_codegen-350M-mono_128_10.jsonl
Number of prompts:  1
Total compilable:  7
Total vulnerable:  7
Processing file:  HumanEval_java_incoder-1B_128_10.jsonl
Number of prompts:  15
Total compilable:  1
Total vulnerable:  0
Processing file:  SecurityEval_python_codegen-2B-multi_128_10.jsonl
Number of prompts:  16
Total compilable:  116
Total vulnerable:  109
Processing file:  HumanEval_python_PolyCoder-2.7B_128_10.jsonl
Number of prompts:  0
Processing file:  SOEvalJava_codegen-2B-multi_128_10.jsonl
Number of prompts:  1
Total compilable:  1
Total vulnerable:  1
Processing file:  CoderEval4Python_prompt_codegen-350M-multi_

In [34]:
import pandas as pd

df = pd.DataFrame.from_dict(result, orient='index')
df.head()
df.to_csv("repair_result_2.csv")

In [35]:
df = pd.read_csv("repair_result_2.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,total_samples,total_prompts,total_compilable,total_vulnerable,total_compilable_prompts,total_vulnerable_prompts,total_good_suggestions,total_good_prompts,percentage_compilable,percentage_vulnerable,percentage_good_suggestions,percentage_compilable_prompts,percentage_vulnerable_prompts,percentage_good_prompts
0,SecurityEval_python_codeparrot-small_128_10.jsonl,140,14,105,101,12,11,4,1,0.75,0.721429,0.028571,0.857143,0.785714,0.071429
1,SOEvalJava_PolyCoder-0.4B_128_10.jsonl,10,1,1,1,1,1,0,0,0.1,0.1,0.0,1.0,1.0,0.0
2,SOEvalPython_codegen-350M-mono_128_10.jsonl,10,1,7,7,1,1,0,0,0.7,0.7,0.0,1.0,1.0,0.0
3,HumanEval_java_incoder-1B_128_10.jsonl,140,15,1,0,1,0,1,1,0.007143,0.0,0.007143,0.066667,0.0,0.066667
4,SecurityEval_python_codegen-2B-multi_128_10.jsonl,160,16,116,109,16,16,7,2,0.725,0.68125,0.04375,1.0,1.0,0.125


In [36]:
def get_dataset(x):
    if 'SOEval' in x:
       return "_".join(x.split('_')[0:1])
    else:
        return "_".join(x.split('_')[0:2])

In [37]:
def get_model(x):
    if 'SOEval' in x:
        return x.split('_')[1]
    else:
        return x.split('_')[2]

In [38]:
def get_language(x):
    dataset_type = "Java"
    if "python" in x.lower():
        dataset_type = "Python"
    return dataset_type

In [39]:
df.insert(1, "Dataset", df["Unnamed: 0"].apply(get_dataset))
df.insert(2, "Model", df["Unnamed: 0"].apply(get_model))
df.insert(3, "Language", df["Unnamed: 0"].apply(get_language))

In [40]:
df.head()

Unnamed: 0.1,Unnamed: 0,Dataset,Model,Language,total_samples,total_prompts,total_compilable,total_vulnerable,total_compilable_prompts,total_vulnerable_prompts,total_good_suggestions,total_good_prompts,percentage_compilable,percentage_vulnerable,percentage_good_suggestions,percentage_compilable_prompts,percentage_vulnerable_prompts,percentage_good_prompts
0,SecurityEval_python_codeparrot-small_128_10.jsonl,SecurityEval_python,codeparrot-small,Python,140,14,105,101,12,11,4,1,0.75,0.721429,0.028571,0.857143,0.785714,0.071429
1,SOEvalJava_PolyCoder-0.4B_128_10.jsonl,SOEvalJava,PolyCoder-0.4B,Java,10,1,1,1,1,1,0,0,0.1,0.1,0.0,1.0,1.0,0.0
2,SOEvalPython_codegen-350M-mono_128_10.jsonl,SOEvalPython,codegen-350M-mono,Python,10,1,7,7,1,1,0,0,0.7,0.7,0.0,1.0,1.0,0.0
3,HumanEval_java_incoder-1B_128_10.jsonl,HumanEval_java,incoder-1B,Java,140,15,1,0,1,0,1,1,0.007143,0.0,0.007143,0.066667,0.0,0.066667
4,SecurityEval_python_codegen-2B-multi_128_10.jsonl,SecurityEval_python,codegen-2B-multi,Python,160,16,116,109,16,16,7,2,0.725,0.68125,0.04375,1.0,1.0,0.125


In [41]:
df.to_csv("repair_result_2.csv", index=False)

In [42]:
df = pd.read_csv("repair_result_2.csv")

In [43]:
grouped_df = df.groupby(["Model","Language"]).mean()

In [44]:
grouped_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,total_samples,total_prompts,total_compilable,total_vulnerable,total_compilable_prompts,total_vulnerable_prompts,total_good_suggestions,total_good_prompts,percentage_compilable,percentage_vulnerable,percentage_good_suggestions,percentage_compilable_prompts,percentage_vulnerable_prompts,percentage_good_prompts
Model,Language,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
PolyCoder-0.4B,Java,70.0,7.0,17.0,8.75,5.0,3.5,8.25,4.0,0.213504,0.106795,0.106709,0.820513,0.573504,0.504274
PolyCoder-0.4B,Python,40.0,4.0,27.5,27.5,4.0,4.0,0.0,0.0,0.521429,0.521429,0.0,1.0,1.0,0.0
PolyCoder-160M,Java,75.0,7.5,9.5,5.75,4.25,2.5,3.75,2.5,0.112051,0.064103,0.047949,0.473718,0.249359,0.310256
PolyCoder-160M,Python,60.0,6.0,23.0,23.0,4.0,4.0,0.0,0.0,0.383333,0.383333,0.0,0.666667,0.666667,0.0
PolyCoder-2.7B,Java,60.0,6.0,15.5,9.75,5.25,4.5,5.75,2.0,0.344643,0.275,0.069643,0.839286,0.767857,0.214286


In [45]:
grouped_df.to_csv("grouped_repair_result_2.csv")