In [1]:
import os
import json
import csv

In [2]:
def get_prompts(filename):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        filename: Path to a text file holding one JSON document per line.

    Returns:
        list: The parsed JSON values in file order. Lines that are empty or
        contain only whitespace (for example a trailing blank line) are
        skipped, so an empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly so parsing does not depend on the platform's
    # default text encoding.
    with open(filename, encoding="utf-8") as f:
        # json.loads("") raises, so blank lines must be filtered out first.
        return [json.loads(line) for line in f if line.strip()]


In [3]:
# Aggregate repair-quality statistics for every benchmark file in the directory.
# `result` maps file name -> dict of counts and ratios; each entry later becomes
# one DataFrame row.
result = {}
benchmark_root = "../Repair_Quality_Analyzer/Repair_Quality_Filtered_Suggestions_1/"
dir_list = os.listdir(benchmark_root)
for file in dir_list:
    # Skip macOS folder-metadata files.
    if ".DS_Store" in file:
        continue
    print("Processing file: ", file)
    # Registered before any checks, so a file that is skipped below still keeps
    # an (empty) entry in `result`.
    result[file] = {}

    benchmark_path = os.path.join(benchmark_root, file)
    prompts = get_prompts(benchmark_path)
    total_prompts = len(prompts)
    print("Number of prompts: ", total_prompts)
    if total_prompts == 0:
        continue

    # Suggestion-level tallies across all prompts of this file.
    total_samples = 0
    count_compilable = 0
    count_vulnerable = 0
    count_good_suggestions = 0

    # Prompt-level tallies: prompts with at least one suggestion of that kind.
    count_compilable_prompts = 0
    count_vulnerable_prompts = 0
    count_good_prompts = 0

    for prompt in prompts:
        suggestions = prompt["suggestions"]
        total_samples += len(suggestions)

        current_prompt_compilable = 0
        current_prompt_vulnerable = 0
        current_prompt_good = 0
        for suggestion in suggestions:
            is_compilable = suggestion["Is_Compilable"]
            is_vulnerable = suggestion["Is_Vulnerable"]
            if is_compilable:
                current_prompt_compilable += 1
            if is_vulnerable:
                current_prompt_vulnerable += 1
            # A "good" suggestion compiles and is not flagged vulnerable.
            # Counting it per suggestion (instead of compilable - vulnerable)
            # stays correct even if a non-compilable suggestion is flagged
            # vulnerable; the subtraction would under-count or go negative.
            if is_compilable and not is_vulnerable:
                current_prompt_good += 1

        count_compilable += current_prompt_compilable
        count_vulnerable += current_prompt_vulnerable
        count_good_suggestions += current_prompt_good

        if current_prompt_compilable > 0:
            count_compilable_prompts += 1
        if current_prompt_vulnerable > 0:
            count_vulnerable_prompts += 1
        if current_prompt_good > 0:
            count_good_prompts += 1

    # Without any suggestion the ratios below would divide by zero.
    if total_samples == 0:
        continue

    print("Total compilable: ", count_compilable)
    print("Total vulnerable: ", count_vulnerable)

    # Key order is kept stable because it defines the DataFrame column order.
    # The "percentage_*" values are ratios in [0, 1]; they are not multiplied
    # by 100.
    result[file].update({
        "total_samples": total_samples,
        "total_prompts": total_prompts,
        "total_compilable": count_compilable,
        "total_vulnerable": count_vulnerable,
        "total_compilable_prompts": count_compilable_prompts,
        "total_vulnerable_prompts": count_vulnerable_prompts,
        "total_good_suggestions": count_good_suggestions,
        "total_good_prompts": count_good_prompts,
        "percentage_compilable": count_compilable / total_samples,
        "percentage_vulnerable": count_vulnerable / total_samples,
        "percentage_good_suggestions": count_good_suggestions / total_samples,
        "percentage_compilable_prompts": count_compilable_prompts / total_prompts,
        "percentage_vulnerable_prompts": count_vulnerable_prompts / total_prompts,
        "percentage_good_prompts": count_good_prompts / total_prompts,
    })

Processing file:  SecurityEval_python_codeparrot-small_128_10.jsonl
Number of prompts:  23
Total compilable:  202
Total vulnerable:  168
Processing file:  SOEvalJava_PolyCoder-0.4B_128_10.jsonl
Number of prompts:  1
Total compilable:  0
Total vulnerable:  0
Processing file:  CoderEval4Python_prompt_codeparrot_128_10.jsonl
Number of prompts:  0
Processing file:  SOEvalPython_codegen-350M-mono_128_10.jsonl
Number of prompts:  2
Total compilable:  17
Total vulnerable:  14
Processing file:  HumanEval_java_incoder-1B_128_10.jsonl
Number of prompts:  15
Total compilable:  0
Total vulnerable:  0
Processing file:  SecurityEval_python_codegen-2B-multi_128_10.jsonl
Number of prompts:  24
Total compilable:  214
Total vulnerable:  166
Processing file:  HumanEval_python_PolyCoder-2.7B_128_10.jsonl
Number of prompts:  0
Processing file:  SOEvalJava_codegen-2B-multi_128_10.jsonl
Number of prompts:  1
Total compilable:  0
Total vulnerable:  0
Processing file:  CoderEval4Python_prompt_codegen-350M-mult

In [5]:
import pandas as pd

# One row per benchmark file: the keys of `result` (file names) become the index
# and each inner statistics dict supplies the columns.
df = pd.DataFrame.from_dict(result, orient='index')
# The index is written without a label, so reading this CSV back yields the
# file names in a column called "Unnamed: 0".
df.to_csv("repair_result_1.csv")
# Preview goes last: only a cell's final expression is rendered, so a bare
# df.head() in the middle of the cell displays nothing.
df.head()

In [6]:
# Reload the exported statistics from disk; the former index (file names) comes
# back as the ordinary column "Unnamed: 0".
df = pd.read_csv(filepath_or_buffer="repair_result_1.csv")
# Show the first five rows.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,total_samples,total_prompts,total_compilable,total_vulnerable,total_compilable_prompts,total_vulnerable_prompts,total_good_suggestions,total_good_prompts,percentage_compilable,percentage_vulnerable,percentage_good_suggestions,percentage_compilable_prompts,percentage_vulnerable_prompts,percentage_good_prompts
0,SecurityEval_python_codeparrot-small_128_10.jsonl,230,23,202,168,23,22,34,17,0.878261,0.730435,0.147826,1.0,0.956522,0.73913
1,SOEvalJava_PolyCoder-0.4B_128_10.jsonl,10,1,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,SOEvalPython_codegen-350M-mono_128_10.jsonl,20,2,17,14,2,2,3,2,0.85,0.7,0.15,1.0,1.0,1.0
3,HumanEval_java_incoder-1B_128_10.jsonl,150,15,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
4,SecurityEval_python_codegen-2B-multi_128_10.jsonl,240,24,214,166,24,22,48,18,0.891667,0.691667,0.2,1.0,0.916667,0.75


In [7]:
def get_dataset(x):
    """Extract the dataset label from a benchmark file name.

    The file name is split on underscores. Names containing 'SOEval' keep only
    the first token (e.g. "SOEvalJava"); every other name keeps the first two
    tokens joined by an underscore (e.g. "SecurityEval_python").

    Args:
        x: Benchmark file name such as
            "SecurityEval_python_codeparrot-small_128_10.jsonl".

    Returns:
        str: The dataset part of the name.
    """
    token_count = 1 if 'SOEval' in x else 2
    return "_".join(x.split('_')[:token_count])

In [8]:
def get_model(x):
    """Extract the model name from a benchmark file name.

    The file name is split on underscores. For names containing 'SOEval' the
    model is the second token; for every other name it is the third token.

    Args:
        x: Benchmark file name such as
            "SecurityEval_python_codeparrot-small_128_10.jsonl".

    Returns:
        str: The model part of the name (e.g. "codeparrot-small").

    Raises:
        IndexError: If the name has too few underscore-separated tokens.
    """
    model_index = 1 if 'SOEval' in x else 2
    return x.split('_')[model_index]

In [9]:
# Derive the Dataset and Model labels from the file-name column and place them
# right after it (positions 1 and 2).
# DataFrame.insert raises ValueError when the column already exists, so each
# insert is guarded: the cell can be re-run, and it also tolerates a CSV that
# was already saved with these columns.
if "Dataset" not in df.columns:
    df.insert(1, "Dataset", df["Unnamed: 0"].apply(get_dataset))
if "Model" not in df.columns:
    df.insert(2, "Model", df["Unnamed: 0"].apply(get_model))

In [10]:
# Preview the first five rows, now including the Dataset and Model columns.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Dataset,Model,total_samples,total_prompts,total_compilable,total_vulnerable,total_compilable_prompts,total_vulnerable_prompts,total_good_suggestions,total_good_prompts,percentage_compilable,percentage_vulnerable,percentage_good_suggestions,percentage_compilable_prompts,percentage_vulnerable_prompts,percentage_good_prompts
0,SecurityEval_python_codeparrot-small_128_10.jsonl,SecurityEval_python,codeparrot-small,230,23,202,168,23,22,34,17,0.878261,0.730435,0.147826,1.0,0.956522,0.73913
1,SOEvalJava_PolyCoder-0.4B_128_10.jsonl,SOEvalJava,PolyCoder-0.4B,10,1,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,SOEvalPython_codegen-350M-mono_128_10.jsonl,SOEvalPython,codegen-350M-mono,20,2,17,14,2,2,3,2,0.85,0.7,0.15,1.0,1.0,1.0
3,HumanEval_java_incoder-1B_128_10.jsonl,HumanEval_java,incoder-1B,150,15,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
4,SecurityEval_python_codegen-2B-multi_128_10.jsonl,SecurityEval_python,codegen-2B-multi,240,24,214,166,24,22,48,18,0.891667,0.691667,0.2,1.0,0.916667,0.75


In [11]:
# Overwrite the earlier export with the enriched table; the integer row index
# is left out of the file this time.
df.to_csv(path_or_buf="repair_result_1.csv", index=False)

In [12]:
# Read the enriched table back from disk.
df = pd.read_csv(filepath_or_buffer="repair_result_1.csv")

In [13]:
# Average every numeric metric per model across all of its benchmark files.
# numeric_only=True is required because the frame also holds string columns
# (file name, Dataset): pandas >= 2.0 raises TypeError on them instead of
# silently dropping them as older versions did.
grouped_df = df.groupby(["Model"]).mean(numeric_only=True)

In [14]:
# Preview the first five per-model averages.
grouped_df.head(n=5)

Unnamed: 0_level_0,total_samples,total_prompts,total_compilable,total_vulnerable,total_compilable_prompts,total_vulnerable_prompts,total_good_suggestions,total_good_prompts,percentage_compilable,percentage_vulnerable,percentage_good_suggestions,percentage_compilable_prompts,percentage_vulnerable_prompts,percentage_good_prompts
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
PolyCoder-0.4B,93.333333,9.333333,22.333333,19.333333,4.0,4.0,3.0,2.333333,0.162121,0.140909,0.021212,0.333333,0.333333,0.181818
PolyCoder-160M,96.666667,9.666667,27.166667,21.833333,4.166667,4.0,5.333333,2.833333,0.247464,0.224275,0.023188,0.333333,0.326087,0.123188
PolyCoder-2.7B,98.0,9.8,33.6,30.2,4.8,4.8,3.4,2.6,0.316364,0.291818,0.024545,0.4,0.4,0.209091
codegen-2B-mono,20.0,2.0,15.0,14.0,2.0,2.0,1.0,1.0,0.75,0.7,0.05,1.0,1.0,0.5
codegen-2B-multi,98.333333,9.833333,39.0,30.333333,4.666667,4.333333,8.666667,3.333333,0.295726,0.23531,0.060417,0.366987,0.353098,0.229167


In [15]:
# Export the per-model averages; the Model index is written as the first column.
grouped_df.to_csv(path_or_buf="grouped_repair_result_1.csv")