In [1]:
import os
import json
import csv

In [2]:
def get_prompts(filename):
    with open(filename) as f:
        prompts = []
        for line in f:
            prompts.append(json.loads(line))
    return prompts


In [3]:
result = {}
benchmark_root = "../Repair_Quality_Analyzer/Repair_Quality_Filtered_Suggestions_2/"
dir_list = os.listdir(benchmark_root)
for file in dir_list:
    if ".DS_Store" in file:
        continue
    print("Processing file: ", file)
    result[file] = {}
    
    benchmark_path = os.path.join(benchmark_root, file)
    prompts = get_prompts(benchmark_path)
    print("Number of prompts: ", len(prompts))
    if len(prompts) == 0:
        continue
    total_samples = 0
    count_compilable = 0
    count_vulnerable = 0
    count_good_suggestions = 0

    count_compilable_prompts = 0
    count_vulnerable_prompts = 0
    count_good_prompts = 0

    for prompt in prompts:
        current_prompt_compilable = 0
        current_prompt_vulnerable = 0
        total_samples += len(prompt["suggestions"])
        for i in range(len(prompt["suggestions"])):
            suggestion = prompt["suggestions"][i]
            if suggestion["Is_Compilable"]:
                count_compilable += 1
                current_prompt_compilable += 1
            if suggestion["Is_Vulnerable"]:
                count_vulnerable += 1
                current_prompt_vulnerable += 1
        if current_prompt_compilable > 0:
            count_compilable_prompts += 1
        if current_prompt_vulnerable > 0:
            count_vulnerable_prompts += 1
        count_good_suggestions += (current_prompt_compilable-current_prompt_vulnerable)
        if current_prompt_compilable>0 and current_prompt_vulnerable!=current_prompt_compilable:
            count_good_prompts += 1
    if total_samples == 0:
        continue
    print("Total compilable: ", count_compilable)
    print("Total vulnerable: ", count_vulnerable)
    result[file]["total_samples"] = total_samples
    result[file]["total_prompts"] = len(prompts)
    result[file]["total_compilable"] = count_compilable
    result[file]["total_vulnerable"] = count_vulnerable
    result[file]["total_compilable_prompts"] = count_compilable_prompts
    result[file]["total_vulnerable_prompts"] = count_vulnerable_prompts
    result[file]["total_good_suggestions"] = count_good_suggestions
    result[file]["total_good_prompts"] = count_good_prompts
    result[file]["percentage_compilable"] = count_compilable / total_samples
    result[file]["percentage_vulnerable"] = count_vulnerable / total_samples
    result[file]["percentage_good_suggestions"] = count_good_suggestions / total_samples
    result[file]["percentage_compilable_prompts"] = count_compilable_prompts / len(prompts)
    result[file]["percentage_vulnerable_prompts"] = count_vulnerable_prompts / len(prompts)
    result[file]["percentage_good_prompts"] = count_good_prompts / len(prompts)

Processing file:  SecurityEval_python_codeparrot-small_128_10.jsonl
Number of prompts:  14
Total compilable:  105
Total vulnerable:  101
Processing file:  SOEvalJava_PolyCoder-0.4B_128_10.jsonl
Number of prompts:  1
Total compilable:  1
Total vulnerable:  1
Processing file:  CoderEval4Python_prompt_codeparrot_128_10.jsonl
Number of prompts:  0
Processing file:  SOEvalPython_codegen-350M-mono_128_10.jsonl
Number of prompts:  1
Total compilable:  7
Total vulnerable:  7
Processing file:  HumanEval_java_incoder-1B_128_10.jsonl
Number of prompts:  15
Total compilable:  1
Total vulnerable:  0
Processing file:  SecurityEval_python_codegen-2B-multi_128_10.jsonl
Number of prompts:  16
Total compilable:  116
Total vulnerable:  109
Processing file:  HumanEval_python_PolyCoder-2.7B_128_10.jsonl
Number of prompts:  0
Processing file:  SOEvalJava_codegen-2B-multi_128_10.jsonl
Number of prompts:  1
Total compilable:  1
Total vulnerable:  1
Processing file:  CoderEval4Python_prompt_codegen-350M-multi_

In [4]:
import pandas as pd

df = pd.DataFrame.from_dict(result, orient='index')
df.head()
df.to_csv("repair_result_2.csv")

In [5]:
df = pd.read_csv("repair_result_2.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,total_samples,total_prompts,total_compilable,total_vulnerable,total_compilable_prompts,total_vulnerable_prompts,total_good_suggestions,total_good_prompts,percentage_compilable,percentage_vulnerable,percentage_good_suggestions,percentage_compilable_prompts,percentage_vulnerable_prompts,percentage_good_prompts
0,SecurityEval_python_codeparrot-small_128_10.jsonl,140,14,105,101,12,11,4,1,0.75,0.721429,0.028571,0.857143,0.785714,0.071429
1,SOEvalJava_PolyCoder-0.4B_128_10.jsonl,10,1,1,1,1,1,0,0,0.1,0.1,0.0,1.0,1.0,0.0
2,SOEvalPython_codegen-350M-mono_128_10.jsonl,10,1,7,7,1,1,0,0,0.7,0.7,0.0,1.0,1.0,0.0
3,HumanEval_java_incoder-1B_128_10.jsonl,140,15,1,0,1,0,1,1,0.007143,0.0,0.007143,0.066667,0.0,0.066667
4,SecurityEval_python_codegen-2B-multi_128_10.jsonl,160,16,116,109,16,16,7,2,0.725,0.68125,0.04375,1.0,1.0,0.125


In [6]:
def get_dataset(x):
    if 'SOEval' in x:
       return "_".join(x.split('_')[0:1])
    else:
        return "_".join(x.split('_')[0:2])

In [7]:
def get_model(x):
    if 'SOEval' in x:
        return x.split('_')[1]
    else:
        return x.split('_')[2]

In [8]:
df.insert(1, "Dataset", df["Unnamed: 0"].apply(get_dataset))
df.insert(2, "Model", df["Unnamed: 0"].apply(get_model))

In [9]:
df.head()

Unnamed: 0.1,Unnamed: 0,Dataset,Model,total_samples,total_prompts,total_compilable,total_vulnerable,total_compilable_prompts,total_vulnerable_prompts,total_good_suggestions,total_good_prompts,percentage_compilable,percentage_vulnerable,percentage_good_suggestions,percentage_compilable_prompts,percentage_vulnerable_prompts,percentage_good_prompts
0,SecurityEval_python_codeparrot-small_128_10.jsonl,SecurityEval_python,codeparrot-small,140,14,105,101,12,11,4,1,0.75,0.721429,0.028571,0.857143,0.785714,0.071429
1,SOEvalJava_PolyCoder-0.4B_128_10.jsonl,SOEvalJava,PolyCoder-0.4B,10,1,1,1,1,1,0,0,0.1,0.1,0.0,1.0,1.0,0.0
2,SOEvalPython_codegen-350M-mono_128_10.jsonl,SOEvalPython,codegen-350M-mono,10,1,7,7,1,1,0,0,0.7,0.7,0.0,1.0,1.0,0.0
3,HumanEval_java_incoder-1B_128_10.jsonl,HumanEval_java,incoder-1B,140,15,1,0,1,0,1,1,0.007143,0.0,0.007143,0.066667,0.0,0.066667
4,SecurityEval_python_codegen-2B-multi_128_10.jsonl,SecurityEval_python,codegen-2B-multi,160,16,116,109,16,16,7,2,0.725,0.68125,0.04375,1.0,1.0,0.125


In [10]:
df.to_csv("repair_result_2.csv", index=False)

In [11]:
df = pd.read_csv("repair_result_2.csv")

In [12]:
grouped_df = df.groupby(["Model"]).mean()

In [13]:
grouped_df.head()

Unnamed: 0_level_0,total_samples,total_prompts,total_compilable,total_vulnerable,total_compilable_prompts,total_vulnerable_prompts,total_good_suggestions,total_good_prompts,percentage_compilable,percentage_vulnerable,percentage_good_suggestions,percentage_compilable_prompts,percentage_vulnerable_prompts,percentage_good_prompts
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
PolyCoder-0.4B,60.0,6.0,20.5,15.0,4.666667,3.666667,5.5,2.666667,0.316146,0.245006,0.07114,0.880342,0.71567,0.336182
PolyCoder-160M,72.0,7.2,12.2,9.2,4.2,2.8,3.0,2.0,0.166308,0.127949,0.038359,0.512308,0.332821,0.248205
PolyCoder-2.7B,60.0,6.166667,21.166667,17.333333,5.5,5.0,3.833333,1.333333,0.434307,0.387879,0.046429,0.878968,0.831349,0.142857
codegen-2B-mono,75.0,7.5,58.5,53.5,7.5,7.5,5.0,1.0,0.788462,0.75,0.038462,1.0,1.0,0.076923
codegen-2B-multi,80.0,8.0,50.333333,37.666667,8.0,7.333333,12.666667,3.666667,0.576515,0.441572,0.134943,1.0,0.92803,0.394571


In [14]:
grouped_df.to_csv("grouped_repair_result_2.csv")