In [1]:
import json
import os
import math
import random
random.seed(42)

In [2]:
def get_prompts(filename):
    with open(filename) as f:
        prompts = []
        for line in f:
            prompts.append(json.loads(line))
    return prompts


In [3]:

benchmark_root = "../Quality_Analyzer/Quality_Filtered_Suggestions/"
dir_list = os.listdir(benchmark_root)

In [4]:
code_folder_root = './Quality_Codes/'

In [5]:
count_total_vulnerable_prompt = 0
count_total_vulnerable_item = 0

file_list = {}
for file in dir_list:
    if ".DS_Store" in file:
        continue
    file_list[file] = {
        "prompt_ids": [],
        "prompt_count": 0
    }

    print("Processing file: ", file)
    benchmark_path = os.path.join(benchmark_root, file)
    prompts = get_prompts(benchmark_path)
    print("Number of prompts: ", len(prompts))
    count_prompt = 0
    count_item = 0
    for prompt in prompts:
        is_vulnerable_prompt = False
        for i in range(len(prompt["suggestions"])):
            suggestion = prompt["suggestions"][i]
            if suggestion["Is_Vulnerable"]:
                is_vulnerable_prompt = True
                count_item+=1
               
        if is_vulnerable_prompt:
            count_prompt+=1
            file_list[file]["prompt_ids"].append(prompt['task_id'])
    file_list[file]["prompt_count"] = count_prompt

    count_total_vulnerable_item+=count_item
    count_total_vulnerable_prompt+=count_prompt


print("Total number of vulnerable prompts: ", count_total_vulnerable_prompt)
print("Total number of vulnerable items: ", count_total_vulnerable_item)

    

Processing file:  SecurityEval_python_codeparrot-small_128_10.jsonl
Number of prompts:  121
Processing file:  SOEvalJava_PolyCoder-0.4B_128_10.jsonl
Number of prompts:  28
Processing file:  CoderEval4Python_prompt_codeparrot_128_10.jsonl
Number of prompts:  230
Processing file:  SOEvalPython_codegen-350M-mono_128_10.jsonl
Number of prompts:  42
Processing file:  HumanEval_java_incoder-1B_128_10.jsonl
Number of prompts:  161
Processing file:  SecurityEval_python_codegen-2B-multi_128_10.jsonl
Number of prompts:  121
Processing file:  HumanEval_python_PolyCoder-2.7B_128_10.jsonl
Number of prompts:  164
Processing file:  SOEvalJava_codegen-2B-multi_128_10.jsonl
Number of prompts:  28
Processing file:  CoderEval4Python_prompt_codegen-350M-multi_128_10.jsonl
Number of prompts:  230
Processing file:  SecurityEval_python_codegen-2B-mono_128_10.jsonl
Number of prompts:  121
Processing file:  CoderEval4Java_prompt_PolyCoder-0.4B_128_10.jsonl
Number of prompts:  230
Processing file:  HumanEval_py

In [6]:
target_population = 329

In [7]:
import pandas as pd
df = pd.DataFrame()

In [8]:

total_samples = 0
for file in dir_list:
    if "Security" not in file:
        continue
    if 'SOEval' in file:
        code_folder = "_".join(file.split('_')[0:1])
    else:
        code_folder = "_".join(file.split('_')[0:2])
    if not os.path.exists(code_folder_root+code_folder):
        os.mkdir(code_folder_root+code_folder)
    
    
    if 'SOEval' in file:
        model = file.split('_')[1]
    else:
        model = file.split('_')[2]
    if not os.path.exists(code_folder_root+code_folder+'/'+model):
        os.mkdir(code_folder_root+code_folder+'/'+model)

    path_to_file = code_folder_root+code_folder+'/'+model+'/'
    extension = ".java"
    if 'python' in file.lower():
        extension = ".py"
    
    print("Processing file: ", file)
    benchmark_path = os.path.join(benchmark_root, file)
    prompts = get_prompts(benchmark_path)
    print("Number of prompts: ", len(prompts))

    print("Number of vulnerable prompts: ", len(file_list[file]["prompt_ids"]))

    target_population_for_file = round((len(file_list[file]["prompt_ids"])*target_population)/count_total_vulnerable_prompt)
    print("Target population for file: ", target_population_for_file)
    total_samples+=target_population_for_file

    sample_list = random.sample(file_list[file]["prompt_ids"], target_population_for_file)
    assert len(sample_list) == target_population_for_file
    file_list[file]["sampled_prompt_ids"] = sample_list

    # print("Original list: ", file_list[file]["prompt_ids"])
    # print("Sample list: ", sample_list)
    for prompt in prompts:
        current_list = []
        for i in range(len(prompt["suggestions"])):
            suggestion = prompt["suggestions"][i]
            if prompt['task_id'] in sample_list:
                file_name = str(prompt['task_id']).replace("/","_")+'_'+str(i)+extension
                with open(path_to_file+file_name, 'w') as f:
                    f.write(suggestion['fixed_generated_text'])
                current_list.append({"Model": file, "Prompt_ID": prompt['task_id'] , "Suggestion_ID":str(i), "Suggestion": suggestion['fixed_generated_text'],"Is_Compilable":int(suggestion["Is_Compilable"]), "Is_Vulnerable": int(suggestion['Is_Vulnerable'])})
        if len(current_list)>0:
            new_current_list = sorted(current_list, key=lambda k: k['Is_Compilable'], reverse=True)

            new_compilable_list = []
            new_non_compilable_list = []
            for item in new_current_list:
                if item['Is_Compilable']:
                    new_compilable_list.append(item)
                else:
                    new_non_compilable_list.append(item)
            
            new_compilable_list = sorted(new_compilable_list, key=lambda k: k['Is_Vulnerable'])
            new_current_list = new_compilable_list + new_non_compilable_list
            # print(current_list, len(current_list))
            # print(new_current_list, len(new_current_list))
            df = df.append(new_current_list, ignore_index=True)
print("Total number of samples: ", total_samples)

Processing file:  SecurityEval_python_codeparrot-small_128_10.jsonl
Number of prompts:  121
Number of vulnerable prompts:  57
Target population for file:  8
Processing file:  SecurityEval_python_codegen-2B-multi_128_10.jsonl
Number of prompts:  121
Number of vulnerable prompts:  65
Target population for file:  9
Processing file:  SecurityEval_python_codegen-2B-mono_128_10.jsonl
Number of prompts:  121
Number of vulnerable prompts:  59
Target population for file:  9
Processing file:  SecurityEval_python_PolyCoder-2.7B_128_10.jsonl
Number of prompts:  121
Number of vulnerable prompts:  55
Target population for file:  8
Processing file:  SecurityEval_python_incoder-1B_128_10.jsonl
Number of prompts:  121
Number of vulnerable prompts:  24
Target population for file:  3
Processing file:  SecurityEval_python_gpt3.5_512_10.jsonl
Number of prompts:  121
Number of vulnerable prompts:  47
Target population for file:  7
Processing file:  SecurityEval_python_PolyCoder-0.4B_128_10.jsonl
Number of p

In [9]:
df.sort_values(by=['Model', 'Prompt_ID'], inplace=True, ascending=False)
df.to_csv("sampled_prompts.csv", index=False)