In [1]:
import json
import os

import pandas as pd

import random
random.seed(42)


In [2]:
java_prompts = ['aiXcoder_prompt.jsonl', 'CoderEval4Java_prompt.jsonl', 'HumanEval_java.jsonl', 'SOEvalJava.jsonl']
python_prompts = ['SecurityEval_python.jsonl', 'CoderEval4Python_prompt.jsonl', 'HumanEval_python.jsonl', 'SOEvalPython.jsonl']

In [3]:
total_java_samples = 594

java_prompts_samples = []

target_sample_size = 34

for prompt in java_prompts:
    dataset_name = prompt.split('.jsonl')[0]
    
    with open('./../Benchmarks/'+prompt, 'r') as f:
        prompt_data = [json.loads(line) for line in f.readlines()]

    current_total_samples = len(prompt_data)
    # Get the sample size for this prompt by ceiling the proportion of the total samples
    sample_size = int(target_sample_size * (current_total_samples / total_java_samples)+0.5)

    # shuffle the data
    random.shuffle(prompt_data)

    # take the first sample_size samples
    for i in range(sample_size):
        java_prompts_samples.append({'task_id': prompt_data[i]['task_id'], 'dataset_name': dataset_name})

print(len(java_prompts_samples))

df = pd.DataFrame(java_prompts_samples, columns=['task_id', 'dataset_name'])
df.to_csv('java_prompts_samples.csv', index=False)


34


In [4]:
total_python_samples = 557

python_prompts_samples = []

target_sample_size = 31

for prompt in python_prompts:
    dataset_name = prompt.split('.jsonl')[0]
    
    with open('./../Benchmarks/'+prompt, 'r') as f:
        prompt_data = [json.loads(line) for line in f.readlines()]

    current_total_samples = len(prompt_data)
    # Get the sample size for this prompt by ceiling the proportion of the total samples
    sample_size = int(target_sample_size * (current_total_samples / total_python_samples)+0.5)

    # shuffle the data
    random.shuffle(prompt_data)

    # take the first sample_size samples
    for i in range(sample_size):
        python_prompts_samples.append({'task_id': prompt_data[i]['task_id'], 'dataset_name': dataset_name})


print(len(python_prompts_samples))

df = pd.DataFrame(python_prompts_samples, columns=['task_id', 'dataset_name'])
df.to_csv('python_prompts_samples.csv', index=False)

31


In [5]:
def get_prompts(filename):
    with open(filename) as f:
        prompts = []
        for line in f:
            prompts.append(json.loads(line))
    return prompts


In [6]:

benchmark_root = "../Quality_Analyzer/Quality_Filtered_Suggestions/"
dir_list = os.listdir(benchmark_root)

In [7]:
code_folder_root = './Codes/'

In [8]:
df = pd.DataFrame()

In [9]:
def check_if_present(task_id, dataset_name, sample_list):
    for sample in sample_list:
        if sample['task_id'] == task_id and sample['dataset_name'] == dataset_name:
            return True
        
    return False

In [10]:

total_samples = 0

samples_list = []
problematic_files = []
for file in dir_list:

    print("Processing file: ", file)  

    if '.DS_Store' in file:
        continue  
    # if "Security" not in file:
    #     continue
    if 'SOEval' in file:
        code_folder = "_".join(file.split('_')[0:1])
    else:
        code_folder = "_".join(file.split('_')[0:2])
    if not os.path.exists(code_folder_root+code_folder):
        os.mkdir(code_folder_root+code_folder)
    
    
    if 'SOEval' in file:
        model = file.split('_')[1]
    else:
        model = file.split('_')[2]
    if not os.path.exists(code_folder_root+code_folder+'/'+model):
        os.mkdir(code_folder_root+code_folder+'/'+model)

    path_to_file = code_folder_root+code_folder+'/'+model+'/'
    extension = ".java"
    if 'python' in file.lower():
        extension = ".py"
    
    print("Processing file: ", file)
    print("Code folder: ", code_folder)
    print("Model: ", model)
    benchmark_path = os.path.join(benchmark_root, file)
    prompts = get_prompts(benchmark_path)
    print("Number of prompts: ", len(prompts))



    # print("Original list: ", file_list[file]["prompt_ids"])
    # print("Sample list: ", sample_list)

    for prompt in prompts:
        task_id = prompt['task_id']

        
        if extension == '.py':
            is_present = check_if_present(task_id,code_folder , python_prompts_samples)

        else:
            is_present = check_if_present(task_id,code_folder , java_prompts_samples)

        if not is_present:
            continue
        else:
            total_samples += 1
        if len(prompt["suggestions"]) == 0:
            problematic_files.append(file) 
            continue
        current_list = []

        for i in range(len(prompt["suggestions"])):
            suggestion = prompt["suggestions"][i]
            current_list.append({"Dataset":code_folder,"Model": model, "Prompt_ID": prompt['task_id'],"extension":extension, "Suggestion_ID":str(i), "Suggestion": suggestion['fixed_generated_text'],"Is_Compilable":int(suggestion["Is_Compilable"]), "Is_Vulnerable": int(suggestion['Is_Vulnerable'])})


        if len(current_list)>0:
            new_current_list = sorted(current_list, key=lambda k: k['Is_Compilable'], reverse=True)

            new_compilable_list = []
            new_non_compilable_list = []
            for item in new_current_list:
                if item['Is_Compilable']:
                    new_compilable_list.append(item)
                else:
                    new_non_compilable_list.append(item)
            
            new_compilable_list = sorted(new_compilable_list, key=lambda k: k['Is_Vulnerable'])
            new_current_list = new_compilable_list + new_non_compilable_list
        

        print("Processing prompt: ", prompt['task_id'])
        suggestion = new_current_list[0]
        file_name = str(prompt['task_id']).replace("/","_")+extension
        with open(path_to_file+file_name, 'w') as f:
            f.write(suggestion['Suggestion'])
        samples_list.append(suggestion)
print("Total number of samples: ", total_samples)

Processing file:  SecurityEval_python_codeparrot-small_128_10.jsonl
Processing file:  SecurityEval_python_codeparrot-small_128_10.jsonl
Code folder:  SecurityEval_python
Model:  codeparrot-small
Number of prompts:  121
Processing prompt:  CWE-078_codeql_1.py
Processing prompt:  CWE-1204_sonar_1.py
Processing prompt:  CWE-285_codeql_1.py
Processing prompt:  CWE-321_author_1.py
Processing prompt:  CWE-641_sonar_1.py
Processing prompt:  CWE-776_codeql_1.py
Processing prompt:  CWE-918_codeql_2.py
Processing file:  SOEvalJava_PolyCoder-0.4B_128_10.jsonl
Processing file:  SOEvalJava_PolyCoder-0.4B_128_10.jsonl
Code folder:  SOEvalJava
Model:  PolyCoder-0.4B
Number of prompts:  28
Processing prompt:  3422673
Processing prompt:  1757065
Processing file:  CoderEval4Python_prompt_codeparrot_128_10.jsonl
Processing file:  CoderEval4Python_prompt_codeparrot_128_10.jsonl
Code folder:  CoderEval4Python_prompt
Model:  codeparrot
Number of prompts:  230
Processing prompt:  62b87b7e9a0c4fa8b80b35bc
Pro

In [11]:
with open('problematic_files.txt', 'w') as f:
    for file in problematic_files:
        f.write(file+'\n')

In [12]:
df = pd.DataFrame(samples_list, columns=["Dataset", "Model", "Prompt_ID", "extension", "Suggestion_ID", "Suggestion", "Is_Compilable", "Is_Vulnerable"])
df.to_csv('samples.csv', index=False)