In [11]:
import os
import json
import time
import pandas as pd
from openai import OpenAI

In [12]:
# Read the API key from the environment rather than embedding it in the notebook,
# so the secret is never stored in a saved cell.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))


In [22]:
# 1. Load the dataset; the `fixed_code` column supplies the snippet for each request
df = pd.read_csv("vulnerabilities.csv")  # replace with your CSV filename

# 2. Build batch tasks
# The system prompt is identical for every request, so it is built once up front
# instead of on every loop iteration.
system_prompt = (
    "You are a prompt engineering assistant. "
    "Given a code snippet, construct a prompt that, "
    "when given to a code generation model, will produce the code. "
    "Only output the crafted prompt."
)

tasks = []
# Only `fixed_code` is needed, so iterate over that column directly rather than
# building a full row Series with iterrows().
for idx, fixed_code in df["fixed_code"].items():
    # User message: the snippet wrapped in a fenced code block
    user_content = f"""
Code:
```
{fixed_code}
```"""

    tasks.append({
        # The row's index label doubles as the request id, so results can be
        # matched back to their rows later.
        "custom_id": str(idx),
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4.1-nano",
            "temperature": 0.2,
            "max_tokens": 512,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user",   "content": user_content},
            ],
        }
    })

# 3. Serialize the tasks as JSON Lines: one request object per line
batch_filename = "batch_tasks.jsonl"
with open(batch_filename, "w") as jsonl_out:
    jsonl_out.writelines(json.dumps(task) + "\n" for task in tasks)

# 4. Upload batch file
# Open the file in a `with` block so the handle is closed as soon as the upload
# call returns, instead of staying open for the life of the kernel.
with open(batch_filename, "rb") as batch_fh:
    batch_file = client.files.create(
        file=batch_fh,
        purpose="batch"
    )

# 5. Create the batch job from the uploaded file
batch_job = client.batches.create(
    input_file_id=batch_file.id,
    endpoint="/v1/chat/completions",
    completion_window="24h"
)

In [None]:
def split_jsonl(input_file: str, output_prefix: str, chunk_size: int = 1000) -> None:
    """
    Split a JSONL file into consecutive files of at most `chunk_size` lines each.

    Chunks are written to `{output_prefix}_{i}.jsonl`, with `i` counting up from 0,
    and lines keep their original order. The name of each chunk file is printed as
    it is started. An empty input file produces no output files.

    :param input_file: Path to the source .jsonl file
    :param output_prefix: Prefix for the output files; `_<chunk index>.jsonl` is appended
    :param chunk_size: Maximum number of lines per output file (must be >= 1)
    :raises ValueError: If `chunk_size` is less than 1
    """
    # Reject 0 (would divide by zero below) and negatives (would silently put
    # every line into a single file) with a clear error.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    out_f = None
    try:
        with open(input_file, 'r', encoding='utf-8') as infile:
            for line_num, line in enumerate(infile):
                # start a new chunk file every `chunk_size` lines
                if line_num % chunk_size == 0:
                    if out_f is not None:
                        out_f.close()
                    out_filename = f"{output_prefix}_{line_num // chunk_size}.jsonl"
                    out_f = open(out_filename, 'w', encoding='utf-8')
                    print(f"Writing {out_filename}…")

                out_f.write(line)
    finally:
        # close the last chunk file, even if reading or writing raised
        if out_f is not None:
            out_f.close()


if __name__ == "__main__":
    # Break the full task file into pieces of up to 5000 requests each:
    # batch_tasks_chunk_0.jsonl, batch_tasks_chunk_1.jsonl, …
    split_jsonl("batch_tasks.jsonl", "batch_tasks_chunk", chunk_size=5000)


In [None]:
if __name__ == "__main__":
    # Split one previously produced chunk file again, into pieces of up to 2500 lines.
    # The prefix already ends in "_" and split_jsonl inserts another one, so the
    # outputs are named batch_tasks_chunk_2__0.jsonl, batch_tasks_chunk_2__1.jsonl, …
    split_jsonl("batch_tasks_chunk_2.jsonl", "batch_tasks_chunk_2_", chunk_size=2500)

In [None]:
import pandas as pd
import json
from glob import glob

# 1. Load your original dataset
# This must be the same CSV the batch tasks were built from: each task's custom_id
# is the row's index label, and the prompts are joined back onto df by that index.
df = pd.read_csv("vulnerabilities.csv")

def extract_prompt(record):
    """
    Return the generated prompt text from one batch output record, or None.

    The text is read from record["response"]["body"]["choices"][0]["message"]["content"].
    Any missing or null step along that path (for example a failed request whose
    `response` is null, or an error body without `choices`) yields None instead
    of raising.

    :param record: One parsed line (a dict) of a batch output file
    :return: The message content, or None if the record holds no completion
    """
    response = record.get("response") or {}
    body = response.get("body") or {}
    choices = body.get("choices") or []
    if not choices:
        return None
    return (choices[0].get("message") or {}).get("content")


# 2. Read all batch output files and collect prompts by index
idx_to_prompt = {}
failed_ids = []  # custom_ids of records that carried no usable completion
for filepath in sorted(glob("batchr*.jsonl")):
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            # tolerate blank lines (e.g. a trailing newline at end of file)
            if not line.strip():
                continue
            record = json.loads(line)
            prompt = extract_prompt(record)
            if prompt is None:
                # keep going so one failed request does not abort the whole merge
                failed_ids.append(record.get("custom_id"))
                continue
            idx_to_prompt[int(record["custom_id"])] = prompt

# One summary line instead of dumping every record into the cell output
print(f"Collected {len(idx_to_prompt)} prompts; {len(failed_ids)} records had no usable response")

# 3. Map prompts back onto the DataFrame (assumes DataFrame index == custom_id)
df["prompt"] = df.index.map(idx_to_prompt)

# Rows with no entry in idx_to_prompt end up with a missing prompt; report how
# many there are instead of silently writing blanks to the CSV.
missing_prompts = int(df["prompt"].isna().sum())
if missing_prompts:
    print(f"Warning: {missing_prompts} of {len(df)} rows have no prompt")

# 4. Save the enriched dataset
output_csv = "vulnerabilities_with_prompts.csv"
df.to_csv(output_csv, index=False)
print(f"Done! New CSV written to {output_csv}")
