In [1]:
# =============================
# C Bug Dataset Generator (Buggy + Clean, Balanced)
# =============================
import os, json, random, uuid
from typing import Dict, List, Tuple

random.seed(42)

OUT_DIR = "c_bug_dataset"
os.makedirs(OUT_DIR, exist_ok=True)

# ---------- Utilities ----------
VAR_NAMES = ["a","b","c","x","y","z","i","j","k","n","m","idx","val","len"]
STRINGS  = ["hi", "test", "hello", "world", "abc", "abcd", "foobar", "HCL", "HCLTech"]

def rnd_name():
    return random.choice(VAR_NAMES) + (str(random.randint(0, 9)) if random.random() < 0.5 else "")

def with_includes(code: str, need_string: bool=False, need_stdlib: bool=False, need_stdio: bool=True) -> str:
    inc = []
    if need_stdio:  inc.append("#include <stdio.h>")
    if need_string: inc.append("#include <string.h>")
    if need_stdlib: inc.append("#include <stdlib.h>")
    return "\n".join(inc) + "\n\n" + code

def wrap_main(body: str) -> str:
    return f"int main(){{\n{body}\n    return 0;\n}}"

def messy_spaces(s: str) -> str:
    # Add harmless formatting noise
    if random.random() < 0.5:
        s = s.replace(";", " ;")
    if random.random() < 0.5:
        s = s.replace("  ", " ")
    if random.random() < 0.3:
        s = s.replace("\n", "\n    ")
    return s

def make_record(prompt: str, response: str, label: int, bug_type: str, code: str, fixed_code: str, difficulty="easy") -> Dict:
    return {
        "id": str(uuid.uuid4()),
        "prompt": prompt,
        "response": response,
        "label": label,
        "bug_type": bug_type,
        "code": code,
        "fixed_code": fixed_code,
        "difficulty": difficulty,
        "source": "synthetic"
    }

# ---------- Bug generators (buggy + fixed) ----------
def gen_out_of_bounds() -> Tuple[str, str, str]:
    n = random.randint(3, 10)
    arr = rnd_name()
    idx_bug = n            # out of bounds (0..n-1 valid)
    idx_ok  = n-1
    val = random.randint(1, 50)
    body_bug = f"    int {arr}[{n}];\n    {arr}[{idx_bug}] = {val};\n    printf(\"%d\\n\", {arr}[{idx_ok}]);"
    body_fix = f"    int {arr}[{n}];\n    {arr}[{idx_ok}] = {val};\n    printf(\"%d\\n\", {arr}[{idx_ok}]);"
    return with_includes(wrap_main(body_bug)), with_includes(wrap_main(body_fix)), "Out-of-bounds array access"

def gen_null_deref() -> Tuple[str, str, str]:
    p = rnd_name()
    body_bug = f"    char *{p};\n    *{p} = 'A';\n    printf(\"%c\\n\", *{p});"
    body_fix = f"    char {p}_buf[2];\n    char *{p} = {p}_buf;\n    *{p} = 'A';\n    printf(\"%c\\n\", *{p});"
    code_bug = with_includes(wrap_main(body_bug))
    code_fix = with_includes(wrap_main(body_fix))
    return code_bug, code_fix, "NULL pointer dereference / uninitialized pointer"

def gen_memory_leak() -> Tuple[str, str, str]:
    p = rnd_name()
    body_bug = f"    int *{p} = (int*)malloc(sizeof(int));\n    if ({p}) {{ *{p} = 42; }}\n    // missing free\n    printf(\"%d\\n\", {p} ? *{p} : -1);"
    body_fix = f"    int *{p} = (int*)malloc(sizeof(int));\n    if ({p}) {{ *{p} = 42; printf(\"%d\\n\", *{p}); free({p}); }}"
    code_bug = with_includes(wrap_main(body_bug), need_stdlib=True)
    code_fix = with_includes(wrap_main(body_fix), need_stdlib=True)
    return code_bug, code_fix, "Memory leak (missing free)"

def gen_double_free() -> Tuple[str, str, str]:
    p = rnd_name()
    body_bug = f"    int *{p} = (int*)malloc(sizeof(int));\n    if ({p}) {{ free({p}); free({p}); }}"
    body_fix = f"    int *{p} = (int*)malloc(sizeof(int));\n    if ({p}) {{ free({p}); }}"
    code_bug = with_includes(wrap_main(body_bug), need_stdlib=True)
    code_fix = with_includes(wrap_main(body_fix), need_stdlib=True)
    return code_bug, code_fix, "Double free"

def gen_use_after_free() -> Tuple[str, str, str]:
    p = rnd_name()
    body_bug = f"    int *{p} = (int*)malloc(sizeof(int));\n    if ({p}) {{ *{p} = 5; free({p}); printf(\"%d\\n\", *{p}); }}"
    body_fix = f"    int *{p} = (int*)malloc(sizeof(int));\n    if ({p}) {{ *{p} = 5; printf(\"%d\\n\", *{p}); free({p}); }}"
    code_bug = with_includes(wrap_main(body_bug), need_stdlib=True)
    code_fix = with_includes(wrap_main(body_fix), need_stdlib=True)
    return code_bug, code_fix, "Use-after-free"

def gen_buffer_overflow_strcpy() -> Tuple[str, str, str]:
    size = random.randint(3, 8)
    s = random.choice(STRINGS)
    # ensure buggy case overflows (s length + 1 > size)
    while len(s) + 1 <= size:
        s = s + "x"
    buf = rnd_name()
    body_bug = f"    char {buf}[{size}];\n    strcpy({buf}, \"{s}\");\n    printf(\"%s\\n\", {buf});"
    # fix: increase buffer or use strncpy safely sized
    size_fix = max(size, len(s) + 1)
    body_fix = f"    char {buf}[{size_fix}];\n    strncpy({buf}, \"{s}\", sizeof({buf})-1);\n    {buf}[sizeof({buf})-1] = '\\0';\n    printf(\"%s\\n\", {buf});"
    code_bug = with_includes(wrap_main(body_bug), need_string=True)
    code_fix = with_includes(wrap_main(body_fix), need_string=True)
    return code_bug, code_fix, "Buffer overflow (strcpy)"

def gen_uninitialized_var() -> Tuple[str, str, str]:
    x = rnd_name()
    body_bug = f"    int {x};\n    if ({x} == 1) {{ printf(\"ok\\n\"); }}"
    body_fix = f"    int {x} = 0;\n    if ({x} == 1) {{ printf(\"ok\\n\"); }}"
    code_bug = with_includes(wrap_main(body_bug))
    code_fix = with_includes(wrap_main(body_fix))
    return code_bug, code_fix, "Use of uninitialized variable"

def gen_off_by_one_loop() -> Tuple[str, str, str]:
    i = rnd_name()
    n = random.randint(3, 10)
    body_bug = f"    for (int {i} = 0; {i} <= {n-1}; {i}++) {{ printf(\"%d \", {i}); }} // may access {n} elements if used as index"
    body_fix = f"    for (int {i} = 0; {i} < {n}; {i}++) {{ printf(\"%d \", {i}); }}"
    code_bug = with_includes(wrap_main(body_bug))
    code_fix = with_includes(wrap_main(body_fix))
    return code_bug, code_fix, "Off-by-one loop boundary"

def gen_integer_overflow() -> Tuple[str, str, str]:
    a = rnd_name()
    body_bug = f"    int {a} = 2147483647; int b = {a} + 1; printf(\"%d\\n\", b);"
    body_fix = f"    long long {a} = 2147483647; long long b = {a} + 1; printf(\"%lld\\n\", b);"
    code_bug = with_includes(wrap_main(body_bug))
    code_fix = with_includes(wrap_main(body_fix))
    return code_bug, code_fix, "Integer overflow (signed 32-bit)"

def gen_double_fclose() -> Tuple[str, str, str]:
    f = rnd_name()
    body_bug = f"    FILE *{f} = fopen(\"file.txt\", \"w\");\n    if ({f}) {{ fprintf({f}, \"hi\"); fclose({f}); fclose({f}); }}"
    body_fix = f"    FILE *{f} = fopen(\"file.txt\", \"w\");\n    if ({f}) {{ fprintf({f}, \"hi\"); fclose({f}); }}"
    code_bug = with_includes(wrap_main(body_bug))
    code_fix = with_includes(wrap_main(body_fix))
    return code_bug, code_fix, "Double fclose"

def gen_div_by_zero() -> Tuple[str, str, str]:
    x = rnd_name()
    y = rnd_name()
    body_bug = f"    int {x} = {random.randint(1, 9)}; int {y} = 0; printf(\"%d\\n\", {x}/{y});"
    body_fix = f"    int {x} = {random.randint(1, 9)}; int {y} = {random.randint(1, 9)}; if ({y} != 0) printf(\"%d\\n\", {x}/{y});"
    code_bug = with_includes(wrap_main(body_bug))
    code_fix = with_includes(wrap_main(body_fix))
    return code_bug, code_fix, "Division by zero"

BUG_GENERATORS = [
    gen_out_of_bounds,
    gen_null_deref,
    gen_memory_leak,
    gen_double_free,
    gen_use_after_free,
    gen_buffer_overflow_strcpy,
    gen_uninitialized_var,
    gen_off_by_one_loop,
    gen_integer_overflow,
    gen_double_fclose,
    gen_div_by_zero,
]

# ---------- Clean-only generators (look similar but correct) ----------
def gen_clean_malloc_free():
    p = rnd_name()
    body = f"    int *{p} = (int*)malloc(sizeof(int));\n    if ({p}) {{ *{p} = 7; printf(\"%d\\n\", *{p}); free({p}); }}"
    return with_includes(wrap_main(body), need_stdlib=True)

def gen_clean_strncpy():
    b = rnd_name()
    s = random.choice(STRINGS)
    size = len(s) + 1 + random.randint(0,3)
    body = f"    char {b}[{size}];\n    strncpy({b}, \"{s}\", sizeof({b})-1);\n    {b}[sizeof({b})-1] = '\\0';\n    printf(\"%s\\n\", {b});"
    return with_includes(wrap_main(body), need_string=True)

def gen_clean_bounds():
    n = random.randint(3, 10)
    arr = rnd_name()
    idx_ok = n-1
    body = f"    int {arr}[{n}];\n    {arr}[{idx_ok}] = 5;\n    printf(\"%d\\n\", {arr}[{idx_ok}]);"
    return with_includes(wrap_main(body))

CLEAN_ONLY = [gen_clean_malloc_free, gen_clean_strncpy, gen_clean_bounds]

# ---------- Build dataset ----------
def build_dataset(samples_per_bug: int = 80, add_clean_only: int = 200) -> List[Dict]:
    """
    samples_per_bug: how many buggy examples PER bug type (each paired with a fixed version)
    add_clean_only: additional clean examples from CLEAN_ONLY generators
    """
    records: List[Dict] = []
    for gen in BUG_GENERATORS:
        for _ in range(samples_per_bug):
            code_bug, code_fix, bug_desc = gen()
            # Slight noise
            code_bug = messy_spaces(code_bug)
            code_fix = messy_spaces(code_fix)

            prompt_bug = f"Find bug in this C code:\n\n{code_bug}"
            resp_bug   = f"Bug: {bug_desc}. Provide a safe fix. Example fix:\n{code_fix}"
            prompt_clean = f"Find bug in this C code:\n\n{code_fix}"
            resp_clean   = "No obvious bug detected."

            # Buggy sample
            records.append(
                make_record(prompt_bug, resp_bug, 1, bug_desc, code_bug, code_fix, difficulty="easy")
            )
            # Clean counterpart (paired)
            records.append(
                make_record(prompt_clean, resp_clean, 0, "none", code_fix, code_fix, difficulty="easy")
            )

    # Add extra clean-only samples (not tied to a specific bug)
    for _ in range(add_clean_only):
        code_ok = random.choice(CLEAN_ONLY)()
        code_ok = messy_spaces(code_ok)
        prompt = f"Find bug in this C code:\n\n{code_ok}"
        records.append(
            make_record(prompt, "No obvious bug detected.", 0, "none", code_ok, code_ok, difficulty="easy")
        )

    # Deduplicate by (code, label)
    seen = set()
    unique = []
    for r in records:
        key = (r["code"], r["label"])
        if key not in seen:
            seen.add(key)
            unique.append(r)

    random.shuffle(unique)
    return unique

dataset = build_dataset(samples_per_bug=80, add_clean_only=200)  # ~ (11 bug types * 80 * 2) + 200 ≈ 1,960 samples
print(f"Total samples: {len(dataset)} (≈50% buggy, 50% clean)")

# ---------- Train/test split and save ----------
split = int(0.8 * len(dataset))
train, test = dataset[:split], dataset[split:]

def dump_jsonl(path: str, rows: List[Dict]):
    with open(path, "w", encoding="utf-8") as f:
        for r in rows:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")

dump_jsonl(os.path.join(OUT_DIR, "train.jsonl"), train)
dump_jsonl(os.path.join(OUT_DIR, "test.jsonl"),  test)

# Additionally save small JSON for quick inspection
with open(os.path.join(OUT_DIR, "stats.json"), "w", encoding="utf-8") as f:
    counts = {"total": len(dataset),
              "train": len(train),
              "test": len(test),
              "buggy": sum(1 for r in dataset if r["label"]==1),
              "clean": sum(1 for r in dataset if r["label"]==0)}
    json.dump(counts, f, indent=2)

print("✅ Saved:", os.path.join(OUT_DIR, "train.jsonl"), os.path.join(OUT_DIR, "test.jsonl"))
print("📊 Stats:", counts)
# Peek one buggy and one clean
print("\n--- Buggy example ---")
for r in dataset:
    if r["label"] == 1:
        print(json.dumps({k:r[k] for k in ['bug_type','prompt','response']}, indent=2)[:800])
        break
print("\n--- Clean example ---")
for r in dataset:
    if r["label"] == 0:
        print(json.dumps({k:r[k] for k in ['bug_type','prompt','response']}, indent=2)[:800])


Total samples: 1809 (≈50% buggy, 50% clean)
✅ Saved: c_bug_dataset/train.jsonl c_bug_dataset/test.jsonl
📊 Stats: {'total': 1809, 'train': 1447, 'test': 362, 'buggy': 810, 'clean': 999}

--- Buggy example ---
{
  "bug_type": "Integer overflow (signed 32-bit)",
  "prompt": "Find bug in this C code:\n\n#include <stdio.h>\n\nint main(){\n  int y = 2147483647; int b = y + 1; printf(\"%d\\n\", b);\n  return 0;\n}",
  "response": "Bug: Integer overflow (signed 32-bit). Provide a safe fix. Example fix:\n#include <stdio.h>\n\nint main(){\n  long long y = 2147483647; long long b = y + 1; printf(\"%lld\\n\", b);\n  return 0;\n}"
}

--- Clean example ---
{
  "bug_type": "none",
  "prompt": "Find bug in this C code:\n\n#include <stdio.h>\n\nint main(){\n  long long k6 = 2147483647; long long b = k6 + 1; printf(\"%lld\\n\", b);\n  return 0;\n}",
  "response": "No obvious bug detected."
}
{
  "bug_type": "none",
  "prompt": "Find bug in this C code:\n\n#include <stdio.h>\n    \n    int main(){\n     