# In [6]:
# ===========================
# Script 1: Synthetic Dataset Generator
# ===========================
import json
import uuid
import random

# --------------------------
# BUGGY CODE CATEGORIES
# --------------------------
# Catalogue of synthetic C bug templates, keyed by bug-category name.
# Every value is a dict with exactly three string fields:
#   "buggy"       - C source that exhibits the defect
#   "fixed"       - the corrected counterpart of the same program
#   "explanation" - one-sentence description of the defect
# The C sources are regular (non-raw) Python strings, so "\\n" and "\\0" below
# become the two-character C escapes "\n" and "\0" in the stored text.
BUG_TYPES = {
    # --- Memory Safety Issues ---
    "buffer_overflow": {
        "buggy": """#include <stdio.h>
int main() {
  char buf[5];
  sprintf(buf, "HelloWorld"); // overflow
  printf("%s\\n", buf);
  return 0;
}""",
        "fixed": """#include <stdio.h>
#include <string.h>
int main() {
  char buf[11];
  strncpy(buf, "HelloWorld", sizeof(buf)-1);
  buf[sizeof(buf)-1] = '\\0';
  printf("%s\\n", buf);
  return 0;
}""",
        "explanation": "Buffer overflow due to writing beyond array bounds."
    },
    "off_by_one": {
        "buggy": """#include <stdio.h>
int main() {
  int arr[5];
  for(int i=0;i<=5;i++) arr[i]=i; // off-by-one
  return 0;
}""",
        "fixed": """#include <stdio.h>
int main() {
  int arr[5];
  for(int i=0;i<5;i++) arr[i]=i; // fixed
  return 0;
}""",
        "explanation": "Off-by-one loop error causing array overflow."
    },
    "use_after_free": {
        "buggy": """#include <stdlib.h>
#include <stdio.h>
int main() {
  int *p = malloc(5*sizeof(int));
  free(p);
  p[0]=10; // use-after-free
  return 0;
}""",
        "fixed": """#include <stdlib.h>
#include <stdio.h>
int main() {
  int *p = malloc(5*sizeof(int));
  p[0]=10;
  free(p);
  return 0;
}""",
        "explanation": "Use-after-free: accessing memory after freeing."
    },
    "double_free": {
        "buggy": """#include <stdlib.h>
int main() {
  int *p = malloc(10);
  free(p);
  free(p); // double free
  return 0;
}""",
        "fixed": """#include <stdlib.h>
int main() {
  int *p = malloc(10);
  free(p);
  p=NULL; // safe
  return 0;
}""",
        "explanation": "Double free: freeing the same memory twice."
    },
    "memory_leak": {
        "buggy": """#include <stdlib.h>
int main() {
  int *p = malloc(100);
  p[0]=1;
  return 0; // leak
}""",
        "fixed": """#include <stdlib.h>
int main() {
  int *p = malloc(100);
  p[0]=1;
  free(p);
  return 0;
}""",
        "explanation": "Memory leak: allocated memory not freed."
    },
    "null_pointer": {
        "buggy": """#include <stdio.h>
int main() {
  int *p=NULL;
  printf("%d", *p); // null deref
}""",
        "fixed": """#include <stdio.h>
int main() {
  int *p=NULL;
  if(p) printf("%d", *p);
  else printf("Pointer is NULL");
}""",
        "explanation": "Null pointer dereference."
    },
    "uninitialized_variable": {
        "buggy": """#include <stdio.h>
int main() {
  int x;
  printf("%d", x); // uninitialized
}""",
        "fixed": """#include <stdio.h>
int main() {
  int x=0;
  printf("%d", x);
}""",
        "explanation": "Uninitialized variable used."
    },
    # Both variants include <stdio.h> for printf and store a value through p
    # before reading it, so the only difference between them is whether the
    # read happens after free() (buggy) or before it (fixed).
    "dangling_pointer": {
        "buggy": """#include <stdlib.h>
#include <stdio.h>
int main() {
  int *p=malloc(4);
  *p=42;
  free(p);
  printf("%d", *p); // dangling
}""",
        "fixed": """#include <stdlib.h>
#include <stdio.h>
int main() {
  int *p=malloc(4);
  *p=42;
  printf("%d", *p);
  free(p);
  p=NULL;
}""",
        "explanation": "Dangling pointer after free."
    },
    "integer_overflow": {
        "buggy": """#include <stdio.h>
int main() {
  int x = 2147483647;
  x = x + 1; // overflow
  printf("%d", x);
}""",
        "fixed": """#include <stdio.h>
#include <limits.h>
int main() {
  long long x = INT_MAX;
  x = x + 1;
  printf("%lld", x);
}""",
        "explanation": "Integer overflow beyond int limit."
    },

    # --- Concurrency Issues ---
    "deadlock": {
        "buggy": """#include <pthread.h>
pthread_mutex_t m1=PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t m2=PTHREAD_MUTEX_INITIALIZER;
void* t1(void* arg){
  pthread_mutex_lock(&m1);
  pthread_mutex_lock(&m2);
  pthread_mutex_unlock(&m2);
  pthread_mutex_unlock(&m1);
  return NULL;
}
void* t2(void* arg){
  pthread_mutex_lock(&m2);
  pthread_mutex_lock(&m1);
  pthread_mutex_unlock(&m1);
  pthread_mutex_unlock(&m2);
  return NULL;
}""",
        "fixed": """#include <pthread.h>
pthread_mutex_t m1=PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t m2=PTHREAD_MUTEX_INITIALIZER;
void* t1(void* arg){
  pthread_mutex_lock(&m1);
  pthread_mutex_lock(&m2);
  pthread_mutex_unlock(&m2);
  pthread_mutex_unlock(&m1);
  return NULL;
}
void* t2(void* arg){
  pthread_mutex_lock(&m1); // consistent order
  pthread_mutex_lock(&m2);
  pthread_mutex_unlock(&m2);
  pthread_mutex_unlock(&m1);
  return NULL;
}""",
        "explanation": "Deadlock due to inconsistent lock order."
    },
    "race_condition": {
        "buggy": """#include <pthread.h>
#include <stdio.h>
int counter=0;
void* f(void* arg){
  for(int i=0;i<1000;i++) counter++;
  return NULL;
}""",
        "fixed": """#include <pthread.h>
#include <stdio.h>
int counter=0;
pthread_mutex_t lock=PTHREAD_MUTEX_INITIALIZER;
void* f(void* arg){
  for(int i=0;i<1000;i++){
    pthread_mutex_lock(&lock);
    counter++;
    pthread_mutex_unlock(&lock);
  }
  return NULL;
}""",
        "explanation": "Race condition: counter modified unsafely."
    },
    "unlock_without_lock": {
        "buggy": """#include <pthread.h>
pthread_mutex_t m=PTHREAD_MUTEX_INITIALIZER;
int main(){
  pthread_mutex_unlock(&m); // unlock without lock
}""",
        "fixed": """#include <pthread.h>
pthread_mutex_t m=PTHREAD_MUTEX_INITIALIZER;
int main(){
  pthread_mutex_lock(&m);
  pthread_mutex_unlock(&m);
}""",
        "explanation": "Unlock without holding lock."
    },
    "sleep_in_critical": {
        "buggy": """#include <pthread.h>
#include <unistd.h>
pthread_mutex_t m=PTHREAD_MUTEX_INITIALIZER;
void* f(void* arg){
  pthread_mutex_lock(&m);
  sleep(5); // unsafe
  pthread_mutex_unlock(&m);
  return NULL;
}""",
        "fixed": """#include <pthread.h>
#include <unistd.h>
pthread_mutex_t m=PTHREAD_MUTEX_INITIALIZER;
void* f(void* arg){
  pthread_mutex_lock(&m);
  // critical work
  pthread_mutex_unlock(&m);
  sleep(5);
  return NULL;
}""",
        "explanation": "Sleeping in critical section blocks other threads."
    },

    # --- Misc Logic Issues ---
    "division_by_zero": {
        "buggy": """#include <stdio.h>
int main(){int x=0; printf("%d", 10/x);}""",
        "fixed": """#include <stdio.h>
int main(){int x=0; if(x!=0) printf("%d", 10/x);}""",
        "explanation": "Division by zero."
    },
    "format_string": {
        "buggy": """#include <stdio.h>
int main(){char buf[20]; scanf(buf); printf(buf);}""",
        "fixed": """#include <stdio.h>
int main(){char buf[20]; scanf("%19s", buf); printf("%s", buf);}""",
        "explanation": "Format string vulnerability."
    },
    "signed_unsigned_mismatch": {
        "buggy": """#include <stdio.h>
int main(){int x=-1; unsigned int y=x; printf("%u", y);}""",
        "fixed": """#include <stdio.h>
#include <stdint.h>
int main(){int32_t x=-1; printf("%d", x);}""",
        "explanation": "Signed/unsigned mismatch."
    },
    "stack_overflow": {
        "buggy": """void f(){int arr[10000000]; f();} int main(){f();}""",
        "fixed": """#include <stdlib.h>
void f(){int *arr=malloc(10000000*sizeof(int)); free(arr);} int main(){f();}""",
        "explanation": "Stack overflow via deep recursion or huge array."
    },
    "array_index_negative": {
        "buggy": """#include <stdio.h>
int main(){int arr[5]; arr[-1]=10;}""",
        "fixed": """#include <stdio.h>
int main(){int arr[5]; int idx=0; if(idx>=0) arr[idx]=10;}""",
        "explanation": "Array index negative."
    },
    "shift_overflow": {
        "buggy": """#include <stdio.h>
int main(){int x=1<<40; printf("%d", x);}""",
        "fixed": """#include <stdio.h>
#include <stdint.h>
int main(){long long x=1LL<<40; printf("%lld", x);}""",
        "explanation": "Bit shift overflow."
    },
    "logic_error": {
        "buggy": """#include <stdio.h>
int main(){int a=5,b=10; if(a>b) printf("a bigger");}""",
        "fixed": """#include <stdio.h>
int main(){int a=5,b=10; if(a<b) printf("a smaller");}""",
        "explanation": "Logic error in condition."
    }
}
# --------------------------
# NO-BUG CATEGORIES
# --------------------------
# C programs used as negative samples (label 0, bug_type "none").
# generate_balanced_dataset() draws from this list with random.choice, so an
# individual snippet may appear many times in the output.
NO_BUG_EXAMPLES = [
    # Minimal program: prints a greeting and returns 0.
    """#include <stdio.h>
int main() {
  printf("Hello, World!\\n");
  return 0;
}""",
    # Heap array of 10 ints: allocated, filled within bounds, then freed.
    # NOTE(review): the malloc result is not NULL-checked.
    """#include <stdlib.h>
int main() {
  int *p = malloc(10 * sizeof(int));
  for (int i = 0; i < 10; i++) p[i] = i;
  free(p);
  return 0;
}""",
    # Shared counter incremented only while holding a mutex; main() itself
    # does not start any threads.
    """#include <pthread.h>
#include <stdio.h>
pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
int counter=0;
void* f(void* arg){
  for(int i=0;i<1000;i++){
    pthread_mutex_lock(&m);
    counter++;
    pthread_mutex_unlock(&m);
  }
  return NULL;
}
int main(){return 0;}""",
    # Bounded string copy (strncpy limited to sizeof(dst)-1) followed by an
    # explicit terminator write to the last byte.
    """#include <string.h>
#include <stdio.h>
int main(){
  char src[]="abcd";
  char dst[10];
  strncpy(dst, src, sizeof(dst)-1);
  dst[sizeof(dst)-1]='\\0';
  printf("%s\\n", dst);
  return 0;
}""",
    # Simple helper function call: prints the result of add(3,4).
    """#include <stdio.h>
int add(int a,int b){return a+b;}
int main(){
  printf("%d", add(3,4));
  return 0;
}"""
]

# --------------------------
# GENERATOR FUNCTION
# --------------------------
def generate_balanced_dataset(samples_per_category):
    """Build a shuffled list of synthetic bug-detection records.

    For every category in ``BUG_TYPES`` the function emits
    ``samples_per_category`` positive records (``label`` 1), followed by
    ``samples_per_category`` negative records (``label`` 0) whose code is
    drawn at random from ``NO_BUG_EXAMPLES``. The combined list is shuffled
    before it is returned.

    Args:
        samples_per_category: Number of records to emit for each category,
            including the "none" category.

    Returns:
        A list of dicts with the keys ``id``, ``prompt``, ``response``,
        ``label``, ``bug_type``, ``code``, ``fixed_code``, ``difficulty``
        and ``source``.
    """
    difficulty_levels = ["easy", "medium", "hard"]

    # Positive samples: the same template is repeated per category; only the
    # UUID and the randomly drawn difficulty differ between repetitions.
    records = [
        {
            "id": str(uuid.uuid4()),
            "prompt": f"Find bug in this C code:\n\n{template['buggy']}",
            "response": template["explanation"],
            "label": 1,
            "bug_type": category,
            "code": template["buggy"],
            "fixed_code": template["fixed"],
            "difficulty": random.choice(difficulty_levels),
            "source": "synthetic",
        }
        for category, template in BUG_TYPES.items()
        for _ in range(samples_per_category)
    ]

    # Negative samples: the snippet is chosen before the record is assembled,
    # and the "fixed" code is simply the snippet itself.
    remaining = samples_per_category
    while remaining > 0:
        snippet = random.choice(NO_BUG_EXAMPLES)
        records.append(
            {
                "id": str(uuid.uuid4()),
                "prompt": f"Find bug in this C code:\n\n{snippet}",
                "response": "No obvious bug detected.",
                "label": 0,
                "bug_type": "none",
                "code": snippet,
                "fixed_code": snippet,
                "difficulty": "easy",
                "source": "synthetic",
            }
        )
        remaining -= 1

    random.shuffle(records)
    return records

# --------------------------
# SAVE DATASET
# --------------------------
# Generate 5000 records per category and persist them as indented JSON.
dataset = generate_balanced_dataset(samples_per_category=5000)
# An explicit encoding keeps the written file independent of the platform's
# locale-dependent default text encoding.
with open("synthetic_c_bugs.json", "w", encoding="utf-8") as f:
    json.dump(dataset, f, indent=2)

# The "+ 1" accounts for the no-bug category, which is not a key of BUG_TYPES.
print(
    f"✅ Saved dataset with {len(dataset)} entries across "
    f"{len(BUG_TYPES) + 1} categories (including no-bug)."
)


# Output: ✅ Saved dataset with 105000 entries across 21 categories (including no-bug).
