In [None]:
# === Generate a C & Linux system programming dataset (JSONL) ===
# Outputs:
#   /content/data/sysprog_train_large.jsonl   (training split)
#   /content/data/sysprog_valid_small.jsonl   (held-out validation split)

import os, json, random, textwrap

# Fix the RNG seed so the random sampling and shuffle later in this script
# are reproducible from run to run.
random.seed(13)
# Create the output directory up front; exist_ok avoids an error on re-runs.
os.makedirs("/content/data", exist_ok=True)

def fence(code: str, lang: str = "c") -> str:
    """Wrap *code* in a Markdown fenced block tagged with *lang*.

    Leading and trailing whitespace is stripped from *code* before wrapping.
    """
    ticks = "```"
    return "\n".join([f"{ticks}{lang}", code.strip(), ticks])

def c_prog(body: str, includes=None, defs: str | None = None) -> str:
    """Wrap a main() with includes and optional #defines."""
    inc = includes or ["stdio.h"]
    code = "".join([f"#include <{h}>\n" for h in inc])
    if defs:
        code += defs.strip() + "\n"
    code += "\nint main(int argc, char **argv) {\n"
    code += textwrap.indent(body.strip(), "    ")
    code += "\n}\n"
    return code

# Accumulator for every generated sample. Each item added in this script is a
# dict with "instruction", "input" and "output" keys.
samples: list[dict] = []

# 1) Process creation & waiting
# Two samples: a complete fork()/waitpid() program, and a prose answer on
# fork() vs vfork() followed by a fenced micro example.
samples += [
    {
        "instruction": "Write a C program that creates a child with fork(), waits for it, and prints both PIDs.",
        "input": "",
        "output": fence(
            c_prog(
                # Child prints its PID and returns; parent waits for it and
                # reports the child's exit status.
                body=(
                    "pid_t pid = fork();\n"
                    "if (pid < 0) { perror(\"fork\"); return 1; }\n"
                    "if (pid == 0) { printf(\"child pid=%d\\n\", getpid()); return 0; }\n"
                    "int st = 0; waitpid(pid, &st, 0);\n"
                    "printf(\"parent pid=%d, child=%d, status=%d\\n\", getpid(), pid, WEXITSTATUS(st));\n"
                    "return 0;"
                ),
                includes=["stdio.h","sys/types.h","sys/wait.h","unistd.h"],
            )
        ),
    },
    {
        "instruction": "Explain the difference between fork() and vfork() with concise rationale and a micro example.",
        "input": "",
        # Prose explanation first, then a blank line and the fenced example.
        "output": (
            "fork() duplicates the process using copy-on-write; parent and child run independently. "
            "vfork() runs the child first while sharing the parent's address space until exec/exit; "
            "the parent is suspended to avoid undefined behavior.\n\n"
            + fence(
                c_prog(
                    body="pid_t p = vfork(); if (p == 0) { _exit(0); } return 0;",
                    includes=["unistd.h"]
                )
            )
        ),
    },
]

# 2) Files & POSIX I/O
# One sample. The C source is written out by hand instead of going through
# c_prog() because it is a standalone function (print_file) with no main().
samples += [
    {
        "instruction": "Write a C function that reads a file and prints it to stdout using POSIX read() in a loop.",
        "input": "",
        "output": fence(
            (
                "#define _XOPEN_SOURCE 700\n"
                "#include <unistd.h>\n#include <fcntl.h>\n#include <stdio.h>\n#include <errno.h>\n#include <string.h>\n\n"
                "void print_file(const char *path) {\n"
                "    int fd = open(path, O_RDONLY);\n"
                "    if (fd < 0) { perror(\"open\"); return; }\n"
                "    char buf[4096]; ssize_t n;\n"
                "    while ((n = read(fd, buf, sizeof(buf))) > 0) {\n"
                "        if (write(STDOUT_FILENO, buf, n) < 0) { perror(\"write\"); break; }\n"
                "    }\n"
                "    if (n < 0) perror(\"read\");\n"
                "    if (close(fd) < 0) perror(\"close\");\n"
                "}\n"
            ),
            "c",
        ),
    }
]

# 3) Signals
# Both handlers live at file scope (passed through c_prog's `defs`) because
# ISO C has no nested functions, and they only call async-signal-safe
# functions (write, waitpid, _exit) -- printf() is not on that list.
samples += [
    {
        "instruction": "Write a C program that installs a SIGINT handler, counts signals, and exits after 3 Ctrl-C presses.",
        "input": "",
        "output": fence(
            c_prog(
                defs=(
                    "static volatile sig_atomic_t cnt = 0;\n"
                    "static void h(int s) {\n"
                    "    (void)s;\n"
                    "    cnt++;\n"
                    "    char msg[] = \"SIGINT 0\\n\";\n"
                    "    msg[7] = (char)('0' + cnt);  /* cnt is 1..3, so one digit is enough */\n"
                    "    write(STDOUT_FILENO, msg, sizeof(msg) - 1);\n"
                    "    if (cnt >= 3) _exit(0);\n"
                    "}\n"
                ),
                body=(
                    "struct sigaction sa = {0}; sa.sa_handler = h; sigemptyset(&sa.sa_mask); sa.sa_flags = SA_RESTART;\n"
                    "if (sigaction(SIGINT, &sa, NULL) < 0) { perror(\"sigaction\"); return 1; }\n"
                    "printf(\"press Ctrl-C up to 3 times\\n\");\n"
                    "fflush(stdout);  /* the handler leaves via _exit(), which skips stdio flushing */\n"
                    "for (;;) pause();"
                ),
                includes=["stdio.h","signal.h","unistd.h","stdlib.h"],
            )
        ),
    },
    {
        "instruction": "Explain reliable signal handling with sigaction() vs signal(). Provide an example installing a SIGCHLD handler.",
        "input": "",
        "output": (
            "sigaction() is preferred for predictable semantics, flags (SA_RESTART, SA_NOCLDWAIT), and mask control; "
            "signal() is older and less consistent across systems.\n\n" +
            fence(
                c_prog(
                    # The handler only reaps children and records the status;
                    # main() does the printing.
                    defs=(
                        "static volatile sig_atomic_t reaped = 0;\n"
                        "static volatile sig_atomic_t last_status = 0;\n"
                        "static void h(int s) {\n"
                        "    (void)s;\n"
                        "    int saved = errno, st;\n"
                        "    while (waitpid(-1, &st, WNOHANG) > 0) { last_status = st; reaped = 1; }\n"
                        "    errno = saved;\n"
                        "}\n"
                    ),
                    # SIGCHLD is blocked before fork() and only unblocked inside
                    # sigsuspend(), so a child that exits immediately cannot slip
                    # in before the parent starts waiting and leave it blocked.
                    body=(
                        "struct sigaction sa = {0}; sa.sa_handler = h; sigemptyset(&sa.sa_mask); sa.sa_flags = SA_RESTART;\n"
                        "if (sigaction(SIGCHLD, &sa, NULL) < 0) { perror(\"sigaction\"); return 1; }\n"
                        "sigset_t blk, old; sigemptyset(&blk); sigaddset(&blk, SIGCHLD);\n"
                        "sigprocmask(SIG_BLOCK, &blk, &old);\n"
                        "pid_t pid = fork();\n"
                        "if (pid < 0) { perror(\"fork\"); return 1; }\n"
                        "if (pid == 0) { _exit(0); }\n"
                        "while (!reaped) sigsuspend(&old);\n"
                        "int st = last_status;\n"
                        "printf(\"child exit=%d\\n\", WEXITSTATUS(st));\n"
                        "return 0;"
                    ),
                    includes=["stdio.h","errno.h","signal.h","sys/wait.h","unistd.h"]
                )
            )
        ),
    }
]

# 4) Threads (pthreads)
# The thread routine and the state it shares are emitted at file scope via
# c_prog's `defs`: ISO C has no nested functions, so defining run() inside
# main() only compiles as a GCC extension.
samples += [
    {
        "instruction": "Write a C program that spawns 4 POSIX threads; each increments a shared counter with a mutex.",
        "input": "",
        "output": fence(
            c_prog(
                defs=(
                    "static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;\n"
                    "static int counter = 0;\n"
                    "static void* run(void* arg) { (void)arg; for (int i=0;i<100000;i++){ pthread_mutex_lock(&m); counter++; pthread_mutex_unlock(&m);} return NULL; }\n"
                ),
                body=(
                    "pthread_t th[4];\n"
                    "for (int i=0;i<4;i++) pthread_create(&th[i], NULL, run, NULL);\n"
                    "for (int i=0;i<4;i++) pthread_join(th[i], NULL);\n"
                    "printf(\"counter=%d\\n\", counter);\n"
                    "return 0;"
                ),
                includes=["stdio.h","pthread.h"]
            )
        ),
    },
    {
        "instruction": "Demonstrate pthread condition variables with a producer/consumer of a single-slot buffer.",
        "input": "",
        # Written out by hand: this sample already keeps its functions and
        # shared state at file scope and supplies its own main().
        "output": fence(
            (
                "#include <stdio.h>\n#include <pthread.h>\n\n"
                "pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER; pthread_cond_t cv = PTHREAD_COND_INITIALIZER; int ready=0; int value=0;\n"
                "void* producer(void* arg){ pthread_mutex_lock(&m); value=42; ready=1; pthread_cond_signal(&cv); pthread_mutex_unlock(&m); return NULL; }\n"
                "void* consumer(void* arg){ pthread_mutex_lock(&m); while(!ready) pthread_cond_wait(&cv,&m); printf(\"got %d\\n\", value); pthread_mutex_unlock(&m); return NULL; }\n"
                "int main(){ pthread_t p,c; pthread_create(&p,NULL,producer,NULL); pthread_create(&c,NULL,consumer,NULL); pthread_join(p,NULL); pthread_join(c,NULL); return 0; }\n"
            ),
            "c",
        ),
    },
]

# 5) IPC: pipe, POSIX shared memory
# Two complete programs, both built with c_prog(): a parent-to-child pipe
# message, and a shm_open()/mmap() region written by a forked child.
samples += [
    {
        "instruction": "Write a C program that creates a pipe; parent writes a message; child reads and prints it.",
        "input": "",
        "output": fence(
            c_prog(
                # Child closes the write end and reads; parent closes the read
                # end, writes, closes, then waits for the child.
                body=(
                    "int fds[2]; if (pipe(fds) < 0) { perror(\"pipe\"); return 1; }\n"
                    "pid_t p = fork(); if (p < 0) { perror(\"fork\"); return 1; }\n"
                    "if (p == 0) { close(fds[1]); char buf[128]; ssize_t n = read(fds[0], buf, sizeof(buf)-1); if (n>0){ buf[n]='\\0'; printf(\"child: %s\\n\", buf);} return 0; }\n"
                    "close(fds[0]); const char *msg = \"hello from parent\"; write(fds[1], msg, strlen(msg)); close(fds[1]);\n"
                    "waitpid(p, NULL, 0);\n"
                    "return 0;"
                ),
                includes=["stdio.h","string.h","unistd.h","sys/wait.h"]
            )
        ),
    },
    {
        "instruction": "Create and use POSIX shared memory with shm_open + mmap. Write from one process and read from another.",
        "input": "",
        "output": fence(
            c_prog(
                # The mapping is created before fork(), the child writes into
                # it, and the parent reads it back after waitpid().
                body=(
                    "const char *name = \"/demo_shm\"; int fd = shm_open(name, O_CREAT|O_RDWR, 0600);\n"
                    "if (fd < 0) { perror(\"shm_open\"); return 1; }\n"
                    "size_t sz = 4096; if (ftruncate(fd, sz) < 0) { perror(\"ftruncate\"); return 1; }\n"
                    "char *p = mmap(NULL, sz, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);\n"
                    "if (p == MAP_FAILED) { perror(\"mmap\"); return 1; }\n"
                    "pid_t c = fork(); if (c == 0) { strcpy(p, \"hello shm\"); _exit(0);} waitpid(c, NULL, 0);\n"
                    "printf(\"parent sees: %s\\n\", p);\n"
                    "munmap(p, sz); close(fd); shm_unlink(name);\n"
                    "return 0;"
                ),
                includes=["stdio.h","sys/mman.h","sys/stat.h","fcntl.h","unistd.h","string.h","sys/wait.h"]
            )
        ),
    },
]

# 6) Networking & sockets
# One code sample (single-client TCP echo server on port 9090) and one
# prose-only answer comparing select(), poll() and epoll().
samples += [
    {
        "instruction": "Write a minimal TCP echo server in C using IPv4 and accept() one client.",
        "input": "",
        "output": fence(
            c_prog(
                # socket -> setsockopt(SO_REUSEADDR) -> bind -> listen -> accept,
                # then echo whatever recv() returns until it stops returning data.
                body=(
                    "int s = socket(AF_INET, SOCK_STREAM, 0); if (s < 0) { perror(\"socket\"); return 1; }\n"
                    "struct sockaddr_in addr = {0}; addr.sin_family = AF_INET; addr.sin_addr.s_addr = htonl(INADDR_ANY); addr.sin_port = htons(9090);\n"
                    "int on=1; setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));\n"
                    "if (bind(s, (struct sockaddr*)&addr, sizeof(addr)) < 0) { perror(\"bind\"); return 1; }\n"
                    "if (listen(s, 1) < 0) { perror(\"listen\"); return 1; }\n"
                    "int c = accept(s, NULL, NULL); if (c < 0) { perror(\"accept\"); return 1; }\n"
                    "char b[1024]; ssize_t n; while ((n = recv(c, b, sizeof(b), 0)) > 0) { send(c, b, n, 0); }\n"
                    "close(c); close(s);\n"
                    "return 0;"
                ),
                includes=["stdio.h","sys/types.h","sys/socket.h","netinet/in.h","arpa/inet.h","unistd.h"]
            )
        ),
    },
    {
        "instruction": "Explain the difference between select(), poll(), and epoll() and when to use each on Linux.",
        "input": "",
        # Plain-text answer; no code fence.
        "output": (
            "select() uses fixed-size fd-sets and scales poorly; poll() removes the fixed set limit but still scans O(n). "
            "epoll() maintains readiness sets and scales better for many fds; use epoll for high concurrency."
        ),
    },
]

# 7) mmap demo
# One sample: map /etc/hosts read-only and print at most its first 64 bytes.
samples += [
    {
        "instruction": "Demonstrate using mmap() for a file-backed mapping and print the first 64 bytes.",
        "input": "",
        "output": fence(
            c_prog(
                # The file size comes from lseek(SEEK_END); the print loop is
                # bounded by both 64 and that size.
                body=(
                    "int fd = open(\"/etc/hosts\", O_RDONLY); if (fd < 0) { perror(\"open\"); return 1; }\n"
                    "off_t sz = lseek(fd, 0, SEEK_END); lseek(fd, 0, SEEK_SET);\n"
                    "char *p = mmap(NULL, sz, PROT_READ, MAP_PRIVATE, fd, 0);\n"
                    "if (p == MAP_FAILED) { perror(\"mmap\"); return 1; }\n"
                    "for (size_t i=0; i<64 && i<(size_t)sz; i++) putchar(p[i]); putchar('\\n');\n"
                    "munmap(p, sz); close(fd);\n"
                    "return 0;"
                ),
                includes=["stdio.h","sys/mman.h","sys/stat.h","fcntl.h","unistd.h"]
            )
        ),
    }
]

# 8) Timers & inotify
# Two complete programs: a timerfd that is read five times, and an inotify
# watch on /tmp that is polled five times, one second apart.
samples += [
    {
        "instruction": "Create a timerfd that ticks once per second and prints a counter for 5 ticks.",
        "input": "",
        "output": fence(
            c_prog(
                # Both fields of the itimerspec are set to one second; each
                # read() on the timer fd is followed by one "tick" line.
                body=(
                    "int t = timerfd_create(CLOCK_MONOTONIC, 0); if (t < 0) { perror(\"timerfd_create\"); return 1; }\n"
                    "struct itimerspec its = {{1,0}, {1,0}}; timerfd_settime(t, 0, &its, NULL);\n"
                    "for (int i=1;i<=5;i++){ uint64_t exp; read(t,&exp,sizeof(exp)); printf(\"tick %d\\n\", i); }\n"
                    "close(t); return 0;"
                ),
                includes=["stdio.h","sys/timerfd.h","unistd.h","stdint.h"]
            )
        ),
    },
    {
        "instruction": "Use inotify to watch /tmp for IN_CREATE events and print created names.",
        "input": "",
        "output": fence(
            c_prog(
                # The descriptor is opened with IN_NONBLOCK, so the loop only
                # parses events when read() returned data (len > 0) and then
                # sleeps for a second before trying again.
                body=(
                    "int fd = inotify_init1(IN_NONBLOCK); if (fd < 0) { perror(\"inotify_init1\"); return 1; }\n"
                    "int wd = inotify_add_watch(fd, \"/tmp\", IN_CREATE); if (wd < 0) { perror(\"inotify_add_watch\"); return 1; }\n"
                    "char buf[4096] __attribute__((aligned(__alignof__(struct inotify_event))));\n"
                    "for (int i=0;i<5;i++) {\n"
                    "    ssize_t len = read(fd, buf, sizeof(buf));\n"
                    "    if (len > 0) {\n"
                    "        size_t off = 0; while (off < (size_t)len) {\n"
                    "            struct inotify_event *ev = (struct inotify_event*)(buf+off);\n"
                    "            if (ev->mask & IN_CREATE) printf(\"created: %s\\n\", ev->name);\n"
                    "            off += sizeof(*ev) + ev->len;\n"
                    "        }\n"
                    "    }\n"
                    "    sleep(1);\n"
                    "}\n"
                    "inotify_rm_watch(fd, wd); close(fd); return 0;"
                ),
                includes=["stdio.h","sys/inotify.h","unistd.h","sys/types.h","sys/stat.h","fcntl.h","string.h","time.h"]
            )
        ),
    },
]

# 9) Debugging & fixes
# Two diagnosis-style samples: a NULL-dereference explanation followed by a
# fenced fix snippet, and a prose-only reading of an strace ENOENT line.
samples += [
    {
        "instruction": "Given the code `int *p=NULL; *p=42;` explain the crash and provide a safe fix.",
        "input": "",
        "output": (
            "Dereferencing a NULL pointer causes a segmentation fault. Allocate memory or use a valid object before dereferencing.\n\n" +
            # The fix is a bare snippet (no includes, no main), so it is
            # passed to fence() directly rather than through c_prog().
            fence('int *p = malloc(sizeof *p);\nif (!p) { perror("malloc"); exit(1);} *p = 42; free(p);', "c")
        ),
    },
    {
        "instruction": "strace shows `openat(AT_FDCWD, \"missing.txt\", O_RDONLY) = -1 ENOENT`. Explain and suggest a fix.",
        "input": "",
        "output": (
            "The path does not exist. Create the file, correct the path, or handle ENOENT with a clear error message."
        ),
    },
]

# 10) Programmatic syscall demos (expand coverage)
# (syscall name, C statements for main()) pairs. The instruction generated
# for each pair promises "proper error handling", so every body checks the
# calls it demonstrates and bails out with perror() on failure.
syscalls = [
    (
        "open",
        "int fd=open(\"/tmp/demo.txt\", O_CREAT|O_WRONLY, 0644); if(fd<0){perror(\"open\");return 1;} "
        "if(write(fd, \"hi\\n\", 3)<0){perror(\"write\");close(fd);return 1;} close(fd);",
    ),
    (
        "read",
        "int fd=open(\"/etc/hosts\", O_RDONLY); if(fd<0){perror(\"open\");return 1;} "
        "char b[256]; ssize_t n=read(fd,b,sizeof(b)); if(n<0){perror(\"read\");close(fd);return 1;} "
        "if(n>0) write(1,b,n); close(fd);",
    ),
    (
        "stat",
        "struct stat st; if(stat(\"/etc/hosts\", &st)<0){perror(\"stat\");return 1;} printf(\"size=%ld\\n\", (long)st.st_size);",
    ),
    (
        "dup2",
        "int fd=open(\"/tmp/log.txt\", O_CREAT|O_WRONLY|O_TRUNC, 0644); if(fd<0){perror(\"open\");return 1;} "
        "if(dup2(fd,1)<0){perror(\"dup2\");close(fd);return 1;} close(fd); printf(\"to file\\n\");",
    ),
    (
        # munmap() is only reached once mmap() has succeeded.
        "mmap",
        "int fd=open(\"/etc/hosts\",O_RDONLY); if(fd<0){perror(\"open\");return 1;} "
        "off_t sz=lseek(fd,0,SEEK_END); if(sz<0){perror(\"lseek\");close(fd);return 1;} "
        "char* p=mmap(NULL,sz,PROT_READ,MAP_PRIVATE,fd,0); if(p==MAP_FAILED){perror(\"mmap\");close(fd);return 1;} "
        "write(1,p, sz>64?64:sz); munmap(p,sz); close(fd);",
    ),
]
for name, body in syscalls:
    samples.append({
        "instruction": f"Write a minimal C snippet that demonstrates the POSIX {name}() usage with proper error handling.",
        "input": "",
        "output": fence(
            c_prog(
                body=body + "\nreturn 0;",
                # One shared include list that covers every body above.
                includes=["stdio.h","unistd.h","fcntl.h","sys/stat.h","sys/types.h","sys/mman.h"]
            ),
            "c"
        ),
    })

# 11) Pthread variations
# The thread routine and the state it shares go to file scope through
# c_prog's `defs`: ISO C has no nested functions, so a run() defined inside
# main() only builds as a GCC extension. The text does not depend on the
# thread count, so it is built once.
pthread_counter_defs = (
    "static pthread_mutex_t m=PTHREAD_MUTEX_INITIALIZER; static long counter=0;\n"
    "static void* run(void*arg){ (void)arg; for(long i=0;i<200000;i++){ pthread_mutex_lock(&m); counter++; pthread_mutex_unlock(&m);} return NULL; }\n"
)
for nthreads in [2, 4, 8]:
    body = (
        f"pthread_t th[{nthreads}];\n"
        f"for(int i=0;i<{nthreads};i++) pthread_create(&th[i],NULL,run,NULL);\n"
        f"for(int i=0;i<{nthreads};i++) pthread_join(th[i],NULL);\n"
        "printf(\"counter=%ld\\n\", counter);\n"
        "return 0;"
    )
    samples.append({
        "instruction": f"Write a C program that launches {nthreads} threads and increments a shared counter safely using a mutex.",
        "input": "",
        "output": fence(
            c_prog(body=body, includes=["stdio.h","pthread.h"], defs=pthread_counter_defs),
            "c",
        ),
    })

# 12) TCP servers on several ports
# One C body shared by every variant; its single %d placeholder (inside
# htons) receives the port number.
tcp_echo_template = "\n".join([
    "int s=socket(AF_INET,SOCK_STREAM,0); if(s<0){perror(\"socket\");return 1;}",
    "struct sockaddr_in a={0}; a.sin_family=AF_INET; a.sin_addr.s_addr=htonl(INADDR_ANY); a.sin_port=htons(%d);",
    "int on=1; setsockopt(s,SOL_SOCKET,SO_REUSEADDR,&on,sizeof(on));",
    "if(bind(s,(struct sockaddr*)&a,sizeof(a))<0){perror(\"bind\");return 1;} listen(s,1);",
    "int c=accept(s,NULL,NULL); if(c<0){perror(\"accept\");return 1;} char b[256]; ssize_t n=read(c,b,sizeof(b)); if(n>0) write(c,b,n);",
    "close(c); close(s); return 0;",
])
for port in (8080, 9090, 10000):
    body = tcp_echo_template % port
    c_source = c_prog(
        body=body,
        includes=["stdio.h","sys/types.h","sys/socket.h","netinet/in.h","arpa/inet.h","unistd.h"],
    )
    samples.append(
        dict(
            instruction=f"Provide a minimal TCP server in C that listens on port {port} and echoes one line.",
            input="",
            output=fence(c_source, "c"),
        )
    )

# 13) Epoll/select Q&A
# Prose-only samples, kept as (instruction, answer) pairs and expanded into
# the common sample shape with an empty "input".
epoll_select_qa = [
    (
        "When is epoll edge-triggered mode preferable over level-triggered?",
        "Edge-triggered reduces wakeups for high-volume readiness; you must drain the socket until EAGAIN.",
    ),
    (
        "Why can select() fail for fd>1024 without recompiling?",
        "FD_SETSIZE is typically 1024; exceeding it is undefined unless you rebuild with a larger size.",
    ),
]
samples.extend(
    {"instruction": question, "input": "", "output": answer}
    for question, answer in epoll_select_qa
)

# 14) Inotify variations
# Same polling pattern for each watched directory: three non-blocking reads,
# one second apart. `len` is cast to size_t in the parse loop so the
# comparison with `off` is not a signed/unsigned mix.
for path in ["/tmp", "/var/tmp"]:
    body = (
        # Only this line is an f-string, hence the doubled braces around the C block.
        f"int fd=inotify_init1(IN_NONBLOCK); if(fd<0){{perror(\"inotify_init1\");return 1;}} int wd=inotify_add_watch(fd, \"{path}\", IN_CREATE);\n"
        "if(wd<0){perror(\"inotify_add_watch\");return 1;} char buf[4096] __attribute__((aligned(__alignof__(struct inotify_event))));\n"
        "for(int i=0;i<3;i++){ ssize_t len=read(fd,buf,sizeof(buf)); if(len>0){ size_t off=0; while(off<(size_t)len){ struct inotify_event*ev=(struct inotify_event*)(buf+off); if(ev->mask&IN_CREATE) printf(\"created: %s\\n\", ev->name); off+=sizeof(*ev)+ev->len; } } sleep(1);} \n"
        "inotify_rm_watch(fd,wd); close(fd); return 0;"
    )
    samples.append({
        "instruction": f"Show how to watch {path} for IN_CREATE using inotify and print created filenames.",
        "input": "",
        "output": fence(c_prog(body=body, includes=["stdio.h","sys/inotify.h","unistd.h","sys/types.h","sys/stat.h","fcntl.h","string.h","time.h"]), "c"),
    })

# 15) Strace/gdb diagnosis
# Prose-only samples, kept as (instruction, answer) pairs and expanded into
# the common sample shape with an empty "input".
diagnosis_qa = [
    (
        "strace shows repeated EAGAIN on recv(). What does it mean and what should you do?",
        "The socket is non-blocking and no data is available; wait for readability (epoll/select) and retry on EAGAIN.",
    ),
    (
        "gdb backtrace points to free() after double free detected. Explain and suggest mitigation.",
        "Double free corrupts the heap. Free each allocation exactly once, set pointers to NULL after free, and consider ASan/valgrind.",
    ),
]
samples.extend(
    {"instruction": question, "input": "", "output": answer}
    for question, answer in diagnosis_qa
)

# --- Light augmentation: paraphrase some instructions to increase size ---
def tweak(text: str) -> str:
    """Return *text* with a fixed set of instruction phrases reworded.

    The substitutions are applied in order to every occurrence; text that
    contains none of the phrases comes back unchanged.
    """
    rewrites = (
        ("Write a", "Provide a"),
        ("Demonstrate", "Show"),
        ("Explain", "Briefly explain"),
        # Applied last, so the "Demonstrate" it introduces is not turned into "Show".
        ("Create and use", "Demonstrate creating and using"),
    )
    result = text
    for old, new in rewrites:
        result = result.replace(old, new)
    return result

# Paraphrase a random ~35% of the samples by rewording the instruction only;
# "input" and "output" are carried over unchanged.
augmented = []
for s in samples:
    # Draw for every sample so the random stream does not depend on whether
    # the paraphrase below is kept.
    if random.random() >= 0.35:
        continue
    paraphrased = tweak(s["instruction"])
    if paraphrased == s["instruction"]:
        # No rewrite rule matched: adding it would be an exact duplicate.
        continue
    augmented.append({**s, "instruction": paraphrased})

dataset = samples + augmented
random.shuffle(dataset)

# --- Save train/valid (≈90/10 split) ---
# NOTE(review): a paraphrase shares its output with the sample it came from,
# and the shuffle above can place the two on opposite sides of the split.
n = len(dataset)
# Hold out about 10%, and at least one item once there are two or more.
# (A fixed floor of 20 would take close to half of a dataset this small.)
valid_n = max(1, n // 10) if n > 1 else 0
# Slice from the front: dataset[-0:] would be the whole list, not an empty one.
split_at = n - valid_n
train = dataset[:split_at]
valid = dataset[split_at:]

train_path = "/content/data/sysprog_train_large.jsonl"
valid_path = "/content/data/sysprog_valid_small.jsonl"

# One JSON object per line (JSONL).
with open(train_path, "w", encoding="utf-8") as f:
    f.writelines(json.dumps(s) + "\n" for s in train)

with open(valid_path, "w", encoding="utf-8") as f:
    f.writelines(json.dumps(s) + "\n" for s in valid)

print(f"Saved train: {train_path} ({len(train)} samples)")
print(f"Saved valid: {valid_path} ({len(valid)} samples)")
if train:
    print("Sample item:", train[0]["instruction"][:100] + " ...")


Saved train: /content/data/sysprog_train_large.jsonl (25 samples)
Saved valid: /content/data/sysprog_valid_small.jsonl (20 samples)
Sample item: Show how to watch /var/tmp for IN_CREATE using inotify and print created filenames. ...
