# Generate Mutation Training Data (Colab)

Standalone notebook for generating Rust mutation training data using `cargo-mutants`.

**What this does:**
1. Clones curated Rust repositories
2. Runs `cargo-mutants` to introduce mutations
3. Captures (buggy code, compiler error / test failure, fix) triples
4. Saves as JSONL + HuggingFace dataset to Google Drive

**Requirements:** Colab instance with high RAM (recommended: High-RAM runtime with 50+ GB)

**Time estimate:** 2-4 hours for all 21 repos at 100 mutations each

---
## Step 0: Environment Setup

In [None]:
#@title ### 0.1 Mount Drive & Clone Repo
import os
import sys

IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    print("Google Drive mounted")
else:
    raise RuntimeError("This notebook is designed for Google Colab. Use generate_mutations_local.ipynb for local Jupyter.")

REPO_URL = "https://github.com/rmarnold/llm-training-pipeline.git"  #@param {type:"string"}
BRANCH = "main"  #@param {type:"string"}

REPO_DIR = "/content/llm-training-pipeline"

if os.path.exists(REPO_DIR):
    %cd {REPO_DIR}
    !git pull origin {BRANCH}
else:
    !git clone -b {BRANCH} {REPO_URL} {REPO_DIR}
    %cd {REPO_DIR}

PROJECT_ROOT = REPO_DIR
os.chdir(PROJECT_ROOT)
print(f"\nProject root: {PROJECT_ROOT}")

In [None]:
#@title ### 0.2 Install Dependencies
#@markdown Installs Python deps and the Rust toolchain (rustup + cargo-mutants).

print("Installing Python dependencies...")
print("=" * 60)
!pip install -q -e ".[gpt_oss,colab]"

print("\nInstalling Rust toolchain...")
print("=" * 60)
!curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
os.environ["PATH"] = f"{os.environ['HOME']}/.cargo/bin:{os.environ['PATH']}"
!cargo install cargo-mutants

# Verification
print("\n" + "=" * 60)
print("Verification:")
print("=" * 60)

import subprocess
for cmd, label in [("cargo --version", "cargo"), ("cargo mutants --version", "cargo-mutants")]:
    result = subprocess.run(cmd.split(), capture_output=True, text=True)
    if result.returncode == 0:
        print(f"  \u2713 {label}: {result.stdout.strip()}")
    else:
        print(f"  \u2717 {label}: not found")

for pkg in ["datasets", "yaml"]:
    try:
        __import__(pkg if pkg != "yaml" else "yaml")
        print(f"  \u2713 {pkg}")
    except ImportError:
        print(f"  \u2717 {pkg}: not installed")

print("=" * 60)

In [None]:
#@title ### 0.3 Configure { run: "auto" }
#@markdown ---

max_mutations_per_repo = 100  #@param {type:"integer"}
#@markdown Maximum mutations to generate per repository

timeout_per_mutation = 300  #@param {type:"integer"}
#@markdown Timeout per mutation test in seconds

# ------------------------------------------------------------
# Pick a worker count that fits both the CPU count and the RAM
# ------------------------------------------------------------
import multiprocessing

cpu_count = multiprocessing.cpu_count()

# Memory budget: one worker per 20 GB of physical RAM (always at least one).
# If the RAM size cannot be queried, memory places no limit and the CPU-based
# cap below decides alone.
try:
    page_size = os.sysconf('SC_PAGE_SIZE')
    page_total = os.sysconf('SC_PHYS_PAGES')
except (ValueError, OSError):
    total_ram_gb = 0
    ram_jobs = cpu_count
else:
    total_ram_gb = (page_size * page_total) / (1024**3)
    ram_jobs = max(1, int(total_ram_gb / 20))

# CPU budget: leave two cores free, but never drop below one worker.
cpu_jobs = max(1, cpu_count - 2)
mutation_jobs = min(cpu_jobs, ram_jobs)

# Storage layout: clones go to local disk, results go straight to Drive.
DRIVE_BASE = "/content/drive/MyDrive/gpt-oss-20b-rust-agent-v2"
CLONE_DIR = "/tmp/rust_repos"
OUTPUT_DIR = os.path.join(DRIVE_BASE, "data/rust/mutations")

for directory in (OUTPUT_DIR, CLONE_DIR):
    os.makedirs(directory, exist_ok=True)

# Print the effective configuration as one banner.
rule = "=" * 60
ram_label = f"{total_ram_gb:.0f} GB" if total_ram_gb else "unknown"
summary = [
    rule,
    "MUTATION GENERATION CONFIG",
    rule,
    f"  CPUs: {cpu_count}",
    f"  RAM: {ram_label}",
    f"  Parallel jobs: {mutation_jobs}",
    f"  Max mutations/repo: {max_mutations_per_repo}",
    f"  Timeout/mutation: {timeout_per_mutation}s",
    f"  Clone dir: {CLONE_DIR} (local SSD)",
    f"  Output dir: {OUTPUT_DIR} (Drive)",
    rule,
]
print("\n".join(summary))

---
## Step 1: Generate Mutations

In [None]:
#@title ### 1.1 Run Mutation Generation
#@markdown Clones all curated Rust repos and runs cargo-mutants on each.
#@markdown Progress is printed per-repo.

# Launches the generation script as a shell command. It relies on
# mutation_jobs, CLONE_DIR, OUTPUT_DIR, max_mutations_per_repo and
# timeout_per_mutation set by the 0.3 configuration cell, so run that first.
print(f"Starting mutation generation ({mutation_jobs} parallel jobs)...")
print("=" * 60)

# IPython replaces each {name} below with the value of that Python variable
# before passing the command to the shell.
!python scripts/16_generate_mutations.py \
    --config configs/data_sources_rust.yaml \
    --clone_dir {CLONE_DIR} \
    --output_dir {OUTPUT_DIR} \
    --max_mutations_per_repo {max_mutations_per_repo} \
    --timeout_per_mutation {timeout_per_mutation} \
    --jobs {mutation_jobs}

In [None]:
#@title ### 1.2 Monitor Progress (run in a separate tab)
#@markdown Run this cell while 1.1 is executing to check status.

import json
import glob

# Check JSONL output so far
jsonl_path = os.path.join(OUTPUT_DIR, "mutations.jsonl")
if os.path.exists(jsonl_path):
    with open(jsonl_path) as f:
        lines = f.readlines()
    print(f"Training examples so far: {len(lines)}")

    if lines:
        last = json.loads(lines[-1])
        print(f"Last example file: {last.get('explanation', '')[:80]}...")
else:
    print("No output file yet (generation still starting up)")

# Check which repos have been cloned
if os.path.exists(CLONE_DIR):
    repos = [d for d in os.listdir(CLONE_DIR) if os.path.isdir(os.path.join(CLONE_DIR, d))]
    print(f"\nRepos cloned: {len(repos)}")
    for r in sorted(repos):
        print(f"  - {r}")

# Show system resource usage
print("\nSystem resources:")
!free -h 2>/dev/null | head -3 || vm_stat | head -5
print()
!uptime

---
## Step 2: Verify & Inspect Data

In [None]:
#@title ### 2.1 Verify Output
# Summarises the generated artifacts in OUTPUT_DIR: the number of JSONL
# examples, how many are test failures vs. compiler errors, and whether the
# HF dataset directory exists.

import json

jsonl_path = os.path.join(OUTPUT_DIR, "mutations.jsonl")
hf_path = os.path.join(OUTPUT_DIR, "hf_dataset")

print("Output Verification:")
print("=" * 60)

# Check JSONL
if os.path.exists(jsonl_path):
    total = caught = unviable = malformed = 0
    with open(jsonl_path, encoding="utf-8") as f:
        for raw_line in f:
            if not raw_line.strip():
                continue
            total += 1
            try:
                record = json.loads(raw_line)
            except json.JSONDecodeError:
                malformed += 1
                continue
            # Classify on the parsed error_message field, the same way the
            # per-repository stats cell does. Matching substrings in the raw
            # JSON text could also hit quoted text inside other fields.
            message = record.get('error_message', '') if isinstance(record, dict) else ''
            if not isinstance(message, str):
                message = ''
            if 'Test failure' in message:
                caught += 1
            elif 'Compiler error' in message:
                unviable += 1
    size_mb = os.path.getsize(jsonl_path) / (1024 * 1024)
    print(f"  \u2713 JSONL: {total:,} examples ({size_mb:.1f} MB)")

    # Count by type
    print(f"    Caught mutations (test failures): {caught:,}")
    print(f"    Unviable mutations (compiler errors): {unviable:,}")
    if malformed:
        print(f"    Unparseable lines: {malformed:,}")
else:
    print(f"  \u2717 JSONL not found at {jsonl_path}")

# Check HF dataset (isdir, so a stray file with that name cannot break listdir)
if os.path.isdir(hf_path):
    items = os.listdir(hf_path)
    print(f"  \u2713 HF dataset: {hf_path} ({len(items)} files)")
else:
    print("  \u2014 HF dataset not found (generated after JSONL)")

print("=" * 60)

In [None]:
#@title ### 2.2 Inspect Sample Examples
# Prints a truncated view of the first five examples in mutations.jsonl.

import json
from itertools import islice

jsonl_path = os.path.join(OUTPUT_DIR, "mutations.jsonl")

if not os.path.exists(jsonl_path):
    print("No data yet. Run Step 1 first.")
else:
    with open(jsonl_path, encoding="utf-8") as f:
        # Read only as far as the first five non-blank lines instead of
        # loading the entire file to slice off its head.
        non_blank = (line for line in f if line.strip())
        examples = [json.loads(line) for line in islice(non_blank, 5)]

    for i, ex in enumerate(examples, 1):
        print(f"\n{'='*60}")
        print(f"Example {i}")
        print(f"{'='*60}")
        print(f"Explanation: {ex.get('explanation', 'N/A')[:120]}")
        print("\nBuggy code (first 200 chars):")
        print(ex.get('buggy_code', '')[:200])
        print("\nError (first 200 chars):")
        print(ex.get('error_message', '')[:200])
        print("\nFixed code (first 200 chars):")
        print(ex.get('fixed_code', '')[:200])

In [None]:
#@title ### 2.3 Stats by Repository
#@markdown Shows how many training examples came from each repo.
# Tallies examples by the first path component found in each explanation and
# by error type (test failure vs. compiler error).

import json
from collections import Counter


def _source_prefix(explanation):
    """Return the first path component of the trailing "(path)" in *explanation*.

    Returns 'unknown' when the explanation has no parenthesised part or the
    extracted component is empty, so every example lands in some bucket.
    """
    if '(' in explanation and ')' in explanation:
        file_path = explanation.split('(')[-1].rstrip(')')
        prefix = file_path.split('/')[0]
        if prefix:
            return prefix
    return 'unknown'


jsonl_path = os.path.join(OUTPUT_DIR, "mutations.jsonl")

if not os.path.exists(jsonl_path):
    print("No data yet. Run Step 1 first.")
else:
    repo_counts = Counter()
    type_counts = Counter()

    with open(jsonl_path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines
            ex = json.loads(line)
            # Count every example, so TOTAL equals the number of records.
            repo_counts[_source_prefix(ex.get('explanation', ''))] += 1
            # Count error types
            error_message = ex.get('error_message', '')
            if 'Test failure' in error_message:
                type_counts['caught (test failure)'] += 1
            elif 'Compiler error' in error_message:
                type_counts['unviable (compiler error)'] += 1

    print("Examples by source file prefix:")
    print("=" * 60)
    for repo, count in repo_counts.most_common():
        print(f"  {repo:<30} {count:>5}")
    print(f"  {'TOTAL':<30} {sum(repo_counts.values()):>5}")

    print("\nExamples by type:")
    print("=" * 60)
    for t, count in type_counts.most_common():
        print(f"  {t:<35} {count:>5}")

---
## Step 3: Re-run Failed Repos (Optional)

If some repos failed or timed out, you can re-run them individually.

In [None]:
#@title ### 3.1 Re-run Specific Repos
#@markdown Comma-separated list of repos to re-run (e.g., "tokio-rs/tokio,serde-rs/serde")

retry_repos = ""  #@param {type:"string"}
#@markdown Leave empty to skip

if retry_repos.strip():
    repos = [r.strip() for r in retry_repos.split(",") if r.strip()]
    repos_arg = " ".join(repos)

    # Use a separate output dir to avoid overwriting
    retry_output = os.path.join(OUTPUT_DIR, "retry")
    os.makedirs(retry_output, exist_ok=True)

    print(f"Re-running {len(repos)} repos...")
    print("=" * 60)

    !python scripts/16_generate_mutations.py \
        --repos {repos_arg} \
        --clone_dir {CLONE_DIR} \
        --output_dir {retry_output} \
        --max_mutations_per_repo {max_mutations_per_repo} \
        --timeout_per_mutation {timeout_per_mutation} \
        --jobs {mutation_jobs}

    # Merge retry results into main output
    retry_jsonl = os.path.join(retry_output, "mutations.jsonl")
    main_jsonl = os.path.join(OUTPUT_DIR, "mutations.jsonl")
    if os.path.exists(retry_jsonl):
        with open(retry_jsonl) as f:
            retry_lines = f.readlines()
        with open(main_jsonl, "a") as f:
            f.writelines(retry_lines)
        print(f"\nMerged {len(retry_lines)} retry examples into {main_jsonl}")
else:
    print("No repos to retry. Set retry_repos above.")

---
## Step 4: Verify Drive Backup

In [None]:
#@title ### 4.1 Verify Data on Drive
# Confirms that both expected artifacts exist under OUTPUT_DIR and reports a
# file count for the directory and a size for the regular file.

print("Drive Data Verification:")
print("=" * 60)

expected_artifacts = {
    "mutations.jsonl": os.path.join(OUTPUT_DIR, "mutations.jsonl"),
    "hf_dataset/": os.path.join(OUTPUT_DIR, "hf_dataset"),
}

for name, path in expected_artifacts.items():
    # Guard clause: report a missing artifact and move on.
    if not os.path.exists(path):
        print(f"  \u2717 {name} not found")
        continue
    if os.path.isdir(path):
        entry_count = len(os.listdir(path))
        print(f"  \u2713 {name} ({entry_count} files)")
    else:
        size_mb = os.path.getsize(path) / (1024 * 1024)
        print(f"  \u2713 {name} ({size_mb:.1f} MB)")

print(
    f"\nDrive path: {OUTPUT_DIR}",
    "\nThis data is ready for use by the training notebook.",
    "Set skip_data_generation=True in train_gpt_oss_rust_agent_v2.ipynb",
    "=" * 60,
    sep="\n",
)

---
## Done!

Your mutation training data is saved to Google Drive.

**Next steps:**
- Open `train_gpt_oss_rust_agent_v2.ipynb`
- Set `skip_data_generation = True` in Step 0.3
- The training notebook will use this pre-generated data from Drive

**Output location:**
- JSONL: `/content/drive/MyDrive/gpt-oss-20b-rust-agent-v2/data/rust/mutations/mutations.jsonl`
- HF Dataset: `/content/drive/MyDrive/gpt-oss-20b-rust-agent-v2/data/rust/mutations/hf_dataset/`