In [None]:
# @title 1. Environment Setup
# Refresh the apt package index quietly before installing system packages.
!apt-get update -qq
# Install drivers so shader_utils can import without crashing
# (Mesa EGL/GL/Vulkan packages plus the xcb-xfixes development headers).
!apt-get install -y -qq libegl1-mesa libgl1-mesa-dri libxcb-xfixes0-dev mesa-vulkan-drivers
# Python packages installed for this notebook. NOTE(review): versions are not
# pinned, so results may differ between runtimes.
!pip install -q moderngl numpy pillow

import os
import json
import re
import random
import shutil
from google.colab import drive

# Mount Google Drive only when the mount point does not exist yet, so re-running
# this cell does not trigger another mount call.
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

In [None]:
# @title 2. Setup Shared Library
import sys
import os
import importlib

# Path to your shared library
LIB_DIR = "/content/drive/MyDrive/projects/EarthShader/lib"

# Make the Drive-hosted library importable; guard against duplicate entries
# when the cell is re-run.
if LIB_DIR not in sys.path:
    sys.path.append(LIB_DIR)

try:
    import shader_utils
    # Reload so edits made to the file on Drive are picked up without
    # restarting the kernel.
    importlib.reload(shader_utils)
    print(f"[INFO] Shared library loaded from: {LIB_DIR}")
except ImportError as exc:
    # ImportError is also raised when shader_utils.py exists but one of ITS
    # imports fails. Only blame the folder structure when the module that
    # could not be found is shader_utils itself; otherwise surface the cause.
    if isinstance(exc, ModuleNotFoundError) and exc.name == "shader_utils":
        print(f"[!] ERROR: Could not find 'shader_utils.py' in {LIB_DIR}")
        print("    Please double-check your Drive folder structure.")
    else:
        print(f"[!] ERROR: 'shader_utils' was found but failed to import: {exc}")
        print("    Please check that its dependencies installed correctly (Cell 1).")

In [None]:
# @title 3. Configuration
import os

# --- PATHS ---
# All dataset files live under BASE_DIR on the mounted Drive.
BASE_DIR = "/content/drive/MyDrive/projects/EarthShader/dataset"
# Seed shaders, one JSON object per line (the fuzzer reads 'id' and 'code').
INPUT_FILE = os.path.join(BASE_DIR, "verified_seeds.jsonl")
# Generated variants are appended here, one JSON object per line.
OUTPUT_FILE = os.path.join(BASE_DIR, "thestack/synthetic_dataset.jsonl")

# --- FUZZER SETTINGS ---
# You will get UP TO (VARIATIONS_PER_SEED * 2) outputs per seed.
# Example: 5 => up to 5 Float Fuzzes + up to 5 Time Fuzzes = at most 10 Total.
# Fewer are written when a float variant hashes the same as the seed, or when
# the seed does not use iTime (no time variants are produced for it).
VARIATIONS_PER_SEED = 5

# 1. Float Mutation Settings
MUTATION_STRENGTH = 0.10    # +/- 10% change
MUTATION_PROBABILITY = 0.5  # Modify 50% of found numbers

# 2. Time Mutation Settings
# Bounds passed to random.uniform() when picking the constant timestamp that
# replaces iTime in a time-snapshot variant.
TIME_SCALE_MIN = 1.0
TIME_SCALE_MAX = 10.0

# Execution Limit (None = process all)
# Maximum number of seeds processed in one run.
LIMIT = None
# WARNING: True deletes any existing OUTPUT_FILE and regenerates everything.
# False resumes: seeds that already have a variant in OUTPUT_FILE are skipped.
FORCE_REGENERATE = True

In [None]:
# @title 4. Run Dual Deterministic Fuzzer (Static Time)
import re
import random
import json
import os
import uuid

# --- HELPER: FLOAT FUZZING ---
def fuzz_float_callback(match, strength=None, probability=None):
    """Return a randomly perturbed replacement for one matched float literal.

    Designed to be passed as the ``repl`` callable of ``re.sub``; in that use
    only ``match`` is supplied and the module-level settings apply.

    Args:
        match: Regex match whose ``group(0)`` is the literal text
            (e.g. ``"10."``, ``".5"``, ``"0.25"``).
        strength: Maximum relative change. The value is multiplied by a factor
            drawn uniformly from ``[1 - strength, 1 + strength]``. Defaults to
            ``MUTATION_STRENGTH``. Values >= 1 can produce negative literals,
            which the surrounding shader text may not tolerate.
        probability: Chance that the literal is considered for mutation.
            Defaults to ``MUTATION_PROBABILITY``.

    Returns:
        str: The original text when the literal is skipped, otherwise the
        scaled value formatted as a float literal that always contains a
        decimal point (e.g. ``"10."`` or ``"0.5"``).
    """
    original_str = match.group(0)
    if strength is None:
        strength = MUTATION_STRENGTH
    if probability is None:
        probability = MUTATION_PROBABILITY

    # Optimization: randomly skip some numbers to keep anchors
    if random.random() > probability:
        return original_str

    try:
        val = float(original_str)
    except ValueError:
        return original_str

    # Scaling zero is a no-op. Re-emitting it would only respell the literal
    # ("0.0" -> "0."), changing the content hash without changing the shader.
    if val == 0.0:
        return original_str

    # 1.0 is usually a structural constant: mutate it only 20% of the time.
    if val == 1.0 and random.random() > 0.2:
        return original_str

    # Mutation: scale by +/- strength
    factor = 1.0 + random.uniform(-strength, strength)
    new_val = val * factor

    # Keep at least two more fractional digits than the source literal had
    # (never fewer than 5), so small constants such as 0.000001 are not
    # rounded down to "0." by a fixed-width format.
    frac_digits = len(original_str.partition('.')[2])
    precision = max(5, frac_digits + 2)
    cleaned = f"{new_val:.{precision}f}".rstrip('0')

    # Fixed-point output always contains '.', so a missing dot means the value
    # was not finite (inf/nan); keep the original rather than emit junk.
    if '.' not in cleaned:
        return original_str
    return cleaned

def generate_float_fuzzed_code(code):
    """Strategy A: Multiplicative Float Fuzzing.

    Cleans the shader with ``shader_utils.clean_shader_code`` and then passes
    every float literal in the result through ``fuzz_float_callback``.

    Args:
        code: Raw shader source text.

    Returns:
        str: The cleaned source with some float literals perturbed.
    """
    # Three literal shapes are recognised: "0.2", ".5" and "10.". The
    # lookbehind rejects matches that start right after a word character or a
    # dot, so digits embedded in identifiers are left alone.
    float_literal = re.compile(
        r'(?<![\w\.])\d+\.\d+|(?<![\w\.])\.\d+|(?<![\w\.])\d+\.'
    )
    normalized = shader_utils.clean_shader_code(code)
    return float_literal.sub(fuzz_float_callback, normalized)

# --- HELPER: TIME FUZZING ---
def generate_time_fuzzed_code(code):
    """
    Strategy B: Static Time Snapshot
    Replaces 'iTime' with a custom macro set to a random constant timestamp.
    Example: #define time_a1b2c3 (4.235)

    Args:
        code: Raw shader source text.

    Returns:
        str | None: The cleaned source with every standalone ``iTime`` token
        replaced by a uniquely named macro, preceded by that macro's
        ``#define`` line. ``None`` when the cleaned source contains no
        standalone ``iTime`` token.
    """
    clean_code = shader_utils.clean_shader_code(code)

    # 1. Generate a unique macro name, e.g. "time_a1b2c3"
    random_suffix = uuid.uuid4().hex[:6]
    var_name = f"time_{random_suffix}"

    # 2. Replace iTime in the body.
    # \b boundaries avoid touching longer identifiers that merely contain the
    # text (e.g. 'iTimeDelta' or 'my_iTime_variable'). The replacement count is
    # the usage check: a plain substring test would accept such identifiers
    # and yield a "variant" that differs only by an unused #define.
    modified_body, replacements = re.subn(r'\biTime\b', var_name, clean_code)
    if replacements == 0:
        return None

    # 3. Pick a random timestamp between MIN and MAX (e.g. 1.0s to 10.0s)
    random_timestamp = random.uniform(TIME_SCALE_MIN, TIME_SCALE_MAX)

    # 4. Create the macro definition as a STATIC constant. It is built after
    # the substitution so the define line itself is never rewritten.
    define_line = f"#define {var_name} ({random_timestamp:.3f})"

    # 5. Combine: put the define at the very top
    return f"{define_line}\n{modified_body}"

# --- MAIN PIPELINE ---
def sync_datasets():
    """Work out which seeds can be skipped, optionally resetting the output.

    When ``FORCE_REGENERATE`` is true the existing ``OUTPUT_FILE`` (if any) is
    deleted and an empty set is returned, so every seed is processed again.
    Otherwise the output file is scanned and the ids of seeds that already
    have at least one variant are collected.

    Returns:
        set: ``parent_id`` values found in ``OUTPUT_FILE`` that also appear as
        an ``id`` in ``INPUT_FILE``.
    """
    if FORCE_REGENERATE:
        if os.path.exists(OUTPUT_FILE):
            os.remove(OUTPUT_FILE)
        return set()

    # Ids of all seeds currently present in the input file.
    valid_parents = set()
    if os.path.exists(INPUT_FILE):
        with open(INPUT_FILE, 'r') as f:
            for line in f:
                try:
                    valid_parents.add(json.loads(line)['id'])
                except (ValueError, KeyError, TypeError):
                    # Blank/malformed JSON, a record without 'id', or a value
                    # that is not an object / not hashable: skip the line.
                    continue

    completed_parents = set()
    if os.path.exists(OUTPUT_FILE):
        with open(OUTPUT_FILE, 'r') as f:
            for line in f:
                try:
                    data = json.loads(line)
                    # We check if this parent has ANY variations.
                    pid = data.get('parent_id')
                    if pid in valid_parents:
                        completed_parents.add(pid)
                except (ValueError, AttributeError, TypeError):
                    # Malformed JSON, a non-object record, or an unhashable
                    # parent_id: skip the line.
                    continue
    return completed_parents

def _variant_record(parent_id, kind, index, strategy, code, code_hash):
    """Build the JSONL record for one generated variant of a seed shader."""
    return {
        "id": f"{parent_id}_{kind}_{index}",
        "hash": code_hash,
        "parent_id": parent_id,
        "source": "regex_fuzzer",
        "license": "synthetic",
        "strategy": strategy,
        "code": code,
    }


def run_fuzzer():
    """Generate float-fuzzed and time-snapshot variants for every seed.

    Reads ``INPUT_FILE`` line by line, skips seeds reported as done by
    ``sync_datasets()``, and appends up to ``VARIATIONS_PER_SEED`` float
    variants plus up to ``VARIATIONS_PER_SEED`` time variants per seed to
    ``OUTPUT_FILE``. Stops early once ``LIMIT`` seeds have been processed.
    Progress and a final summary are printed; nothing is returned.
    """
    if not os.path.exists(INPUT_FILE):
        print(f"[ERROR] Input file not found: {INPUT_FILE}")
        return

    completed_ids = sync_datasets()
    print(f"[INFO] Synced. {len(completed_ids)} seeds already partially processed.")

    # open(..., "a") creates the file but not missing parent directories.
    output_dir = os.path.dirname(OUTPUT_FILE)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    total_generated = 0
    processed_seeds = 0

    with open(INPUT_FILE, "r") as f_in, open(OUTPUT_FILE, "a") as f_out:
        for line in f_in:
            if not line.strip():
                continue

            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                continue
            # A valid JSON line that is not an object has no 'id'/'code'.
            if not isinstance(data, dict):
                continue

            original_code = data.get('code')
            original_id = data.get('id')

            if original_id in completed_ids:
                continue
            if not original_code:
                continue

            processed_seeds += 1
            if processed_seeds % 10 == 0:
                print(f"[INFO] Processing Seed {processed_seeds}...", end="\r")

            orig_hash = shader_utils.get_content_hash(shader_utils.clean_shader_code(original_code))

            # --- 1. FLOAT FUZZING LOOP ---
            # Track hashes already emitted for this seed (starting with the
            # seed itself) so unchanged or repeated variants are not written.
            seen_hashes = {orig_hash}
            for i in range(VARIATIONS_PER_SEED):
                new_code = generate_float_fuzzed_code(original_code)
                new_hash = shader_utils.get_content_hash(new_code)

                if new_hash in seen_hashes:
                    continue
                seen_hashes.add(new_hash)

                record = _variant_record(
                    original_id, "float", i + 1,
                    f"float_mutation_{MUTATION_STRENGTH}", new_code, new_hash,
                )
                f_out.write(json.dumps(record) + "\n")
                total_generated += 1

            # --- 2. TIME FUZZING LOOP (SNAPSHOTS) ---
            for i in range(VARIATIONS_PER_SEED):
                new_code = generate_time_fuzzed_code(original_code)

                # None means the seed does not use iTime; later attempts would
                # fail the same way, so stop trying for this seed.
                if not new_code:
                    break

                new_hash = shader_utils.get_content_hash(new_code)
                record = _variant_record(
                    original_id, "time", i + 1, "time_snapshot", new_code, new_hash,
                )
                f_out.write(json.dumps(record) + "\n")
                total_generated += 1

            # Flush once per seed so an interrupted run loses at most one seed.
            f_out.flush()

            if LIMIT and processed_seeds >= LIMIT:
                print("\n[INFO] Limit reached.")
                break

    print("\n[INFO] Complete.")
    print(f"   Seeds Processed: {processed_seeds}")
    print(f"   Variants Generated: {total_generated}")

# Entry point: run the full fuzzing pipeline only when this code is executed
# directly, not when it is imported from elsewhere.
if __name__ == "__main__":
    run_fuzzer()

In [None]:
# @title 5. Inspect Random Pair (Visual + Ban Tools)
import json
import random
import os
import html
import base64
from io import BytesIO
from PIL import Image
from IPython.display import display, HTML

def image_to_base64(img):
    """Encode an image as a PNG ``data:`` URI usable in an ``<img src>``.

    Args:
        img: Object exposing ``save(fp, format=...)`` (a PIL image here).

    Returns:
        str: ``"data:image/png;base64,<payload>"`` holding the PNG bytes.
    """
    with BytesIO() as png_buffer:
        img.save(png_buffer, format="PNG")
        encoded = base64.b64encode(png_buffer.getvalue())
    return "data:image/png;base64," + encoded.decode("utf-8")

def inspect_visual_pair():
    """Show a random synthetic variant side by side with its parent seed.

    Picks one record at random from ``OUTPUT_FILE``, looks up its parent in
    ``INPUT_FILE``, renders both shaders through ``shader_utils.render_image``
    and displays the images, the source of each, and (when the parent was
    found) a ready-to-copy ``add_to_blocklist(...)`` command for the parent's
    content hash. Prints a message and returns when there is no data.
    """
    if not os.path.exists(OUTPUT_FILE):
        print(f"[!] No synthetic data found at: {OUTPUT_FILE}")
        return

    # 1. Pick Random Variation (blank lines are not valid records)
    with open(OUTPUT_FILE, "r") as f:
        lines = [ln for ln in f if ln.strip()]
    if not lines:
        print(f"[!] No synthetic data found at: {OUTPUT_FILE}")
        return
    syn_data = json.loads(random.choice(lines))

    # 2. Find Parent
    parent_id = syn_data.get("parent_id")
    original_code = "// Original code not found in verified_seeds.jsonl"
    parent_found = False

    if os.path.exists(INPUT_FILE):
        with open(INPUT_FILE, "r") as f:
            for line in f:
                try:
                    d = json.loads(line)
                    if d.get("id") == parent_id:
                        original_code = d.get("code", "")
                        parent_found = True
                        break
                except (ValueError, AttributeError):
                    # Malformed JSON or a non-object record: skip the line.
                    continue

    # 3. Calculate Parent Hash (Crucial for Banning)
    # Only offer a ban command for a real parent; hashing the placeholder
    # comment would produce a hash that matches no seed.
    if parent_found:
        clean_parent_code = shader_utils.clean_shader_code(original_code)
        parent_hash = shader_utils.get_content_hash(clean_parent_code)
        ban_box_html = (
            "<strong>BAD SEED? Copy this command to Gatekeeper (Cell 8):</strong><br>"
            f"add_to_blocklist('{html.escape(str(parent_hash))}')"
        )
    else:
        ban_box_html = (
            "<strong>Parent seed not found in the input file; "
            "no blocklist hash available.</strong>"
        )

    # 4. Render Images
    print("Rendering previews...", end="\r")
    img_orig = shader_utils.render_image(original_code, width=512, height=288)
    img_new = shader_utils.render_image(syn_data.get("code", ""), width=512, height=288)

    # A dark-red frame stands in for any shader that did not render.
    if img_orig is None: img_orig = Image.new('RGB', (512, 288), (50, 0, 0))
    if img_new is None: img_new = Image.new('RGB', (512, 288), (50, 0, 0))

    b64_orig = image_to_base64(img_orig)
    b64_new = image_to_base64(img_new)

    # 5. Prepare UI
    # Everything that comes from the dataset files is escaped before being
    # interpolated into the HTML, including the ids.
    strategy = html.escape(syn_data.get('strategy', 'Unknown'))
    orig_code_safe = html.escape(original_code)
    new_code_safe = html.escape(syn_data.get("code", ""))
    parent_id_safe = html.escape(str(parent_id))
    variant_id_safe = html.escape(str(syn_data.get('id')))

    html_ui = f"""
    <style>
        .container {{ display: flex; gap: 20px; font-family: sans-serif; }}
        .column {{ flex: 1; min-width: 0; }}
        .shader-box {{
            width: 100%; height: 300px;
            font-family: monospace; font-size: 11px; white-space: pre;
            background: #f7f7f7; border: 1px solid #ccc;
        }}
        .ban-box {{
            margin-top: 15px; padding: 10px;
            background: #ffebee; border: 1px solid #ef9a9a; color: #c62828;
            font-family: monospace; font-size: 12px;
        }}
    </style>

    <div style="background: #e8f0fe; padding: 10px; margin-bottom: 20px; font-family: sans-serif;">
        <strong>STRATEGY:</strong> {strategy}
    </div>

    <div class="container">
        <div class="column">
            <h3>ORIGINAL</h3>
            <div style="font-size:11px; color:#555; margin-bottom:5px;">ID: {parent_id_safe}</div>
            <img style="width:100%" src="{b64_orig}">
            <textarea class="shader-box" readonly>{orig_code_safe}</textarea>

            <div class="ban-box">
                {ban_box_html}
            </div>
        </div>

        <div class="column">
            <h3>FUZZED VARIATION</h3>
            <div style="font-size:11px; color:#555; margin-bottom:5px;">ID: {variant_id_safe}</div>
            <img style="width:100%" src="{b64_new}">
            <textarea class="shader-box" readonly>{new_code_safe}</textarea>
        </div>
    </div>
    """
    display(HTML(html_ui))

# Re-run this cell to inspect a different randomly chosen variant/parent pair.
inspect_visual_pair()