# **Phase 2: The Seed Gatekeeper**

**Goal:** Aggregate, validate, and triage GLSL shaders to create a "Gold Standard" dataset for training.

### **Core Concepts**
* No shader enters the training set **unless it compiles and renders a valid image** in a headless environment.
* **Content Hashing:** Shaders are tracked by the **SHA-256 hash** of their code (computed after all whitespace is stripped), not by their filename. This prevents duplicates and ensures that if you ban a specific code block, it stays banned forever, even if it reappears under a new name or with different formatting.
* **Tri-State Lifecycle:**
    1.  **Verified (`verified_seeds.jsonl`):** Valid code ready for synthetic augmentation.
    2.  **Quarantine (`quarantine.jsonl`):** Broken code waiting for manual repair.
    3.  **Blocklist (`blocklist.json`):** Permanently banned code hashes (e.g. black screens, crashes).

### **The Workflow**
1.  **Run Gatekeeper (Cell 4):** It ingests new candidates from "The Stack" (Phase 1) and your `manual_injections` folder. It tries to render them.
2.  **Inspect Failures (Cell 5):** Use the inspector tool to see code in **Quarantine**.
3.  **Fix or Ban:**
    * **To FIX:** Copy the code, fix the error locally, save as `.glsl`, and drop it into `dataset/manual_injections/`.
    * **To BAN:** Copy the ID (or the content hash) and run `add_to_blocklist("ID")` (Cell 6).
4.  **Re-Run:** The Gatekeeper automatically promotes your fixes and deletes banned items from Quarantine.

In [None]:
# @title 1. Environment Setup
!apt-get update -qq
!apt-get install -y -qq libegl1-mesa libgl1-mesa-dri libxcb-xfixes0-dev mesa-vulkan-drivers
!pip install -q moderngl numpy pillow datasets

import os
import json
import hashlib
import shutil
import moderngl
import numpy as np
from PIL import Image
from google.colab import drive

# Mount Drive
# Google Drive is mounted at /content/drive; the mount call is skipped when
# that path already exists, so re-running this cell does not mount again.
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

print("[INFO] Environment Ready.")

In [None]:
# @title 2. Configuration
# --- PATHS ---
# Root folder on the mounted Drive; every input and output path below is
# derived from it.
BASE_DIR = "/content/drive/MyDrive/projects/EarthShader/dataset"

# Inputs
# JSONL archive of candidate shaders (one JSON object per line).
STACK_SOURCE = os.path.join(BASE_DIR, "thestack/shaders_archive.jsonl")
MANUAL_SOURCE_DIR = os.path.join(BASE_DIR, "manual_injections") # Drop .glsl files here
# JSON file holding the banned content hashes.
BLOCKLIST_FILE = os.path.join(BASE_DIR, "blocklist.json")

# Outputs
# Both files are JSONL and are rewritten in full by the gatekeeper run.
VERIFIED_FILE = os.path.join(BASE_DIR, "verified_seeds.jsonl")
QUARANTINE_FILE = os.path.join(BASE_DIR, "quarantine.jsonl")

# --- RENDERING SETTINGS ---
WIDTH, HEIGHT = 512, 512  # Resolution for validation check
RENDER_TIME = 1.0         # Render at t=1.0 to check animation

# GLSL Boilerplate (Wrapper)
# HEADER is placed before the candidate code: it declares the uniforms
# iResolution, iTime and iMouse and the fragment output fragColor.
HEADER = """
#version 330
uniform vec3      iResolution;
uniform float     iTime;
uniform vec4      iMouse;
out vec4 fragColor;
"""

# FOOTER is placed after the candidate code: it supplies main(), which
# forwards to mainImage(fragColor, gl_FragCoord.xy). The candidate code must
# therefore define a compatible mainImage function.
FOOTER = """
void main() {
    mainImage(fragColor, gl_FragCoord.xy);
}
"""

# Pass-through vertex shader: takes 2D positions via `in_vert` and emits them
# unchanged as clip-space coordinates.
VERTEX_SHADER = """
#version 330
in vec2 in_vert;
void main() {
    gl_Position = vec4(in_vert, 0.0, 1.0);
}
"""

# Ensure directories exist
# Create the manual-injection folder and the output folder when they are absent.
for path in (MANUAL_SOURCE_DIR, os.path.dirname(VERIFIED_FILE)):
    if os.path.exists(path):
        continue
    os.makedirs(path, exist_ok=True)

In [None]:
# @title 3. Helper Functions

def get_content_hash(code):
    """Return the hex SHA-256 fingerprint of ``code`` with all whitespace removed.

    Sources that differ only in spacing, indentation or line breaks share
    the same fingerprint, which is used as the shader's ID.
    """
    digest = hashlib.sha256()
    # Feeding the whitespace-delimited tokens one after another hashes the
    # same byte stream as encoding the tokens joined with no separator.
    for token in code.split():
        digest.update(token.encode('utf-8'))
    return digest.hexdigest()

def create_headless_context():
    """Creates a standalone (windowless) ModernGL context.

    The EGL backend is requested first; if that attempt raises, the call is
    retried with ModernGL's default backend.

    Returns:
        The context returned by ``moderngl.create_context``.

    Raises:
        Whatever the fallback ``moderngl.create_context(standalone=True)``
        call raises when no context can be created at all.
    """
    try:
        return moderngl.create_context(standalone=True, backend='egl')
    except Exception:
        # Only genuine errors trigger the fallback; KeyboardInterrupt and
        # SystemExit are no longer swallowed as they were by a bare `except:`.
        return moderngl.create_context(standalone=True)

def render_shader(ctx, user_code):
    """
    Attempts to compile the shader and render one frame into an off-screen buffer.

    The candidate code is wrapped between HEADER and FOOTER, drawn on a
    full-screen quad at WIDTH x HEIGHT with iTime = RENDER_TIME, and the
    resulting RGB pixels are inspected.

    Args:
        ctx: ModernGL context used to create the program, buffers and framebuffer.
        user_code: GLSL source that must provide the ``mainImage`` function
            called by FOOTER.

    Returns: (Success (bool), Message (str), Entropy (float))
        * ``(True, "Success", 1.0)`` when at least one pixel byte is non-zero.
        * ``(False, "Rendered purely black image", 0.0)`` when every byte is 0.
        * ``(False, <exception text>, 0.0)`` when compiling or rendering raises.
        The entropy value is a placeholder (1.0 or 0.0); no entropy is computed.
    """
    # Track every GPU object so the `finally` clause can free whichever ones
    # were created, on all exit paths (success, black image, or exception).
    prog = fbo = vbo = vao = None
    try:
        # Assemble full shader
        full_source = f"{HEADER}\n{user_code}\n{FOOTER}"

        # Compile
        prog = ctx.program(
            vertex_shader=VERTEX_SHADER,
            fragment_shader=full_source
        )

        # Set Uniforms (each is only assigned when the membership test on the
        # program finds it)
        if 'iResolution' in prog: prog['iResolution'].value = (WIDTH, HEIGHT, 1.0)
        if 'iTime' in prog: prog['iTime'].value = RENDER_TIME
        if 'iMouse' in prog: prog['iMouse'].value = (0.0, 0.0, 0.0, 0.0)

        # Render to FBO
        fbo = ctx.simple_framebuffer((WIDTH, HEIGHT))
        fbo.use()
        fbo.clear(0.0, 0.0, 0.0, 1.0)

        # Full screen quad (four corners, drawn as a triangle strip)
        vertices = np.array([-1.0, -1.0, 1.0, -1.0, -1.0, 1.0, 1.0, 1.0], dtype='f4')
        vbo = ctx.buffer(vertices)
        vao = ctx.simple_vertex_array(prog, vbo, 'in_vert')
        vao.render(moderngl.TRIANGLE_STRIP)

        # Read pixels to check output
        raw = fbo.read(components=3)
        img = np.frombuffer(raw, dtype=np.uint8)

        # Black-screen check: reject the shader when no byte is non-zero
        if not img.any():
            return False, "Rendered purely black image", 0.0

        return True, "Success", 1.0 # (placeholder entropy)

    except Exception as e:
        return False, str(e), 0.0

    finally:
        # Cleanup is best-effort: a failing release must not change the
        # verdict already decided above.
        for resource in (vao, vbo, fbo, prog):
            if resource is None:
                continue
            try:
                resource.release()
            except Exception:
                pass

def load_jsonl(path):
    """Loads a JSONL file into a dict keyed by the content hash of each entry's code.

    Args:
        path: Location of the JSONL file. A missing file yields an empty dict.

    Returns:
        dict mapping ``get_content_hash(entry['code'])`` to the parsed entry.
        The hash is always recomputed from the code. When several lines share
        a hash, the last one wins.

    Lines that are not valid JSON, are not JSON objects, or whose ``code``
    field is not a string are skipped.
    """
    data = {}
    if not os.path.exists(path):
        return data

    with open(path, 'r') as f:
        for line in f:
            try:
                entry = json.loads(line)
                if not isinstance(entry, dict):
                    continue
                code = entry.get('code', '')
                if not isinstance(code, str):
                    continue
                code_hash = get_content_hash(code)
            except ValueError:
                # Covers malformed JSON (JSONDecodeError) and code that cannot
                # be UTF-8 encoded for hashing (UnicodeEncodeError).
                continue
            data[code_hash] = entry
    return data

def load_blocklist():
    """Loads the set of banned content hashes from BLOCKLIST_FILE.

    Returns:
        set built from the JSON content of the file, or an empty set when the
        file does not exist or cannot be read/parsed.

    An unreadable blocklist is reported instead of being ignored silently:
    callers that later rewrite the file from the returned set would otherwise
    erase the existing bans without any hint.
    """
    if not os.path.exists(BLOCKLIST_FILE):
        return set()
    try:
        with open(BLOCKLIST_FILE, 'r') as f:
            return set(json.load(f))
    except (OSError, ValueError, TypeError) as e:
        # ValueError: malformed JSON; TypeError: content is not an iterable of
        # hashable values; OSError: the file could not be read.
        print(f"[!] Could not read blocklist '{BLOCKLIST_FILE}': {e}. Treating it as empty.")
        return set()

In [None]:
# @title 4. Run Gatekeeper (With Error Histogram)
from collections import Counter

def _gather_candidates():
    """Collects candidate dicts from STACK_SOURCE and from the .glsl files in MANUAL_SOURCE_DIR."""
    candidates = []

    # A. From stack (lines that are not valid JSON objects are skipped)
    if os.path.exists(STACK_SOURCE):
        with open(STACK_SOURCE, 'r') as f:
            for line in f:
                try:
                    entry = json.loads(line)
                except ValueError:
                    continue
                if isinstance(entry, dict):
                    candidates.append(entry)

    # B. From manual
    if os.path.exists(MANUAL_SOURCE_DIR):
        for fname in os.listdir(MANUAL_SOURCE_DIR):
            if fname.endswith(".glsl"):
                path = os.path.join(MANUAL_SOURCE_DIR, fname)
                with open(path, 'r') as f:
                    candidates.append({
                        "id": fname,
                        "source": "manual_injection",
                        "license": "manual",
                        "code": f.read()
                    })

    return candidates


def _write_jsonl(path, entries):
    """Writes ``entries`` to ``path``, one JSON document per line."""
    # Serialise first so a failure cannot leave a truncated file behind.
    lines = [json.dumps(entry) + "\n" for entry in entries]
    with open(path, 'w') as f:
        f.writelines(lines)


def run_gatekeeper():
    """Validates all candidates and rewrites the verified and quarantine files.

    Steps:
        1. Load the verified set, the quarantine and the blocklist, and drop
           every blocklisted hash from both stored sets.
        2. Gather candidates from the stack archive and the manual folder.
        3. Render each candidate that is neither blocklisted nor already
           verified; successes are promoted to verified (and removed from
           quarantine), failures are stored in quarantine with their error.
        4. Rewrite VERIFIED_FILE and QUARANTINE_FILE.
        5. Print a histogram of the most common failure messages.
    """
    print("[*] Loading State...")
    verified_db = load_jsonl(VERIFIED_FILE)
    quarantine_db = load_jsonl(QUARANTINE_FILE)
    blocklist = load_blocklist()

    # A ban applies to stored state as well, not only to incoming candidates.
    for banned_hash in blocklist:
        verified_db.pop(banned_hash, None)
        quarantine_db.pop(banned_hash, None)

    print(f"    - Verified:   {len(verified_db)}")
    print(f"    - Quarantine: {len(quarantine_db)}")
    print(f"    - Blocklist:  {len(blocklist)}")

    new_verified = 0
    new_quarantine = 0
    error_counts = Counter() # Track error types

    # 1. Setup render context
    ctx = create_headless_context()
    try:
        # 2. Gather candidates
        candidates = _gather_candidates()
        print(f"[*] Processing {len(candidates)} candidates...")

        # 3. Process loop
        for item in candidates:
            code = item.get('code', '')
            if not code or not isinstance(code, str):
                continue

            code_hash = get_content_hash(code)

            # FILTER: Blocklist (stored state was already purged above)
            if code_hash in blocklist:
                continue

            # FILTER: Already verified
            if code_hash in verified_db:
                continue

            # Short display label; falls back to the hash so a candidate
            # without an 'id' cannot abort the run before state is saved.
            label = str(item.get('id', code_hash))[:10]

            # VALIDATE: Render test
            success, message, entropy = render_shader(ctx, code)

            if success:
                # Promotion!
                verified_db[code_hash] = item
                quarantine_db.pop(code_hash, None)
                new_verified += 1
                print(f"   [+] Verified: {label}...", end="\r")
            else:
                # Demotion
                item['error'] = message
                quarantine_db[code_hash] = item
                new_quarantine += 1

                # Summarize error (First line only) for histogram
                error_summary = str(message).split('\n')[0][:80]
                error_counts[error_summary] += 1

                print(f"   [-] Quarantine: {label}... ({error_summary[:15]}...)", end="\r")
    finally:
        # Free the GL context even when processing is interrupted.
        ctx.release()

    # 4. Save state
    print(f"\n[*] Saving Updates...")

    _write_jsonl(VERIFIED_FILE, verified_db.values())
    _write_jsonl(QUARANTINE_FILE, quarantine_db.values())

    print(f"\n[DONE] Gatekeeper Run Complete.")
    print(f"   New Verified:   {new_verified}")
    print(f"   New Quarantine: {new_quarantine}")
    print(f"   Total Verified: {len(verified_db)}")

    # 5. Print error analysis
    if error_counts:
        print("\n" + "="*60)
        print(" TOP QUARANTINE CAUSES (Use this to improve filters)")
        print("="*60)
        print(f"{'COUNT':<6} | {'ERROR MESSAGE (First Line)'}")
        print("-" * 60)
        for error, count in error_counts.most_common(10):
             print(f"{count:<6} | {error}")
        print("="*60)

# Start a gatekeeper run when this code executes as the main module
# (which is the case when the cell is run in a notebook).
if __name__ == "__main__":
    run_gatekeeper()

In [None]:
# @title 5. Inspector: View Quarantine
import html
from IPython.display import display, HTML

def inspect_quarantine(index=0):
    """Displays one quarantined shader: its source ID, content hash, error and code.

    Args:
        index: Zero-based position of the entry in QUARANTINE_FILE. Negative
            values count from the end; any position outside the file falls
            back to the first entry.

    The code is shown in an HTML textarea for easy copying, followed by a
    ready-to-run ``add_to_blocklist(...)`` line. That line uses the content
    hash rather than the source ID, because the ID may be missing or shared
    by several entries while the hash identifies exactly the code shown.
    """
    if not os.path.exists(QUARANTINE_FILE):
        print("[!] Quarantine is empty.")
        return

    with open(QUARANTINE_FILE, 'r') as f:
        lines = [line for line in f if line.strip()]

    if not lines:
        print("[!] Quarantine is empty.")
        return

    if index < 0: index = len(lines) + index
    # Anything still out of range (too large, or too negative) shows entry 0.
    if index < 0 or index >= len(lines): index = 0

    try:
        item = json.loads(lines[index])
    except ValueError as e:
        print(f"[!] Quarantine entry #{index + 1} is not valid JSON: {e}")
        return

    code = item.get('code', '') if isinstance(item, dict) else None
    if not isinstance(code, str):
        print(f"[!] Quarantine entry #{index + 1} is not a shader record.")
        return

    # Calculate hash on the fly so you can see it
    code_hash = get_content_hash(code)

    print(f"\nQUARANTINE ITEM #{index + 1} / {len(lines)}")
    print(f"Source ID:   {item.get('id')}  <-- (Git Commit / Filename)")
    print(f"Fingerprint: {code_hash}  <-- (Content Hash)")
    print(f"Error:       {item.get('error')}")
    print("-" * 80)

    # Copy-friendly box (code is HTML-escaped before being embedded)
    safe_code = html.escape(code)
    display(HTML(f"""
        <textarea rows="15" style="width:100%; font-family:monospace; font-size:11px;">
{safe_code}
</textarea>
        """))
    print(f"To Ban:\nadd_to_blocklist('{code_hash}')")

# View first item
# Runs whenever this cell is executed; pass another index to see other
# entries (negative values count from the end).
inspect_quarantine(0)

In [None]:
# @title 6. Blocklist Tool (Ban & Delete)
def add_to_blocklist(target_id_or_hash):
    """
    Bans a shader. You can pass the Source ID or the Hash.

    Resolution order:
        1. A content hash that is present in Quarantine (unambiguous).
        2. The first Quarantine entry whose 'id' equals the argument.
        3. A 64-character hexadecimal string that is not in Quarantine; it is
           banned as a raw hash.
    Anything else is reported as not found and nothing is changed.

    On success the hash is added to BLOCKLIST_FILE (if not already there) and
    the matching entry, if any, is removed from QUARANTINE_FILE.
    """
    blocklist = load_blocklist()
    quarantine_db = load_jsonl(QUARANTINE_FILE)

    target_hash = None
    found_id = "Unknown"

    # Content hashes are lower-case hex digests; normalise the input so a
    # pasted upper-case or padded hash still matches.
    as_hash = None
    if isinstance(target_id_or_hash, str):
        as_hash = target_id_or_hash.strip().lower()

    # 1. Try finding by Hash (Direct)
    if as_hash in quarantine_db:
        target_hash = as_hash
        found_id = quarantine_db[target_hash].get('id')

    # 2. Try finding by ID
    if target_hash is None:
        for h, item in quarantine_db.items():
            if item.get('id') == target_id_or_hash:
                target_hash = h
                found_id = target_id_or_hash
                break

    if target_hash is None:
        # Fallback: a raw hash that isn't in quarantine. Require real hex so
        # an arbitrary 64-character string is not stored as a ban.
        is_raw_hash = (
            as_hash is not None
            and len(as_hash) == 64
            and all(c in "0123456789abcdef" for c in as_hash)
        )
        if is_raw_hash:
            target_hash = as_hash
        else:
            print(f"[!] Could not find '{target_id_or_hash}' in Quarantine.")
            return

    # EXECUTE BAN
    if target_hash not in blocklist:
        blocklist.add(target_hash)
        # Serialise before opening the file so a failure cannot truncate the
        # existing blocklist; sorting keeps the file content stable.
        payload = json.dumps(sorted(blocklist, key=str))
        with open(BLOCKLIST_FILE, 'w') as f:
            f.write(payload)

        print(f"[-] BANNED:")
        print(f"    ID:   {found_id}")
        print(f"    Hash: {target_hash}")
    else:
        print(f"[*] Hash already blocked.")

    # CLEANUP
    if target_hash in quarantine_db:
        del quarantine_db[target_hash]
        remaining = [json.dumps(v) + "\n" for v in quarantine_db.values()]
        with open(QUARANTINE_FILE, 'w') as f:
            f.writelines(remaining)
        print(f"    (Removed from Quarantine)")

# Usage Example:
# add_to_blocklist("5de2b0...")