# **Phase 2: The Seed Gatekeeper**

**Goal:** Aggregate, validate, and triage GLSL shaders to create a "Gold Standard" dataset for training.

### **Core Concepts**
* No shader enters the training set **unless it compiles and renders a valid image** in a headless environment.
* **Content Hashing:** Shaders are tracked by the **SHA-256 Hash** of their code, not their filename. This prevents duplicates and ensures that if you ban a specific code block, it stays banned forever, even if it reappears under a new name.
* **Tri-State Lifecycle:**
    1.  **Verified (`verified_seeds.jsonl`):** Valid code ready for synthetic augmentation.
    2.  **Quarantine (`quarantine.jsonl`):** Broken code waiting for manual repair.
    3.  **Blocklist (`blocklist.json`):** Permanently banned code hashes (e.g. black screens, crashes).

### **The Workflow**
1.  **Run Gatekeeper (Cell 4):** It ingests new candidates from "The Stack" (Phase 1) and your `manual_injections` folder. It tries to render them.
2.  **Inspect Failures (Cell 5):** Use the inspector tool to see code in **Quarantine**.
3.  **Fix or Ban:**
    * **To FIX:** Copy the code, fix the error locally, save as `.glsl`, and drop it into `dataset/manual_injections/`.
    * **To BAN:** Copy the ID and run `add_to_blocklist("ID")` (Cell 6).
4.  **Re-Run:** The Gatekeeper automatically promotes your fixes and deletes banned items from Quarantine.

In [None]:
# @title 1. Environment Setup
!apt-get update -qq
!apt-get install -y -qq libegl1-mesa libgl1-mesa-dri libxcb-xfixes0-dev mesa-vulkan-drivers
!pip install -q moderngl numpy pillow datasets

import os
import json
import hashlib
import shutil
import moderngl
import numpy as np
from PIL import Image
from google.colab import drive

# Mount Drive
# Mounting is skipped when /content/drive already exists, so this cell can
# be executed again without calling drive.mount() a second time.
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

print("[INFO] Environment Ready.")

In [None]:
# @title 2. Configuration
# --- PATHS ---
BASE_DIR = "/content/drive/MyDrive/projects/EarthShader/dataset"

# Inputs
STACK_SOURCE = os.path.join(BASE_DIR, "thestack/shaders_archive.jsonl")
MANUAL_SOURCE_DIR = os.path.join(BASE_DIR, "manual_injections")
BLOCKLIST_FILE = os.path.join(BASE_DIR, "blocklist.json")

# Outputs
VERIFIED_FILE = os.path.join(BASE_DIR, "verified_seeds.jsonl")
QUARANTINE_FILE = os.path.join(BASE_DIR, "quarantine.jsonl")

# --- RENDERING SETTINGS ---
WIDTH, HEIGHT = 512, 512
RENDER_TIME = 1.0

# GLSL Boilerplate (Updated Uniforms)
HEADER = """
#version 330
uniform vec3      iResolution;
uniform float     iTime;
uniform float     iTimeDelta;
uniform float     iFrame;
uniform vec4      iMouse;
uniform vec4      iDate;
out vec4 fragColor;
"""

FOOTER = """
void main() {
    mainImage(fragColor, gl_FragCoord.xy);
}
"""

VERTEX_SHADER = """
#version 330
in vec2 in_vert;
void main() {
    gl_Position = vec4(in_vert, 0.0, 1.0);
}
"""

# Make sure the manual-injection folder and the directory that holds
# VERIFIED_FILE exist before later cells read from or write to them.
for path in [MANUAL_SOURCE_DIR, os.path.dirname(VERIFIED_FILE)]:
    if not os.path.exists(path):
        os.makedirs(path, exist_ok=True)

In [None]:
# @title 3. Helper Functions

def get_content_hash(code):
    """Return the SHA-256 hex digest used as a shader's content ID.

    All whitespace is removed from ``code`` before hashing, so sources that
    differ only in spacing, indentation, or line breaks share the same ID.
    """
    hasher = hashlib.sha256()
    whitespace_free = "".join(code.split())
    hasher.update(whitespace_free.encode('utf-8'))
    return hasher.hexdigest()

def create_headless_context():
    """Create a standalone (windowless) ModernGL context.

    The EGL backend is tried first. If that attempt raises, the call is
    retried with ModernGL's default standalone backend; an error from the
    retry propagates to the caller.
    """
    try:
        return moderngl.create_context(standalone=True, backend='egl')
    except Exception:
        # `except Exception` (not a bare `except`) so that KeyboardInterrupt
        # and SystemExit are not mistaken for "EGL unavailable".
        return moderngl.create_context(standalone=True)

def render_shader(ctx, user_code):
    """Compile ``user_code`` as a fragment shader and render one frame.

    The user code is wrapped between HEADER and FOOTER, drawn onto a
    full-screen quad in an off-screen WIDTH x HEIGHT framebuffer, and the
    RGB pixels are read back.

    Args:
        ctx: ModernGL context used to create every GL object.
        user_code: Shader source expected to define ``mainImage``.

    Returns:
        A ``(success, message, score)`` tuple. ``success`` is False when
        compilation/rendering raises (``message`` is the exception text) or
        when the rendered pixels are essentially uniform. ``score`` is 1.0
        on success and 0.0 otherwise.
    """
    # Tracked outside the try so the finally clause can release whatever
    # was created, whichever way the function exits.
    prog = fbo = vbo = vao = None
    try:
        full_source = f"{HEADER}\n{user_code}\n{FOOTER}"

        prog = ctx.program(
            vertex_shader=VERTEX_SHADER,
            fragment_shader=full_source
        )

        # Dummy values for the uniforms declared in HEADER. A uniform is only
        # set when it is present in the linked program.
        uniform_values = {
            'iResolution': (WIDTH, HEIGHT, 1.0),
            'iTime': RENDER_TIME,
            'iTimeDelta': 1.0 / 60.0,
            'iFrame': 0.0,
            'iMouse': (0.0, 0.0, 0.0, 0.0),
            'iDate': (2024.0, 1.0, 1.0, 0.0),
        }
        for uniform_name, uniform_value in uniform_values.items():
            if uniform_name in prog:
                prog[uniform_name].value = uniform_value

        fbo = ctx.simple_framebuffer((WIDTH, HEIGHT))
        fbo.use()
        fbo.clear(0.0, 0.0, 0.0, 1.0)

        # Four corners of clip space, drawn as a triangle strip.
        vertices = np.array([-1.0, -1.0, 1.0, -1.0, -1.0, 1.0, 1.0, 1.0], dtype='f4')
        vbo = ctx.buffer(vertices)
        vao = ctx.simple_vertex_array(prog, vbo, 'in_vert')
        vao.render(moderngl.TRIANGLE_STRIP)

        raw = fbo.read(components=3)
        img = np.frombuffer(raw, dtype=np.uint8)

        # Near-zero standard deviation means every byte is (almost) the same
        # value, i.e. a single flat colour.
        if np.std(img) < 0.01:
            return False, "Rendered flat/empty image", 0.0

        return True, "Success", 1.0

    except Exception as e:
        return False, str(e), 0.0

    finally:
        # Release on every exit path. Previously the flat-image and exception
        # paths returned without freeing the GL objects created so far.
        for resource in (vao, vbo, fbo, prog):
            if resource is None:
                continue
            try:
                resource.release()
            except Exception:
                # A failed release must not replace the render verdict.
                pass

def load_jsonl(path):
    """Load a JSONL file into a dict keyed by each entry's content hash.

    Args:
        path: Path of the JSONL file. A missing file yields an empty dict.

    Returns:
        ``{content_hash: entry}`` where ``content_hash`` is
        ``get_content_hash(entry['code'])`` (empty string when the key is
        absent). Lines that are not valid JSON, are not JSON objects, or
        whose ``code`` is not a string are skipped. When several lines share
        a hash, the last one wins.
    """
    data = {}
    if not os.path.exists(path):
        return data

    with open(path, 'r') as f:
        for line in f:
            # Only the parse is guarded, so unrelated failures are no longer
            # silently swallowed along with malformed lines.
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                continue
            if not isinstance(entry, dict):
                continue
            code = entry.get('code', '')
            if not isinstance(code, str):
                continue
            data[get_content_hash(code)] = entry
    return data

def load_blocklist():
    """Return the set of banned content hashes stored in BLOCKLIST_FILE.

    Returns an empty set when the file does not exist. If the file exists
    but cannot be read or parsed, a warning is printed and an empty set is
    returned, so callers keep working on a best-effort basis.
    """
    if not os.path.exists(BLOCKLIST_FILE):
        return set()

    try:
        with open(BLOCKLIST_FILE, 'r') as f:
            return set(json.load(f))
    except (OSError, ValueError, TypeError) as err:
        # ValueError covers invalid JSON; TypeError covers JSON that is not
        # an iterable of hashable values. Warn instead of failing silently:
        # callers that later rewrite the blocklist would otherwise discard
        # the unreadable contents with no indication.
        print(f"[!] Could not read blocklist ({err}); treating it as empty.")
        return set()

In [115]:
# @title 4. Run Gatekeeper (Clean Output)
from collections import Counter
import time
import json
import os
import sys

def _write_jsonl(path, db):
    """Overwrite ``path`` with one JSON-encoded line per value of ``db``."""
    with open(path, 'w') as f:
        for entry in db.values():
            f.write(json.dumps(entry) + "\n")


def _gather_candidates():
    """Collect candidate records from STACK_SOURCE, then MANUAL_SOURCE_DIR."""
    candidates = []

    # A. From Stack: one JSON document per line; unparsable lines are skipped.
    if os.path.exists(STACK_SOURCE):
        with open(STACK_SOURCE, 'r') as f:
            for line in f:
                try:
                    candidates.append(json.loads(line))
                except json.JSONDecodeError:
                    continue

    # B. From Manual: every *.glsl file, in sorted filename order.
    if os.path.exists(MANUAL_SOURCE_DIR):
        if not os.listdir(MANUAL_SOURCE_DIR):
            time.sleep(1)  # Sync wait
        for fname in sorted(os.listdir(MANUAL_SOURCE_DIR)):
            if fname.endswith(".glsl"):
                path = os.path.join(MANUAL_SOURCE_DIR, fname)
                with open(path, 'r') as f:
                    candidates.append({
                        "id": fname,
                        "source": "manual_injection",
                        "license": "manual",
                        "code": f.read()
                    })

    return candidates


def run_gatekeeper():
    """Render every new candidate shader and file it as verified or quarantined.

    Steps:
      1. Load the verified set, the quarantine set, and the blocklist.
      2. Drop every blocklisted hash from quarantine.
      3. Gather candidates and skip any whose content hash is already
         blocklisted, verified, or quarantined.
      4. Render the rest with ``render_shader``. Successes go to the verified
         set; failures go to quarantine with the error stored under
         ``'error'``. A verified candidate whose id contains ``_fix`` also
         removes quarantine entries whose id equals the id with the ``_fix``
         marker stripped.
      5. Rewrite VERIFIED_FILE and QUARANTINE_FILE after every 50 rendered
         shaders and once more at the end.

    Progress is printed with ``\\r`` before each render so the shader being
    attempted stays visible if rendering stalls.
    """
    print("[*] Loading State...")
    verified_db = load_jsonl(VERIFIED_FILE)
    quarantine_db = load_jsonl(QUARANTINE_FILE)
    blocklist = load_blocklist()

    print(f"    - Verified:   {len(verified_db)}")
    print(f"    - Quarantine: {len(quarantine_db)}")
    print(f"    - Blocklist:  {len(blocklist)}")

    # Purge banned hashes from quarantine up front, so a banned entry is
    # removed even when its code is no longer among the candidates.
    for banned_hash in blocklist.intersection(quarantine_db):
        del quarantine_db[banned_hash]

    # 1. Setup Render Context
    ctx = create_headless_context()

    new_verified = 0
    new_quarantine = 0
    rendered = 0

    try:
        # 2. Gather Candidates
        candidates = _gather_candidates()
        total_candidates = len(candidates)
        print(f"[*] Processing {total_candidates} candidates...")

        # 3. Process Loop
        for i, item in enumerate(candidates):
            # Records that are not JSON objects, or that carry no usable
            # source text, cannot be hashed or rendered.
            if not isinstance(item, dict):
                continue
            code = item.get('code', '')
            if not code or not isinstance(code, str):
                continue

            shader_id = str(item.get('id', 'unknown'))
            code_hash = get_content_hash(code)

            # --- PRE-CHECK FILTERING ---
            if (
                code_hash in blocklist
                or code_hash in verified_db
                or code_hash in quarantine_db
            ):
                continue

            # --- DYNAMIC LOGGING ---
            # Printed BEFORE rendering with \r so the line is overwritten by
            # the next attempt; if the render hangs, this text stays visible.
            progress = f"[{i+1}/{total_candidates}]"
            status_msg = f"{progress} Attempting: {shader_id[:10]}... | Hash: {code_hash}"
            # Pad with spaces to clear previous longer lines
            print(f"\r{status_msg:<120}", end="", flush=True)

            # VALIDATE
            success, message, _ = render_shader(ctx, code)

            if success:
                verified_db[code_hash] = item
                new_verified += 1

                # A verified "<id>_fix" supersedes the quarantined original.
                if "_fix" in shader_id:
                    original_id = shader_id.replace("_fix.glsl", "").replace("_fix", "")
                    hashes_to_purge = [
                        h for h, q in quarantine_db.items()
                        if q.get('id') == original_id
                    ]
                    for h in hashes_to_purge:
                        del quarantine_db[h]
            else:
                item['error'] = message
                quarantine_db[code_hash] = item
                new_quarantine += 1

                short_err = str(message).split('\n')[0][:50]

                # Failures get their own line so they remain in the log.
                print(f"\r{progress} [-] Quarantine: {shader_id[:10]}... -> {short_err:<50}")

            # Checkpoint every 50 rendered shaders. Counting renders, not
            # candidate indices, keeps checkpoints firing when most
            # candidates are skipped by the pre-check.
            rendered += 1
            if rendered % 50 == 0:
                _write_jsonl(VERIFIED_FILE, verified_db)
                _write_jsonl(QUARANTINE_FILE, quarantine_db)
    finally:
        # Free the GL context even when processing is interrupted.
        ctx.release()

    # 4. Final Save
    print(f"\n\n[*] Saving Updates...")
    _write_jsonl(VERIFIED_FILE, verified_db)
    _write_jsonl(QUARANTINE_FILE, quarantine_db)

    print(f"[DONE] Gatekeeper Run Complete.")
    print(f"   New Verified:   {new_verified}")
    print(f"   New Quarantine: {new_quarantine}")
    print(f"   Total Verified: {len(verified_db)}")

# Run the Gatekeeper when this cell/script is executed directly rather than
# imported as a module.
if __name__ == "__main__":
    run_gatekeeper()

[*] Loading State...
    - Verified:   796
    - Quarantine: 211
    - Blocklist:  45
[*] Processing 1174 candidates...


[*] Saving Updates...
[DONE] Gatekeeper Run Complete.
   New Verified:   0
   New Quarantine: 0
   Total Verified: 796


In [113]:
# @title 5. Inspector: View Quarantine
import html
from IPython.display import display, HTML

def inspect_quarantine(index=0):
    """Display one quarantined shader together with its error and source.

    Args:
        index: 0-based position of the entry in QUARANTINE_FILE. Negative
            values count from the end; any position outside the file falls
            back to the first entry.

    Prints the entry's id, content hash, and stored error, renders the
    source in a copy-friendly ``<textarea>``, and prints a ready-to-run
    ``add_to_blocklist(...)`` call. Problems reading the entry are reported
    instead of being silently ignored.
    """
    if not os.path.exists(QUARANTINE_FILE):
        print("[!] Quarantine is empty.")
        return

    with open(QUARANTINE_FILE, 'r') as f:
        lines = f.readlines()

    if not lines:
        print("[!] Quarantine is empty.")
        return

    if index < 0:
        index = len(lines) + index
    # Anything still out of range (too large or too negative) shows item #1.
    if not 0 <= index < len(lines):
        index = 0

    try:
        item = json.loads(lines[index])
    except json.JSONDecodeError as err:
        print(f"[!] Quarantine line {index + 1} is not valid JSON: {err}")
        return
    if not isinstance(item, dict):
        print(f"[!] Quarantine line {index + 1} is not a JSON object.")
        return

    code = item.get('code', '')
    # Calculate hash on the fly so you can see it
    code_hash = get_content_hash(code)

    print(f"\nQUARANTINE ITEM #{index + 1} / {len(lines)}")
    print(f"Source ID:   {item.get('id')}  <-- (Git Commit / Filename)")
    print(f"Fingerprint: {code_hash}  <-- (Content Hash)")
    print(f"Error:       {item.get('error')}")
    print("-" * 80)

    # Copy-friendly box; the source is HTML-escaped so markup characters in
    # the shader code are shown literally.
    safe_code = html.escape(code)
    display(HTML(f"""
        <textarea rows="15" style="width:100%; font-family:monospace; font-size:11px;">
{safe_code}
</textarea>
        """))

    # Suggest the id when the entry has one, otherwise the content hash,
    # which add_to_blocklist also accepts.
    ban_key = item.get('id') or code_hash
    print(f"To Ban:\nadd_to_blocklist('{ban_key}')")

# View first item
inspect_quarantine()


QUARANTINE ITEM #1 / 212
Source ID:   ddc835e0f19f85618a295241e3e6fe4011afff53  <-- (Git Commit / Filename)
Fingerprint: 588bdcf0e22e825ec6335d55f0b33b2683cc0f548e4bd172d14cb109e188691e  <-- (Content Hash)
Error:       GLSL Compiler failed

fragment_shader
0:215(25): error: `SPLASH_OFFSET_X' undeclared
0:215(20): error: cannot construct `vec2' from a non-numeric data type
0:219(32): error: `SCALE' undeclared
0:219(15): error: operands to arithmetic operators must be numeric


--------------------------------------------------------------------------------


To Ban:
add_to_blocklist('ddc835e0f19f85618a295241e3e6fe4011afff53')


In [114]:
# @title 6. Blocklist Tool (Ban & Delete)
def add_to_blocklist(target_id_or_hash):
    """Ban a shader and remove it from quarantine.

    Args:
        target_id_or_hash: Either the ``id`` of a quarantined entry or a
            content hash. Surrounding whitespace is ignored. A hash that is
            not in quarantine is still accepted if it is a 64-character
            hexadecimal digest.

    The resolved hash is added to BLOCKLIST_FILE (unless already present)
    and, if it is in quarantine, the entry is removed and QUARANTINE_FILE
    is rewritten. Nothing is changed when the target cannot be resolved.
    """
    # Tolerate stray whitespace from copy/paste, as ban_by_hash does.
    if isinstance(target_id_or_hash, str):
        target_id_or_hash = target_id_or_hash.strip()

    blocklist = load_blocklist()
    quarantine_db = load_jsonl(QUARANTINE_FILE)

    target_hash = None
    found_id = "Unknown"

    # 1. Try finding by ID (Most common); the first matching entry is used.
    for h, item in quarantine_db.items():
        if item.get('id') == target_id_or_hash:
            target_hash = h
            found_id = target_id_or_hash
            break

    # Content hashes are lowercase hex, so compare in lowercase.
    as_hash = target_id_or_hash.lower() if isinstance(target_id_or_hash, str) else ""

    # 2. Try finding by Hash (Direct)
    if not target_hash and as_hash in quarantine_db:
        target_hash = as_hash
        found_id = quarantine_db[target_hash].get('id')

    if not target_hash:
        # Fallback: a raw hash that is not in quarantine. Require a real
        # SHA-256 hex digest so a 64-character id or typo is not stored as
        # a ban that could never match anything.
        hex_digits = "0123456789abcdef"
        if len(as_hash) == 64 and all(ch in hex_digits for ch in as_hash):
            target_hash = as_hash
        else:
            print(f"[!] Could not find '{target_id_or_hash}' in Quarantine.")
            return

    # EXECUTE BAN
    if target_hash not in blocklist:
        blocklist.add(target_hash)
        with open(BLOCKLIST_FILE, 'w') as f:
            json.dump(list(blocklist), f)

        print(f"[-] BANNED:")
        print(f"    ID:   {found_id}")
        print(f"    Hash: {target_hash}")
    else:
        print(f"[*] Hash already blocked.")

    # CLEANUP: runs even when the hash was already blocked, so a stale
    # quarantine entry is still removed.
    if target_hash in quarantine_db:
        del quarantine_db[target_hash]
        with open(QUARANTINE_FILE, 'w') as f:
            for v in quarantine_db.values():
                f.write(json.dumps(v) + "\n")
        print(f"    (Removed from Quarantine)")

# Usage Example:
# add_to_blocklist("5de2b0...")



[-] BANNED:
    ID:   ddc835e0f19f85618a295241e3e6fe4011afff53
    Hash: 588bdcf0e22e825ec6335d55f0b33b2683cc0f548e4bd172d14cb109e188691e
    (Removed from Quarantine)


In [None]:
# @title Emergency: Ban by Hash
import json
import os

# --- PASTE THE OFFENDING HASH HERE ---
TARGET_HASH = "a24e5d1ea02cb1efcc7a171ee9c2d9f325750f41e6e0e8c69ec783ca8838fe36"
# -------------------------------------

def ban_by_hash(target_hash):
    """Add a content hash to BLOCKLIST_FILE without consulting quarantine.

    Args:
        target_hash: SHA-256 hex digest of the shader to ban. Surrounding
            whitespace is ignored and the digest is lowercased. Anything
            that is not a 64-character hexadecimal string (including the
            empty string and the ``PASTE_HASH_HERE`` placeholder) is
            rejected with a message and nothing is written.

    If the existing blocklist cannot be read, a warning is printed and the
    ban proceeds from an empty blocklist.
    """
    target_hash = target_hash.strip().lower()
    hex_digits = "0123456789abcdef"
    looks_like_digest = (
        len(target_hash) == 64
        and all(ch in hex_digits for ch in target_hash)
    )
    if not looks_like_digest:
        print("[!] Please paste a valid hash in the TARGET_HASH variable.")
        return

    # Load Blocklist
    blocklist = set()
    if os.path.exists(BLOCKLIST_FILE):
        try:
            with open(BLOCKLIST_FILE, 'r') as f:
                blocklist = set(json.load(f))
        except (OSError, ValueError, TypeError) as err:
            # Still best-effort, but say so: the write below replaces the
            # unreadable file.
            print(f"[!] Could not read existing blocklist ({err}); starting from an empty one.")

    # Add to Blocklist
    if target_hash not in blocklist:
        blocklist.add(target_hash)
        with open(BLOCKLIST_FILE, 'w') as f:
            json.dump(list(blocklist), f)
        print(f"[-] BANNED Hash: {target_hash}")
        print("    (You can now restart the Gatekeeper)")
    else:
        print("[*] This hash is ALREADY in the blocklist.")

ban_by_hash(TARGET_HASH)

In [None]:
# @title Inspector: View Hanging Shader
import json
import os
import hashlib

# 1. Paste the hash from your "Attempting..." log line
TARGET_HASH = "fe516ebf8abdb146dac1cf70c647e8474ba8e876c1c53af0f12edd58884141c5"

# Paths (Same as your configuration)
BASE_DIR = "/content/drive/MyDrive/projects/EarthShader/dataset"
STACK_SOURCE = os.path.join(BASE_DIR, "thestack/shaders_archive.jsonl")
MANUAL_SOURCE_DIR = os.path.join(BASE_DIR, "manual_injections")

def get_content_hash(code):
    """Return the SHA-256 hex digest of ``code`` with all whitespace removed."""
    compact_bytes = "".join(code.split()).encode('utf-8')
    return hashlib.sha256(compact_bytes).hexdigest()

def find_and_inspect():
    """Locate the shader whose content hash equals TARGET_HASH and print it.

    STACK_SOURCE is searched first, line by line; MANUAL_SOURCE_DIR is
    searched only if no match was found there. The first match is passed to
    ``print_match`` and the search stops. A message is printed when nothing
    matches.
    """
    print(f"[*] Searching for hash: {TARGET_HASH[:10]}...")
    # Content hashes are lowercase hex; tolerate pasted whitespace/case.
    wanted = TARGET_HASH.strip().lower()
    found = False

    # A. Search 'The Stack'
    if os.path.exists(STACK_SOURCE):
        with open(STACK_SOURCE, 'r') as f:
            for i, line in enumerate(f):
                # Only the JSON parse is guarded, so errors raised while
                # hashing or printing a match are no longer swallowed.
                try:
                    item = json.loads(line)
                except json.JSONDecodeError:
                    continue
                if not isinstance(item, dict):
                    continue
                code = item.get('code', '')
                if isinstance(code, str) and get_content_hash(code) == wanted:
                    print_match(i, item, "The Stack")
                    found = True
                    break

    # B. Search Manual Injections (if not found yet)
    if not found and os.path.exists(MANUAL_SOURCE_DIR):
        for fname in sorted(os.listdir(MANUAL_SOURCE_DIR)):
            if not fname.endswith(".glsl"):
                continue
            path = os.path.join(MANUAL_SOURCE_DIR, fname)
            with open(path, 'r') as f:
                code = f.read()
            if get_content_hash(code) == wanted:
                print_match(0, {"id": fname, "code": code}, "Manual Injection")
                found = True
                break

    if not found:
        print("[!] Shader not found. Are you sure the Hash is correct?")

def print_match(index, item, source):
    """Print where a matching shader was found, its id, and its full code.

    Ends with a ready-to-run ``add_to_blocklist`` call that uses TARGET_HASH.
    """
    divider = "-" * 80
    report_rows = (
        "\n" + "=" * 80,
        f"FOUND IN: {source} (Index {index})",
        f"ID:       {item.get('id')}",
        divider,
        "CODE:",
        item.get('code', ''),
        divider,
        "\nTo Ban this shader, run:",
        f"add_to_blocklist('{TARGET_HASH}')",
    )
    # One print() per row, so each row is written exactly as a separate
    # print call would write it.
    for row in report_rows:
        print(row)

find_and_inspect()

In [None]:
# @title 7. Inspector: View Candidate (For Stalls/Hangs)
import json
import os

def inspect_candidate(index):
    """Print one shader from the Gatekeeper's input queue.

    Use this to identify shaders that cause the Gatekeeper to hang.

    Args:
        index: 0-based position in the candidate list (STACK_SOURCE lines
            first, then the sorted ``*.glsl`` files of MANUAL_SOURCE_DIR).
            The Gatekeeper's progress line prints ``[i+1/total]``, so the
            shader logged as ``[k/total]`` is ``index = k - 1``.

    Prints the candidate's id, source, up to 12000 characters of code, and
    a ready-to-run ``add_to_blocklist`` call. An out-of-range index or a
    record that is not a JSON object is reported and nothing else is shown.
    """
    print(f"[*] Reconstructing Candidate List to find #{index}...")

    # 1. Re-build the list exactly how the Gatekeeper does
    candidates = []

    # A. From Stack
    if os.path.exists(STACK_SOURCE):
        with open(STACK_SOURCE, 'r') as f:
            for line in f:
                try:
                    candidates.append(json.loads(line))
                except json.JSONDecodeError:
                    continue

    # B. From Manual
    if os.path.exists(MANUAL_SOURCE_DIR):
        # Sort to ensure deterministic order matches Gatekeeper
        for fname in sorted(os.listdir(MANUAL_SOURCE_DIR)):
            if fname.endswith(".glsl"):
                path = os.path.join(MANUAL_SOURCE_DIR, fname)
                with open(path, 'r') as f:
                    candidates.append({
                        "id": fname,
                        "source": "manual_injection",
                        "license": "manual",
                        "code": f.read()
                    })

    # 2. Bounds Check
    total = len(candidates)
    if index < 0 or index >= total:
        print(f"[!] Index {index} out of bounds (0-{total-1}).")
        return

    # 3. Display
    target = candidates[index]
    if not isinstance(target, dict):
        print(f"[!] Candidate #{index} is not a JSON object: {target!r}")
        return

    print(f"\nCANDIDATE #{index} / {total}")
    print(f"ID:     {target.get('id')}")
    print(f"Source: {target.get('source')}")
    print("-" * 80)

    # Print Code (at most 12000 characters). Non-string values are shown
    # via str() instead of failing on the slice.
    code = target.get('code', '')
    if not isinstance(code, str):
        code = '' if code is None else str(code)
    print(code[:12000])
    if len(code) > 12000:
        print("\n... [Truncated]")

    print("-" * 80)
    print(f"To Ban: add_to_blocklist('{target.get('id')}')")

# Inspect the one that stuck
inspect_candidate(512)