# **Phase 2: The Seed Gatekeeper**

**Goal:** Aggregate, validate, and triage GLSL shaders to create a "Gold Standard" dataset for training.

### **Core Concepts**
* No shader enters the training set **unless it compiles and renders a valid image** in a headless environment.
* **Content Hashing:** Shaders are tracked by the **SHA-256 Hash** of their code, not their filename. This prevents duplicates and ensures that if you ban a specific code block, it stays banned forever, even if it reappears under a new name.
* **Tri-State Lifecycle:**
    1.  **Verified (`verified_seeds.jsonl`):** Valid code ready for synthetic augmentation.
    2.  **Quarantine (`quarantine.jsonl`):** Broken code waiting for manual repair.
    3.  **Blocklist (`blocklist.json`):** Permanently banned code hashes (e.g. black screens, crashes).

### **The Workflow**
1.  **Run Gatekeeper (Cell 5):** It ingests new candidates from "The Stack" (Phase 1) and your `manual_injections` folder, then tries to render each one.
2.  **Inspect Failures (Cell 6):** Use the inspector tool to see code in **Quarantine**.
3.  **Fix or Ban:**
    * **To FIX:** Copy the code, fix the error locally, save as `.glsl`, and drop it into `dataset/manual_injections/`.
    * **To BAN:** Copy the ID and run `add_to_blocklist("ID")` (Cell 4).
4.  **Re-Run:** The Gatekeeper automatically promotes your fixes and deletes banned items from Quarantine.

In [14]:
# @title 1. Environment Setup
!apt-get update -qq
!apt-get install -y -qq libegl1-mesa libgl1-mesa-dri libxcb-xfixes0-dev mesa-vulkan-drivers
!pip install -q moderngl numpy pillow datasets

import os
import json
import hashlib
import shutil
import moderngl
import numpy as np
from PIL import Image
from google.colab import drive

# Mount Google Drive at /content/drive; skipped when that path already exists.
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

print("[INFO] Environment Ready.")

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
[INFO] Environment Ready.


In [15]:
# @title 2. Configuration & Library Setup
import sys
import os
import importlib

# --- PATHS ---
# Unified structure under 'projects/EarthShader'
BASE_DIR = "/content/drive/MyDrive/projects/EarthShader/dataset"
LIB_DIR = "/content/drive/MyDrive/projects/EarthShader/lib"

# Inputs
# STACK_SOURCE is read line by line as JSONL; MANUAL_SOURCE_DIR is scanned for
# '*.glsl' files; BLOCKLIST_FILE holds a JSON list of banned content hashes.
STACK_SOURCE = os.path.join(BASE_DIR, "thestack/shaders_archive.jsonl")
MANUAL_SOURCE_DIR = os.path.join(BASE_DIR, "manual_injections")
BLOCKLIST_FILE = os.path.join(BASE_DIR, "blocklist.json")

# Outputs
# Both files are JSONL and are rewritten in full by the Gatekeeper cell.
VERIFIED_FILE = os.path.join(BASE_DIR, "verified_seeds.jsonl")
QUARANTINE_FILE = os.path.join(BASE_DIR, "quarantine.jsonl")

# --- RENDERING SETTINGS ---
# Passed to shader_utils.render_shader as width / height / time_val.
WIDTH, HEIGHT = 512, 512
RENDER_TIME = 1.0

# --- SETUP SHARED LIBRARY ---
# Make LIB_DIR importable so `import shader_utils` can find the copy on Drive.
if LIB_DIR not in sys.path:
    sys.path.append(LIB_DIR)

try:
    import shader_utils
    # Reload so that re-running this cell re-executes the module's current source.
    importlib.reload(shader_utils)
    print(f"[INFO] Shared library loaded from: {LIB_DIR}")
except ImportError:
    # NOTE(review): only a message is printed and execution continues, so later
    # cells that reference shader_utils will fail if it was never imported.
    print(f"[!] ERROR: Could not find 'shader_utils.py' in {LIB_DIR}")
    print("    Please double-check your Drive folder structure.")

# Create directories
# (os.path.dirname(VERIFIED_FILE) is BASE_DIR, the parent of both output files.)
for path in [MANUAL_SOURCE_DIR, os.path.dirname(VERIFIED_FILE)]:
    if not os.path.exists(path):
        os.makedirs(path, exist_ok=True)

[INFO] Shared library loaded from: /content/drive/MyDrive/projects/EarthShader/lib


In [16]:
# @title 3. Helper Functions (Notebook Specific)
import json
import os

def load_jsonl(path):
    """Load a JSONL file into a dictionary keyed by content hash.

    Args:
        path: Path to a JSONL file in which each line is a JSON object. The
            object's 'code' field (empty string when absent) is hashed with
            `shader_utils.get_content_hash` to produce the key.

    Returns:
        dict mapping content hash -> entry. Empty when `path` does not
        exist. When several entries share a hash, the last one wins.

    Lines that are blank, are not valid JSON, are not JSON objects, or whose
    'code' is not a string are skipped. Anything else (for example
    `shader_utils` not being loaded) is raised rather than silently
    producing an empty database.
    """
    data = {}
    if not os.path.exists(path):
        return data

    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            # Only the parse step is guarded: a malformed line is expected
            # noise, but errors in the hashing library are real bugs.
            try:
                entry = json.loads(line)
            except ValueError:  # json.JSONDecodeError subclasses ValueError
                continue
            if not isinstance(entry, dict):
                continue
            code = entry.get('code', '')
            if not isinstance(code, str):
                continue
            # Use the library to ensure hash consistency
            code_hash = shader_utils.get_content_hash(code)
            data[code_hash] = entry
    return data

def load_blocklist():
    """Load the set of banned content hashes from BLOCKLIST_FILE.

    Returns:
        set built from the JSON document stored in BLOCKLIST_FILE. An empty
        set is returned when the file does not exist, and also when it
        cannot be read or parsed; in that second case a warning is printed
        so the problem is not mistaken for "nothing is banned".
    """
    if not os.path.exists(BLOCKLIST_FILE):
        return set()

    try:
        with open(BLOCKLIST_FILE, 'r', encoding='utf-8') as f:
            return set(json.load(f))
    except (OSError, ValueError, TypeError) as err:
        # ValueError covers json.JSONDecodeError and decoding errors;
        # TypeError covers JSON content that cannot be turned into a set.
        print(f"[!] WARNING: Could not read blocklist '{BLOCKLIST_FILE}': {err}")
        print("    Treating it as empty. Repair or restore the file before banning")
        print("    anything, because the next ban rewrites the whole file.")
        return set()

In [20]:
# @title 4. Blocklist Tool (Ban & Delete)
def add_to_blocklist(target_id_or_hash):
    """Ban a shader and remove it from Quarantine.

    Args:
        target_id_or_hash: Either the 'id' of an entry in the quarantine
            file or a content hash. Surrounding whitespace on a string is
            ignored. A 64-character hexadecimal string that is not in
            Quarantine is still accepted and banned as a raw hash.

    Returns:
        None. Progress is reported with print().

    Lookup order is: quarantine entry with a matching 'id' (the first one
    found), then quarantine key equal to the argument, then the raw-hash
    fallback. The blocklist file is rewritten when a new hash is added, and
    the quarantine file is rewritten when the banned entry was in it.
    """
    # Reject missing / blank input before touching any file; callers such as
    # the reviewer GUI may hand over an entry's 'id', which can be absent.
    if isinstance(target_id_or_hash, str):
        target_id_or_hash = target_id_or_hash.strip()
    if target_id_or_hash is None or target_id_or_hash == "":
        print("[!] Please provide a Source ID or a content hash.")
        return

    blocklist = load_blocklist()
    quarantine_db = load_jsonl(QUARANTINE_FILE)

    target_hash = None
    found_id = "Unknown"

    # 1. Try finding by ID (Most common)
    for h, item in quarantine_db.items():
        if item.get('id') == target_id_or_hash:
            target_hash = h
            found_id = target_id_or_hash
            break

    # 2. Try finding by Hash (Direct)
    if target_hash is None and target_id_or_hash in quarantine_db:
        target_hash = target_id_or_hash
        found_id = quarantine_db[target_hash].get('id')

    if target_hash is None:
        # Fallback: a raw hash that is not in quarantine. Require hex digits
        # as well as the right length so an unknown ID is not banned by accident.
        looks_like_hash = (
            isinstance(target_id_or_hash, str)
            and len(target_id_or_hash) == 64
            and all(ch in "0123456789abcdefABCDEF" for ch in target_id_or_hash)
        )
        if looks_like_hash:
            target_hash = target_id_or_hash
        else:
            print(f"[!] Could not find '{target_id_or_hash}' in Quarantine.")
            return

    # EXECUTE BAN
    if target_hash not in blocklist:
        blocklist.add(target_hash)
        with open(BLOCKLIST_FILE, 'w') as f:
            json.dump(list(blocklist), f)

        print("[-] BANNED:")
        print(f"    ID:   {found_id}")
        print(f"    Hash: {target_hash}")
    else:
        print("[*] Hash already blocked.")

    # CLEANUP
    if target_hash in quarantine_db:
        del quarantine_db[target_hash]
        with open(QUARANTINE_FILE, 'w') as f:
            for v in quarantine_db.values():
                f.write(json.dumps(v) + "\n")
        print("    (Removed from Quarantine)")

# Usage Example:
# add_to_blocklist("5de2b0...")

In [45]:
# @title 5. Run Gatekeeper (Using Shared Library)
from collections import Counter
import time
import json
import os
import sys

def _write_jsonl(path, db):
    """Overwrite `path` with the values of `db`, one JSON object per line."""
    with open(path, 'w') as f:
        for entry in db.values():
            f.write(json.dumps(entry) + "\n")


def _gather_candidates():
    """Collect candidate records from the Stack archive and the manual folder.

    Returns:
        list of dicts: Stack entries first (in file order), followed by one
        record per '*.glsl' file in MANUAL_SOURCE_DIR (sorted by filename).
    """
    candidates = []

    # A. From Stack
    if os.path.exists(STACK_SOURCE):
        with open(STACK_SOURCE, 'r') as f:
            for line in f:
                try:
                    record = json.loads(line)
                except ValueError:  # blank or malformed line
                    continue
                # The processing loop calls .get() on every candidate.
                if isinstance(record, dict):
                    candidates.append(record)

    # B. From Manual
    if os.path.exists(MANUAL_SOURCE_DIR):
        if not os.listdir(MANUAL_SOURCE_DIR):
            # Sync wait (presumably gives the Drive mount a moment to catch up)
            time.sleep(1)
        for fname in sorted(os.listdir(MANUAL_SOURCE_DIR)):
            if fname.endswith(".glsl"):
                path = os.path.join(MANUAL_SOURCE_DIR, fname)
                with open(path, 'r') as f:
                    candidates.append({
                        "id": fname,
                        "source": "manual_injection",
                        "license": "manual",
                        "code": f.read()
                    })

    return candidates


def run_gatekeeper():
    """Validate every new candidate shader and file it as Verified or Quarantine.

    Steps:
        1. Load the verified / quarantine databases and the blocklist, and
           drop any blocklisted hash from both databases.
        2. Gather candidates from STACK_SOURCE and MANUAL_SOURCE_DIR.
        3. For each candidate whose hash is not blocklisted, verified or
           quarantined, call `shader_utils.render_shader`. Successes go to
           the verified database; failures go to quarantine with the
           returned message stored under 'error'. When a successful
           candidate's id contains "_fix", quarantine entries whose id
           equals the id with that suffix removed are deleted.
        4. Rewrite VERIFIED_FILE and QUARANTINE_FILE. This also happens
           after every 50 rendered candidates and when the loop is
           interrupted or fails, so finished work is not lost.

    Returns:
        None. Progress and a summary are printed.
    """
    checkpoint_every = 50  # rendered candidates between intermediate saves

    print("[*] Loading State...")
    verified_db = load_jsonl(VERIFIED_FILE)
    quarantine_db = load_jsonl(QUARANTINE_FILE)
    blocklist = load_blocklist()

    print(f"    - Verified:   {len(verified_db)}")
    print(f"    - Quarantine: {len(quarantine_db)}")
    print(f"    - Blocklist:  {len(blocklist)}")

    # Enforce the blocklist on stored state as well: a hash banned after it
    # was verified (or quarantined) must not survive in either file.
    purged_verified = 0
    purged_quarantine = 0
    for banned_hash in blocklist:
        if verified_db.pop(banned_hash, None) is not None:
            purged_verified += 1
        if quarantine_db.pop(banned_hash, None) is not None:
            purged_quarantine += 1
    if purged_verified or purged_quarantine:
        print(f"    - Purged (blocklisted): {purged_verified} verified, "
              f"{purged_quarantine} quarantined")

    # 1. Setup Render Context (Via Library)
    # Note: Cell 1 must have installed moderngl/egl for this to work
    ctx = shader_utils.create_headless_context()

    # 2. Gather Candidates
    candidates = _gather_candidates()

    total_candidates = len(candidates)
    print(f"[*] Processing {total_candidates} candidates...")

    new_verified = 0
    new_quarantine = 0
    processed = 0
    error_counts = Counter()

    # 3. Process Loop
    try:
        for i, item in enumerate(candidates):
            code = item.get('code', '')
            # str() so the slicing / substring checks below work for any id type
            shader_id = str(item.get('id', 'unknown'))

            if not code:
                continue

            # Use Library Hash
            code_hash = shader_utils.get_content_hash(code)

            # --- PRE-CHECK FILTERING ---
            # Banned, already verified, or already quarantined: nothing to do.
            if (code_hash in blocklist
                    or code_hash in verified_db
                    or code_hash in quarantine_db):
                continue

            # --- DYNAMIC LOGGING ---
            progress = f"[{i+1}/{total_candidates}]"
            status_msg = f"{progress} Attempting: {shader_id[:10]}... | Hash: {code_hash}"
            print(f"\r{status_msg:<120}", end="", flush=True)

            # VALIDATE (Via Library)
            # We pass the context, code, and resolution (defined in config)
            success, message, entropy = shader_utils.render_shader(
                ctx, code, width=WIDTH, height=HEIGHT, time_val=RENDER_TIME
            )

            if success:
                # Verified
                verified_db[code_hash] = item
                new_verified += 1

                # A working "<id>_fix" supersedes the broken original.
                if "_fix" in shader_id:
                    original_id = shader_id.replace("_fix.glsl", "").replace("_fix", "")
                    hashes_to_purge = [
                        h for h, q in quarantine_db.items()
                        if q.get('id') == original_id
                    ]
                    for h in hashes_to_purge:
                        del quarantine_db[h]

            else:
                # Quarantine
                item['error'] = message
                quarantine_db[code_hash] = item
                new_quarantine += 1

                short_err = str(message).split('\n')[0][:50]
                error_counts[short_err] += 1

                print(f"\r{progress} [-] Quarantine: {shader_id[:10]}... -> {short_err:<50}")

            # Checkpoint: counted in rendered candidates, not candidate index,
            # because skipped candidates never reach this point.
            processed += 1
            if processed % checkpoint_every == 0:
                _write_jsonl(VERIFIED_FILE, verified_db)
                _write_jsonl(QUARANTINE_FILE, quarantine_db)
    finally:
        # 4. Final Save (also runs when the loop is interrupted or raises)
        print("\n\n[*] Saving Updates...")
        _write_jsonl(VERIFIED_FILE, verified_db)
        _write_jsonl(QUARANTINE_FILE, quarantine_db)

    print("[DONE] Gatekeeper Run Complete.")
    print(f"   New Verified:   {new_verified}")
    print(f"   New Quarantine: {new_quarantine}")
    print(f"   Total Verified: {len(verified_db)}")

if __name__ == "__main__":
    run_gatekeeper()

[*] Loading State...
    - Verified:   810
    - Quarantine: 166
    - Blocklist:  106
[*] Processing 1186 candidates...
[1182/1186] Attempting: d31a6d8fd9... | Hash: 8fc7037fc0d1215cb8054073d37fd6333f9711200e728094f74032a48e75a629          

[*] Saving Updates...
[DONE] Gatekeeper Run Complete.
   New Verified:   1
   New Quarantine: 0
   Total Verified: 811


In [41]:
# @title 6. Inspector: Interactive Review (GUI Version)
import ipywidgets as widgets
from IPython.display import display, clear_output
import json
import os
import html

class QuarantineReviewer:
    """Widget-based browser for the quarantine file with a one-click ban.

    Shows one quarantined entry at a time (id, error and code) and offers
    three buttons: ban the shown entry, move to the next entry, or reload
    the quarantine file from disk.
    """

    def __init__(self):
        self.items = []
        self.current_index = 0

        # --- UI COMPONENTS ---
        self.btn_ban = widgets.Button(
            description="BAN & DELETE",
            button_style='danger', # Red
            icon='ban',
            layout=widgets.Layout(width='150px')
        )
        self.btn_skip = widgets.Button(
            description="Next / Skip",
            button_style='', # Grey
            icon='arrow-right',
            layout=widgets.Layout(width='150px')
        )
        self.btn_refresh = widgets.Button(
            description="Reload File",
            button_style='info', # Blue
            icon='refresh',
            layout=widgets.Layout(width='120px')
        )

        self.output_area = widgets.Output()

        # Event Handlers
        self.btn_ban.on_click(self.on_ban)
        self.btn_skip.on_click(self.on_skip)
        self.btn_refresh.on_click(self.on_refresh)

        # Layout container
        self.controls = widgets.HBox([self.btn_ban, self.btn_skip, self.btn_refresh])
        self.ui = widgets.VBox([self.controls, self.output_area])

        # Start
        self.load_data()

    @staticmethod
    def _read_items(path):
        """Return the JSON objects stored in the JSONL file at `path`.

        Blank lines, lines that are not valid JSON and lines that are not
        JSON objects are skipped. A missing file yields an empty list.
        """
        if not os.path.exists(path):
            return []
        items = []
        with open(path, 'r') as f:
            for line in f:
                try:
                    entry = json.loads(line)
                except ValueError:  # blank or malformed line
                    continue
                if isinstance(entry, dict):
                    items.append(entry)
        return items

    def load_data(self, keep_index=False):
        """Reload the quarantine file and redraw.

        Args:
            keep_index: When True, stay at the current position (clamped to
                the new list length) instead of jumping back to the first item.
        """
        previous_index = self.current_index
        self.items = self._read_items(QUARANTINE_FILE)
        if keep_index and self.items:
            self.current_index = min(previous_index, len(self.items) - 1)
        else:
            self.current_index = 0
        self.render()

    def on_refresh(self, b):
        """Button handler: reload the file and start again from the first item."""
        self.load_data()

    def on_ban(self, b):
        """Button handler: ban the shown entry, then show the one that follows it."""
        if not self.items: return

        # Get current item
        item = self.items[self.current_index]
        shader_id = item.get('id')
        code = item.get('code', '')

        # Ban by content hash so exactly the displayed entry is hit, even when
        # its 'id' is missing or shared with another entry. This is the same
        # hash call load_jsonl uses for its keys. Fall back to the id when the
        # code cannot be hashed.
        if isinstance(code, str):
            target = shader_utils.get_content_hash(code)
        else:
            target = shader_id

        # Use existing tool to Ban and Delete from file
        # (This writes to disk immediately)
        with self.output_area:
            print(f"[*] Banning {shader_id}...")
            add_to_blocklist(target)

        # Reload the file to sync state (since file changed), keeping position
        # so a long review session does not restart at item 1 after each ban.
        self.load_data(keep_index=True)

    def on_skip(self, b):
        """Button handler: advance to the next item, or report the end of the list."""
        if self.current_index < len(self.items) - 1:
            self.current_index += 1
            self.render()
        else:
            with self.output_area:
                clear_output()
                print("[*] End of quarantine list reached.")

    def render(self):
        """Redraw the output area for the current item."""
        with self.output_area:
            clear_output()

            if not self.items:
                print("[*] Quarantine is empty! Great job.")
                return

            item = self.items[self.current_index]
            # Tolerate a missing / null / non-string 'code' field.
            code = str(item.get('code') or '')

            # 1. INFO HEADER
            print(f"ITEM {self.current_index + 1} / {len(self.items)}")
            print(f"ID:    {item.get('id')}")
            print(f"Error: {item.get('error')}")
            print("-" * 80)

            # 2. CODE BOX (Non-editable, Copy-friendly)
            safe_code = html.escape(code)
            display(widgets.HTML(f"""
            <textarea readonly style="
                width: 100%;
                height: 400px;
                font-family: monospace;
                font-size: 11px;
                white-space: pre;
                background-color: #f7f7f7;
                border: 1px solid #ccc;
            ">{safe_code}</textarea>
            """))

# Run the app: constructing the reviewer loads the quarantine file and renders
# the first item; display() then shows the button row and the output area.
reviewer = QuarantineReviewer()
display(reviewer.ui)

VBox(children=(HBox(children=(Button(button_style='danger', description='BAN & DELETE', icon='ban', layout=Lay…

In [33]:
add_to_blocklist('0e739d894dc62eb03ddf8de0cccb9b9293073e9b')

[-] BANNED:
    ID:   0e739d894dc62eb03ddf8de0cccb9b9293073e9b
    Hash: 4c3725cfa592b283c1d252de403d24295a9548b37e905298c9d85afff4084ddb
    (Removed from Quarantine)


In [4]:
# @title 7. Inspector: View Hanging Shader
import json
import os
import hashlib
import html
from IPython.display import display, HTML

# 1. Paste the hash from your "Attempting..." log line
TARGET_HASH = "4937aa9468def26d56490631c12168070c5c0d9ac47316c21c45468368de8326"

# Paths (same values as the Configuration cell, repeated in this cell)
BASE_DIR = "/content/drive/MyDrive/projects/EarthShader/dataset"
STACK_SOURCE = os.path.join(BASE_DIR, "thestack/shaders_archive.jsonl")
MANUAL_SOURCE_DIR = os.path.join(BASE_DIR, "manual_injections")

def get_content_hash(code):
    """Return the SHA-256 hex digest of `code` with all whitespace removed.

    NOTE(review): meant to mirror the hash the Gatekeeper obtains from
    `shader_utils.get_content_hash`; that implementation is not visible
    here, so confirm the two stay in sync.
    """
    digest = hashlib.sha256()
    # Feeding the whitespace-separated pieces one by one hashes the same
    # bytes as hashing their concatenation.
    for piece in code.split():
        digest.update(piece.encode('utf-8'))
    return digest.hexdigest()

def find_and_inspect():
    """Locate the shader whose content hash equals TARGET_HASH and display it.

    The Stack archive (STACK_SOURCE) is searched first, line by line; the
    manual-injection folder is searched only when the archive has no match.
    The first match is passed to `print_match`. A message is printed when
    nothing matches.

    Returns:
        None.
    """
    print(f"[*] Searching for hash: {TARGET_HASH[:10]}...")
    found = False

    # A. Search 'The Stack'
    if os.path.exists(STACK_SOURCE):
        with open(STACK_SOURCE, 'r') as f:
            for i, line in enumerate(f):
                # Guard only parsing and hashing. Errors raised while
                # displaying a match must surface, not be reported as
                # "not found".
                try:
                    item = json.loads(line)
                    code = item.get('code', '')
                    is_match = get_content_hash(code) == TARGET_HASH
                except (ValueError, AttributeError):
                    # Malformed JSON, a non-object line, or non-string code.
                    continue
                if is_match:
                    print_match(i, item, "The Stack")
                    found = True
                    break

    # B. Search Manual Injections (if not found yet)
    if not found and os.path.exists(MANUAL_SOURCE_DIR):
        for fname in sorted(os.listdir(MANUAL_SOURCE_DIR)):
            if not fname.endswith(".glsl"):
                continue
            path = os.path.join(MANUAL_SOURCE_DIR, fname)
            with open(path, 'r') as f:
                code = f.read()
            if get_content_hash(code) == TARGET_HASH:
                print_match(0, {"id": fname, "code": code}, "Manual Injection")
                found = True
                break

    if not found:
        print("[!] Shader not found. Are you sure the Hash is correct?")

def print_match(index, item, source):
    """Show where a matching shader was found, its code, and the ban command.

    Args:
        index: Position of the match reported by the caller.
        item: Record of the match; its 'id' and 'code' fields are shown.
        source: Human-readable name of the place that was searched.
    """
    rule = "-" * 80
    header_lines = [
        "\n" + "=" * 80,
        f"FOUND IN: {source} (Index {index})",
        f"ID:       {item.get('id')}",
        rule,
    ]
    print("\n".join(header_lines))

    # The code goes into a textarea so it can be selected and copied easily;
    # it is HTML-escaped first so it is shown verbatim.
    escaped = html.escape(item.get('code', ''))
    markup = (
        "\n"
        "    <strong>Shader Code:</strong><br>\n"
        '    <textarea rows="20" style="width:100%; font-family:monospace; font-size:12px; white-space: pre;">\n'
        f"{escaped}\n"
        "</textarea>\n"
        "    "
    )
    display(HTML(markup))

    footer_lines = [
        rule,
        "\nTo Ban this shader, run:",
        f"add_to_blocklist('{TARGET_HASH}')",
    ]
    print("\n".join(footer_lines))

find_and_inspect()

[*] Searching for hash: 4937aa9468...

FOUND IN: The Stack (Index 659)
ID:       77bd096bd2afd31e50258706f04d53e6869f06fa
--------------------------------------------------------------------------------


--------------------------------------------------------------------------------

To Ban this shader, run:
add_to_blocklist('4937aa9468def26d56490631c12168070c5c0d9ac47316c21c45468368de8326')


In [None]:
# @title 8. Emergency: Ban by Hash
import json
import os

# --- PASTE THE OFFENDING HASH HERE ---
TARGET_HASH = "a24e5d1ea02cb1efcc7a171ee9c2d9f325750f41e6e0e8c69ec783ca8838fe36"
# -------------------------------------

def ban_by_hash(target_hash):
    """Add a content hash to the blocklist file without touching Quarantine.

    Args:
        target_hash: Hash string to ban; surrounding whitespace is ignored.
            An empty value or the placeholder "PASTE_HASH_HERE" is rejected.

    Returns:
        None. The outcome is reported with print().

    If BLOCKLIST_FILE exists but cannot be read or parsed, nothing is
    written: rewriting it would replace every existing ban with this one.
    """
    target_hash = target_hash.strip()
    if not target_hash or target_hash == "PASTE_HASH_HERE":
        print("[!] Please paste a valid hash in the TARGET_HASH variable.")
        return

    # Load Blocklist
    blocklist = set()
    if os.path.exists(BLOCKLIST_FILE):
        try:
            with open(BLOCKLIST_FILE, 'r') as f:
                blocklist = set(json.load(f))
        except (OSError, ValueError, TypeError) as err:
            # ValueError covers json.JSONDecodeError and decoding errors.
            print(f"[!] Could not read existing blocklist '{BLOCKLIST_FILE}': {err}")
            print("    Nothing was changed. Repair or remove the file, then run this cell again.")
            return

    # Add to Blocklist
    if target_hash not in blocklist:
        blocklist.add(target_hash)
        with open(BLOCKLIST_FILE, 'w') as f:
            json.dump(list(blocklist), f)
        print(f"[-] BANNED Hash: {target_hash}")
        print("    (You can now restart the Gatekeeper)")
    else:
        print("[*] This hash is ALREADY in the blocklist.")

ban_by_hash(TARGET_HASH)