In [1]:
# @title 1. Environment Setup
!apt-get update -qq

# WE INSTALL THESE GRAPHICS DRIVERS SOLELY FOR THE SHARED LIBRARY
# Even though this notebook only does text cleaning, importing 'shader_utils.py'
# requires 'moderngl' to be present, otherwise the import will crash.
!apt-get install -y -qq libegl1-mesa libgl1-mesa-dri libxcb-xfixes0-dev mesa-vulkan-drivers
!pip install -q datasets huggingface_hub moderngl numpy

import os
import re
import json
import shutil
import time
from datasets import load_dataset
from google.colab import drive
from google.colab import userdata
from huggingface_hub import login

# 1. Mount Drive
# The mount is skipped when the mount point path already exists, so re-running
# this cell does not prompt for authorization again.
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

# 2. Login to Hugging Face
# The token is read from the Colab Secrets panel (Key Icon) under 'HF_TOKEN'.
# Any failure here is reported as a warning instead of stopping the notebook.
try:
    hf_token = userdata.get('HF_TOKEN')
    login(hf_token)
except Exception as login_error:
    print(f"[WARN] Could not login to Hugging Face: {login_error}")
    print("       (Make sure 'HF_TOKEN' is set in Colab Secrets)")
else:
    print("[INFO] Logged in to Hugging Face successfully.")

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
[WARN] Could not login to Hugging Face: Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.
       (Make sure 'HF_TOKEN' is set in Colab Secrets)


In [2]:
# @title 2. Setup Shared Library
import sys
import os
import importlib

# Define path to the 'lib' folder you created
LIB_DIR = "/content/drive/MyDrive/projects/EarthShader/lib"

# Add to path (guarded so re-running the cell does not add duplicates)
if LIB_DIR not in sys.path:
    sys.path.append(LIB_DIR)

# Import and reload, so edits made to shader_utils.py on Drive are picked up
# without restarting the kernel.
try:
    import shader_utils
    importlib.reload(shader_utils)
    print(f"[INFO] Shared library loaded from: {LIB_DIR}")
except ModuleNotFoundError as e:
    # ModuleNotFoundError.name is the module that could not be located. Only
    # blame the lib folder when that module is shader_utils itself; otherwise
    # the file was located and one of ITS imports is missing.
    if e.name == "shader_utils":
        print(f"[!] ERROR: Could not find 'shader_utils.py' in {LIB_DIR}")
    else:
        print(f"[!] ERROR: 'shader_utils.py' was found but a dependency is missing: {e}")
except ImportError as e:
    # Any other import-time failure: report the real cause instead of claiming
    # the file is missing.
    print(f"[!] ERROR: 'shader_utils.py' failed to import: {e}")

[INFO] Shared library loaded from: /content/drive/MyDrive/projects/EarthShader/lib


In [3]:
# @title 3. Configuration
# --- Paths ---
# Scratch file on the Colab runtime disk; records are written here first.
LOCAL_FILE = "/content/shaders_archive_temp.jsonl"
# Folder on the mounted Drive that receives the backup copy.
DRIVE_DIR = "/content/drive/MyDrive/projects/EarthShader/dataset/thestack"
FINAL_DRIVE_FILE = os.path.join(DRIVE_DIR, "shaders_archive.jsonl")

# --- Execution limit ---
# Maximum number of shaders to save; None means no limit (full production run).
LIMIT = None

# --- Safe license list (strict, permissive only) ---
SAFE_LICENSES = [
    "MIT",
    "Apache-2.0",
    "BSD-3-Clause",
    "BSD-2-Clause",
    "CC0-1.0",
    "Unlicense",
    "ISC",
    "BlueOak-1.0.0",
]

In [4]:
# @title 4. Helper Functions
import shutil
import os

def save_to_drive():
    """
    Copy the runtime-local JSONL file onto Google Drive.

    Reads the module-level DRIVE_DIR, LOCAL_FILE and FINAL_DRIVE_FILE paths
    from the Configuration cell. The Drive folder is created when absent.
    If the local file does not exist, a warning is printed and nothing is
    copied. Returns None.
    """
    drive_dir_missing = not os.path.exists(DRIVE_DIR)
    if drive_dir_missing:
        os.makedirs(DRIVE_DIR, exist_ok=True)

    # Nothing to back up yet: warn and leave any existing Drive copy untouched.
    if not os.path.exists(LOCAL_FILE):
        print(f"[!] Warning: Local file {LOCAL_FILE} not found to save.")
        return

    # Overwrite the Drive copy with the current contents of the temp file.
    shutil.copy(LOCAL_FILE, FINAL_DRIVE_FILE)

In [5]:
# @title 5. Start Download Job
def download_production():
    """
    Stream the GLSL subset of 'bigcode/the-stack', filter it, and archive the
    surviving shaders as JSON Lines.

    For every streamed sample the function:
      1. skips it unless shader_utils.is_safe_license() accepts its licenses,
      2. cleans the source with shader_utils.clean_shader_code(),
      3. keeps it only if shader_utils.is_strictly_shadertoy() accepts the
         cleaned code.

    Kept records ({"id", "license", "repo", "code"}) are written to LOCAL_FILE
    and copied to Drive via save_to_drive() after every 500 saved shaders and
    once more when the run ends. Scanning stops early once LIMIT shaders have
    been saved (when LIMIT is set) or on KeyboardInterrupt.

    Relies on module-level names from earlier cells: load_dataset, shader_utils,
    json, time, LOCAL_FILE, FINAL_DRIVE_FILE, LIMIT and save_to_drive.

    Returns None. If the dataset cannot be opened, an error is printed and the
    function returns without writing anything.
    """
    print(f"[*] Starting PRODUCTION Download from 'The Stack'...")
    print(f"[*] Target: {FINAL_DRIVE_FILE}")

    # Load dataset in Streaming Mode
    try:
        ds = load_dataset("bigcode/the-stack", data_dir="data/glsl", split="train", streaming=True)
    except Exception as e:
        print(f"[!] Error connecting to Hugging Face: {e}")
        return

    saved_count = 0
    scanned_count = 0
    error_count = 0  # samples skipped because processing raised
    start_time = time.time()

    print(f"[*] Stream opened. Scanning & Cleaning...")

    try:
        with open(LOCAL_FILE, "w", encoding="utf-8") as f:
            try:
                for sample in ds:
                    scanned_count += 1

                    # Update Status every 10,000 scanned files
                    if scanned_count % 10000 == 0:
                        elapsed = time.time() - start_time
                        # +0.01 avoids a division by zero on a very fast first batch
                        rate = scanned_count / (elapsed + 0.01)
                        print(f"   [Scanning] Checked: {scanned_count:,} | Saved: {saved_count:,} | Speed: {rate:.0f} files/sec", end="\r", flush=True)

                    try:
                        raw_code = sample.get("content", "")
                        licenses = sample.get("max_stars_repo_licenses", [])

                        # 1. License Filter (Using Shared Library)
                        if not shader_utils.is_safe_license(licenses):
                            continue

                        # 2. Clean the Code (Using Shared Library)
                        clean_code = shader_utils.clean_shader_code(raw_code)

                        # 3. Strict Shadertoy Validation (Using Shared Library)
                        if shader_utils.is_strictly_shadertoy(clean_code):
                            record = {
                                "id": sample.get("hexsha"),
                                "license": licenses[0],
                                "repo": sample.get("max_stars_repo_name", "unknown"),
                                "code": clean_code
                            }
                            f.write(json.dumps(record) + "\n")
                            saved_count += 1

                            # Backup every 500 saved shaders
                            if saved_count % 500 == 0:
                                # Push buffered records to disk first; otherwise the
                                # copy can miss records or end in a half-written line.
                                f.flush()
                                save_to_drive()
                                print(f"   [Backup] Saved {saved_count} shaders to Drive...                ", end="\r", flush=True)

                        if LIMIT and saved_count >= LIMIT:
                            print("\n[*] Limit reached.")
                            break

                    except Exception as e:
                        # Still best-effort (skip the sample), but no longer silent:
                        # a systemic failure such as shader_utils not being loaded
                        # would otherwise look like "nothing matched".
                        error_count += 1
                        if error_count == 1:
                            print(f"\n[!] Sample skipped due to error (later errors are only counted): {type(e).__name__}: {e}")
                        continue

            except KeyboardInterrupt:
                print("\n\n[!] Interrupted by user. Saving progress...")
    finally:
        # Final Save. Runs after the file is closed (so everything is flushed)
        # and also when the stream itself raises, so progress is not lost.
        save_to_drive()

    total_time = (time.time() - start_time) / 60
    print(f"\n\n[SUCCESS] Run Complete in {total_time:.1f} minutes.")
    print(f"[-] Total Scanned: {scanned_count:,}")
    print(f"[-] Total Saved:   {saved_count:,}")
    print(f"[-] Total Errors:  {error_count:,}")
    print(f"[-] Location:      {FINAL_DRIVE_FILE}")

if __name__ == "__main__":
    download_production()

[*] Starting PRODUCTION Download from 'The Stack'...
[*] Target: /content/drive/MyDrive/projects/EarthShader/dataset/thestack/shaders_archive.jsonl


README.md:   0%|          | 0.00/19.5k [00:00<?, ?B/s]

[*] Stream opened. Scanning & Cleaning...


[SUCCESS] Run Complete in 6.1 minutes.
[-] Total Scanned: 317,741
[-] Total Saved:   1,162
[-] Location:      /content/drive/MyDrive/projects/EarthShader/dataset/thestack/shaders_archive.jsonl


In [10]:
# @title 6. Inspect & Verify Data
import random

def inspect_shader(index=None):
    """
    Print one record from the archived dataset together with quick checks.

    Args:
        index: 0-based line number of the record to show. When None, or when
            the value lies outside the file, a random record is shown instead.

    Reads FINAL_DRIVE_FILE (one JSON object per line), prints the record's
    id / repo / license, up to the first 10,000 characters of its code, and a
    list of substring-based warnings. Returns None.
    """
    if not os.path.exists(FINAL_DRIVE_FILE):
        print("[!] Output file not found. Run the download first.")
        return

    # The whole file is loaded into memory; this is a debugging helper.
    with open(FINAL_DRIVE_FILE, 'r') as archive:
        records = archive.readlines()

    record_count = len(records)
    if not record_count:
        print("[!] Dataset is empty.")
        return

    divider = "=" * 80
    last_idx = record_count - 1

    # Choose which record to show
    if index is None:
        target_idx = random.randint(0, last_idx)
        print(f"[*] Selecting RANDOM shader (Index {target_idx} of {record_count})...")
    elif 0 <= index < record_count:
        target_idx = index
        print(f"[*] Selecting SPECIFIC shader at Index {target_idx}...")
    else:
        print(f"[!] Index {index} out of bounds (0-{last_idx}). Showing random instead.")
        target_idx = random.randint(0, last_idx)

    # Decode the chosen line
    try:
        data = json.loads(records[target_idx])
    except json.JSONDecodeError:
        print(f"[!] Error decoding JSON at line {target_idx}")
        return

    # Metadata header
    print(f"\n{divider}")
    print(f"SHADER METADATA (Index {target_idx})")
    print(f"ID:      {data.get('id')}")
    print(f"Repo:    {data.get('repo')}")
    print(f"License: {data.get('license')}")
    print(f"{divider}\n")

    # Source listing, capped at 10k characters
    code = data.get('code', '')
    print(code[:10000])
    if len(code) > 10000:
        print("\n... [Truncated at 10k characters] ...")

    print(f"\n{divider}")

    # Automated quality checks: each entry pairs a substring test with the
    # message reported when that test matches.
    print("QUALITY CHECK:")
    checks = (
        # Leftover wrapper code
        ("uniform vec3 iResolution" in code, "Wrapper: Found 'uniform iResolution'"),
        ("void main()" in code, "Wrapper: Found 'void main()' footer"),
        # Texture usage (crucial for black screen prevention)
        ("texture(" in code or "iChannel" in code, "Dependency: Found external texture lookup"),
    )
    issues = [message for matched, message in checks if matched]

    if issues:
        for issue in issues:
            print(f"XXX {issue}")
    else:
        print("YAY Code looks clean (Pure Procedural Shadertoy format).")
        print("    Copy-paste this into https://www.shadertoy.com/new to test.")
    print(divider)

# --- HOW TO USE ---
# inspect_shader()       <-- Shows a random shader
# inspect_shader(5)      <-- Shows the 6th shader (Index 5)

inspect_shader()

[*] Selecting RANDOM shader (Index 167 of 1162)...

SHADER METADATA (Index 167)
ID:      9b55e51cd7f7a6dd01d66e328d2235ab5dd48aa2
Repo:    falcon11/glslsandbox-ios
License: MIT

#ifdef GL_ES
precision mediump float;
#endif

uniform float time;
uniform vec2 resolution;

#define iTime time
#define iResolution resolution

#define PI 3.141592653589793238

float sd_circle(vec2 p, float r) {
    return length(p) - r;
}

float length_n(vec2 p, float n) { p=pow(abs(p), vec2(n)); return pow(p.x+p.y, 1.0/n); }

float sd_ellipsoid(vec2 p, vec2 r, float roundness){
    float k1 = length_n(p/r, roundness);
    float k2 = length_n(p/(r*r), roundness);
    return k1*(k1-1.0)/k2;
}

float sd_vesica(vec2 p, float r, float d) {
    p = abs(p);
    float b = sqrt(r*r-d*d);
    return ((p.y-b)*d>p.x*b) ? length(p-vec2(0.0,b))
                             : length(p-vec2(-d,0.0))-r;
}

float sd_arc(vec2 p, float ta, float tb, float ra, float rb) {
    vec2 sca = vec2(cos(ta), sin(ta));
    vec2 scb = vec2(