In [None]:
# @title 1. Environment Setup
!pip install -q datasets huggingface_hub
import os
import json
import shutil
import time
from datasets import load_dataset
from google.colab import drive
from google.colab import userdata
from huggingface_hub import login

# 1. Mount Google Drive so results can be persisted outside the Colab VM
drive.mount('/content/drive')

# 2. Authenticate with Hugging Face
# The token is read from the Colab secret named 'HF_TOKEN' (Key Icon in the sidebar).
# A failure here is reported but does not stop the notebook.
try:
    login(userdata.get('HF_TOKEN'))
except Exception as login_error:
    print(f"[WARN] Could not login to Hugging Face: {login_error}")
    print("       Ensure 'HF_TOKEN' is set in Colab Secrets.")
else:
    print("[INFO] Logged in to Hugging Face successfully.")

In [None]:
# @title 2. Configuration
# Paths: the working copy on the Colab VM, and the folder/file on Drive that
# receives the copy.
LOCAL_FILE = "/content/shaders_archive_temp.jsonl"
DRIVE_DIR = "/content/drive/MyDrive/projects/EarthShader/dataset/thestack"
FINAL_DRIVE_FILE = os.path.join(DRIVE_DIR, "shaders_archive.jsonl")

# Maximum number of shaders to save before stopping.
# None means no limit (full production run).
LIMIT = None

# License identifiers treated as safe (permissive / public-domain only).
SAFE_LICENSES = [
    "MIT",
    "Apache-2.0",
    "BSD-3-Clause",
    "BSD-2-Clause",
    "CC0-1.0",
    "Unlicense",
    "ISC",
    "BlueOak-1.0.0",
]

In [None]:
# @title 3. Helper Functions
def is_strictly_shadertoy(code):
    """
    Decide whether ``code`` looks like a self-contained Shadertoy shader.

    The check is purely textual. It returns True only when ``code`` is
    non-empty, mentions ``mainImage``, contains no ``#include`` and mentions
    at least one of ``fragCoord`` / ``iResolution``.
    """
    if not code:
        return False

    return (
        "mainImage" in code                                    # entry point present
        and "#include" not in code                             # no local includes
        and ("fragCoord" in code or "iResolution" in code)     # standard inputs used
    )

def is_safe_license(license_list):
    """Return True when at least one entry of ``license_list`` is in SAFE_LICENSES.

    An empty or missing list is never considered safe.
    """
    if not license_list:
        return False

    for candidate in license_list:
        if candidate in SAFE_LICENSES:
            return True
    return False

def save_to_drive():
    """Copy the local JSONL buffer to its final location on Drive.

    Creates ``DRIVE_DIR`` (including parents) when it is missing, then copies
    ``LOCAL_FILE`` to ``FINAL_DRIVE_FILE``, overwriting any earlier copy.

    Raises:
        OSError: if the directory cannot be created or the copy fails.
    """
    # exist_ok=True already makes this a no-op for an existing directory, so a
    # separate os.path.exists() pre-check is redundant (and racy).
    os.makedirs(DRIVE_DIR, exist_ok=True)
    shutil.copy(LOCAL_FILE, FINAL_DRIVE_FILE)

In [None]:
# @title 4. Start Download Job
def _build_record(sample):
    """Return the JSONL record for ``sample``, or None when it is filtered out."""
    code = sample.get("content", "")
    licenses = sample.get("max_stars_repo_licenses", [])

    if not (is_strictly_shadertoy(code) and is_safe_license(licenses)):
        return None

    # is_safe_license() accepts a sample when ANY listed license is safe, so
    # record the license that actually qualified instead of whichever is first.
    matched_license = next(
        (lic for lic in licenses if lic in SAFE_LICENSES), licenses[0]
    )
    return {
        "id": sample.get("hexsha"),
        "license": matched_license,
        "repo": sample.get("max_stars_repo_name", "unknown"),
        "code": code,
    }


def download_production():
    """Stream the GLSL part of 'The Stack' and archive Shadertoy-style shaders.

    Every sample that passes ``is_strictly_shadertoy`` and ``is_safe_license``
    is appended as one JSON line to ``LOCAL_FILE``. The file is copied to
    Drive via ``save_to_drive`` after every 500 saved shaders and once more
    at the end. Scanning stops when the stream is exhausted, when ``LIMIT``
    saved shaders are reached, or when the user interrupts the run.

    Returns:
        None. Progress and a final summary are printed.

    Raises:
        Exception: any unexpected error raised while streaming or writing is
            re-raised after the progress made so far has been copied to Drive.
        OSError: if the final copy to Drive fails.
    """
    print("[*] Starting PRODUCTION Download from 'The Stack'...")
    print(f"[*] Target: {FINAL_DRIVE_FILE}")

    # Load dataset in Streaming Mode
    try:
        ds = load_dataset("bigcode/the-stack", data_dir="data/glsl", split="train", streaming=True)
    except Exception as e:
        print(f"[!] Error connecting to Hugging Face: {e}")
        return

    saved_count = 0
    scanned_count = 0
    skipped_count = 0
    stream_error = None
    start_time = time.time()

    print("[*] Stream opened. Scanning for Shadertoy-compatible code...")

    with open(LOCAL_FILE, "w", encoding="utf-8") as f:
        try:
            for sample in ds:
                scanned_count += 1

                # Update Status every 10,000 scanned files
                if scanned_count % 10000 == 0:
                    elapsed = time.time() - start_time
                    rate = scanned_count / (elapsed + 0.01)
                    print(f"   [Scanning] Checked: {scanned_count:,} | Saved: {saved_count:,} | Speed: {rate:.0f} files/sec", end="\r", flush=True)

                # Only a malformed sample is skipped; I/O failures below must
                # not be silently swallowed.
                try:
                    record = _build_record(sample)
                    line = None if record is None else json.dumps(record) + "\n"
                except (AttributeError, IndexError, KeyError, TypeError, ValueError):
                    skipped_count += 1
                    continue

                if line is not None:
                    f.write(line)
                    saved_count += 1

                    # Backup every 500 valid shaders
                    if saved_count % 500 == 0:
                        # Push buffered lines to disk first, otherwise the
                        # copy would miss the most recent records.
                        f.flush()
                        try:
                            save_to_drive()
                        except OSError as e:
                            # Best effort: a failed checkpoint must not end
                            # the run; the next checkpoint tries again.
                            print(f"\n[WARN] Drive backup failed (will retry at next checkpoint): {e}")
                        else:
                            print(f"   [Backup] Saved {saved_count} shaders to Drive...                ", end="\r", flush=True)

                if LIMIT and saved_count >= LIMIT:
                    print("\n[*] Limit reached.")
                    break

        except KeyboardInterrupt:
            print("\n\n[!] Interrupted by user. Saving progress...")
        except Exception as e:
            # e.g. the stream dropped mid-run: remember the error so progress
            # can still be saved, then re-raise it after the summary.
            stream_error = e
            print(f"\n\n[!] Aborted by unexpected error: {e}. Saving progress...")

    # Final Save (the file is closed here, so everything written is on disk)
    save_to_drive()

    total_time = (time.time() - start_time) / 60
    if stream_error is None:
        print(f"\n\n[SUCCESS] Run Complete in {total_time:.1f} minutes.")
    else:
        print(f"\n\n[FAILED] Run aborted after {total_time:.1f} minutes; partial results were saved.")
    print(f"[-] Total Scanned: {scanned_count:,}")
    print(f"[-] Total Saved:   {saved_count:,}")
    if skipped_count:
        print(f"[-] Total Skipped: {skipped_count:,} (malformed samples)")
    print(f"[-] Location:      {FINAL_DRIVE_FILE}")

    if stream_error is not None:
        raise stream_error

# Start the download job only when this code runs as the main module
# (not when it is imported).
if __name__ == "__main__":
    download_production()