**Stage 1 - Data Generation**

In [None]:
# @title 1. Environment and paths setup
import os
import sys
from google.colab import drive

# Install the required graphics libraries for the renderer.
# (`!` is a notebook shell escape; `-q` keeps pip's output quiet.)
!pip install -q moderngl moderngl-window

# Mount Google Drive, skipping the call when the mount point already exists
# (e.g. when this cell is re-run in the same session).
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

# Define project paths. Note that the Drive folder name 'My Drive' contains a
# space, so these paths must be quoted if they are ever passed to a shell.
PROJECT_ROOT = '/content/drive/My Drive/projects/EarthShader'
DATASET_DIR = os.path.join(PROJECT_ROOT, 'dataset/stage1')
IMAGES_DIR = os.path.join(DATASET_DIR, 'images')      # rendered PNG files
JSONL_PATH = os.path.join(DATASET_DIR, 'dataset.jsonl')  # one JSON entry per line

# Create the images directory (and its parents, including DATASET_DIR) if
# they do not exist yet.
os.makedirs(IMAGES_DIR, exist_ok=True)

print(f"Dataset target: {DATASET_DIR}")

In [None]:
# @title 2. Library and renderer initialization
import sys
import os

# 1. Add the project's local library folder to the import path (once), so the
#    project modules imported below can be found.
LIB_DIR = os.path.join(PROJECT_ROOT, 'lib')
if LIB_DIR not in sys.path:
    sys.path.append(LIB_DIR)

# These imports must come after the sys.path update above.
from gl_renderer import ShaderRenderer
from generators.primitives import generate_primitive

# 2. Initialize the renderer at a fixed 256x256 resolution.
# NOTE(review): 256x256 is described as the training resolution; confirm it
# matches the resolution used by the downstream training configuration.
renderer = ShaderRenderer(width=256, height=256)

print("Renderer and Generator successfully linked.")

In [None]:
# @title 3. Dataset generation loop and validation
import json
import tqdm
import random
from PIL import Image
import numpy as np

# Generation targets for the Stage 1 run.
# NUM_SAMPLES: number of valid samples the generation loop must write before
# it stops (rejected renders do not count towards this total).
NUM_SAMPLES = 2000
# START_INDEX: first seed handed to generate_primitive(); the seed is then
# incremented by one after every attempt, accepted or not.
START_INDEX = 0

def is_valid_render(path):
    """Return True when the image at ``path`` looks like a usable render.

    The image is decoded as RGB and rejected when it is entirely black or
    when its pixel values are almost constant (standard deviation below 0.1),
    i.e. a blank or flat-colour frame.

    Args:
        path: Filesystem path of the rendered image to check.

    Returns:
        bool: True if the image passes both checks; False if it fails one of
        them or cannot be opened/decoded at all.
    """
    try:
        with Image.open(path) as img:
            arr = np.array(img.convert('RGB'))
    except Exception:
        # A missing, truncated or otherwise undecodable file is simply an
        # invalid render. `Exception` (rather than a bare `except:`) lets
        # KeyboardInterrupt and SystemExit propagate, so interrupting the
        # notebook actually stops the generation loop instead of being
        # counted as one more rejected sample.
        return False

    # An image without pixels has nothing to validate (and np.max would raise).
    if arr.size == 0:
        return False
    # Reject images that are entirely black.
    if np.max(arr) == 0:
        return False
    # Reject images that are (almost) a single flat colour.
    if np.std(arr) < 0.1:
        return False
    return True

print(f"Starting generation of {NUM_SAMPLES} valid samples...")

# Safety valve: if this many seeds in a row fail to produce a valid image, the
# renderer or generator is almost certainly broken, so stop instead of looping
# forever (the loop below only ends once NUM_SAMPLES samples were accepted).
MAX_CONSECUTIVE_FAILURES = 500

# Mode 'w' starts a fresh registry on every run. The progress bar is used as a
# context manager so it is closed even when the loop is interrupted.
with open(JSONL_PATH, 'w') as f, tqdm.tqdm(total=NUM_SAMPLES) as pbar:
    generated_count = 0            # accepted samples; also the next sample id
    current_seed = START_INDEX     # advances on every attempt, accepted or not
    consecutive_failures = 0

    while generated_count < NUM_SAMPLES:
        code, analysis = generate_primitive(current_seed)

        # The file name uses the accepted-sample index, so ids stay contiguous
        # even when some seeds are rejected.
        image_filename = f"sample_{generated_count}.png"
        image_path = os.path.join(IMAGES_DIR, image_filename)

        success = renderer.render(code, image_path)

        if success and is_valid_render(image_path):
            entry = {
                "id": generated_count,
                "image_path": image_path,
                "analysis": analysis,
                "code": code
            }
            f.write(json.dumps(entry) + '\n')
            # Push every entry to disk immediately so an interrupted session
            # keeps all samples written so far.
            f.flush()
            os.fsync(f.fileno())
            generated_count += 1
            consecutive_failures = 0
            pbar.update(1)
        else:
            # Discard whatever the failed attempt left behind.
            if os.path.exists(image_path):
                os.remove(image_path)
            consecutive_failures += 1
            if consecutive_failures >= MAX_CONSECUTIVE_FAILURES:
                raise RuntimeError(
                    f"Aborting generation: {consecutive_failures} consecutive "
                    f"seeds (last seed {current_seed}) failed to render a "
                    f"valid image; {generated_count} samples were written."
                )

        current_seed += 1

# Report the number of samples actually written.
print(f"\n[SUCCESS] Stage 1 dataset finalized with {generated_count} samples.")

In [None]:
# @title 4. Verify dataset registry
import os

# 1. Handle the missing-registry case first; every other check needs the file.
if not os.path.exists(JSONL_PATH):
    print(f"ERROR: Dataset registry not found at {JSONL_PATH}")
else:
    # Size on disk, converted from bytes to MiB.
    size = os.path.getsize(JSONL_PATH) / 2 ** 20
    print(f"Registry found: {JSONL_PATH}")
    print(f"Current file size: {size:.2f} MB")

    # 2. Read every line of the registry and report how many there are.
    with open(JSONL_PATH, 'r') as f:
        lines = f.readlines()
    print(f"Total valid entries: {len(lines)}")

    if not lines:
        print("Warning: Registry is empty.")
    else:
        # Show the first 120 characters of the final line so the id of the
        # last written entry can be compared against the expected total.
        print(f"Last entry preview: {lines[-1][:120]}...")

In [None]:
# @title 5. Inspector and visual audit
import random
from IPython.display import display, HTML
import base64
import json
import os

def inspect_random_sample():
    """Display one randomly chosen dataset entry for a visual audit.

    Reads the JSONL registry at ``JSONL_PATH``, picks a random entry, shows
    its GLSL code next to its image as inline HTML, and then prints the
    entry's analysis text. A short message is printed and the function
    returns early when the registry is missing or empty, or when the entry's
    image file does not exist.

    Returns:
        None
    """
    # Function-scope import: only this function needs the stdlib HTML escaper.
    import html

    if not os.path.exists(JSONL_PATH):
        print(f"Dataset registry not found at {JSONL_PATH}.")
        return

    # Load the candidate lines, ignoring blank ones so that a stray empty line
    # can never be picked and handed to json.loads().
    with open(JSONL_PATH, 'r') as registry:
        lines = [line for line in registry if line.strip()]

    if not lines:
        print("Dataset registry is empty.")
        return

    # Select and parse a random entry.
    entry = json.loads(random.choice(lines))

    img_path = entry['image_path']
    code = entry['code']
    analysis = entry['analysis']

    if not os.path.exists(img_path):
        print(f"Image file missing at path: {img_path}")
        return

    # Encode the image to base64 so it can be embedded as a data URI.
    with open(img_path, 'rb') as image_file:
        img_data = base64.b64encode(image_file.read()).decode()

    # Escape &, < and > in the GLSL code for HTML safety. Quotes are left
    # untouched because the code is placed in element content, not in an
    # attribute value.
    code_html = html.escape(code, quote=False)

    # Build a clean side-by-side layout.
    markup = f"""
        <div style="display: flex; gap: 20px; font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;">
            <div style="flex: 1; background: #fafafa; padding: 15px; border: 1px solid #eee; border-radius: 8px; overflow-x: auto;">
                <h3 style="margin-top: 0; color: #333;">Generated GLSL Logic</h3>
                <pre style="white-space: pre-wrap; font-size: 12px; color: #444;">{code_html}</pre>
            </div>
            <div style="flex: 1; text-align: center;">
                <h3 style="margin-top: 0; color: #333;">Rendered Output</h3>
                <img src="data:image/png;base64,{img_data}" style="max-width: 100%; border-radius: 4px; border: 1px solid #ddd; box-shadow: 0 2px 5px rgba(0,0,0,0.1);">
            </div>
        </div>
        """

    display(HTML(markup))

    # Print the analysis reasoning block clearly below the visual.
    print("\n" + "="*70)
    print("SDF REASONING ANALYSIS:")
    print("="*70)
    print(analysis)
    print("="*70)

# Run the inspector once; each execution of this cell shows a different
# randomly selected sample.
inspect_random_sample()

# Remind the user to repeat the check, since a single random sample says
# little about the dataset as a whole.
print("\nTip: Run this cell multiple times to ensure all samples are high-quality and noise-free.")

In [None]:
# @title 6. Quick dataset audit
import json
# Print the analysis text of (up to) the first three registry entries.
# NOTE(review): this literal path duplicates JSONL_PATH from the setup cell;
# keep the two in sync if the project location ever changes.
with open('/content/drive/My Drive/projects/EarthShader/dataset/stage1/dataset.jsonl', 'r') as f:
    # zip() stops as soon as either input runs out, so a registry holding
    # fewer than three entries ends the loop cleanly instead of passing the
    # empty string returned at end-of-file to json.loads().
    for _, line in zip(range(3), f):
        print(json.loads(line)['analysis'])