**Stage 1 - Data Generation**

In [None]:
# @title 1. Setup and dependencies
import sys
import os
import subprocess
import importlib.util

# Make sure the moderngl rendering stack is importable, installing it on demand.
print("Installing dependencies...")
try:
    import moderngl
except ImportError:
    # The import failed: install the system GL package first, then the Python
    # package, and retry the import. check_call raises if a command fails.
    install_commands = (
        ['apt-get', 'install', '-y', 'libgl1-mesa-glx'],
        [sys.executable, "-m", "pip", "install", "moderngl"],
    )
    for command in install_commands:
        subprocess.check_call(command)
    import moderngl

import random
import json
from tqdm import tqdm
import numpy as np
from PIL import Image

# Mount drive to access the project directory using the specific path provided.
from google.colab import drive
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

# Correct path containing the space in My Drive.
PROJECT_ROOT = '/content/drive/My Drive/projects/EarthShader'
LIB_DIR = os.path.join(PROJECT_ROOT, 'lib')

# Make both the project root (for `lib.*` package imports) and the library
# directory (for direct module imports) importable. Both additions are guarded
# so that re-running this cell does not accumulate duplicate sys.path entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)
if LIB_DIR not in sys.path:
    sys.path.insert(0, LIB_DIR)

# Configuration for the Stage 1 dataset.
DATASET_ROOT = os.path.join(PROJECT_ROOT, 'dataset/stage1')
IMAGES_DIR = os.path.join(DATASET_ROOT, 'images')    # Rendered PNG files.
JSONL_PATH = os.path.join(DATASET_ROOT, 'dataset.jsonl')  # One JSON record per line.
NUM_SAMPLES = 5000  # Target number of samples to generate.
IMG_SIZE = 512      # Passed as both width and height to the renderer.

# Creating IMAGES_DIR also creates DATASET_ROOT, its parent.
os.makedirs(IMAGES_DIR, exist_ok=True)
print(f"Setup complete. Dataset will be saved to: {DATASET_ROOT}")

In [None]:
# @title 2. Import generator logic
# Bring `generate_primitive` and `ShaderRenderer` into the notebook namespace.
# A normal package import is tried first; if it raises ImportError, the source
# files are loaded directly from LIB_DIR by file path instead.
try:
    # Attempt standard package import first.
    from lib.generators.primitives import generate_primitive
    from lib.gl_renderer import ShaderRenderer
    print("Library loaded successfully via standard import.")
except ImportError as e:
    print(f"Standard import failed ({e}). Trying direct file load...")

    # Direct loading if Colab pathing is inconsistent.
    import importlib.util

    # Use the PROJECT_ROOT defined in Cell 1.
    LIB_DIR = os.path.join(PROJECT_ROOT, 'lib')

    # Load the renderer module.
    # NOTE(review): none of the modules loaded below are registered in
    # sys.modules, so import statements inside those files that refer to each
    # other will not resolve to these objects -- confirm against the lib sources.
    spec_r = importlib.util.spec_from_file_location("gl_renderer", os.path.join(LIB_DIR, "gl_renderer.py"))
    mod_r = importlib.util.module_from_spec(spec_r)
    spec_r.loader.exec_module(mod_r)
    ShaderRenderer = mod_r.ShaderRenderer

    # Load the base generator module.
    spec_b = importlib.util.spec_from_file_location("base", os.path.join(LIB_DIR, "generators/base.py"))
    mod_b = importlib.util.module_from_spec(spec_b)
    spec_b.loader.exec_module(mod_b)

    # Load the primitives generator module and manually inject the base dependency.
    # The attribute is set BEFORE exec_module so that a global name `base` is
    # already defined while primitives.py executes.
    # NOTE(review): this only helps if primitives.py reads `base` as a plain
    # global; presumably it does -- verify how primitives.py references base.
    spec_p = importlib.util.spec_from_file_location("primitives", os.path.join(LIB_DIR, "generators/primitives.py"))
    mod_p = importlib.util.module_from_spec(spec_p)
    mod_p.base = mod_b
    spec_p.loader.exec_module(mod_p)

    generate_primitive = mod_p.generate_primitive
    print("Direct file load successful.")

# Verify that the generator is working correctly before starting the bulk run.
# generate_primitive is called with index 0 and must return a 2-item result
# (code, analysis); the first 200 characters of the analysis are printed.
print("\n--- Testing Generator ---")
test_code, test_analysis = generate_primitive(0)
print(f"Generator test passed. Sample analysis:\n{test_analysis[:200]}...")

In [None]:
# @title 3. Generation loop and storage
# Set to True to wipe existing data and start over, or False to resume.
FORCE_REGEN = True

# Reuse the renderer when this cell is re-run in the same session.
if 'renderer' not in locals():
    renderer = ShaderRenderer(width=IMG_SIZE, height=IMG_SIZE)
    print("Renderer initialized.")

dataset_entries = []  # Records buffered in memory, not yet written to JSONL_PATH.
failed_count = 0      # Samples that failed to generate or render during this run.
saved_count = 0       # Samples successfully produced during this run.

# Track statistics for dataset distribution (samples produced in this run only).
stats = {
    'single_circle': 0,
    'single_square': 0,
    'single_ring': 0,
    'composition': 0
}


def _recorded_sample_indices(jsonl_path):
    """Return the set of sample indices that already have a registry record.

    The index is recovered from the ``stage1_<index>.png`` file name stored in
    each record's "image_path". Blank lines, unparsable lines, and records
    whose file name does not follow that pattern are ignored.
    """
    prefix, suffix = "stage1_", ".png"
    indices = set()
    with open(jsonl_path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                name = os.path.basename(json.loads(line)["image_path"])
            except (ValueError, KeyError, TypeError):
                continue
            digits = name[len(prefix):-len(suffix)]
            if name.startswith(prefix) and name.endswith(suffix) and digits.isdecimal():
                indices.add(int(digits))
    return indices


def _flush_entries(entries):
    """Append `entries` to the JSONL registry, one JSON object per line."""
    if entries:
        with open(JSONL_PATH, 'a') as f:
            f.writelines(json.dumps(e) + '\n' for e in entries)


if FORCE_REGEN:
    print("[FORCE REGEN] Wiping existing registry...")
    # Opening in 'w' mode truncates the registry (or creates it).
    with open(JSONL_PATH, 'w') as f:
        pass
    done_indices = set()
elif os.path.exists(JSONL_PATH):
    done_indices = _recorded_sample_indices(JSONL_PATH)
    print(f"Found {len(done_indices)} existing samples.")
else:
    with open(JSONL_PATH, 'w') as f:
        pass
    done_indices = set()

existing_count = len(done_indices)

# Resume by index rather than by line count: after failed samples the line
# count is smaller than the highest index already processed, so restarting at
# "line count" would re-render finished samples and append duplicate records.
pending_indices = [i for i in range(NUM_SAMPLES) if i not in done_indices]

if pending_indices:
    print(f"Generating {len(pending_indices)} missing samples (target: {NUM_SAMPLES})...")
    pbar = tqdm(pending_indices, desc="Generating")

    try:
        for i in pbar:
            try:
                # Call the SDF-focused generator.
                code, analysis = generate_primitive(i)

                filename = f"stage1_{i:05d}.png"
                filepath = os.path.join(IMAGES_DIR, filename)

                # Render the GLSL code to a file.
                success = renderer.render(code, filepath)

                if success:
                    # Update distribution statistics. 'Composition' is tested
                    # first, so an analysis that mentions it is never counted
                    # as a single shape.
                    if 'Composition' in analysis:
                        stats['composition'] += 1
                    elif 'Circle' in analysis:
                        stats['single_circle'] += 1
                    elif 'Square' in analysis:
                        stats['single_square'] += 1
                    elif 'Ring' in analysis:
                        stats['single_ring'] += 1

                    entry = {
                        "image_path": filepath,
                        "analysis": analysis,
                        "code": code
                    }
                    dataset_entries.append(entry)
                    saved_count += 1

                    # Incremental Save to prevent data loss.
                    if len(dataset_entries) >= 100:
                        _flush_entries(dataset_entries)
                        dataset_entries = []

                        pbar.set_postfix({
                            'fails': failed_count,
                            'comp': stats['composition']
                        })
                else:
                    failed_count += 1
                    pbar.set_description(f"Gen ({failed_count} fails)")

            except Exception as e:
                # Best effort: report the sample, count it as failed, keep going.
                print(f"\n[CRITICAL ERROR] Sample {i}: {e}")
                failed_count += 1
    finally:
        # Final flush for any remaining entries. Running it in `finally` also
        # saves the buffered records when the loop is interrupted.
        _flush_entries(dataset_entries)
        dataset_entries = []

    print("\n" + "="*60)
    print("GENERATION COMPLETE")
    print("="*60)
    # Records found at start plus samples produced by this run.
    print(f"Total Samples Saved: {existing_count + saved_count}")
    print(f"Final Distribution: {stats}")
else:
    print("Dataset already complete.")

In [None]:
# @title 4. Validator and strict canonical check
import re
import json
import os

def validate_canonical_data(jsonl_path=None):
    """
    Analyzes the dataset to ensure high logic diversity and low code variance.

    Reads the JSONL registry, and for every record's "code" field counts
    (a) the distinct letter-only skeletons of the code with /* ... */ comments
    removed and (b) the distinct "/* ANALYSIS ... */" comment bodies. A short
    report is printed; nothing is returned.

    Args:
        jsonl_path: Path of the registry to analyze. Defaults to the
            notebook-level JSONL_PATH when omitted.
    """
    if jsonl_path is None:
        jsonl_path = JSONL_PATH

    if not os.path.exists(jsonl_path):
        print("Dataset not found. Please run the generation loop first.")
        return

    print("Analyzing dataset structure for SDF reasoning diversity...\n")
    with open(jsonl_path, 'r') as f:
        # Blank lines (e.g. a stray trailing newline) are skipped rather than
        # handed to json.loads, which would raise on them.
        entries = [json.loads(line) for line in f if line.strip()]

    total = len(entries)
    if total == 0:
        # Without samples the thresholds below would print a meaningless
        # PASS/FAIL verdict, so stop here.
        print("Dataset registry is empty. Nothing to validate.")
        return

    comment_pattern = re.compile(r'/\*.*?\*/', flags=re.DOTALL)
    analysis_pattern = re.compile(r'/\* ANALYSIS\n(.*?)\n\*/', flags=re.DOTALL)

    unique_code_logic = set()
    unique_analyses = set()

    for entry in entries:
        full_code = entry['code']

        # 1. Extract pure GLSL logic by removing the analysis comments.
        glsl_logic = comment_pattern.sub('', full_code)

        # 2. Normalize logic to check for structural consistency.
        # Only alphabetic characters are kept, so digits, whitespace, operators
        # and punctuation are all discarded before comparing.
        normalized_logic = ''.join(c for c in glsl_logic if c.isalpha())
        unique_code_logic.add(normalized_logic)

        # 3. Extract the Analysis block to verify variety in reasoning.
        analysis_match = analysis_pattern.search(full_code)
        if analysis_match:
            unique_analyses.add(analysis_match.group(1).strip())

    print(f"Metrics for {total} samples:")
    print(f"   Unique Logic Structures: {len(unique_code_logic)}")
    print(f"   Unique Analysis Blocks:  {len(unique_analyses)}")

    # Validation Report
    print("\nReport:")

    # We expect some variance due to variable randomization, but the structure should be stable.
    if len(unique_code_logic) < 100:
        print("   [PASS] Code structure is consistent and follows SDF patterns.")
    else:
        print(f"   [WARNING] High logic variance ({len(unique_code_logic)}). Check variable randomization.")

    # High analysis variance is the primary goal for teaching reasoning.
    # The pass threshold is more than 5% distinct analysis blocks.
    if len(unique_analyses) > (total * 0.05):
        print("   [PASS] Analysis reasoning is sufficiently diverse.")
    else:
        print("   [FAIL] Analysis is repetitive. Risk of model overfitting.")

# Run the validation right away so the report is printed as this cell's output.
validate_canonical_data()

In [None]:
# @title 5. Inspector and visual audit
import random
from IPython.display import display, HTML
import base64
import json
import os

def inspect_random_sample():
    """
    Show one randomly chosen dataset record: its GLSL code beside the rendered
    image, followed by the analysis text.

    Reads the registry at JSONL_PATH. Prints a message and returns early when
    the registry file is missing, when it contains no lines, or when the
    chosen record's image file does not exist. Returns None.
    """
    if not os.path.exists(JSONL_PATH):
        print(f"Dataset registry not found at {JSONL_PATH}.")
        return

    # Read every record so that one can be drawn at random.
    with open(JSONL_PATH, 'r') as registry:
        records = registry.readlines()

    if len(records) == 0:
        print("Dataset registry is empty.")
        return

    # Draw a record and pull out the three fields used below.
    sample = json.loads(random.choice(records))
    img_path, code, analysis = sample['image_path'], sample['code'], sample['analysis']

    if not os.path.exists(img_path):
        print(f"Image file missing at path: {img_path}")
        return

    # The image is embedded in the page as a base64 data URI.
    with open(img_path, 'rb') as image_file:
        img_data = base64.b64encode(image_file.read()).decode()

    # Escape the HTML-significant characters; '&' must be handled first so the
    # entities produced for '<' and '>' are not escaped a second time.
    code_html = code
    for raw, entity in (('&', '&amp;'), ('<', '&lt;'), ('>', '&gt;')):
        code_html = code_html.replace(raw, entity)

    # Two-column layout: code on the left, rendered image on the right.
    page = f"""
        <div style="display: flex; gap: 20px; font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;">
            <div style="flex: 1; background: #fafafa; padding: 15px; border: 1px solid #eee; border-radius: 8px; overflow-x: auto;">
                <h3 style="margin-top: 0; color: #333;">Generated GLSL Logic</h3>
                <pre style="white-space: pre-wrap; font-size: 12px; color: #444;">{code_html}</pre>
            </div>
            <div style="flex: 1; text-align: center;">
                <h3 style="margin-top: 0; color: #333;">Rendered Output</h3>
                <img src="data:image/png;base64,{img_data}" style="max-width: 100%; border-radius: 4px; border: 1px solid #ddd; box-shadow: 0 2px 5px rgba(0,0,0,0.1);">
            </div>
        </div>
        """

    display(HTML(page))

    # The analysis text goes below the visual, framed by separator rules.
    rule = "="*70
    print("\n" + rule)
    print("SDF REASONING ANALYSIS:")
    print(rule)
    print(analysis)
    print(rule)

# Run the inspector: each execution of this cell draws one random record.
inspect_random_sample()

# Suggestion for the user to verify diversity by re-running the cell.
print("\nTip: Run this cell multiple times to see different SDF variable names and reasoning patterns.")