**Stage 1 - Data Generation**

In [None]:
# @title 1. Setup & Dependencies
import sys
import os
import subprocess
import importlib.util

# Probe for moderngl. If the import fails (typically a fresh runtime), install
# the system OpenGL library and the Python package, then import again.
print("Installing dependencies...")
try:
    import moderngl
except ImportError:
    install_commands = (
        ['apt-get', 'install', '-y', 'libgl1-mesa-glx'],
        [sys.executable, "-m", "pip", "install", "moderngl"],
    )
    # check_call raises CalledProcessError on a non-zero exit, so a failed
    # system install stops before pip is attempted.
    for command in install_commands:
        subprocess.check_call(command)
    import moderngl

import random
import json
from tqdm import tqdm
import numpy as np
from PIL import Image

# Mount Google Drive unless /content/drive already exists (e.g. the cell is
# being re-run in a live session).
from google.colab import drive
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

PROJECT_ROOT = '/content/drive/MyDrive/projects/EarthShader'
LIB_DIR = os.path.join(PROJECT_ROOT, 'lib')

# Ensure we can import from lib.
# PROJECT_ROOT makes `lib.<module>` importable; LIB_DIR (placed first) makes the
# modules importable by bare name. Both additions are guarded so re-running
# this cell does not keep appending duplicate entries to sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)
if LIB_DIR not in sys.path:
    sys.path.insert(0, LIB_DIR)

# CONFIG
DATASET_ROOT = os.path.join(PROJECT_ROOT, 'dataset/stage1')
IMAGES_DIR = os.path.join(DATASET_ROOT, 'images')     # rendered PNGs
JSONL_PATH = os.path.join(DATASET_ROOT, 'dataset.jsonl')  # one JSON entry per line
NUM_SAMPLES = 1500  # target number of sample indices to generate
IMG_SIZE = 512      # passed as both width and height to the renderer

# Creates DATASET_ROOT as well, since IMAGES_DIR is nested inside it.
os.makedirs(IMAGES_DIR, exist_ok=True)
print(f"Setup complete. Dataset will be saved to: {DATASET_ROOT}")

In [None]:
# @title 2. Import Generator Logic
# Bind `generate_primitive` and `ShaderRenderer` for the later cells. The
# package import is tried first; if it raises ImportError, the three source
# files are loaded directly from LIB_DIR by path.
try:
    # Try standard package import
    from lib.generators.primitives import generate_primitive
    from lib.gl_renderer import ShaderRenderer
    print("Library loaded successfully via standard import.")
except ImportError as e:
    print(f"Standard import failed ({e}). Trying direct file load...")

    # Direct loading if Colab pathing is fighting us
    import importlib.util

    # Load Renderer
    spec_r = importlib.util.spec_from_file_location("gl_renderer", os.path.join(LIB_DIR, "gl_renderer.py"))
    mod_r = importlib.util.module_from_spec(spec_r)
    spec_r.loader.exec_module(mod_r)
    ShaderRenderer = mod_r.ShaderRenderer

    # Load Base (executed before primitives so it can be handed to it below)
    spec_b = importlib.util.spec_from_file_location("base", os.path.join(LIB_DIR, "generators/base.py"))
    mod_b = importlib.util.module_from_spec(spec_b)
    spec_b.loader.exec_module(mod_b)

    # Load Primitives (injecting base dependency manually)
    spec_p = importlib.util.spec_from_file_location("primitives", os.path.join(LIB_DIR, "generators/primitives.py"))
    mod_p = importlib.util.module_from_spec(spec_p)
    # NOTE(review): this only sets a module attribute named `base` before the
    # module body runs. It presumably does not satisfy a `from .base import ...`
    # statement, since none of these modules is registered in sys.modules or
    # given a parent package -- verify against how primitives.py imports base.
    mod_p.base = mod_b # Mock the relative import
    spec_p.loader.exec_module(mod_p)

    generate_primitive = mod_p.generate_primitive
    print("Direct file load successful.")

# Quick test to verify generator works
# generate_primitive(0) is unpacked into a (code, analysis) pair; the analysis
# is sliced to 200 characters for display (presumably a string -- confirm).
print("\n--- Testing Generator ---")
test_code, test_analysis = generate_primitive(0)
print(f"Generator test passed. Sample analysis:\n{test_analysis[:200]}...")

In [None]:
# @title 3. Generation Loop (Canonical Mode)

# CONFIG
FORCE_REGEN = True  # Set to True to wipe existing data and start over


def _flush_entries(entries, path):
    """Append ``entries`` to the JSONL file at ``path``, one JSON object per line."""
    if not entries:
        return
    with open(path, 'a') as f:
        f.writelines(json.dumps(e) + '\n' for e in entries)


def _scan_registry(path):
    """Return ``(entry_count, next_index)`` for the JSONL registry at ``path``.

    ``entry_count`` is the number of non-blank lines. ``next_index`` is one past
    the highest sample index found in the entries' ``image_path`` file names
    (``stage1_<index>.png``). Failed samples are never written to the registry,
    so the line count can be smaller than the last index already generated;
    resuming from the line count would regenerate those indices and append
    duplicate entries. Falls back to ``entry_count`` when no index can be parsed.
    """
    entry_count = 0
    highest_index = -1
    with open(path, 'r') as f:
        for line in f:
            if not line.strip():
                continue
            entry_count += 1
            try:
                image_name = os.path.basename(json.loads(line)['image_path'])
                stem = os.path.splitext(image_name)[0]
                highest_index = max(highest_index, int(stem.rpartition('_')[2]))
            except (KeyError, TypeError, ValueError):
                # Malformed line or unexpected file name: counted, but it
                # contributes no index (JSONDecodeError is a ValueError).
                continue
    next_index = highest_index + 1 if highest_index >= 0 else entry_count
    return entry_count, next_index


# Reuse the renderer across re-runs of this cell instead of creating a new one.
if 'renderer' not in locals():
    renderer = ShaderRenderer(width=IMG_SIZE, height=IMG_SIZE)
    print("Renderer initialized")

# Resume Logic
dataset_entries = []  # entries rendered but not yet written to JSONL_PATH
failed_count = 0      # render failures + exceptions in this run
success_count = 0     # entries produced in this run

# Track statistics
stats = {
    'single_circle': 0,
    'single_square': 0,
    'single_ring': 0,
    'composition': 0
}

if FORCE_REGEN or not os.path.exists(JSONL_PATH):
    if FORCE_REGEN:
        print("[FORCE REGEN] Wiping existing registry...")
    with open(JSONL_PATH, 'w') as f:
        pass # Create empty file
    existing_count = 0
    start_index = 0
else:
    existing_count, start_index = _scan_registry(JSONL_PATH)
    print(f"Found {existing_count} existing samples.")

if start_index < NUM_SAMPLES:
    print(f"Generating samples {start_index} to {NUM_SAMPLES}...")
    print(f"Target: {NUM_SAMPLES} samples with Canonical Syntax + Diverse Analysis\n")

    pbar = tqdm(range(start_index, NUM_SAMPLES), desc="Generating")

    try:
        for i in pbar:
            try:
                # CALL THE LIBRARY
                code, analysis = generate_primitive(i)

                filename = f"stage1_{i:05d}.png"
                filepath = os.path.join(IMAGES_DIR, filename)

                # Render logic
                success = renderer.render(code, filepath)

                if success:
                    # Update Stats. The first matching keyword wins, so the
                    # order of these checks decides how a sample is counted.
                    if 'Composition' in analysis:
                        stats['composition'] += 1
                    elif 'Circle' in analysis or 'Radial' in analysis:
                        stats['single_circle'] += 1
                    elif 'Square' in analysis or 'Box' in analysis:
                        stats['single_square'] += 1
                    elif 'Ring' in analysis or 'Annulus' in analysis:
                        stats['single_ring'] += 1

                    entry = {
                        "image_path": filepath,
                        "analysis": analysis,
                        "code": code
                    }
                    dataset_entries.append(entry)
                    success_count += 1

                    # Incremental Save (Every 100 for performance)
                    if len(dataset_entries) >= 100:
                        _flush_entries(dataset_entries, JSONL_PATH)
                        dataset_entries = []

                        # Update progress bar
                        pbar.set_postfix({
                            'fails': failed_count,
                            'dbl': stats['composition']
                        })
                else:
                    # Handle Render Failure
                    failed_count += 1
                    pbar.set_description(f"Gen ({failed_count} fails)")

            except Exception as e:
                # Best effort: one bad sample must not abort the whole run.
                print(f"\n[CRITICAL ERROR] Sample {i}: {e}")
                failed_count += 1
    finally:
        # Final Flush. Runs on interruption too (e.g. KeyboardInterrupt), so
        # samples that were already rendered are not dropped from the registry.
        _flush_entries(dataset_entries, JSONL_PATH)
        dataset_entries = []
        pbar.close()

    # Print final statistics
    attempted = NUM_SAMPLES - start_index  # > 0 inside this branch
    print("\n" + "="*60)
    print("GENERATION COMPLETE (Canonical Mode)")
    print("="*60)
    print(f"Total Samples: {existing_count + success_count}")
    print(f"Failed: {failed_count} ({100*failed_count/attempted:.1f}%)")
    print(f"\nDistribution:")
    print(f"   Single Circle: {stats['single_circle']:>5}")
    print(f"   Single Square: {stats['single_square']:>5}")
    print(f"   Single Ring:   {stats['single_ring']:>5}")
    print(f"   Compositions:  {stats['composition']:>5}")
else:
    print("Dataset already complete.")

In [None]:
# @title 4. Validator (Strict Canonical Check)
import re
import json
import os

def validate_canonical_data():
    """Report whether the dataset's GLSL is canonical and its analyses diverse.

    Reads every non-blank line of the JSONL file at ``JSONL_PATH`` and prints:

    * the number of distinct code structures, where each entry's ``code`` has
      its ``/* ... */`` comments removed and is then reduced to its alphabetic
      characters only, so entries differing only in numbers, whitespace or
      symbols collapse to the same structure;
    * the number of distinct ``/* ANALYSIS ... */`` comment bodies found in
      the ``code`` field.

    The report passes code structure at 15 or fewer variants and passes
    analysis diversity above 20 distinct blocks.

    Returns:
        None. All results are printed. Returns early when the dataset file is
        missing or contains no entries.
    """
    if not os.path.exists(JSONL_PATH):
        print("Dataset not found.")
        return

    print("Analyzing dataset structure...\n")
    with open(JSONL_PATH, 'r') as f:
        # Skip blank lines (e.g. a stray trailing newline) instead of letting
        # json.loads raise on them.
        entries = [json.loads(line) for line in f if line.strip()]

    if not entries:
        # Zero variants would otherwise "pass" the low-variance check.
        print("Dataset is empty. Nothing to validate.")
        return

    # Compiled once, outside the per-entry loop.
    comment_pattern = re.compile(r'/\*.*?\*/', flags=re.DOTALL)
    analysis_pattern = re.compile(r'/\* ANALYSIS\n(.*?)\n\*/', flags=re.DOTALL)

    unique_code_logic = set()
    unique_analyses = set()

    for entry in entries:
        full_code = entry['code']

        # 1. Extract Logic Only (Remove C-style comments)
        # This strips the /* ANALYSIS ... */ block along with any other comment.
        glsl_logic = comment_pattern.sub('', full_code)

        # Normalize: Remove numbers, whitespace, and symbols to check PURE structure
        # This collapses "vec2 v = vec2(0.1, 0.2)" and "vec2 v = vec2(0.9, 0.9)" to the same string
        normalized_logic = ''.join(c for c in glsl_logic if c.isalpha())
        unique_code_logic.add(normalized_logic)

        # 2. Extract Analysis Only
        # We grab the text inside the comment block to check for diversity there
        analysis_match = analysis_pattern.search(full_code)
        if analysis_match:
            unique_analyses.add(analysis_match.group(1).strip())

    print(f"Metrics:")
    print(f"   Unique Logic Structures: {len(unique_code_logic)}")
    print(f"   Unique Analysis Blocks:  {len(unique_analyses)}")

    # VALIDATION LOGIC
    print("\nReport:")

    # We expect VERY LOW code variance (ideally < 10 for the 3 primitives + 3 compositions)
    if len(unique_code_logic) <= 15:
        print("   [PASS] Code Structure is Canonical (Low Variance)")
    else:
        print(f"   [FAIL] Too much code variation ({len(unique_code_logic)} variants). Check primitives.py.")

    # We expect HIGH analysis variance
    if len(unique_analyses) > 20:
        print("   [PASS] Analysis Reasoning is Diverse")
    else:
        print("   [FAIL] Analysis is repetitive. Model will overfit.")

# Run the validator against the dataset at JSONL_PATH; results are printed.
validate_canonical_data()

In [None]:
# @title 5. Inspector (Random Sample)
import random
from IPython.display import display, HTML
import base64

def inspect_random_sample():
    """Display one randomly chosen dataset entry: GLSL code beside its render.

    Picks a random non-blank line from the JSONL file at ``JSONL_PATH``, labels
    it by the first of the keywords inline / verbose / intermediate /
    alternative found in its analysis text (case-insensitive; "Unknown" when
    none match, with " + Composition" appended for composition samples), then
    shows the HTML-escaped code next to the base64-embedded image and prints
    the analysis text below.

    Returns:
        None. Prints a short message and returns early when the dataset file is
        missing, has no entries, or the entry's image file does not exist.
    """
    # Local import so the cell does not depend on an extra top-level import.
    from html import escape

    if not os.path.exists(JSONL_PATH):
        print(f"Waiting for data... {JSONL_PATH} not found yet.")
        return

    # Keep only non-blank lines: a blank line (e.g. a trailing newline) must
    # not be eligible for random.choice, because json.loads would fail on it.
    with open(JSONL_PATH, 'r') as f:
        lines = [line for line in f if line.strip()]

    if not lines:
        print("Dataset is empty.")
        return

    # Pick random entry
    entry = json.loads(random.choice(lines))

    img_path = entry['image_path']
    code = entry['code']
    analysis = entry['analysis']
    analysis_lower = analysis.lower()

    # Detect variant type for label (first matching keyword wins)
    variant_label = "Unknown"
    for keyword in ('inline', 'verbose', 'intermediate', 'alternative'):
        if keyword in analysis_lower:
            variant_label = keyword.capitalize()
            break

    if 'composition' in analysis_lower or 'two shapes' in analysis_lower:
        variant_label += " + Composition"

    if not os.path.exists(img_path):
        print(f"Image missing at path: {img_path}")
        return

    # Encode image to base64 so it can be embedded in the HTML as a data URI
    with open(img_path, 'rb') as f:
        img_data = base64.b64encode(f.read()).decode()

    # Escape &, < and > so the GLSL renders as text; quotes are left as-is
    # because the code is placed in element content, not in an attribute.
    code_html = escape(code, quote=False)

    # Display using HTML (side-by-side layout)
    markup = f"""
        <div style="display: flex; gap: 20px; font-family: monospace;">
            <div style="flex: 1; background: #f5f5f5; padding: 15px; border-radius: 5px; overflow-x: auto;">
                <h3 style="margin-top: 0;">Generated GLSL [{variant_label}]</h3>
                <pre style="white-space: pre-wrap; font-size: 11px;">{code_html}</pre>
            </div>
            <div style="flex: 1;">
                <h3 style="margin-top: 0;">Rendered Output</h3>
                <img src="data:image/png;base64,{img_data}" style="max-width: 100%; border: 1px solid #ccc;">
            </div>
        </div>
        """

    display(HTML(markup))

    # Print analysis separately
    print("\n" + "="*60)
    print("ANALYSIS BLOCK:")
    print("="*60)
    print(analysis)
    print("="*60)

# Run inspector (shows one randomly chosen entry per execution)
inspect_random_sample()

# Run multiple times to see variety
print("\nTIP: Run this cell multiple times to verify diversity across samples.")

In [None]:
import json

# Scan every dataset entry's GLSL for a run of eight zeros, which the original
# check treats as a floating point formatting artifact. Stops at the first hit.

# Prefer the JSONL_PATH configured in the setup cell so this check follows any
# change to the dataset location; fall back to the default path when this cell
# is run on its own.
try:
    artifact_scan_path = JSONL_PATH
except NameError:
    artifact_scan_path = '/content/drive/MyDrive/projects/EarthShader/dataset/stage1/dataset.jsonl'

has_artifacts = False
first_entry = None  # remembered during the scan so the file is read only once

with open(artifact_scan_path, 'r') as f:
    for i, line in enumerate(f):
        if not line.strip():
            continue  # tolerate blank lines instead of failing in json.loads
        entry = json.loads(line)
        code = entry['code']
        if first_entry is None:
            first_entry = entry

        # Check for floating point artifacts
        if '00000000' in code:
            print(f"Sample {i} HAS ARTIFACTS:")
            print(code[:500])
            has_artifacts = True
            break

if not has_artifacts:
    if first_entry is None:
        # An empty file has nothing to verify, so do not report success.
        print("Dataset is empty: nothing to check.")
    else:
        print("SUCCESS: No floating point artifacts found in dataset!")
        print("\nSample code from first entry:")
        print(first_entry['code'])