**Stage 2: Datagen**

In [None]:
# @title 1. Environment and library setup
import os
import sys
from google.colab import drive

# Project files live on Google Drive; mount it only when the mount point
# does not exist yet in this runtime.
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

# Project layout: everything hangs off PROJECT_ROOT.
PROJECT_ROOT = '/content/drive/MyDrive/projects/EarthShader'
LIB_PATH = os.path.join(PROJECT_ROOT, 'lib')
DATASET_STAGE2 = os.path.join(PROJECT_ROOT, 'dataset/stage2')

# Make modules under LIB_PATH importable; the membership test keeps
# sys.path free of duplicates when the cell is re-run.
if LIB_PATH not in sys.path:
    sys.path.append(LIB_PATH)

# Create the Stage 2 output folder and its image subfolder if missing.
for _stage2_dir in (DATASET_STAGE2, os.path.join(DATASET_STAGE2, 'images')):
    os.makedirs(_stage2_dir, exist_ok=True)

# Install the rendering and supporting dependencies (-q keeps pip output quiet).
!pip install -q moderngl Pillow numpy tqdm

In [None]:
# @title 2. Logic initialization and generator setup
import json
import random
from tqdm import tqdm
from gl_renderer import ShaderRenderer
from generators.composition import generate_boolean_composition

# Create the renderer shared by the cells below. The output is square:
# the same pixel count is passed for both width and height.
RENDER_SIZE = 256
renderer = ShaderRenderer(width=RENDER_SIZE, height=RENDER_SIZE)

In [None]:
# @title 3. Production run and dataset commitment
# Production run: generate NUM_SAMPLES boolean-composition samples, render
# each one to a PNG under DATASET_STAGE2/images, and write one JSONL record
# per successfully rendered sample to output_jsonl.
NUM_SAMPLES = 2000
dataset_log = []    # Manifest records for samples that rendered successfully.
failed_seeds = []   # Seeds whose render call reported failure.
output_jsonl = os.path.join(DATASET_STAGE2, 'dataset.jsonl')

print(f"Generating {NUM_SAMPLES} boolean composition samples...")

for i in tqdm(range(NUM_SAMPLES)):
    # Seeds start at 20000 so that they follow the Stage 1 sequence.
    # NOTE(review): the Stage 1 seed range is not visible here - confirm.
    seed = 20000 + i
    code, analysis = generate_boolean_composition(seed)

    img_filename = f"csg_{seed}.png"
    img_path = os.path.join(DATASET_STAGE2, 'images', img_filename)

    # Attempt to render the boolean field; render() reports the outcome as
    # a (success, error) pair.
    success, error = renderer.render(code, img_path)

    if success:
        dataset_log.append({
            "image_path": img_path,
            "analysis": analysis,
            "code": code,
            "metadata": {
                "seed": seed,
                "stage": 2
            }
        })
    else:
        # Log failures to catch math edge cases early. tqdm.write keeps the
        # message from interleaving with the active progress bar.
        failed_seeds.append(seed)
        tqdm.write(f"Warning: Seed {seed} failed to render: {error}")

# Save the final dataset manifest in JSONL format (one JSON object per line).
# The encoding is pinned so the file does not depend on the runtime's locale.
with open(output_jsonl, 'w', encoding='utf-8') as f:
    f.writelines(json.dumps(entry) + '\n' for entry in dataset_log)

# Only report success when at least one sample made it into the manifest.
if dataset_log:
    print(f"\n[SUCCESS] Stage 2 dataset created with {len(dataset_log)} valid samples.")
else:
    print("\n[ERROR] Stage 2 run produced no valid samples; the manifest is empty.")

if failed_seeds:
    print(f"{len(failed_seeds)} of {NUM_SAMPLES} seeds failed to render: {failed_seeds}")

In [None]:
# @title 4. Verify dataset registry
import os

# Registry sanity check: report where the manifest is, how large it is,
# how many lines it holds, and what the last record looks like.
if not os.path.exists(output_jsonl):
    print(f"ERROR: Dataset registry not found at {output_jsonl}")
else:
    # File size converted from bytes by dividing by 1024 * 1024.
    size = os.path.getsize(output_jsonl) / (1024 * 1024)
    print(f"Registry found: {output_jsonl}")
    print(f"Current file size: {size:.2f} MB")

    # The manifest is JSONL, so the line count is the entry count.
    with open(output_jsonl, 'r') as f:
        lines = f.readlines()

    print(f"Total valid entries: {len(lines)}")

    if lines:
        # Show the start of the last record as a quick check on where the
        # run ended (failed renders make the count lower than NUM_SAMPLES).
        print(f"Last entry preview: {lines[-1][:120]}...")
    else:
        print("Warning: Registry is empty.")

In [None]:
# @title 5. Quick dataset audit
import json

# Spot-check the Stage 2 registry: print the analysis text and the start of
# the shader code for up to the first three records.
with open(output_jsonl, 'r') as f:
    print("Previewing first 3 Stage 2 samples:\n" + "-"*50)
    # zip() stops after three records or at end of file, whichever is first.
    for sample_no, line in zip(range(1, 4), f):
        entry = json.loads(line)
        print(f"Sample {sample_no} Analysis: {entry['analysis']}")
        # Flatten newlines so the code preview stays on a single line.
        code_preview = entry['code'][:100].replace('\n', ' ')
        print(f"Sample {sample_no} Code Preview: {code_preview}...")
        print("-"*50)