# Generate Sample Dataset for ComfyUI Spark Integration

This notebook creates a sample dataset containing image descriptions and metadata that we'll use to demonstrate the Spark integration with ComfyUI.

In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import functions as F
import random
import json
from datetime import datetime

In [None]:
# Create the Spark session for this notebook, or reuse one that already
# exists. The session targets the master at spark://spark:7077 and requests
# 2g of memory for both the driver and each executor.
spark = (
    SparkSession.builder
    .appName("Sample Dataset Generator")
    .master("spark://spark:7077")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "2g")
    .getOrCreate()
)

In [None]:
# Sample data generation functions
def generate_scene_description():
    """Return a random image prompt of the form "<scene> <style>".

    One scene and one style phrase are each picked uniformly at random
    (via the module-level ``random`` generator) and joined with a single
    space. The scene is drawn first, then the style.
    """
    scene_options = (
        "A serene mountain landscape at sunset",
        "A bustling cityscape at night",
        "A peaceful garden with blooming flowers",
        "A mysterious forest in the fog",
        "A cozy cabin by a lake",
    )
    style_options = (
        "in the style of Van Gogh",
        "with a cyberpunk aesthetic",
        "in watercolor",
        "as an oil painting",
        "with a minimalist design",
    )
    # Draw order matters for reproducibility under a fixed seed.
    chosen_scene = random.choice(scene_options)
    chosen_style = random.choice(style_options)
    return " ".join((chosen_scene, chosen_style))

def generate_metadata():
    """Build a random metadata record for one sample.

    Returns:
        A dict with three keys, in this order:

        * ``"timestamp"`` - current local time as an ISO 8601 string.
        * ``"style_params"`` - dict of floats drawn with ``random.uniform``:
          ``"strength"`` from (0.5, 1.0), ``"noise"`` from (0.1, 0.5) and
          ``"contrast"`` from (0.8, 1.2).
        * ``"tags"`` - list of 3 distinct tags sampled from a fixed pool
          of eight.
    """
    created_at = datetime.now().isoformat()

    # Keep the draws in this order (strength, noise, contrast, tags) so a
    # seeded run produces the same values as before.
    strength = random.uniform(0.5, 1.0)
    noise = random.uniform(0.1, 0.5)
    contrast = random.uniform(0.8, 1.2)

    tag_pool = [
        "nature", "urban", "abstract", "landscape",
        "artistic", "modern", "traditional", "minimal",
    ]
    picked_tags = random.sample(tag_pool, 3)

    return {
        "timestamp": created_at,
        "style_params": {
            "strength": strength,
            "noise": noise,
            "contrast": contrast,
        },
        "tags": picked_tags,
    }

In [None]:
# Build the rows on the driver. Each row is
# (id, description, metadata serialized as a JSON string).
num_samples = 100
data = [
    (sample_id, generate_scene_description(), json.dumps(generate_metadata()))
    for sample_id in range(num_samples)
]

# Explicit schema: all three columns are declared non-nullable, and the
# metadata column stays a plain string holding the JSON text.
schema = (
    StructType()
    .add("id", IntegerType(), False)
    .add("description", StringType(), False)
    .add("metadata", StringType(), False)
)

# Turn the in-memory rows into a Spark DataFrame using that schema.
df = spark.createDataFrame(data, schema=schema)

In [None]:
# Preview the first five rows; truncate=False prints full column values
# so the whole description and metadata JSON are visible.
df.show(n=5, truncate=False)

In [None]:
# Write the DataFrame as Parquet; "overwrite" mode replaces any existing
# output at the target path.
output_path = "/data/sample_dataset.parquet"
df.write.parquet(output_path, mode="overwrite")
print(f"Dataset saved to {output_path}")

In [None]:
# Describe the generated dataset (name, creation time, row count requested,
# column name -> Spark data type string) and store it as a JSON file.
dataset_metadata = {
    "name": "sample_dataset",
    "created_at": datetime.now().isoformat(),
    "num_samples": num_samples,
    "schema": dict(
        (column.name, str(column.dataType)) for column in schema.fields
    ),
    "description": "Sample dataset for ComfyUI Spark integration demo"
}

# Serialize with a 2-space indent and write the text out in one go.
with open("/data/sample_dataset_metadata.json", "w") as f:
    f.write(json.dumps(dataset_metadata, indent=2))
print("Dataset metadata saved")