# Stable Diffusion Demo 

This notebook demonstrates the Stable Diffusion pipeline with visualization of latent representations at each denoising step.

In [1]:
import model_loader
import pipeline
from PIL import Image
from transformers import CLIPTokenizer
import torch

# Default to CPU; an accelerator is selected only when explicitly allowed below.
DEVICE = "cpu"

# Opt-in switches: set to True to let the notebook use the matching backend.
ALLOW_CUDA = False
ALLOW_MPS = False

# torch.backends.mps does not exist on older PyTorch builds, so look it up
# defensively. The deprecated torch.has_mps attribute is deliberately avoided:
# merely evaluating it emits a deprecation warning, and it only says whether
# PyTorch was *built* with MPS support, not whether the device is usable.
_mps_backend = getattr(torch.backends, "mps", None)

# Check the opt-in flag first so a disabled backend is never probed.
if ALLOW_CUDA and torch.cuda.is_available():
    DEVICE = "cuda"
elif ALLOW_MPS and _mps_backend is not None and _mps_backend.is_available():
    DEVICE = "mps"
print(f"Using device: {DEVICE}")

  from .autonotebook import tqdm as notebook_tqdm


Using device: cpu


  elif (torch.has_mps or torch.backends.mps.is_available()) and ALLOW_MPS:


In [2]:
# Load the Stable Diffusion weights and the CLIP tokenizer from ../data.
model_file = "../data/v1-5-pruned-emaonly.ckpt"

# Tokenizer is built from the vocabulary and BPE merges files.
tokenizer = CLIPTokenizer(
    "../data/vocab.json",
    merges_file="../data/merges.txt",
)

# Checkpoint is loaded onto the device chosen in the setup cell.
models = model_loader.preload_models_from_standard_weights(
    model_file,
    DEVICE,
)

print("Models loaded successfully!")

Models loaded successfully!


In [3]:
# Configuration
prompt = "A dog wearing a red scarf, sitting in a dreamy flower field at golden hour, highly detailed, realistic style"
# NOTE(review): an unconditional/negative prompt normally lists things to steer
# away from rather than an instruction — confirm this wording is intended.
uncond_prompt = "do not change the dog's face, pose"
do_cfg = True
cfg_scale = 8  # min: 1, max: 14

# Image to image (optional)
input_image = None
# Comment out the next three statements to disable image to image
# (input_image then stays None).
image_path = "../images/dog.jpg"
input_image = Image.open(image_path).convert("RGB")
# Image.resize() returns a resized copy and leaves the original untouched, so
# the result has to be assigned back for the resize to take effect.
input_image = input_image.resize((512, 512))
strength = 0.8

# Sampler settings
sampler = "ddpm"
num_inference_steps = 18  # Reduced for faster execution and more frequent visualization
seed = 42

# Echo the effective settings so they are recorded next to the cell output.
print(f"Prompt: {prompt}")
print(f"Steps: {num_inference_steps}")
print(f"Seed: {seed}")
print(f"CFG Scale: {cfg_scale}")

Prompt: A dog wearing a red scarf, sitting in a dreamy flower field at golden hour, highly detailed, realistic style
Steps: 18
Seed: 42
CFG Scale: 8


In [None]:
import threading
import gradio as gr
from PIL import Image
import pipeline
import inpainting
import numpy as np
import random

# Global variable to control cancellation.
# A shared threading.Event that the txt2img / img2img handlers below hand to
# pipeline.generate(cancel_flag=...).
# NOTE(review): nothing in this cell ever calls cancel_flag.set() or .clear(),
# so cancellation looks unwired here — confirm whether another cell or UI
# control is supposed to drive it.
cancel_flag = threading.Event()

def generate_txt2img(prompt, strength, cfg_scale, num_inference_steps, seed):
    """Gradio handler for the Text-to-Image tab.

    Forwards the UI values to ``pipeline.generate`` with ``input_image=None``,
    an empty unconditional prompt, classifier-free guidance enabled and the
    "ddpm" sampler.

    Args:
        prompt: Text prompt; must contain at least one non-whitespace character.
        strength: Forwarded unchanged to ``pipeline.generate``.
            NOTE(review): with no input image this presumably has no effect —
            confirm against the pipeline.
        cfg_scale: Classifier-free guidance scale, forwarded unchanged.
        num_inference_steps: Number of sampler steps; coerced to ``int``.
        seed: Random seed. ``-1`` or ``None`` (an emptied number field) picks a
            random seed in ``[0, 999999]``; anything else is coerced to ``int``.

    Returns:
        A ``PIL.Image`` built from the pipeline output, or ``None`` when the
        pipeline itself returns ``None``.

    Raises:
        gr.Error: If the prompt is empty or whitespace only.
    """
    if not (prompt or "").strip():
        raise gr.Error("Prompt is required")

    # A gr.Number without precision=0 may deliver a float (or None when the
    # field is cleared). Normalise to a plain int so the seed handed to the
    # pipeline always has the type a RNG seed is expected to have.
    if seed is None or int(seed) == -1:
        seed = random.randint(0, 999999)
    else:
        seed = int(seed)

    output_image = pipeline.generate(
        prompt=prompt,
        uncond_prompt="",
        input_image=None,
        strength=strength,
        do_cfg=True,
        cfg_scale=cfg_scale,
        sampler_name="ddpm",
        # A step count is a whole number; guard against a float from the UI.
        n_inference_steps=int(num_inference_steps),
        seed=seed,
        models=models,
        device=DEVICE,
        idle_device="cpu",
        tokenizer=tokenizer,
        cancel_flag=cancel_flag,
    )
    # The pipeline may hand back None (presumably when cancelled via
    # cancel_flag); pass that through so the output component is left empty.
    if output_image is None:
        return None
    return Image.fromarray(output_image)

def generate_img2img(prompt, input_image, strength, cfg_scale, num_inference_steps, seed):
    """Gradio handler for the Image-to-Image tab.

    Forwards the UI values to ``pipeline.generate`` together with the uploaded
    image, an empty unconditional prompt, classifier-free guidance enabled and
    the "ddpm" sampler.

    Args:
        prompt: Text prompt; must contain at least one non-whitespace character.
        input_image: Uploaded image (the UI supplies a PIL image); required.
        strength: Forwarded unchanged to ``pipeline.generate``.
        cfg_scale: Classifier-free guidance scale, forwarded unchanged.
        num_inference_steps: Number of sampler steps; coerced to ``int``.
        seed: Random seed. ``-1`` or ``None`` (an emptied number field) picks a
            random seed in ``[0, 999999]``; anything else is coerced to ``int``.

    Returns:
        A ``PIL.Image`` built from the pipeline output, or ``None`` when the
        pipeline itself returns ``None``.

    Raises:
        gr.Error: If no image was uploaded or the prompt is empty.
    """
    if input_image is None:
        raise gr.Error("Please upload an input image")
    if not (prompt or "").strip():
        raise gr.Error("Prompt is required")

    # A gr.Number without precision=0 may deliver a float (or None when the
    # field is cleared). Normalise to a plain int so the seed handed to the
    # pipeline always has the type a RNG seed is expected to have.
    if seed is None or int(seed) == -1:
        seed = random.randint(0, 999999)
    else:
        seed = int(seed)

    output_image = pipeline.generate(
        prompt=prompt,
        uncond_prompt="",
        input_image=input_image,
        strength=strength,
        do_cfg=True,
        cfg_scale=cfg_scale,
        sampler_name="ddpm",
        # A step count is a whole number; guard against a float from the UI.
        n_inference_steps=int(num_inference_steps),
        seed=seed,
        models=models,
        device=DEVICE,
        idle_device="cpu",
        tokenizer=tokenizer,
        cancel_flag=cancel_flag,
    )
    # The pipeline may hand back None (presumably when cancelled via
    # cancel_flag); pass that through so the output component is left empty.
    if output_image is None:
        return None
    return Image.fromarray(output_image)

def generate_inpaint(image, mask, prompt, negative_prompt, strength, cfg_scale, num_steps, seed):
    """Gradio handler for the Inpainting tab.

    Converts the mask to single-channel ("L") at the image's size, runs
    ``inpainting.inpaint`` with classifier-free guidance and the "ddpm"
    sampler, and resizes the result back to the input image's size.

    Args:
        image: Uploaded PIL image to inpaint; required.
        mask: Uploaded PIL mask image; required. It is resized to ``image.size``.
        prompt: Text prompt; must contain at least one non-whitespace character.
        negative_prompt: Passed as ``uncond_prompt``; ``None`` becomes ``""``.
        strength: Forwarded unchanged to ``inpainting.inpaint``.
        cfg_scale: Classifier-free guidance scale, forwarded unchanged.
        num_steps: Number of sampler steps; coerced to ``int``.
        seed: Random seed. ``-1`` or ``None`` (an emptied number field) picks a
            random seed in ``[0, 999999]``; anything else is coerced to ``int``.

    Returns:
        The inpainted ``PIL.Image`` at the original image size, or ``None`` if
        ``inpainting.inpaint`` returns ``None``.

    Raises:
        gr.Error: If the image or mask is missing, or the prompt is empty.
    """
    if image is None or mask is None:
        raise gr.Error("Please upload both image and mask")
    if not (prompt or "").strip():
        raise gr.Error("Prompt is required")

    original_size = image.size
    # Single-channel mask at the same size as the image.
    mask_np = np.array(mask.convert("L").resize(original_size))

    # Same seed normalisation as the other handlers: tolerate an emptied
    # number field (None) and always pass a plain int downstream.
    if seed is None or int(seed) == -1:
        seed = random.randint(0, 999999)
    else:
        seed = int(seed)

    result = inpainting.inpaint(
        prompt=prompt,
        image=image,
        mask=mask_np,
        uncond_prompt=negative_prompt or "",
        strength=strength,
        do_cfg=True,
        cfg_scale=cfg_scale,
        sampler_name="ddpm",
        n_inference_steps=int(num_steps),
        models=models,
        tokenizer=tokenizer,
        seed=seed,
        device=DEVICE,
        idle_device="cpu"
    )
    # Mirror the txt2img/img2img handlers: a None result leaves the output
    # component empty instead of crashing inside Image.fromarray.
    if result is None:
        return None

    # Return the result at the size of the uploaded image.
    result_image = Image.fromarray(result)
    return result_image.resize(original_size, resample=Image.LANCZOS)

# Three-tab Gradio UI; each tab wires its widgets to the matching handler.
# The custom CSS hides Gradio's progress-bar elements.
with gr.Blocks(css=".progress-bar, .svelte-1ipelgc {display: none !important;}") as demo:
    gr.Markdown("# Stable Diffusion All-in-One Demo")

    with gr.Tabs():
        with gr.Tab("Text-to-Image"):
            t2i_prompt = gr.Textbox(label="Prompt", placeholder="Enter your prompt here...")
            t2i_strength = gr.Slider(0.1, 1.0, value=0.8, step=0.1, label="Strength")
            t2i_cfg = gr.Slider(1, 14, value=8, step=1, label="CFG Scale")
            t2i_steps = gr.Slider(1, 50, value=18, step=1, label="Number of Inference Steps")
            # precision=0 makes the component hand the handler an int, matching
            # the Inpainting tab's seed field.
            t2i_seed = gr.Number(value=-1, precision=0, label="Seed (-1 = random)")
            t2i_btn = gr.Button("Generate")
            t2i_output = gr.Image(label="Generated Image")
            # Input order must match generate_txt2img's parameter order.
            t2i_btn.click(
                generate_txt2img,
                inputs=[t2i_prompt, t2i_strength, t2i_cfg, t2i_steps, t2i_seed],
                outputs=t2i_output,
            )

        with gr.Tab("Image-to-Image"):
            i2i_prompt = gr.Textbox(label="Prompt", placeholder="Enter your prompt here...")
            i2i_image = gr.Image(label="Input Image", type="pil")
            i2i_strength = gr.Slider(0.1, 1.0, value=0.8, step=0.1, label="Strength")
            i2i_cfg = gr.Slider(1, 14, value=8, step=1, label="CFG Scale")
            i2i_steps = gr.Slider(1, 50, value=18, step=1, label="Number of Inference Steps")
            # precision=0 makes the component hand the handler an int, matching
            # the Inpainting tab's seed field.
            i2i_seed = gr.Number(value=-1, precision=0, label="Seed (-1 = random)")
            i2i_btn = gr.Button("Generate")
            i2i_output = gr.Image(label="Generated Image")
            # Input order must match generate_img2img's parameter order.
            i2i_btn.click(
                generate_img2img,
                inputs=[i2i_prompt, i2i_image, i2i_strength, i2i_cfg, i2i_steps, i2i_seed],
                outputs=i2i_output,
            )

        with gr.Tab("Inpainting"):
            inp_image = gr.Image(label="Upload Your Image", type="pil")
            inp_mask = gr.Image(label="Draw Mask (white = inpaint)", type="pil")
            inp_prompt = gr.Textbox(label="📝 Prompt", lines=2, placeholder="e.g. a mountain with a castle")
            inp_negative = gr.Textbox(label="🚫 Negative Prompt", value="blurry, low quality", lines=1)
            inp_strength = gr.Slider(0.1, 1.0, step=0.1, value=0.8, label="Strength")
            inp_cfg = gr.Slider(1.0, 20.0, step=0.5, value=7.5, label="CFG Scale")
            inp_steps = gr.Slider(10, 100, step=5, value=30, label="Denoising Steps")
            inp_seed = gr.Number(value=-1, precision=0, label="Seed (-1 = random)")
            inp_btn = gr.Button("Generate")
            inp_output = gr.Image(label="🖼️ Output Image")
            # Input order must match generate_inpaint's parameter order.
            inp_btn.click(
                generate_inpaint,
                inputs=[inp_image, inp_mask, inp_prompt, inp_negative, inp_strength, inp_cfg, inp_steps, inp_seed],
                outputs=inp_output,
            )

# share=True additionally requests a public share link for the app.
demo.launch(share=True)

* Running on local URL:  http://127.0.0.1:7860

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.




Using DDPM sampler with 5 steps


  plt.show()
100%|██████████| 5/5 [03:36<00:00, 43.23s/it]


## Understanding the Latent Visualization

### What are Latents?
- **Latents** are the compressed representation of images in a lower-dimensional space (64×64×4 instead of 512×512×3 for a 512×512 image, i.e. 48× fewer values)
- The **4 channels** represent different aspects of the image content
- During diffusion, noise is gradually removed from these latents to form the final image

### Denoising Process:
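1. **Step 0**: Pure random noise in latent space (for image-to-image, a noised encoding of the input image, with the amount of noise controlled by `strength`)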
2. **Early Steps**: Rough shapes and structures begin to emerge
3. **Middle Steps**: More defined features and composition
4. **Late Steps**: Fine details and refinement
5. **Final Step**: Clean latents that decode to the final image

### Channel Interpretation:
- Each of the 4 latent channels captures different aspects of the image
- The exact meaning of each channel is learned during training
- Generally, they represent different frequency components and feature maps

### Visualization Benefits:
- **Debug generation**: See where the process might be going wrong
- **Understand timing**: Observe when key features appear
- **Compare prompts**: See how different prompts affect the denoising trajectory
- **Optimize parameters**: Adjust CFG scale, steps, etc. based on intermediate results