# Presentation Image Generator

We will train a stable diffusion model to generate custom diagrams from textual prompts

In [1]:
%pip install torch torchvision transformers diffusers accelerate datasets

Collecting diffusers
  Downloading diffusers-0.31.0-py3-none-any.whl.metadata (18 kB)
Downloading diffusers-0.31.0-py3-none-any.whl (2.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m43.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: diffusers
Successfully installed diffusers-0.31.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from datasets import load_dataset
from PIL import Image
import io
from diffusers import (
    StableDiffusionPipeline,
    UNet2DConditionModel,
    DDPMScheduler,
    AutoencoderKL,
)
from transformers import CLIPTextModel, CLIPTokenizer
from accelerate import Accelerator
from accelerate.utils import DistributedDataParallelKwargs
from pathlib import Path
from tqdm.auto import tqdm
import datetime

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [3]:
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [4]:
class AI2DDataset(Dataset):
    def __init__(self, hf_dataset, tokenizer, image_size=224):
        self.dataset = hf_dataset
        self.tokenizer = tokenizer
        
        # Print sample item structure
        if len(hf_dataset) > 0:
            sample_item = hf_dataset[0]
            print("\nSample item structure:")
            print("Keys:", sample_item.keys())
            print("Question type:", type(sample_item.get('question')))
            print("Options type:", type(sample_item.get('options')))
            if isinstance(sample_item.get('options'), list):
                print("Sample options:", sample_item['options'][:2])
        
        self.image_transforms = transforms.Compose([
            transforms.Resize(image_size),
            transforms.CenterCrop(image_size),
            transforms.ToTensor(),
            transforms.Normalize([0.5], [0.5])
        ])

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        
        # Process image
        image = item['image']
        if not isinstance(image, Image.Image):
            image = Image.open(io.BytesIO(image['bytes'])) if isinstance(image, dict) else image
            image = image.convert("RGB")
        image = self.image_transforms(image)
        
        # Create text description
        question = item.get('question', '')
        options = item.get('options', [])
        
        description = f"A diagram that answers the question: {question}"
        if isinstance(options, list) and len(options) > 0:
            description += " Options: " + ", ".join([f"({opt})" for opt in options if opt])
        
        # Tokenize text
        encoded_text = self.tokenizer(
            description,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt"
        )
        
        return {
            "image": image.to(torch.float32),
            "input_ids": encoded_text.input_ids[0],
            "attention_mask": encoded_text.attention_mask[0]
        }

In [5]:
def train(
    pretrained_model_name="runwayml/stable-diffusion-v1-5",
    output_dir="./sd-ai2d-model",
    num_epochs=1,
    batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=1e-5,
    mixed_precision="no",
    save_steps=1000,
    max_grad_norm=1.0,
):
    """
    Ultra memory efficient training configuration
    """
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        print(f"GPU Memory available: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    
    accelerator = Accelerator(
        gradient_accumulation_steps=gradient_accumulation_steps,
        mixed_precision=mixed_precision
    )
    
    # Load dataset
    print("Loading AI2D dataset...")
    dataset = load_dataset("lmms-lab/ai2d")
    if isinstance(dataset, dict):
        dataset = dataset['test']
    
    # Load components
    print("Loading model components...")
    tokenizer = CLIPTokenizer.from_pretrained(
        pretrained_model_name,
        subfolder="tokenizer"
    )
    
    text_encoder = CLIPTextModel.from_pretrained(
        pretrained_model_name,
        subfolder="text_encoder",
        low_cpu_mem_usage=True
    ).to(torch.float32)
    
    vae = AutoencoderKL.from_pretrained(
        pretrained_model_name,
        subfolder="vae",
        low_cpu_mem_usage=True
    ).to(torch.float32)
    
    unet = UNet2DConditionModel.from_pretrained(
        pretrained_model_name,
        subfolder="unet",
        low_cpu_mem_usage=True
    ).to(torch.float32)
    
    noise_scheduler = DDPMScheduler.from_pretrained(
        pretrained_model_name,
        subfolder="scheduler"
    )
    
    # Create dataset and dataloader
    train_dataset = AI2DDataset(dataset, tokenizer)
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=0,
        pin_memory=True
    )
    
    # Optimizer with memory efficient settings
    optimizer = torch.optim.AdamW(
        unet.parameters(),
        lr=learning_rate,
        betas=(0.9, 0.999),
        eps=1e-8,
        foreach=True,
        capturable=True
    )
    
    # Prepare for training
    unet, optimizer, train_dataloader = accelerator.prepare(
        unet, optimizer, train_dataloader
    )
    
    # Move models to device
    text_encoder = text_encoder.to(accelerator.device)
    vae = vae.to(accelerator.device)
    
    # Ensure models are in eval mode
    text_encoder.eval()
    vae.eval()
    
    # Training loop
    global_step = 0
    for epoch in range(num_epochs):
        print(f"\nStarting epoch {epoch}")
        unet.train()
        progress_bar = tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}")
        
        for step, batch in enumerate(train_dataloader):
            try:
                with accelerator.accumulate(unet):
                    # Convert images to latent space
                    with torch.no_grad():
                        latents = vae.encode(batch["image"]).latent_dist.sample()
                        latents = latents * vae.config.scaling_factor

                    # Generate noise
                    noise = torch.randn_like(latents)
                    timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (latents.shape[0],), device=latents.device)
                    noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

                    # Get text embeddings
                    with torch.no_grad():
                        encoder_hidden_states = text_encoder(batch["input_ids"])[0]

                    # Clear unnecessary tensors
                    del latents
                    torch.cuda.empty_cache()

                    # Predict noise
                    noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
                    loss = torch.nn.functional.mse_loss(noise_pred, noise)
                    
                    accelerator.backward(loss)
                    
                    if accelerator.sync_gradients:
                        accelerator.clip_grad_norm_(unet.parameters(), max_grad_norm)
                    
                    optimizer.step()
                    optimizer.zero_grad(set_to_none=True)
                
                if step % 10 == 0:
                    print(f"Step {step}: loss = {loss.item():.4f}")
                
                progress_bar.update(1)
                global_step += 1
                
                # Clear cache periodically
                if step % 100 == 0:
                    torch.cuda.empty_cache()
                
            except Exception as e:
                print(f"Error in training step: {str(e)}")
                torch.cuda.empty_cache()
                continue
        
            # Save checkpoint
            if global_step % save_steps == 0 and accelerator.is_main_process:
                try:
                    print(f"\nSaving checkpoint at step {global_step}")
                    pipeline = StableDiffusionPipeline(
                        text_encoder=text_encoder,
                        vae=vae,
                        unet=accelerator.unwrap_model(unet),
                        tokenizer=tokenizer,
                        scheduler=noise_scheduler,
                        safety_checker=None,
                        feature_extractor=None,
                    )
                    pipeline.save_pretrained(os.path.join(output_dir, f"checkpoint-{global_step}"))
                except Exception as e:
                    print(f"Error saving checkpoint: {str(e)}")
        
        progress_bar.close()
    
    # Save final model
    if accelerator.is_main_process:
        try:
            print("\nSaving final model...")
            pipeline = StableDiffusionPipeline(
                text_encoder=text_encoder,
                vae=vae,
                unet=accelerator.unwrap_model(unet),
                tokenizer=tokenizer,
                scheduler=noise_scheduler,
                safety_checker=None,
                feature_extractor=None,
            )
            pipeline.save_pretrained(output_dir)
        except Exception as e:
            print(f"Error saving final model: {str(e)}")
    
    print("Training completed!")

In [6]:
def generate_diagrams(
    model_path,
    prompts,
    output_dir="./generated_diagrams",
    num_inference_steps=50,
    guidance_scale=7.5,
):
    print(f"Loading model from {model_path}...")
    pipeline = StableDiffusionPipeline.from_pretrained(
        model_path,
        torch_dtype=torch.float16,
        safety_checker=None,
    ).to("cuda")
    
    os.makedirs(output_dir, exist_ok=True)
    
    print("Generating diagrams...")
    for i, prompt in enumerate(prompts):
        try:
            # Generate image
            image = pipeline(
                prompt,
                num_inference_steps=num_inference_steps,
                guidance_scale=guidance_scale,
            ).images[0]
            
            # Save image
            filename = f"diagram_{i+1}.png"
            image.save(os.path.join(output_dir, filename))
            
            # Save prompt
            with open(os.path.join(output_dir, f"diagram_{i+1}.txt"), "w") as f:
                f.write(prompt)
            
            print(f"Generated diagram {i+1}/{len(prompts)}")
            
        except Exception as e:
            print(f"Error generating diagram {i+1}: {str(e)}")
    
    print(f"Generation completed. Diagrams saved to {output_dir}")

In [7]:
# Train the model
train(
    num_epochs=1,
    batch_size=1,
    gradient_accumulation_steps=8,  # Increased accumulation steps
    save_steps=1000,
    mixed_precision="no"  # Disabled mixed precision
)

GPU Memory available: 17.06 GB
Loading AI2D dataset...


README.md:   0%|          | 0.00/700 [00:00<?, ?B/s]

test-00000-of-00002.parquet:   0%|          | 0.00/62.3M [00:00<?, ?B/s]

test-00001-of-00002.parquet:   0%|          | 0.00/77.2M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/3088 [00:00<?, ? examples/s]

Loading model components...


tokenizer/tokenizer_config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

tokenizer/vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

tokenizer/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer/special_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]



text_encoder/config.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/492M [00:00<?, ?B/s]

vae/config.json:   0%|          | 0.00/547 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]

unet/config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

scheduler/scheduler_config.json:   0%|          | 0.00/308 [00:00<?, ?B/s]


Sample item structure:
Keys: dict_keys(['question', 'options', 'answer', 'image'])
Question type: <class 'str'>
Options type: <class 'list'>
Sample options: ['c', 'D']

Starting epoch 0


Epoch 0:   0%|          | 0/3088 [00:00<?, ?it/s]

Step 0: loss = 0.0257
Error in training step: CUDA out of memory. Tried to allocate 114.00 MiB. GPU 0 has a total capacity of 15.89 GiB of which 25.12 MiB is free. Process 4960 has 15.86 GiB memory in use. Of the allocated memory 15.53 GiB is allocated by PyTorch, and 35.18 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Step 10: loss = 0.4441
Error in training step: CUDA out of memory. Tried to allocate 114.00 MiB. GPU 0 has a total capacity of 15.89 GiB of which 23.12 MiB is free. Process 4960 has 15.86 GiB memory in use. Of the allocated memory 15.53 GiB is allocated by PyTorch, and 37.18 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid frag

You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .


Error in training step: CUDA out of memory. Tried to allocate 114.00 MiB. GPU 0 has a total capacity of 15.89 GiB of which 27.12 MiB is free. Process 4960 has 15.86 GiB memory in use. Of the allocated memory 15.53 GiB is allocated by PyTorch, and 33.18 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Step 1150: loss = 0.0256
Error in training step: CUDA out of memory. Tried to allocate 114.00 MiB. GPU 0 has a total capacity of 15.89 GiB of which 25.12 MiB is free. Process 4960 has 15.86 GiB memory in use. Of the allocated memory 15.53 GiB is allocated by PyTorch, and 35.18 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See docu

You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .



Saving checkpoint at step 2000
Error in training step: CUDA out of memory. Tried to allocate 114.00 MiB. GPU 0 has a total capacity of 15.89 GiB of which 27.12 MiB is free. Process 4960 has 15.86 GiB memory in use. Of the allocated memory 15.53 GiB is allocated by PyTorch, and 33.18 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Step 2290: loss = 0.1497
Error in training step: CUDA out of memory. Tried to allocate 114.00 MiB. GPU 0 has a total capacity of 15.89 GiB of which 23.12 MiB is free. Process 4960 has 15.86 GiB memory in use. Of the allocated memory 15.53 GiB is allocated by PyTorch, and 37.18 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True t

You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .


Error in training step: CUDA out of memory. Tried to allocate 114.00 MiB. GPU 0 has a total capacity of 15.89 GiB of which 25.12 MiB is free. Process 4960 has 15.86 GiB memory in use. Of the allocated memory 15.53 GiB is allocated by PyTorch, and 35.18 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

Saving final model...
Training completed!


In [8]:
# Generate some test diagrams
test_prompts = [
    "A labeled diagram of an animal cell showing organelles and membrane",
    "A flowchart showing the steps of DNA replication with arrows",
    "A diagram of the carbon cycle showing interactions between atmosphere, plants, and soil",
    "A cross-section diagram of Earth's layers with labels"
]

In [9]:
generate_diagrams(
    model_path="./sd-ai2d-model",
    prompts=test_prompts,
    output_dir="./test_diagrams"
)

Loading model from ./sd-ai2d-model...


Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .


Generating diagrams...


  0%|          | 0/50 [00:00<?, ?it/s]

Generated diagram 1/4


  0%|          | 0/50 [00:00<?, ?it/s]

Generated diagram 2/4


  0%|          | 0/50 [00:00<?, ?it/s]

Generated diagram 3/4


  0%|          | 0/50 [00:00<?, ?it/s]

Generated diagram 4/4
Generation completed. Diagrams saved to ./test_diagrams
