In [4]:
import torch
from torchvision.transforms import Compose, Resize, ToTensor, Normalize
from diffusers import StableDiffusionPipeline, DDIMScheduler
from PIL import Image
import numpy as np
import os
from tqdm.notebook import tqdm
from transformers import CLIPImageProcessor
import json

# Environment variable to reduce memory fragmentation
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Parameters
model_id = "stabilityai/stable-diffusion-xl-base-1.0"
image_paths = [
    "../data/results/ma-boston_200250_fake_B.png",
    "../data/results/nc-charlotte_200250_fake_B.png",
    "../data/results/ny-manhattan_200250_fake_B.png",
    "../data/results/pa-pittsburgh_200250_fake_B.png"
]
inference_steps = 10  # Reduced steps for lower memory consumption
identifier = f"stable-diffusion-weighted-interpolation_{inference_steps}-inference"
output_dir = os.path.join("diffusion-output", identifier)
output_image_size = (256, 256)  # Reduced size to reduce memory load
max_image_dimension = 256  # Reduce max dimension to fit GPU memory

# Example softmax weights (these should sum to 1)
weights = [0.25, 0.35, 0.15, 0.25]  # Replace with your actual weights

# Ensure the output directory exists
os.makedirs(output_dir, exist_ok=True)

# Set device
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

def load_and_preprocess_image(image_path):
    """Load and preprocess an image."""
    image = Image.open(image_path).convert("RGB")
    original_size = image.size
    aspect_ratio = original_size[0] / original_size[1]
    if aspect_ratio > 1:
        new_width = min(original_size[0], max_image_dimension)
        new_width = new_width - (new_width % 8)
        new_height = int(new_width / aspect_ratio)
        new_height = new_height - (new_height % 8)
    else:
        new_height = min(original_size[1], max_image_dimension)
        new_height = new_height - (new_height % 8)
        new_width = int(new_height * aspect_ratio)
        new_width = new_width - (new_width % 8)
    
    new_size = (new_width, new_height)
    transform = Compose([Resize(new_size), ToTensor(), Normalize([0.5], [0.5])])
    image_tensor = transform(image).unsqueeze(0).to(device).to(torch.float16)
    
    return image_tensor

try:
    with tqdm(total=1, desc="Loading Stable Diffusion model") as pbar:
        scheduler = DDIMScheduler.from_pretrained(model_id, subfolder="scheduler")
        pipe = StableDiffusionPipeline.from_pretrained(model_id, scheduler=scheduler, torch_dtype=torch.float16)
        pipe = pipe.to(device)
        pipe.enable_attention_slicing("max")  # Maximize attention slicing
        pipe.enable_model_cpu_offload()  # Offload unused parts of the model to CPU
        pbar.update(1)

    images = []
    with tqdm(total=len(image_paths), desc="Loading and preprocessing images") as pbar:
        for image_path in image_paths:
            image = load_and_preprocess_image(image_path)
            images.append(image)
            pbar.update(1)

    latents = []
    with tqdm(total=len(images), desc="Encoding images to latent representations") as pbar:
        with torch.no_grad():
            for image in images:
                latent = pipe.vae.encode(image).latent_dist.sample() * 0.18215
                latents.append(latent)
                pbar.update(1)

    print(f"Shapes of latents: {[latent.shape for latent in latents]}")
    weighted_latent = sum(w * latent for w, latent in zip(weights, latents))

    with torch.no_grad():
        weighted_latent = 1 / 0.18215 * weighted_latent
        decoded_image = pipe.vae.decode(weighted_latent).sample
        decoded_image = (decoded_image / 2 + 0.5).clamp(0, 1)
        decoded_image = decoded_image.cpu().permute(0, 2, 3, 1).numpy()[0]

    decoded_image_pil = Image.fromarray((decoded_image * 255).astype(np.uint8))
    decoded_image_pil = decoded_image_pil.resize(output_image_size, Image.LANCZOS)
    output_path = os.path.join(output_dir, "weighted_interpolated_image.png")
    decoded_image_pil.save(output_path, quality=95)

    print(f"Weighted interpolation image saved at: {output_path}")

except Exception as e:
    print(f"An error occurred: {e}")


Using device: cuda


Loading Stable Diffusion model:   0%|          | 0/1 [00:00<?, ?it/s]

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .


An error occurred: CUDA out of memory. Tried to allocate 30.00 MiB. GPU 0 has a total capacity of 7.65 GiB of which 72.69 MiB is free. Including non-PyTorch memory, this process has 5.56 GiB memory in use. Of the allocated memory 5.25 GiB is allocated by PyTorch, and 164.75 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
