# 04-3: Inference text-to-image

## Dall-E

In [None]:
!pip install -q openai invisible_watermark transformers accelerate safetensors diffusers controlnet_aux==0.0.7 xformers mediapy

In [None]:
import os
from openai import OpenAI

# Read the API key from the environment instead of hardcoding it in the notebook.
# Set it before running this cell, e.g. `%env OPENAI_API_KEY=<your key>` or via
# your platform's secret manager.
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in the environment before running this cell."
    )

client = OpenAI(api_key=openai_api_key)

# Request a single 1024x1024 image from DALL-E 3.
response = client.images.generate(
    model="dall-e-3",
    prompt="a photo of a dog holding up a sign that says 'I love MADRID'",
    size="1024x1024",
    quality="standard",
    n=1,
)

# Each image can be returned as either a URL or Base64 data, using the response_format parameter.
# URLs will expire after an hour.
# https://platform.openai.com/docs/guides/images
image_url = response.data[0].url

## Stable Diffusion 1.5 with DDIM scheduler

In [None]:
from diffusers import StableDiffusionPipeline, DDIMScheduler

# Single source of truth for the checkpoint so scheduler and pipeline cannot drift apart.
sd15_repo = "runwayml/stable-diffusion-v1-5"

# Load the scheduler configuration from the checkpoint's "scheduler" subfolder.
# `from_pretrained` is the supported way to load from a repo id; `from_config`
# is meant for an already-loaded config dict, and passing it a repo id is deprecated.
ddim = DDIMScheduler.from_pretrained(sd15_repo, subfolder="scheduler")
pipeline = StableDiffusionPipeline.from_pretrained(sd15_repo, scheduler=ddim)

# Generate one image for the prompt and write it to disk.
image = pipeline("An astronaut riding a horse.").images[0]

image.save("astronaut_riding_a_horse.png")

## Stable Diffusion XL 1.0

In [None]:
import torch
from diffusers import StableDiffusionXLImg2ImgPipeline, DiffusionPipeline, KDPM2AncestralDiscreteScheduler, StableDiffusionXLPipeline, AutoencoderKL
import gc
from PIL import Image
import requests
from io import BytesIO
from IPython.display import display

In [None]:
# Checkpoints: the SDXL base model and a replacement VAE that can run in fp16
# precision without generating NaNs.
v_autoencoder = "madebyollin/sdxl-vae-fp16-fix"
model_base = "stabilityai/stable-diffusion-xl-base-1.0"

# Load the VAE in half precision.
vae = AutoencoderKL.from_pretrained(
    v_autoencoder,
    torch_dtype=torch.float16,
)

# Build the base text-to-image pipeline around that VAE, using the fp16
# safetensors weights and without the invisible watermarker.
pipe = StableDiffusionXLPipeline.from_pretrained(
    model_base,
    variant="fp16",
    use_safetensors=True,
    torch_dtype=torch.float16,
    add_watermarker=False,
    vae=vae,
)

pipe.safety_checker = None

# Move the pipeline to the GPU; as the last expression its repr is shown as the cell output.
pipe.to("cuda")

In [None]:
# SDXL refiner checkpoint, loaded as an image-to-image pipeline.
model_refiner = "stabilityai/stable-diffusion-xl-refiner-1.0"

pipe_refiner = StableDiffusionXLImg2ImgPipeline.from_pretrained(
    model_refiner,
    variant="fp16",
    use_safetensors=True,
    torch_dtype=torch.float16,
    add_watermarker=False,  # no watermarker
)

# Use model CPU offload for the refiner rather than placing it on the GPU
# outright (the alternative would be `pipe_refiner.to("cuda")`).
pipe_refiner.enable_model_cpu_offload()

Generation with the base model:

In [None]:
# (Optional) Replace the base pipeline's scheduler with a KDPM2 ancestral
# scheduler built from the current scheduler's config, with Karras sigmas enabled.
pipe.scheduler = KDPM2AncestralDiscreteScheduler.from_config(
    pipe.scheduler.config,
    use_karras_sigmas=True,
)
# Uncomment to create a seeded generator (not used unless passed to the pipeline call):
# generator = torch.Generator().manual_seed(42)

In [None]:
# Prompts for the base model (max 77 tokens in each prompt).
# `prompt` / `prompt2` are passed as `prompt` / `prompt_2`; both negative prompts are empty.
prompt = "full body,Cyber goth Geisha in the rain in a  tokyo future  city city wide, Pretty Face, Beautiful eyes, Anime, Portrait, Dark Aesthetic, Neon sunset blade runner background, Concept Art, Digital Art, Anime Art, unreal engine, greg rutkowski, loish, rhads, beeple, makoto shinkai, haruyo morita and lois"
prompt2 = "Cyber goth Geisha in the rain, stylized cyberpunk black tokyo market, indoor in the style of blade runner, stands illuminated by greens neon lights, crowded with cyborgs photorealistic background, 3 5 mm, grainy ruined film, dark color scheme, ray tracing, unreal engine, 4 k long shot"
negative_prompt = ""
negative_prompt2 = ""

# Run the base pipeline and keep the first generated image.
image_base = pipe(
    num_inference_steps=50,
    guidance_scale=9.0,
    prompt=prompt,
    prompt_2=prompt2,
    negative_prompt=negative_prompt,
    negative_prompt_2=negative_prompt2,
).images[0]

# Release Python garbage first, then hand cached CUDA memory back to the driver.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Show the image produced by the base model as this cell's output.
image_base

Using the refiner with the generated image:

In [None]:
# Feed the base image through the refiner (image-to-image) with the same
# prompts, and keep the first returned image.
image_refiner = pipe_refiner(
    image=image_base,
    strength=0.3,
    num_inference_steps=50,
    prompt=prompt,
    prompt_2=prompt2,
    negative_prompt=negative_prompt,
    negative_prompt_2=negative_prompt2,
).images[0]

# Release Python garbage first, then hand cached CUDA memory back to the driver.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Show the refined image as this cell's output.
image_refiner