**Image Variations with Stable unCLIP**


In [None]:
import os
import torch
from PIL import Image

from diffusers import StableUnCLIPImg2ImgPipeline
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode

from diffusers.blip.models.blip import blip_decoder


def load_demo_image(image_path, image_size, device):
    """Load an image from disk and turn it into a normalized batch of one.

    The image is converted to RGB, resized to a square of
    ``image_size`` x ``image_size`` with bicubic interpolation, converted to a
    tensor, normalized per channel, given a leading batch dimension and moved
    to ``device``.

    Args:
        image_path: Path of the image file to open.
        image_size: Edge length (in pixels) of the square the image is resized to.
        device: Target device passed to ``Tensor.to`` (e.g. ``"cuda:3"``).

    Returns:
        The preprocessed image tensor on ``device`` with a batch dimension of 1
        (presumably shaped ``(1, 3, image_size, image_size)`` — confirm).
    """
    # Open inside a context manager so the file handle is released as soon as
    # the pixel data has been copied into the RGB image.
    with Image.open(image_path) as source_image:
        raw_image = source_image.convert("RGB")

    transform = transforms.Compose(
        [
            transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
            transforms.ToTensor(),
            # NOTE(review): per-channel mean/std; these look like the CLIP-style
            # statistics the BLIP captioner expects — confirm against the model.
            transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
        ]
    )
    # unsqueeze(0) adds the batch dimension the caption model is called with.
    return transform(raw_image).unsqueeze(0).to(device)


# StableUnCLIP image-variation demo: caption the input image with BLIP, then
# feed both the image and the caption-based prompt to the unCLIP pipeline.
device = "cuda:3"
torch.cuda.set_device(torch.device(device))  # change allocation of current GPU

# Checkpoints and input used by this cell.
base_model_path = "/data/noah/ckpt/finetuning/SD_UNCLIP_AD"
clip_model_path = "/data/noah/ckpt/pretrain_ckpt/BLIP/model_large_caption.pth"
clip_image_size = 512

# BLIP captioning model, switched to inference mode and placed on the GPU.
clip_model = blip_decoder(pretrained=clip_model_path, image_size=clip_image_size, vit="large")
clip_model = clip_model.eval().to(device)

# Half-precision image-variation pipeline on the same GPU.
pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(base_model_path, torch_dtype=torch.float16).to(device)

path = "/data/noah/inference/reimagine_exam/1657066276420_FR-View-CMR-Wide.png"

# Conditioning image for the pipeline; thumbnail() resizes it in place
# (its return value is not used).
init_image = Image.open(path).convert("RGB")
init_image.thumbnail((768, 768))

# "{}" is filled with the generated caption below.
prompt = "{}, best quality, extremely detailed, clearness, naturalness, film grain, crystal clear, photo with color, actuality, <lora:add-detail-xl:1>"
negative_prompt = "cartoon, anime, painting, disfigured, immature, blur, picture, 3D, render, semi-realistic, drawing, poorly drawn, bad anatomy, wrong anatomy, gray scale, worst quality, low quality, sketch"

with torch.no_grad():
    # Beam-search caption (no sampling) of the same file the pipeline sees.
    clip_image = load_demo_image(image_path=path, image_size=clip_image_size, device=device)
    caption = clip_model.generate(
        clip_image,
        sample=False,
        num_beams=3,
        max_length=40,
        min_length=5,
    )[0]
    prompt_get = prompt.format(caption)

    # Fixed seed so repeated runs of the cell use the same generator state.
    images = pipe(
        init_image,
        prompt=prompt_get,
        negative_prompt=negative_prompt,
        guidance_scale=7.0,
        num_inference_steps=40,
        noise_level=0,
        generator=torch.manual_seed(42),
    ).images

# Show input and first generated variation one after the other.
# Uncomment to inspect the caption: print(caption)
display(init_image)
display(images[0])

In [None]:
import torch
from PIL import Image

from diffusers import StableDiffusionXLUnclipImg2ImgPipeline

# SDXL unCLIP image-variation demo: the pipeline is called with the input
# image only (no text prompt is passed in this cell).
device = "cuda:3"
torch.cuda.set_device(torch.device(device))  # change allocation of current GPU

# Pretrained checkpoint; a fine-tuned alternative is kept here for reference:
# base_model_path = "/data/noah/ckpt/finetuning/SDXL_UNCLIP_AD"
base_model_path = "/data/noah/ckpt/pretrain_ckpt/StableDiffusion/sdxl_unclip"

# Half-precision pipeline, moved to the selected GPU.
pipe = StableDiffusionXLUnclipImg2ImgPipeline.from_pretrained(
    base_model_path,
    torch_dtype=torch.float16,
).to(device)

path = "/data/noah/inference/reimagine/input/2021-09-24-17-50-40_Front_1632473437550.png"

# Conditioning image; thumbnail() resizes it in place (return value unused).
init_image = Image.open(path).convert("RGB")
init_image.thumbnail((1024, 1024))

# Fixed seed so repeated runs of the cell use the same generator state.
images = pipe(
    init_image,
    guidance_scale=11.0,
    num_inference_steps=5,
    generator=torch.manual_seed(42),
).images

# Show input and first generated variation one after the other.
display(init_image)
display(images[0])

In [None]:
import torch
from PIL import Image

from diffusers.utils import load_image
from diffusers import StableDiffusionXLReferencePipeline
from diffusers.schedulers import UniPCMultistepScheduler
from diffusers import AutoencoderKL

from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode

from diffusers.blip.models.blip import blip_decoder


def load_demo_image(image_path, image_size, device):
    """Load an image from disk and turn it into a normalized batch of one.

    The image is converted to RGB, resized to a square of
    ``image_size`` x ``image_size`` with bicubic interpolation, converted to a
    tensor, normalized per channel, given a leading batch dimension and moved
    to ``device``.

    Args:
        image_path: Path of the image file to open.
        image_size: Edge length (in pixels) of the square the image is resized to.
        device: Target device passed to ``Tensor.to`` (e.g. ``"cuda:3"``).

    Returns:
        The preprocessed image tensor on ``device`` with a batch dimension of 1
        (presumably shaped ``(1, 3, image_size, image_size)`` — confirm).
    """
    # Open inside a context manager so the file handle is released as soon as
    # the pixel data has been copied into the RGB image.
    with Image.open(image_path) as source_image:
        raw_image = source_image.convert("RGB")

    transform = transforms.Compose(
        [
            transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
            transforms.ToTensor(),
            # NOTE(review): per-channel mean/std; these look like the CLIP-style
            # statistics the BLIP captioner expects — confirm against the model.
            transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
        ]
    )
    # unsqueeze(0) adds the batch dimension the caption model is called with.
    return transform(raw_image).unsqueeze(0).to(device)


# SDXL reference-pipeline demo: caption the input image with BLIP, then run
# the reference pipeline with both the caption-based prompt and the image.
device = "cuda:3"
torch.cuda.set_device(torch.device(device))  # change allocation of current GPU

path = "/data/noah/inference/reimagine_exam/1657066114871_FR-View-CMR-Wide.png"
input_image = Image.open(path).convert("RGB")
input_image.thumbnail((1024, 1024))  # resizes in place; the return value is not used

# BLIP captioning model in inference mode on the selected GPU.
clip_image_size = 512
clip_model_path = "/data/noah/ckpt/pretrain_ckpt/BLIP/model_large_caption.pth"
clip_model = blip_decoder(pretrained=clip_model_path, image_size=clip_image_size, vit="large")
clip_model.eval()
clip_model = clip_model.to(device)

# Separate fp16 VAE that replaces the one bundled with the checkpoint.
vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16).to(device)
pipe = StableDiffusionXLReferencePipeline.from_pretrained(
    "/data/noah/ckpt/finetuning/SDXL_AD",
    # "/data/noah/ckpt/pretrain_ckpt/StableDiffusion/sdxl",
    vae=vae,
    torch_dtype=torch.float16,
    use_safetensors=True,
).to(device)

# The checkpoint file inside the LoRA directory has to be given through the
# `weight_name` keyword. Passed positionally it is not treated as the file
# name (in current diffusers the second positional parameter is
# `adapter_name`).
pipe.load_lora_weights(
    "/data/noah/ckpt/pretrain_ckpt/StableDiffusion/lora_xl",
    weight_name="add-detail-xl.safetensors",
)
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)

# "{}" is filled with the generated caption below.
# NOTE(review): the trailing "<lora:add-detail-xl:3>" tag is WebUI-style
# syntax; stock diffusers does not appear to interpret it and would tokenize
# it as plain text — confirm whether this fork handles it, otherwise set the
# LoRA strength through the pipeline instead.
prompt = "{}, best quality, extremely detailed, clearness, naturalness, film grain, crystal clear, photo with color, actuality, <lora:add-detail-xl:3>"
negative_prompt = "cartoon, anime, painting, disfigured, immature, blur, picture, 3D, render, semi-realistic, drawing, poorly drawn, bad anatomy, wrong anatomy, gray scale, worst quality, low quality, sketch"

with torch.no_grad():
    # Beam-search caption (no sampling) of the same file used as reference.
    clip_image = load_demo_image(image_path=path, image_size=clip_image_size, device=device)
    caption = clip_model.generate(clip_image, sample=False, num_beams=3, max_length=40, min_length=5)[0]
    prompt_get = prompt.format(caption)

    # Fixed seed so repeated runs of the cell use the same generator state.
    result_img = pipe(
        ref_image=input_image,
        prompt=prompt_get,
        negative_prompt=negative_prompt,
        guidance_scale=7.0,
        num_inference_steps=20,
        style_fidelity=0.5,
        guidance_rescale=0.0,
        reference_attn=True,
        reference_adain=True,
        generator=torch.manual_seed(42),
    ).images[0]

# Show the reference image followed by the generated result.
display(input_image)
display(result_img)

In [None]:
import torch
from PIL import Image

from diffusers.utils import load_image
from diffusers import StableDiffusionXLPipeline
from diffusers import DiffusionPipeline

from diffusers.schedulers import UniPCMultistepScheduler
from diffusers import AutoencoderKL

from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode

from diffusers.blip.models.blip import blip_decoder


def load_demo_image(image_path, image_size, device):
    """Load an image from disk and turn it into a normalized batch of one.

    The image is converted to RGB, resized to a square of
    ``image_size`` x ``image_size`` with bicubic interpolation, converted to a
    tensor, normalized per channel, given a leading batch dimension and moved
    to ``device``.

    Args:
        image_path: Path of the image file to open.
        image_size: Edge length (in pixels) of the square the image is resized to.
        device: Target device passed to ``Tensor.to`` (e.g. ``"cuda:3"``).

    Returns:
        The preprocessed image tensor on ``device`` with a batch dimension of 1
        (presumably shaped ``(1, 3, image_size, image_size)`` — confirm).
    """
    # Open inside a context manager so the file handle is released as soon as
    # the pixel data has been copied into the RGB image.
    with Image.open(image_path) as source_image:
        raw_image = source_image.convert("RGB")

    transform = transforms.Compose(
        [
            transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
            transforms.ToTensor(),
            # NOTE(review): per-channel mean/std; these look like the CLIP-style
            # statistics the BLIP captioner expects — confirm against the model.
            transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
        ]
    )
    # unsqueeze(0) adds the batch dimension the caption model is called with.
    return transform(raw_image).unsqueeze(0).to(device)


# SDXL text-to-image demo: caption the input image with BLIP and generate a
# new image from the caption-based prompt alone (the input image is only
# displayed for comparison; it is not passed to the pipeline).
device = "cuda:3"
torch.cuda.set_device(torch.device(device))  # change allocation of current GPU

path = "/data/noah/inference/reimagine_exam/1657066114871_FR-View-CMR-Wide.png"
input_image = Image.open(path).convert("RGB")
input_image.thumbnail((1024, 1024))  # resizes in place; the return value is not used

# BLIP captioning model in inference mode on the selected GPU.
clip_image_size = 512
clip_model_path = "/data/noah/ckpt/pretrain_ckpt/BLIP/model_large_caption.pth"
clip_model = blip_decoder(pretrained=clip_model_path, image_size=clip_image_size, vit="large")
clip_model.eval()
clip_model = clip_model.to(device)

# Separate fp16 VAE that replaces the one bundled with the checkpoint.
vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16).to(device)
pipe = DiffusionPipeline.from_pretrained(
    "/data/noah/ckpt/finetuning/SDXL_AD",
    vae=vae,
    torch_dtype=torch.float16,
    use_safetensors=True,
).to(device)

# The checkpoint file inside the LoRA directory has to be given through the
# `weight_name` keyword. Passed positionally it is not treated as the file
# name (in current diffusers the second positional parameter is
# `adapter_name`).
pipe.load_lora_weights(
    "/data/noah/ckpt/pretrain_ckpt/StableDiffusion/lora_xl",
    weight_name="add-detail-xl.safetensors",
)
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)

# "{}" is filled with the generated caption below.
# NOTE(review): the trailing "<lora:add-detail-xl:1>" tag is WebUI-style
# syntax; stock diffusers does not appear to interpret it and would tokenize
# it as plain text — confirm whether this fork handles it.
prompt = "{}, best quality, extremely detailed, clearness, naturalness, film grain, crystal clear, photo with color, actuality, <lora:add-detail-xl:1>"
negative_prompt = "cartoon, anime, painting, disfigured, immature, blur, picture, 3D, render, semi-realistic, drawing, poorly drawn, bad anatomy, wrong anatomy, gray scale, worst quality, low quality, sketch"

with torch.no_grad():
    # Beam-search caption (no sampling) of the input file.
    clip_image = load_demo_image(image_path=path, image_size=clip_image_size, device=device)
    caption = clip_model.generate(clip_image, sample=False, num_beams=3, max_length=40, min_length=5)[0]
    prompt_get = prompt.format(caption)

    # Fixed seed so repeated runs of the cell use the same generator state.
    result_img = pipe(
        prompt=prompt_get,
        negative_prompt=negative_prompt,
        guidance_scale=7.0,
        num_inference_steps=20,
        generator=torch.manual_seed(42),
    ).images[0]

# Show the captioned input followed by the generated result.
display(input_image)
display(result_img)

In [None]:
import torch
from diffusers import SemanticStableDiffusionPipeline

# Semantic guidance (SEGA) demo: generate one image and steer it towards four
# editing concepts. Every per-concept list below has one entry per concept,
# in the same order as `editing_prompt`.
device = "cuda:3"
torch.cuda.set_device(torch.device(device))  # change allocation of current GPU

pipe = SemanticStableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
pipe = pipe.to(device)

out = pipe(
    # NOTE(review): this text prompt is a filesystem path, which looks like a
    # leftover from a model-path argument — confirm the intended prompt text.
    prompt="/data/noah/ckpt/finetuning/SDXL_AD",
    num_images_per_prompt=1,
    guidance_scale=7,
    # Four separate concepts. A missing comma previously fused the first two
    # string literals into the single concept "carsbuildings".
    editing_prompt=["cars", "buildings", "trees", "road"],
    reverse_editing_direction=[
        False,
        False,
        False,
        False,
    ],  # Direction of guidance i.e. increase all concepts
    edit_warmup_steps=[10, 10, 10, 10],  # Warmup period for each concept
    edit_guidance_scale=[4, 5, 5, 5.4],  # Guidance scale for each concept
    edit_threshold=[
        0.99,
        0.975,
        0.925,
        0.96,
    ],  # Threshold for each concept. Threshold equals the percentile of the latent space that will be discarded. I.e. threshold=0.99 uses 1% of the latent dimensions
    edit_momentum_scale=0.3,  # Momentum scale that will be added to the latent guidance
    edit_mom_beta=0.6,  # Momentum beta
    # One weight per concept (trimmed from five entries to match the four concepts).
    edit_weights=[1, 1, 1, 1],  # Weights of the individual concepts against each other
)
image = out.images[0]

display(image)