# AIMMO SIM2REAL INFERENCE DEMO

**This demo converts simulated (synthetic) images into realistic, real-world-looking images.**

**1. Load the Input Image and Set the Parameters**


In [None]:
from PIL import Image
import torch
from diffusers import (
    StableDiffusionImg2ImgPipeline,
    StableDiffusionXLImg2ImgPipeline,
    StableDiffusionControlNetPipeline,
    StableDiffusionXLControlNetPipeline,
    ControlNetModel,
    UniPCMultistepScheduler,
    AutoencoderKL,
)

# GPU used by the single-image cells below. The index is hard-coded; adjust it to a GPU that
# exists on your machine.
device = "cuda:3"
torch.cuda.set_device(torch.device(device))  # make this GPU the current CUDA device


def inference_single(
    model_id, prompt, negative_prompt, image, device, lora_id=None, lora_name=None, mode="SD", control_model_id=None
):
    """Build a diffusion pipeline for ``mode`` and run a single inference pass.

    Args:
        model_id: Hub id or local path of the base Stable Diffusion / SDXL checkpoint.
        prompt: Positive text prompt.
        negative_prompt: Negative text prompt.
        image: Input passed to the pipeline as ``image``. The img2img modes ("SD", "SDXL")
            run it with ``strength=0.5``; the ControlNet modes pass it without ``strength``
            (the notebook feeds a Canny edge map there).
        device: Device every model component is moved to.
        lora_id: Optional LoRA location; LoRA is only applied when ``lora_name`` is also given.
        lora_name: Optional LoRA weight file name.
        mode: One of "SD", "SDXL", "Control_SD", "Control_SDXL".
        control_model_id: Optional ControlNet checkpoint; when ``None`` a public Canny
            ControlNet matching the mode is used.

    Returns:
        The first image of the pipeline output.

    Raises:
        ValueError: If ``mode`` is not one of the supported values.
    """
    # Validate first so an unknown mode fails before any checkpoint is loaded.
    if mode not in ("SD", "SDXL", "Control_SD", "Control_SDXL"):
        raise ValueError('mode must be in ["SD", "SDXL", "Control_SD", "Control_SDXL"]')

    # Shared loading options for every component (half precision, safetensors weights).
    load_kwargs = {"torch_dtype": torch.float16, "use_safetensors": True}
    uses_controlnet = mode.startswith("Control")

    if mode == "SD":
        pipe = StableDiffusionImg2ImgPipeline.from_pretrained(model_id, **load_kwargs).to(device)
    elif mode == "SDXL":
        pipe = StableDiffusionXLImg2ImgPipeline.from_pretrained(model_id, **load_kwargs).to(device)
    else:
        is_xl = mode == "Control_SDXL"
        default_controlnet_id = "diffusers/controlnet-canny-sdxl-1.0" if is_xl else "lllyasviel/sd-controlnet-canny"
        controlnet = ControlNetModel.from_pretrained(
            default_controlnet_id if control_model_id is None else control_model_id, **load_kwargs
        ).to(device)

        if is_xl:
            # The SDXL pipeline is paired with a separate fp16-fix VAE checkpoint.
            vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", **load_kwargs).to(device)
            pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
                model_id, controlnet=controlnet, vae=vae, **load_kwargs
            ).to(device)
        else:
            pipe = StableDiffusionControlNetPipeline.from_pretrained(
                model_id, controlnet=controlnet, **load_kwargs
            ).to(device)
        pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
        # pipe.enable_model_cpu_offload()

    call_kwargs = {
        "prompt": prompt,
        "negative_prompt": negative_prompt,
        "image": image,
        "guidance_scale": 8.0,
        "num_inference_steps": 50,
    }
    if not uses_controlnet:
        # Only the img2img pipelines are called with a denoising strength.
        call_kwargs["strength"] = 0.5

    if lora_id and lora_name:
        pipe.load_lora_weights(lora_id, weight_name=lora_name)
        pipe.to(device)
        call_kwargs["cross_attention_kwargs"] = {"scale": 0.5}

    # Seed right before the call so repeated runs use the same fixed seed.
    call_kwargs["generator"] = torch.manual_seed(123)
    return pipe(**call_kwargs).images[0]


# Example prompts used for other input images (kept for reference; uncomment one to use it).
# prompt = "A man is driving in the car, indoor, best quality, extremely detailed, clearness, naturalness, film grain, crystal clear, photo with color, actuality"

# sample_6.png
# prompt = "a black car driving down a street next to a bridge, outdoor, best quality, extremely detailed, clearness, naturalness, film grain, crystal clear, photo with color, actuality"

# img_3.png
# prompt = "a yellow car driving down a street next to tall buildings, outdoor, best quality, extremely detailed, clearness, naturalness, film grain, crystal clear, photo with color, actuality"

# /data/noah/dataset/gta2cityscapes/A/00036.png
# prompt = "a truck driving down a highway next to a car, outdoor, best quality, extremely detailed, clearness, naturalness, film grain, crystal clear, photo with color, actuality"

# /data/noah/inference/gta2cityscapes/00086.png
# prompt = "a car driving down a street next to palm trees, outdoor, best quality, extremely detailed, clearness, naturalness, film grain, crystal clear, photo with color, actuality"

# /data/noah/dataset/gta2cityscapes/A/00065.png
# prompt = "a car driving down a city street at night, outdoor, best quality, extremely detailed, clearness, naturalness, film grain, crystal clear, photo with color, actuality"

# /data/noah/dataset/gta2cityscapes/A/00348.png
# prompt = "a red truck driving down a street next to tall building, outdoor, best quality, extremely detailed, clearness, naturalness, film grain, crystal clear, photo with color, actuality"

# Prompts shared by the single-image cells below.
prompt = "a car driving"
negative_prompt = "cartoon, anime, painting, disfigured, immature, blur, picture, 3D, render, semi-realistic, drawing, poorly drawn, bad anatomy, wrong anatomy, gray scale, worst quality, low quality, sketch"

# `origin` keeps the full-size RGB input; `init_image` is the 768x512 copy fed to the SD cells.
origin = Image.open("/data/noah/inference/reimagine/input/2021-09-24-17-50-40_Front_1632473437550.png").convert("RGB")
init_image = origin.resize((768, 512))

init_image.save("./1.init_image.jpg")

# Bare expression: displays the image as the notebook cell output.
init_image

**2. Load the Stable Diffusion v1.4 model**

Here is the Stable Diffusion v1.4 [link](https://huggingface.co/CompVis/stable-diffusion-v1-4/tree/main).


In [None]:
# Baseline: img2img (default mode "SD") with the stock CompVis Stable Diffusion v1-4 checkpoint.
model_id = "CompVis/stable-diffusion-v1-4"
image = inference_single(model_id, prompt, negative_prompt, init_image, device)
image.save("./2.Stable Diffusion.jpg")

# Bare expression: displays the result as the notebook cell output.
image

**3. Load the Realistic Vision model based on Stable Diffusion v1.5**

**here is the Realistic Vision [link](https://civitai.com/models/4201/realistic-vision-v20)**


In [None]:
# Same img2img call as the baseline, but with the local Realistic Vision checkpoint.
model_id = "/data/noah/ckpt/pretrain_ckpt/StableDiffusion/rv"
image = inference_single(model_id, prompt, negative_prompt, init_image, device)
image.save("./3.Realistic Vision.jpg")
image  # displayed as the notebook cell output

**4. Load the ControlNet based on Realistic Vision**

**See this [link](https://huggingface.co/docs/diffusers/v0.21.0/en/training/controlnet) for details on training ControlNet and this [link](https://huggingface.co/docs/diffusers/v0.21.0/en/using-diffusers/controlnet#multicontrolnet) for details on ControlNet inference.**


In [None]:
import cv2
from PIL import Image
import numpy as np

# Lower / upper thresholds passed to cv2.Canny.
low_threshold = 100
high_threshold = 200
model_id = "/data/noah/ckpt/pretrain_ckpt/StableDiffusion/rv"
control_model_id = "/data/noah/ckpt/pretrain_ckpt/ControlNet/sd/"

# Single-channel edge map, replicated into three identical channels before the PIL conversion.
edge_map = cv2.Canny(np.array(init_image), low_threshold, high_threshold)
canny_image = Image.fromarray(np.stack([edge_map, edge_map, edge_map], axis=2))

image = inference_single(
    model_id,
    prompt,
    negative_prompt,
    canny_image,
    device,
    mode="Control_SD",
    control_model_id=control_model_id,
)
image.save("./4.ControlNet(SD)+RV.jpg")
image  # displayed as the notebook cell output

**5. Load Fine Tuned Realistic Vision Model**

See this [link](https://huggingface.co/docs/diffusers/training/text2image) for details on the text-to-image training process.

To train on your own dataset, first build a dataset of image–text pairs, generating the text with an image-captioning model ([BLIP](https://github.com/salesforce/BLIP)).

Then train the text-to-image model (Realistic Vision) with the [training script](https://github.com/huggingface/diffusers/tree/main/examples/text_to_image).


In [None]:
# img2img (default mode "SD") with the fine-tuned Realistic Vision checkpoint.
model_id = "/data/noah/ckpt/finetuning/Realistic_AD"
image = inference_single(model_id, prompt, negative_prompt, init_image, device)
image.save("./5.Fine Tuning(Realistic Vision).jpg")
image  # displayed as the notebook cell output

**6. Load the LoRA model based on Detail Tweaker**

**here is the detail tweaker [link](https://civitai.com/models/58390/detail-tweaker-lora-lora)**


In [None]:
# # model_id = "/data/noah/ckpt/pretrain_ckpt/StableDiffusion/rv"
# model_id = "/data/noah/ckpt/finetuning/Realistic_AD"
# lora_id = "/data/noah/ckpt/pretrain_ckpt/StableDiffusion/lora/"
# lora_name = "add_detail.safetensors"
# image = inference_single(model_id, prompt + ", <lora:add_detail:1>", negative_prompt, init_image, device, lora_id=lora_id, lora_name=lora_name)
# image.save('./6.Fine Tuning(Realistic Vision) + Detail Tweaker.jpg')
# image

**7. Load the Fine Tuned Realistic Vision v5.1 based ControlNet**


In [None]:
import cv2
from PIL import Image
import numpy as np

# Lower / upper thresholds passed to cv2.Canny.
low_threshold = 100
high_threshold = 200
model_id = "/data/noah/ckpt/pretrain_ckpt/StableDiffusion/rv"
control_model_id = "/data/noah/ckpt/finetuning/Control_RV_AD_prompt/checkpoint-25000/controlnet"

# Single-channel edge map, replicated into three identical channels before the PIL conversion.
edge_map = cv2.Canny(np.array(init_image), low_threshold, high_threshold)
canny_image = Image.fromarray(np.stack([edge_map, edge_map, edge_map], axis=2))

image = inference_single(
    model_id,
    prompt,
    negative_prompt,
    canny_image,
    device,
    mode="Control_SD",
    control_model_id=control_model_id,
)
image.save("./7.Fine Tuning ControlNet(SD)+RV.jpg")
image  # displayed as the notebook cell output

**8. Load the Realistic Vision XL based on SDXL**

here is the Realistic Vision XL [link](https://civitai.com/models/139562?modelVersionId=154590)


In [None]:
# Re-create the working image at 768x768 for the SDXL img2img run (overwrites the earlier 768x512 init_image).
init_image = origin.resize((768, 768))

model_id = "/data/noah/ckpt/pretrain_ckpt/StableDiffusion/rvxl_v2"
image = inference_single(model_id, prompt, negative_prompt, init_image, device, mode="SDXL")
image.save("./8.Realistic Vision XL.jpg")
image  # displayed as the notebook cell output

**9. Load the Fine Tuned Realistic Vision XL Model**


In [None]:
# model_id = "/data/noah/ckpt/finetuning/Realistic_XL_AD/"
# image = inference_single(model_id, prompt,negative_prompt,init_image, device, mode='SDXL')
# image.save('./9.Fine Tuning(Realistic Vision XL).jpg')
# image

**10. Load the ControlNet with Realistic Vision XL**


In [None]:
import cv2
from PIL import Image
import numpy as np

# Lower / upper thresholds passed to cv2.Canny.
low_threshold = 100
high_threshold = 200
# 1024x1024 working copy of the input for the SDXL ControlNet run.
init_image = origin.resize((1024, 1024))
model_id = "/data/noah/ckpt/pretrain_ckpt/StableDiffusion/rvxl_v2"
control_model_id = "/data/noah/ckpt/pretrain_ckpt/ControlNet/sdxl"

# Single-channel edge map, replicated into three identical channels before the PIL conversion.
edge_map = cv2.Canny(np.array(init_image), low_threshold, high_threshold)
canny_image = Image.fromarray(np.stack([edge_map, edge_map, edge_map], axis=2))

image = inference_single(
    model_id,
    prompt,
    negative_prompt,
    canny_image,
    device,
    mode="Control_SDXL",
    control_model_id=control_model_id,
)
image.save("./10.ControlNet(SDXL)_RVXL.jpg")
image  # displayed as the notebook cell output

**11. Load the Fine-Tuned ControlNet with Realistic Vision XL**


In [None]:
import cv2
from PIL import Image
import numpy as np

# Lower / upper thresholds passed to cv2.Canny.
low_threshold = 100
high_threshold = 200

# Image.thumbnail() resizes in place, so work on a copy: shrinking `origin` itself would
# silently change the input of every other cell that is (re-)run afterwards.
thumbnail_image = origin.copy()
thumbnail_image.thumbnail((1024, 1024))

# model_id = 'stabilityai/stable-diffusion-xl-base-1.0'
model_id = "/data/noah/ckpt/pretrain_ckpt/StableDiffusion/sdxl"
control_model_id = "/data/noah/ckpt/finetuning/Control_SDXL_AD/controlnet_70k"

# Single-channel edge map, replicated into three identical channels before the PIL conversion.
canny_image = cv2.Canny(np.array(thumbnail_image), low_threshold, high_threshold)
canny_image = canny_image[:, :, None]
canny_image = np.concatenate([canny_image, canny_image, canny_image], axis=2)
canny_image = Image.fromarray(canny_image)

image = inference_single(
    model_id, prompt, negative_prompt, canny_image, device, mode="Control_SDXL", control_model_id=control_model_id
)
# Single ".jpg" extension (the previous name ended in ".jpg.jpg").
image.save("./11.Fine Tuning(ControlNet + Realistic Vision).jpg")
image  # displayed as the notebook cell output

**12. Batch Processing of Fine Tuned Stable Diffusion XL v1.0 + ControlNet and Inference With Realistic Vision v2.0**


In [None]:
import os

import cv2
import numpy as np
from PIL import Image

import torch
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode

from diffusers import (
    StableDiffusionImg2ImgPipeline,
    StableDiffusionXLImg2ImgPipeline,
    StableDiffusionControlNetPipeline,
    StableDiffusionXLControlNetPipeline,
    StableDiffusionXLControlNetImg2ImgPipeline,
    ControlNetModel,
    UniPCMultistepScheduler,
    AutoencoderKL,
)
from diffusers.blip.models.blip import blip_decoder


def load_demo_image(image_path, image_size, device):
    """Load an image file as a normalized, batched tensor on ``device``.

    The file is opened with PIL and converted to RGB, resized to
    ``image_size`` x ``image_size`` with bicubic interpolation, converted to a
    tensor, normalized per channel, given a leading batch dimension, and
    moved to ``device``.
    """
    # Per-channel normalization statistics applied after ToTensor().
    channel_mean = (0.48145466, 0.4578275, 0.40821073)
    channel_std = (0.26862954, 0.26130258, 0.27577711)

    preprocess = transforms.Compose(
        [
            transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
            transforms.ToTensor(),
            transforms.Normalize(channel_mean, channel_std),
        ]
    )

    rgb_image = Image.open(image_path).convert("RGB")
    batch = preprocess(rgb_image).unsqueeze(0)
    return batch.to(device)


def generate_pipeline(model_id, device, mode="SD", control_model_id=None):
    """Load the diffusion pipeline for ``mode`` and move it to ``device``.

    Args:
        model_id: Hub id or local path of the base Stable Diffusion / SDXL checkpoint.
        device: Device every model component is moved to.
        mode: One of "SD", "SDXL", "Control_SD", "Control_SDXL".
        control_model_id: Optional ControlNet checkpoint; when ``None`` a public Canny
            ControlNet matching the mode is used.

    Returns:
        The loaded pipeline. "Control_SDXL" yields a ControlNet img2img pipeline;
        "Control_SD" yields a plain ControlNet pipeline. Both ControlNet modes use a
        UniPC multistep scheduler.

    Raises:
        ValueError: If ``mode`` is not one of the supported values.
    """
    # Validate first so an unknown mode fails before any checkpoint is loaded.
    if mode not in ("SD", "SDXL", "Control_SD", "Control_SDXL"):
        raise ValueError('mode must be in ["SD", "SDXL", "Control_SD", "Control_SDXL"]')

    # Shared loading options for every component (half precision, safetensors weights).
    load_kwargs = {"torch_dtype": torch.float16, "use_safetensors": True}

    if mode == "SD":
        return StableDiffusionImg2ImgPipeline.from_pretrained(model_id, **load_kwargs).to(device)
    if mode == "SDXL":
        return StableDiffusionXLImg2ImgPipeline.from_pretrained(model_id, **load_kwargs).to(device)

    is_xl = mode == "Control_SDXL"
    default_controlnet_id = "diffusers/controlnet-canny-sdxl-1.0" if is_xl else "lllyasviel/sd-controlnet-canny"
    controlnet = ControlNetModel.from_pretrained(
        default_controlnet_id if control_model_id is None else control_model_id, **load_kwargs
    ).to(device)

    if is_xl:
        # The SDXL pipeline is paired with a separate fp16-fix VAE checkpoint.
        vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", **load_kwargs).to(device)
        pipe = StableDiffusionXLControlNetImg2ImgPipeline.from_pretrained(
            model_id, controlnet=controlnet, vae=vae, **load_kwargs
        ).to(device)
    else:
        pipe = StableDiffusionControlNetPipeline.from_pretrained(
            model_id, controlnet=controlnet, **load_kwargs
        ).to(device)

    pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
    # pipe.enable_model_cpu_offload()
    return pipe


def inference_sim2real(pipe, prompt, negative_prompt, image, control_image, mode):
    """Run one sim-to-real generation with an already-built pipeline.

    Args:
        pipe: Pipeline to call (e.g. the object returned by ``generate_pipeline``).
        prompt: Positive text prompt.
        negative_prompt: Negative text prompt.
        image: Source image to translate.
        control_image: ControlNet conditioning image (the notebook uses a Canny edge map);
            ignored by the non-ControlNet modes.
        mode: Mode the pipeline was built with; selects which arguments are passed.

    Returns:
        The first image of the pipeline output.
    """
    call_kwargs = {
        "prompt": prompt,
        "negative_prompt": negative_prompt,
        "guidance_scale": 8.0,
        "num_inference_steps": 50,
    }

    if mode == "Control_SDXL":
        # ControlNet img2img pipeline: needs the source image AND the conditioning image.
        call_kwargs["image"] = image
        call_kwargs["control_image"] = control_image
    elif mode.startswith("Control"):
        # The plain ControlNet pipeline takes the conditioning image as `image` and has no
        # `control_image` parameter, so that keyword must not be forwarded.
        call_kwargs["image"] = control_image if control_image is not None else image
    else:
        # Plain img2img: denoise the source image with a fixed strength.
        call_kwargs["image"] = image
        call_kwargs["strength"] = 0.5

    # Seed right before the call so every image is generated with the same fixed seed.
    call_kwargs["generator"] = torch.manual_seed(123)
    return pipe(**call_kwargs).images[0]


# Optional BLIP captioner (disabled). Enable it together with the commented caption lines
# in the loop below to caption each image instead of using the fixed prompt.
# clip_device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
# clip_image_size = 512
# clip_model_path = "/data/noah/ckpt/pretrain_ckpt/BLIP/model_large_caption.pth"
# clip_model = blip_decoder(pretrained=clip_model_path, image_size=clip_image_size, vit="large")
# clip_model.eval()
# clip_model = clip_model.to(clip_device)

# Build the pipeline once and reuse it for every image in the batch.
sim2real_device = "cuda:3"
sim2real_mode = "Control_SDXL"
sim2real_model_id = "/data/noah/ckpt/pretrain_ckpt/StableDiffusion/sdxl"
sim2real_control_model_id = "/data/noah/ckpt/finetuning/Control_SDXL_AD/controlnet_70k"
sim2real_pipe = generate_pipeline(
    sim2real_model_id, sim2real_device, mode=sim2real_mode, control_model_id=sim2real_control_model_id
)

image_dir = "/data/noah/inference/sim2real/_input"
out_dir = "/data/noah/inference/sim2real/output"
# Lower / upper thresholds passed to cv2.Canny.
low_threshold = 100
high_threshold = 200
prompt = "{} ,outdoor, best quality, extremely detailed, clearness, naturalness, film grain, crystal clear, photo with color, actuality"
negative_prompt = "cartoon, anime, painting, disfigured, immature, blur, picture, 3D, render, semi-realistic, drawing, poorly drawn, bad anatomy, wrong anatomy, gray scale, worst quality, low quality, sketch"

# The caption is fixed in this cell, so the full prompt is the same for every image.
sim2real_prompt = prompt.format("cars are driving on the road")

# Image.save() does not create missing directories.
os.makedirs(out_dir, exist_ok=True)

for filename in sorted(os.listdir(image_dir)):
    image_path = os.path.join(image_dir, filename)
    if not os.path.isfile(image_path):
        # Skip sub-directories; only regular files are treated as input images.
        continue

    # clip_image = load_demo_image(image_path=image_path, image_size=clip_image_size, device=clip_device)
    sim2real_image = Image.open(image_path).convert("RGB")
    sim2real_image = sim2real_image.resize((1024, 1024))

    # Single-channel edge map, replicated into three identical channels before the PIL conversion.
    sim2real_canny_image = cv2.Canny(np.array(sim2real_image), low_threshold, high_threshold)
    sim2real_canny_image = sim2real_canny_image[:, :, None]
    sim2real_canny_image = np.concatenate([sim2real_canny_image, sim2real_canny_image, sim2real_canny_image], axis=2)
    sim2real_canny_image = Image.fromarray(sim2real_canny_image)

    with torch.no_grad():
        # caption = clip_model.generate(clip_image, sample=False, num_beams=3, max_length=40, min_length=5)[0]
        # caption = clip_model.generate(clip_image, sample=True, top_p=0.9, max_length=20, min_length=5)[0]

        sim2real_result_image = inference_sim2real(
            sim2real_pipe,
            sim2real_prompt,
            negative_prompt,
            image=sim2real_image,
            control_image=sim2real_canny_image,
            mode=sim2real_mode,
        )

    # The output keeps the input file name.
    sim2real_result_image.save(os.path.join(out_dir, filename))

In [None]:
import os

import cv2
from controlnet_aux.lineart import LineartDetector
import numpy as np
from PIL import Image

import torch
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode

from diffusers import (
    StableDiffusionImg2ImgPipeline,
    StableDiffusionXLImg2ImgPipeline,
    StableDiffusionControlNetPipeline,
    StableDiffusionXLControlNetImg2ImgPipeline,
    StableDiffusionXLControlNetPipeline,
    ControlNetModel,
    UniPCMultistepScheduler,
    AutoencoderKL,
)
from diffusers.blip.models.blip import blip_decoder


def load_demo_image(image_path, image_size, device):
    """Load an image file as a normalized, batched tensor on ``device``.

    The file is opened with PIL and converted to RGB, resized to
    ``image_size`` x ``image_size`` with bicubic interpolation, converted to a
    tensor, normalized per channel, given a leading batch dimension, and
    moved to ``device``.
    """
    # Per-channel normalization statistics applied after ToTensor().
    channel_mean = (0.48145466, 0.4578275, 0.40821073)
    channel_std = (0.26862954, 0.26130258, 0.27577711)

    preprocess = transforms.Compose(
        [
            transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
            transforms.ToTensor(),
            transforms.Normalize(channel_mean, channel_std),
        ]
    )

    rgb_image = Image.open(image_path).convert("RGB")
    batch = preprocess(rgb_image).unsqueeze(0)
    return batch.to(device)


def generate_pipeline(model_id, device, mode="SD", control_model_id=None):
    """Load the diffusion pipeline for ``mode`` and move it to ``device``.

    Args:
        model_id: Hub id or local path of the base Stable Diffusion / SDXL checkpoint.
        device: Device every model component is moved to.
        mode: One of "SD", "SDXL", "Control_SD", "Control_SDXL".
        control_model_id: Optional ControlNet checkpoint; when ``None`` a public Canny
            ControlNet matching the mode is used.

    Returns:
        The loaded pipeline. "Control_SDXL" yields a ControlNet img2img pipeline;
        "Control_SD" yields a plain ControlNet pipeline. Both ControlNet modes use a
        UniPC multistep scheduler.

    Raises:
        ValueError: If ``mode`` is not one of the supported values.
    """
    # Validate first so an unknown mode fails before any checkpoint is loaded.
    if mode not in ("SD", "SDXL", "Control_SD", "Control_SDXL"):
        raise ValueError('mode must be in ["SD", "SDXL", "Control_SD", "Control_SDXL"]')

    # Shared loading options for every component (half precision, safetensors weights).
    load_kwargs = {"torch_dtype": torch.float16, "use_safetensors": True}

    if mode == "SD":
        return StableDiffusionImg2ImgPipeline.from_pretrained(model_id, **load_kwargs).to(device)
    if mode == "SDXL":
        return StableDiffusionXLImg2ImgPipeline.from_pretrained(model_id, **load_kwargs).to(device)

    is_xl = mode == "Control_SDXL"
    default_controlnet_id = "diffusers/controlnet-canny-sdxl-1.0" if is_xl else "lllyasviel/sd-controlnet-canny"
    controlnet = ControlNetModel.from_pretrained(
        default_controlnet_id if control_model_id is None else control_model_id, **load_kwargs
    ).to(device)

    if is_xl:
        # The SDXL pipeline is paired with a separate fp16-fix VAE checkpoint.
        vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", **load_kwargs).to(device)
        pipe = StableDiffusionXLControlNetImg2ImgPipeline.from_pretrained(
            model_id, controlnet=controlnet, vae=vae, **load_kwargs
        ).to(device)
    else:
        pipe = StableDiffusionControlNetPipeline.from_pretrained(
            model_id, controlnet=controlnet, **load_kwargs
        ).to(device)

    pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
    # pipe.enable_model_cpu_offload()
    return pipe


def inference_sim2real(pipe, prompt, negative_prompt, image, control_image, mode):
    """Run one sim-to-real generation with an already-built pipeline.

    Args:
        pipe: Pipeline to call (e.g. the object returned by ``generate_pipeline``).
        prompt: Positive text prompt.
        negative_prompt: Negative text prompt.
        image: Source image to translate.
        control_image: ControlNet conditioning image (the notebook uses a Canny edge map);
            ignored by the non-ControlNet modes.
        mode: Mode the pipeline was built with; selects which arguments are passed.

    Returns:
        The first image of the pipeline output.
    """
    call_kwargs = {
        "prompt": prompt,
        "negative_prompt": negative_prompt,
        "guidance_scale": 8.0,
        "num_inference_steps": 50,
    }

    if mode == "Control_SDXL":
        # ControlNet img2img pipeline: needs the source image AND the conditioning image.
        # Previously `control_image` was accepted but never forwarded to the pipeline.
        call_kwargs["image"] = image
        call_kwargs["control_image"] = control_image
    elif mode.startswith("Control"):
        # The plain ControlNet pipeline takes the conditioning image as `image` and has no
        # `control_image` parameter; fall back to `image` when no conditioning image is given.
        call_kwargs["image"] = control_image if control_image is not None else image
    else:
        # Plain img2img: denoise the source image with a fixed strength.
        call_kwargs["image"] = image
        call_kwargs["strength"] = 0.5

    # Seed right before the call so every image is generated with the same fixed seed.
    call_kwargs["generator"] = torch.manual_seed(123)
    return pipe(**call_kwargs).images[0]


# BLIP captioner (the variables are named `clip_*`, but the model is built with `blip_decoder`).
clip_device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
clip_image_size = 512
clip_model_path = "/data/noah/ckpt/pretrain_ckpt/BLIP/model_large_caption.pth"
clip_model = blip_decoder(pretrained=clip_model_path, image_size=clip_image_size, vit="large")
clip_model.eval()
clip_model = clip_model.to(clip_device)

# Build the pipeline once and reuse it for every image in the batch.
sim2real_device = "cuda:3"
sim2real_mode = "Control_SDXL"
sim2real_model_id = "/data/noah/ckpt/pretrain_ckpt/StableDiffusion/sdxl"
sim2real_control_model_id = "/data/noah/ckpt/finetuning/Control_SDXL_AD/controlnet_25k"
sim2real_pipe = generate_pipeline(
    sim2real_model_id, sim2real_device, mode=sim2real_mode, control_model_id=sim2real_control_model_id
)

image_dir = "/data/noah/inference/sim2real/_input"
out_dir = "/data/noah/inference/sim2real/output"
# Lower / upper thresholds passed to cv2.Canny.
low_threshold = 100
high_threshold = 200
prompt = "{} ,outdoor, best quality, extremely detailed, clearness, naturalness, film grain, crystal clear, photo with color, actuality"
negative_prompt = "cartoon, anime, painting, disfigured, immature, blur, picture, 3D, render, semi-realistic, drawing, poorly drawn, bad anatomy, wrong anatomy, gray scale, worst quality, low quality, sketch"

# Image.save() does not create missing directories.
os.makedirs(out_dir, exist_ok=True)

for filename in sorted(os.listdir(image_dir)):
    image_path = os.path.join(image_dir, filename)
    if not os.path.isfile(image_path):
        # Skip sub-directories; only regular files are treated as input images.
        continue

    # Caption input: load the current FILE (not the directory) onto the captioner's device.
    clip_image = load_demo_image(image_path=image_path, image_size=clip_image_size, device=clip_device)

    sim2real_image = Image.open(image_path).convert("RGB")
    sim2real_image = sim2real_image.resize((1024, 1024))

    # Single-channel edge map, replicated into three identical channels before the PIL conversion.
    sim2real_canny_image = cv2.Canny(np.array(sim2real_image), low_threshold, high_threshold)
    sim2real_canny_image = sim2real_canny_image[:, :, None]
    sim2real_canny_image = np.concatenate([sim2real_canny_image, sim2real_canny_image, sim2real_canny_image], axis=2)
    sim2real_canny_image = Image.fromarray(sim2real_canny_image)

    with torch.no_grad():
        caption = clip_model.generate(clip_image, sample=False, num_beams=3, max_length=40, min_length=5)[0]
        # caption = clip_model.generate(clip_image, sample=True, top_p=0.9, max_length=20, min_length=5)[0]

        # Keyword arguments keep the source image, the conditioning image and the mode from
        # being shifted into the wrong parameters.
        sim2real_result_image = inference_sim2real(
            sim2real_pipe,
            prompt.format(caption),
            negative_prompt,
            image=sim2real_image,
            control_image=sim2real_canny_image,
            mode=sim2real_mode,
        )

    # The output keeps the input file name.
    sim2real_result_image.save(os.path.join(out_dir, filename))