Usage:
1. init
2. load reference image and get pose
3. test a single image with a single prompt (StableIdentity & ControlNet)
4. test a single image with prompts (StableIdentity & ControlNet)
5. test all images with prompts (StableIdentity & ControlNet)

1. init

In [None]:
import torch
import os
from transformers import ViTModel, ViTImageProcessor
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, DPMSolverMultistepScheduler
from utils import latents_to_images, downsampling
from omegaconf import OmegaConf
from accelerate.utils import set_seed
from tqdm import tqdm
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput
from PIL import Image
from models.celeb_embeddings import embedding_forward
from controlnet_aux import OpenposeDetector
from diffusers.utils import load_image, make_image_grid
import numpy as np

# Fixed seed passed to accelerate's `set_seed` so repeated runs of the notebook
# sample the same images.
seed = 42
set_seed(seed)

# Pin GPU 0 only when CUDA is actually present. The notebook later picks
# `device` with a CPU fallback, and an unconditional `set_device(0)` would
# raise on a CPU-only machine before that fallback is ever reached.
if torch.cuda.is_available():
    torch.cuda.set_device(0)

# ---- locations of the pretrained checkpoints (edit these for your machine) ----

# ViT face-recognition checkpoint
vit_face_path = "/home/user/.cache/huggingface/hub/vit-base-patch16-224-in21k-face-recognition"

# Stable Diffusion 2.1 snapshot directory
model_path = "/home/user/.cache/huggingface/hub/models--stabilityai--stable-diffusion-2-1/snapshots/5cae40e6a2745ae2b01ad92ae5043f95f23644d6"

# ControlNet (OpenPose) weights in diffusers layout
# source: https://huggingface.co/thibaud/controlnet-sd21-openpose-diffusers
controlnet_path = "/home/user/.cache/huggingface/hub/controlnetv11_21_diffuser"

# OpenPose detector weights
openpose_path = "models/openpose_models/"

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ---- load the pretrained models ----

# ViT face recognition: image processor plus the model itself, moved to `device`.
vit_face_recog_processor = ViTImageProcessor.from_pretrained(vit_face_path)
vit_face_recognition_model = ViTModel.from_pretrained(vit_face_path)
vit_face_recognition_model = vit_face_recognition_model.to(device)

# OpenPose detector used to turn a reference photo into a pose map.
open_pose = OpenposeDetector.from_pretrained(openpose_path)

# Stable Diffusion 2.1 wrapped in a ControlNet pipeline.
# Plain (no ControlNet) alternative, kept for reference:
# model_id = "stabilityai/stable-diffusion-2-1"
# pipe = StableDiffusionPipeline.from_pretrained(model_path)
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    model_path,
    controlnet=ControlNetModel.from_pretrained(controlnet_path),
)

# Swap the default scheduler for DPM-Solver multistep, reusing the config of the
# scheduler the pipeline was loaded with, then move the pipeline to `device`.
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
pipe = pipe.to(device)

2. load reference image and get pose

In [None]:
# Photo whose body pose will condition the ControlNet generations below.
pose_img_path = "datasets_face/pose_input/1.png"
ref_image = load_image(pose_img_path)
print("load_image", pose_img_path)

# Detect the pose at 512 resolution (both detection and output).
pose_image = open_pose(ref_image, detect_resolution=512, image_resolution=512)

# Reverse the last (channel) axis and cast to uint8 before rebuilding a PIL image.
# NOTE(review): presumably an RGB <-> BGR swap expected by this ControlNet
# checkpoint -- confirm against the checkpoint's model card.
pose_image = Image.fromarray(np.array(pose_image)[:, :, ::-1].astype(np.uint8))

# Last expression of the cell: shows the pose map inline in the notebook.
pose_image
# pose_image.save('use_pose.png')

3. test a single image with a single prompt

In [None]:
# Learned identity embedding for one subject.
test_emb_path = "experiments512/save_00059/00059-450.pt"

# Deserialize on the CPU and then move to the notebook's `device`. The previous
# hard-coded `.cuda()` ignored the CPU fallback chosen at init time and failed on
# machines without a GPU.
test_emb = torch.load(test_emb_path, map_location="cpu").to(device)

# The two identity word embeddings are indexed along the second axis.
# NOTE(review): assumes test_emb is (1, 2, dim) so each slice broadcasts into a
# single embedding row -- TODO confirm against the training script.
v1_emb = test_emb[:, 0]
v2_emb = test_emb[:, 1]

# Register the two placeholder tokens with the tokenizer and look up their ids.
tokens = ["v1*", "v2*"]
embeddings = [v1_emb, v2_emb]
pipe.tokenizer.add_tokens(tokens)
token_ids = pipe.tokenizer.convert_tokens_to_ids(tokens)

# Grow the text encoder's embedding table to cover the new ids, then overwrite
# the rows for the placeholder tokens with the learned identity embeddings.
pipe.text_encoder.resize_token_embeddings(len(pipe.tokenizer), pad_to_multiple_of=8)
embedding_table = pipe.text_encoder.get_input_embeddings().weight.data
for token_id, embedding in zip(token_ids, embeddings):
    embedding_table[token_id] = embedding

prompt = "v1* v2* wearing a Superman outfit, facing to camera, best quality, ultra high res"
# Optional: negative_prompt="ugly, blurry, bad, deformed, bad anatomy"
image = pipe(prompt, image=pose_image, guidance_scale=8.5).images[0]
# image.save(prompt + ".png")
image



4. test a single image with prompts

In [None]:
# Identity to test; selects both the embedding checkpoint and the output folder.
index = "00059"
save_dir = f"results/{index}/with_controlnet"
os.makedirs(save_dir, exist_ok=True)
test_emb_path = f"experiments512/save_{index}/{index}-450.pt"

# Deserialize on the CPU and then move to the notebook's `device`. The previous
# hard-coded `.cuda()` ignored the CPU fallback chosen at init time and failed on
# machines without a GPU.
test_emb = torch.load(test_emb_path, map_location="cpu").to(device)

# NOTE(review): assumes test_emb is (1, 2, dim) so each slice broadcasts into a
# single embedding row -- TODO confirm against the training script.
v1_emb = test_emb[:, 0]
v2_emb = test_emb[:, 1]

# ---- insert into tokenizer & embedding layer ----
tokens = ["v1*", "v2*"]
embeddings = [v1_emb, v2_emb]
# add tokens and get ids
pipe.tokenizer.add_tokens(tokens)
token_ids = pipe.tokenizer.convert_tokens_to_ids(tokens)

# Grow the text encoder's embedding table to cover the new ids, then overwrite
# the rows for the placeholder tokens with the learned identity embeddings.
pipe.text_encoder.resize_token_embeddings(len(pipe.tokenizer), pad_to_multiple_of=8)
embedding_table = pipe.text_encoder.get_input_embeddings().weight.data
for token_id, embedding in zip(token_ids, embeddings):
    embedding_table[token_id] = embedding

# NOTE(review): "v1* v2* wearing sunglasses, facing to camera, ..." appears twice
# in this list; both runs save to the same file name, so the second image
# overwrites the first. The list is left untouched so the sequence of sampled
# images stays the same as before.
prompts_list = [
    "a photo of v1* v2*, facing to camera, best quality, ultra high res",
    "v1* v2* wearing a Superman outfit, facing to camera, best quality, ultra high res",
    "v1* v2* wearing a spacesuit, facing to camera, best quality, ultra high res",
    "v1* v2* wearing a red sweater, facing to camera, best quality, ultra high res",
    "v1* v2* wearing a purple wizard outfit, facing to camera, best quality, ultra high res",
    "v1* v2* wearing a blue hoodie, facing to camera, best quality, ultra high res",
    "v1* v2* wearing headphones, facing to camera, best quality, ultra high res",
    "v1* v2* with red hair, facing to camera, best quality, ultra high res",
    "v1* v2* wearing headphones with red hair, facing to camera, best quality, ultra high res",
    "v1* v2* wearing a Christmas hat, facing to camera, best quality, ultra high res",
    "v1* v2* wearing sunglasses, facing to camera, best quality, ultra high res",
    "v1* v2* wearing sunglasses and necklace, facing to camera, best quality, ultra high res",
    "v1* v2* wearing a blue cap, facing to camera, best quality, ultra high res",
    "v1* v2* wearing a doctoral cap, facing to camera, best quality, ultra high res",
    "v1* v2* with white hair, wearing glasses, facing to camera, best quality, ultra high res",
    "v1* v2* in a helmet and vest riding a motorcycle, facing to camera, best quality, ultra high res",
    "v1* v2* holding a bottle of red wine, facing to camera, best quality, ultra high res",
    "v1* v2* driving a bus in the desert, facing to camera, best quality, ultra high res",
    "v1* v2* playing basketball, facing to camera, best quality, ultra high res",
    "v1* v2* playing the violin, facing to camera, best quality, ultra high res",
    "v1* v2* piloting a spaceship, facing to camera, best quality, ultra high res",
    "v1* v2* riding a horse, facing to camera, best quality, ultra high res",
    "v1* v2* coding in front of a computer, facing to camera, best quality, ultra high res",
    "v1* v2* laughing on the lawn, facing to camera, best quality, ultra high res",
    "v1* v2* frowning at the camera, facing to camera, best quality, ultra high res",
    "v1* v2* happily smiling, looking at the camera, facing to camera, best quality, ultra high res",
    "v1* v2* crying disappointedly, with tears flowing, facing to camera, best quality, ultra high res",
    "v1* v2* wearing sunglasses, facing to camera, best quality, ultra high res",
    "v1* v2* playing the guitar in the view of left side, facing to camera, best quality, ultra high res",
    "v1* v2* holding a bottle of red wine, upper body, facing to camera, best quality, ultra high res",
    "v1* v2* wearing sunglasses and necklace, close-up, in the view of right side, facing to camera, best quality, ultra high res",
    "v1* v2* riding a horse, in the view of the top, facing to camera, best quality, ultra high res",
    "v1* v2* wearing a doctoral cap, upper body, with the left side of the face facing the camera, best quality, ultra high res",
    "v1* v2* crying disappointedly, with tears flowing, with left side of the face facing the camera, best quality, ultra high res",
    "v1* v2* sitting in front of the camera, with a beautiful purple sunset at the beach in the background, best quality, ultra high res",
    "v1* v2* swimming in the pool, facing to camera, best quality, ultra high res",
    "v1* v2* climbing a mountain, facing to camera, best quality, ultra high res",
    "v1* v2* skiing on the snowy mountain, facing to camera, best quality, ultra high res",
    "v1* v2* in the snow, facing to camera, best quality, ultra high res",
    "v1* v2* in space wearing a spacesuit, facing to camera, best quality, ultra high res",
]

# One pose-conditioned image per prompt. The file name is the prompt with the
# placeholder tokens replaced by "a person".
for prompt in prompts_list:
    image = pipe(prompt, image=pose_image, guidance_scale=8.5).images[0]
    image.save(os.path.join(save_dir, prompt.replace("v1* v2*", "a person") + '.png'))

5. test all images with prompts

In [None]:
# Directory of test images; each entry's name (without its extension) is used as
# the identity index that selects the embedding checkpoint and the output folder.
folders = "datasets_face/test_data_demo"
# os.listdir returns entries in arbitrary order. Sorting keeps the processing
# order stable, which matters because sampling draws from a globally seeded RNG.
folder_names = sorted(os.listdir(folders))

# ---- one-time setup (independent of the identity being tested) ----
# Register the placeholder tokens and grow the text encoder's embedding table
# once, instead of repeating the same calls on every loop iteration.
tokens = ["v1*", "v2*"]
pipe.tokenizer.add_tokens(tokens)
token_ids = pipe.tokenizer.convert_tokens_to_ids(tokens)
pipe.text_encoder.resize_token_embeddings(len(pipe.tokenizer), pad_to_multiple_of=8)

# NOTE(review): "v1* v2* wearing sunglasses, facing to camera, ..." appears twice
# in this list; both runs save to the same file name, so the second image
# overwrites the first. The list is left untouched so the sequence of sampled
# images stays the same as before.
prompts_list = [
    "a photo of v1* v2*, facing to camera, best quality, ultra high res",
    "v1* v2* wearing a Superman outfit, facing to camera, best quality, ultra high res",
    "v1* v2* wearing a spacesuit, facing to camera, best quality, ultra high res",
    "v1* v2* wearing a red sweater, facing to camera, best quality, ultra high res",
    "v1* v2* wearing a purple wizard outfit, facing to camera, best quality, ultra high res",
    "v1* v2* wearing a blue hoodie, facing to camera, best quality, ultra high res",
    "v1* v2* wearing headphones, facing to camera, best quality, ultra high res",
    "v1* v2* with red hair, facing to camera, best quality, ultra high res",
    "v1* v2* wearing headphones with red hair, facing to camera, best quality, ultra high res",
    "v1* v2* wearing a Christmas hat, facing to camera, best quality, ultra high res",
    "v1* v2* wearing sunglasses, facing to camera, best quality, ultra high res",
    "v1* v2* wearing sunglasses and necklace, facing to camera, best quality, ultra high res",
    "v1* v2* wearing a blue cap, facing to camera, best quality, ultra high res",
    "v1* v2* wearing a doctoral cap, facing to camera, best quality, ultra high res",
    "v1* v2* with white hair, wearing glasses, facing to camera, best quality, ultra high res",
    "v1* v2* in a helmet and vest riding a motorcycle, facing to camera, best quality, ultra high res",
    "v1* v2* holding a bottle of red wine, facing to camera, best quality, ultra high res",
    "v1* v2* driving a bus in the desert, facing to camera, best quality, ultra high res",
    "v1* v2* playing basketball, facing to camera, best quality, ultra high res",
    "v1* v2* playing the violin, facing to camera, best quality, ultra high res",
    "v1* v2* piloting a spaceship, facing to camera, best quality, ultra high res",
    "v1* v2* riding a horse, facing to camera, best quality, ultra high res",
    "v1* v2* coding in front of a computer, facing to camera, best quality, ultra high res",
    "v1* v2* laughing on the lawn, facing to camera, best quality, ultra high res",
    "v1* v2* frowning at the camera, facing to camera, best quality, ultra high res",
    "v1* v2* happily smiling, looking at the camera, facing to camera, best quality, ultra high res",
    "v1* v2* crying disappointedly, with tears flowing, facing to camera, best quality, ultra high res",
    "v1* v2* wearing sunglasses, facing to camera, best quality, ultra high res",
    "v1* v2* playing the guitar in the view of left side, facing to camera, best quality, ultra high res",
    "v1* v2* holding a bottle of red wine, upper body, facing to camera, best quality, ultra high res",
    "v1* v2* wearing sunglasses and necklace, close-up, in the view of right side, facing to camera, best quality, ultra high res",
    "v1* v2* riding a horse, in the view of the top, facing to camera, best quality, ultra high res",
    "v1* v2* wearing a doctoral cap, upper body, with the left side of the face facing the camera, best quality, ultra high res",
    "v1* v2* crying disappointedly, with tears flowing, with left side of the face facing the camera, best quality, ultra high res",
    "v1* v2* sitting in front of the camera, with a beautiful purple sunset at the beach in the background, best quality, ultra high res",
    "v1* v2* swimming in the pool, facing to camera, best quality, ultra high res",
    "v1* v2* climbing a mountain, facing to camera, best quality, ultra high res",
    "v1* v2* skiing on the snowy mountain, facing to camera, best quality, ultra high res",
    "v1* v2* in the snow, facing to camera, best quality, ultra high res",
    "v1* v2* in space wearing a spacesuit, facing to camera, best quality, ultra high res",
]

for img_name in folder_names:
    # Strip whatever extension the file has; slicing off a fixed four characters
    # only worked for three-letter extensions such as ".png".
    index = os.path.splitext(img_name)[0]

    # ---- test single image with prompts_list ----
    test_emb_path = f"experiments512/save_{index}/{index}-450.pt"
    if not os.path.isfile(test_emb_path):
        # Stray directory entries or identities without an embedding checkpoint
        # should not abort the whole batch; report them and move on.
        print(f"skip {img_name}: embedding not found at {test_emb_path}")
        continue

    save_dir = f"results/{index}/with_controlnet"
    os.makedirs(save_dir, exist_ok=True)

    # Deserialize on the CPU and then move to the notebook's `device`. The
    # previous hard-coded `.cuda()` failed on machines without a GPU.
    test_emb = torch.load(test_emb_path, map_location="cpu").to(device)
    # NOTE(review): assumes test_emb is (1, 2, dim) so each slice broadcasts into
    # a single embedding row -- TODO confirm against the training script.
    v1_emb = test_emb[:, 0]
    v2_emb = test_emb[:, 1]

    # ---- insert into embedding layer ----
    # Overwrite the placeholder-token rows with this identity's embeddings.
    embeddings = [v1_emb, v2_emb]
    embedding_table = pipe.text_encoder.get_input_embeddings().weight.data
    for token_id, embedding in zip(token_ids, embeddings):
        embedding_table[token_id] = embedding

    # One pose-conditioned image per prompt. The file name is the prompt with the
    # placeholder tokens replaced by "a person".
    for prompt in prompts_list:
        image = pipe(prompt, image=pose_image, guidance_scale=8.5).images[0]
        image.save(os.path.join(save_dir, prompt.replace("v1* v2*", "a person") + '.png'))