# Generate Source Image and Mask of Person

In [None]:
import os
import json
import numpy as np
from PIL import Image
import cv2
from tqdm import tqdm
import csv

# 76 ~ 659

def crop_from_mask(image, mask, padding=50):
    # Find contours in the mask
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # If no contours found, return original image
    if not contours:
        return image

    # Find the bounding box of the largest contour
    x, y, w, h = cv2.boundingRect(contours[0])
    
    # padding
    x = x-padding//2
    y = y-padding//2
    w = w+padding
    h = h+padding
    
    x = 0 if x<0 else x
    y = 0 if y<0 else y
    
    if x+w>image.shape[1]:
        w = image.shape[1]-x

    if y+h>image.shape[0]:
        h = image.shape[0]-y

    # Crop the image using the bounding box
    cropped_image = image[y : y + h, x : x + w]
    cropped_mask = mask[y : y + h, x : x + w]

    return cropped_image, cropped_mask


annotation_path = "/data/noah/dataset/ad_human/anno"
out_mask_path = "/data/noah/inference/magna_human_premask/_masks"
out_image_path = "/data/noah/inference/magna_human_premask/_images"
padding = 200
threshold = 200 + padding
cnt = 0
target_class = "pedestrian"

instance_infos = []

for anno_name in tqdm(os.listdir(annotation_path)):
    anno_path = os.path.join(annotation_path, anno_name)

    with open(anno_path, "r") as f:
        annotation = json.load(f)

    image_path = os.path.join(annotation["parent_path"], annotation["filename"])
    image = Image.open(image_path).convert("RGB")

    # generate crop mask and image
    target_idxs = []
    for idx, anno in enumerate(annotation["annotations"]):
        if (
            anno["label"] == target_class
            and anno["attributes"]["occlusion"] == "0"
            and anno["attributes"]["truncation"] == "0"
        ):
            target_idxs.append(idx)

    for target_idx in target_idxs:
        mask = np.zeros((image.height, image.width))
        point = np.array(annotation["annotations"][target_idx]["points"], dtype=np.int32)
        try:
            mask = cv2.fillPoly(mask, [point], color=255)
        except:
            continue

        cnt += 1
        crop_image, crop_mask = crop_from_mask(np.array(image), mask.astype("uint8"), padding=200)
        crop_image = Image.fromarray(crop_image)
        crop_mask = Image.fromarray(crop_mask)
        
        if crop_image.height < threshold:
            continue

        crop_image.save(os.path.join(out_image_path, "{}.png".format(cnt)))
        crop_mask.save(os.path.join(out_mask_path, "{}.png".format(cnt)))
        info = {
            "image_path": image_path,
            "image_height": crop_mask.height,
        }
        instance_infos.append(info)

print("done")

# Person Setting

In [None]:
import os
from tqdm import tqdm

import cv2
from PIL import Image
import numpy as np

import torch
from controlnet_aux.processor import MidasDetector
from diffusers import StableDiffusionControlNetInpaintPipeline, ControlNetModel, DDIMScheduler

def make_dirs(paths):
    for path in paths:
        os.makedirs(path, exist_ok=True)

def make_grid(images, rows, cols):
    w, h = images[0].size
    grid = Image.new("RGB", size=(cols * w, rows * h))
    for i, image in enumerate(images):
        grid.paste(image, box=(i % cols * w, i // cols * h))
    return grid

def closest_multiple_of_8(number):
    closest_multiple = (number // 8) * 8  # 가장 가까운 8의 배수
    return closest_multiple

device = 'cuda:3'
instance_height = 1000
k = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
image_dir = '/data/noah/inference/magna_human_premask/images'
mask_dir = '/data/noah/inference/magna_human_premask/masks'
out_image_dir = '/data/noah/inference/magna_object/person/images'
out_mask_dir = '/data/noah/inference/magna_object/person/masks'
make_dirs([out_image_dir, out_mask_dir])

prompt_types = ["a person", "a woman", "a man", "a black person", "a white person", "a young person", "a old person"]
prompts = ["{}, RAW photo, subject, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3, <lora:add-detail:1>".format(ptype) for ptype in prompt_types]
negative_prompts = ["(deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime), blurry, text, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck, UnrealisticDream"]*len(prompt_types)
num_inference_steps = 40
guidance_scale = 7.5
strength = 1.0
sag_scale = 0.75
controlnet_conditioning_scale = 0.5
padding_mask_crop=0
num_images_per_prompt=1
generation_cnt = 1000
iter_cnt = generation_cnt//(len(prompt_types)*num_images_per_prompt)

model_id = "/data/noah/ckpt/pretrain_ckpt/StableDiffusion/rv_inpaint_5.1"
controlnet_id = "/data/noah/ckpt/finetuning/controlnet_inpaint_coco_rider/checkpoint-21000/controlnet"
lora_id = "/data/noah/ckpt/pretrain_ckpt/StableDiffusion/lora_detail"
controlnet = ControlNetModel.from_pretrained(controlnet_id, torch_dtype=torch.float16)
pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained(
    model_id, controlnet=controlnet, torch_dtype=torch.float16
).to(device)
pipe.load_lora_weights(lora_id, weight_name="add_detail.safetensors")
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
pipe.enable_freeu(s1=1.2, s2=0.5, b1=1.2, b2=1.4)


midas = MidasDetector.from_pretrained("lllyasviel/Annotators").to(device)


# Person Generation

In [11]:
for _ in range(iter_cnt):
    for name in tqdm(os.listdir(image_dir)):
        image_path = os.path.join(image_dir, name)
        mask_path = os.path.join(mask_dir, name)
        
        image = Image.open(image_path)
        mask = Image.open(mask_path)
        
        # generate condition image
        con_image = midas(image, image_resolution=image.height)

        height = closest_multiple_of_8(1024)
        ratio = instance_height/image.height
        width = closest_multiple_of_8(int(ratio*image.width))
        
        image = image.resize((width, height))
        mask = mask.resize((width, height))
        con_image = con_image.resize((width, height))

        # mask boundary refinement
        mask = np.array(mask)
        mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, k, iterations=3)    
        spots = np.argwhere(mask == 255)

        image = np.array(image)
        masked_image = np.ones((image.shape))*255

        for spot in spots:
            masked_image[spot[0], spot[1], :] = image[spot[0], spot[1], :]

        mask = Image.fromarray(mask.astype('uint8')).convert('L')
        masked_image = Image.fromarray(masked_image.astype('uint8'))
        image = Image.fromarray(image.astype('uint8'))
                
        result_images = pipe(
                prompt=prompts,
                negative_prompt=negative_prompts,
                image=masked_image,
                control_image=con_image,
                mask_image=mask,
                height=height,
                width=width,
                num_inference_steps=num_inference_steps,
                guidance_scale=guidance_scale,
                strength=strength,
                sag_scale=sag_scale,
                controlnet_conditioning_scale=controlnet_conditioning_scale,
                padding_mask_crop=padding_mask_crop,
                num_images_per_prompt=num_images_per_prompt
            ).images

        for idx, result_image in enumerate(result_images):
            prompt_type = prompt_types[idx]
            output_image_path = os.path.join(out_image_dir, name[:-4]+'_{}_{}'.format(prompt_type, _)+name[-4:])
            output_mask_path = os.path.join(out_mask_dir, name[:-4]+'_{}_{}'.format(prompt_type, _)+name[-4:])
            result_image.save(output_image_path)
            mask.save(output_mask_path)
            
    

  0%|          | 0/63 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/63 [02:40<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.67 GiB (GPU 3; 23.70 GiB total capacity; 17.56 GiB already allocated; 1.44 GiB free; 21.92 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF