<a href="https://colab.research.google.com/github/softmurata/colab_notebooks/blob/main/diffusers/SDE_DRAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q diffusers accelerate transformers

In [None]:
!pip install -q git+https://github.com/huggingface/diffusers.git

In [None]:
!wget https://static2.flymee.jp/product_images/c74a-110961/202206091032086835.jpg -O /content/input.jpg

In [None]:
import PIL
import torch
from diffusers import DDIMScheduler, DiffusionPipeline

# Build the "sde_drag" custom pipeline on top of the checkpoint below, reusing the
# DDIM scheduler configuration stored in the checkpoint's "scheduler" subfolder.
model_path = "SG161222/Realistic_Vision_V6.0_B1_noVAE"
scheduler = DDIMScheduler.from_pretrained(model_path, subfolder="scheduler")
pipe = DiffusionPipeline.from_pretrained(
    model_path,
    scheduler=scheduler,
    custom_pipeline="sde_drag",
    torch_dtype=torch.float16,
)
pipe.to("cuda")

# NOTE: the weights are already loaded in torch.float16 (see torch_dtype above), so the
# optional cast below stays disabled. float16 saves GPU memory but may compromise
# image quality; the original guidance is to avoid it when not training LoRA.
# pipe.to(torch.float16)

In [None]:
from transformers import AutoProcessor, AutoModelForUniversalSegmentation
# Segmentation checkpoint: the processor prepares model inputs and post-processes
# the outputs; the model itself is moved to the GPU after loading.
model_id = "shi-labs/oneformer_ade20k_swin_large"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForUniversalSegmentation.from_pretrained(model_id)
model = model.to("cuda")

In [9]:
def calculate_bounding_box(mask_image):
    """Return the bounding box of the largest contour found in a mask.

    The mask is thresholded at 127 (max value 255, ``cv2.THRESH_BINARY``), its
    external contours are extracted, and the contour with the largest area is
    enclosed in an upright rectangle.

    Parameters:
    - mask_image: mask array passed to ``cv2.threshold`` (the notebook passes a
      uint8 array holding 0/255 values).

    Returns:
    - ``(x, y, x + w, y + h)``: the top-left corner followed by the bottom-right
      corner of the rectangle, or ``None`` when no contour is found.
    """
    # Binarize the mask.
    _retval, binary_mask = cv2.threshold(mask_image, 127, 255, cv2.THRESH_BINARY)

    # Detect the outer contours.
    contours, _hierarchy = cv2.findContours(
        binary_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )

    # Nothing was detected: signal it to the caller with None.
    if not contours:
        return None

    # Keep the contour covering the largest area and wrap it in a rectangle.
    largest_contour = max(contours, key=cv2.contourArea)
    left, top, width, height = cv2.boundingRect(largest_contour)

    # Top-left and bottom-right coordinates.
    return left, top, left + width, top + height

In [None]:
from PIL import Image
import numpy as np
import cv2
# Run semantic segmentation on the input image and turn the "chair" class into a
# binary (0/255) mask.
img_path = '/content/input.jpg'
image = Image.open(img_path).convert("RGB")
inputs = processor(image, ["semantic"], return_tensors="pt").to("cuda")

# Forward pass without gradient tracking.
with torch.no_grad():
    outputs = model(**inputs)

# image.size[::-1] reverses PIL's (width, height) order -- presumably because
# target_sizes expects (height, width); confirm against the transformers docs.
predicted_semantic_map = (
    processor.post_process_semantic_segmentation(outputs, target_sizes=[image.size[::-1]])[0]
    .detach()
    .cpu()
    .numpy()
)

# 255 where the predicted label id equals the "chair" id, 0 everywhere else.
target_label_id = model.config.label2id["chair"]
is_target = predicted_semantic_map == target_label_id
target_label_map = np.where(is_target, 255, 0).astype(np.uint8)
display(Image.fromarray(target_label_map))

In [None]:
# calculate_bounding_box returns None when the mask contains no contour; check for
# that explicitly instead of failing with a cryptic tuple-unpacking TypeError.
bbox = calculate_bounding_box(target_label_map)
if bbox is None:
    raise ValueError("No region found in target_label_map; cannot compute a bounding box.")
xmin, ymin, xmax, ymax = bbox

# Show the part of the image covered by the bounding box.
display(image.crop((xmin, ymin, xmax, ymax)))

In [None]:
import cv2
def draw_point(image, point, color=(0, 255, 0), radius=5, thickness=-1):
    """
    Draw a point on an image.

    The point is rendered with ``cv2.circle`` on ``image`` itself; the function
    returns nothing.

    Parameters:
    - image: image to draw on
    - point: coordinates of the point to draw (x, y)
    - color: colour of the point (B, G, R)
    - radius: radius of the point
    - thickness: outline thickness of the point (a negative value fills it)
    """
    center = point
    cv2.circle(image, center, radius, color, thickness)


# Preview a single point on the input image. cv2.imread reports a missing or
# unreadable file by returning None instead of raising, so fail early with a
# clear message rather than inside cv2.circle.
base = cv2.imread('/content/input.jpg')
if base is None:
    raise FileNotFoundError("Could not read image: /content/input.jpg")
draw_point(base, (570, 600))

# The array is converted from BGR to RGB channel order before PIL displays it.
display(Image.fromarray(cv2.cvtColor(base, cv2.COLOR_BGR2RGB)))


In [None]:
# Inputs for drag editing: the prompt, the image, the mask image, and the starting
# points together with the target points they should be dragged to.
prompt = "chair"
image = Image.open('/content/input.jpg')

# The mask is 255 inside the bounding box (xmin, ymin, xmax, ymax) and 0 elsewhere.
mask_array = np.zeros_like(target_label_map)
mask_array[ymin:ymax, xmin:xmax] = 255
mask_image = Image.fromarray(mask_array)

# Each entry pairs a starting point with its target point (presumably (x, y) pixel
# coordinates, like the point previewed with draw_point -- confirm).
drag_pairs = [
    ([400, 400], [250, 400]),
    ([340, 600], [300, 600]),
    ([570, 600], [530, 600]),
]
source_points = [src for src, _dst in drag_pairs]
target_points = [dst for _src, dst in drag_pairs]

# train_lora is optional, and in most cases, using train_lora can better preserve consistency with the original image.
# pipe.train_lora(prompt, image)

# The pipeline output is wrapped in a PIL image for display.
output = pipe(prompt, image, mask_image, source_points, target_points)
output_image = Image.fromarray(output)
display(output_image)