<a href="https://colab.research.google.com/github/softmurata/colab_notebooks/blob/main/application/gligenadvancedapp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Installation

In [None]:
!pip install transformers accelerate scipy safetensors
!git clone https://github.com/gligen/diffusers.git
!pip install git+https://github.com/gligen/diffusers.git
# Installation for GroundingDINO
%cd /content
!git clone https://github.com/IDEA-Research/GroundingDINO.git
%cd /content/GroundingDINO
!pip install -q -e .
!pip install -q roboflow

Import Libraries

In [2]:
# import libraries
import cv2
from PIL import Image
import numpy as np
import os
import supervision as sv
import argparse
from functools import partial
import cv2
import requests

from io import BytesIO
from pathlib import Path
import random


import warnings
warnings.filterwarnings("ignore")


import torch
from torchvision.ops import box_convert

from groundingdino.models import build_model
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict
from groundingdino.util.inference import annotate, load_image, predict
import groundingdino.datasets.transforms as T

from huggingface_hub import hf_hub_download
from diffusers import StableDiffusionGLIGENPipeline

Utility functions

In [3]:
# utils function
def load_model_hf(repo_id, filename, ckpt_config_filename, device='cpu'):
    """Build a model from a config and a checkpoint hosted on the Hugging Face Hub.

    Args:
        repo_id: Hub repository that contains both files.
        filename: Name of the checkpoint file in the repository.
        ckpt_config_filename: Name of the model config file in the repository.
        device: Value stored on the parsed config as ``args.device`` after the
            model has been built. The checkpoint itself is always mapped to CPU.

    Returns:
        The model with the checkpoint weights loaded, switched to eval mode.
    """
    config_path = hf_hub_download(repo_id=repo_id, filename=ckpt_config_filename)
    args = SLConfig.fromfile(config_path)
    model = build_model(args)
    args.device = device

    weights_path = hf_hub_download(repo_id=repo_id, filename=filename)
    checkpoint = torch.load(weights_path, map_location='cpu')
    # strict=False: key mismatches are returned in the load report (printed below) instead of raising.
    load_report = model.load_state_dict(clean_state_dict(checkpoint['model']), strict=False)
    print("Model loaded from {} \n => {}".format(weights_path, load_report))

    model.eval()
    return model

def get_base_img(img, part, file_path="/content/part.png"):
  """Keep one half of ``img``, black out the other half, and save the result.

  Args:
    img: Image array of shape (H, W) or (H, W, C).
    part: "left", "right", "top" or "bottom" selects the half that is kept.
      Any other value keeps the whole image.
    file_path: Destination passed to ``cv2.imwrite``.

  Returns:
    Tuple ``(base_img, file_path)``. ``base_img`` has the shape of ``img``; for
    an unrecognised ``part`` it is ``img`` itself (not a copy).

  Raises:
    IOError: if ``cv2.imwrite`` reports that the file was not written.
  """
  h, w = img.shape[:2]
  # (row slice, column slice) of the region to keep. "top"/"bottom" split the
  # ROW axis at h // 2; "left"/"right" split the COLUMN axis at w // 2.
  regions = {
    "left": (slice(None), slice(None, w // 2)),
    "right": (slice(None), slice(w // 2, None)),
    "top": (slice(None, h // 2), slice(None)),
    "bottom": (slice(h // 2, None), slice(None)),
  }
  region = regions.get(part)
  if region is None:
    base_img = img
  else:
    base_img = np.zeros_like(img)
    base_img[region] = img[region]

  # cv2.imwrite returns False instead of raising when it cannot write the file.
  if not cv2.imwrite(file_path, base_img):
    raise IOError("Could not write image to {}".format(file_path))

  return base_img, file_path

def generate_masks_with_grounding(image_source, boxes):
    """Rasterise detection boxes into a mask shaped like ``image_source``.

    Args:
        image_source: Image array of shape (H, W) or (H, W, C); only its shape
            and dtype are used.
        boxes: Tensor of shape (N, 4) holding boxes in ``cxcywh`` format,
            normalised by the image width and height.

    Returns:
        Array with the shape and dtype of ``image_source`` that is 255 inside
        every box and 0 elsewhere. Parts of a box outside the image are ignored.
    """
    h, w = image_source.shape[:2]
    boxes_unnorm = boxes * torch.Tensor([w, h, w, h])
    boxes_xyxy = box_convert(boxes=boxes_unnorm, in_fmt="cxcywh", out_fmt="xyxy").numpy()
    mask = np.zeros_like(image_source)
    for x0, y0, x1, y1 in boxes_xyxy:
        # Clamp at 0: a negative slice bound would be read as "count from the end" and
        # paint the wrong region (or nothing). Bounds past the far edge are already
        # truncated by slicing itself.
        left, top = max(int(x0), 0), max(int(y0), 0)
        right, bottom = max(int(x1), 0), max(int(y1), 0)
        mask[top:bottom, left:right] = 255
    return mask

Load model

In [None]:
# Detection model: checkpoint and config are fetched from the Hugging Face Hub.
# (The identifier `ckpt_filenmae` is kept exactly as spelled in this notebook.)
ckpt_repo_id = "ShilongLiu/GroundingDINO"
ckpt_filenmae = "groundingdino_swint_ogc.pth"
ckpt_config_filename = "GroundingDINO_SwinT_OGC.cfg.py"
dino_model = load_model_hf(
    repo_id=ckpt_repo_id,
    filename=ckpt_filenmae,
    ckpt_config_filename=ckpt_config_filename,
)

# GLIGEN inpainting pipeline, loaded in float16 and moved to the GPU.
pipe = StableDiffusionGLIGENPipeline.from_pretrained(
    "gligen/diffusers-inpainting-text-box",
    revision="fp16",
    torch_dtype=torch.float16,
)
pipe.to("cuda")

LLM pipeline

In [None]:
#@title description
# ChatGPT prompt engineering.
# This step is carried out in ChatGPT.
# Few-shot learning: the first string below is the few-shot prompt.
# NOTE: the triple-quoted strings in this cell are bare expressions and are not
# assigned to any variable; they only document the prompts.

"""
You act as a prompt generator. Please do the classification task like the following examples. \  Prompt: replace chair into sofa at the left side Output: {"part": "left", "source object": ["chair"], "target object": ["sofa"]} \ Prompt: change wooden table into white table in the right part. Output: {"part": "right", "source object": ["wooden table"], "target object": ["white table"]} \ Prompt: replace blue chair and red sofa into yellow table and green chair at the bottom Output: {"part": "bottom", "source object": ["blue chair", "red sofa"], "target object": ["yellow table", "green chair"]} \ Prompt: replace sofa and shelf into chair and picture at the right side in the room Output:
"""

# After sending the prompt above, send the following prompt.
"""
Prompt: replace chair and shelf into potted plant and picture at the right side Output:
"""

# Result returned for the prompt above.
"""
{"part": "right", "source object": ["chair", "shelf"], "target object": ["potted plant", "picture"]}
"""


In [5]:
# Natural-language edit instruction.
prompt = "replace chair and shelf into potted plant and picture at the right side"

# Stand-in for `response = llm(prompt)` (in this case the llm is OpenAI ChatGPT 3.5):
# the structured answer is written out by hand.
response = {
    "part": "right",
    "source object": ["chair", "shelf"],
    "target object": ["potted plant", "picture"],
}

Zero-shot object detection pipeline

In [6]:
img_path = "/content/room001.jpeg"
img = cv2.imread(img_path)
# cv2.imread returns None (it does not raise) when the file is missing or unreadable;
# fail here with a clear message instead of an AttributeError further down.
if img is None:
    raise FileNotFoundError(f"Could not read image: {img_path}")
# Keep only the image half named by the LLM response and save it for the detector.
base_img, local_img_path = get_base_img(img, response["part"])

In [None]:
# grounding dino detection
local_image_path = local_img_path
text_prompt = ""
for t in response["source object"]:
  text_prompt += f"{t}."

TEXT_PROMPT = text_prompt
BOX_TRESHOLD = 0.35
TEXT_TRESHOLD = 0.25

image_source, image = load_image(local_image_path)

boxes, logits, phrases = predict(
    model=dino_model, 
    image=image, 
    caption=TEXT_PROMPT, 
    box_threshold=BOX_TRESHOLD, 
    text_threshold=TEXT_TRESHOLD
)

annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)
# annotated_frame = annotated_frame[...,::-1] # BGR to RGB
%matplotlib inline  
sv.plot_image(annotated_frame, (16, 16))

In [None]:
# check mask
# Re-read the full picture from `img_path` and convert OpenCV's BGR channel order to RGB.
full_bgr = cv2.imread(img_path)
image_source = cv2.cvtColor(full_bgr, cv2.COLOR_BGR2RGB)
# Paint the detected boxes into a mask shaped like the full picture and show it.
image_mask = generate_masks_with_grounding(image_source=image_source, boxes=boxes)
display(Image.fromarray(image_mask))

GLIGEN pipeline


In [9]:
# GLIGEN works on PIL images. `image_source` / `image_mask` are rebound to their PIL
# versions (the display cell reads `image_source.size`), so convert only while they
# are still arrays -- this keeps the cell re-runnable.
if not isinstance(image_source, Image.Image):
    image_source = Image.fromarray(image_source)
if not isinstance(image_mask, Image.Image):
    image_mask = Image.fromarray(image_mask)
# Resize
image_source_for_inpaint = image_source.resize((512, 512))
image_mask_for_inpaint = image_mask.resize((512, 512))  # kept for inspection; not passed to the pipeline
# get bbox (normalised xyxy; clamped because detections may slightly overhang the image)
xyxy_boxes = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").clamp(0.0, 1.0).tolist()

num_box = len(xyxy_boxes)
if num_box == 0:
    raise ValueError("No boxes were detected; there is nothing to inpaint.")

# define prompts for each box
# One phrase per detected box, taken from the LLM response: the detected source
# object is replaced by the target object at the same position in the response.
source_objects = response["source object"]
target_objects = response["target object"]
if not target_objects:
    raise ValueError('response["target object"] is empty.')

gligen_phrases = []
for box_index, detected_phrase in enumerate(phrases):
    replacement = next(
        (
            tgt
            for src, tgt in zip(source_objects, target_objects)
            if detected_phrase
            and (src.lower() in detected_phrase or detected_phrase in src.lower())
        ),
        # Fallback when the detected phrase matches no source object: cycle through the targets.
        target_objects[box_index % len(target_objects)],
    )
    gligen_phrases.append(replacement)

# Global text prompt: every distinct target object once, in order of first appearance.
prompt = ", ".join(dict.fromkeys(gligen_phrases))

image_inpainting = pipe(
    prompt,
    num_images_per_prompt = 2,
    gligen_phrases = gligen_phrases,
    gligen_inpaint_image = image_source_for_inpaint,
    gligen_boxes = xyxy_boxes,
    gligen_scheduled_sampling_beta=1,
    output_type="numpy",
    num_inference_steps=50
).images

  0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
# display image
image_inpainting = (image_inpainting * 255).astype(np.uint8)
image_inpainting = np.concatenate(image_inpainting, axis=1)
Image.fromarray(image_inpainting).resize((image_source.size[0]*2, image_source.size[1]))