<a href="https://colab.research.google.com/github/softmurata/prompt-engineering/blob/main/sgwork/roomcreatorapp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

GLIGEN Room App Demo

In [None]:
# Install the Python packages used by the GLIGEN demo cells below.
# NOTE(review): none of these installs are version-pinned, so a fresh run may
# resolve to different releases than the ones this notebook was written against.
!pip install transformers accelerate scipy safetensors
# GLIGEN fork of diffusers, installed straight from GitHub;
# StableDiffusionGLIGENPipeline is imported from `diffusers` in a later cell.
!pip install git+https://github.com/gligen/diffusers.git

# Gradio is used to build the web UI.
!pip install gradio

# Download the DejaVuSansMono font into the current working directory;
# draw_box() loads it by the relative file name "DejaVuSansMono.ttf".
!wget https://huggingface.co/spaces/gligen/demo/resolve/main/DejaVuSansMono.ttf

In [None]:
# import library
import gradio as gr
from gradio import processing_utils
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import cv2


from torchvision.ops import box_convert

In [None]:
# Load the GLIGEN inpainting (text + box) pipeline in half precision and move
# it to the GPU.
# NOTE: the global name `pipe` is read by gligen_infer(). A later cell in this
# notebook rebinds `pipe` to a different pipeline, so re-run this cell before
# using the GLIGEN app again after running that one.
import torch
from diffusers import StableDiffusionGLIGENPipeline

pipe = StableDiffusionGLIGENPipeline.from_pretrained("gligen/diffusers-inpainting-text-box", revision="fp16", torch_dtype=torch.float16)
pipe.to("cuda")

In [None]:
### Dynamic Function
def click_check_btn(image_input):
  """Save the uploaded image as the 512x512 working image of the demo.

  Args:
    image_input: Value of the sketch-enabled upload component: a dict whose
      "image" entry is a PIL image, or None when nothing has been uploaded.

  Returns:
    The path of the saved working image ("/content/init.png"), or None when
    there is no image to save.
  """
  # Clicking "Check" before uploading anything delivers None; do nothing
  # instead of failing on the subscript below.
  if not image_input or image_input.get("image") is None:
    return None

  res_image = image_input["image"].resize((512, 512))
  filepath = "/content/init.png"
  res_image.save(filepath)
  # The same image is also written to the bbox-mask path, which the UI and
  # draw_bbox() read from / overwrite later. save() returns None, so its
  # result is intentionally not kept.
  res_image.save("/content/bbox_mask.png")
  return filepath

def refresh(value):
  """Return a component update that sets the target component to `value`."""
  component_update = gr.update(value=value)
  return component_update

def draw_box(boxes=None, texts=None, img=None):
    """Draw labelled bounding boxes onto an image.

    Args:
        boxes: Sequence of (xmin, ymin, xmax, ymax) pixel boxes. None means
            no boxes.
        texts: Labels matched to `boxes` by index. A box without a matching
            label is annotated "Obj. N" (1-based), the same fallback that
            draw_bbox() uses.
        img: PIL image to draw on; it is modified in place. When omitted, a
            white 512x512 canvas is created.

    Returns:
        The annotated image, or None when there are neither boxes nor an
        image.
    """
    # None defaults instead of shared mutable list defaults.
    boxes = [] if boxes is None else boxes
    texts = [] if texts is None else texts

    if len(boxes) == 0 and img is None:
        return None

    if img is None:
        img = Image.new('RGB', (512, 512), (255, 255, 255))
    colors = ["red", "olive", "blue", "green", "orange", "brown", "cyan", "purple"]
    draw = ImageDraw.Draw(img)
    font = ImageFont.truetype("DejaVuSansMono.ttf", size=18)
    label_height = int(font.size * 1.2)

    for bid, box in enumerate(boxes):
        color = colors[bid % len(colors)]  # cycle through the palette
        xmin, ymin, xmax, ymax = box[0], box[1], box[2], box[3]
        draw.rectangle([xmin, ymin, xmax, ymax], outline=color, width=4)

        # Fewer labels than boxes used to raise IndexError here.
        anno_text = texts[bid] if bid < len(texts) else f"Obj. {bid + 1}"

        # Filled label strip in the bottom-left corner of the box, sized from
        # the text length and the font size.
        label_width = int((len(anno_text) + 0.8) * font.size * 0.6)
        draw.rectangle([xmin, ymax - label_height, xmin + label_width, ymax], outline=color, fill=color, width=4)
        draw.text([xmin + int(font.size * 0.2), ymax - label_height], anno_text, font=font, fill=(255, 255, 255))
    return img


def draw_bbox(sd_input_image, grounding_instruction, state):
  """Turn the newest sketch stroke into a bounding box and redraw all boxes.

  The current sketch mask is compared with the previous mask kept in
  `state['masks']`; the bounding rectangle of the newly painted region is
  appended to `state['boxes']`, all boxes are drawn on the input image, and
  the union of the boxes is written to "/content/bbox_mask.png".

  Args:
    sd_input_image: Value of the sketch component: a dict with "image" and
      "mask" file paths, or None when the component is empty.
    grounding_instruction: Semicolon-separated labels, one per box.
    state: Per-session dict; its 'masks' and 'boxes' lists are updated.

  Returns:
    (image, state): the image with all boxes drawn (the plain image when no
    new stroke was found, None when there is no input image) and the state.
  """
  if not sd_input_image:
    # Nothing is loaded in the sketch component, so there is nothing to draw.
    return None, state

  grounding_texts = [x.strip() for x in (grounding_instruction or "").split(';')]
  grounding_texts = [x for x in grounding_texts if len(x) > 0]
  image = Image.open(sd_input_image["image"])
  mask_img = cv2.resize(cv2.imread(sd_input_image["mask"]), (512, 512))
  Image.fromarray(mask_img).save("/content/mask.png")  # keep the latest full mask

  masks = state.get('masks')
  if not masks:
    masks = state['masks'] = []
    last_mask = np.zeros_like(mask_img)
  else:
    last_mask = masks[-1]

  # Saturating subtraction: plain `-` on uint8 arrays wraps around, so a pixel
  # that became darker than in the previous mask could turn into a large
  # positive value and be mistaken for a new stroke.
  diff_mask = cv2.subtract(mask_img, last_mask)
  masks.append(mask_img)
  Image.fromarray(diff_mask).save("/content/diff.png")

  gray = cv2.cvtColor(diff_mask, cv2.COLOR_BGR2GRAY)
  thresh = cv2.threshold(gray, 128, 255, cv2.THRESH_BINARY)[1]

  # Same unpacking as infer(): pick the contour list whether findContours
  # returns a 2-tuple or a 3-tuple.
  found = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
  contours = found[0] if len(found) == 2 else found[1]
  if len(contours) == 0:
    return image, state

  # NOTE(review): only the last contour is used; a stroke that splits into
  # several regions contributes just one of them — confirm this is intended.
  x, y, w, h = cv2.boundingRect(contours[-1])
  boxes = state.setdefault('boxes', [])
  boxes.append((x, y, x + w, y + h))

  # Pad missing labels so that every box gets an annotation.
  if len(grounding_texts) < len(boxes):
    grounding_texts += [f'Obj. {bid+1}' for bid in range(len(grounding_texts), len(boxes))]

  print(state["boxes"])
  box_image = draw_box(boxes, grounding_texts, image)

  # Rebuild the mask that covers every box drawn so far.
  bbox_mask = np.zeros_like(mask_img)
  for xmin, ymin, xmax, ymax in boxes:
    bbox_mask[ymin:ymax, xmin:xmax, :] = 255
  Image.fromarray(bbox_mask).save("/content/bbox_mask.png")

  return box_image, state

def bbox_check(grounding_instruction, sd_input_image, state):
  """Parse the grounding instruction and refresh the bbox-mask preview.

  Args:
    grounding_instruction: Semicolon-separated phrases; surrounding
      whitespace and empty entries are dropped. None is treated as empty.
    sd_input_image: Not used; accepted because the click handler passes it.
      (It was previously subscripted for unused locals, which crashed when
      the sketch component was empty.)
    state: Per-session dict; its "grounding_texts" entry is overwritten with
      the parsed phrases.

  Returns:
    (phrases, update): the parsed phrase list and a component update that
    points the mask preview at "/content/bbox_mask.png".
  """
  pieces = (grounding_instruction or "").split(';')
  grounding_texts = [piece.strip() for piece in pieces if piece.strip()]
  state["grounding_texts"] = grounding_texts
  return grounding_texts, gr.update(value="/content/bbox_mask.png")

In [None]:
## inference function
def gligen_infer(state):
  """Run GLIGEN box-grounded inpainting on the working image.

  Reads "/content/init.png", inpaints one object per stored box using the
  stored grounding phrases, and writes the result (resized back to the
  working image's size) to "/content/output.png".

  Args:
    state: Per-session dict with "boxes" (pixel (xmin, ymin, xmax, ymax)
      tuples in a 512x512 frame) and "grounding_texts" (one phrase per box).

  Returns:
    The output image path, "/content/output.png".

  Raises:
    ValueError: If no box has been drawn, or the number of grounding phrases
      differs from the number of boxes.
  """
  boxes = state.get("boxes", [])
  gligen_phrases = list(state.get("grounding_texts", []))
  if len(boxes) == 0:
    raise ValueError("No bounding boxes found: draw at least one box before generating.")
  # Phrases and boxes are paired by index, so a count mismatch cannot be
  # resolved here; fail early with an actionable message.
  if len(gligen_phrases) != len(boxes):
    raise ValueError(
        f"Got {len(gligen_phrases)} grounding phrase(s) for {len(boxes)} box(es); "
        "enter one phrase per box and press 'Bbox check' first."
    )

  image_source = Image.open("/content/init.png")
  image_source_for_inpaint = image_source.resize((512, 512))

  # Boxes are stored in 512x512 pixel coordinates; normalize them to [0, 1].
  xyxy_boxes = [
      [xmin / 512, ymin / 512, xmax / 512, ymax / 512]
      for xmin, ymin, xmax, ymax in boxes
  ]

  # join() avoids the dangling comma the old `prompt[:-1]` left behind
  # (it removed only the trailing space of the ", " separator).
  prompt = ", ".join(f"'a {gp}'" for gp in gligen_phrases)

  image_inpainting = pipe(
      prompt,
      num_images_per_prompt = 1,
      gligen_phrases = gligen_phrases,
      gligen_inpaint_image = image_source_for_inpaint,
      gligen_boxes = xyxy_boxes,
      gligen_scheduled_sampling_beta=1,
      output_type="numpy",
      num_inference_steps=50
  ).images

  # The first returned array is scaled by 255 and cast to uint8 before
  # it is handed to PIL.
  image_inpainting = (image_inpainting[0] * 255).astype(np.uint8)
  Image.fromarray(image_inpainting).resize((image_source.size[0], image_source.size[1])).save("/content/output.png")

  return "/content/output.png"


In [None]:
## Main function
# Builds the Gradio UI for the GLIGEN room app and wires the callbacks
# defined above to its components.
# NOTE(review): gr.inputs.* / gr.outputs.* look like legacy Gradio namespaces;
# confirm they are still supported by the installed Gradio version.

with gr.Blocks() as demo:
    # Per-session state for the demo; the callbacks store 'masks', 'boxes'
    # and 'grounding_texts' in this dict.
    state = gr.State({})
    
    # Tab 1: upload an image; "Check" saves it as /content/init.png.
    with gr.Tab("Upload Image"):
        with gr.Row():
          image_input = gr.Image(tool="sketch", type="pil", label="upload image")
          input_filepath = gr.outputs.Textbox(type="text", label="input filepath")
        check_btn = gr.Button("Check")
    # Tab 2: sketch boxes on the saved image, name them, and generate.
    with gr.Tab("Stable Diffusion"):
        with gr.Column(scale=1):
            with gr.Row():
              value = gr.inputs.Textbox(type="text", default="/content/init.png", label="sd input filepath")
              refresh_btn = gr.Button("Refresh")
            with gr.Row():
              grounding_instruction = gr.Textbox(type="text", label="Grounding instruction (Separated by semicolon)")
            with gr.Row():
              # NOTE(review): these components reference files under /content as
              # initial values — presumably the files must exist when this cell
              # runs; confirm.
              sd_input_image = gr.Image(tool="sketch", type="filepath", label="sd input", value="/content/init.png")
              bbox_output_image = gr.Image(type="pil", label="BBox input image")
              grounding_lang = gr.outputs.Textbox(type="text", label="Grounding language")
              bbox_mask_image = gr.Image(type="filepath", value="/content/bbox_mask.png", label="Bbox mask image")
              bbox_check_btn = gr.Button("Bbox check")

            generate_btn = gr.Button("Generate")

        with gr.Column(scale=2, min_width=600):
            gr.Markdown("### Output")
            sd_image_output = gr.outputs.Image(type="filepath",label="SD")
            # bbox_text is created but not wired to any callback in this cell.
            bbox_text = gr.outputs.Textbox(type="text", label="bbox")

    # "Check": save the uploaded image and show the saved file path.
    check_btn.click(
         click_check_btn,
         inputs=[image_input],
         outputs=[input_filepath]
    )
    # "Refresh": load the file named in the textbox into the sketch component.
    refresh_btn.click(
        refresh,
        inputs=[value],
        outputs=[sd_input_image]
    )

    # "Generate": run GLIGEN inpainting with the boxes/phrases held in state.
    generate_btn.click(
        gligen_infer,
        inputs = [state],
        outputs = [sd_image_output],

    )

    # Each edit of the sketch component is passed to draw_bbox, which returns
    # the box overlay image and the updated state.
    sd_input_image.edit(
        draw_bbox,
        inputs=[sd_input_image, grounding_instruction, state],
        outputs=[bbox_output_image, state]
    )

    # Design note: having users enter the phrases in the same order as the
    # object ids (i.e. the order the boxes were drawn) should work well.

    # "Bbox check": store the parsed phrases in state and show the bbox mask.
    bbox_check_btn.click(
        bbox_check,
        inputs=[grounding_instruction, sd_input_image, state],
        outputs = [grounding_lang, bbox_mask_image]
    )
    

if __name__ == "__main__":
    demo.launch(debug=True)    

  super().__init__(
  super().__init__(


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

[(317, 124, 412, 302)]
[(317, 124, 412, 302), (199, 383, 365, 512)]
[(317, 124, 412, 302), (199, 383, 365, 512), (190, 187, 266, 311)]


  0%|          | 0/50 [00:00<?, ?it/s]

Keyboard interruption in main thread... closing server.


Stable Diffusion Inpainting App

In [None]:
# Install CodeFormer (used by the super-resolution tab).
# The existing checkout is deleted first, so re-running this cell always
# starts from a fresh clone under /content/CodeFormer.
%cd /content
!rm -rf CodeFormer
!git clone https://github.com/sczhou/CodeFormer.git
%cd CodeFormer

# Set up the environment
# Install python dependencies (from the repository's requirements.txt)
!pip install -r requirements.txt
# Install basicsr in development mode from the cloned repository
!python basicsr/setup.py develop

# Download the pre-trained models via the repository's download script
!python scripts/download_pretrained_models.py facelib
!python scripts/download_pretrained_models.py CodeFormer

In [None]:
# Load the Stable Diffusion 2 inpainting pipeline and move it to the GPU.
# NOTE: this rebinds the global name `pipe`, which gligen_infer() in the GLIGEN
# app also reads; after running this cell, `pipe` refers to this inpainting
# pipeline, so reload the GLIGEN pipeline before using that app again.
import torch
from PIL import Image
import cv2
from diffusers import StableDiffusionInpaintPipeline
pipe = StableDiffusionInpaintPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-inpainting",
    # torch_dtype=torch.float16,
).to("cuda")
# Attention slicing is enabled on the pipeline (presumably to reduce GPU
# memory use — see the diffusers documentation).
pipe.enable_attention_slicing()

  deprecate(


In [None]:
# Make the CodeFormer checkout the working directory for the cells below.
%cd /content/CodeFormer/

/content/CodeFormer


In [None]:
### library and utils function
import gradio as gr
import imageio
from PIL import Image
import numpy as np
import cv2
import sys
sys.path.append("CodeFormer")
import os
import glob

# Drawing (sketching the mask) is done on the "sd input" image.
# The "upload image" component and the image input used for inpainting are kept separate.
# To feed a newly generated image into "sd input", refresh once and specify its file path.

# Input/output folders used by infer_superres(); creating them is a no-op if
# they already exist.
os.makedirs("/content/srinputs", exist_ok=True)
os.makedirs("/content/sroutputs", exist_ok=True)

def refresh(img, value):
  """Return a component update that sets the target component to `value`.

  `img` is accepted because the click handler passes it, but it is not used.
  """
  component_update = gr.update(value=value)
  return component_update


def infer(img, prompt):
  """Inpaint the sketched region(s) of an image with the global `pipe`.

  The bounding rectangle of every sketched region is filled white to form the
  inpainting mask. Intermediate files are written under /content
  (data.png, data_mask_new.png, data_bbox.txt) and the result is saved to
  /content/data_sd.png.

  Args:
    img: Value of the sketch component: a dict with "image" and "mask" file
      paths.
    prompt: Text prompt passed to the pipeline.

  Returns:
    (path, text): "/content/data_sd.png" and the "x,y,w,h" rectangle of the
    last mask region that was found.

  Raises:
    ValueError: If there is no input image or no mask region was sketched.
  """
  if not img:
    raise ValueError("No input image: load an image and sketch a mask first.")

  # Keep a copy of the input image; it is re-opened from this path below.
  Image.open(img["image"]).save("/content/data.png")

  # Read the sketch mask and binarize it.
  mask = cv2.resize(cv2.imread(img["mask"]), (512, 512))
  gray = cv2.cvtColor(mask, cv2.COLOR_BGR2GRAY)
  thresh = cv2.threshold(gray, 128, 255, cv2.THRESH_BINARY)[1]

  # Pick the contour list whether findContours returns a 2-tuple or a 3-tuple.
  found = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
  contours = found[0] if len(found) == 2 else found[1]
  if len(contours) == 0:
    # Previously this fell through and failed later because x, y, w, h were
    # never assigned.
    raise ValueError("No mask region found: sketch the area to inpaint before processing.")

  # Replace every sketched region by its filled bounding rectangle.
  result = mask.copy()
  for cntr in contours:
    x, y, w, h = cv2.boundingRect(cntr)
    result[y:y+h, x:x+w] = 255
    print("x,y,w,h:", x, y, w, h)

  # save resulting image
  cv2.imwrite('/content/data_mask_new.png', result)
  text = f"{x},{y},{w},{h}"  # rectangle of the last contour
  with open("/content/data_bbox.txt", "w") as f:
    f.write(text + "\n")

  input_image = Image.open("/content/data.png").resize((512, 512))
  mask_image = Image.open("/content/data_mask_new.png").resize((512, 512))
  print(prompt)
  image = pipe(prompt=prompt, image=input_image, mask_image=mask_image).images[0]
  image.save("/content/data_sd.png")

  return "/content/data_sd.png", text

def infer_superres(img, fl_slider):
  """Run CodeFormer on an image and return the path of the copied result.

  Args:
    img: Value of the "lr input" sketch component; its "image" entry is
      saved with .save(), so it is expected to be a PIL image.
    fl_slider: Fidelity value from the slider, passed to CodeFormer's `-w`
      option and also used to build the results folder name.

  Returns:
    "/content/sroutputs/lr.png".
  """
  rgb_img = img["image"]
  # CodeFormer is pointed at the whole /content/srinputs folder below.
  rgb_img.save("/content/srinputs/lr.png")

  # Shell out to the CodeFormer inference script ($fl_slider is interpolated
  # by IPython).
  !python /content/CodeFormer/inference_codeformer.py -w $fl_slider --input_path /content/srinputs --bg_upsampler realesrgan

  # NOTE(review): this assumes the script wrote its results under
  # /content/results/srinputs_<w>/final_results; if it writes relative to the
  # current working directory (an earlier cell changes into
  # /content/CodeFormer), the copy will not find the file — confirm.
  # NOTE(review): the folder name is built from the raw slider value; confirm
  # it matches how the script formats <w> (e.g. 1 vs 1.0).
  !cp  /content/results/srinputs_$fl_slider/final_results/lr.png /content/sroutputs/

  return "/content/sroutputs/lr.png"

In [None]:
### Main function
# Builds the Gradio UI for the Stable Diffusion inpainting + super-resolution
# app and wires refresh(), infer() and infer_superres() to it.
# Prepare /content/data.png beforehand (it is the initial value of "sd input").
# NOTE(review): gr.inputs.* / gr.outputs.* look like legacy Gradio namespaces;
# confirm they are still supported by the installed Gradio version.
with gr.Blocks() as demo:
    with gr.Row():
      # Tab 1: sketch a mask on an image and inpaint it from a text prompt.
      with gr.Tab("Stable Diffusion"):
        with gr.Column(scale=2):
            gr.Markdown("### Data")
            with gr.Tab("Upload Image"):
              image_input = gr.Image(tool="sketch", type="filepath", label="upload image")
            with gr.Tab("Inpaint Image"):
              sd_input_image = gr.Image(tool="sketch", type="filepath", label="sd input", value="/content/data.png")
              value = gr.inputs.Textbox(type="text", default="/content/data.png", label="sd input filepath")
              refresh_button = gr.Button("Refresh")
              
            prompt = gr.inputs.Textbox(type="text", label="prompt")
            
            process_button = gr.Button("Process")

            sd_image_output = gr.Image(type="filepath",label="SD")
            bbox_text = gr.outputs.Textbox(type="text", label="bbox")


        # with gr.Column(scale=2, min_width=600):
        #    gr.Markdown("### Output")
        #    sd_image_output = gr.Image(type="filepath",label="SD")
        #    bbox_text = gr.outputs.Textbox(type="text", label="bbox")
      # Tab 2: super-resolution with CodeFormer.
      with gr.Tab("SuperRes"):
        with gr.Column(scale=1):
          gr.Markdown("### Input")
          lr_image_input = gr.Image(tool="sketch", type="pil", label="lr input")
          fl_slider = gr.inputs.Slider(minimum=0, maximum=1, step=0.1, label="fidelity")
          sr_button = gr.Button("Process")
        with gr.Column(scale=2, min_width=600):
          gr.Markdown("### Output")
          hr_image_output = gr.outputs.Image(type="filepath", label="HR")

      # The string literal below holds a disabled ControlNet tab; it is never
      # executed as UI code.
      """
      with gr.Tab("ControlNet"):
        with gr.Column(scale=1):
          edit_part_image = gr.Image(tool="sketch", type="pil", label="cn input")
          with gr.Column():
            controlnet_option = gr.inputs.Radio(choices=["mlsd", "depth", "canny"], default="canny", type="value", label="Controlnet Option")
            with gr.Tab("mlsd params"):
              thr_v = gr.Slider(minimum=0, maximum=1, step=0.1, value=0.1, label="thr_v", interactive=True)
              thr_d = gr.Slider(minimum=0, maximum=1, step=0.1, value=0.1, label="thr_d", interactive=True)
            
            cn_button = gr.Button("Process")

        with gr.Column(scale=2, min_width=600):
          gr.Markdown("### Output")
          cn_image_output = gr.outputs.Image(type="filepath", label="CN")
      """




    # "Refresh": load the file named in the textbox into "sd input"
    # (refresh() ignores its first argument).
    refresh_button.click(fn=refresh, inputs=[image_input, value], outputs=sd_input_image)
    # "Process" (Stable Diffusion tab): inpaint and show the result plus the bbox text.
    process_button.click(fn=infer, inputs=[sd_input_image, prompt], outputs=[sd_image_output, bbox_text])

    # "Process" (SuperRes tab): run CodeFormer and show the result image.
    sr_button.click(fn=infer_superres, inputs=[lr_image_input, fl_slider], outputs=hr_image_output)
    # cn_button.click(fn=infer_controlnet, inputs=[edit_part_image], outputs=cn_image_output)
    
if __name__ == "__main__":
    demo.launch(debug=True)    

  super().__init__(
  super().__init__(
  super().__init__(


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

x,y,w,h: 323 122 90 209
a tall bookshelf


  0%|          | 0/50 [00:00<?, ?it/s]

Keyboard interruption in main thread... closing server.
