# Marrying Grounding DINO with GLIGEN for Image Editing


[![Grounding DINO](https://badges.aleen42.com/src/github.svg)](https://github.com/IDEA-Research/GroundingDINO)
[![GLIGEN](https://badges.aleen42.com/src/github.svg)](https://github.com/gligen/GLIGEN)


[![arXiv](https://img.shields.io/badge/arXiv-2303.05499-b31b1b.svg)](https://arxiv.org/abs/2303.05499) 
[![YouTube](https://badges.aleen42.com/src/youtube.svg)](https://youtu.be/wxWDt5UiwY8)
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/roboflow-ai/notebooks/blob/main/notebooks/zero-shot-object-detection-with-grounding-dino.ipynb)
[![YouTube](https://badges.aleen42.com/src/youtube.svg)](https://youtu.be/cMa77r3YrDk)
[![HuggingFace space](https://img.shields.io/badge/🤗-HuggingFace%20Space-cyan.svg)](https://huggingface.co/spaces/ShilongLiu/Grounding_DINO_demo)

![gdgligen](https://huggingface.co/ShilongLiu/GroundingDINO/resolve/main/GD_GLIGEN.png)

# Build environment

**GLIGEN uses a modified version of diffusers. We highly recommend using a new conda virtual environment for this notebook!**

To do this, please run the following commands and rerun the notebook with the new environment:

```bash
conda create -n gligen_diffusers python=3.10
conda activate gligen_diffusers
```

In [0]:
! pip install diffusers transformers accelerate scipy safetensors

In [0]:
# install gligen_diffusers
! pwd
! git clone git@github.com:gligen/diffusers.git
! python -m pip install -e diffusers

In [0]:
import os

# Restrict CUDA to a single GPU, selected by index.
# Change "5" to an index that exists on your machine (e.g. "0" if you have one GPU).
os.environ["CUDA_VISIBLE_DEVICES"] = "5"

In [0]:
import argparse
from functools import partial
import cv2
import requests

from io import BytesIO
from PIL import Image
import numpy as np
from pathlib import Path
import random


import warnings
warnings.filterwarnings("ignore")


import torch
from torchvision.ops import box_convert

from groundingdino.models import build_model
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict
from groundingdino.util.inference import annotate, load_image, predict
import groundingdino.datasets.transforms as T

from huggingface_hub import hf_hub_download


# Load grounding dino models

In [0]:
def load_model_hf(repo_id, filename, ckpt_config_filename, device='cpu'):
    """Build a Grounding DINO model from files hosted on the Hugging Face Hub.

    Args:
        repo_id: Hub repository holding both the config and the checkpoint.
        filename: Name of the checkpoint file inside the repository.
        ckpt_config_filename: Name of the model config file inside the repository.
        device: Value stored on the config as ``args.device`` after the model
            has been built; the checkpoint itself is always mapped to the CPU.

    Returns:
        The model, switched to eval mode, with the checkpoint's ``'model'``
        weights loaded non-strictly.
    """
    config_path = hf_hub_download(repo_id=repo_id, filename=ckpt_config_filename)
    args = SLConfig.fromfile(config_path)
    model = build_model(args)
    args.device = device

    weights_path = hf_hub_download(repo_id=repo_id, filename=filename)
    state = torch.load(weights_path, map_location='cpu')
    cleaned_state = clean_state_dict(state['model'])
    # strict=False: mismatched keys are reported in the returned value, not raised.
    load_report = model.load_state_dict(cleaned_state, strict=False)
    print("Model loaded from {} \n => {}".format(weights_path, load_report))

    model.eval()
    return model

In [0]:
# Hugging Face Hub coordinates of the Grounding DINO checkpoint and its config,
# passed to load_model_hf. You can also download these files yourself.
# NOTE: "ckpt_filenmae" is a misspelling of "ckpt_filename"; the name is kept
# because the cell that loads the model refers to it.
ckpt_repo_id = "ShilongLiu/GroundingDINO"
ckpt_filenmae = "groundingdino_swint_ogc.pth"
ckpt_config_filename = "GroundingDINO_SwinT_OGC.cfg.py"

In [0]:
# Fetch the config and checkpoint from the Hub and build the model
# (load_model_hf's device argument is left at its default, 'cpu').
model = load_model_hf(ckpt_repo_id, ckpt_filenmae, ckpt_config_filename)

# Load GLIGEN inpainting models

In [0]:
from diffusers import StableDiffusionGLIGENPipeline


# Load the pretrained "gligen/diffusers-inpainting-text-box" pipeline,
# using its fp16 revision with float16 tensors.
pipe = StableDiffusionGLIGENPipeline.from_pretrained("gligen/diffusers-inpainting-text-box", revision="fp16", torch_dtype=torch.float16)
# Move the pipeline to the GPU; this needs a CUDA device to be visible.
pipe.to("cuda")

# Load demo image

In [0]:
# Remote location of the demo image and the local path it is saved to.
image_url = 'https://huggingface.co/ShilongLiu/GroundingDINO/resolve/main/art_dog_birthdaycake.png'
local_image_path = 'art_dog_birthdaycake.png'

In [0]:
import io


def download_image(url, image_file_path):
    """Download an image over HTTP and save it to a local file.

    Args:
        url: Address of the image to fetch (requested with a 4 second timeout).
        image_file_path: Destination path handed to PIL's ``Image.save``.

    Raises:
        AssertionError: If the response status is not OK.
    """
    r = requests.get(url, timeout=4.0)
    if r.status_code != requests.codes.ok:
        # Raise explicitly instead of using ``assert False``: assert statements
        # are removed under ``python -O``, which would let an error response
        # fall through to the image decoder below.
        raise AssertionError('Status code error: {}.'.format(r.status_code))

    # Decode from memory, then re-save through PIL.
    with Image.open(io.BytesIO(r.content)) as im:
        im.save(image_file_path)

    print('Image downloaded from url: {} and saved to: {}.'.format(url, image_file_path))

# Fetch the demo image so the following cells can read it from disk.
download_image(image_url, local_image_path)

# Run Grounding DINO

In [0]:
import os
import supervision as sv


# Text query for detection: two categories, each terminated by a period.
TEXT_PROMPT = "dog. cake."
# Thresholds forwarded to predict() as box_threshold / text_threshold.
BOX_TRESHOLD = 0.35
TEXT_TRESHOLD = 0.25

# load_image returns two values: image_source (used below for annotation and
# mask generation) and image (the input handed to predict).
image_source, image = load_image(local_image_path)

boxes, logits, phrases = predict(
    model=model, 
    image=image, 
    caption=TEXT_PROMPT, 
    box_threshold=BOX_TRESHOLD, 
    text_threshold=TEXT_TRESHOLD
)

# Render the detections onto the source image, then reverse the channel axis.
annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)
annotated_frame = annotated_frame[...,::-1] # BGR to RGB

# image_source: np.ndarray
# annotated_frame: np.ndarray

In [0]:
def generate_masks_with_grounding(image_source, boxes):
    """Rasterize detection boxes into a filled mask matching the image.

    Args:
        image_source: Image array of shape (height, width, channels).
        boxes: Tensor of boxes in normalized ``cxcywh`` form; each row is
            scaled by the image width/height before conversion to corners.

    Returns:
        An array shaped and typed like ``image_source`` holding 255 inside
        every box and 0 elsewhere.
    """
    h, w, _ = image_source.shape
    boxes_unnorm = boxes * torch.Tensor([w, h, w, h])
    boxes_xyxy = box_convert(boxes=boxes_unnorm, in_fmt="cxcywh", out_fmt="xyxy").numpy()
    mask = np.zeros_like(image_source)
    for x0, y0, x1, y1 in boxes_xyxy:
        # Clamp to the image: a box overhanging the top/left edge produces a
        # negative corner, and a negative slice index would count from the
        # opposite edge instead of stopping at the border.
        left, right = (min(max(int(x), 0), w) for x in (x0, x1))
        top, bottom = (min(max(int(y), 0), h) for y in (y0, y1))
        mask[top:bottom, left:right, :] = 255
    return mask

In [0]:
# Build a mask that is 255 inside every detected box and 0 elsewhere.
image_mask = generate_masks_with_grounding(image_source, boxes)

In [0]:
# Show the source image as the cell output.
Image.fromarray(image_source)

In [0]:
# Show the image with the detections drawn on it.
Image.fromarray(annotated_frame)

In [0]:
# Show the box mask.
Image.fromarray(image_mask)

# Image Inpainting

In [0]:
# Rebind the three names to PIL images; from here on they are no longer arrays.
image_source = Image.fromarray(image_source)
annotated_frame = Image.fromarray(annotated_frame)
image_mask = Image.fromarray(image_mask)

In [0]:
# Resize the image and the mask to 512x512 for inpainting.
# NOTE(review): image_mask_for_inpaint is not referenced by the cells below.
image_source_for_inpaint = image_source.resize((512, 512))
image_mask_for_inpaint = image_mask.resize((512, 512))

In [0]:
# Number of boxes returned by predict().
num_box = len(boxes)
num_box

In [0]:
# Convert the boxes from cxcywh to xyxy corners as nested Python lists.
# Unlike in generate_masks_with_grounding, they are not scaled by the image size here.
xyxy_boxes = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").tolist()
# Preview the first two boxes.
xyxy_boxes[:2]

In [0]:
# Define the replacement phrase for each box.
# NOTE(review): presumably matched to the boxes by position - confirm the box order.
gligen_phrases = ['a cat', 'a rose']

In [0]:
# Global text prompt given to the pipeline alongside the per-box phrases.
prompt = "'a cat', 'a rose'"

# NOTE(review): num_box is recomputed here but not used in this cell.
num_box = len(boxes)

# Run GLIGEN inpainting and keep the generated images.
# NOTE(review): gligen_boxes receives every detected box while gligen_phrases
# has two entries - confirm the counts match for your image.
image_inpainting = pipe(
    prompt,
    num_images_per_prompt = 2,
    gligen_phrases = gligen_phrases,
    gligen_inpaint_image = image_source_for_inpaint,
    gligen_boxes = xyxy_boxes,
    gligen_scheduled_sampling_beta=1,
    output_type="numpy",
    num_inference_steps=50
).images

In [0]:
# Scale from 0..1 to 0..255 and convert to uint8 so PIL can display the result.
image_inpainting = (image_inpainting * 255).astype(np.uint8)

In [0]:
# Join the generated images along axis 1 of each image
# (side by side, assuming an (H, W, C) layout - TODO confirm).
image_inpainting = np.concatenate(image_inpainting, axis=1)

In [0]:
# Show the joined result at the source image's height and twice its width
# (two images were requested via num_images_per_prompt).
Image.fromarray(image_inpainting).resize((image_source.size[0]*2, image_source.size[1]))