<div style="text-align: center;">
    <a href="https://colab.research.google.com/github/quamernasim/YOLO-Wrold-See-Beyond-Labels/blob/main/yolo-world.ipynb" target="_parent">
        <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
    </a>
</div>

# Install the Dependencies

Reference links for YOLO-World:

- Paper (arXiv): https://arxiv.org/pdf/2401.17270.pdf
- GitHub repository: https://github.com/AILab-CVC/YOLO-World
- Hugging Face Space (demo): https://huggingface.co/spaces/stevengrove/YOLO-World
- Colab notebook: https://colab.research.google.com/drive/1AmhbXBmH2MnJA8_aQ5EDoormQ61xlzdp#scrollTo=YQ86d81wVGMe
- Twitter post: https://twitter.com/skalskip92/status/1754916529672438173

In [None]:
# Clone the version/mmcv branch of the mmyolo fork and move into the checkout.
!git clone https://github.com/onuralpszr/mmyolo.git -b version/mmcv
%cd mmyolo/
# Print the checked-out branch as a quick sanity check.
!git branch

In [None]:
# Editable install of the package in the current directory (the mmyolo checkout).
# %pip is used instead of !pip so the install targets the running kernel's environment.
%pip install -e .

In [None]:
# Leave the mmyolo checkout and go back to /content before cloning YOLO-World.
%cd /content

In [None]:
# Clone the collab_friendly branch of the YOLO-World fork, including its submodules,
# and move into the checkout.
!git clone --recursive https://github.com/onuralpszr/YOLO-World.git -b collab_friendly
%cd YOLO-World/
# Print the checked-out branch as a quick sanity check.
!git branch

In [None]:
import os
# Pin requests, tqdm and rich to specific versions for openxlab (fix for yolo_world).
# The pins are applied only when the COLAB_GPU environment variable is set.
# %pip is used instead of !pip so the install targets the running kernel's environment.
if 'COLAB_GPU' in os.environ:
  %pip install requests==2.28.2 tqdm==4.65.0 rich==13.4.2

In [None]:
# Build YOLO-World and install it in development mode from the current directory
# (the YOLO-World checkout entered above).
!python setup.py build develop

In [None]:
# Install/upgrade openmim, then use the `mim` command to install mmengine (>=0.7.0) and mmcv.
%pip install -U openmim
!mim install "mmengine>=0.7.0"
!mim install "mmcv"

In [None]:
# supervision provides the annotators and plotting helper used later in the notebook.
# %pip is used instead of !pip so the install targets the running kernel's environment.
%pip install supervision==0.18.0

In [None]:
# Exit the current Python session.
# NOTE(review): presumably this forces the runtime to restart so the packages installed
# above are picked up; execution must be continued manually from the next cell — confirm.
quit()

# Getting started with YOLO-World

In [None]:
# Download the pretrained checkpoint and its config file from the Hugging Face Space.
# -O writes each download straight to its final file name, so no rename of the
# "...?download=true" file is needed afterwards; the URLs are quoted so the shell does
# not interpret the "?" they contain.
!wget -O yolow-v8_l_clipv2_frozen_t2iv2_bn_o365_goldg_pretrain.pth "https://huggingface.co/spaces/stevengrove/YOLO-World/resolve/main/yolow-v8_l_clipv2_frozen_t2iv2_bn_o365_goldg_pretrain.pth?download=true"
!wget -O yolo_world_l_t2i_bn_2e-4_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py "https://huggingface.co/spaces/stevengrove/YOLO-World/resolve/main/configs/pretrain/yolo_world_l_t2i_bn_2e-4_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py?download=true"

In [None]:
# Copy the downloaded config into the YOLO-World checkout's configs/pretrain/ directory,
# which is the location Config.fromfile reads it from later in the notebook.
!cp -r yolo_world_l_t2i_bn_2e-4_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py /content/YOLO-World/configs/pretrain/

In [None]:
import mmengine
import yolo_world
import mmyolo

In [None]:
import argparse
import os.path as osp
from functools import partial
import supervision as sv
import cv2
import torch
import numpy as np
from tempfile import NamedTemporaryFile
from PIL import Image
from torchvision.ops import nms
from mmengine.config import Config, DictAction
from mmengine.runner import Runner
from mmengine.runner.amp import autocast
from mmengine.dataset import Compose
from mmdet.visualization import DetLocalVisualizer
from mmdet.datasets import CocoDataset
from mmyolo.registry import RUNNERS

In [None]:
def setup_runner(cfg):
    """Build a runner from ``cfg`` and prepare it for single-image inference.

    The runner is created through ``RUNNERS.build`` when ``cfg`` contains a
    ``runner_type`` key and through ``Runner.from_cfg`` otherwise. Afterwards the
    ``before_run`` hook is called, ``load_or_resume()`` is invoked, the test
    dataloader's dataset pipeline is composed and stored on ``runner.pipeline``,
    and the model is switched to eval mode.

    Args:
        cfg: Config object; must provide ``test_dataloader.dataset.pipeline``.

    Returns:
        Tuple ``(runner, bounding_box_annotator, label_annotator)`` where the two
        annotators are freshly created supervision annotators (labels are drawn
        with black text).
    """
    # A config that names its own runner type goes through the registry.
    runner = RUNNERS.build(cfg) if 'runner_type' in cfg else Runner.from_cfg(cfg)

    runner.call_hook('before_run')
    runner.load_or_resume()

    # Attach the test-time preprocessing pipeline so callers can run it on one image.
    runner.pipeline = Compose(cfg.test_dataloader.dataset.pipeline)
    runner.model.eval()

    box_drawer = sv.BoundingBoxAnnotator()
    text_drawer = sv.LabelAnnotator(text_color=sv.Color.BLACK)
    return runner, box_drawer, text_drawer

def run_image(
    image: np.ndarray,
    text,
    cfg,
    max_num_boxes = 100,
    score_thr = 0.05,
    nms_thr = 0.5
):
    """Detect the classes named in ``text`` on ``image`` and return an annotated copy.

    Args:
        image: Image array. It is written to disk with ``cv2.imwrite``, so it is
            presumably in the layout returned by ``cv2.imread`` — confirm for
            other sources.
        text: Comma-separated class names / prompts, e.g. ``"dog, person, car"``.
        cfg: Config passed to ``setup_runner`` to build the runner and pipeline.
        max_num_boxes: Maximum number of detections kept (highest scores first).
        score_thr: Detections whose score is not greater than this are dropped.
        nms_thr: IoU threshold passed to ``torchvision.ops.nms``.

    Returns:
        A copy of ``image`` with bounding boxes and ``"<class name> <score>"``
        labels drawn on it.

    Raises:
        ValueError: If ``image`` is ``None`` or has no pixels. ``cv2.imread``
            returns ``None`` for unreadable files, so this is checked before the
            model is built.
    """
    # Fail fast with a clear message instead of an opaque OpenCV error after the
    # (expensive) runner setup.
    if image is None or image.size == 0:
        raise ValueError(
            "run_image received an empty image; check that the image path exists "
            "and that cv2.imread could decode it."
        )

    # NOTE: setup_runner builds a new runner (and calls load_or_resume) on every call.
    runner, bounding_box_annotator, label_annotator = setup_runner(cfg)

    # One single-element list per prompt, followed by a blank prompt.
    # NOTE(review): the trailing [' '] presumably acts as a padding entry — confirm.
    texts = [[t.strip()] for t in text.split(',')] + [[' ']]

    # The pipeline takes an image path, so the array is written to a temporary file
    # that only needs to exist while the model inputs are being prepared and used.
    with NamedTemporaryFile(suffix=".jpeg") as f:
        cv2.imwrite(f.name, image)
        data_info = dict(img_id=0, img_path=f.name, texts=texts)
        data_info = runner.pipeline(data_info)
        data_batch = dict(inputs=data_info['inputs'].unsqueeze(0),
                          data_samples=[data_info['data_samples']])

        with autocast(enabled=False), torch.no_grad():
            output = runner.model.test_step(data_batch)[0]
            pred_instances = output.pred_instances

    # NMS is run over all boxes together, without separating them by label.
    keep_idxs = nms(pred_instances.bboxes, pred_instances.scores, iou_threshold=nms_thr)
    pred_instances = pred_instances[keep_idxs]
    pred_instances = pred_instances[pred_instances.scores.float() > score_thr]

    # Cap the number of detections, keeping the highest-scoring ones.
    if len(pred_instances.scores) > max_num_boxes:
        indices = pred_instances.scores.float().topk(max_num_boxes)[1]
        pred_instances = pred_instances[indices]

    pred_instances = pred_instances.cpu().numpy()

    detections = sv.Detections(
        xyxy=pred_instances['bboxes'],
        class_id=pred_instances['labels'],
        confidence=pred_instances['scores'],
        data={
            # Map each predicted label index back to its prompt text.
            'class_name': np.array([texts[class_id][0] for class_id in pred_instances['labels']])
        }
    )

    labels = [
        f"{class_name} {confidence:0.2f}"
        for class_name, confidence
        in zip(detections['class_name'], detections.confidence)
    ]

    # Draw on a copy so the caller's array is left untouched.
    annotated_image = image.copy()
    annotated_image = bounding_box_annotator.annotate(annotated_image, detections)
    annotated_image = label_annotator.annotate(annotated_image, detections, labels)
    return annotated_image

In [None]:
# Load the model config that was copied into the YOLO-World checkout.
cfg = Config.fromfile("/content/YOLO-World/configs/pretrain/yolo_world_l_t2i_bn_2e-4_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py")
# Working directory for the runner: the current directory.
cfg.work_dir = "."
# Checkpoint file name, given relative to the current working directory.
cfg.load_from = "yolow-v8_l_clipv2_frozen_t2iv2_bn_o365_goldg_pretrain.pth"
# Alternative, longer prompt list kept for reference; uncomment to use it instead.
# class_names = "person, bicycle, car, motorcycle, airplane, bus, train, truck, boat, traffic light, fire hydrant, stop sign, parking meter, bench, bird, cat, dog, horse, sheep, cow, elephant, bear, zebra, giraffe, backpack, umbrella, handbag, tie, suitcase, frisbee, skis, snowboard, sports ball, kite, baseball bat, baseball glove, skateboard, surfboard, tennis racket, bottle, wine glass, cup, fork, knife, spoon, bowl, banana, apple, sandwich, orange, broccoli, carrot, hot dog, pizza, donut, cake, chair, couch, potted plant, bed, dining table, toilet, tv, laptop, mouse, remote, keyboard, cell phone, microwave, oven, toaster, sink, refrigerator, book, clock, vase, scissors, teddy bear, hair drier, toothbrush"

# Comma-separated prompts; run_image splits this string on "," to get one prompt per class.
class_names = "dog, person, car, bike, bicycle, tree, hand, nose, hair"

In [None]:
# Path of the demo image on the Colab runtime.
# NOTE(review): no cell shown here downloads this file — it presumably has to be uploaded first.
image_path = '/content/car-chase-featured.jpg'

# cv2.imread returns None (it does not raise) when the file is missing or unreadable,
# so check explicitly and fail with the offending path.
input_image = cv2.imread(image_path)
if input_image is None:
    raise FileNotFoundError(f"Could not read image at {image_path}")

image = run_image(input_image, class_names, cfg)
sv.plot_image(image)