# Setup

In [9]:
import os
# Working directory at the time this cell runs; later cells build paths from it.
HOME = os.getcwd()

In [10]:
import openvino as ov
import torch
import torchvision

# First two dot-separated components of the installed torch version
# (e.g. "2.5" for "2.5.1").
TORCH_VERSION = ".".join(torch.__version__.split(".")[:2])

# Build tag that follows "+" in the version string (e.g. "cu121", "cpu").
# Some builds report a version without any "+" suffix; taking split("+")[-1]
# there would return the whole version string, so fall back to the CUDA
# version torch itself reports, and finally to "cpu".
if "+" in torch.__version__:
    CUDA_VERSION = torch.__version__.split("+")[-1]
elif torch.version.cuda:
    CUDA_VERSION = "cu" + torch.version.cuda.replace(".", "")
else:
    CUDA_VERSION = "cpu"



Install YOLOv8

In [11]:
!pip install "ultralytics<=8.3.40"

from IPython import display
display.clear_output()

import ultralytics
ultralytics.checks()

Ultralytics 8.3.40  Python-3.12.7 torch-2.5.1 CPU (11th Gen Intel Core(TM) i7-1165G7 2.80GHz)
Setup complete  (8 CPUs, 15.7 GB RAM, 69.9/476.2 GB disk)


Install YOLOv5

In [12]:
%cd {HOME}
!git clone https://github.com/ultralytics/yolov5

%cd {HOME}/yolov5
!pip install -r requirements.txt

from IPython import display
display.clear_output()

In [13]:
import detectron2

In [14]:
from IPython import display
# Clear whatever output this cell is currently showing before printing below.
display.clear_output()

# Import supervision under the alias used by the rest of the notebook and
# print its version.
import supervision as sv
print("supervision", sv.__version__)

supervision 0.25.1


In [15]:
from IPython import display
# Clear whatever output this cell is currently showing.
display.clear_output()

In [16]:
from supervision.assets import download_assets, VideoAssets
from IPython import display

# Fetch each sample video, in the same order as before.
for asset in (
    VideoAssets.MARKET_SQUARE,
    VideoAssets.GROCERY_STORE,
    VideoAssets.SUBWAY,
):
    download_assets(asset)

# Remove the download progress from the cell output.
display.clear_output()

In [19]:
# Show whether torch reports a usable CUDA device in this environment.
torch.cuda.is_available()

False

# Shopping mall example

In [21]:
from ultralytics import YOLO

# Build the detector from the 'yolov8s.pt' weights; `model` is called by
# process_frame further down.
model = YOLO('yolov8s.pt')

In [22]:
import numpy as np

# Vertices of the zone polygon (presumably (x, y) pixel coordinates in the
# source video -- confirm against the footage resolution).
polygon = np.array([[1725, 1550], [2725, 1550], [3500, 2160], [1250, 2160]])
video_info = sv.VideoInfo.from_video_path(VideoAssets.GROCERY_STORE.value)
zone = sv.PolygonZone(polygon=polygon)

# Annotators that process_frame uses to draw boxes, labels and the zone.
box_annotator = sv.BoxAnnotator(thickness=4)
label_annotator = sv.LabelAnnotator(text_thickness=4, text_scale=2)
zone_annotator = sv.PolygonZoneAnnotator(
    zone=zone,
    color=sv.Color.WHITE,
    thickness=6,
    text_thickness=6,
    text_scale=4,
)

def process_frame(frame: np.ndarray, _) -> np.ndarray:
    """Run detection on one frame and draw boxes, labels and the zone on it.

    Only detections with class id 0 are kept (the run log for this model
    lists "person" first, so this is presumably the person class -- confirm
    against ``model.names``).

    Args:
        frame: Image passed to the model and to the annotators.
        _: Second positional argument supplied by the caller; unused.

    Returns:
        The frame returned by the last annotator.
    """
    # detect
    results = model(frame, imgsz=1280)[0]
    detections = sv.Detections.from_ultralytics(results)
    detections = detections[detections.class_id == 0]
    zone.trigger(detections=detections)

    # annotate
    # Read class ids and confidences from their own arrays instead of
    # unpacking the tuples yielded by iterating `detections`; the length of
    # those tuples depends on the installed supervision version.
    labels = [
        f"{model.names[class_id]} {confidence:0.2f}"
        for class_id, confidence in zip(detections.class_id, detections.confidence)
    ]
    frame = box_annotator.annotate(scene=frame, detections=detections)
    frame = label_annotator.annotate(scene=frame, detections=detections, labels=labels)
    frame = zone_annotator.annotate(scene=frame)

    return frame

# Feed every frame of the source video through process_frame and write the
# annotated result next to the notebook.
sv.process_video(
    source_path=VideoAssets.GROCERY_STORE.value,
    target_path=f"{HOME}/mall-result.mp4",
    callback=process_frame,
)

from IPython import display

#display.clear_output()


0: 736x1280 1 person, 4 bottles, 1 chair, 1 tv, 2 refrigerators, 518.9ms
Speed: 12.0ms preprocess, 518.9ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)

0: 736x1280 1 person, 4 bottles, 1 chair, 1 tv, 2 refrigerators, 503.5ms
Speed: 12.0ms preprocess, 503.5ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)

0: 736x1280 1 person, 3 bottles, 1 chair, 2 tvs, 2 refrigerators, 397.8ms
Speed: 9.0ms preprocess, 397.8ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)

0: 736x1280 1 person, 3 bottles, 1 chair, 2 tvs, 2 refrigerators, 361.6ms
Speed: 14.0ms preprocess, 361.6ms inference, 3.0ms postprocess per image at shape (1, 3, 736, 1280)

0: 736x1280 1 person, 3 bottles, 1 chair, 2 refrigerators, 350.1ms
Speed: 12.0ms preprocess, 350.1ms inference, 1.0ms postprocess per image at shape (1, 3, 736, 1280)

0: 736x1280 1 person, 3 bottles, 1 chair, 1 tv, 2 refrigerators, 409.3ms
Speed: 16.0ms preprocess, 409.3ms inference, 2.0ms postproces

KeyboardInterrupt: 