In [1]:
!pip install spaces
!pip install gradio
!pip install pillow
!pip install opencv-python
!pip install transformers
!pip install ultralytics

Collecting spaces
  Downloading spaces-0.36.0-py3-none-any.whl.metadata (1.1 kB)
Collecting gradio (from spaces)
  Downloading gradio-5.30.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio->spaces)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio->spaces)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio->spaces)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.1 (from gradio->spaces)
  Downloading gradio_client-1.10.1-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio->spaces)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio->spaces)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio->spaces)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
from PIL import ImageDraw, ImageFont
import colorsys
import spaces
import cv2
from PIL import Image, ImageDraw, ImageFont
import torch
import time
import numpy as np
import uuid
import gradio as gr
from ultralytics import YOLO
from transformers import RTDetrImageProcessor, RTDetrForObjectDetection, RTDetrV2ForObjectDetection

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


In [4]:
def get_color(label):
    hash_value = hash(label)
    hue = (hash_value % 100) / 100.0
    saturation = 0.7
    value = 0.9
    rgb = colorsys.hsv_to_rgb(hue, saturation, value)
    return tuple(int(x * 255) for x in rgb)


In [5]:
def detect_and_annotate(video_path: str, output_path: str,model_weights: str,conf_threshold: float):
    model = YOLO(model_weights)
    cap = cv2.VideoCapture(video_path)

    fps    = cap.get(cv2.CAP_PROP_FPS)
    width  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    writer = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    frame_idx = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frame_idx += 1

        results = model(frame)[0]

        boxes = results.boxes
        xyxy  = boxes.xyxy.cpu().numpy()
        confs = boxes.conf.cpu().numpy()
        clss  = boxes.cls.cpu().numpy().astype(int)

        for (x1, y1, x2, y2), conf, cls in zip(xyxy, confs, clss):
            if conf < conf_threshold:
                continue
            x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
            label = f"{model.names[cls]} {conf:.2f}"
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(frame,label,(x1, y1 - 10),cv2.FONT_HERSHEY_SIMPLEX,0.5,(0, 255, 0),2, cv2.LINE_AA)

        writer.write(frame)

    cap.release()
    writer.release()
    print(f"outpur path: {output_path}")
    return output_path

In [6]:
def draw_bounding_boxes(image, results: dict, model, threshold=0.3):
    draw = ImageDraw.Draw(image)
    font = ImageFont.load_default()
    for score, label_id, box in zip(
        results["scores"], results["labels"], results["boxes"]
    ):
        if score > threshold:
            label = model.config.id2label[label_id.item()]
            box = [round(i, 2) for i in box.tolist()]
            color = get_color(label)

            draw.rectangle(box, outline=color, width=3)
            text = f"{label}: {score:.2f}"
            text_bbox = draw.textbbox((0, 0), text, font=font)
            text_width = text_bbox[2] - text_bbox[0]
            text_height = text_bbox[3] - text_bbox[1]
            draw.rectangle(
                [box[0], box[1] - text_height - 4, box[0] + text_width, box[1]],
                fill=color,
            )

            draw.text((box[0], box[1] - text_height - 4), text, fill="white", font=font)

    return image

In [7]:
def detect_objects_in_video(input_video_path: str,output_video_path: str = None,conf_threshold: float = 0.8,
                            subsample: int = 1,batch_duration_sec: int = 2, model_name: str = None):

    device = "cuda" if torch.cuda.is_available() else "cpu"
    #тут нужно указать путь до моделей
    image_processor = RTDetrImageProcessor.from_pretrained("/путь/до/модели/p/",local_files_only=True)
    model = RTDetrV2ForObjectDetection.from_pretrained("/путь/до/модели/m/", local_files_only=True).to(device)
    if model_name == "RT-DETR_50":
      image_processor = RTDetrImageProcessor.from_pretrained("/путь/до/модели/p",local_files_only=True)
      model = RTDetrV2ForObjectDetection.from_pretrained("/путь/до/модели/m", local_files_only=True).to(device)
    model.eval()

    cap = cv2.VideoCapture(input_video_path)

    orig_fps = cap.get(cv2.CAP_PROP_FPS)
    fps = int(orig_fps) if orig_fps > 0 else 30
    desired_fps = max(1, fps )
    orig_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    out_w, out_h = orig_w, orig_h
    if output_video_path is None:
        output_video_path = f"output_{uuid.uuid4()}.mp4"
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    writer = cv2.VideoWriter(output_video_path, fourcc, desired_fps, (out_w, out_h))
    batch_size = desired_fps * batch_duration_sec
    batch_frames = []
    frame_idx = 0

    while True:
        ret, frame = cap.read()
        if not ret:
            break
        small = cv2.resize(frame, (out_w, out_h))
        rgb = cv2.cvtColor(small, cv2.COLOR_BGR2RGB)
        batch_frames.append(rgb)

        if len(batch_frames) >= batch_size:
            inputs = image_processor(images=batch_frames, return_tensors="pt").to(device)
            with torch.no_grad():
                outputs = model(**inputs)

            target_sizes = torch.tensor([[out_h, out_w]] * len(batch_frames)).to(device)
            detections = image_processor.post_process_object_detection(
                outputs,
                target_sizes=target_sizes,
                threshold=conf_threshold
            )
            for img_np, det in zip(batch_frames, detections):
                pil = Image.fromarray(img_np)
                pil = draw_bounding_boxes(pil, det, model, conf_threshold)
                out_np = np.array(pil)[:, :, ::-1]
                writer.write(out_np)
            batch_frames = []
        frame_idx += 1

    if batch_frames:
        inputs = image_processor(images=batch_frames, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        target_sizes = torch.tensor([[out_h, out_w]] * len(batch_frames)).to(device)
        detections = image_processor.post_process_object_detection(
            outputs,
            target_sizes=target_sizes,
            threshold=conf_threshold
        )
        for img_np, det in zip(batch_frames, detections):
            pil = Image.fromarray(img_np)
            pil = draw_bounding_boxes(pil, det, model, conf_threshold)
            out_np = np.array(pil)[:, :, ::-1]
            writer.write(out_np)

    cap.release()
    writer.release()
    return output_video_path


In [8]:
BATCH_DURATION_SEC = 2
YOLO_WEIGHTS = {
    "YOLOv8x": "/путь/до/модели/best.pt",
    "YOLOv8l": "/путь/до/модели/best.pt",
    "YOLOv8m": "/путь/до/модели/best.pt",
}
def process_video(video_path: str, conf_threshold: float, model_name: str) -> str:
    output_path = f"output_{uuid.uuid4()}.mp4"

    if model_name.startswith("RT-DETR"):
        detect_objects_in_video(
            input_video_path=video_path,
            output_video_path=output_path,
            conf_threshold=conf_threshold,
            subsample=1,
            batch_duration_sec=BATCH_DURATION_SEC,
            model_name = model_name
        )
    else:
        weights = YOLO_WEIGHTS[model_name]
        detect_and_annotate(
            video_path,
            output_path,
            model_weights=weights,
            conf_threshold=conf_threshold
        )

    return output_path


In [None]:
with gr.Blocks() as app:
    gr.HTML("<h1 style='text-align: center'>Video Object Detection</h1>"
            "<p>diploma</p>")
    with gr.Row():
        with gr.Column():
            video_input      = gr.Video(label="Видео")
            conf_threshold   = gr.Slider(0.0, 1.0, value=0.3, step=0.05,
                                        label="Порог уверенности")
            model_selector   = gr.Dropdown(
                ["RT-DETR_101",'RT-DETR_50', "YOLOv8x", "YOLOv8l", "YOLOv8m"],
                label="Модель"
            )
            btn = gr.Button("Запустить")
        with gr.Column():
            video_output = gr.Video(label="Результат", autoplay=True)

    btn.click(fn=process_video,
              inputs=[video_input, conf_threshold, model_selector],
              outputs=[video_output])

app.launch(debug=True)

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://5ac9c1f712c399e83e.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
Speed: 1.9ms preprocess, 12.3ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 organic, 12.4ms
Speed: 1.9ms preprocess, 12.4ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 organic, 12.4ms
Speed: 1.8ms preprocess, 12.4ms inference, 1.7ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 organic, 12.5ms
Speed: 1.8ms preprocess, 12.5ms inference, 1.6ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 organic, 13.8ms
Speed: 1.8ms preprocess, 13.8ms inference, 1.6ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 organic, 13.5ms
Speed: 1.8ms preprocess, 13.5ms inference, 1.6ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 organic, 12.6ms
Speed: 1.9ms preprocess, 12.6ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 organic, 12.2ms
Speed: 1.7ms preprocess, 12.2

