In [1]:
!python --version

Python 3.12.4


In [2]:
import ultralytics
import torch

In [3]:
from ultralytics import YOLO

import time
import torch
import cv2
import torch.backends.cudnn as cudnn
from PIL import Image
import colorsys
import numpy as np
from queue import Queue
from threading import Thread
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from queue import Queue
from threading import Thread

In [4]:
torch.cuda.get_device_name(0)

'NVIDIA GeForce GTX 1650'

In [5]:
class_names = ['person', 'bicycle', 'car', 'motorcycle',
               'airplane', 'bus', 'train', 'truck', 'boat',
               'traffic light', 'fire hydrant', 'stop sign',
               'parking meter', 'bench', 'bird', 'cat', 'dog',
               'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
               'giraffe', 'backpack', 'umbrella', 'handbag', 'tie',
               'suitcase', 'frisbee', 'skis', 'snowboard',
               'sports ball', 'kite', 'baseball bat', 'baseball glove',
               'skateboard', 'surfboard', 'tennis racket', 'bottle',
               'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
               'banana', 'apple', 'sandwich', 'orange', 'broccoli',
               'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair',
               'couch', 'potted plant', 'bed', 'dining table', 'toilet',
               'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
               'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book',
               'clock', 'vase', 'scissors', 'teddy bear', 'hair drier',
               'toothbrush']

In [6]:
INPUT_VIDEO_PATH = "data/videos/Rec16-1_trimmed.mp4"
OUTPUT_VIDEO_PATH = "output/videos/Rec16-1-yolo_trimmed_2.mp4"
INPUT_TIMESTAMP_PATH = "output/timestamps/Rec16-1_trimmed.txt"
JSON_OUTPUT_PATH = "output/json/Rec16-1_trimmed_yolo_2.json"

In [7]:
def load_timestamps(file_path):
    with open(file_path, 'r') as f:
        return [line.strip() for line in f]

In [8]:
# CONSTANTS
YOLO_MODEL_PATH = "models/yolo/yolov8n.pt"
YOLO_CONFIDENCE_THRESHOLD = 0.5
MAX_QUEUE_SIZE = 30
# Class indices for person, car, truck, bus, and motorcycle in COCO dataset
TARGET_CLASSES = [0, 2, 7, 5, 3]

In [9]:
def process_frame(frame, frame_number, model, timestamps):
    og_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = model(og_frame, classes=TARGET_CLASSES, conf=YOLO_CONFIDENCE_THRESHOLD)
    
    frame_results = []
    for det in results[0].boxes.data:
        x1, y1, x2, y2, conf, cls = det.tolist()
        frame_results.append({
            "class": model.names[int(cls)],
            "confidence": conf,
            "bbox": [x1, y1, x2, y2]
        })
    
    # Draw bounding boxes on the frame
    for det in frame_results:
        x1, y1, x2, y2 = map(int, det["bbox"])
        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
        label = f"{det['class']} {det['confidence']:.2f}"
        cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
    
    timestamp = timestamps[frame_number] if frame_number < len(timestamps) else f"frame_{frame_number}"
    return frame_number, timestamp, frame_results, frame


In [10]:
def write_video(output_queue, output_path, fps, frame_size):
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, fps, frame_size)
    
    while True:
        frame = output_queue.get()
        if frame is None:
            break
        out.write(frame)
    
    out.release()

In [11]:
def detect_yolo_parallel(video_path, timestamps_path, num_workers=4):
    model = YOLO(YOLO_MODEL_PATH)
    timestamps = load_timestamps(timestamps_path)
    
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Error opening video file")
        return

    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    frame_size = (int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))

    results_dict = {}
    output_queue = Queue(maxsize=MAX_QUEUE_SIZE)
    
    # Start the video writing thread
    write_thread = Thread(target=write_video, args=(output_queue, OUTPUT_VIDEO_PATH, fps, frame_size))
    write_thread.start()

    start_time = time.time()

    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        futures = []
        for frame_number in range(total_frames):
            ret, frame = cap.read()
            if not ret:
                break
            future = executor.submit(process_frame, frame, frame_number, model, timestamps)
            futures.append(future)

        for future in as_completed(futures):
            frame_number, timestamp, frame_results, processed_frame = future.result()
            results_dict[timestamp] = frame_results
            output_queue.put(processed_frame)

    # Signal the video writing thread to finish
    output_queue.put(None)
    write_thread.join()

    end_time = time.time()
    processing_time = end_time - start_time

    # Save results to JSON file
    with open(JSON_OUTPUT_PATH, 'w') as f:
        json.dump(results_dict, f, indent=2)

    cap.release()

    print(f"Processed {total_frames} frames in {processing_time:.2f} seconds")
    print(f"Results saved to {JSON_OUTPUT_PATH}")
    print(f"Output video saved to {OUTPUT_VIDEO_PATH}")

    return results_dict


In [12]:
results = detect_yolo_parallel(INPUT_VIDEO_PATH, INPUT_TIMESTAMP_PATH, num_workers=4)


Ultralytics YOLOv8.2.87 🚀 Python-3.12.4 torch-2.4.0+cu121 CUDA:0 (NVIDIA GeForce GTX 1650, 3897MiB)

Ultralytics YOLOv8.2.87 🚀 Python-3.12.4 torch-2.4.0+cu121 CUDA:0 (NVIDIA GeForce GTX 1650, 3897MiB)

Ultralytics YOLOv8.2.87 🚀 Python-3.12.4 torch-2.4.0+cu121 CUDA:0 (NVIDIA GeForce GTX 1650, 3897MiB)

YOLOv8n summary (fused): 168 layers, 3,151,904 parameters, 0 gradients, 8.7 GFLOPs
YOLOv8n summary (fused): 168 layers, 3,151,904 parameters, 0 gradients, 8.7 GFLOPs
YOLOv8n summary (fused): 168 layers, 3,151,904 parameters, 0 gradients, 8.7 GFLOPs
0: 352x640 (no detections), 71.1ms
Speed: 7.3ms preprocess, 71.1ms inference, 42.2ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 1 motorcycle, 48.3ms
Speed: 4.3ms preprocess, 48.3ms inference, 194.2ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 1 person, 1 motorcycle, 66.8ms
Speed: 6.2ms preprocess, 66.8ms inference, 2.7ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 1 person, 91.5ms
Speed: 5.9ms 