In [15]:
# `!python` is resolved through the shell PATH and is not guaranteed to be the
# interpreter that runs this kernel, so ask the running interpreter directly.
import platform

kernel_python_version = platform.python_version()
kernel_python_version

Python 3.12.4


In [16]:
import ultralytics
import torch

In [17]:
from ultralytics import YOLO

import time
import torch
import cv2
import torch.backends.cudnn as cudnn
from PIL import Image
import colorsys
import numpy as np
from queue import Queue
from threading import Thread
import json

In [18]:
# Guard the lookup so this cell does not raise on a machine without CUDA;
# it is informational only and nothing else in the notebook reads its value.
cuda_device_name = (
    torch.cuda.get_device_name(0)
    if torch.cuda.is_available()
    else "CUDA not available"
)
cuda_device_name

'NVIDIA GeForce GTX 1650'

In [19]:
# Label lookup table: the entry at position i is the name reported for class
# index i (detections are translated with class_names[int(cls)]).
class_names = [
    # 0-9
    'person', 'bicycle', 'car', 'motorcycle', 'airplane',
    'bus', 'train', 'truck', 'boat', 'traffic light',
    # 10-19
    'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird',
    'cat', 'dog', 'horse', 'sheep', 'cow',
    # 20-29
    'elephant', 'bear', 'zebra', 'giraffe', 'backpack',
    'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
    # 30-39
    'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat',
    'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle',
    # 40-49
    'wine glass', 'cup', 'fork', 'knife', 'spoon',
    'bowl', 'banana', 'apple', 'sandwich', 'orange',
    # 50-59
    'broccoli', 'carrot', 'hot dog', 'pizza', 'donut',
    'cake', 'chair', 'couch', 'potted plant', 'bed',
    # 60-69
    'dining table', 'toilet', 'tv', 'laptop', 'mouse',
    'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
    # 70-79
    'toaster', 'sink', 'refrigerator', 'book', 'clock',
    'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush',
]

In [20]:
# Inputs: the source video and its per-frame timestamp file (one entry per line).
INPUT_VIDEO_PATH = "data/videos/Rec16-1_trimmed.mp4"
INPUT_TIMESTAMP_PATH = "output/timestamps/Rec16-1_trimmed.txt"

# Outputs: the annotated video and the per-frame detections dumped as JSON.
OUTPUT_VIDEO_PATH = "output/videos/Rec16-1-yolo_trimmed.mp4"
JSON_OUTPUT_PATH = "output/json/Rec16-1_trimmed_yolo.json"

# TIMESTAMPS

In [21]:
def load_timestamps(file_path):
    """Read a text file and return its lines with surrounding whitespace removed.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one string per line of the file, in file order. Blank
        lines are kept as empty strings.
    """
    stripped_lines = []
    with open(file_path, 'r') as handle:
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return stripped_lines

# YOLO

In [22]:
# YOLO SETTINGS
# Weights file passed to yolo_process_video in the run cell.
YOLO_MODEL_PATH = "models/yolo/yolov8n.pt"
# Forwarded to the model call as `conf`.
YOLO_CONFIDENCE_THRESHOLD = 0.5
# Upper bound on queued frames — presumably for the reader/processor queues.
MAX_QUEUE_SIZE = 30
# COCO class indices to keep; forwarded to the model call as `classes`.
TARGET_CLASSES = [
    0,  # person
    2,  # car
    7,  # truck
    5,  # bus
    3,  # motorcycle
]

In [23]:
def yolo_process_frames(yolo_frame_queue, yolo_result_queue, yolo_model, timestamps):
    """Run the model on queued frames, forward the results, and dump detections to JSON.

    Frames are taken from ``yolo_frame_queue`` until a ``None`` sentinel is
    received. Each frame is passed to ``yolo_model`` restricted to
    ``TARGET_CLASSES`` and ``YOLO_CONFIDENCE_THRESHOLD``, and the pair
    ``(frame, results)`` is put on ``yolo_result_queue``. Per-frame detections
    are collected in a dict keyed by ``timestamps[frame_index]`` (or
    ``"frame_<index>"`` once the timestamp list is exhausted) and written to
    ``JSON_OUTPUT_PATH`` after the last frame.

    A ``None`` sentinel is always put on ``yolo_result_queue`` when this
    function exits, even if inference or the JSON dump raises. Without that
    the consumer of the result queue would block forever on ``get()``. On an
    early exit the remaining frames are also drained so the producer can
    finish and emit its own sentinel.

    Args:
        yolo_frame_queue: Queue yielding frames, terminated by ``None``.
        yolo_result_queue: Queue receiving ``(frame, results)`` tuples,
            terminated by ``None``.
        yolo_model: Callable invoked as ``yolo_model(frame, classes=..., conf=...)``;
            element 0 of its return value must expose ``.boxes.data``.
        timestamps: Sequence indexed by frame position, used as JSON keys.
    """
    frame_index = 0
    results_dict = {}
    saw_sentinel = False

    try:
        while True:
            yolo_frame = yolo_frame_queue.get()
            if yolo_frame is None:
                saw_sentinel = True
                break

            # Run YOLOv8 inference with specific classes and confidence threshold
            yolo_results = yolo_model(yolo_frame, classes=TARGET_CLASSES,
                                      conf=YOLO_CONFIDENCE_THRESHOLD)

            # Each row of boxes.data unpacks to a box, a confidence and a class index
            frame_results = []
            for det in yolo_results[0].boxes.data:
                x1, y1, x2, y2, conf, cls = det.tolist()
                frame_results.append({
                    "class": class_names[int(cls)],
                    "confidence": conf,
                    "bbox": [x1, y1, x2, y2]
                })

            # Fall back to a synthetic key when there are more frames than timestamps
            timestamp = timestamps[frame_index] if frame_index < len(timestamps) else f"frame_{frame_index}"
            results_dict[timestamp] = frame_results

            yolo_result_queue.put((yolo_frame, yolo_results))
            frame_index += 1

        # Save results to JSON file
        with open(JSON_OUTPUT_PATH, 'w') as f:
            json.dump(results_dict, f, indent=2)
    finally:
        # Always release the consumer, whether we finished normally or failed.
        yolo_result_queue.put(None)
        # After a mid-stream failure, keep consuming until the producer's
        # sentinel arrives so the producer is not left waiting on a full queue.
        while not saw_sentinel:
            saw_sentinel = yolo_frame_queue.get() is None

In [24]:
def yolo_read_frames(yolo_cap, yolo_frame_queue, yolo_max_queue_size):
    """Feed frames from a capture object into a queue, then signal the end.

    While the queue holds fewer than ``yolo_max_queue_size`` items, one frame
    is read via ``yolo_cap.read()`` and enqueued. When the queue is at or
    above that size the function sleeps for 0.1 s and checks again. Reading
    stops at the first unsuccessful ``read()``, after which a ``None``
    sentinel is put on the queue.

    Args:
        yolo_cap: Object whose ``read()`` returns ``(ok, frame)``.
        yolo_frame_queue: Queue receiving the frames and the final ``None``.
        yolo_max_queue_size: Queue size at which reading is paused.
    """
    exhausted = False
    while not exhausted:
        # Back off while the consumer is behind.
        if yolo_frame_queue.qsize() >= yolo_max_queue_size:
            time.sleep(0.1)
            continue

        grabbed, image = yolo_cap.read()
        if grabbed:
            yolo_frame_queue.put(image)
        else:
            exhausted = True

    # Tell the consumer that no more frames will arrive.
    yolo_frame_queue.put(None)

In [25]:
def yolo_write_video(yolo_result_queue, yolo_out):
    """Drain the result queue, writing one plotted frame per queued item.

    Items are ``(frame, results)`` pairs. For each one, the image returned by
    ``results[0].plot()`` is passed to ``yolo_out.write``. The loop ends when
    a ``None`` sentinel is taken from the queue.

    Args:
        yolo_result_queue: Queue of ``(frame, results)`` tuples ending in ``None``.
        yolo_out: Object exposing ``write(image)``.
    """
    while (queued := yolo_result_queue.get()) is not None:
        # The raw frame travels with the results but only the plot is written.
        _, detections = queued
        yolo_out.write(detections[0].plot())

In [26]:
def yolo_process_video(yolo_input_path, yolo_output_path, timestamps_path, yolo_model_path="yolov8n.pt"):
    """Annotate a video with YOLO detections using a three-thread pipeline.

    Loads the model and the timestamp list, opens the input video, and creates
    an ``mp4v`` writer with the input's width, height and frame rate. Three
    threads are then started and joined: ``yolo_read_frames`` (reads frames),
    ``yolo_process_frames`` (runs inference and writes the JSON file) and
    ``yolo_write_video`` (writes the plotted frames). The threads are linked by
    two queues bounded by ``MAX_QUEUE_SIZE``.

    Args:
        yolo_input_path: Path of the video to read.
        yolo_output_path: Path of the annotated video to write.
        timestamps_path: Text file with one timestamp per line; its line count
            must equal the frame count reported for the input video.
        yolo_model_path: Weights file passed to ``YOLO``.

    Raises:
        IOError: If the input video cannot be opened.
        AssertionError: If the frame count and the number of timestamps differ.
    """
    yolo_model = YOLO(yolo_model_path)
    timestamps = load_timestamps(timestamps_path)

    yolo_cap = cv2.VideoCapture(yolo_input_path)
    yolo_out = None
    try:
        if not yolo_cap.isOpened():
            raise IOError(f"Could not open input video: {yolo_input_path}")

        yolo_width = int(yolo_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        yolo_height = int(yolo_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        # Keep the frame rate as a float: truncating e.g. 29.97 to 29 would
        # make the output play at a different speed than the input.
        yolo_fps = yolo_cap.get(cv2.CAP_PROP_FPS)
        yolo_total_frames = int(yolo_cap.get(cv2.CAP_PROP_FRAME_COUNT))

        assert yolo_total_frames == len(timestamps), (
            f"Video reports {yolo_total_frames} frames but "
            f"{len(timestamps)} timestamps were loaded"
        )

        yolo_fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        yolo_out = cv2.VideoWriter(yolo_output_path, yolo_fourcc, yolo_fps, (yolo_width, yolo_height))

        # Bounded queues keep memory use flat when a stage falls behind.
        yolo_frame_queue = Queue(maxsize=MAX_QUEUE_SIZE)
        yolo_result_queue = Queue(maxsize=MAX_QUEUE_SIZE)

        yolo_read_thread = Thread(target=yolo_read_frames, args=(yolo_cap, yolo_frame_queue, MAX_QUEUE_SIZE))
        yolo_process_thread = Thread(target=yolo_process_frames, args=(yolo_frame_queue, yolo_result_queue, yolo_model, timestamps))
        yolo_write_thread = Thread(target=yolo_write_video, args=(yolo_result_queue, yolo_out))

        yolo_read_thread.start()
        yolo_process_thread.start()
        yolo_write_thread.start()

        yolo_read_thread.join()
        yolo_process_thread.join()
        yolo_write_thread.join()
    finally:
        # Release the capture and writer even when validation or setup fails.
        yolo_cap.release()
        if yolo_out is not None:
            yolo_out.release()

    cv2.destroyAllWindows()

# Implementation

In [27]:
# Run the full pipeline on the configured recording.
yolo_process_video(
    yolo_input_path=INPUT_VIDEO_PATH,
    yolo_output_path=OUTPUT_VIDEO_PATH,
    timestamps_path=INPUT_TIMESTAMP_PATH,
    yolo_model_path=YOLO_MODEL_PATH,
)


0: 352x640 (no detections), 33.4ms
Speed: 3.1ms preprocess, 33.4ms inference, 0.5ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 1 person, 1 motorcycle, 7.7ms
Speed: 3.5ms preprocess, 7.7ms inference, 2.5ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 1 person, 8.5ms
Speed: 3.5ms preprocess, 8.5ms inference, 2.5ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 1 person, 6.9ms
Speed: 2.6ms preprocess, 6.9ms inference, 2.0ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 1 person, 8.2ms
Speed: 2.2ms preprocess, 8.2ms inference, 3.4ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 1 person, 7.9ms
Speed: 3.4ms preprocess, 7.9ms inference, 2.1ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 1 person, 8.6ms
Speed: 2.3ms preprocess, 8.6ms inference, 1.6ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 1 person, 9.7ms
Speed: 2.9ms preprocess, 9.7ms inference, 2.6ms postprocess per image at shape (1

Speed: 2.7ms preprocess, 6.4ms inference, 2.7ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 1 person, 7.6ms
Speed: 2.7ms preprocess, 7.6ms inference, 1.9ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 1 person, 8.3ms
Speed: 3.2ms preprocess, 8.3ms inference, 1.7ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 1 person, 1 motorcycle, 7.9ms
Speed: 2.2ms preprocess, 7.9ms inference, 2.9ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 1 person, 1 motorcycle, 9.6ms
Speed: 4.4ms preprocess, 9.6ms inference, 2.3ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 1 person, 1 motorcycle, 5.8ms
Speed: 4.2ms preprocess, 5.8ms inference, 1.7ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 1 person, 1 motorcycle, 8.8ms
Speed: 2.5ms preprocess, 8.8ms inference, 3.0ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 1 person, 5.2ms
Speed: 2.9ms preprocess, 5.2ms inference, 2.6ms postprocess per image at sha