In [1]:
# Record which Python the shell resolves `python` to (IPython shell escape).
# NOTE(review): this is whatever `python` is first on PATH, which is presumably
# the kernel's interpreter -- confirm if the environment has several installs.
!python --version

Python 3.12.4


In [2]:
import ultralytics
import torch

In [3]:
import cv2
import time
from queue import Queue
from threading import Thread
from concurrent.futures import ThreadPoolExecutor, as_completed
from abc import ABC, abstractmethod
import json
from ultralytics import YOLO
import numpy as np
import os

In [4]:
# Show which accelerator is available. Asking for device 0 raises when no CUDA
# device is present, so check availability first and keep the notebook runnable
# on CPU-only machines.
device_name = (
    torch.cuda.get_device_name(0)
    if torch.cuda.is_available()
    else "CPU (CUDA not available)"
)
device_name

'NVIDIA GeForce GTX 1650'

In [5]:
# Class labels indexed by class id (80 entries, ids 0-79).
# The position of each name in this list is the id it is looked up by.
class_names = [
    # 0-7
    "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck",
    # 8-15
    "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
    # 16-23
    "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
    # 24-31
    "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard",
    # 32-39
    "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle",
    # 40-47
    "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
    # 48-55
    "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake",
    # 56-63
    "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop",
    # 64-71
    "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink",
    # 72-79
    "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush",
]

In [6]:
# Input recording used by the video configs further down.
INPUT_VIDEO_PATH = 'data/videos/Rec16-1_trimmed.mp4'

# Timestamp file belonging to the same recording.
# NOTE(review): no visible cell reads this constant -- confirm it is still needed.
INPUT_TIMESTAMP_PATH = 'output/timestamps/Rec16-1_trimmed.txt'

# Output video/JSON locations are no longer fixed constants: they are built per
# video in YOLOProcessor.create_process_config.

In [7]:
# Pipeline configuration (used as default arguments by the processor classes).
YOLO_MODEL_PATH = 'models/yolo/yolov8n.pt'   # weights file handed to YOLO()
YOLO_CONFIDENCE_THRESHOLD = 0.5              # passed to the model as `conf`
MAX_QUEUE_SIZE = 30                          # capacity of each inter-thread queue

# COCO class ids the detector is restricted to.
TARGET_CLASSES = [
    0,  # person
    2,  # car
    7,  # truck
    5,  # bus
    3,  # motorcycle
]

In [8]:
import json

class FileIO:
    """Small stateless helpers for reading and writing text and JSON files.

    Every file is opened explicitly as UTF-8 so results do not depend on the
    platform's default locale encoding.
    """

    @staticmethod
    def read_lines(file_path):
        """Return the lines of ``file_path`` with surrounding whitespace stripped.

        Args:
            file_path: Path of the text file to read.

        Returns:
            list[str]: One entry per line, in file order (blank lines are kept
            as empty strings).
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            return [line.strip() for line in f]

    @staticmethod
    def read_json(file_path):
        """Parse ``file_path`` as JSON and return the decoded object.

        Args:
            file_path: Path of the JSON file to read.

        Returns:
            The Python object produced by ``json.load``.
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)

    @staticmethod
    def write_json(file_path, data):
        """Serialise ``data`` to ``file_path`` as JSON indented by two spaces.

        Args:
            file_path: Destination path; an existing file is overwritten.
            data: Any JSON-serialisable object.
        """
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)

# Quick check of the helper: load one gaze file and print the parsed object.
# This prints the whole file. Judging by the recorded output, the JSON maps a
# frame index (as a string) to a dict with 'x' and 'y' values.
fileio = FileIO()
print(fileio.read_json("output/gaze/balls/balls_gaze.json"))

{'0': {'x': 1108.0, 'y': 712.0}, '1': {'x': 1108.0, 'y': 712.0}, '2': {'x': 1070.0, 'y': 702.0}, '3': {'x': 1066.0, 'y': 708.0}, '4': {'x': 1022.0, 'y': 742.0}, '5': {'x': 1038.0, 'y': 562.0}, '6': {'x': 660.0, 'y': 136.0}, '7': {'x': 688.0, 'y': 322.0}, '8': {'x': 670.0, 'y': 306.0}, '9': {'x': 666.0, 'y': 306.0}, '10': {'x': 666.0, 'y': 306.0}, '11': {'x': 670.0, 'y': 306.0}, '12': {'x': 674.0, 'y': 310.0}, '13': {'x': 680.0, 'y': 312.0}, '14': {'x': 690.0, 'y': 316.0}, '15': {'x': 690.0, 'y': 316.0}, '16': {'x': 696.0, 'y': 318.0}, '17': {'x': 598.0, 'y': 334.0}, '18': {'x': 558.0, 'y': 334.0}, '19': {'x': 558.0, 'y': 338.0}, '20': {'x': 566.0, 'y': 342.0}, '21': {'x': 542.0, 'y': 336.0}, '22': {'x': 568.0, 'y': 334.0}, '23': {'x': 566.0, 'y': 336.0}, '24': {'x': 740.0, 'y': 314.0}, '25': {'x': 774.0, 'y': 310.0}, '26': {'x': 784.0, 'y': 312.0}, '27': {'x': 782.0, 'y': 312.0}, '28': {'x': 786.0, 'y': 312.0}, '29': {'x': 786.0, 'y': 312.0}, '30': {'x': 786.0, 'y': 312.0}, '31': {'x':

In [9]:
class _StageThread(Thread):
    """Thread running one pipeline stage without letting a failure stall the others.

    When the stage raises, the exception is stored in ``error`` and the thread
    (1) puts the ``None`` end-of-stream marker on the queue it feeds so the
    following stages terminate, and (2) keeps emptying the queue it consumes
    until the stage feeding that queue has exited, so that stage can never
    stay blocked on a full queue.
    """

    def __init__(self, stage, stage_args, upstream=None, producer=None, downstream=None):
        """
        Args:
            stage: Callable implementing the stage.
            stage_args: Positional arguments passed to ``stage``.
            upstream: Queue the stage consumes, or None for the first stage.
            producer: Thread feeding ``upstream``, or None for the first stage.
            downstream: Queue the stage feeds, or None for the last stage.
        """
        super().__init__()
        self.stage = stage
        self.stage_args = stage_args
        self.upstream = upstream
        self.producer = producer
        self.downstream = downstream
        self.error = None

    def run(self):
        try:
            self.stage(*self.stage_args)
        except Exception as exc:
            self.error = exc
            # Let the consumers finish: they only stop on the None marker.
            if self.downstream is not None:
                self.downstream.put(None)
            # Let the producer finish: discard whatever it still emits.
            if self.upstream is not None and self.producer is not None:
                while self.producer.is_alive():
                    if self.upstream.qsize():
                        # The failed stage was the only consumer of this queue,
                        # so a non-empty queue means get() returns immediately.
                        self.upstream.get()
                    else:
                        time.sleep(0.05)


class VideoProcessor(ABC):
    """Threaded read -> process -> post-process -> write pipeline for video files.

    Subclasses provide the per-video configuration and the two middle stages.
    Stages hand data to each other through bounded queues and use ``None`` as
    the end-of-stream marker.
    """

    def __init__(self, output_dir, max_queue_size=MAX_QUEUE_SIZE, max_workers=None):
        """
        Args:
            output_dir: Base directory for results.
            max_queue_size: Capacity of each inter-stage queue.
            max_workers: Worker count for the thread pool used by
                ``process_videos`` (None lets the executor choose).
        """
        self.output_dir = output_dir
        self.max_queue_size = max_queue_size
        self.max_workers = max_workers
        self.text_reader = FileIO()

    @abstractmethod
    def create_process_config(self, video_config):
        """Turn one user-supplied video config into the dict the pipeline uses.

        The result must contain at least 'input_path', 'output_video_path' and
        'output_json_path', which ``process_single_video`` reads.
        """
        pass

    @abstractmethod
    def _process_frames(self, frame_queue, result_queue, process_config, results_dict):
        """Consume frames until ``None``, fill ``results_dict`` and feed ``result_queue``.

        Implementations must put ``None`` on ``result_queue`` when done.
        """
        pass

    @abstractmethod
    def _post_process(self, result_queue, output_queue):
        """Consume results until ``None`` and feed frames to ``output_queue``.

        Frames put on ``output_queue`` are treated as RGB by ``_write_video``.
        Implementations must put ``None`` on ``output_queue`` when done.
        """
        pass

    def process_videos(self, video_configs):
        """Process several videos concurrently, one pool task per video.

        ``create_process_config`` runs in the calling thread before each task
        is submitted. Exceptions raised while a video is processed are printed
        rather than re-raised, so one bad video does not stop the others.

        Args:
            video_configs: Iterable of configs accepted by ``create_process_config``.
        """
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures = [executor.submit(self.process_single_video, self.create_process_config(video_config))
                       for video_config in video_configs]
            for future in as_completed(futures):
                try:
                    future.result()
                except Exception as e:
                    print(f"An error occurred: {str(e)}")

    def process_single_video(self, process_config):
        """Run the whole pipeline for one video and write its JSON results.

        Whatever results were collected are saved even when a stage failed;
        the failure is then reported to the caller.

        Args:
            process_config: Dict produced by ``create_process_config``.

        Raises:
            IOError: If the input video cannot be opened.
            RuntimeError: If any pipeline stage raised; the original exception
                is attached as the cause.
        """
        cap, video_params = self._initialize_video_capture(process_config['input_path'])
        try:
            queues = self._create_queues()
            results_dict = {}

            threads = self._create_and_start_threads(cap, queues, process_config, results_dict, video_params)
            self._join_threads(threads)
            self._save_results(process_config['output_json_path'], results_dict)

            failures = [t.error for t in threads if getattr(t, 'error', None) is not None]
            if failures:
                raise RuntimeError(
                    f"Processing {process_config['input_path']} failed: {failures[0]!r}"
                ) from failures[0]
        finally:
            # Always give the capture handle back, even when a stage failed.
            cap.release()
        cv2.destroyAllWindows()

    def _initialize_video_capture(self, input_path):
        """Open ``input_path`` and read its basic properties.

        Returns:
            tuple: ``(cap, video_params)`` where ``video_params`` holds 'width'
            and 'height' as ints and 'fps' as reported by OpenCV. The frame
            rate is not truncated to an integer, so fractional rates such as
            29.97 are written back unchanged.

        Raises:
            IOError: If OpenCV cannot open the file.
        """
        cap = cv2.VideoCapture(input_path)
        if not cap.isOpened():
            cap.release()
            raise IOError(f"Could not open video: {input_path}")
        video_params = {
            'width': int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
            'height': int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
            'fps': cap.get(cv2.CAP_PROP_FPS)
        }
        return cap, video_params

    def _create_queues(self):
        """Return the three bounded inter-stage queues keyed 'frame', 'result', 'output'."""
        return {name: Queue(maxsize=self.max_queue_size) for name in ['frame', 'result', 'output']}

    def _create_and_start_threads(self, cap, queues, process_config, results_dict, video_params):
        """Create one thread per stage, start them all and return them in stage order."""
        # Each entry: (stage callable, its arguments, queue it consumes, queue it feeds).
        stages = [
            (self._read_frames, (cap, queues['frame']), None, queues['frame']),
            (self._process_frames, (queues['frame'], queues['result'], process_config, results_dict),
             queues['frame'], queues['result']),
            (self._post_process, (queues['result'], queues['output']), queues['result'], queues['output']),
            (self._write_video, (process_config['output_video_path'], queues['output'],
                                 video_params['fps'], video_params['width'], video_params['height']),
             queues['output'], None),
        ]
        threads = []
        for stage, stage_args, upstream, downstream in stages:
            # The previous stage in the list is the one feeding this stage's input queue.
            producer = threads[-1] if threads else None
            threads.append(_StageThread(stage, stage_args, upstream, producer, downstream))
        for thread in threads:
            thread.start()
        return threads

    def _join_threads(self, threads):
        """Block until every stage thread has finished."""
        for thread in threads:
            thread.join()

    def _save_results(self, output_json_path, results_dict):
        """Write ``results_dict`` to ``output_json_path`` as JSON."""
        self.text_reader.write_json(output_json_path, results_dict)

    def _read_frames(self, cap, frame_queue):
        """Read frames from ``cap`` into ``frame_queue`` and finish with ``None``.

        Back-pressure comes from the queue itself: ``put`` blocks while a
        bounded queue is full, so no polling is needed. An unbounded queue
        (``max_queue_size <= 0``) is filled without waiting.
        """
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame_queue.put(frame)
        frame_queue.put(None)

    def _write_video(self, output_path, output_queue, fps, width, height):
        """Write frames from ``output_queue`` to an mp4v-encoded file until ``None``.

        Frames are converted from RGB to BGR before being written.

        Raises:
            IOError: If the video writer cannot be opened for ``output_path``.
        """
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
        try:
            if not out.isOpened():
                raise IOError(f"Could not open video writer for: {output_path}")
            while True:
                frame = output_queue.get()
                if frame is None:
                    break
                out.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
        finally:
            # Finalise the container even if a write failed part-way through.
            out.release()

    def get_unique_output_dir(self, base_path):
        """Return ``base_path`` if unused, else the first free ``<base_path>_copyN`` (N = 1, 2, ...).

        Only the path is computed; nothing is created on disk.
        """
        if not os.path.exists(base_path):
            return base_path

        counter = 1
        while True:
            new_path = f"{base_path}_copy{counter}"
            if not os.path.exists(new_path):
                return new_path
            counter += 1

In [10]:
class YOLOProcessor(VideoProcessor):
    """Video pipeline that runs YOLO detection and matches gaze points to boxes.

    For each frame it records every detection and, if the frame's gaze point
    lies inside a detected box, that detection as the gaze target.
    """

    def __init__(self, output_dir, yolo_model_path=YOLO_MODEL_PATH,
                 confidence_threshold=YOLO_CONFIDENCE_THRESHOLD,
                 target_classes=TARGET_CLASSES,
                 max_queue_size=MAX_QUEUE_SIZE, max_workers=None):
        """
        Args:
            output_dir: Base directory; each video gets a sub-directory in it.
            yolo_model_path: Weights file passed to ``YOLO``.
            confidence_threshold: Passed to the model as ``conf``.
            target_classes: Class ids passed to the model as ``classes``.
            max_queue_size: Capacity of each inter-stage queue.
            max_workers: Thread-pool size used by ``process_videos``.
        """
        from threading import Lock

        super().__init__(output_dir, max_queue_size, max_workers)
        self.yolo_model = YOLO(yolo_model_path)
        self.confidence_threshold = confidence_threshold
        self.target_classes = target_classes
        # process_videos may run several videos at once and they all share this
        # one model instance; Ultralytics documents shared instances as not
        # thread-safe, so inference calls are serialised with a lock.
        self._inference_lock = Lock()

    def create_process_config(self, video_config):
        """Build the pipeline config for one video and create its output directory.

        Args:
            video_config: Dict with 'video_id', 'input_path' and 'gaze_path'.

        Returns:
            dict: Input/output paths plus the parsed gaze JSON under 'gaze'.
        """
        video_id = video_config['video_id']
        output_dir = self.get_unique_output_dir(os.path.join(self.output_dir, video_id))
        os.makedirs(output_dir, exist_ok=True)

        return {
            'input_path': video_config['input_path'],
            'video_id': video_id,
            'gaze_path': video_config['gaze_path'],
            'output_dir': output_dir,
            'output_video_path': os.path.join(output_dir, f"output_{video_id}.mp4"),
            'output_json_path': os.path.join(output_dir, f"output_{video_id}.json"),
            'gaze': self.text_reader.read_json(video_config['gaze_path'])
        }

    def _process_frames(self, frame_queue, result_queue, process_config, results_dict):
        """Detect objects in each frame and record which detection the gaze hits.

        ``results_dict[frame_index]`` receives ``{"detections": [...],
        "gaze_target": detection-or-None}``. When several boxes contain the
        gaze point, the last one in the model's output is kept. A frame with
        no entry in the gaze data gets ``gaze_target`` None instead of
        aborting the run.
        """
        gaze_by_frame = process_config['gaze']
        frame_index = 0

        while True:
            frame = frame_queue.get()
            if frame is None:
                break

            with self._inference_lock:
                results = self.yolo_model(frame, classes=self.target_classes, conf=self.confidence_threshold)
            detections = results[0]
            # Use the model's own id -> name mapping so custom weights are
            # labelled correctly as well.
            names = detections.names

            frame_results = []
            gaze_target = None

            # Gaze entries are keyed by the frame index as a string.
            # NOTE(review): assumes gaze x/y are in the same pixel coordinates
            # as the video frame -- confirm against the gaze export.
            gaze_data = gaze_by_frame.get(str(frame_index))

            for det in detections.boxes.data:
                x1, y1, x2, y2, conf, cls = det.tolist()
                detect_data = {
                    "class": names[int(cls)],
                    "confidence": conf,
                    "bbox": [float(x1), float(y1), float(x2), float(y2)]
                }
                frame_results.append(detect_data)

                # Inclusive point-in-box test.
                if (gaze_data is not None
                        and x1 <= gaze_data['x'] <= x2
                        and y1 <= gaze_data['y'] <= y2):
                    gaze_target = detect_data

            results_dict[frame_index] = {
                "detections": frame_results,
                "gaze_target": gaze_target
            }
            result_queue.put((frame, results))
            frame_index += 1

        result_queue.put(None)

    def _post_process(self, result_queue, output_queue):
        """Draw the detections on each frame and pass the image to the writer.

        ``plot()`` returns a BGR image, while the base class writer converts
        incoming frames from RGB to BGR. The image is therefore converted to
        RGB here so the written video keeps the correct colours.
        """
        while True:
            item = result_queue.get()
            if item is None:
                break
            frame, results = item
            annotated_bgr = results[0].plot()
            output_queue.put(cv2.cvtColor(annotated_bgr, cv2.COLOR_BGR2RGB))

        output_queue.put(None)

In [11]:
# Base directory for results; YOLOProcessor.create_process_config creates one
# sub-directory per video_id underneath it.
OUTPUT_DIR = "output/results"

In [12]:
# Build the processor with the default model path, confidence threshold and
# target classes; constructing it loads the YOLO weights.
yolo_processor = YOLOProcessor(OUTPUT_DIR)

In [13]:
# One config per run. Each run has its own gaze file under
# output/gaze/<video_id>/, while every run reads the same input video.
# NOTE(review): both ids pointing at INPUT_VIDEO_PATH looks intentional here --
# confirm that "balls2" is not meant to have its own recording.
video_config = [
    {
        "video_id": run_id,
        "input_path": INPUT_VIDEO_PATH,
        "gaze_path": f"output/gaze/{run_id}/{run_id}_gaze.json",
    }
    for run_id in ("balls", "balls2")
]

# Process every configured run; per-video errors are printed by process_videos.
yolo_processor.process_videos(video_config)


Ultralytics YOLOv8.2.87 🚀 Python-3.12.4 torch-2.4.0+cu121 CUDA:0 (NVIDIA GeForce GTX 1650, 3897MiB)

YOLOv8n summary (fused): 168 layers, 3,151,904 parameters, 0 gradients, 8.7 GFLOPs
0: 352x640 (no detections), 39.6ms
Speed: 4.2ms preprocess, 39.6ms inference, 16.5ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 (no detections), 34.8ms
Speed: 3.6ms preprocess, 34.8ms inference, 0.7ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 1 person, 1 motorcycle, 6.1ms
Speed: 3.5ms preprocess, 6.1ms inference, 95.5ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 1 person, 1 motorcycle, 7.4ms
Speed: 3.2ms preprocess, 7.4ms inference, 2.2ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 1 person, 6.7ms
Speed: 4.6ms preprocess, 6.7ms inference, 1.5ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 1 person, 5.8ms
Speed: 3.3ms preprocess, 5.8ms inference, 2.1ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 1 person, 7.