In [2]:
!python --version

Python 3.12.4


In [3]:
import ultralytics
import torch

In [4]:
import cv2
import time
from queue import Queue
from threading import Thread
from abc import ABC, abstractmethod
import json
from ultralytics import YOLO
from deep_sort.deep_sort import DeepSort
import numpy as np

In [5]:
# Report which accelerator is in use. Guarded so the cell still runs
# (instead of raising) on a machine without a CUDA device.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU only (CUDA not available)"

'NVIDIA GeForce GTX 1650'

In [6]:
# Human-readable labels indexed by class id (80 entries, ten per row below).
# The ids listed in TARGET_CLASSES index into this same ordering.
class_names = [
    # ids 0-9
    'person', 'bicycle', 'car', 'motorcycle', 'airplane',
    'bus', 'train', 'truck', 'boat', 'traffic light',
    # ids 10-19
    'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird',
    'cat', 'dog', 'horse', 'sheep', 'cow',
    # ids 20-29
    'elephant', 'bear', 'zebra', 'giraffe', 'backpack',
    'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
    # ids 30-39
    'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat',
    'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle',
    # ids 40-49
    'wine glass', 'cup', 'fork', 'knife', 'spoon',
    'bowl', 'banana', 'apple', 'sandwich', 'orange',
    # ids 50-59
    'broccoli', 'carrot', 'hot dog', 'pizza', 'donut',
    'cake', 'chair', 'couch', 'potted plant', 'bed',
    # ids 60-69
    'dining table', 'toilet', 'tv', 'laptop', 'mouse',
    'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
    # ids 70-79
    'toaster', 'sink', 'refrigerator', 'book', 'clock',
    'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush',
]

In [7]:
# File locations for this run (relative to the notebook's working directory).
# Source video that the processors read frame by frame.
INPUT_VIDEO_PATH = "data/videos/Rec16-1_trimmed.mp4"
# Destination of the annotated video.
OUTPUT_VIDEO_PATH = "output/videos/Rec16-1-yolo_trimmed_final.mp4"
# Text file read line by line; line i is used as the result key for frame i.
INPUT_TIMESTAMP_PATH = "output/timestamps/Rec16-1_trimmed.txt"
# Destination of the per-frame detection results (JSON).
JSON_OUTPUT_PATH = "output/json/Rec16-1_trimmed_yolo_final.json"

In [8]:
# CONSTANTS
# Weights file handed to ultralytics.YOLO(...) by the processors below.
YOLO_MODEL_PATH = "models/yolo/yolov8n.pt"
# Passed to the model as `conf=`; used as the default detection confidence cut-off.
YOLO_CONFIDENCE_THRESHOLD = 0.5
# Capacity of each inter-thread queue (frames buffered between pipeline stages).
MAX_QUEUE_SIZE = 30
# Class indices for person, car, truck, bus, and motorcycle in COCO dataset
TARGET_CLASSES = [0, 2, 7, 5, 3]

In [9]:
class VideoProcessor(ABC):
    """Threaded video pipeline: read -> process -> post-process -> write.

    Four threads are connected by three bounded queues:

    * ``frame_queue``  - raw frames from ``read_frames`` (as returned by
      ``cv2.VideoCapture.read``).
    * ``result_queue`` - whatever ``process_frames`` produces for ``post_process``.
    * ``output_queue`` - frames for ``write_video``. ``write_video`` converts each
      frame with ``COLOR_RGB2BGR`` before writing, so subclasses must enqueue
      RGB frames here.

    Every stage signals end-of-stream by putting ``None`` on its outgoing queue.
    Subclasses implement ``process_frames`` and ``post_process`` and must follow
    the same sentinel convention.
    """

    def __init__(self, input_path, output_path, timestamps_path, max_queue_size=MAX_QUEUE_SIZE):
        """Open the input video and set up the inter-thread queues.

        Args:
            input_path: Path of the video to read.
            output_path: Path of the video to write.
            timestamps_path: Text file with one entry per line (see ``load_timestamps``).
            max_queue_size: Capacity of each of the three queues.

        Raises:
            IOError: If the input video cannot be opened.
        """
        self.input_path = input_path
        self.output_path = output_path
        self.timestamps_path = timestamps_path
        self.max_queue_size = max_queue_size
        self.timestamps = self.load_timestamps()

        self.cap = cv2.VideoCapture(input_path)
        if not self.cap.isOpened():
            # Without this check the pipeline would run with width/height/fps of 0
            # and produce an unusable output file without any error.
            self.cap.release()
            raise IOError(f"Could not open input video: {input_path}")
        self.width = int(self.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        self.height = int(self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        # Keep the frame rate as a float: truncating e.g. 29.97 to 29 makes the
        # written video play back at the wrong speed / duration.
        self.fps = self.cap.get(cv2.CAP_PROP_FPS)

        self.frame_queue = Queue(maxsize=max_queue_size)
        self.result_queue = Queue(maxsize=max_queue_size)
        self.output_queue = Queue(maxsize=max_queue_size)

    def load_timestamps(self):
        """Return the lines of ``timestamps_path`` with surrounding whitespace stripped."""
        with open(self.timestamps_path, 'r') as f:
            return [line.strip() for line in f]

    def read_frames(self):
        """Read frames from the capture into ``frame_queue`` until the video ends.

        ``Queue.put`` blocks while the bounded queue is full, which provides the
        back-pressure; no polling/sleeping is needed. The ``None`` sentinel is
        sent even if reading fails so the next stage is not left waiting.
        """
        try:
            while True:
                ret, frame = self.cap.read()
                if not ret:
                    break
                self.frame_queue.put(frame)
        finally:
            self.frame_queue.put(None)

    @abstractmethod
    def process_frames(self):
        """Consume ``frame_queue`` and feed ``result_queue`` (ending with ``None``)."""
        pass

    @abstractmethod
    def post_process(self):
        """Consume ``result_queue`` and feed RGB frames to ``output_queue`` (ending with ``None``)."""
        pass

    def write_video(self):
        """Write frames from ``output_queue`` to ``output_path`` until ``None`` arrives.

        Frames are expected in RGB order and are converted to BGR for OpenCV.
        """
        import os

        # cv2.VideoWriter does not create missing directories; make sure the
        # target folder exists before opening the writer.
        out_dir = os.path.dirname(self.output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(self.output_path, fourcc, self.fps, (self.width, self.height))

        try:
            while True:
                frame = self.output_queue.get()
                if frame is None:
                    break
                out.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
        finally:
            # Always finalise the container, even if a frame fails to convert/write.
            out.release()

    def run(self):
        """Start the four pipeline threads, wait for them, then release resources."""
        stages = [
            Thread(target=self.read_frames),
            Thread(target=self.process_frames),
            Thread(target=self.post_process),
            Thread(target=self.write_video),
        ]

        for stage in stages:
            stage.start()
        for stage in stages:
            stage.join()

        self.cap.release()
        cv2.destroyAllWindows()


In [10]:
class YOLOProcessor(VideoProcessor):
    """Pipeline that runs YOLO detection on every frame.

    Writes an annotated video and a JSON file mapping each frame's timestamp
    (or ``frame_<index>`` when no timestamp line exists for that frame) to a
    list of ``{"class", "confidence", "bbox"}`` dictionaries.
    """

    def __init__(self, input_path, output_path, timestamps_path, json_output_path,
                 yolo_model_path=YOLO_MODEL_PATH, confidence_threshold=YOLO_CONFIDENCE_THRESHOLD,
                 target_classes=TARGET_CLASSES):
        """Load the YOLO model and set up the underlying video pipeline.

        Args:
            input_path: Path of the video to read.
            output_path: Path of the annotated video to write.
            timestamps_path: Text file with one timestamp per line.
            json_output_path: Where the detection results are dumped as JSON.
            yolo_model_path: Weights file passed to ``YOLO``.
            confidence_threshold: Passed to the model as ``conf=``.
            target_classes: Class ids passed to the model as ``classes=``.
        """
        super().__init__(input_path, output_path, timestamps_path)
        self.json_output_path = json_output_path
        self.yolo_model = YOLO(yolo_model_path)
        self.confidence_threshold = confidence_threshold
        # Copy so the shared module-level default list is never aliased/mutated.
        self.target_classes = list(target_classes) if target_classes is not None else None
        self.results_dict = {}

    def process_frames(self):
        """Run detection on each queued frame and record the results per timestamp.

        Puts ``(frame, results)`` on ``result_queue`` for every frame; the
        ``None`` sentinel is sent even if inference fails so the next stage
        is not left waiting.
        """
        frame_index = 0
        try:
            while True:
                frame = self.frame_queue.get()
                if frame is None:
                    break

                results = self.yolo_model(frame, classes=self.target_classes, conf=self.confidence_threshold)

                # Fall back to a synthetic key when the timestamp file is shorter than the video.
                timestamp = self.timestamps[frame_index] if frame_index < len(self.timestamps) else f"frame_{frame_index}"
                frame_results = []
                # Each row is unpacked as x1, y1, x2, y2, confidence, class id.
                for det in results[0].boxes.data:
                    x1, y1, x2, y2, conf, cls = det.tolist()
                    frame_results.append({
                        "class": self.yolo_model.names[int(cls)],
                        "confidence": conf,
                        "bbox": [x1, y1, x2, y2]
                    })

                self.results_dict[timestamp] = frame_results
                self.result_queue.put((frame, results))
                frame_index += 1
        finally:
            self.result_queue.put(None)

    def post_process(self):
        """Draw detections on each frame, queue it for writing, then dump the JSON."""
        try:
            while True:
                item = self.result_queue.get()
                if item is None:
                    break
                frame, results = item
                annotated_frame = results[0].plot()
                # plot() returns a BGR image, while write_video converts its input
                # with COLOR_RGB2BGR. Hand over RGB so the colours are not swapped
                # in the written video.
                self.output_queue.put(cv2.cvtColor(annotated_frame, cv2.COLOR_BGR2RGB))

            # results_dict is complete here: the producer sends its sentinel
            # only after its last write.
            with open(self.json_output_path, 'w') as f:
                json.dump(self.results_dict, f, indent=2)
        finally:
            self.output_queue.put(None)


In [11]:
class DeepSORTProcessor(VideoProcessor):
    """Pipeline that runs YOLO detection followed by DeepSORT tracking.

    Writes a video with one labelled box per track and a JSON file mapping each
    frame's timestamp (or ``frame_<index>``) to a list of
    ``{"track_id", "class", "bbox", "confidence"}`` dictionaries.
    """

    def __init__(self, input_path, output_path, timestamps_path, json_output_path,
                 yolo_model_path=YOLO_MODEL_PATH, deepsort_model_path="deep_sort/deep/checkpoint/ckpt.t7",
                 confidence_threshold=YOLO_CONFIDENCE_THRESHOLD, target_classes=TARGET_CLASSES, max_age=20):
        """Load the detector and tracker and set up the underlying video pipeline.

        Args:
            input_path: Path of the video to read.
            output_path: Path of the annotated video to write.
            timestamps_path: Text file with one timestamp per line.
            json_output_path: Where the tracking results are dumped as JSON.
            yolo_model_path: Weights file passed to ``YOLO``.
            deepsort_model_path: Checkpoint passed to ``DeepSort`` as ``model_path``.
            confidence_threshold: Passed to the detector as ``conf=``.
            target_classes: Class ids passed to the detector as ``classes=``.
            max_age: Passed through to ``DeepSort``.
        """
        super().__init__(input_path, output_path, timestamps_path)
        self.json_output_path = json_output_path
        self.yolo_model = YOLO(yolo_model_path)
        self.tracker = DeepSort(model_path=deepsort_model_path, max_age=max_age)
        self.confidence_threshold = confidence_threshold
        # Copy so a shared default list is never aliased/mutated across instances.
        self.target_classes = list(target_classes) if target_classes is not None else None
        self.results_dict = {}

    @staticmethod
    def _best_iou_match(track_box, det_boxes):
        """Return ``(index, iou)`` of the row of ``det_boxes`` overlapping ``track_box`` most.

        ``track_box`` is ``(x1, y1, x2, y2)``; ``det_boxes`` is an ``(N, 4)`` array
        in the same corner format with ``N >= 1``.
        """
        left = np.maximum(det_boxes[:, 0], track_box[0])
        top = np.maximum(det_boxes[:, 1], track_box[1])
        right = np.minimum(det_boxes[:, 2], track_box[2])
        bottom = np.minimum(det_boxes[:, 3], track_box[3])
        inter = np.clip(right - left, 0, None) * np.clip(bottom - top, 0, None)

        track_area = max(track_box[2] - track_box[0], 0.0) * max(track_box[3] - track_box[1], 0.0)
        det_area = (det_boxes[:, 2] - det_boxes[:, 0]) * (det_boxes[:, 3] - det_boxes[:, 1])
        union = track_area + det_area - inter
        # Guard against degenerate (zero-area) boxes.
        iou = inter / np.maximum(union, 1e-9)

        best = int(np.argmax(iou))
        return best, float(iou[best])

    def process_frames(self):
        """Run detection on each queued frame and pass it on for tracking.

        Puts ``(rgb_frame, results)`` on ``result_queue``; the ``None`` sentinel
        is sent even if inference fails so the next stage is not left waiting.
        """
        try:
            while True:
                frame = self.frame_queue.get()
                if frame is None:
                    break

                # Ultralytics treats numpy input as BGR (the OpenCV convention), so
                # the detector gets the frame exactly as read from the capture.
                results = self.yolo_model(frame, classes=self.target_classes, conf=self.confidence_threshold)

                # The tracker, the drawing code and write_video work on the RGB copy.
                og_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                self.result_queue.put((og_frame, results))
        finally:
            self.result_queue.put(None)

    def post_process(self):
        """Update the tracker per frame, draw/record every track, then dump the JSON.

        A track's class and confidence come from the detection in the current
        frame that overlaps its box the most (IoU). When a track overlaps no
        current detection, the last class/confidence matched to that track id
        is reused. Track ids are not detection indices, so they are never used
        to index the detection arrays.
        """
        palette = [(0, 0, 255), (255, 0, 0), (0, 255, 0)]
        # track_id -> (class name, confidence) from the most recent matched detection.
        track_labels = {}
        frame_index = 0
        try:
            while True:
                item = self.result_queue.get()
                if item is None:
                    break

                og_frame, results = item
                # Fall back to a synthetic key when the timestamp file is shorter than the video.
                timestamp = self.timestamps[frame_index] if frame_index < len(self.timestamps) else f"frame_{frame_index}"
                frame_results = []

                boxes = results[0].boxes
                # NOTE(review): as before, the tracker is only updated (and tracks
                # only drawn/recorded) on frames with at least one detection.
                if len(boxes) > 0:
                    cls = boxes.cls.tolist()
                    conf = boxes.conf.detach().cpu().numpy()
                    xyxy = boxes.xyxy.detach().cpu().numpy()
                    xywh = boxes.xywh.detach().cpu().numpy()

                    self.tracker.update(xywh, conf, og_frame)

                    # NOTE(review): this walks every track the tracker currently
                    # holds; confirm whether unconfirmed/stale tracks should be
                    # filtered out here.
                    for track in self.tracker.tracker.tracks:
                        track_id = track.track_id
                        # Plain floats keep the values JSON-serialisable regardless of dtype.
                        x1, y1, x2, y2 = (float(v) for v in track.to_tlbr())

                        det_index, overlap = self._best_iou_match((x1, y1, x2, y2), xyxy)
                        if overlap > 0:
                            track_labels[track_id] = (
                                self.yolo_model.names[int(cls[det_index])],
                                float(conf[det_index]),
                            )
                        class_name, confidence = track_labels.get(track_id, ("unknown", 0.0))

                        color = palette[track_id % 3]
                        cv2.rectangle(og_frame, (int(x1), int(y1)), (int(x2), int(y2)), color, 2)

                        cv2.putText(og_frame, f"{class_name}-{track_id}", (int(x1) + 10, int(y1) - 5),
                                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1, cv2.LINE_AA)

                        frame_results.append({
                            "track_id": track_id,
                            "class": class_name,
                            "bbox": [x1, y1, x2, y2],
                            "confidence": confidence
                        })

                self.results_dict[timestamp] = frame_results
                self.output_queue.put(og_frame)
                frame_index += 1

            with open(self.json_output_path, 'w') as f:
                json.dump(self.results_dict, f, indent=2)
        finally:
            self.output_queue.put(None)

In [12]:
# yolo example
INPUT_VIDEO_PATH = "data/videos/Rec16-1_trimmed.mp4"
OUTPUT_VIDEO_PATH = "output/videos/Rec16-1-yolo_trimmed_final_2.mp4"

In [13]:
# Build the detection pipeline with explicit keyword arguments, then run it to
# completion (blocks until the video and the JSON file have been written).
yolo_processor = YOLOProcessor(
    input_path=INPUT_VIDEO_PATH,
    output_path=OUTPUT_VIDEO_PATH,
    timestamps_path=INPUT_TIMESTAMP_PATH,
    json_output_path=JSON_OUTPUT_PATH,
)
yolo_processor.run()


0: 352x640 (no detections), 53.0ms
Speed: 9.5ms preprocess, 53.0ms inference, 51.1ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 1 person, 1 motorcycle, 7.3ms
Speed: 3.1ms preprocess, 7.3ms inference, 147.0ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 1 person, 7.9ms
Speed: 5.7ms preprocess, 7.9ms inference, 3.3ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 1 person, 8.0ms
Speed: 2.7ms preprocess, 8.0ms inference, 2.0ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 1 person, 7.9ms
Speed: 2.4ms preprocess, 7.9ms inference, 2.3ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 1 person, 6.2ms
Speed: 2.9ms preprocess, 6.2ms inference, 3.4ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 1 person, 7.8ms
Speed: 2.9ms preprocess, 7.8ms inference, 2.0ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 1 person, 8.8ms
Speed: 3.7ms preprocess, 8.8ms inference, 2.0ms postprocess per image at shape