In [1]:
!python --version

Python 3.12.4


In [2]:
import ultralytics
import torch

In [3]:
import cv2
import time
import torch
from ultralytics import YOLO
import numpy as np
from deep_sort_realtime.deepsort_tracker import DeepSort
from queue import Queue
from threading import Thread
import json

In [4]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

device

device(type='cuda', index=0)

In [5]:
class_names = ['person', 'bicycle', 'car', 'motorcycle',
               'airplane', 'bus', 'train', 'truck', 'boat',
               'traffic light', 'fire hydrant', 'stop sign',
               'parking meter', 'bench', 'bird', 'cat', 'dog',
               'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
               'giraffe', 'backpack', 'umbrella', 'handbag', 'tie',
               'suitcase', 'frisbee', 'skis', 'snowboard',
               'sports ball', 'kite', 'baseball bat', 'baseball glove',
               'skateboard', 'surfboard', 'tennis racket', 'bottle',
               'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
               'banana', 'apple', 'sandwich', 'orange', 'broccoli',
               'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair',
               'couch', 'potted plant', 'bed', 'dining table', 'toilet',
         'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
               'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book',
               'clock', 'vase', 'scissors', 'teddy bear', 'hair drier',
               'toothbrush']

In [6]:
MAX_QUEUE_SIZE = 30
SLEEP_TIME = 0.01

In [7]:
YOLO_MODEL_PATH = "models/yolo/yolov8n.pt"
YOLO_CONFIDENCE_THRESHOLD = 0.5
TARGET_CLASSES = [0, 2, 7, 5, 3]  # person, car, truck, bus, motorcycle

In [8]:
DEEP_SORT_MODEL_PATH = "deep_sort/deep/checkpoint/ckpt.t7"
DEEP_SORT_MAX_AGE = 5

In [9]:
INPUT_VIDEO_PATH = "data/videos/Rec16-1_trimmed.mp4"
INPUT_TIMESTAMP_PATH = "output/timestamps/Rec16-1_trimmed.txt"
OUTPUT_PATH = "output/videos/Rec16-1_trimmed_deepsort4.mp4"
OUTPUT_JSON_PATH = "output/json/Rec16-1_trimmed_deepsort4.json"

# TIMESTAMPS

In [10]:
def load_timestamps(file_path):
    with open(file_path, 'r') as f:
        return [line.strip() for line in f]

# GAZE

In [11]:
def get_circle_BB(whole_frame):
    x_circle = 0.0
    y_circle = 0.0
    r = 0.0

    # Convert to grayscale.
    gray = cv2.cvtColor(whole_frame, cv2.COLOR_BGR2GRAY)

    # Blur using 3 * 3 kernel.
    gray_blurred = cv2.blur(gray, (3, 3))

    # Apply Hough transform on the blurred image.
    detected_circles = cv2.HoughCircles(gray_blurred,
                    cv2.HOUGH_GRADIENT, 1, 20, param1 = 50,
                param2 = 30, minRadius = 18, maxRadius = 19)

    # Draw circles that are detected.
    if detected_circles is not None:
        # Convert the circle parameters a, b and r to integers.
        detected_circles = np.uint16(np.around(detected_circles))
        first_circle = detected_circles[0, :][0]
        x_circle, y_circle, r = map(float, first_circle)  # Convert to float

    return x_circle, y_circle, r

# DEEPSORT

In [12]:
def read_frames(cap, frame_queue, max_queue_size):
    with torch.no_grad():  # Disable gradient calculation for inference
        while True:
            if frame_queue.qsize() < max_queue_size:
                ret, frame = cap.read()
                if not ret:
                    break
                frame_queue.put(frame)
            else:
                time.sleep(SLEEP_TIME)  # Sleep briefly to prevent busy-waiting
    frame_queue.put(None)  # Signal end of video

In [13]:
# def process_frames(frame_queue, result_queue, model):
#     while True:
#         frame = frame_queue.get()
#         if frame is None:
#             break
        
#         og_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
#         results = model(og_frame, device=0, classes=TARGET_CLASSES, conf=YOLO_CONFIDENCE_THRESHOLD)
        
#         # Get gaze coordinates
#         x_circle, y_circle, r = get_circle_BB(og_frame)
        
#         result_queue.put((og_frame, results, x_circle, y_circle, r))
#     result_queue.put(None)
def process_frames(frame_queue, result_queue, model):
    while True:
        frame = frame_queue.get()
        if frame is None:
            break
        
        try:
            results = model(frame, device=0, classes=TARGET_CLASSES, conf=YOLO_CONFIDENCE_THRESHOLD)
            result = results[0] if results else None
        except Exception as e:
            print(f"Error processing frame with YOLO: {e}")
            result = None
        
        # Get gaze coordinates
        try:
            x_circle, y_circle, r = get_circle_BB(frame)
        except Exception as e:
            print(f"Error getting gaze coordinates: {e}")
            x_circle, y_circle, r = 0, 0, 0
        
        result_queue.put((frame, result, x_circle, y_circle, r))
    result_queue.put(None)

In [14]:
def track_and_visualize(result_queue, output_queue, tracker, class_names, timestamps):
    frame_index = 0
    results_dict = {}
    
    while True:
        item = result_queue.get()
        if item is None:
            break
        
        frame, results, x_circle, y_circle, r = item
        
        timestamp = timestamps[frame_index] if frame_index < len(timestamps) else f"frame_{frame_index}"
        frame_results = []
        gaze_target = None
        
        # Process YOLO results into the format expected by DeepSort
        detections = []
        if results and len(results) > 0:
            for box in results[0].boxes:
                x1, y1, x2, y2 = box.xyxy[0].tolist()
                conf = box.conf.item()
                cls = int(box.cls.item())
                w = x2 - x1
                h = y2 - y1
                detections.append(([x1, y1, w, h], conf, cls))
        
        # Update tracks
        tracks = tracker.update_tracks(detections, frame=frame)
        
        for track in tracks:
            if not track.is_confirmed():
                continue
            
            track_id = track.track_id
            ltrb = track.to_ltrb()
            x1, y1, x2, y2 = map(int, ltrb)
            
            # Get class name
            class_id = track.get_det_class()
            class_name = class_names[class_id] if class_id < len(class_names) else "Unknown"
            
            # Draw bounding box
            color = [(0, 255, 0), (255, 0, 0), (0, 0, 255)][int(track_id) % 3]
            cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
            
            # Draw label
            label = f"{class_name}-{track_id}"
            cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
            
            # Store detection and tracking information
            detection_info = {
                "track_id": int(track_id),
                "class": class_name,
                "bbox": [float(x1), float(y1), float(x2), float(y2)],
                "confidence": float(track.get_det_conf()) if track.get_det_conf() else None
            }
            frame_results.append(detection_info)
            
            # Check if gaze is inside this bounding box
            if x1 <= x_circle <= x2 and y1 <= y_circle <= y2:
                gaze_target = detection_info
        
        # Draw gaze circle
        cv2.circle(frame, (int(x_circle), int(y_circle)), int(r), (0, 255, 255), 2)
        
        results_dict[timestamp] = {
            "detections": frame_results,
            "gaze": {
                "coordinates": [float(x_circle), float(y_circle)],
                "radius": float(r),
                "target": gaze_target
            }
        }
        
        output_queue.put(frame)
        frame_index += 1
    
    # Save results to JSON file
    with open(OUTPUT_JSON_PATH, 'w') as f:
        json.dump(results_dict, f, indent=2)
    
    output_queue.put(None)

In [15]:
def write_video(output_queue, out):
    while True:
        frame = output_queue.get()
        if frame is None:
            break
        out.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))

In [16]:
def process_video(input_path, output_path, timestamps_path, yolo_model_path=YOLO_MODEL_PATH):
    yolo_model = YOLO(yolo_model_path)
    tracker = DeepSort(max_age=DEEP_SORT_MAX_AGE, nn_budget=100,
                       nms_max_overlap=1.0,
                       max_cosine_distance=0.3,
                       max_iou_distance=0.7, n_init=3)

    timestamps = load_timestamps(timestamps_path)

    cap = cv2.VideoCapture(input_path)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(cap.get(cv2.CAP_PROP_FPS))

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    frame_queue = Queue(maxsize=MAX_QUEUE_SIZE)
    result_queue = Queue(maxsize=MAX_QUEUE_SIZE)
    output_queue = Queue(maxsize=MAX_QUEUE_SIZE)
    
    read_thread = Thread(target=read_frames, args=(cap, frame_queue, MAX_QUEUE_SIZE))
    process_thread = Thread(target=process_frames, args=(frame_queue, result_queue, yolo_model))
    track_thread = Thread(target=track_and_visualize, args=(result_queue, output_queue, tracker, class_names, timestamps))
    write_thread = Thread(target=write_video, args=(output_queue, out))
    
    read_thread.start()
    process_thread.start()
    track_thread.start()
    write_thread.start()

    read_thread.join()
    process_thread.join()
    track_thread.join()
    write_thread.join()

    cap.release()
    out.release()
    cv2.destroyAllWindows()

In [17]:
process_video(INPUT_VIDEO_PATH, OUTPUT_PATH, INPUT_TIMESTAMP_PATH)


0: 352x640 (no detections), 75.5ms
Speed: 5.1ms preprocess, 75.5ms inference, 41.7ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 1 person, 1 motorcycle, 22.3ms
Speed: 1.9ms preprocess, 22.3ms inference, 181.7ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 1 person, 11.5ms
Speed: 4.8ms preprocess, 11.5ms inference, 2.3ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 1 person, 14.7ms
Speed: 2.8ms preprocess, 14.7ms inference, 2.7ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 1 person, 13.1ms
Speed: 3.4ms preprocess, 13.1ms inference, 3.7ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 1 person, 14.1ms
Speed: 2.7ms preprocess, 14.1ms inference, 5.6ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 1 person, 13.0ms
Speed: 1.9ms preprocess, 13.0ms inference, 3.1ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 1 person, 13.4ms
Speed: 3.2ms preprocess, 13.4ms inference, 3.8ms postprocess per 