Note: this notebook requires the PyTorch and Ultralytics packages:
- https://github.com/ultralytics/ultralytics
- https://pytorch.org/get-started/locally/

In [None]:
import cv2
from ultralytics import YOLO

In [None]:
# --- Tracking state: rewritten by the mouse callback and the capture loop ---
is_person_selected = False   # True while a clicked box is being followed
current_tracked_box = None   # (x1, y1, x2, y2) of the followed box, or None
current_frame_boxes = []     # boxes detected in the most recent frame

# --- Settings ---
# Labels printed for the tracked box: index 0 when its centre is right of the
# frame centre, index 1 otherwise.
direction_definitions = ["right", "left"]
# Minimum IoU between the previous tracked box and a new detection for the
# detection to be accepted as the same target.
iou_threshold_value = 0.3

Model: YOLOv11 is used here; YOLOv8 might also be useful.

In [None]:
# torch is imported here only to probe for a usable CUDA device.
import torch

# Load the detector from the "yolo11n.pt" weights file.
model = YOLO("yolo11n.pt")

# Use the NVIDIA GPU when one is available and fall back to the CPU otherwise,
# so this cell runs unmodified on machines without CUDA.
model.to("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Open the capture device at index 0 and request a 1200x1000 frame size.
# NOTE(review): the return values of set() are not checked, so the device may
# silently keep a different resolution — confirm with camera_stream.get() if
# the exact frame size matters.
camera_stream = cv2.VideoCapture(0)
camera_stream.set(cv2.CAP_PROP_FRAME_WIDTH, 1200)
camera_stream.set(cv2.CAP_PROP_FRAME_HEIGHT, 1000)

In [None]:
def calculate_iou(bounding_box_a, bounding_box_b):
    """Return the intersection-over-union of two boxes.

    Each box is an indexable of four numbers laid out as (x1, y1, x2, y2).
    Boxes that do not overlap yield 0.0. A small epsilon (1e-6) is added to
    the denominator so that two zero-area boxes do not divide by zero; as a
    consequence identical boxes score marginally below 1.0.
    """

    def _area(box):
        # Width times height of a single (x1, y1, x2, y2) box.
        return (box[2] - box[0]) * (box[3] - box[1])

    # Corners of the rectangle shared by both boxes (may be inverted when the
    # boxes are disjoint, which the clamping below turns into zero overlap).
    overlap_x1 = max(bounding_box_a[0], bounding_box_b[0])
    overlap_y1 = max(bounding_box_a[1], bounding_box_b[1])
    overlap_x2 = min(bounding_box_a[2], bounding_box_b[2])
    overlap_y2 = min(bounding_box_a[3], bounding_box_b[3])

    overlap_area = max(0, overlap_x2 - overlap_x1) * max(0, overlap_y2 - overlap_y1)
    union_area = _area(bounding_box_a) + _area(bounding_box_b) - overlap_area
    return overlap_area / (union_area + 1e-6)

In [None]:
def handle_mouse_click(event, mouse_x, mouse_y, flags, param):
    """Mouse callback: choose which detected box to track.

    Only left-button-down events are handled. The first box in
    ``current_frame_boxes`` that contains the click position (edges included)
    becomes the tracked box. A click outside every box clears the selection.
    ``flags`` and ``param`` are accepted for the callback signature but unused.
    """
    global current_tracked_box, is_person_selected

    if event != cv2.EVENT_LBUTTONDOWN:
        return

    # First box (in detection order) whose extent contains the click.
    clicked_box = next(
        (
            (left, top, right, bottom)
            for (left, top, right, bottom) in current_frame_boxes
            if left <= mouse_x <= right and top <= mouse_y <= bottom
        ),
        None,
    )

    if clicked_box is None:
        current_tracked_box = None
        is_person_selected = False
        print("No person clicked. Tracking disabled.")
        return

    current_tracked_box = clicked_box
    is_person_selected = True
    print("Selected new person for tracking:", current_tracked_box)

In [None]:
# Main capture loop: detect boxes of class 0 in each webcam frame, follow the
# box the user clicked from frame to frame by IoU, draw all boxes, and print
# whether the followed box is left or right of the frame centre.
# Press "q" in the window to stop.
#
# The whole loop runs inside try/finally so the camera is released and the
# window closed even when the cell is interrupted (e.g. KeyboardInterrupt) or
# an exception is raised mid-frame.
try:
    cv2.namedWindow("Webcam")
    cv2.setMouseCallback("Webcam", handle_mouse_click)

    while True:
        ret, frame = camera_stream.read()
        if not ret:
            # No frame could be read from the capture device; stop.
            break

        # Collect this frame's detections as integer (x1, y1, x2, y2) tuples.
        current_frame_boxes = []
        predictions = model.predict(
            source=frame,
            stream=True,
            verbose=False
        )
        for prediction in predictions:
            for bounding_box in prediction.boxes:
                # Keep only class index 0, which this script treats as "person".
                if int(bounding_box.cls[0]) == 0:
                    x1, y1, x2, y2 = map(int, bounding_box.xyxy[0])
                    current_frame_boxes.append((x1, y1, x2, y2))

        # Re-associate the tracked box with the new detection that overlaps
        # it the most; give up when nothing reaches the IoU threshold.
        if is_person_selected and current_tracked_box:
            best_iou_score = 0.0
            best_box_match = None
            for box in current_frame_boxes:
                iou_value = calculate_iou(current_tracked_box, box)
                if iou_value > best_iou_score:
                    best_iou_score = iou_value
                    best_box_match = box
            if best_box_match and best_iou_score >= iou_threshold_value:
                current_tracked_box = best_box_match
            else:
                print("Lost track (no matching box over IOU threshold).")
                current_tracked_box = None
                is_person_selected = False

        # Draw every detection; the tracked one gets a distinct colour and a
        # thicker outline.
        for (box_x1, box_y1, box_x2, box_y2) in current_frame_boxes:
            rectangle_color = (255, 0, 255)
            rectangle_thickness = 2
            if is_person_selected and current_tracked_box and (box_x1, box_y1, box_x2, box_y2) == current_tracked_box:
                rectangle_color = (0, 255, 0)
                rectangle_thickness = 3
            cv2.rectangle(frame, (box_x1, box_y1), (box_x2, box_y2), rectangle_color, rectangle_thickness)

        # Report which half of the frame the tracked box's centre falls in.
        if is_person_selected and current_tracked_box:
            tracked_x1, tracked_y1, tracked_x2, tracked_y2 = current_tracked_box
            person_center_x = (tracked_x1 + tracked_x2) // 2
            frame_center_x = frame.shape[1] // 2
            if person_center_x > frame_center_x:
                direction_index = 0
            else:
                direction_index = 1
            print(direction_definitions[direction_index])

        cv2.imshow("Webcam", frame)
        key_pressed = cv2.waitKey(1)
        # Mask to the low byte before comparing with the "q" key code.
        if key_pressed & 0xFF == ord('q'):
            break
finally:
    camera_stream.release()
    cv2.destroyAllWindows()

In [None]:
# Manual cleanup: run this cell if the capture loop was stopped by a keyboard
# interrupt, to release the camera and close the leftover OpenCV window.
camera_stream.release()
cv2.destroyAllWindows()