In [5]:
from ultralytics import YOLO
import cv2
import os
import time

def iou(boxA, boxB):
    """Return the intersection-over-union of two axis-aligned boxes.

    Both boxes are indexable as ``(x1, y1, x2, y2)``. A box whose computed
    area is zero yields ``0.0`` instead of being divided through.
    """
    area_a = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
    area_b = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])
    if area_a == 0 or area_b == 0:
        return 0.0

    # Width/height of the overlapping rectangle, clamped at zero when the
    # boxes do not overlap along that axis.
    overlap_w = max(0, min(boxA[2], boxB[2]) - max(boxA[0], boxB[0]))
    overlap_h = max(0, min(boxA[3], boxB[3]) - max(boxA[1], boxB[1]))
    intersection = overlap_w * overlap_h

    union = float(area_a + area_b - intersection)
    return intersection / union

# ------------------------------------------------------------
# Configuration
# ------------------------------------------------------------
model_path = r"C:\Users\joshu\Documents\EEE\AMNIS\Models\training\weights\best.pt"
video_path = r"C:\Users\joshu\Queen's University Belfast\Michael Loughran - 20241204_jp\20241204_105738_BR0.mp4"
save_dir = r"C:\Users\joshu\Documents\EEE\AMNIS\output_videos"
os.makedirs(save_dir, exist_ok=True)

# Detection thresholds
detection_conf_thresh = 0.1    # Confidence threshold for YOLO detections
detection_nms_thresh = 0.5     # NMS IOU threshold for YOLO detections

# Tracking thresholds
tracking_iou_thresh = 0.5      # IOU threshold for object tracking/matching
shelf_thresh = 100            # Vertical distance threshold for shelf grouping
lost_time_limit = 300         # Frames before declaring object as permanently lost

tracker_config = 'bytetrack.yaml'

# Open the video once to grab the first frame (for ROI selection) and the
# stream properties needed by the writer. The capture is released even if
# opening or reading fails.
cap = cv2.VideoCapture(video_path)
try:
    if not cap.isOpened():
        raise RuntimeError(f"Could not open video: {video_path}")
    ret, first_frame = cap.read()
    if not ret:
        raise RuntimeError("Could not read first frame")

    # Keep the frame rate as a float: truncating e.g. 29.97 to 29 makes the
    # written video play at a different speed than the source.
    fps = cap.get(cv2.CAP_PROP_FPS)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
finally:
    cap.release()

# Some containers report 0 (or NaN) for FPS; "not fps > 0" catches both.
if not fps > 0:
    fps = 30.0
# Fall back to the decoded frame's own size if the container reports none.
if width <= 0 or height <= 0:
    height, width = first_frame.shape[:2]

# Get ROI from user
print("Select ROI and press ENTER. Press C to cancel.")
roi = cv2.selectROI("Select ROI", first_frame, fromCenter=False, showCrosshair=True)
cv2.destroyWindow("Select ROI")
x_min, y_min, w, h = map(int, roi)
if w <= 0 or h <= 0:
    # A cancelled selection comes back as an empty rectangle, which would
    # reject every detection; use the whole frame instead.
    print("No ROI selected; using the full frame.")
    x_min, y_min, w, h = 0, 0, width, height
x_max, y_max = x_min + w, y_min + h

# Load YOLO model
model = YOLO(model_path)

# Initialize the tracking generator (stream=True yields one result per frame)
generator = model.track(
    source=video_path,
    imgsz=640,
    conf=detection_conf_thresh,
    iou=detection_nms_thresh,
    tracker=tracker_config,
    stream=True
)

# Prepare video writer
output_video_path = os.path.join(save_dir, "tracked_with_roi.mp4")
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out_writer = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))
if not out_writer.isOpened():
    # Fail now rather than silently producing an empty/missing output file.
    raise RuntimeError(f"Could not open video writer for: {output_video_path}")

# State management
active_objects = []      # List of (label, box) tuples for current objects
lost_objects = {}        # Dict of label: (box, last_seen_frame) for lost objects
next_label_num = 1      # Counter for generating unique object labels
objects_that_left = set()  # Set of labels for objects that have left the scene
frame_count = 0
box_color = (0, 255, 0)

# Main processing loop
def _group_into_shelves(boxes, max_gap):
    """Group (x1, y1, x2, y2) boxes into shelves and order each left-to-right.

    Boxes are visited by ascending bottom edge (y2). A box joins the first
    shelf whose mean bottom edge is within ``max_gap`` pixels of its own;
    otherwise it starts a new shelf. Returns a list of shelves, each a list
    of boxes sorted by x1. An empty input yields an empty list.
    """
    shelves = []
    for bx in sorted(boxes, key=lambda b: b[3]):
        for shelf in shelves:
            mean_bottom = sum(b[3] for b in shelf) / len(shelf)
            if abs(bx[3] - mean_bottom) <= max_gap:
                shelf.append(bx)
                break
        else:
            shelves.append([bx])
    for shelf in shelves:
        shelf.sort(key=lambda b: b[0])
    return shelves


def _best_match(candidates, box):
    """Return ``(label, iou)`` of the candidate box overlapping ``box`` most.

    ``candidates`` is an iterable of ``(label, box)`` pairs. Returns
    ``(None, 0)`` when no candidate has a positive IoU with ``box``.
    """
    best_label, best_overlap = None, 0
    for lbl, cand_box in candidates:
        overlap = iou(cand_box, box)
        if overlap > best_overlap:
            best_label, best_overlap = lbl, overlap
    return best_label, best_overlap


try:
    for results in generator:
        frame_count += 1
        frame = results.orig_img.copy()

        # Keep only detections that lie entirely inside the ROI
        boxes = []
        for det in results.boxes:
            x1, y1, x2, y2 = map(int, det.xyxy[0])
            if (x_min <= x1 <= x_max and x_min <= x2 <= x_max and
                    y_min <= y1 <= y_max and y_min <= y2 <= y_max):
                boxes.append((x1, y1, x2, y2))

        # Shelf by shelf, left to right; empty when nothing was detected
        shelves = _group_into_shelves(boxes, shelf_thresh)
        sorted_boxes = [b for shelf in shelves for b in shelf]

        # Track objects
        new_objects = []
        used_active_labels = set()

        for new_box in sorted_boxes:
            # Only consider active objects not already claimed this frame, so
            # two detections can never end up sharing one label.
            unclaimed = [(lbl, a_box) for (lbl, a_box) in active_objects
                         if lbl not in used_active_labels]
            label, overlap = _best_match(unclaimed, new_box)

            if label is None or overlap <= tracking_iou_thresh:
                # Try to re-identify a recently lost object
                recent_lost = [
                    (lbl, l_box)
                    for lbl, (l_box, l_frame) in lost_objects.items()
                    if frame_count - l_frame <= lost_time_limit
                ]
                label, overlap = _best_match(recent_lost, new_box)

                if label is not None and overlap > tracking_iou_thresh:
                    del lost_objects[label]
                    objects_that_left.discard(label)
                else:
                    # Nothing matched: this is a new object
                    label = next_label_num
                    next_label_num += 1

            new_objects.append((label, new_box))
            used_active_labels.add(label)

        # Previously active objects with no match this frame become lost.
        # With no detections at all, every active object is moved here.
        for (lbl, box) in active_objects:
            if lbl not in used_active_labels:
                lost_objects[lbl] = (box, frame_count)
                objects_that_left.add(lbl)

        # Drop lost objects that have been missing for too long
        lost_objects = {
            lbl: (box, frame)
            for lbl, (box, frame) in lost_objects.items()
            if frame_count - frame <= lost_time_limit
        }

        active_objects = new_objects

        # Draw visualizations on every frame (including frames without
        # detections) so the overlay does not flicker on and off.
        # Draw ROI
        cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (255, 0, 0), 2)

        # Draw bounding boxes and labels
        for (lbl, (x1, y1, x2, y2)) in active_objects:
            cv2.rectangle(frame, (x1, y1), (x2, y2), box_color, 2)
            cv2.putText(frame, f"{lbl}", (x1 + 5, y1 + 20),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.7, box_color, 2)

        # Draw shelf boundaries at the smallest bottom edge (y2) in each shelf
        for idx, shelf in enumerate(shelves, start=1):
            y = min(b[3] for b in shelf)
            cv2.line(frame, (0, y), (width, y), (0, 0, 255), 2)
            cv2.putText(frame, f"Shelf {idx}", (10, y - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)

        # Draw status information (labels sorted so the text is stable)
        if objects_that_left:
            text = f"Objects that have left: {', '.join(map(str, sorted(objects_that_left)))}"
            cv2.putText(frame, text, (100, 90),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)

        cv2.putText(frame, f"Total objects in scene: {len(active_objects)}",
                    (100, 45), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

        out_writer.write(frame)
finally:
    # Release the writer even if inference or drawing fails part-way, so the
    # frames written so far are flushed to disk.
    out_writer.release()

print(f"Processed video saved to {output_video_path}")

# Automatically play the processed video
def play_video(video_path):
    """Play a video file in an OpenCV window at roughly its native speed.

    Playback stops at the end of the file, when 'q' is pressed, or when the
    window is closed.

    Args:
        video_path: Path of the video file to play.

    Raises:
        RuntimeError: If the video cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            raise RuntimeError(f"Could not open video: {video_path}")

        # Some files report 0 (or NaN) FPS; fall back to 30 instead of
        # dividing by zero. "not fps > 0" catches both cases.
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps > 0:
            fps = 30.0
        # Milliseconds between frames; at least 1 because waitKey(0) would
        # block until a key is pressed.
        frame_delay = max(1, int(round(1000 / fps)))

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            cv2.imshow('Processed Video', frame)

            # Break loop if 'q' is pressed or window is closed
            key = cv2.waitKey(frame_delay) & 0xFF
            if key == ord('q') or cv2.getWindowProperty('Processed Video', cv2.WND_PROP_VISIBLE) < 1:
                break
    finally:
        # Always free the capture and close the window, even on error.
        cap.release()
        cv2.destroyAllWindows()

# Play back the file at output_video_path in a window; the call returns once
# playback ends, 'q' is pressed, or the window is closed.
print("Playing processed video (press 'q' to quit)...")
play_video(output_video_path)

Select ROI and press ENTER. Press C to cancel.

video 1/1 (frame 1/202) C:\Users\joshu\Queen's University Belfast\Michael Loughran - 20241204_jp\20241204_105738_BR0.mp4: 384x640 16 obs, 19.5ms
video 1/1 (frame 2/202) C:\Users\joshu\Queen's University Belfast\Michael Loughran - 20241204_jp\20241204_105738_BR0.mp4: 384x640 16 obs, 21.2ms
video 1/1 (frame 3/202) C:\Users\joshu\Queen's University Belfast\Michael Loughran - 20241204_jp\20241204_105738_BR0.mp4: 384x640 16 obs, 20.0ms
video 1/1 (frame 4/202) C:\Users\joshu\Queen's University Belfast\Michael Loughran - 20241204_jp\20241204_105738_BR0.mp4: 384x640 16 obs, 21.0ms
video 1/1 (frame 5/202) C:\Users\joshu\Queen's University Belfast\Michael Loughran - 20241204_jp\20241204_105738_BR0.mp4: 384x640 16 obs, 19.1ms
video 1/1 (frame 6/202) C:\Users\joshu\Queen's University Belfast\Michael Loughran - 20241204_jp\20241204_105738_BR0.mp4: 384x640 16 obs, 19.5ms
video 1/1 (frame 7/202) C:\Users\joshu\Queen's University Belfast\Michael Loughran