In [None]:
from ultralytics import YOLO
from ultralytics.utils.plotting import Annotator
import cv2
import time

class YOLOv12PersonDetector:
    """Detect people in image frames with an Ultralytics YOLO model.

    Only class index 0 is requested from the model; detections are drawn on
    the frame with the label "Person".
    """

    def __init__(self, model_path="yolov12n.pt", device="cuda"):
        """Load the model weights and move the model to ``device``.

        Args:
            model_path: Weights file handed to ``YOLO``.
            device: Device string used for ``.to()`` and for every
                ``predict`` call.
        """
        self.device = device
        self.model = YOLO(model_path).to(device)
        self.classes_to_detect = [0]  # Only "person" class

    def process_frame(self, frame, conf_threshold=0.5):
        """Run detection on one frame and draw a labelled box per person.

        Args:
            frame: Image passed to ``model.predict`` and to ``Annotator``.
                NOTE(review): the annotator is built directly on ``frame``,
                so the input array may be drawn on in place -- confirm
                before reusing ``frame`` afterwards.
            conf_threshold: Minimum confidence for a detection to be kept.

        Returns:
            A 5-tuple ``(annotated_frame, person_count, confidence_sum,
            detection_count, inference_time)``. ``detection_count`` is the
            same value as ``person_count``; ``inference_time`` is the
            duration of the ``predict`` call in seconds.
        """
        # perf_counter is monotonic and high-resolution, so the measured
        # duration cannot be skewed by system clock adjustments.
        start_infer = time.perf_counter()
        results = self.model.predict(
            source=frame,
            conf=conf_threshold,
            device=self.device,
            classes=self.classes_to_detect,
            verbose=False,
        )
        inference_time = time.perf_counter() - start_infer  # in seconds

        boxes = results[0].boxes
        annotator = Annotator(frame, line_width=2)
        person_count = 0
        confidence_sum = 0.0

        if boxes is not None and boxes.xyxy is not None:
            for box in boxes:
                conf = float(box.conf[0])
                # Defensive re-check; predict() was already given the
                # same threshold.
                if conf < conf_threshold:
                    continue
                person_count += 1
                confidence_sum += conf
                annotator.box_label(
                    box.xyxy[0].tolist(),
                    label=f"Person {conf:.2f}",
                    color=(255, 0, 0),
                )

        # person_count is returned twice on purpose: callers unpack the
        # fourth element as the number of detections behind confidence_sum.
        return annotator.result(), person_count, confidence_sum, person_count, inference_time

def detect_person_in_video(input_path, output_path="output.mp4", device="cpu"):
    """Annotate every frame of a video with person detections and save it.

    Reads ``input_path`` frame by frame, runs ``YOLOv12PersonDetector`` on
    each frame, writes the annotated frames to ``output_path`` (``mp4v``
    codec) and prints per-frame and summary metrics.

    Args:
        input_path: Source passed to ``cv2.VideoCapture``.
        output_path: Destination file for the annotated video.
        device: Device string forwarded to the detector.

    Raises:
        OSError: If the input video or the output writer cannot be opened.
    """
    cap = cv2.VideoCapture(input_path)
    if not cap.isOpened():
        cap.release()
        raise OSError(f"Could not open input video: {input_path}")

    out_video = None
    frame_count = 0
    total_persons = 0
    total_confidence = 0.0
    total_detections = 0
    total_inference_time = 0.0

    try:
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        # Some sources report 0 or NaN; `not fps > 0` catches both and a
        # writer needs a positive rate to produce a playable file.
        if not fps > 0:
            fps = 30.0

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out_video = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
        if not out_video.isOpened():
            raise OSError(f"Could not open output video for writing: {output_path}")

        detector = YOLOv12PersonDetector(device=device)

        # Started after model loading so the overall FPS reflects frame
        # processing only.
        start_time = time.perf_counter()

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            processed_frame, person_count, conf_sum, det_count, infer_time = detector.process_frame(frame)
            out_video.write(processed_frame)

            frame_count += 1
            total_persons += person_count
            total_confidence += conf_sum
            total_detections += det_count
            total_inference_time += infer_time

            print(f"Frame {frame_count}: {person_count} person(s) detected | Inference time: {infer_time*1000:.2f} ms")
    finally:
        # Release handles even when a frame fails mid-way so the partially
        # written output file is finalized and the capture is freed.
        cap.release()
        if out_video is not None:
            out_video.release()
        cv2.destroyAllWindows()

    total_time = time.perf_counter() - start_time
    avg_fps = frame_count / total_time if total_time > 0 else 0
    avg_infer_time = (total_inference_time / frame_count) * 1000 if frame_count > 0 else 0  # in ms
    avg_confidence = total_confidence / total_detections if total_detections > 0 else 0.0

    # Final metrics
    print("\n=== Performance Metrics ===")
    print(f"Total frames processed: {frame_count}")
    print(f"Total persons detected: {total_persons}")
    print(f"Total processing time: {total_time:.2f} seconds")
    print(f"Average FPS (overall): {avg_fps:.2f}")
    print(f"Average inference time per frame: {avg_infer_time:.2f} ms")
    print(f"Average confidence score: {avg_confidence:.3f}")
    print(f"Output video saved to: {output_path}")

# Example usage: guarded so the video is only processed when this code is
# executed directly (script or notebook cell), not when it is imported.
if __name__ == "__main__":
    detect_person_in_video("test_video.mp4", output_path="output.mp4", device="cuda:0")


Frame 1: 4 person(s) detected | Inference time: 2504.53 ms
Frame 2: 5 person(s) detected | Inference time: 18.15 ms
Frame 3: 5 person(s) detected | Inference time: 17.11 ms
Frame 4: 5 person(s) detected | Inference time: 15.04 ms
Frame 5: 5 person(s) detected | Inference time: 16.46 ms
Frame 6: 5 person(s) detected | Inference time: 16.29 ms
Frame 7: 5 person(s) detected | Inference time: 15.18 ms
Frame 8: 5 person(s) detected | Inference time: 15.75 ms
Frame 9: 5 person(s) detected | Inference time: 15.01 ms
Frame 10: 5 person(s) detected | Inference time: 19.36 ms
Frame 11: 4 person(s) detected | Inference time: 20.67 ms
Frame 12: 4 person(s) detected | Inference time: 18.62 ms
Frame 13: 5 person(s) detected | Inference time: 18.05 ms
Frame 14: 5 person(s) detected | Inference time: 17.52 ms
Frame 15: 5 person(s) detected | Inference time: 17.39 ms
Frame 16: 5 person(s) detected | Inference time: 16.37 ms
Frame 17: 5 person(s) detected | Inference time: 14.88 ms
Frame 18: 5 person(s)