In [1]:
from ultralytics import YOLO
from ultralytics.utils.plotting import Annotator
import cv2
import time

class YOLOv8PersonDetector:
    """Person detector that runs an Ultralytics YOLO model on one frame at a time.

    Predictions are restricted to class index 0, which this notebook treats as
    the "person" class.
    """

    def __init__(self, model_path="yolov8n.pt", device="cpu"):
        """Load the model and remember which device to run on.

        Args:
            model_path: Weights file handed to ``YOLO``.
            device: Device string used for ``.to()`` and for every ``predict`` call.
        """
        self.device = device
        self.model = YOLO(model_path).to(device)
        self.classes_to_detect = [0]  # Only "person" class

    def process_frame(self, frame, conf_threshold=0.5):
        """Detect persons in ``frame`` and draw a labelled box around each one.

        Args:
            frame: Image passed as ``source`` to ``predict`` and then to
                ``Annotator``. No copy is made here, so the annotator may draw
                directly on the caller's array; pass a copy if the original
                must stay untouched.
            conf_threshold: Minimum confidence for a detection to be counted
                and drawn.

        Returns:
            A 5-tuple ``(annotated_frame, person_count, confidence_sum,
            person_count, inference_time)``. The count appears twice because
            callers unpack the fourth element as the number of detections.
            ``inference_time`` is in seconds and covers only the ``predict`` call.
        """
        # perf_counter is a monotonic, high-resolution clock; time.time() can
        # jump with system clock adjustments and is coarse on some platforms.
        start_infer = time.perf_counter()
        results = self.model.predict(
            source=frame,
            conf=conf_threshold,
            device=self.device,
            classes=self.classes_to_detect,
            verbose=False,
        )
        inference_time = time.perf_counter() - start_infer  # in seconds

        boxes = results[0].boxes
        annotator = Annotator(frame, line_width=2)
        person_count = 0
        confidence_sum = 0.0

        if boxes is not None and boxes.xyxy is not None:
            # Convert all coordinates and scores in one go instead of slicing
            # the Boxes object once per detection.
            for xyxy, conf in zip(boxes.xyxy.tolist(), boxes.conf.tolist()):
                # predict() was already given conf_threshold; this is a cheap
                # second guard kept from the original implementation.
                if conf < conf_threshold:
                    continue
                person_count += 1
                confidence_sum += conf
                annotator.box_label(xyxy, label=f"Person {conf:.2f}", color=(255, 0, 0))

        return annotator.result(), person_count, confidence_sum, person_count, inference_time

def detect_person_in_video(input_path, output_path="output.mp4", device="cpu"):
    """Run person detection on every frame of a video and save the annotated result.

    Frames are read one at a time from ``input_path``, passed through
    ``YOLOv8PersonDetector.process_frame`` and written to ``output_path`` with
    the 'mp4v' codec. A per-frame line and a final metrics summary are printed.

    Args:
        input_path: Source handed to ``cv2.VideoCapture``.
        output_path: Destination file for the annotated video.
        device: Device string forwarded to the detector.

    Raises:
        IOError: If the input cannot be opened or the output writer cannot be
            created. Previously both failures went unnoticed and the summary
            reported zero frames and a "saved" output.
    """
    cap = cv2.VideoCapture(input_path)
    if not cap.isOpened():
        cap.release()
        raise IOError(f"Could not open input video: {input_path}")

    out_video = None

    frame_count = 0
    total_persons = 0
    total_confidence = 0.0
    total_detections = 0
    total_inference_time = 0.0

    # Started before the try block so it is always bound afterwards.
    start_time = time.perf_counter()

    try:
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not (fps > 0):
            # Fall back to a nominal rate when the source does not report a
            # positive FPS (also catches NaN).
            fps = 30.0

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out_video = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
        if not out_video.isOpened():
            raise IOError(f"Could not open output video for writing: {output_path}")

        detector = YOLOv8PersonDetector(device=device)

        # Reset the clock so model loading is excluded, as in the original.
        start_time = time.perf_counter()

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            processed_frame, person_count, conf_sum, det_count, infer_time = detector.process_frame(frame)
            out_video.write(processed_frame)

            frame_count += 1
            total_persons += person_count
            total_confidence += conf_sum
            total_detections += det_count
            total_inference_time += infer_time

            print(f"Frame {frame_count}: {person_count} person(s) detected | Inference time: {infer_time*1000:.2f} ms")
    finally:
        # Always release the capture and writer, even if inference raised, so
        # the output file is finalized and the input handle is not leaked.
        cap.release()
        if out_video is not None:
            out_video.release()
        cv2.destroyAllWindows()

    total_time = time.perf_counter() - start_time
    avg_fps = frame_count / total_time if total_time > 0 else 0
    avg_infer_time = (total_inference_time / frame_count) * 1000 if frame_count > 0 else 0  # in ms
    avg_confidence = total_confidence / total_detections if total_detections > 0 else 0.0

    # Final metrics
    print("\n=== Performance Metrics ===")
    print(f"Total frames processed: {frame_count}")
    print(f"Total persons detected: {total_persons}")
    print(f"Total processing time: {total_time:.2f} seconds")
    print(f"Average FPS (overall): {avg_fps:.2f}")
    print(f"Average inference time per frame: {avg_infer_time:.2f} ms")
    print(f"Average confidence score: {avg_confidence:.3f}")
    print(f"Output video saved to: {output_path}")

# Demo run: annotate the sample clip with device="cuda" and write the result to output.mp4.
detect_person_in_video(
    input_path="test_video.mp4",
    output_path="output.mp4",
    device="cuda",
)


Frame 1: 5 person(s) detected | Inference time: 1401.46 ms
Frame 2: 5 person(s) detected | Inference time: 13.34 ms
Frame 3: 5 person(s) detected | Inference time: 10.23 ms
Frame 4: 5 person(s) detected | Inference time: 9.97 ms
Frame 5: 5 person(s) detected | Inference time: 12.08 ms
Frame 6: 5 person(s) detected | Inference time: 11.12 ms
Frame 7: 5 person(s) detected | Inference time: 10.84 ms
Frame 8: 5 person(s) detected | Inference time: 11.97 ms
Frame 9: 5 person(s) detected | Inference time: 10.79 ms
Frame 10: 5 person(s) detected | Inference time: 10.84 ms
Frame 11: 4 person(s) detected | Inference time: 11.32 ms
Frame 12: 4 person(s) detected | Inference time: 12.35 ms
Frame 13: 5 person(s) detected | Inference time: 18.44 ms
Frame 14: 5 person(s) detected | Inference time: 14.37 ms
Frame 15: 5 person(s) detected | Inference time: 11.20 ms
Frame 16: 5 person(s) detected | Inference time: 10.12 ms
Frame 17: 5 person(s) detected | Inference time: 15.47 ms
Frame 18: 5 person(s) 

In [16]:
# Export yolo11n.pt to a TensorRT engine (format=engine) on GPU 0 with half precision (half=True).
# The log below shows the export going through an intermediate ONNX file ('yolo11n.onnx').
# NOTE(review): no batch=/dynamic= argument is given and the log reports an input shape of
# (1, 3, 640, 640), so the engine is presumably fixed at batch size 1 — confirm before batching frames.
!yolo export model=yolo11n.pt format=engine device=0 half=True

Ultralytics 8.3.168 🚀 Python-3.11.11 torch-2.6.0+cu124 CUDA:0 (NVIDIA GeForce RTX 3050 Laptop GPU, 3768MiB)
YOLO11n summary (fused): 100 layers, 2,616,248 parameters, 0 gradients, 6.5 GFLOPs

[34m[1mPyTorch:[0m starting from 'yolo11n.pt' with input shape (1, 3, 640, 640) BCHW and output shape(s) (1, 84, 8400) (5.4 MB)

[34m[1mONNX:[0m starting export with onnx 1.17.0 opset 19...
[34m[1mONNX:[0m slimming with onnxslim 0.1.61...
[34m[1mONNX:[0m export success ✅ 1.2s, saved as 'yolo11n.onnx' (10.2 MB)

[34m[1mTensorRT:[0m starting export with TensorRT 10.12.0.36...
[07/22/2025-21:14:32] [TRT] [I] [MemUsageChange] Init CUDA: CPU -2, GPU +0, now: CPU 774, GPU 992 (MiB)
[07/22/2025-21:14:35] [TRT] [I] [MemUsageChange] Init builder kernel library: CPU +1562, GPU -1, now: CPU 2487, GPU 990 (MiB)
[07/22/2025-21:14:35] [TRT] [I] ----------------------------------------------------------------
[07/22/2025-21:14:35] [TRT] [I] Input filename:   yolo11n.onnx
[07/22/2025-21:14:35] [TRT

In [19]:
from ultralytics import YOLO
from ultralytics.utils.plotting import Annotator
import cv2
import time

class YOLOv8PersonDetector:
    """Person detector that runs an Ultralytics YOLO model on batches of frames.

    Predictions are restricted to class index 0, which this notebook treats as
    the "person" class.
    """

    def __init__(self, model_path="yolov8n.engine", device="cuda"):
        """Load the model and remember which device to run on.

        Args:
            model_path: Model file handed to ``YOLO``.
            device: Device string forwarded to every ``predict`` call.
        """
        self.device = device
        self.model = YOLO(model_path)

    def process_batch(self, frames_batch, conf_threshold=0.5):
        """Processes a batch of frames for person detection.

        Args:
            frames_batch: Sequence of frames passed together as ``source`` to
                ``predict``. Each frame is copied before annotation, so the
                inputs are not drawn on.
            conf_threshold: Confidence threshold forwarded to ``predict``.

        Returns:
            A 5-tuple ``(processed_frames, total_persons, total_confidence,
            total_detections, inference_time)``. ``processed_frames`` holds one
            annotated copy per input frame, ``total_detections`` equals
            ``total_persons``, and ``inference_time`` is the time in seconds
            spent in the single ``predict`` call. An empty batch returns
            ``([], 0, 0.0, 0, 0.0)`` without calling the model.
        """
        # Nothing to do for an empty batch; avoid handing an empty source to predict().
        if not frames_batch:
            return [], 0, 0.0, 0, 0.0

        # perf_counter is a monotonic, high-resolution clock, which suits
        # millisecond-scale timing better than time.time().
        start_infer = time.perf_counter()

        # Predict on the entire batch of frames at once
        results_list = self.model.predict(
            source=frames_batch,
            conf=conf_threshold,
            device=self.device,
            classes=[0],  # list form, matching the single-frame detector
            verbose=False,
        )

        inference_time_for_batch = time.perf_counter() - start_infer

        processed_frames = []
        total_persons_in_batch = 0
        total_confidence_in_batch = 0.0

        # Pair each input frame with its result
        for original_frame, results in zip(frames_batch, results_list):
            annotator = Annotator(original_frame.copy(), line_width=2)
            boxes = results.boxes

            if boxes is not None and boxes.xyxy is not None:
                # Convert all coordinates and scores in one go instead of
                # slicing the Boxes object once per detection.
                for xyxy, conf in zip(boxes.xyxy.tolist(), boxes.conf.tolist()):
                    total_persons_in_batch += 1
                    total_confidence_in_batch += conf
                    annotator.box_label(xyxy, label=f"Person {conf:.2f}", color=(0, 255, 0))

            processed_frames.append(annotator.result())

        # Every detection is a person, so the detection count equals the person count.
        total_detections_in_batch = total_persons_in_batch

        return processed_frames, total_persons_in_batch, total_confidence_in_batch, total_detections_in_batch, inference_time_for_batch

def detect_person_in_video(model_path, input_path, output_path="output_optimized.mp4", device="cuda", batch_size=4):
    """Processes a video using batch inference for higher throughput.

    Frames are read from ``input_path`` in groups of up to ``batch_size``,
    passed through ``YOLOv8PersonDetector.process_batch`` and written to
    ``output_path`` with the 'mp4v' codec. A line per batch and a final metrics
    summary are printed.

    Args:
        model_path: Model file forwarded to the detector.
        input_path: Source handed to ``cv2.VideoCapture``.
        output_path: Destination file for the annotated video.
        device: Device string forwarded to the detector.
        batch_size: Maximum number of frames per ``process_batch`` call; the
            final batch may be smaller.

    Raises:
        ValueError: If ``batch_size`` is less than 1 (previously this silently
            processed no frames).
        IOError: If the input cannot be opened or the output writer cannot be
            created.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    cap = cv2.VideoCapture(input_path)
    if not cap.isOpened():
        cap.release()
        raise IOError(f"Could not open input video: {input_path}")

    out_video = None

    frame_count = 0
    total_persons = 0
    total_confidence = 0.0
    total_detections = 0
    total_inference_time = 0.0

    # Started before the try block so it is always bound afterwards.
    start_time = time.perf_counter()

    try:
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not (fps > 0):
            # Fall back to a nominal rate when the source does not report a
            # positive FPS (also catches NaN).
            fps = 30.0

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out_video = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
        if not out_video.isOpened():
            raise IOError(f"Could not open output video for writing: {output_path}")

        detector = YOLOv8PersonDetector(model_path=model_path, device=device)
        print(f"✅ Detector initialized with model: {model_path} on device: {device}")
        print(f"🚀 Running with batch size: {batch_size}")

        # Reset the clock so detector construction is excluded, as in the original.
        start_time = time.perf_counter()

        # Main loop to read and process frames in batches
        while True:
            frames_batch = []
            for _ in range(batch_size):
                ret, frame = cap.read()
                if not ret:
                    break
                frames_batch.append(frame)

            # If the batch is empty, we've reached the end of the video
            if not frames_batch:
                break

            # Process the entire batch
            processed_frames, persons, conf_sum, dets, infer_time = detector.process_batch(frames_batch)

            # Write processed frames to the output video
            for p_frame in processed_frames:
                out_video.write(p_frame)

            # Update statistics
            num_frames_in_batch = len(frames_batch)
            frame_count += num_frames_in_batch
            total_persons += persons
            total_confidence += conf_sum
            total_detections += dets
            total_inference_time += infer_time

            avg_time_per_frame = (infer_time / num_frames_in_batch) * 1000
            print(f"Processed Batch of {num_frames_in_batch} frames | Avg Inference per frame: {avg_time_per_frame:.2f} ms")
    finally:
        # Always release the capture and writer, even if inference raised, so
        # the output file is finalized and the input handle is not leaked.
        cap.release()
        if out_video is not None:
            out_video.release()
        cv2.destroyAllWindows()

    total_time = time.perf_counter() - start_time
    avg_fps = frame_count / total_time if total_time > 0 else 0
    avg_infer_time = (total_inference_time / frame_count) * 1000 if frame_count > 0 else 0
    avg_confidence = total_confidence / total_detections if total_detections > 0 else 0.0

    print("\n🚀 === Final Performance Metrics (Optimized) ===")
    print(f"Total frames processed: {frame_count}")
    print(f"Total persons detected: {total_persons}")
    print(f"Total processing time: {total_time:.2f} seconds")
    print(f"Average FPS (overall throughput): {avg_fps:.2f}")
    print(f"Average inference time per frame: {avg_infer_time:.2f} ms")
    print(f"Average confidence score: {avg_confidence:.3f}")
    print(f"Output video saved to: {output_path}")

# --- HOW TO RUN ---
if __name__ == '__main__':
    # 1. First, export the model with the 'half=True' flag (see the `yolo export` cell above).
    # 2. Point ENGINE_MODEL_PATH at the resulting FP16 engine file.
    
    ENGINE_MODEL_PATH = "yolo11n.engine"  # <-- Use your new FP16 .engine file
    INPUT_VIDEO_PATH = "test_video.mp4"
    OUTPUT_VIDEO_PATH = "output_fp16_v12.mp4"
    DEVICE = "cuda"
    
    # Number of frames handed to the detector per process_batch() call.
    # NOTE(review): the export log in this notebook reports an engine input shape of
    # (1, 3, 640, 640), so values above 1 presumably require re-exporting the engine
    # with a larger or dynamic batch dimension — confirm before increasing this.
    BATCH_SIZE = 1

    # Run the batched pipeline with the settings above; prints a line per batch
    # and a final metrics summary.
    detect_person_in_video(
        model_path=ENGINE_MODEL_PATH,
        input_path=INPUT_VIDEO_PATH,
        output_path=OUTPUT_VIDEO_PATH,
        device=DEVICE,
        batch_size=BATCH_SIZE
    )



✅ Detector initialized with model: yolo11n.engine on device: cuda
🚀 Running with batch size: 1
Loading yolo11n.engine for TensorRT inference...
[07/22/2025-21:18:53] [TRT] [I] Loaded engine size: 9 MiB
[07/22/2025-21:18:53] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +9, now: CPU 0, GPU 14 (MiB)
Processed Batch of 1 frames | Avg Inference per frame: 77.14 ms
Processed Batch of 1 frames | Avg Inference per frame: 10.73 ms
Processed Batch of 1 frames | Avg Inference per frame: 7.39 ms
Processed Batch of 1 frames | Avg Inference per frame: 8.22 ms
Processed Batch of 1 frames | Avg Inference per frame: 6.68 ms
Processed Batch of 1 frames | Avg Inference per frame: 6.50 ms
Processed Batch of 1 frames | Avg Inference per frame: 10.94 ms
Processed Batch of 1 frames | Avg Inference per frame: 8.41 ms
Processed Batch of 1 frames | Avg Inference per frame: 8.87 ms
Processed Batch of 1 frames | Avg Inference per frame: 6.82 ms
Processed Batch 