In [8]:
# Install required libraries
!pip install ultralytics --quiet
!pip install opencv-python --quiet


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


# YOLO Pose Tracking - Code Documentation

## Overview

This Python script implements real-time human pose estimation and tracking using the YOLOv11 pose detection model. The system processes video files to detect and track human poses, generating annotated output videos with skeletal keypoints overlaid on detected persons.

## Features

- Human Pose Detection: Uses YOLOv11 pose model for accurate keypoint detection
- Multi-Person Tracking: Tracks multiple persons across video frames
- Performance Optimization: Includes frame skipping and downscaling for faster processing
- Configurable Parameters: Easily adjustable settings for different use cases
- Video Output: Saves annotated videos with pose overlays

## Dependencies

```python
from ultralytics import YOLO
import cv2
import time
```

### Required Libraries

- ultralytics: YOLOv11 implementation and model loading
- opencv-python (cv2): Video processing and computer vision operations
- time: Performance timing and benchmarking

## Configuration Parameters

- MODEL_PATH: 'yolo11x-pose.pt' - Path to the YOLOv11 pose model file
- VIDEO_PATH: "/home/sourav/code/.../sample-1.mp4" - Input video file path
- OUTPUT_PATH: "/home/sourav/code/.../sample-1_output.mp4" - Output video file path
- DOWNSCALE: 0.5 - Frame resize factor (0.5 = 50% of original size)
- FRAME_SKIP: 1 - Process every nth frame (1 = process all frames)
- DISPLAY_GUI: False - Enable/disable real-time display window
- CONF_THRESHOLD: 0.6 - Confidence threshold for pose detection

## Code Structure

### Step 1: Library Imports

```python
from ultralytics import YOLO
import cv2
import time
```

### Step 2: Configuration Setup

Defines all configurable parameters including model path, video paths, and optimization settings.

### Step 3: Model Loading

```python
model = YOLO(MODEL_PATH)
model.fuse()  # Optimize model for inference
```

- Loads the YOLOv11 pose model
- Applies model fusion for improved inference speed

### Step 4: Video Setup

```python
cap = cv2.VideoCapture(VIDEO_PATH, cv2.CAP_FFMPEG)
```

- Initializes video capture with FFMPEG backend
- Extracts video properties (width, height, FPS)
- Sets up video writer for output

### Step 5: Processing Loop

The main processing loop handles:

1. Frame Reading: Captures frames from input video
2. Frame Skipping: Processes every nth frame based on FRAME_SKIP
3. Downscaling: Resizes frames for faster inference
4. Pose Detection: Runs YOLO inference with tracking
5. Annotation: Draws pose keypoints and connections
6. Output Writing: Saves annotated frames to output video

### Step 6: Resource Cleanup

Properly releases video capture, writer, and destroys display windows.

## Performance Optimizations

### 1. Model Optimization

- Uses model.fuse() to optimize the model for inference
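- Uses the YOLOv11x-pose model, the largest and most accurate (but slowest) pose variant; point MODEL_PATH at a smaller variant (e.g. 'yolo11n-pose.pt' or 'yolo11s-pose.pt') when speed matters more than accuracy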

### 2. Frame Processing

- Downscaling: Reduces frame size by 50% (DOWNSCALE = 0.5)
- Frame Skipping: Processes every nth frame; the default (FRAME_SKIP = 1) processes every frame, so increase it to trade temporal resolution for speed
- Confidence Threshold: Higher threshold (0.6) reduces false positives

### 3. Resource Management

- Disables GUI display (DISPLAY_GUI = False) for headless processing
- Uses FFMPEG backend for efficient video reading

## Model Information

The script uses YOLOv11x-pose model which provides:

- 17 Keypoints: Standard COCO pose keypoints
- Multi-Person Detection: Simultaneous tracking of multiple individuals
- High Accuracy: Balanced performance for most use cases
- Real-time Capability: Optimized for video processing applications

In [None]:
# STEP 1: Import necessary libraries
from ultralytics import YOLO
import cv2
import time

# STEP 2: Define configuration
# All tunable settings live here; the cells/steps below only read them.
MODEL_PATH = 'yolo11x-pose.pt'  # YOLO11 "x" pose checkpoint: the largest variant (most accurate, slowest)
VIDEO_PATH = "/home/sourav/code/my-sdk/Boilerplates/CV/VideoAnalysis/sample_videos/sample-1.mp4"
OUTPUT_PATH = "/home/sourav/code/my-sdk/Boilerplates/CV/VideoAnalysis/sample_videos/sample-1_output.mp4"

DOWNSCALE = 0.5              # Resize factor applied before inference (0.5 = 50% of original size)
FRAME_SKIP = 1               # Process every nth frame (1 = process all frames)
DISPLAY_GUI = False          # Disable GUI for speed
CONF_THRESHOLD = 0.6         # Higher threshold -> fewer detections, faster inference

# Fail fast on settings that would otherwise only blow up deep inside the
# processing loop (e.g. `frame_count % 0` or a zero-sized resize).
if not isinstance(FRAME_SKIP, int) or FRAME_SKIP < 1:
    raise ValueError(f"FRAME_SKIP must be an integer >= 1, got {FRAME_SKIP!r}")
if not DOWNSCALE > 0:
    raise ValueError(f"DOWNSCALE must be > 0, got {DOWNSCALE!r}")
if not 0 <= CONF_THRESHOLD <= 1:
    raise ValueError(f"CONF_THRESHOLD must be within [0, 1], got {CONF_THRESHOLD!r}")


# STEP 4: Load YOLO11 pose model
# MODEL_PATH names the checkpoint to load; in the recorded run the file was
# downloaded automatically because it was not present locally.
print("Loading model...")
model = YOLO(MODEL_PATH)
model.fuse()  # Optimize model for inference
print("Model loaded.")

# STEP 5: Setup video reader and writer
# Open the input with the FFMPEG backend and fail loudly if it cannot be read.
cap = cv2.VideoCapture(VIDEO_PATH, cv2.CAP_FFMPEG)
if not cap.isOpened():
    raise IOError(f"Cannot open video file at {VIDEO_PATH}")

# The output keeps the input resolution; annotated frames are resized back
# to `frame_size` before they are written.
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)
if not fps > 0:
    # Some containers report 0 (or NaN) FPS; a writer created with that
    # value produces an unplayable file, so fall back to a sane default.
    fps = 30.0
frame_size = (width, height)

# Only every FRAME_SKIP-th frame is written, so the writer's frame rate is
# scaled down accordingly; otherwise the output would play back sped up.
# With FRAME_SKIP = 1 this is identical to the input frame rate.
output_fps = fps / max(FRAME_SKIP, 1)

fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(OUTPUT_PATH, fourcc, output_fps, frame_size)
if not out.isOpened():
    # Without this check a bad path/codec silently yields no output video.
    cap.release()
    raise IOError(f"Cannot open video writer at {OUTPUT_PATH}")

# STEP 6: Run pose estimation and tracking
# Reads the input frame by frame, runs tracking on a downscaled copy,
# and writes the annotated result at the original resolution.
frame_count = 0       # frames read from the input
processed_count = 0   # frames actually inferred on and written
print("Starting optimized pose tracking...")

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            # End of stream (or a read error): stop processing.
            break

        frame_count += 1

        # Skip frames to speed up processing
        if frame_count % FRAME_SKIP != 0:
            continue

        # Downscale frame
        resized_frame = cv2.resize(frame, (0, 0), fx=DOWNSCALE, fy=DOWNSCALE)

        # Inference with tracking; persist=True keeps tracker state between calls
        start_time = time.time()
        results = model.track(resized_frame, persist=True, verbose=False, conf=CONF_THRESHOLD)
        inference_time = time.time() - start_time

        # Annotate and resize back to original resolution
        annotated_frame = results[0].plot()
        annotated_frame = cv2.resize(annotated_frame, frame_size)

        # Display or write frame
        if DISPLAY_GUI:
            cv2.imshow("Pose Tracking", annotated_frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                print("Interrupted by user.")
                break

        out.write(annotated_frame)
        processed_count += 1

        if processed_count % 100 == 0:
            print(f"Processed {processed_count} frames. ⏱Last frame time: {inference_time:.3f}s")
finally:
    # STEP 7: Release resources
    # Runs even if inference raises or the kernel is interrupted, so the
    # output file is finalised and the capture handle is not leaked.
    cap.release()
    out.release()
    if DISPLAY_GUI:
        # Only tear down windows when one was created; GUI-less OpenCV
        # builds can raise on this call.
        cv2.destroyAllWindows()

print(f"\nCompleted. Total frames processed: {processed_count}/{frame_count}")
print(f"Output saved to: {OUTPUT_PATH}")

🔄 Loading model...
Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11x-pose.pt to 'yolo11x-pose.pt'...


100%|██████████| 113M/113M [00:43<00:00, 2.73MB/s] 


YOLO11x-pose summary (fused): 483 layers, 58,751,308 parameters, 0 gradients, 202.8 GFLOPs
✅ Model loaded.
🚀 Starting optimized pose tracking...
🖼️ Processed 100 frames. ⏱️ Last frame time: 1.190s
🖼️ Processed 200 frames. ⏱️ Last frame time: 1.168s

✅ Completed. Total frames processed: 225/225
📁 Output saved to: /home/sourav/code/my-sdk/Boilerplates/CV/VideoAnalysis/sample_videos/sample-1_output.mp4
