In [4]:
import cv2

# Load the video
video_path = 'output.mp4'  # Replace with your video path
cap = cv2.VideoCapture(video_path)

if not cap.isOpened():
    print("Error: Could not open video.")
else:
    # Get FPS
    fps = cap.get(cv2.CAP_PROP_FPS)
    print(f"FPS: {fps}")

    # Get total number of frames
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    print(f"Total Frames: {total_frames}")

    # Calculate video duration (in seconds)
    duration_seconds = total_frames / fps
    print(f"Duration: {duration_seconds} seconds")

    # Optionally, you can release the video capture object if needed
    cap.release()

FPS: 30.0
Total Frames: 480
Duration: 16.0 seconds


Mediapipe observation : 
- Ran on few videos and realised that Mediapipe Pose landmarkers are not able to detect a person if the head is not shown in the frame. Likely because of the top down approach of pose detection.
- Pose landmarkers are trained for closeup cases, so it is difficult to detect a person from far away in the frame. 

In [None]:
!pip install ultralytics
!pip install opencv-python

In [10]:
import os
from ultralytics import YOLO
import cv2 
import time # To estimate FPS if needed
from datetime import datetime
import numpy as np


MODEL_NAME = 'yolo11s-pose.pt'
VIDEO_SOURCE_DIR = 'video_data'
OUTPUT_DIR = 'pose_outputs'
RUN_NAME = datetime.now().strftime('%Y%m%d')
CONFIDENCE_THRESHOLD = 0.5

# Create output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)
output_run_dir = os.path.join(OUTPUT_DIR, RUN_NAME)
os.makedirs(output_run_dir, exist_ok=True) # Ensure the run-specific directory exists

skeleton = [
    (5, 7), (7, 9),    # Left arm
    (6, 8), (8, 10),   # Right arm
    (11, 13), (13, 15),# Left leg
    (12, 14), (14, 16),# Right leg
    (5, 6),            # Shoulders
    (11, 12),          # Hips
    (5, 11), (6, 12)   # Torso
]

# Check if the video file exists
if not os.path.exists(VIDEO_SOURCE_DIR):
    print(f"Error: Video files path not found at {VIDEO_SOURCE_DIR}")
else:
    print(f"Loading YOLO model: {MODEL_NAME}")
    model = YOLO(MODEL_NAME)
    print("Model loaded successfully.")

    for filename in os.listdir(VIDEO_SOURCE_DIR):
        VIDEO_SOURCE_PATH = os.path.join(VIDEO_SOURCE_DIR, filename)

        cap = cv2.VideoCapture(VIDEO_SOURCE_PATH)
        if not cap.isOpened():
            print(f"Error: Could not open video source at {VIDEO_SOURCE_PATH}")
        else:
            frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
            fps = cap.get(cv2.CAP_PROP_FPS)
            # If FPS read is 0 or invalid, use a default
            if fps <= 0:
                print("Warning: Could not read FPS from video source, using default 25 FPS.")
                fps = 25
            cap.release() 

            # --- Define Video Writer ---
            output_video_filename = os.path.splitext(os.path.basename(VIDEO_SOURCE_PATH))[0] + '_annotated.mp4'
            output_video_path = os.path.join(output_run_dir, output_video_filename)

            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            writer = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))

            if not writer.isOpened():
                print(f"Error: Could not open video writer for path {output_video_path}. Check codec support ('mp4v').")
            else:
                print(f"Starting pose estimation, output will be saved to: {output_video_path}")
                # --- Run Pose Estimation (WITHOUT internal save) ---
                results_generator = model(
                    source=VIDEO_SOURCE_PATH,
                    stream=True,
                    save=False, # <<< Important: Set save to False or defaults to avi
                    conf=CONFIDENCE_THRESHOLD,
                    task='pose',
                    show=False,
                    imgsz=(frame_height, frame_width) # Optional: Ensure model uses consistent size
                )

                # --- Process Results and Write Frames ---
                frame_count = 0
                start_time = time.time()
                print("Processing frames and writing MP4...")
                try:
                    for frame_number, result in enumerate(results_generator):
                        # Create a blank white frame
                        blank_frame = np.ones((frame_height, frame_width, 3), dtype=np.uint8) * 255

                        # Iterate through detected persons
                        for person in result.keypoints.xy:
                            keypoints = person.cpu().numpy()

                            # Draw keypoints
                            for x, y in keypoints:
                                cv2.circle(blank_frame, (int(x), int(y)), radius=3, color=(0, 0, 255), thickness=-1)

                            # Draw skeleton connections
                            for idx1, idx2 in skeleton:
                                if idx1 < len(keypoints) and idx2 < len(keypoints):
                                    x1, y1 = keypoints[idx1]
                                    x2, y2 = keypoints[idx2]
                                    cv2.line(blank_frame, (int(x1), int(y1)), (int(x2), int(y2)), color=(0, 255, 0), thickness=2)

                        # Resize frame if necessary
                        if blank_frame.shape[1] != frame_width or blank_frame.shape[0] != frame_height:
                            blank_frame = cv2.resize(blank_frame, (frame_width, frame_height))

                        # Write the frame to the video file
                        writer.write(blank_frame)

                        frame_count = frame_number + 1
                        if frame_count % 50 == 0:
                            elapsed_time = time.time() - start_time
                            current_fps = frame_count / elapsed_time if elapsed_time > 0 else 0
                            print(f"  Processed {frame_count} frames... (Current FPS: {current_fps:.2f})")

                except Exception as e:
                    print(f"\nAn error occurred during processing/writing: {e}")
                finally:
                    # Release the video writer
                    writer.release()
                    end_time = time.time()
                    print(f"\nFinished processing and writing video.")
                    print(f"Total frames processed: {frame_count}")
                    total_time = end_time - start_time
                    avg_fps = frame_count / total_time if total_time > 0 else 0
                    print(f"Total time: {total_time:.2f} seconds")
                    print(f"Average processing FPS: {avg_fps:.2f}")
                    print(f"✅ Output MP4 video saved successfully at: {output_video_path}")

Loading YOLO model: yolo11s-pose.pt
Model loaded successfully.
Starting pose estimation, output will be saved to: pose_outputs/20250413/output_annotated.mp4
Processing frames and writing MP4...

video 1/1 (frame 1/480) /home/shad/work/stick-figure-fight-generator/stick-figure-fight-generator/video_data/output.mp4: 480x864 1 person, 22.5ms
video 1/1 (frame 2/480) /home/shad/work/stick-figure-fight-generator/stick-figure-fight-generator/video_data/output.mp4: 480x864 1 person, 27.8ms
video 1/1 (frame 3/480) /home/shad/work/stick-figure-fight-generator/stick-figure-fight-generator/video_data/output.mp4: 480x864 1 person, 24.4ms
video 1/1 (frame 4/480) /home/shad/work/stick-figure-fight-generator/stick-figure-fight-generator/video_data/output.mp4: 480x864 1 person, 19.4ms
video 1/1 (frame 5/480) /home/shad/work/stick-figure-fight-generator/stick-figure-fight-generator/video_data/output.mp4: 480x864 1 person, 11.2ms
video 1/1 (frame 6/480) /home/shad/work/stick-figure-fight-generator/stick-