<a href="https://colab.research.google.com/github/qubvel/rt-pose/blob/main/notebooks/video_inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# !pip install -U pip uv
# !pip install -U git+https://github.com/qubvel/rt-pose.git
# !uv pip install --system "moviepy==2.*" supervision

In [None]:
import os
import torch
import moviepy
import argparse
import numpy as np
import supervision as sv
import huggingface_hub

from tqdm import tqdm
from rt_pose import PoseEstimationPipeline, PoseEstimationOutput

In [None]:
# Prefer the GPU, but fall back to the CPU so this cell still runs on a
# machine (or Colab runtime) without CUDA instead of raising.
device = "cuda" if torch.cuda.is_available() else "cpu"

if device == "cuda":
    capability = torch.cuda.get_device_capability(device)
    # bfloat16 is available from compute capability 8.0 (Ampere) onwards, so the
    # comparison has to be inclusive: `> (8, 0)` would wrongly exclude 8.0 GPUs.
    dtype = torch.bfloat16 if capability >= (8, 0) else torch.float16
else:
    # No CUDA device to query; use full precision on the CPU.
    capability = None
    dtype = torch.float32

print(f"Using device: {device}")
print(f"Using dtype: {dtype}")

In [None]:
# Load pose estimation pipeline.
# Use the `device` and `dtype` chosen in the configuration cell above rather
# than hard-coded values, so the settings printed there are the ones the
# models actually run with.
pipeline = PoseEstimationPipeline(
    object_detection_checkpoint="PekingU/rtdetr_r34vd",
    pose_estimation_checkpoint="usyd-community/vitpose-plus-small",
    device=device,
    dtype=dtype,
    compile=True,  # True to get more speedup
)

In [None]:
# As you can see from the logs below, model compilation is a fairly long step.
# Compilation happens just-in-time, which is why we run a warmup here: it
# passes a few batches through the models so that they get compiled.
pipeline.warmup()

## Defining some useful functions and loading demo clip

In [35]:
from IPython.display import HTML
from base64 import b64encode

def show_clip(path):
    """
    Build an inline HTML video player for the MP4 file at `path`.

    The whole file is read and embedded in the page as a base64 `data:` URI,
    so the returned `HTML` object is self-contained.
    """
    with open(path, "rb") as f:
        mp4 = f.read()
    data = "data:video/mp4;base64," + b64encode(mp4).decode()
    # The `src` value must be quoted: base64 padding uses "=", which is not
    # permitted inside an unquoted HTML attribute value.
    return HTML(f'<video width=400 controls><source src="{data}" type="video/mp4"></video>')

def visualize_output(image: np.ndarray, output: "PoseEstimationOutput", confidence: float = 0.3) -> np.ndarray:
    """
    Draw the estimated poses on a copy of `image`.

    Args:
        image: Frame to annotate. It is copied, never modified.
        output: Pipeline result; its `keypoints_xy`, `scores` and
            `person_boxes_xyxy` attributes are read.
        confidence: Keypoints with a score below this value are not drawn.

    Returns:
        The annotated copy of `image`. When `output` contains no person boxes
        an unannotated copy is returned.
    """
    person_boxes = output.person_boxes_xyxy
    if len(person_boxes) == 0:
        # No detections: the mean box height below would be NaN and
        # `int(NaN)` raises, so there is nothing to draw.
        return image.copy()

    # `.numpy()` can share memory with the source tensor (e.g. float32 on CPU);
    # copy so that the masking below never writes into `output` itself.
    keypoints_xy = output.keypoints_xy.float().cpu().numpy().copy()
    scores = output.scores.float().cpu().numpy().copy()

    # Supervision will not draw vertices with `0` score
    # and coordinates with `(0, 0)` value
    invisible_keypoints = scores < confidence
    scores[invisible_keypoints] = 0
    keypoints_xy[invisible_keypoints] = 0

    keypoints = sv.KeyPoints(xy=keypoints_xy, confidence=scores)

    # Scale marker radius and line thickness with the average person height,
    # with lower bounds so small figures remain visible.
    _, y_min, _, y_max = person_boxes.T
    height = int((y_max - y_min).mean().item())
    radius = max(height // 100, 4)
    thickness = max(height // 200, 3)
    edge_annotator = sv.EdgeAnnotator(color=sv.Color.YELLOW, thickness=thickness)
    vertex_annotator = sv.VertexAnnotator(color=sv.Color.ROBOFLOW, radius=radius)

    annotated_frame = image.copy()
    annotated_frame = edge_annotator.annotate(annotated_frame, keypoints)
    annotated_frame = vertex_annotator.annotate(annotated_frame, keypoints)

    return annotated_frame

In [30]:
# Fetch the demo clip from the Hugging Face Hub dataset repo and open it.
# To use your own video instead, set `path` to a local file.
path = huggingface_hub.hf_hub_download(
    repo_id="qubvel-hf/assets",
    filename="rt_pose_break_dance_v1.mp4",
    repo_type="dataset",
)
clip = moviepy.VideoFileClip(path)

{'video_found': True, 'audio_found': True, 'metadata': {'major_brand': 'isom', 'minor_version': '512', 'compatible_brands': 'isomiso2avc1mp41', 'comment': 'vid:v12044gd0000cqrtpdfog65mut9h0ba0', 'aigc_info': '{"aigc_label_type": 0}', 'encoder': 'Lavf58.76.100'}, 'inputs': [{'streams': [{'input_number': 0, 'stream_number': 0, 'stream_type': 'video', 'language': None, 'default': True, 'size': [576, 1024], 'bitrate': 1076, 'fps': 59.63, 'codec_name': 'h264', 'profile': '(High)', 'metadata': {'Metadata': '', 'handler_name': 'VideoHandler', 'vendor_id': '[0][0][0][0]'}}, {'input_number': 0, 'stream_number': 1, 'stream_type': 'audio', 'language': None, 'default': True, 'fps': 44100, 'bitrate': 32, 'metadata': {'Metadata': '', 'handler_name': 'SoundHandler', 'vendor_id': '[0][0][0][0]'}}], 'input_number': 0}], 'duration': 12.95, 'bitrate': 1118, 'start': 0.0, 'default_video_input_number': 0, 'default_video_stream_number': 0, 'video_codec_name': 'h264', 'video_profile': '(High)', 'video_size':

In [39]:
# Uncomment next line to show the original (not yet annotated) clip
# show_clip(path)

## Running pose estimation pipeline

In [36]:
annotated_frames = []

# Stream frames straight from the clip instead of materialising all of them in
# a list first: only the annotated copies need to be kept for the output video,
# and `total` already tells tqdm how many frames to expect.
for frame in tqdm(clip.iter_frames(), total=clip.n_frames):
    output = pipeline(frame)
    annotated_frame = visualize_output(frame, output, confidence=0.3)
    annotated_frames.append(annotated_frame)

{'video_found': True, 'audio_found': True, 'metadata': {'major_brand': 'isom', 'minor_version': '512', 'compatible_brands': 'isomiso2avc1mp41', 'comment': 'vid:v12044gd0000cqrtpdfog65mut9h0ba0', 'aigc_info': '{"aigc_label_type": 0}', 'encoder': 'Lavf58.76.100'}, 'inputs': [{'streams': [{'input_number': 0, 'stream_number': 0, 'stream_type': 'video', 'language': None, 'default': True, 'size': [576, 1024], 'bitrate': 1076, 'fps': 59.63, 'codec_name': 'h264', 'profile': '(High)', 'metadata': {'Metadata': '', 'handler_name': 'VideoHandler', 'vendor_id': '[0][0][0][0]'}}, {'input_number': 0, 'stream_number': 1, 'stream_type': 'audio', 'language': None, 'default': True, 'fps': 44100, 'bitrate': 32, 'metadata': {'Metadata': '', 'handler_name': 'SoundHandler', 'vendor_id': '[0][0][0][0]'}}], 'input_number': 0}], 'duration': 12.95, 'bitrate': 1118, 'start': 0.0, 'default_video_input_number': 0, 'default_video_stream_number': 0, 'video_codec_name': 'h264', 'video_profile': '(High)', 'video_size':

100%|██████████| 772/772 [00:48<00:00, 16.04it/s]


In [37]:
# Write the annotated frames out as a video file, keeping the source clip's
# frame rate and attaching its audio track
dst_path = "saved_video.mp4"

annotated_clip = moviepy.ImageSequenceClip(annotated_frames, fps=clip.fps)
annotated_clip.audio = clip.audio
annotated_clip.write_videofile(dst_path)

MoviePy - Building video saved_video.mp4.
MoviePy - Writing audio in saved_videoTEMP_MPY_wvf_snd.mp3




MoviePy - Done.
MoviePy - Writing video saved_video.mp4





MoviePy - Done !
MoviePy - video ready saved_video.mp4


In [40]:
# Uncomment next line to show annotated clip
# show_clip(dst_path)