---
title: "Part I: Motion tracking with MediaPipe"
authors: Šárka Kadavá (adapted from EnvisionBOX)
---

In this script, we will demonstrate how to use MediaPipe for markerless motion tracking performed over a video. We will use the MediaPipe Holistic model. This model provides full-body tracking, including face, hand, and pose landmarks; however, we will focus mainly on the pose landmarks, which can be extracted as real-world coordinates (in meters) and not only in pixels (i.e., 2D image coordinates).

Unlike many other (better) algorithms, MediaPipe also provides a third (depth) dimension.

After loading the necessary components of the algorithm, we run a single loop that tracks motion in all videos in a folder. In the following script, we will process the results further to prepare them for analysis.


## Introduction

In [1]:
import os
import glob
import csv

curfolder = os.getcwd()

# This is where we store videos we want to track
projectdata = os.path.join(curfolder, "ToTrack")

# Here we store our tracked videos with skeleton
outputf_mask = os.path.join(curfolder, "Output_Videos")

# Here we store our time series data
outputf_ts = os.path.join(curfolder, "Output_TimeSeries")

# Create any folder that is missing. exist_ok=True makes this a single call
# that neither fails nor races when the folder already exists.
for folder in (projectdata, outputf_mask, outputf_ts):
    os.makedirs(folder, exist_ok=True)

# Collect the videos to track. glob returns matches in arbitrary order, so we
# sort them to process the videos in the same order on every run.
vfiles = sorted(glob.glob(os.path.join(projectdata, "*.avi")))  # Check the extension

print("\nThe following video(s) will be processed for masking: ")
print(vfiles)


The following video(s) will be processed for masking: 
['c:\\Users\\kadava\\Documents\\Github\\MotionTrackingPipeline_ProDiGe2025\\ToTrack\\0_1_pr_36_p0_snijden_gebaren_video_raw_cam2.avi', 'c:\\Users\\kadava\\Documents\\Github\\MotionTrackingPipeline_ProDiGe2025\\ToTrack\\0_1_trial_34_p1_springen_combinatie_video_raw_cam2.avi', 'c:\\Users\\kadava\\Documents\\Github\\MotionTrackingPipeline_ProDiGe2025\\ToTrack\\0_1_trial_43_p0_sterk_gebaren_video_raw_cam2.avi', 'c:\\Users\\kadava\\Documents\\Github\\MotionTrackingPipeline_ProDiGe2025\\ToTrack\\0_1_trial_53_p1_vangen_gebaren_video_raw_cam2.avi']


## Loading in Mediapipe

In [None]:
import cv2
import mediapipe as mp
import numpy as np

### MEDIAPIPE ###

# Short aliases for the MediaPipe solution modules used in this notebook:
# drawing helpers, and the pose, hands and holistic solutions.
mp_drawing, mp_pose, mp_hands, mp_holistic = (
    mp.solutions.drawing_utils,
    mp.solutions.pose,
    mp.solutions.hands,
    mp.solutions.holistic,
)

# The 33 body landmarks that are used by Mediapipe (Blazepose). The order of
# the names must match the order in which the model returns its landmarks,
# because the tracked values are written to the CSV columns by position.
markersbody = ['NOSE', 'LEFT_EYE_INNER', 'LEFT_EYE', 'LEFT_EYE_OUTER', 'RIGHT_EYE_INNER', 'RIGHT_EYE', 'RIGHT_EYE_OUTER',
          'LEFT_EAR', 'RIGHT_EAR', 'MOUTH_LEFT', 'MOUTH_RIGHT', 'LEFT_SHOULDER', 'RIGHT_SHOULDER', 'LEFT_ELBOW', 
          'RIGHT_ELBOW', 'LEFT_WRIST', 'RIGHT_WRIST', 'LEFT_PINKY', 'RIGHT_PINKY', 'LEFT_INDEX', 'RIGHT_INDEX',
          'LEFT_THUMB', 'RIGHT_THUMB', 'LEFT_HIP', 'RIGHT_HIP', 'LEFT_KNEE', 'RIGHT_KNEE', 'LEFT_ANKLE', 'RIGHT_ANKLE',
          'LEFT_HEEL', 'RIGHT_HEEL', 'LEFT_FOOT_INDEX', 'RIGHT_FOOT_INDEX']

# The 21 landmarks of the left hand followed by the 21 landmarks of the right
# hand (the tracking loop stores the left-hand values before the right-hand ones).
markershands = ['LEFT_WRIST', 'LEFT_THUMB_CMC', 'LEFT_THUMB_MCP', 'LEFT_THUMB_IP', 'LEFT_THUMB_TIP', 'LEFT_INDEX_FINGER_MCP',
              'LEFT_INDEX_FINGER_PIP', 'LEFT_INDEX_FINGER_DIP', 'LEFT_INDEX_FINGER_TIP', 'LEFT_MIDDLE_FINGER_MCP', 
               'LEFT_MIDDLE_FINGER_PIP', 'LEFT_MIDDLE_FINGER_DIP', 'LEFT_MIDDLE_FINGER_TIP', 'LEFT_RING_FINGER_MCP', 
               'LEFT_RING_FINGER_PIP', 'LEFT_RING_FINGER_DIP', 'LEFT_RING_FINGER_TIP', 'LEFT_PINKY_FINGER_MCP', 
               'LEFT_PINKY_FINGER_PIP', 'LEFT_PINKY_FINGER_DIP', 'LEFT_PINKY_FINGER_TIP',
              'RIGHT_WRIST', 'RIGHT_THUMB_CMC', 'RIGHT_THUMB_MCP', 'RIGHT_THUMB_IP', 'RIGHT_THUMB_TIP', 'RIGHT_INDEX_FINGER_MCP',
              'RIGHT_INDEX_FINGER_PIP', 'RIGHT_INDEX_FINGER_DIP', 'RIGHT_INDEX_FINGER_TIP', 'RIGHT_MIDDLE_FINGER_MCP', 
               'RIGHT_MIDDLE_FINGER_PIP', 'RIGHT_MIDDLE_FINGER_DIP', 'RIGHT_MIDDLE_FINGER_TIP', 'RIGHT_RING_FINGER_MCP', 
               'RIGHT_RING_FINGER_PIP', 'RIGHT_RING_FINGER_DIP', 'RIGHT_RING_FINGER_TIP', 'RIGHT_PINKY_FINGER_MCP', 
               'RIGHT_PINKY_FINGER_PIP', 'RIGHT_PINKY_FINGER_DIP', 'RIGHT_PINKY_FINGER_TIP']

print("Note that we have the following number of pose keypoints for markers body")
print(len(markersbody))

print("\nNote that we have the following number of pose keypoints for markers hands")
print(len(markershands))

# Set up the column names for the time series data: 'time' comes first,
# followed by one column per coordinate of every marker, e.g. 'X_NOSE'.
# Body markers additionally carry a visibility (reliability) score.
markerxyzbody = ['time']
markerxyzbody.extend(pos + "_" + mark
                     for mark in markersbody
                     for pos in ('X', 'Y', 'Z', 'visibility'))

markerxyzhands = ['time']
markerxyzhands.extend(pos + "_" + mark
                      for mark in markershands
                      for pos in ('X', 'Y', 'Z'))


### FUNCTIONS ###

def num_there(s):
    """Return True if at least one element of ``s`` is a digit, else False.

    ``s`` is iterated element by element (character by character for a
    string), and ``str.isdigit`` is applied to each element.
    """
    for char in s:
        if char.isdigit():
            return True
    return False

def makegoginto_str(gogobj):
    """Turn an object (e.g., a landmark result) into a list of text lines.

    The object is converted with ``str()``, square brackets are stripped from
    both ends of the text, and the text is split on newlines. The last piece
    of the split is always discarded (for text ending in a newline this is
    the empty remainder after that newline).
    """
    text = str(gogobj).strip("[]")
    lines = text.split("\n")
    lines.pop()  # split() always yields at least one piece, so this is safe
    return lines

def listpositions(newsamplemarks):
    """Extract the tracked values from a landmark result as a flat list.

    The result is first turned into text lines with ``makegoginto_str``. Every
    line that contains a digit is split at its first ':' and the part after
    the colon, with surrounding whitespace removed, is kept. Lines without a
    digit are skipped.

    Returns a list of strings in the order in which the lines appear; the
    list is empty when no line contains a digit.
    """
    return [
        line.split(':', 1)[1].strip()
        for line in makegoginto_str(newsamplemarks)
        if num_there(line)
    ]

Note that we have the following number of pose keypoints for markers body
33

Note that we have the following number of pose keypoints for markers hands
42


## Tracking - pose (in meters+pixels), hands (in pixels)

In [3]:
# LOOP
## We will now loop over all the videos that are present in the video folder.
## For every video we write a copy with the skeleton drawn on top and three CSV
## files: body landmarks in image coordinates, body landmarks in world
## coordinates, and hand landmarks. Every processed frame contributes exactly
## one row to each CSV file, so the three time series stay aligned frame by frame.

# Number of value columns (excluding 'time') in a body row and for a single hand
n_bodycols = len(markerxyzbody) - 1
n_handcols = (len(markerxyzhands) - 1) // 2

for vidindex, vidf in enumerate(vfiles):
    print("We will now process video:")
    print(vidf)
    print("This is video number " + str(vidindex) + " of " + str(len(vfiles)) + " videos in total")

    # RAW VIDEO
    ## Capture the video and check video settings
    videoname = os.path.basename(vidf)

    # This can be useful if you for some reason rerun the code but do not want to reprocess videos that are already done
    # if os.path.isfile(os.path.join(outputf_mask, videoname)):
    #     print("The video file " + videoname + " already exists in the output folder. We will skip this video.")
    #     continue

    ## Get video properties
    capture = cv2.VideoCapture(vidf)  # load the video capture
    if not capture.isOpened():
        print("The video file " + videoname + " could not be opened. We will skip this video.")
        capture.release()
        continue

    frameWidth = capture.get(cv2.CAP_PROP_FRAME_WIDTH)  # check frame width
    frameHeight = capture.get(cv2.CAP_PROP_FRAME_HEIGHT)  # check frame height
    samplerate = capture.get(cv2.CAP_PROP_FPS)  # fps = frames per second

    # The frame rate is needed for the time stamps (we divide by it below)
    if samplerate <= 0:
        print("The video file " + videoname + " reports no valid frame rate. We will skip this video.")
        capture.release()
        continue

    # Create an empty video file to project the pose tracking on
    fourcc = cv2.VideoWriter_fourcc(*'XVID')  # for different video formats you could use e.g., *'XVID', MP4V
    out = cv2.VideoWriter(os.path.join(outputf_mask, videoname), fourcc, fps=samplerate,
                          frameSize=(int(frameWidth), int(frameHeight)))

    # Initialize the time series
    time = 0  # time stamp of the current frame in milliseconds
    tsbody = [markerxyzbody]  # These are the time series objects starting with column names initialized above
    tshands = [markerxyzhands]
    tsbody_world = [markerxyzbody]  # For world landmarks (3D coordinates)

    # try/finally makes sure the video files and the preview window are
    # released even if the tracking stops with an error
    try:
        # Initialize Mediapipe Holistic
        with mp_holistic.Holistic(
            model_complexity=2,             # highest-quality pose model
            min_detection_confidence=0.5,   # minimum confidence for the detection to be considered valid
            min_tracking_confidence=0.5,    # minimum confidence for the tracking to be considered valid
            static_image_mode=False,
            enable_segmentation=True
        ) as holistic:

            ## Processing video frame-by-frame
            while capture.isOpened():
                ret, frame = capture.read()
                if not ret:                     # if there are no more frames, break the loop
                    break

                # Recolor image to RGB
                image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                image.flags.writeable = False

                # Make holistic detection
                results = holistic.process(image)

                # Recolor back to BGR
                image.flags.writeable = True
                image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

                # Pose landmarks
                if results.pose_landmarks:
                    mp_drawing.draw_landmarks(
                        image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)    # Overlay the pose landmarks on the image
                    samplebody = listpositions(results.pose_landmarks)                  # Convert the pose landmarks to a list of positions
                else:
                    samplebody = [np.nan] * n_bodycols                                  # No pose detected: fill the row with NaNs
                samplebody.insert(0, time)
                tsbody.append(samplebody)

                # Pose world landmarks (3D coordinates in meters)
                if results.pose_landmarks and results.pose_world_landmarks:
                    samplebody_world = listpositions(results.pose_world_landmarks)
                else:
                    samplebody_world = [np.nan] * n_bodycols
                samplebody_world.insert(0, time)
                tsbody_world.append(samplebody_world)

                # Hand landmarks
                if results.left_hand_landmarks or results.right_hand_landmarks:
                    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
                    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)

                # Process hands separately; listpositions returns an empty list
                # for a hand that was not detected, which we replace by empty
                # placeholders so that every row has the same number of columns
                sampleLH = listpositions(results.left_hand_landmarks) or [""] * n_handcols
                sampleRH = listpositions(results.right_hand_landmarks) or [""] * n_handcols

                # Combine hands
                samplehands = sampleLH + sampleRH
                samplehands.insert(0, time)
                tshands.append(samplehands)

                # Show and write output
                cv2.imshow('Mediapipe Feed', image)
                out.write(image)
                time += (1000 / samplerate)

                # Pressing ESC stops the tracking of the current video
                if cv2.waitKey(1) == 27:
                    break

    finally:
        # Once done, de-initialize all processes
        out.release()
        capture.release()
        cv2.destroyAllWindows()

    # Save CSV data for body (image coordinates)
    with open(os.path.join(outputf_ts, videoname + '_body_px.csv'), 'w', newline='') as filebody:
        write = csv.writer(filebody)
        write.writerows(tsbody)

    # Save world coordinates (in meters) to CSV for body
    with open(os.path.join(outputf_ts, videoname + '_body_world.csv'), 'w', newline='') as filebody_world:
        write = csv.writer(filebody_world)
        write.writerows(tsbody_world)

    # Save CSV data for hands
    with open(os.path.join(outputf_ts, videoname + '_hands_px.csv'), 'w', newline='') as filehands:
        write = csv.writer(filehands)
        write.writerows(tshands)

We will now process video:
c:\Users\kadava\Documents\Github\MotionTrackingPipeline_ProDiGe2025\ToTrack\0_1_pr_36_p0_snijden_gebaren_video_raw_cam2.avi
This is video number 0 of 4 videos in total
We will now process video:
c:\Users\kadava\Documents\Github\MotionTrackingPipeline_ProDiGe2025\ToTrack\0_1_trial_34_p1_springen_combinatie_video_raw_cam2.avi
This is video number 1 of 4 videos in total
We will now process video:
c:\Users\kadava\Documents\Github\MotionTrackingPipeline_ProDiGe2025\ToTrack\0_1_trial_43_p0_sterk_gebaren_video_raw_cam2.avi
This is video number 2 of 4 videos in total
We will now process video:
c:\Users\kadava\Documents\Github\MotionTrackingPipeline_ProDiGe2025\ToTrack\0_1_trial_53_p1_vangen_gebaren_video_raw_cam2.avi
This is video number 3 of 4 videos in total
