# Final Notebook - Fitness Activity Recognition - Cohort 22

# 0. Imports

In [None]:
# Disable Warnings
import warnings
warnings.filterwarnings("ignore")

In [None]:
# OpenCV to read and write the videos
import cv2

# For Mediapipe Blazepose to extract key-point coordinates
import mediapipe as mp

# Standard libraries
import numpy as np
import pandas as pd

# For file-handling
import os

In [None]:
# Mediapipe Blazepose utilities
# Short aliases for the MediaPipe solution modules.

# Drawing helpers; NOTE(review): not referenced by the functions visible in this part of the notebook.
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles
# Pose solution; `mp_pose.Pose` is the detector opened inside the `extract_arr` helpers.
mp_pose = mp.solutions.pose

# 1.1 Dataset 0

We use 3 different videos of subjects doing bicep curls for preliminary comparison:
- Video 1: Trainer Video
- Video 2: Trainee Video with correct form
- Video 3: Trainee Video with wrong form

In [None]:
# storing the video file paths

# Dataset 0: trainer clip, trainee clip with correct form, trainee clip with wrong form.
vid1_path, vid2_path, vid3_path = (
    os.path.join('data', 'sample_videos', clip_name)
    for clip_name in ('curls3-1.mp4', 'curls3-2.mp4', 'curls3-3.mp4')
)

# Show the resolved paths, one per line.
print(vid1_path, vid2_path, vid3_path, sep='\n')

data/sample_videos/curls3-1.mp4
data/sample_videos/curls3-2.mp4
data/sample_videos/curls3-3.mp4


# 1.2 Dataset 1

We use 3 different videos taken from 3 different angles of subjects doing bicep curls for testing of angle invariance algorithms:
- Video 1: Front View Video
- Video 2: Left View Video
- Video 3: Right View Video

In [None]:
# storing the video file paths

# Dataset 1: the same exercise filmed from the front, the left and the right.
vidf_path, vidl_path, vidr_path = (
    os.path.join('data', 'sample_videos', clip_name)
    for clip_name in ('curls_R-F-1.mp4', 'curls_R-L-1.mp4', 'curls_R-R-1.mp4')
)

# Show the resolved paths, one per line.
print(vidf_path, vidl_path, vidr_path, sep='\n')

data/sample_videos/curls_R-F-1.mp4
data/sample_videos/curls_R-L-1.mp4
data/sample_videos/curls_R-R-1.mp4


# 2. Preliminary Algorithms

## Algorithm 1.1

We first reshape the `(n, 33, 3)` dimensional array to `(33, n, 3)` dimensional array (here `n` is the no. of frames in the input video). Then we loop over each of the `33` key-points individually. Let’s say we are focussing on key-point `0`, which corresponds to the nose. We consider the `(n1, 3)` and `(n2, 3)` dimensional arrays corresponding to this key-point in the first and second video respectively and use DTW to obtain a score corresponding to each key-point. We obtain 33 such scores and consider the average score out of all.

In [None]:
## to create a helper function to extract (n, 33, 3) dimensional array to store the coordinates of each landmark in each frame

def extract_arr(input_video_path):
    """Extract the BlazePose key-point coordinates of every frame of a video.

    Parameters
    ----------
    input_video_path : str
        Path of the video file handed to ``cv2.VideoCapture``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n, 33, 3)`` with the ``(x, y, z)`` values of the 33
        landmarks for each of the ``n`` recorded frames. The array is empty
        when no frame could be read or no pose was ever detected.

    Notes
    -----
    If the detector finds no pose in a frame, the most recent successful
    detection is reused for that frame. Frames that come before the first
    successful detection are skipped, because there is nothing to reuse yet.
    """
    coordinates = []
    # Landmarks of the most recent frame in which a pose was detected.
    landmarks = None

    cap = cv2.VideoCapture(input_video_path)
    try:
        with mp_pose.Pose(min_detection_confidence=0.3, min_tracking_confidence=0.3) as pose:
            while cap.isOpened():
                # Reading frames from video
                success, frame = cap.read()
                if not success:
                    # A failed read ends the loop (this is what happens once the video is exhausted).
                    print("Ignoring empty camera frame.")
                    break

                # Recolor image to RGB and mark it read-only before detection
                image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                image.flags.writeable = False

                # Make detection
                results = pose.process(image)

                # Keep the previous landmarks when nothing was detected in this frame.
                if results.pose_landmarks is not None:
                    landmarks = results.pose_landmarks.landmark
                if landmarks is None:
                    continue

                # (x, y, z) of each of the 33 key-points of this frame
                coordinates.append(np.array(
                    [[landmarks[i].x, landmarks[i].y, landmarks[i].z] for i in range(33)],
                    dtype=float,
                ))
    finally:
        # Release the capture device even if detection raised.
        cap.release()

    return np.array(coordinates)


## reshape the (n, 33, 3) dimensional array to a (33, n, 3) dimensional array for extracting scores of each key-point

def my_reshape(input_arr):
    """Swap the frame and landmark axes of a coordinate array.

    Parameters
    ----------
    input_arr : array-like
        Array of shape ``(n_frames, n_landmarks, d)``.

    Returns
    -------
    numpy.ndarray
        New float array of shape ``(n_landmarks, n_frames, d)`` such that
        ``result[i][f]`` equals ``input_arr[f][i]``. The result never shares
        memory with ``input_arr``.
    """
    frames_first = np.asarray(input_arr, dtype=float)
    # swapaxes only returns a view; copy() gives an independent C-ordered array.
    return np.swapaxes(frames_first, 0, 1).copy()


from dtaidistance.dtw_ndim import distance, distance_fast

## to compare 2 videos of dim (33, n, 3) using DTW

def compare_vid(vid1_arr, vid2_arr):
    """Score two videos key-point by key-point with DTW.

    Parameters
    ----------
    vid1_arr, vid2_arr : numpy.ndarray
        Arrays of shape ``(n_landmarks, n_frames, 3)``; the two videos may
        have different frame counts. Neither array is modified.

    Returns
    -------
    numpy.ndarray
        One score per key-point, computed as ``100 - 100 * d`` where ``d`` is
        the DTW distance between the two norm-scaled trajectories.
    """
    n_landmarks = vid1_arr.shape[0]
    scores = np.zeros(n_landmarks)

    for i in range(n_landmarks):
        # Normalize into fresh arrays so the caller's data is left untouched
        # (writing back in place would also truncate integer-typed input).
        track1 = vid1_arr[i] / np.linalg.norm(vid1_arr[i])
        track2 = vid2_arr[i] / np.linalg.norm(vid2_arr[i])

        # we give a score out of 100
        scores[i] = 100 - (distance(track1, track2) * 100)

    return scores


## helper function for comparing videos by path: just including the pre-processing steps in the previous function

def combined_compare(vid1_path, vid2_path):
    """Compare two videos given by path and print the per-key-point scores.

    Both videos are run through ``extract_arr``, then ``my_reshape``, and the
    results are handed to ``compare_vid``. The score list and its mean are
    printed; nothing is returned.
    """
    # Extraction first (for both clips), then the axis swap, as separate passes.
    raw_first, raw_second = (extract_arr(path) for path in (vid1_path, vid2_path))
    by_keypoint = [my_reshape(raw) for raw in (raw_first, raw_second)]

    scores = compare_vid(*by_keypoint)

    print("Scores List:\n", scores)
    print("Average Score:", scores.mean())

We now use the videos in **Dataset 0** to test our algorithms.

In [None]:
combined_compare(vid1_path, vid1_path)

Ignoring empty camera frame.
Ignoring empty camera frame.
Scores List:
 [100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100.
 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100.
 100. 100. 100. 100. 100.]
Average Score: 100.0


In [None]:
combined_compare(vid1_path, vid2_path)

Ignoring empty camera frame.
Ignoring empty camera frame.
Scores List:
 [90.36712509 90.48350076 90.4214465  90.35077107 89.98707817 89.96469313
 89.93794297 90.28094662 88.2643645  90.54121127 89.79774844 91.68475828
 87.10831677 92.99999853 87.92232255 88.48792062 81.77806401 88.23772274
 80.51475224 88.83917342 81.23785002 87.93834118 81.52105547 92.34317024
 91.37831678 91.96892653 92.49469029 92.16970594 92.91024929 92.23795573
 92.89406741 92.15368687 92.04274271]
Average Score: 89.4321398829224


In [None]:
combined_compare(vid1_path, vid3_path)

Ignoring empty camera frame.
Ignoring empty camera frame.
Scores List:
 [77.64975877 77.51754387 77.55602626 77.60267612 77.69823648 77.72383293
 77.76272665 77.87427589 78.52889323 77.89025921 78.09730396 79.23676164
 79.29086474 84.07772858 80.88352592 76.7209857  69.06288637 75.14808023
 67.02120509 74.80446291 66.30122747 74.34038988 67.36503401 80.99545498
 81.17073534 79.05177224 77.54065719 77.68160555 76.73332758 77.80852284
 76.83242531 76.98610586 76.16014382]
Average Score: 76.7004677761122


## Algorithm 1.2

This is a modification of **Algorithm 1.1**. We use our domain expertise about the exercise, i.e. bicep curls, and argue that only the key-points corresponding to the upper body matter. Hence we limit our observations to the key-points in the upper body and consider only an `(n, k, 3)` dimensional array, where `k` is the no. of key-points considered (`k ≤ 33`). Everything else is exactly the same as before, and we end up considering the average score of these `k` key-points.

In [None]:
## to create a helper function to extract (n, len(landmark_list), 3) dimensional array to store the coordinates of each landmark in the landmark_list in each frame (we only consider the landmarks in this list `landmark_list`)

def extract_arr(input_video_path, landmark_list = list(range(33))):
    """Extract the coordinates of selected BlazePose key-points for every frame.

    Parameters
    ----------
    input_video_path : str
        Path of the video file handed to ``cv2.VideoCapture``.
    landmark_list : list of int, optional
        Indices of the landmarks to keep (all 33 by default). The list is
        read only; it is never sorted or otherwise modified.

    Returns
    -------
    coordinates : numpy.ndarray
        Array of shape ``(n, len(landmark_list), 3)`` with the ``(x, y, z)``
        values of the selected landmarks, in ascending index order.
    landmark_list : list of int
        A sorted copy of the requested indices, matching axis 1 of the array.

    Notes
    -----
    If the detector finds no pose in a frame, the most recent successful
    detection is reused. Frames before the first detection are skipped.
    """
    coordinates = []
    # Sorted once, on a copy: the caller's list (and the shared default) stay as they were.
    selected = sorted(landmark_list)
    # Landmarks of the most recent frame in which a pose was detected.
    landmarks = None

    cap = cv2.VideoCapture(input_video_path)
    try:
        with mp_pose.Pose(min_detection_confidence=0.3, min_tracking_confidence=0.3) as pose:
            while cap.isOpened():
                # Reading frames from video
                success, frame = cap.read()
                if not success:
                    # A failed read ends the loop (this is what happens once the video is exhausted).
                    print("Ignoring empty camera frame.")
                    break

                # Recolor image to RGB and mark it read-only before detection
                image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                image.flags.writeable = False

                # Make detection
                results = pose.process(image)

                # Keep the previous landmarks when nothing was detected in this frame.
                if results.pose_landmarks is not None:
                    landmarks = results.pose_landmarks.landmark
                if landmarks is None:
                    continue

                # (x, y, z) of each selected key-point; reshape keeps the
                # (k, 3) layout even for an empty selection.
                coordinates.append(np.array(
                    [[landmarks[idx].x, landmarks[idx].y, landmarks[idx].z] for idx in selected],
                    dtype=float,
                ).reshape(len(selected), 3))
    finally:
        # Release the capture device even if detection raised.
        cap.release()

    return np.array(coordinates), selected


## reshape the (n, len(landmark_list), 3) dimensional array to a (len(landmark_list), n, 3) dimensional array for extracting scores of each key-point

def my_reshape(input_arr):
    """Reorder a per-frame coordinate array so that the landmark axis comes first.

    Parameters
    ----------
    input_arr : array-like
        Array of shape ``(n_frames, n_landmarks, d)``.

    Returns
    -------
    numpy.ndarray
        Independent float array of shape ``(n_landmarks, n_frames, d)`` with
        ``result[i][f] == input_arr[f][i]``.
    """
    per_frame = np.asarray(input_arr, dtype=float)
    # transpose() alone yields a view; copy() detaches the result from the input.
    return np.transpose(per_frame, (1, 0, 2)).copy()


from dtaidistance.dtw_ndim import distance, distance_fast

## to compare 2 videos of dim (len(landmark_list), n, 3) using DTW

def compare_vid(vid1_arr, vid2_arr):
    """Score two videos key-point by key-point with DTW.

    Parameters
    ----------
    vid1_arr, vid2_arr : numpy.ndarray
        Arrays of shape ``(k, n_frames, 3)`` where ``k`` is the number of
        selected key-points. The inputs are left unmodified.

    Returns
    -------
    numpy.ndarray
        ``k`` scores, each ``100 - 100 * d`` with ``d`` the DTW distance
        between the two norm-scaled trajectories of that key-point.
    """
    n_landmarks = vid1_arr.shape[0]
    scores = np.zeros(n_landmarks)

    for i in range(n_landmarks):
        # Scale each trajectory to unit norm in a new array; assigning the
        # result back into the argument would alter the caller's data.
        unit1 = vid1_arr[i] / np.linalg.norm(vid1_arr[i])
        unit2 = vid2_arr[i] / np.linalg.norm(vid2_arr[i])

        # calculate distance and turn it into a score out of 100
        d = distance(unit1, unit2)
        scores[i] = 100 - (d * 100)

    return scores


## helper function for comparing videos by path: just including the pre-processing steps in the previous function

def combined_compare(vid1_path, vid2_path, landmark_list = list(range(33))):
    """Compare two videos on a subset of key-points and print the scores.

    Parameters
    ----------
    vid1_path, vid2_path : str
        Paths of the two videos to compare.
    landmark_list : list of int, optional
        Indices of the key-points to compare (all 33 by default). The list is
        not modified.

    The per-key-point score list and its mean are printed; nothing is returned.
    """
    # Work on a private sorted copy so that a helper sorting its argument in
    # place cannot reorder the caller's list or the shared default.
    selected = sorted(landmark_list)

    vid1_arr, _ = extract_arr(vid1_path, selected)
    vid2_arr, _ = extract_arr(vid2_path, selected)

    vid1_arr_new = my_reshape(vid1_arr)
    vid2_arr_new = my_reshape(vid2_arr)

    scores = compare_vid(vid1_arr_new, vid2_arr_new)

    print("Scores List:\n", scores)
    print("Average Score:", scores.mean())

We now use the videos in **Dataset 0** to test our algorithms.

In [None]:
# Landmark indices 0..24 inclusive (25 key-points): we restrict the comparison to
# these to consider only the upper body.
landmark_list = list(range(25))

In [None]:
combined_compare(vid1_path, vid1_path, landmark_list)

Ignoring empty camera frame.
Ignoring empty camera frame.
Scores List:
 [100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100.
 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100.]
Average Score: 100.0


In [None]:
combined_compare(vid1_path, vid2_path, landmark_list)

Ignoring empty camera frame.
Ignoring empty camera frame.
Scores List:
 [90.36712509 90.48350076 90.4214465  90.35077107 89.98707817 89.96469313
 89.93794297 90.28094662 88.2643645  90.54121127 89.79774844 91.68475828
 87.10831677 92.99999853 87.92232255 88.48792062 81.77806401 88.23772274
 80.51475224 88.83917342 81.23785002 87.93834118 81.52105547 92.34317024
 91.37831678]
Average Score: 88.4955436546632


In [None]:
combined_compare(vid1_path, vid3_path, landmark_list)

Ignoring empty camera frame.
Ignoring empty camera frame.
Scores List:
 [77.64975877 77.51754387 77.55602626 77.60267612 77.69823648 77.72383293
 77.76272665 77.87427589 78.52889323 77.89025921 78.09730396 79.23676164
 79.29086474 84.07772858 80.88352592 76.7209857  69.06288637 75.14808023
 67.02120509 74.80446291 66.30122747 74.34038988 67.36503401 80.99545498
 81.17073534]
Average Score: 76.49283504900352


## Algorithm 2.1

Instead of reshaping the array as in **Algorithm 1.1**, **1.2**, we now consider the whole `(n, (33,3))` array of each video for comparison. So, each frame contains information of all the key-points in a `(33, 3)` dimensional array. We use DTW on this and consider the score.

In [None]:
## to create a helper function to extract (n, (33, 3)) dimensional array to store the coordinates of each landmark in each frame

def extract_arr(input_video_path):
    """Extract the BlazePose key-point coordinates of every frame of a video.

    Parameters
    ----------
    input_video_path : str
        Path of the video file handed to ``cv2.VideoCapture``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n, 33, 3)``: one ``(33, 3)`` block of ``(x, y, z)``
        values per recorded frame. Empty when no frame could be read or no
        pose was ever detected.

    Notes
    -----
    A frame without a detection reuses the landmarks of the last frame that
    had one; frames preceding the first detection are skipped.
    """
    coordinates = []
    # Most recent successful detection (None until the first one).
    last_landmarks = None

    cap = cv2.VideoCapture(input_video_path)
    try:
        with mp_pose.Pose(min_detection_confidence=0.3, min_tracking_confidence=0.3) as pose:
            while cap.isOpened():
                # Reading frames from video
                success, frame = cap.read()
                if not success:
                    # A failed read ends the loop (this is what happens once the video is exhausted).
                    print("Ignoring empty camera frame.")
                    break

                # Recolor image to RGB and mark it read-only before detection
                image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                image.flags.writeable = False

                # Make detection
                results = pose.process(image)

                # No detection in this frame: fall back to the previous landmarks.
                if results.pose_landmarks is not None:
                    last_landmarks = results.pose_landmarks.landmark
                if last_landmarks is None:
                    continue

                # (x, y, z) of each of the 33 key-points of this frame
                coordinates.append(np.array(
                    [[last_landmarks[i].x, last_landmarks[i].y, last_landmarks[i].z] for i in range(33)],
                    dtype=float,
                ))
    finally:
        # Release the capture device even if detection raised.
        cap.release()

    return np.array(coordinates)


from dtaidistance.dtw_ndim import distance, distance_fast

## to compare 2 videos of dim (n, (33, 3)) using DTW

def compare_vid(vid1_arr, vid2_arr):
    """Score two whole videos against each other with a single DTW run.

    Parameters
    ----------
    vid1_arr, vid2_arr : numpy.ndarray
        Arrays of shape ``(n_frames, 33, 3)``; each frame is compared as one
        ``(33, 3)`` block. The inputs are not modified.

    Returns
    -------
    float
        ``100 - 100 * d`` where ``d`` is the DTW distance between the two
        arrays after each has been divided by its overall norm.
    """
    # normalize the coordinates (new arrays; the arguments keep their values)
    vid1_unit = vid1_arr / np.linalg.norm(vid1_arr)
    vid2_unit = vid2_arr / np.linalg.norm(vid2_arr)

    d = distance(vid1_unit, vid2_unit)
    d_score = 100 - (d * 100)

    return d_score


## comparing videos by path: just including the pre-processing steps in the previous function

def combined_compare(vid1_path, vid2_path):
    """Compare two videos given by path and print the resulting single score.

    Each video goes through ``extract_arr``; the two arrays are then passed
    to ``compare_vid``. Nothing is returned.
    """
    # Extract the first clip, then the second, in that order.
    reference, candidate = (extract_arr(path) for path in (vid1_path, vid2_path))

    print("Final Score:", compare_vid(reference, candidate))

We now use the videos in **Dataset 0** to test our algorithms.

In [None]:
combined_compare(vid1_path, vid1_path)

Ignoring empty camera frame.
Ignoring empty camera frame.
Final Score: 100.0


In [None]:
combined_compare(vid1_path, vid2_path)

Ignoring empty camera frame.
Ignoring empty camera frame.
Final Score: 85.61251023753023


In [None]:
combined_compare(vid1_path, vid3_path)

Ignoring empty camera frame.
Ignoring empty camera frame.
Final Score: 72.13579475938042


## Algorithm 2.2

This is again a modification of **Algorithm 2.1** and **Algorithm 1.2**, where we limit ourselves to the relevant key-points (in this case the upper-body) and directly compare the `(n, (k, 3))` dimensional arrays of each of the videos to obtain the score. So, each frame contains information of only the relevant `k` key-points (of the upper-body) in a `(k, 3)` dimensional array. We use DTW on this to obtain the score.

In [None]:
## to create a helper function to extract (n, len(landmark_list), 3) dimensional array to store the coordinates of each landmark in the landmark_list in each frame (we only consider the landmarks in this list `landmark_list`)

def extract_arr(input_video_path, landmark_list = list(range(33))):
    """Extract the coordinates of selected BlazePose key-points for every frame.

    Parameters
    ----------
    input_video_path : str
        Path of the video file handed to ``cv2.VideoCapture``.
    landmark_list : list of int, optional
        Indices of the landmarks to keep (all 33 by default). It is only
        read, never sorted or changed in place.

    Returns
    -------
    coordinates : numpy.ndarray
        Array of shape ``(n, len(landmark_list), 3)`` holding ``(x, y, z)``
        per selected landmark, ordered by ascending landmark index.
    landmark_list : list of int
        Sorted copy of the requested indices (the order used on axis 1).

    Notes
    -----
    A frame without a detection reuses the landmarks of the last frame that
    had one; frames preceding the first detection are skipped.
    """
    coordinates = []
    # One sorted copy up front instead of re-sorting the caller's list every frame.
    ordered_indices = sorted(landmark_list)
    # Most recent successful detection (None until the first one).
    last_landmarks = None

    cap = cv2.VideoCapture(input_video_path)
    try:
        with mp_pose.Pose(min_detection_confidence=0.3, min_tracking_confidence=0.3) as pose:
            while cap.isOpened():
                # Reading frames from video
                success, frame = cap.read()
                if not success:
                    # A failed read ends the loop (this is what happens once the video is exhausted).
                    print("Ignoring empty camera frame.")
                    break

                # Recolor image to RGB and mark it read-only before detection
                image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                image.flags.writeable = False

                # Make detection
                results = pose.process(image)

                # No detection in this frame: fall back to the previous landmarks.
                if results.pose_landmarks is not None:
                    last_landmarks = results.pose_landmarks.landmark
                if last_landmarks is None:
                    continue

                # (x, y, z) of each selected key-point; reshape keeps the
                # (k, 3) layout even for an empty selection.
                frame_coordinates = np.array(
                    [[last_landmarks[idx].x, last_landmarks[idx].y, last_landmarks[idx].z]
                     for idx in ordered_indices],
                    dtype=float,
                ).reshape(len(ordered_indices), 3)
                coordinates.append(frame_coordinates)
    finally:
        # Release the capture device even if detection raised.
        cap.release()

    return np.array(coordinates), ordered_indices


from dtaidistance.dtw_ndim import distance, distance_fast

## to compare 2 videos of dim (n, len(landmark_list), 3) using DTW

def compare_vid(vid1_arr, vid2_arr):
    """Score two whole videos against each other with a single DTW run.

    Each array is divided by its overall norm, the DTW ``distance`` between
    the two results is taken, and ``100 - 100 * distance`` is returned.
    """
    # normalize the coordinates (first video, then second)
    unit_first, unit_second = (
        clip / np.linalg.norm(clip) for clip in (vid1_arr, vid2_arr)
    )

    dtw_cost = distance(unit_first, unit_second)
    return 100 - (dtw_cost * 100)


## helper function for comparing videos by path: just including the pre-processing steps in the previous function

def combined_compare(vid1_path, vid2_path, landmark_list = list(range(33))):
    """Compare two videos on a subset of key-points and print the single score.

    Parameters
    ----------
    vid1_path, vid2_path : str
        Paths of the two videos to compare.
    landmark_list : list of int, optional
        Indices of the key-points to compare (all 33 by default). The list is
        not modified.

    The score is printed; nothing is returned.
    """
    # Hand the helper a private sorted copy: it may sort its argument in
    # place, which must not reach the caller's list or the shared default.
    selected = sorted(landmark_list)

    vid1_arr, _ = extract_arr(vid1_path, selected)
    vid2_arr, _ = extract_arr(vid2_path, selected)

    score = compare_vid(vid1_arr, vid2_arr)

    print("Final Score:", score)

We now use the videos in **Dataset 0** to test our algorithms.

In [None]:
# Landmark indices 0..24 inclusive (25 key-points): we restrict the comparison to
# these to consider only the upper body.
landmark_list = list(range(25))

In [None]:
combined_compare(vid1_path, vid1_path,landmark_list)

Ignoring empty camera frame.
Ignoring empty camera frame.
Final Score: 100.0


In [None]:
combined_compare(vid1_path, vid2_path,landmark_list)

Ignoring empty camera frame.
Ignoring empty camera frame.
Final Score: 83.77952859907592


In [None]:
combined_compare(vid1_path, vid3_path,landmark_list)

Ignoring empty camera frame.
Ignoring empty camera frame.
Final Score: 70.6449503249911


# 3. Algorithms with Cube Normalization

As in the previous algorithms, we again use Mediapipe BlazePose to extract the coordinates of the 33 key-points along the `x`, `y` and `z` axes, which are normalized w.r.t. the size of the frame. But this time we go one step further and implement the “Cube Normalization”.

**Cube Normalization:**
For each frame, first we extract the $(x_{\text{min}}, y_{\text{min}}, z_{\text{min}})$ and $(x_{\text{max}}, y_{\text{max}}, z_{\text{max}})$ which are the min and max coordinates of each of the components, amongst all the 33 key-point coordinates in a particular frame.
We use these min-max coordinates to transform the coordinates of the 33 key-points using the following equation:
$$(x,y,z) \mapsto  (\frac{x - x_{\text{min}}}{x_{\text{max}} - x_{\text{min}}}, \frac{y - y_{\text{min}}}{y_{\text{max}} - y_{\text{min}}}, \frac{z - z_{\text{min}}}{z_{\text{max}} - z_{\text{min}}})$$

After this transformation, we only focus on the key-points of the subject, which makes the background obsolete. This also manages to squeeze all the coordinates into a unit cube, with the origin translated to the $(x_{\text{min}}, y_{\text{min}}, z_{\text{min}})$.

## Algorithm 3.1

In [None]:
## to create a helper function to extract (n, 33, 3) dimensional array to store the normalized coordinates (via cube normalization) of each landmark in each frame

def extract_arr(input_video_path):
    """Extract cube-normalized BlazePose key-point coordinates for every frame.

    For each frame, the ``(x, y, z)`` values of the 33 landmarks are rescaled
    axis by axis with that frame's own minimum and maximum:
    ``(v - v_min) / (v_max - v_min)``.

    Parameters
    ----------
    input_video_path : str
        Path of the video file handed to ``cv2.VideoCapture``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n, 33, 3)`` with values in ``[0, 1]``. Empty when no
        frame could be read or no pose was ever detected.

    Notes
    -----
    * A frame without a detection reuses the landmarks of the last frame that
      had one; frames preceding the first detection are skipped.
    * If all key-points share the same value on an axis, that axis is set to 0
      for the frame rather than dividing by zero.
    """
    coordinates = []
    # Landmarks of the most recent frame in which a pose was detected.
    landmarks = None

    cap = cv2.VideoCapture(input_video_path)
    try:
        with mp_pose.Pose(min_detection_confidence=0.3, min_tracking_confidence=0.3) as pose:
            while cap.isOpened():
                # Reading frames from video
                success, frame = cap.read()
                if not success:
                    # A failed read ends the loop (this is what happens once the video is exhausted).
                    print("Ignoring empty camera frame.")
                    break

                # Recolor image to RGB and mark it read-only before detection
                image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                image.flags.writeable = False

                # Make detection
                results = pose.process(image)

                # Keep the previous landmarks when nothing was detected in this frame.
                if results.pose_landmarks is not None:
                    landmarks = results.pose_landmarks.landmark
                if landmarks is None:
                    continue

                # raw (x, y, z) of each of the 33 key-points of this frame
                frame_coordinates = np.array(
                    [[landmarks[i].x, landmarks[i].y, landmarks[i].z] for i in range(33)],
                    dtype=float,
                )

                # cube normalization: per-axis min and extent of this frame
                lower = frame_coordinates.min(axis=0)
                extent = frame_coordinates.max(axis=0) - lower
                # A zero extent would produce NaN; dividing by 1 maps the axis to 0.
                extent[extent == 0] = 1.0

                coordinates.append((frame_coordinates - lower) / extent)
    finally:
        # Release the capture device even if detection raised.
        cap.release()

    return np.array(coordinates)


## reshape the (n, 33, 3) dimensional array to a (33, n, 3) dimensional array for extracting scores of each key-point

def my_reshape(input_arr):
    """Swap the frame and landmark axes of a coordinate array.

    Parameters
    ----------
    input_arr : array-like
        Array of shape ``(n_frames, n_landmarks, d)``.

    Returns
    -------
    numpy.ndarray
        New float array of shape ``(n_landmarks, n_frames, d)`` in which
        ``result[i][f]`` equals ``input_arr[f][i]``; it does not alias the
        input.
    """
    source = np.asarray(input_arr, dtype=float)
    # moveaxis returns a view of `source`; copy() makes the result independent.
    return np.moveaxis(source, 1, 0).copy()


from dtaidistance.dtw_ndim import distance, distance_fast

## to compare 2 videos of dim (33, n, 3) using DTW

def compare_vid(vid1_arr, vid2_arr):
    """Score two videos key-point by key-point with DTW.

    Parameters
    ----------
    vid1_arr, vid2_arr : numpy.ndarray
        Arrays of shape ``(n_landmarks, n_frames, 3)``. They are only read;
        the normalization below works on temporaries.

    Returns
    -------
    numpy.ndarray
        One score per key-point: ``100 - 100 * d`` with ``d`` the DTW
        distance between the two norm-scaled trajectories.
    """
    n_landmarks = vid1_arr.shape[0]
    scores = np.zeros(n_landmarks)

    for i in range(n_landmarks):
        # normalize the coordinates without overwriting the caller's rows
        scaled1 = vid1_arr[i] / np.linalg.norm(vid1_arr[i])
        scaled2 = vid2_arr[i] / np.linalg.norm(vid2_arr[i])

        # calculate distance; we give a score out of 100
        scores[i] = 100 - (distance(scaled1, scaled2) * 100)

    return scores


## helper function for comparing videos by path: just including the pre-processing steps in the previous function

def combined_compare(vid1_path, vid2_path):
    """Compare two videos given by path and print the per-key-point scores.

    Pipeline: ``extract_arr`` on both paths, ``my_reshape`` on both results,
    then ``compare_vid``. The score list and its mean are printed; nothing is
    returned.
    """
    # Both extractions run before either array is reshaped.
    extracted = [extract_arr(path) for path in (vid1_path, vid2_path)]
    first_by_keypoint, second_by_keypoint = map(my_reshape, extracted)

    scores = compare_vid(first_by_keypoint, second_by_keypoint)

    print("Scores List:\n", scores)
    print("Average Score:", scores.mean())

We now use the videos in **Dataset 0** to test our algorithms.

In [None]:
combined_compare(vid1_path, vid1_path)

Ignoring empty camera frame.
Ignoring empty camera frame.
Scores List:
 [100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100.
 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100.
 100. 100. 100. 100. 100.]
Average Score: 100.0


In [None]:
combined_compare(vid1_path, vid2_path)

Ignoring empty camera frame.
Ignoring empty camera frame.
Scores List:
 [76.09880093 71.03308007 71.92827159 72.97813549 71.17779816 71.32982945
 71.62863353 83.42066047 80.79698083 83.79111416 83.92837262 91.99387374
 89.42282927 89.75248382 84.20653405 80.34372698 74.62355477 77.20345952
 71.53413893 77.28875337 72.1713712  78.79991038 74.87175014 87.50617816
 86.33696332 82.74433482 88.15196298 92.0240622  93.22718093 93.21799267
 93.55924829 90.54064646 93.09000439]
Average Score: 81.84007992994708


In [None]:
combined_compare(vid1_path, vid3_path)

Ignoring empty camera frame.
Ignoring empty camera frame.
Scores List:
 [70.64647839 72.04212754 72.4391972  72.87225625 70.89557378 71.13176141
 71.47964269 76.64926065 73.51485129 73.78181789 72.46286718 81.5419961
 79.96752798 66.27161632 75.31776237 59.18279573 65.48069597 52.85263092
 59.70680572 52.3766379  57.8763038  55.54283735 61.07838155 74.01273037
 75.66878705 73.65711004 77.58279603 79.34605448 79.69506302 80.02573554
 80.03066776 78.59057671 80.22067364]
Average Score: 71.02854607963663


## Algorithm 3.2

In [None]:
## to create a helper function to extract (n, len(landmark_list), 3) dimensional array to store the normalized coordinates (via cube normalization) of each landmark in the landmark_list in each frame (we only consider the landmarks in this list `landmark_list`)

def extract_arr(input_video_path, landmark_list = list(range(33))):
    """Extract cube-normalized coordinates of selected key-points for every frame.

    For each frame, the ``(x, y, z)`` values of the selected landmarks are
    rescaled axis by axis with the minimum and maximum taken over those
    selected landmarks: ``(v - v_min) / (v_max - v_min)``.

    Parameters
    ----------
    input_video_path : str
        Path of the video file handed to ``cv2.VideoCapture``.
    landmark_list : list of int, optional
        Indices of the landmarks to keep (all 33 by default). The list is
        read only; it is never sorted or otherwise modified.

    Returns
    -------
    coordinates : numpy.ndarray
        Array of shape ``(n, len(landmark_list), 3)`` with values in
        ``[0, 1]``, ordered by ascending landmark index.
    landmark_list : list of int
        A sorted copy of the requested indices, matching axis 1 of the array.

    Notes
    -----
    * A frame without a detection reuses the landmarks of the last frame that
      had one; frames preceding the first detection are skipped.
    * If the selected key-points share the same value on an axis (always the
      case for a single key-point), that axis is set to 0 for the frame
      rather than dividing by zero.
    """
    coordinates = []
    # Sorted once, on a copy: the caller's list (and the shared default) stay as they were.
    selected = sorted(landmark_list)
    # Landmarks of the most recent frame in which a pose was detected.
    landmarks = None

    cap = cv2.VideoCapture(input_video_path)
    try:
        with mp_pose.Pose(min_detection_confidence=0.3, min_tracking_confidence=0.3) as pose:
            while cap.isOpened():
                # Reading frames from video
                success, frame = cap.read()
                if not success:
                    # A failed read ends the loop (this is what happens once the video is exhausted).
                    print("Ignoring empty camera frame.")
                    break

                # Recolor image to RGB and mark it read-only before detection
                image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                image.flags.writeable = False

                # Make detection
                results = pose.process(image)

                # Keep the previous landmarks when nothing was detected in this frame.
                if results.pose_landmarks is not None:
                    landmarks = results.pose_landmarks.landmark
                if landmarks is None:
                    continue

                # raw (x, y, z) of each selected key-point; reshape keeps the
                # (k, 3) layout even for an empty selection
                frame_coordinates = np.array(
                    [[landmarks[idx].x, landmarks[idx].y, landmarks[idx].z] for idx in selected],
                    dtype=float,
                ).reshape(len(selected), 3)

                if selected:
                    # cube normalization: per-axis min and extent of this frame
                    lower = frame_coordinates.min(axis=0)
                    extent = frame_coordinates.max(axis=0) - lower
                    # A zero extent would produce NaN; dividing by 1 maps the axis to 0.
                    extent[extent == 0] = 1.0
                    frame_coordinates = (frame_coordinates - lower) / extent

                coordinates.append(frame_coordinates)
    finally:
        # Release the capture device even if detection raised.
        cap.release()

    return np.array(coordinates), selected


## reshape the (n, len(landmark_list), 3) dimensional array to a (len(landmark_list), n, 3) dimensional array for extracting scores of each key-point

def my_reshape(input_arr):
    """Reorder a per-frame coordinate array so that the landmark axis comes first.

    Parameters
    ----------
    input_arr : array-like
        Array of shape ``(n_frames, n_landmarks, d)``.

    Returns
    -------
    numpy.ndarray
        Independent float array of shape ``(n_landmarks, n_frames, d)`` with
        ``result[i][f] == input_arr[f][i]``.
    """
    per_frame = np.asarray(input_arr, dtype=float)
    # swapaxes gives a view onto the input; copy() separates the two.
    return per_frame.swapaxes(0, 1).copy()


from dtaidistance.dtw_ndim import distance, distance_fast

## to compare 2 videos of dim (len(landmark_list), n, 3) using DTW

def compare_vid(vid1_arr, vid2_arr):
    """Score two videos key-point by key-point with DTW.

    Parameters
    ----------
    vid1_arr, vid2_arr : numpy.ndarray
        Arrays of shape ``(k, n_frames, 3)`` where ``k`` is the number of
        selected key-points. Both are left exactly as passed in.

    Returns
    -------
    numpy.ndarray
        ``k`` scores, each ``100 - 100 * d`` with ``d`` the DTW distance
        between the two norm-scaled trajectories of that key-point.
    """
    n_landmarks = vid1_arr.shape[0]
    scores = np.zeros(n_landmarks)

    for i in range(n_landmarks):
        # normalize the coordinates into new arrays (no write-back to the inputs)
        series1 = vid1_arr[i] / np.linalg.norm(vid1_arr[i])
        series2 = vid2_arr[i] / np.linalg.norm(vid2_arr[i])

        # calculate distance
        d = distance(series1, series2)
        # we give a score out of 100
        scores[i] = 100 - (d * 100)

    return scores


## helper function for comparing videos by path: just including the pre-processing steps in the previous function

def combined_compare(vid1_path, vid2_path, landmark_list = list(range(33))):
    """Compare two videos by path and print per-landmark and average scores.

    Args:
        vid1_path: path of the first video.
        vid2_path: path of the second video.
        landmark_list: indices of the landmarks to compare (all 33 by default).

    Returns:
        None; the score list and its mean are printed.
    """
    # hand extract_arr a private sorted copy so that any in-place reordering
    # done there cannot reach the caller's list or the shared default list
    selected = sorted(landmark_list)

    vid1_arr, _ = extract_arr(vid1_path, selected)
    vid2_arr, _ = extract_arr(vid2_path, selected)

    vid1_arr_new = my_reshape(vid1_arr)
    vid2_arr_new = my_reshape(vid2_arr)

    scores = compare_vid(vid1_arr_new, vid2_arr_new)

    print("Scores List:\n", scores)
    print("Average Score:", scores.mean())

We now use the videos in **Dataset 0** to test our algorithms.

In [None]:
# we consider landmarks from 0 till 24: to consider only the upper body
landmark_list = list(range(25))

In [None]:
combined_compare(vid1_path, vid1_path,landmark_list)

Ignoring empty camera frame.
Ignoring empty camera frame.
Scores List:
 [100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100.
 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100.]
Average Score: 100.0


In [None]:
combined_compare(vid1_path, vid2_path,landmark_list)

Ignoring empty camera frame.
Ignoring empty camera frame.
Scores List:
 [79.29076533 74.62334593 75.25919102 76.04498242 76.75394097 76.75762224
 76.875806   85.18942254 85.8666015  85.01640652 86.24877752 93.15801846
 88.64307079 87.08987375 83.57118547 88.67744092 74.7101584  87.44175462
 73.97601561 87.14164671 75.5954874  87.54069156 76.52657353 84.99043884
 85.98794896]
Average Score: 82.11908668037474


In [None]:
combined_compare(vid1_path, vid3_path,landmark_list)

Ignoring empty camera frame.
Ignoring empty camera frame.
Scores List:
 [77.35746198 77.4755386  77.96513494 78.49942307 77.01953939 77.18586414
 77.48336462 81.13451253 79.70358058 79.9590351  79.50166629 81.43148999
 85.53170833 65.59291866 76.88160367 52.32549177 69.21050326 49.27063941
 66.29886869 49.27494289 64.55502344 50.97623    66.85073321 75.39640356
 76.66159941]
Average Score: 71.7417311011076


## Algorithm 4.1

In [None]:
## to create a helper function to extract (n, 33, 3) dimensional array to store the normalized coordinates (via cube normalization) of each landmark in each frame

def extract_arr(input_video_path):
    """Extract cube-normalized pose landmarks for every frame of a video.

    Each frame is run through MediaPipe Pose. The (x, y, z) coordinates of the
    33 landmarks are then min-max scaled per axis using that frame's own
    extrema, so every frame's pose is mapped into the unit cube.

    A frame in which no pose is detected reuses the most recent detection;
    frames that precede the first detection are skipped.

    Args:
        input_video_path: path of the video file to read.

    Returns:
        np.ndarray of shape (n, 33, 3) with one entry per processed frame.
    """
    coordinates = []
    landmarks = None  # most recent successful detection

    cap = cv2.VideoCapture(input_video_path)
    try:
        with mp_pose.Pose(min_detection_confidence=0.3, min_tracking_confidence=0.3) as pose:
            while cap.isOpened():
                # Reading frames from video
                success, frame = cap.read()

                if not success:
                    print("Ignoring empty camera frame.")
                    break

                # Recolor image to RGB and flag it read-only before detection
                image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                image.flags.writeable = False

                # Make detection
                results = pose.process(image)

                # Keep the previous landmarks when this frame has no detection
                detected = getattr(results, "pose_landmarks", None)
                if detected is not None:
                    landmarks = detected.landmark
                if landmarks is None:
                    # nothing detected so far: there is no pose to record yet
                    continue

                # (x, y, z) of each of the 33 key-points in this frame
                frame_coordinates = np.array(
                    [[landmarks[i].x, landmarks[i].y, landmarks[i].z] for i in range(33)],
                    dtype=float,
                )

                # per-axis min-max (cube) normalization
                lows = frame_coordinates.min(axis=0)
                spans = frame_coordinates.max(axis=0) - lows
                # an axis with zero extent maps to 0 instead of NaN
                spans[spans == 0] = 1.0

                coordinates.append((frame_coordinates - lows) / spans)
    finally:
        # Releasing the video capture device, even if processing failed
        cap.release()

    return np.array(coordinates)


from dtaidistance.dtw_ndim import distance, distance_fast

## to compare 2 videos of dim (n, 33, 3) using DTW

def compare_vid(vid1_arr, vid2_arr):
    """Score two videos with a single DTW distance over all landmarks.

    Args:
        vid1_arr: coordinate array of the first video, as returned by
            extract_arr.
        vid2_arr: coordinate array of the second video.

    Returns:
        100 - 100 * d, where d is the DTW distance between the two arrays
        after each has been divided by its own norm.
    """
    # normalize the coordinates (rebinding only; the caller's arrays are untouched)
    vid1_arr = vid1_arr/np.linalg.norm(vid1_arr)
    vid2_arr = vid2_arr/np.linalg.norm(vid2_arr)

    d = distance(vid1_arr, vid2_arr)
    d_score = 100 - (d*100)

    return d_score


## helper function for comparing videos by path: just including the pre-processing steps in the previous function

def combined_compare(vid1_path, vid2_path):
    """Extract both videos, score them with compare_vid and print the result.

    Args:
        vid1_path: path of the first video.
        vid2_path: path of the second video.

    Returns:
        None; the score is printed.
    """
    first_arr, second_arr = extract_arr(vid1_path), extract_arr(vid2_path)
    print("Final Score:", compare_vid(first_arr, second_arr))

We now use the videos in **Dataset 0** to test our algorithms.

In [None]:
combined_compare(vid1_path, vid1_path)

Ignoring empty camera frame.
Ignoring empty camera frame.
Final Score: 100.0


In [None]:
combined_compare(vid1_path, vid2_path)

Ignoring empty camera frame.
Ignoring empty camera frame.
Final Score: 80.65988501532588


In [None]:
combined_compare(vid1_path, vid3_path)

Ignoring empty camera frame.
Ignoring empty camera frame.
Final Score: 67.32642349397406


## Algorithm 4.2

In [None]:
## to create a helper function to extract (n, len(landmark_list), 3) dimensional array to store the normalized coordinates (via cube normalization) of each landmark in the landmark_list in each frame (we only consider the landmarks in this list `landmark_list`)

def extract_arr(input_video_path, landmark_list = list(range(33))):
    """Extract cube-normalized coordinates of selected landmarks per frame.

    Each frame is run through MediaPipe Pose. The (x, y, z) coordinates of the
    requested landmarks are min-max scaled per axis using the extrema of those
    landmarks in that frame.

    A frame in which no pose is detected reuses the most recent detection;
    frames that precede the first detection are skipped.

    Args:
        input_video_path: path of the video file to read.
        landmark_list: indices of the landmarks to keep (all 33 by default).
            The list is not modified.

    Returns:
        A tuple (coordinates, landmarks) where coordinates is an np.ndarray of
        shape (n, len(landmark_list), 3) and landmarks is a sorted copy of
        landmark_list giving the landmark index of each row.
    """
    coordinates = []
    landmarks = None  # most recent successful detection

    # sort once, on a copy: the caller's list and the shared default stay intact
    selected = sorted(landmark_list)

    cap = cv2.VideoCapture(input_video_path)
    try:
        with mp_pose.Pose(min_detection_confidence=0.3, min_tracking_confidence=0.3) as pose:
            while cap.isOpened():
                # Reading frames from video
                success, frame = cap.read()

                if not success:
                    print("Ignoring empty camera frame.")
                    break

                # Recolor image to RGB and flag it read-only before detection
                image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                image.flags.writeable = False

                # Make detection
                results = pose.process(image)

                # Keep the previous landmarks when this frame has no detection
                detected = getattr(results, "pose_landmarks", None)
                if detected is not None:
                    landmarks = detected.landmark
                if landmarks is None:
                    # nothing detected so far: there is no pose to record yet
                    continue

                # (x, y, z) of each selected key-point in this frame
                frame_coordinates = np.array(
                    [[landmarks[idx].x, landmarks[idx].y, landmarks[idx].z] for idx in selected],
                    dtype=float,
                ).reshape(len(selected), 3)

                if selected:
                    # per-axis min-max (cube) normalization
                    lows = frame_coordinates.min(axis=0)
                    spans = frame_coordinates.max(axis=0) - lows
                    # an axis with zero extent maps to 0 instead of NaN
                    spans[spans == 0] = 1.0
                    frame_coordinates = (frame_coordinates - lows) / spans

                coordinates.append(frame_coordinates)
    finally:
        # Releasing the video capture device, even if processing failed
        cap.release()

    return np.array(coordinates), selected


from dtaidistance.dtw_ndim import distance, distance_fast

## to compare 2 videos of dim (n, len(landmark_list), 3) using DTW

def compare_vid(vid1_arr, vid2_arr):
    """Return a DTW-based similarity score for two coordinate arrays.

    Each array is divided by its own norm, the DTW distance d between the
    scaled arrays is computed, and 100 - 100 * d is returned.
    """
    # scale each video by its own norm before measuring the distance
    scaled = [arr / np.linalg.norm(arr) for arr in (vid1_arr, vid2_arr)]

    return 100 - (distance(scaled[0], scaled[1]) * 100)


## helper function for comparing videos by path: just including the pre-processing steps in the previous function

def combined_compare(vid1_path, vid2_path, landmark_list = list(range(33))):
    """Compare two videos by path on the selected landmarks and print the score.

    Args:
        vid1_path: path of the first video.
        vid2_path: path of the second video.
        landmark_list: indices of the landmarks to compare (all 33 by default).

    Returns:
        None; the score is printed.
    """
    # hand extract_arr a private sorted copy so that any in-place reordering
    # done there cannot reach the caller's list or the shared default list
    selected = sorted(landmark_list)

    vid1_arr, _ = extract_arr(vid1_path, selected)
    vid2_arr, _ = extract_arr(vid2_path, selected)

    score = compare_vid(vid1_arr, vid2_arr)

    print("Final Score:", score)

We now use the videos in **Dataset 0** to test our algorithms.

In [None]:
# we consider landmarks from 0 till 24: to consider only the upper body
landmark_list = list(range(25))

In [None]:
combined_compare(vid1_path, vid1_path,landmark_list)

Ignoring empty camera frame.
Ignoring empty camera frame.
Final Score: 100.0


In [None]:
combined_compare(vid1_path, vid2_path,landmark_list)

Ignoring empty camera frame.
Ignoring empty camera frame.
Final Score: 77.4598144210633


In [None]:
combined_compare(vid1_path, vid3_path,landmark_list)

Ignoring empty camera frame.
Ignoring empty camera frame.
Final Score: 61.681784088694194


# 4. Normalized Algorithms with added Sensitivity

We need sensitivity hyperparameters to control how strict or lenient we would want our model to be. In our implementation:
- Higher Sensitivity => Lenient Model => Higher Average Score
- Lower Sensitivity => Strict Model => Lower Average Score
- Default Sensitivity = 1

Adding the sensitivity hyperparameter:
- We keep everything the same as seen in Cube Normalization.
- We only tweak the `compare_vid` function (it takes the coordinate arrays extracted from the videos and uses DTW to output a score) by adding an extra parameter called sensitivity (default value = 1; spelled `senstivity` in the code). The sensitivity multiplies the norm in the denominator, i.e. each coordinate array returned by `extract_arr` is divided by `norm * sensitivity`, so a larger sensitivity shrinks the coordinates and hence the DTW distance.
- This enables us to control the strictness of the model.

## Algorithm 5

In [None]:
## to create a helper function to extract (n, 33, 3) dimensional array to store the normalized coordinates (via cube normalization) of each landmark in each frame

def extract_arr(input_video_path):
    """Extract cube-normalized pose landmarks for every frame of a video.

    Each frame is run through MediaPipe Pose. The (x, y, z) coordinates of the
    33 landmarks are then min-max scaled per axis using that frame's own
    extrema, so every frame's pose is mapped into the unit cube.

    A frame in which no pose is detected reuses the most recent detection;
    frames that precede the first detection are skipped.

    Args:
        input_video_path: path of the video file to read.

    Returns:
        np.ndarray of shape (n, 33, 3) with one entry per processed frame.
    """
    coordinates = []
    landmarks = None  # most recent successful detection

    cap = cv2.VideoCapture(input_video_path)
    try:
        with mp_pose.Pose(min_detection_confidence=0.3, min_tracking_confidence=0.3) as pose:
            while cap.isOpened():
                # Reading frames from video
                success, frame = cap.read()

                if not success:
                    print("Ignoring empty camera frame.")
                    break

                # Recolor image to RGB and flag it read-only before detection
                image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                image.flags.writeable = False

                # Make detection
                results = pose.process(image)

                # Keep the previous landmarks when this frame has no detection
                detected = getattr(results, "pose_landmarks", None)
                if detected is not None:
                    landmarks = detected.landmark
                if landmarks is None:
                    # nothing detected so far: there is no pose to record yet
                    continue

                # (x, y, z) of each of the 33 key-points in this frame
                frame_coordinates = np.array(
                    [[landmarks[i].x, landmarks[i].y, landmarks[i].z] for i in range(33)],
                    dtype=float,
                )

                # per-axis min-max (cube) normalization
                lows = frame_coordinates.min(axis=0)
                spans = frame_coordinates.max(axis=0) - lows
                # an axis with zero extent maps to 0 instead of NaN
                spans[spans == 0] = 1.0

                coordinates.append((frame_coordinates - lows) / spans)
    finally:
        # Releasing the video capture device, even if processing failed
        cap.release()

    return np.array(coordinates)


## reshape the (n, 33, 3) dimensional array to a (33, n, 3) dimensional array for extracting scores of each key-point

def my_reshape(input_arr):
    """Turn an (n_frames, n_landmarks, 3) array into (n_landmarks, n_frames, 3).

    Args:
        input_arr: array indexed as [frame][landmark][coordinate] with three
            coordinates per landmark.

    Returns:
        A new float array in which result[i][f] equals input_arr[f][i]; it
        does not share memory with input_arr.
    """
    n_frames = input_arr.shape[0]
    n_landmarks = input_arr.shape[1]

    # new array of desired shape
    new_arr = np.zeros((n_landmarks, n_frames, 3))

    # copy the axis-swapped view in one vectorized assignment
    new_arr[...] = np.swapaxes(input_arr, 0, 1)

    return new_arr


from dtaidistance.dtw_ndim import distance, distance_fast

## to compare 2 videos of dim (33, n, 3) using DTW while incorporating the sensitivity parameter

def compare_vid(vid1_arr, vid2_arr, senstivity = 1):
    """Score two videos landmark by landmark using DTW and a sensitivity factor.

    Args:
        vid1_arr: array indexed as [landmark][frame][coordinate] for video 1.
        vid2_arr: array with the same layout for video 2.
        senstivity: factor multiplied with each landmark's norm before the
            division; higher values give more lenient (higher) scores, lower
            values give stricter scores. Must be non-zero.

    Returns:
        np.ndarray holding one score per landmark, computed as 100 - 100 * d
        where d is the DTW distance between the scaled trajectories. The score
        is not clamped, so it can drop below 0.

    Raises:
        ValueError: if senstivity is 0, which would divide by zero.

    The input arrays are left unmodified.
    """
    if senstivity == 0:
        raise ValueError("senstivity must be non-zero")

    n_landmarks = vid1_arr.shape[0]

    scores = np.zeros(n_landmarks)

    for i in range(n_landmarks):
        # scale into fresh arrays: writing the result back into the inputs
        # would silently alter the caller's data
        seq1 = vid1_arr[i] / (np.linalg.norm(vid1_arr[i]) * senstivity)
        seq2 = vid2_arr[i] / (np.linalg.norm(vid2_arr[i]) * senstivity)

        # calculate distance and turn it into a score out of 100
        d = distance(seq1, seq2)
        scores[i] = 100 - (d * 100)

    return scores


## helper function for comparing videos by path: just including the pre-processing steps in the previous function

def combined_compare(vid1_path, vid2_path, senstivity=1):
    """Run the full per-landmark comparison for two video paths and print it.

    Both videos are extracted, regrouped by landmark with my_reshape and
    scored by compare_vid; senstivity is forwarded unchanged.

    Returns:
        None; the score list and its mean are printed.
    """
    # extract both videos first, then regroup each one by landmark
    raw_arrays = [extract_arr(path) for path in (vid1_path, vid2_path)]
    by_landmark = [my_reshape(arr) for arr in raw_arrays]

    scores = compare_vid(by_landmark[0], by_landmark[1], senstivity)

    print("Scores List:\n", scores)
    print("Average Score:", scores.mean())

We now use the videos in **Dataset 0** to test our algorithms.

### `senstivity = 0.5`

In [None]:
combined_compare(vid1_path, vid2_path, 0.5)

Ignoring empty camera frame.
Ignoring empty camera frame.
Scores List:
 [52.19760186 42.06616013 43.85654318 45.95627098 42.35559631 42.65965891
 43.25726706 66.84132094 61.59396165 67.58222832 67.85674525 83.98774748
 78.84565854 79.50496764 68.4130681  60.68745396 49.24710954 54.40691904
 43.06827785 54.57750674 44.3427424  57.59982076 49.74350029 75.01235632
 72.67392663 65.48866964 76.30392597 84.0481244  86.45436187 86.43598534
 87.11849659 81.08129293 86.18000878]
Average Score: 63.68015985989418


In [None]:
combined_compare(vid1_path, vid3_path, 0.5)

Ignoring empty camera frame.
Ignoring empty camera frame.
Scores List:
 [41.29295678 44.08425509 44.87839439 45.74451249 41.79114755 42.26352283
 42.95928539 53.2985213  47.02970258 47.56363577 44.92573436 63.08399221
 59.93505596 32.54323265 50.63552474 18.36559147 30.96139195  5.70526184
 19.41361143  4.7532758  15.7526076  11.08567471 22.1567631  48.02546074
 51.3375741  47.31422008 55.16559206 58.69210896 59.39012603 60.05147107
 60.06133551 57.18115343 60.44134728]
Average Score: 42.05709215927326


### `senstivity = 1` (Default)

In [None]:
combined_compare(vid1_path, vid2_path, 1)

Ignoring empty camera frame.
Ignoring empty camera frame.
Scores List:
 [76.09880093 71.03308007 71.92827159 72.97813549 71.17779816 71.32982945
 71.62863353 83.42066047 80.79698083 83.79111416 83.92837262 91.99387374
 89.42282927 89.75248382 84.20653405 80.34372698 74.62355477 77.20345952
 71.53413893 77.28875337 72.1713712  78.79991038 74.87175014 87.50617816
 86.33696332 82.74433482 88.15196298 92.0240622  93.22718093 93.21799267
 93.55924829 90.54064646 93.09000439]
Average Score: 81.84007992994708


In [None]:
combined_compare(vid1_path, vid3_path, 1)

Ignoring empty camera frame.
Ignoring empty camera frame.
Scores List:
 [70.64647839 72.04212754 72.4391972  72.87225625 70.89557378 71.13176141
 71.47964269 76.64926065 73.51485129 73.78181789 72.46286718 81.5419961
 79.96752798 66.27161632 75.31776237 59.18279573 65.48069597 52.85263092
 59.70680572 52.3766379  57.8763038  55.54283735 61.07838155 74.01273037
 75.66878705 73.65711004 77.58279603 79.34605448 79.69506302 80.02573554
 80.03066776 78.59057671 80.22067364]
Average Score: 71.02854607963663


### `senstivity = 1.5`

In [None]:
combined_compare(vid1_path, vid2_path, 1.5)

Ignoring empty camera frame.
Ignoring empty camera frame.
Scores List:
 [84.06586729 80.68872004 81.28551439 81.98542366 80.78519877 80.88655297
 81.08575569 88.94710698 87.19798722 89.19407611 89.28558175 94.66258249
 92.94855285 93.16832255 89.4710227  86.89581799 83.08236985 84.80230635
 81.02275928 84.85916891 81.4475808  85.86660692 83.24783343 91.67078544
 90.89130888 88.49622321 92.10130866 94.68270813 95.48478729 95.47866178
 95.70616553 93.69376431 95.39333626]
Average Score: 87.89338661996472


In [None]:
combined_compare(vid1_path, vid3_path, 1.5)

Ignoring empty camera frame.
Ignoring empty camera frame.
Scores List:
 [80.43098559 81.36141836 81.62613146 81.9148375  80.59704918 80.75450761
 80.98642846 84.43284043 82.34323419 82.52121192 81.64191145 87.69466407
 86.64501865 77.51441088 83.54517491 72.78853049 76.98713065 68.56842061
 73.13787048 68.25109193 71.91753587 70.36189157 74.05225437 82.67515358
 83.77919137 82.43807336 85.05519735 86.23070299 86.46337534 86.68382369
 86.68711184 85.72705114 86.81378243]
Average Score: 80.68569738642442


### `senstivity = 2`

In [None]:
combined_compare(vid1_path, vid2_path, 2)

Ignoring empty camera frame.
Ignoring empty camera frame.
Scores List:
 [88.04940047 85.51654003 85.9641358  86.48906775 85.58889908 85.66491473
 85.81431676 91.71033024 90.39849041 91.89555708 91.96418631 95.99693687
 94.71141464 94.87624191 92.10326703 90.17186349 87.31177738 88.60172976
 85.76706946 88.64437668 86.0856856  89.39995519 87.43587507 93.75308908
 93.16848166 91.37216741 94.07598149 96.0120311  96.61359047 96.60899634
 96.77962415 95.27032323 96.54500219]
Average Score: 90.92003996497354


In [None]:
combined_compare(vid1_path, vid3_path, 2)

Ignoring empty camera frame.
Ignoring empty camera frame.
Scores List:
 [85.32323919 86.02106377 86.2195986  86.43612812 85.44778689 85.56588071
 85.73982135 88.32463032 86.75742564 86.89090894 86.23143359 90.77099805
 89.98376399 83.13580816 87.65888119 79.59139787 82.74034799 76.42631546
 79.85340286 76.18831895 78.9381519  77.77141868 80.53919078 87.00636519
 87.83439353 86.82855502 88.79139801 89.67302724 89.84753151 90.01286777
 90.01533388 89.29528836 90.11033682]
Average Score: 85.51427303981832


### `senstivity = 5`

In [None]:
combined_compare(vid1_path, vid2_path, 5)

Ignoring empty camera frame.
Ignoring empty camera frame.
Scores List:
 [95.21976019 94.20661601 94.38565432 94.5956271  94.23555963 94.26596589
 94.32572671 96.68413209 96.15939617 96.75822283 96.78567452 98.39877475
 97.88456585 97.95049676 96.84130681 96.0687454  94.92471095 95.4406919
 94.30682779 95.45775067 94.43427424 95.75998208 94.97435003 97.50123563
 97.26739266 96.54886696 97.6303926  98.40481244 98.64543619 98.64359853
 98.71184966 98.10812929 98.61800088]
Average Score: 96.36801598598942


In [None]:
combined_compare(vid1_path, vid3_path, 5)

Ignoring empty camera frame.
Ignoring empty camera frame.
Scores List:
 [94.12929568 94.40842551 94.48783944 94.57445125 94.17911476 94.22635228
 94.29592854 95.32985213 94.70297026 94.75636358 94.49257344 96.30839922
 95.9935056  93.25432326 95.06355247 91.83655915 93.09613919 90.57052618
 91.94136114 90.47532758 91.57526076 91.10856747 92.21567631 94.80254607
 95.13375741 94.73142201 95.51655921 95.8692109  95.9390126  96.00514711
 96.00613355 95.71811534 96.04413473]
Average Score: 94.20570921592733


## Algorithm 6

In [None]:
## to create a helper function to extract (n, 33, 3) dimensional array to store the normalized coordinates (via cube normalization) of each landmark in each frame

def extract_arr(input_video_path):
    """Extract cube-normalized pose landmarks for every frame of a video.

    Each frame is run through MediaPipe Pose. The (x, y, z) coordinates of the
    33 landmarks are then min-max scaled per axis using that frame's own
    extrema, so every frame's pose is mapped into the unit cube.

    A frame in which no pose is detected reuses the most recent detection;
    frames that precede the first detection are skipped.

    Args:
        input_video_path: path of the video file to read.

    Returns:
        np.ndarray of shape (n, 33, 3) with one entry per processed frame.
    """
    coordinates = []
    landmarks = None  # most recent successful detection

    cap = cv2.VideoCapture(input_video_path)
    try:
        with mp_pose.Pose(min_detection_confidence=0.3, min_tracking_confidence=0.3) as pose:
            while cap.isOpened():
                # Reading frames from video
                success, frame = cap.read()

                if not success:
                    print("Ignoring empty camera frame.")
                    break

                # Recolor image to RGB and flag it read-only before detection
                image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                image.flags.writeable = False

                # Make detection
                results = pose.process(image)

                # Keep the previous landmarks when this frame has no detection
                detected = getattr(results, "pose_landmarks", None)
                if detected is not None:
                    landmarks = detected.landmark
                if landmarks is None:
                    # nothing detected so far: there is no pose to record yet
                    continue

                # (x, y, z) of each of the 33 key-points in this frame
                frame_coordinates = np.array(
                    [[landmarks[i].x, landmarks[i].y, landmarks[i].z] for i in range(33)],
                    dtype=float,
                )

                # per-axis min-max (cube) normalization
                lows = frame_coordinates.min(axis=0)
                spans = frame_coordinates.max(axis=0) - lows
                # an axis with zero extent maps to 0 instead of NaN
                spans[spans == 0] = 1.0

                coordinates.append((frame_coordinates - lows) / spans)
    finally:
        # Releasing the video capture device, even if processing failed
        cap.release()

    return np.array(coordinates)


from dtaidistance.dtw_ndim import distance, distance_fast

## to compare 2 videos of dim (n, 33, 3) using DTW while incorporating the sensitivity parameter

def compare_vid(vid1_arr, vid2_arr, senstivity=1):
    """Score two videos with a single DTW distance and a sensitivity factor.

    Args:
        vid1_arr: coordinate array of the first video, as returned by
            extract_arr.
        vid2_arr: coordinate array of the second video.
        senstivity: factor applied inside the norm that each array is divided
            by; higher values give more lenient (higher) scores, lower values
            give stricter scores. Must be non-zero.

    Returns:
        100 - 100 * d, where d is the DTW distance between the scaled arrays.

    Raises:
        ValueError: if senstivity is 0, which would divide by zero.
    """
    if senstivity == 0:
        raise ValueError("senstivity must be non-zero")

    # normalize the coordinates using sensitivity (rebinding only; the
    # caller's arrays are untouched)
    vid1_arr = vid1_arr/np.linalg.norm(vid1_arr * senstivity)
    vid2_arr = vid2_arr/np.linalg.norm(vid2_arr * senstivity)

    d = distance(vid1_arr, vid2_arr)
    d_score = 100 - (d*100)

    return d_score


## helper function for comparing videos by path: just including the pre-processing steps in the previous function

def combined_compare(vid1_path, vid2_path, senstivity=1):
    """Extract both videos, score them with compare_vid and print the result.

    Args:
        vid1_path: path of the first video.
        vid2_path: path of the second video.
        senstivity: forwarded unchanged to compare_vid.

    Returns:
        None; the score is printed.
    """
    first_arr, second_arr = extract_arr(vid1_path), extract_arr(vid2_path)
    print("Final Score:", compare_vid(first_arr, second_arr, senstivity))

We now use the videos in **Dataset 0** to test our algorithms.

### `senstivity = 0.5`

In [None]:
combined_compare(vid1_path, vid2_path, 0.5)

Ignoring empty camera frame.
Ignoring empty camera frame.
Final Score: 61.31977003065177


In [None]:
combined_compare(vid1_path, vid3_path, 0.5)

Ignoring empty camera frame.
Ignoring empty camera frame.
Final Score: 34.65284698794811


### `senstivity = 1` (Default)

In [None]:
combined_compare(vid1_path, vid2_path, 1)

Ignoring empty camera frame.
Ignoring empty camera frame.
Final Score: 80.65988501532588


In [None]:
combined_compare(vid1_path, vid3_path, 1)

Ignoring empty camera frame.
Ignoring empty camera frame.
Final Score: 67.32642349397406


### `senstivity = 1.5`

In [None]:
combined_compare(vid1_path, vid2_path, 1.5)

Ignoring empty camera frame.
Ignoring empty camera frame.
Final Score: 87.10659001021726


In [None]:
combined_compare(vid1_path, vid3_path, 1.5)

Ignoring empty camera frame.
Ignoring empty camera frame.
Final Score: 78.2176156626494


### `senstivity = 2`

In [None]:
combined_compare(vid1_path, vid2_path, 2)

Ignoring empty camera frame.
Ignoring empty camera frame.
Final Score: 90.32994250766293


In [None]:
combined_compare(vid1_path, vid3_path, 2)

Ignoring empty camera frame.
Ignoring empty camera frame.
Final Score: 83.66321174698703


### `senstivity = 5`

In [None]:
combined_compare(vid1_path, vid2_path, 5)

Ignoring empty camera frame.
Ignoring empty camera frame.
Final Score: 96.13197700306517


In [None]:
combined_compare(vid1_path, vid3_path, 5)

Ignoring empty camera frame.
Ignoring empty camera frame.
Final Score: 93.46528469879479


# 5. Comparison without incorporating any Angle Invariance

In order to gauge the improvement obtained by incorporating angle invariance into the previous algorithms, we first test our most recent algorithms on **Dataset 1**, which contains videos of a subject doing bicep curls, captured from 3 angles, i.e. front, left and right.

We test these videos by comparing them using **Algorithm 5** and **Algorithm 6**

## Algorithm 5

In [None]:
## to create a helper function to extract (n, 33, 3) dimensional array to store the normalized coordinates (via cube normalization) of each landmark in each frame

def extract_arr(input_video_path):
    """Extract cube-normalized pose landmarks for every frame of a video.

    Each frame is run through MediaPipe Pose. The (x, y, z) coordinates of the
    33 landmarks are then min-max scaled per axis using that frame's own
    extrema, so every frame's pose is mapped into the unit cube.

    A frame in which no pose is detected reuses the most recent detection;
    frames that precede the first detection are skipped.

    Args:
        input_video_path: path of the video file to read.

    Returns:
        np.ndarray of shape (n, 33, 3) with one entry per processed frame.
    """
    coordinates = []
    landmarks = None  # most recent successful detection

    cap = cv2.VideoCapture(input_video_path)
    try:
        with mp_pose.Pose(min_detection_confidence=0.3, min_tracking_confidence=0.3) as pose:
            while cap.isOpened():
                # Reading frames from video
                success, frame = cap.read()

                if not success:
                    print("Ignoring empty camera frame.")
                    break

                # Recolor image to RGB and flag it read-only before detection
                image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                image.flags.writeable = False

                # Make detection
                results = pose.process(image)

                # Keep the previous landmarks when this frame has no detection
                detected = getattr(results, "pose_landmarks", None)
                if detected is not None:
                    landmarks = detected.landmark
                if landmarks is None:
                    # nothing detected so far: there is no pose to record yet
                    continue

                # (x, y, z) of each of the 33 key-points in this frame
                frame_coordinates = np.array(
                    [[landmarks[i].x, landmarks[i].y, landmarks[i].z] for i in range(33)],
                    dtype=float,
                )

                # per-axis min-max (cube) normalization
                lows = frame_coordinates.min(axis=0)
                spans = frame_coordinates.max(axis=0) - lows
                # an axis with zero extent maps to 0 instead of NaN
                spans[spans == 0] = 1.0

                coordinates.append((frame_coordinates - lows) / spans)
    finally:
        # Releasing the video capture device, even if processing failed
        cap.release()

    return np.array(coordinates)


## reshape the (n, 33, 3) dimensional array to a (33, n, 3) dimensional array for extracting scores of each key-point

def my_reshape(input_arr):
    """Swap the frame and landmark axes of a coordinate array.

    Args:
        input_arr: array indexed as [frame][landmark][coordinate] with three
            coordinates per landmark.

    Returns:
        A new float array of shape (n_landmarks, n_frames, 3) where
        result[i][f] equals input_arr[f][i]. It is an independent copy, so
        later in-place edits of the result do not touch input_arr.
    """
    n_frames = input_arr.shape[0]
    n_landmarks = input_arr.shape[1]

    # new array of desired shape
    new_arr = np.zeros((n_landmarks, n_frames, 3))

    # vectorized replacement for the nested frame/landmark copy loop
    new_arr[...] = np.swapaxes(input_arr, 0, 1)

    return new_arr


from dtaidistance.dtw_ndim import distance, distance_fast

## to compare 2 videos of dim (33, n, 3) using DTW while incorporating the sensitivity parameter

def compare_vid(vid1_arr, vid2_arr, senstivity = 1):
    """Score two videos landmark by landmark using DTW and a sensitivity factor.

    Args:
        vid1_arr: array indexed as [landmark][frame][coordinate] for video 1.
        vid2_arr: array with the same layout for video 2.
        senstivity: factor multiplied with each landmark's norm before the
            division; higher values give more lenient (higher) scores, lower
            values give stricter scores. Must be non-zero.

    Returns:
        np.ndarray holding one score per landmark, computed as 100 - 100 * d
        where d is the DTW distance between the scaled trajectories. The score
        is not clamped, so it can drop below 0.

    Raises:
        ValueError: if senstivity is 0, which would divide by zero.

    The input arrays are left unmodified.
    """
    if senstivity == 0:
        raise ValueError("senstivity must be non-zero")

    n_landmarks = vid1_arr.shape[0]

    scores = np.zeros(n_landmarks)

    for i in range(n_landmarks):
        # scale into fresh arrays: writing the result back into the inputs
        # would silently alter the caller's data
        seq1 = vid1_arr[i] / (np.linalg.norm(vid1_arr[i]) * senstivity)
        seq2 = vid2_arr[i] / (np.linalg.norm(vid2_arr[i]) * senstivity)

        # calculate distance and turn it into a score out of 100
        d = distance(seq1, seq2)
        scores[i] = 100 - (d * 100)

    return scores


## helper function for comparing videos by path: just including the pre-processing steps in the previous function

def combined_compare(vid1_path, vid2_path, senstivity=1):
    """Compare two videos given by path and print per-landmark scores.

    Each video goes through ``extract_arr`` and ``my_reshape``; the two
    resulting arrays are passed to ``compare_vid`` together with
    ``senstivity``. The score array and its mean are printed; nothing is
    returned.
    """
    first_frames = extract_arr(vid1_path)
    second_frames = extract_arr(vid2_path)

    per_landmark = compare_vid(
        my_reshape(first_frames),
        my_reshape(second_frames),
        senstivity,
    )

    print("Scores List:\n", per_landmark)
    print("Average Score:", per_landmark.mean())

In [None]:
combined_compare(vidf_path, vidl_path)

Ignoring empty camera frame.
Ignoring empty camera frame.
Scores List:
 [45.22674208 53.4325719  55.64925515 57.89520312 50.84884936 51.94202676
 53.12908517 53.73710754 64.71518806 59.35555499 57.38926076 47.94843278
 36.2996597  52.91552326 45.6973902  21.96842416 45.20449684 -4.83100935
 35.16916514 -6.77046046 24.04786828  6.19902415 41.13754584 59.35762923
 59.44756476 76.35263031 64.58019943 66.28150802 57.38213105 61.41047399
 57.57897927 55.99940455 66.06404367]
Average Score: 47.659438476444215


In [None]:
combined_compare(vidl_path, vidr_path)

Ignoring empty camera frame.
Ignoring empty camera frame.
Scores List:
 [54.67339965 64.00474909 65.9591996  67.86586825 55.4702046  56.61552616
 58.10813624 52.64060764 56.93837906 68.65334328 58.59660477 -3.44145224
 31.97151369 18.97767591 38.18179489 25.37866009 28.47155748 -5.28306657
  9.69910369 -5.90577003  9.32165017  8.39902529 17.21961296 41.40006457
 57.03371723 50.10330405 59.77757616 32.08080955 42.4314739  18.71239601
 29.30638285 50.32866455 51.74349256]
Average Score: 38.34649106405652


In [None]:
combined_compare(vidr_path, vidf_path)

Ignoring empty camera frame.
Ignoring empty camera frame.
Scores List:
 [ 65.84589319  67.10406967  67.55816017  67.69236121  64.24567159
  63.23967121  62.21962484  70.15788802  36.1333119   67.9651035
  63.59649441  40.12820616   0.62910235  55.73664817  13.25159761
  82.12647498   2.33949145  79.81775631 -13.47594076  77.75256505
 -11.7287956   82.17856165  -5.12174903  67.90564144  54.08030043
  66.0554927   71.55136753  57.37749607  70.61079732  50.78935092
  64.70348539  66.67736493  52.73294585]
Average Score: 52.178073049184114


## Algorithm 6

In [None]:
## to create a helper function to extract (n, 33, 3) dimensional array to store the normalized coordinates (via cube normalization) of each landmark in each frame

def extract_arr(input_video_path):
    """Extract cube-normalized pose landmarks for every frame of a video.

    Every frame is run through ``mp_pose.Pose``. The (x, y, z) values of the
    33 landmarks are then min-max normalized per axis within that frame:
    ``(v - v_min) / (v_max - v_min)``.

    Parameters
    ----------
    input_video_path : str
        Path passed to ``cv2.VideoCapture``.

    Returns
    -------
    numpy.ndarray
        Array of shape (n, 33, 3) with one entry per kept frame. If no frame
        yields coordinates (e.g. the capture cannot be opened) the array is
        empty.

    Notes
    -----
    * Frames read before the first successful detection are skipped
      (previously they raised ``NameError``).
    * When detection fails on a later frame, the most recent detection is
      reused, so results for videos that already worked are unchanged.
    * If all landmarks share a value on some axis, the normalization divides
      by zero and yields non-finite values for that frame.
    """
    n_landmarks = 33
    coordinates = []
    # most recent successful detection; stays None until a pose is found
    landmarks = None

    cap = cv2.VideoCapture(input_video_path)
    try:
        with mp_pose.Pose(min_detection_confidence=0.3, min_tracking_confidence=0.3) as pose:
            while cap.isOpened():
                # Reading frames from video
                success, frame = cap.read()

                if not success:
                    # read() fails once no further frame can be grabbed
                    print("Ignoring empty camera frame.")
                    break

                # Recolor image to RGB before detection. The frame is not
                # used afterwards, so it is not converted back to BGR.
                image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                image.flags.writeable = False

                # Make detection
                results = pose.process(image)

                if results.pose_landmarks is not None:
                    landmarks = results.pose_landmarks.landmark
                elif landmarks is None:
                    # nothing detected yet and nothing to fall back on
                    continue

                # (33, 3) array holding the (x, y, z) of each key-point
                frame_coordinates = np.array([
                    [landmarks[i].x, landmarks[i].y, landmarks[i].z]
                    for i in range(n_landmarks)
                ])

                # cube normalization: per-axis min-max scaling within the frame
                axis_min = frame_coordinates.min(axis=0)
                axis_max = frame_coordinates.max(axis=0)
                coordinates.append((frame_coordinates - axis_min)/(axis_max - axis_min))
    finally:
        # Releasing the video capture device, even if processing failed
        cap.release()

    return np.array(coordinates)


from dtaidistance.dtw_ndim import distance, distance_fast

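## to compare 2 videos of dim (n, 33, 3) using DTW on the whole array at once (no per-key-point reshape), while incorporating the sensitivity parameter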

def compare_vid(vid1_arr, vid2_arr, senstivity=1):
    """Score two videos with a single DTW comparison over all landmarks.

    Parameters
    ----------
    vid1_arr, vid2_arr : numpy.ndarray
        Coordinate arrays of the two videos. ``combined_compare`` passes the
        frame-major (n, 33, 3) output of ``extract_arr`` directly.
    senstivity : float, default 1
        Scale factor folded into the normalization; higher sensitivity =>
        more lenient scoring, lower sensitivity => strict scoring.

    Returns
    -------
    float
        ``100 - 100 * d``, where ``d`` is the value returned by ``distance``
        for the two scaled arrays. The score can be negative.
    """
    # Each array is divided by the norm of (array * senstivity); the inputs
    # themselves are only rebound locally, never modified.
    vid1_arr = vid1_arr/np.linalg.norm(vid1_arr * senstivity)
    vid2_arr = vid2_arr/np.linalg.norm(vid2_arr * senstivity)

    d = distance(vid1_arr, vid2_arr)
    # we give a score out of 100
    d_score = 100 - (d*100)

    return d_score


## helper function for comparing videos by path: just including the pre-processing steps in the previous function

def combined_compare(vid1_path, vid2_path, senstivity=1):
    """Compare two videos given by path and print one overall score.

    Both paths are run through ``extract_arr``; the resulting arrays go to
    ``compare_vid`` with ``senstivity``. The score is printed, not returned.
    """
    final_score = compare_vid(
        extract_arr(vid1_path),
        extract_arr(vid2_path),
        senstivity,
    )

    print("Final Score:", final_score)

In [None]:
combined_compare(vidf_path, vidl_path)

Ignoring empty camera frame.
Ignoring empty camera frame.
Final Score: 43.1257580930832


In [None]:
combined_compare(vidl_path, vidr_path)

Ignoring empty camera frame.
Ignoring empty camera frame.
Final Score: 22.883248620537046


In [None]:
combined_compare(vidr_path, vidf_path)

Ignoring empty camera frame.
Ignoring empty camera frame.
Final Score: 43.54172035829884


# 6. Implementing Angle Invariance: Euler's Rotation Matrix

- We rotate the coordinates using Euler's rotation matrix about `y-axis` for various angles and collect the coordinates for each frame.
- If we fix an angle $\theta$ to rotate, we obtain $\frac{2 \pi}{\theta}$ no. of coordinates for each key-point in each frame.
- In essence, we obtain $\frac{2 \pi}{\theta}$ rotated versions of a video, compare each of them against the trainer video, and take the best of the resulting scores.

## Algorithm 7

In [None]:
## This is the Euler's Rotation Matrix (Ry) to rotate the 3D coordinates in anti-clockwise direction w.r.t. the +ve y axis by an angle of theta
def Ry(theta):
    """Return the 3x3 rotation matrix for an angle ``theta`` about the y-axis.

    ``theta`` is in radians (it is passed straight to ``np.cos``/``np.sin``).
    The result is an ``np.matrix`` that rotates 3D coordinates anti-clockwise
    w.r.t. the +ve y axis.
    """
    cos_t = np.cos(theta)
    sin_t = np.sin(theta)

    rows = [
        [cos_t, 0, sin_t],
        [0, 1, 0],
        [-sin_t, 0, cos_t],
    ]
    return np.matrix(rows)


## to create a helper function to extract (n, 33, 3) dimensional array to store the coordinates of each landmark in each frame

def extract_arr(input_video_path):
    """Extract cube-normalized pose landmarks for every frame of a video.

    Every frame is run through ``mp_pose.Pose``. The (x, y, z) values of the
    33 landmarks are then min-max normalized per axis within that frame:
    ``(v - v_min) / (v_max - v_min)``.

    Parameters
    ----------
    input_video_path : str
        Path passed to ``cv2.VideoCapture``.

    Returns
    -------
    numpy.ndarray
        Array of shape (n, 33, 3) with one entry per kept frame. If no frame
        yields coordinates (e.g. the capture cannot be opened) the array is
        empty.

    Notes
    -----
    * Frames read before the first successful detection are skipped
      (previously they raised ``NameError``).
    * When detection fails on a later frame, the most recent detection is
      reused, so results for videos that already worked are unchanged.
    * If all landmarks share a value on some axis, the normalization divides
      by zero and yields non-finite values for that frame.
    """
    n_landmarks = 33
    coordinates = []
    # most recent successful detection; stays None until a pose is found
    landmarks = None

    cap = cv2.VideoCapture(input_video_path)
    try:
        with mp_pose.Pose(min_detection_confidence=0.3, min_tracking_confidence=0.3) as pose:
            while cap.isOpened():
                # Reading frames from video
                success, frame = cap.read()

                if not success:
                    # read() fails once no further frame can be grabbed
                    print("Ignoring empty camera frame.")
                    break

                # Recolor image to RGB before detection. The frame is not
                # used afterwards, so it is not converted back to BGR.
                image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                image.flags.writeable = False

                # Make detection
                results = pose.process(image)

                if results.pose_landmarks is not None:
                    landmarks = results.pose_landmarks.landmark
                elif landmarks is None:
                    # nothing detected yet and nothing to fall back on
                    continue

                # (33, 3) array holding the (x, y, z) of each key-point
                frame_coordinates = np.array([
                    [landmarks[i].x, landmarks[i].y, landmarks[i].z]
                    for i in range(n_landmarks)
                ])

                # cube normalization: per-axis min-max scaling within the frame
                axis_min = frame_coordinates.min(axis=0)
                axis_max = frame_coordinates.max(axis=0)
                coordinates.append((frame_coordinates - axis_min)/(axis_max - axis_min))
    finally:
        # Releasing the video capture device, even if processing failed
        cap.release()

    return np.array(coordinates)


## to create a helper function to extract (n, 33, 3) dimensional array to store the coordinates of each landmark in each frame after rotation (modified version of extract_arr method)

def extract_arr_rot(input_video_path, rot_angle=None):
    """Extract cube-normalized pose landmarks after a rotation about the y-axis.

    Works like ``extract_arr``, except that the raw landmark coordinates of
    each frame are first multiplied by ``Ry(rot_angle)`` and only then
    min-max normalized per axis.

    Parameters
    ----------
    input_video_path : str
        Path passed to ``cv2.VideoCapture``.
    rot_angle : float or None, default None
        Angle handed to ``Ry``. ``None`` or ``0`` delegates to
        ``extract_arr`` (no rotation).

    Returns
    -------
    numpy.ndarray
        Array of shape (n, 33, 3) with one entry per kept frame; empty if no
        frame yields coordinates.

    Notes
    -----
    * Frames read before the first successful detection are skipped
      (previously they raised ``NameError``).
    * When detection fails on a later frame, the most recent detection is
      reused.
    """
    # if no rotation return the original coordinates array
    if rot_angle is None or rot_angle == 0:
        return extract_arr(input_video_path)

    # plain (3, 3) ndarray view of the rotation matrix
    rotation = np.asarray(Ry(rot_angle))

    n_landmarks = 33
    coordinates = []
    # most recent successful detection; stays None until a pose is found
    landmarks = None

    cap = cv2.VideoCapture(input_video_path)
    try:
        with mp_pose.Pose(min_detection_confidence=0.3, min_tracking_confidence=0.3) as pose:
            while cap.isOpened():
                # Reading frames from video
                success, frame = cap.read()

                if not success:
                    # read() fails once no further frame can be grabbed
                    print("Ignoring empty camera frame.")
                    break

                # Recolor image to RGB before detection. The frame is not
                # used afterwards, so it is not converted back to BGR.
                image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                image.flags.writeable = False

                # Make detection
                results = pose.process(image)

                if results.pose_landmarks is not None:
                    landmarks = results.pose_landmarks.landmark
                elif landmarks is None:
                    # nothing detected yet and nothing to fall back on
                    continue

                # (33, 3) array holding the raw (x, y, z) of each key-point
                points = np.array([
                    [landmarks[i].x, landmarks[i].y, landmarks[i].z]
                    for i in range(n_landmarks)
                ])

                # rotate every key-point: row p becomes (rotation @ p)
                frame_coordinates = points @ rotation.T

                # cube normalization: per-axis min-max scaling within the frame
                axis_min = frame_coordinates.min(axis=0)
                axis_max = frame_coordinates.max(axis=0)
                coordinates.append((frame_coordinates - axis_min)/(axis_max - axis_min))
    finally:
        # Releasing the video capture device, even if processing failed
        cap.release()

    return np.array(coordinates)


# reshape the (n, 33, 3) dimensional array to a (33, n, 3) dimensional array for extracting scores of each key-point

def my_reshape(input_arr):
    """Swap the frame and landmark axes of a per-frame coordinate array.

    Parameters
    ----------
    input_arr : array-like of shape (n_frames, n_landmarks, n_coords)
        Per-frame landmark coordinates (the callers in this notebook pass the
        output of ``extract_arr`` / ``extract_arr_rot``, where ``n_coords``
        is 3).

    Returns
    -------
    numpy.ndarray of shape (n_landmarks, n_frames, n_coords), dtype float64
        New C-contiguous array with ``out[i][f] == input_arr[f][i]``. It never
        shares memory with ``input_arr``.

    Raises
    ------
    ValueError
        If ``input_arr`` is not 3-dimensional (for example an empty array
        produced when no frame could be read from a video).
    """
    frames_first = np.asarray(input_arr, dtype=float)
    if frames_first.ndim != 3:
        raise ValueError(
            "expected an array of shape (n_frames, n_landmarks, n_coords), "
            "got shape {}".format(frames_first.shape)
        )

    # One vectorised axis swap replaces the per-element Python double loop.
    # The explicit copy keeps the result independent of the input, exactly as
    # the freshly allocated array of the loop version was.
    return np.swapaxes(frames_first, 0, 1).copy()


from dtaidistance.dtw_ndim import distance, distance_fast

## to compare 2 videos of dim (33, n, 3) using DTW

def compare_vid(vid1_arr, vid2_arr, senstivity = 1):
    """Score two landmark-major videos landmark by landmark with DTW.

    Parameters
    ----------
    vid1_arr, vid2_arr : numpy.ndarray of shape (n_landmarks, n_frames, 3)
        Landmark trajectories (the layout produced by ``my_reshape``). The
        number of landmarks is taken from ``vid1_arr``.
    senstivity : float, default 1
        Extra divisor applied after each trajectory is scaled to unit norm;
        higher sensitivity => more lenient scoring, lower sensitivity =>
        strict scoring.

    Returns
    -------
    numpy.ndarray of shape (n_landmarks,)
        ``100 - 100 * d`` per landmark, where ``d`` is the value returned by
        ``distance`` for the two scaled trajectories. Scores can be negative.

    Notes
    -----
    The inputs are left untouched. ``combined_compare_rot`` passes the same
    reference array on every rotation, and writing the scaled trajectories
    back into it re-normalized that array on each call.
    """
    n_landmarks = vid1_arr.shape[0]

    scores = np.zeros(n_landmarks)

    for i in range(n_landmarks):
        # scale into fresh locals instead of assigning back into the inputs
        series1 = vid1_arr[i]/(np.linalg.norm(vid1_arr[i]) * senstivity)
        series2 = vid2_arr[i]/(np.linalg.norm(vid2_arr[i]) * senstivity)

        # DTW distance between the two scaled trajectories, turned into a
        # score out of 100
        d = distance(series1, series2)
        scores[i] = 100 - (d*100)

    return scores


## helper function for comparing videos by path: just including the pre-processing steps in the previous function

def combined_compare(vid1_path, vid2_path, senstivity=1):
    """Compare two videos given by path and print per-landmark scores.

    Each video goes through ``extract_arr`` and ``my_reshape``; the two
    resulting arrays are passed to ``compare_vid`` together with
    ``senstivity``. The score array and its mean are printed; nothing is
    returned.
    """
    first_frames = extract_arr(vid1_path)
    second_frames = extract_arr(vid2_path)

    per_landmark = compare_vid(
        my_reshape(first_frames),
        my_reshape(second_frames),
        senstivity,
    )

    print("Scores List:\n", per_landmark)
    print("Average Score:", per_landmark.mean())


## helper function for comparing videos by path: just including the pre-processing steps in the previous function
## we add an extra step of rotating the coordinates and collecting them for further comparison to compare two (33, n, 3) dimensional vectors

def combined_compare_rot(vid1_path, vid2_path, senstivity=1, rot_angle=None):
    """Compare a video against a reference at several rotations about the y-axis.

    The first video is re-extracted with ``extract_arr_rot`` at the angles
    ``0, rot_angle, 2*rot_angle, ...`` (fewer than a full turn of 2*pi). Each
    rotated version is compared with the un-rotated second video through
    ``my_reshape`` and ``compare_vid``.

    Parameters
    ----------
    vid1_path : str
        Video whose coordinates are rotated.
    vid2_path : str
        Video that is kept fixed.
    senstivity : float, default 1
        Forwarded to ``compare_vid``.
    rot_angle : float or None, default None
        Rotation step in radians. ``None`` or ``0`` means a single comparison
        without rotation.

    Returns
    -------
    dict
        Maps the rotation angle in degrees to the mean of the scores returned
        by ``compare_vid``. Without rotation the dict has the single key
        ``0.0`` (this path used to return ``None``).
    """
    if rot_angle is None or rot_angle == 0:
        # no rotation: one comparison at 0 degrees
        rotation_angles = [0.0]
    else:
        # The small tolerance guards against 2*pi/rot_angle landing just
        # below an integer because of floating-point rounding.
        n_rotations = int(np.floor(2*np.pi/rot_angle + 1e-9))
        rotation_angles = [rot_angle*i for i in range(n_rotations)]

    # the reference video is extracted once and reused for every rotation
    vid2_arr_new = my_reshape(extract_arr(vid2_path))

    # rotation angle in degrees -> mean score
    scores = {}

    for theta in rotation_angles:
        vid1_arr = extract_arr_rot(vid1_path, theta)
        vid1_arr_new = my_reshape(vid1_arr)

        # A fresh copy is handed over each time so that a compare_vid which
        # normalizes its arguments in place cannot alter the shared reference
        # between iterations.
        score = compare_vid(vid1_arr_new, vid2_arr_new.copy(), senstivity).mean()
        scores[theta*(180/np.pi)] = score

    return scores

We now use the videos in **Dataset 1** to test our algorithms for angle invariance.

### $\theta = \frac{\pi}{2}$

In [None]:
theta = np.pi/2
combined_compare_rot(vidf_path, vidl_path, rot_angle=theta)

Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.


{0.0: 47.659438476444215,
 90.0: 54.02428022596708,
 180.0: 38.287762524885515,
 270.0: 38.14814235803677}

In [None]:
theta = np.pi/2
combined_compare_rot(vidl_path, vidr_path, rot_angle=theta)

Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.


{0.0: 38.34649106405652,
 90.0: 18.98437412497976,
 180.0: 48.24839240653634,
 270.0: 60.44612304276004}

In [None]:
theta = np.pi/2
combined_compare_rot(vidr_path, vidf_path, rot_angle=theta)

Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.


{0.0: 52.178073049184114,
 90.0: 51.090001666712304,
 180.0: 33.45393718279474,
 270.0: 43.88127734300271}

### $\theta = \frac{\pi}{4}$

In [None]:
theta = np.pi/4
combined_compare_rot(vidf_path, vidl_path, rot_angle=theta)

Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.


{0.0: 47.659438476444215,
 45.0: 60.30430232656137,
 90.0: 54.02428022596708,
 135.0: 48.154522230399344,
 180.0: 38.287762524885515,
 225.0: 38.72428958495459,
 270.0: 38.14814235803677,
 315.0: 44.545602472111796}

In [None]:
theta = np.pi/4
combined_compare_rot(vidl_path, vidr_path, rot_angle=theta)

Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.


{0.0: 38.34649106405652,
 45.0: 29.597350251645953,
 90.0: 18.98437412497976,
 135.0: 22.821740366390834,
 180.0: 48.24839240653635,
 225.0: 52.09853614133803,
 270.0: 60.446123042760036,
 315.0: 55.29132903344268}

In [None]:
theta = np.pi/4
combined_compare_rot(vidr_path, vidf_path, rot_angle=theta)

Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.


{0.0: 52.178073049184114,
 45.0: 60.35806076708979,
 90.0: 51.09000166671231,
 135.0: 29.181638962246964,
 180.0: 33.45393718279474,
 225.0: 42.74894720281564,
 270.0: 43.88127734300271,
 315.0: 54.329924607042685}

### $\theta = \frac{\pi}{6}$

In [None]:
theta = np.pi/6
combined_compare_rot(vidf_path, vidl_path, rot_angle=theta)

Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.


{0.0: 47.659438476444215,
 29.999999999999996: 58.781760367498286,
 59.99999999999999: 58.89815144623445,
 90.0: 54.02428022596708,
 119.99999999999999: 50.55557022595661,
 149.99999999999997: 46.30758400398886,
 180.0: 38.287762524885515,
 210.0: 37.93348158634412,
 239.99999999999997: 39.210504535344306,
 270.0: 38.14814235803677,
 299.99999999999994: 43.28729659360412,
 330.0: 44.78307060550464}

In [None]:
theta = np.pi/6
combined_compare_rot(vidl_path, vidr_path, rot_angle=theta)

Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.


{0.0: 38.34649106405652,
 29.999999999999996: 30.73449418164807,
 59.99999999999999: 27.963597941140304,
 90.0: 18.984374124979755,
 119.99999999999999: 21.5046834745022,
 149.99999999999997: 24.85387949798147,
 180.0: 48.24839240653635,
 210.0: 52.05661520949112,
 239.99999999999997: 53.00924977222898,
 270.0: 60.446123042760036,
 299.99999999999994: 55.62914894785719,
 330.0: 54.75260432404643}

In [None]:
theta = np.pi/6
combined_compare_rot(vidr_path, vidf_path, rot_angle=theta)

Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.


{0.0: 52.178073049184114,
 29.999999999999996: 60.93226729045125,
 59.99999999999999: 59.67668193616592,
 90.0: 51.090001666712304,
 119.99999999999999: 30.773560387242483,
 149.99999999999997: 28.53088981539551,
 180.0: 33.45393718279474,
 210.0: 42.25463960514128,
 239.99999999999997: 43.064568327309466,
 270.0: 43.88127734300271,
 299.99999999999994: 53.92688582665791,
 330.0: 54.29148697602684}

### $\theta = \frac{\pi}{8}$

In [None]:
theta = np.pi/8
combined_compare_rot(vidf_path, vidl_path, rot_angle=theta)

Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.


{0.0: 47.659438476444215,
 22.5: 57.228361420429806,
 45.0: 60.30430232656137,
 67.5: 57.21640102568671,
 90.0: 54.02428022596708,
 112.5: 51.86413581712696,
 135.0: 48.154522230399344,
 157.5: 45.32285240240143,
 180.0: 38.287762524885515,
 202.5: 37.239242324854196,
 225.0: 38.72428958495459,
 247.49999999999997: 39.38510957826169,
 270.0: 38.14814235803677,
 292.5: 42.295004582297224,
 315.0: 44.5456024721118,
 337.5: 44.76715560641582}

In [None]:
theta = np.pi/8
combined_compare_rot(vidl_path, vidr_path, rot_angle=theta)

Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.


{0.0: 38.34649106405652,
 22.5: 31.939510389559018,
 45.0: 29.597350251645953,
 67.5: 26.790305115310787,
 90.0: 18.984374124979755,
 112.5: 21.05255730422534,
 135.0: 22.821740366390834,
 157.5: 26.634373282613527,
 180.0: 48.24839240653635,
 202.5: 52.230318037641304,
 225.0: 52.09853614133803,
 247.49999999999997: 54.06973832915243,
 270.0: 60.44612304276004,
 292.5: 56.04153855987537,
 315.0: 55.29132903344268,
 337.5: 54.22376565435354}

In [None]:
theta = np.pi/8
combined_compare_rot(vidr_path, vidf_path, rot_angle=theta)

Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.
Ignoring empty camera frame.


{0.0: 52.178073049184114,
 22.5: 61.30688816454947,
 45.0: 60.35806076708979,
 67.5: 58.83640115867573,
 90.0: 51.090001666712304,
 112.5: 32.196241038427026,
 135.0: 29.181638962246964,
 157.5: 28.888419020272995,
 180.0: 33.45393718279474,
 202.5: 41.71112620127972,
 225.0: 42.74894720281565,
 247.49999999999997: 43.42666721675701,
 270.0: 43.88127734300271,
 292.5: 53.3728479891264,
 315.0: 54.329924607042685,
 337.5: 54.36619212113806}

# 7. Implementing Angle Invariance: Comparing Joint Angles

## Algorithm 8.1 (without Cube Normalization)

- First, we extract the coordinates of 33 keypoints using Mediapipe Blazepose. These are the keypoints and their corresponding labels:

![](https://mediapipe.dev/images/mobile/pose_tracking_full_body_landmarks.png)

- We add 4 more key-points namely:
    - `33`: left-hand
    - `34`: neck
    - `35`: right-hand
    - `36`: middle-pelvis

- We then consider 18 relevant joint angles, computed from the key-point coordinates:
    - ![](https://i.postimg.cc/zBSdvndp/image.png)

- We extract these 18 angles for each frame, and compile these into a `(n, 18)` shaped array, where `n` is the no. of frames in the video.

- Then we use DTW to compare the angle arrays of the two videos and obtain a score.

In [None]:
## calculating the joint angles
def calculate_angle(a, b, c):
    """Return the angle ABC, i.e. the angle at vertex ``b``, in radians.

    ``a``, ``b`` and ``c`` are coordinate arrays of equal shape. The angle is
    the arccosine of the dot product of the unit vectors BA and BC; the dot
    product is clipped to [-1, 1] before ``np.arccos`` is applied.
    """
    def _unit(vec):
        # scale a vector to unit length
        return vec/np.linalg.norm(vec)

    cos_abc = np.dot(_unit(a - b), _unit(c - b))
    return np.arccos(np.clip(cos_abc, -1.0, 1.0))

## these are 18 tuples of keypoints to extract 18 relevant angles

# Each row (a, b, c) lists the key-point indices of one joint angle; the
# angle is measured at the middle key-point b (see calculate_angle).
# Indices 0-32 are the extracted landmarks; 33-36 are the 4 extra key-points
# (33 and 35: hands, 34: neck, 36: middle pelvis).
keypoint_index = np.array((
    # rows using the extra hand / neck / pelvis key-points and key-points 11-16
    (33, 16, 14), (16, 14, 12), (14, 12, 11),
    (14, 12, 24), ( 0, 34, 12), ( 0, 34, 36),
    (12, 11, 13), (13, 11, 23), (11, 13, 15),
    (13, 15, 35),
    # rows using key-points 23-32 (plus 11 and 12)
    (12, 24, 26), (26, 24, 23), (24, 23, 25),
    (11, 23, 25), (24, 26, 28), (23, 25, 27),
    (26, 28, 32), (25, 27, 31),
))

## to create a helper function to extract an (n, 18) dimensional array storing the 18 joint angles of each frame

def extract_angle_arr(input_video_path, keypoint_index = keypoint_index):
    """Extract the joint angles of every frame of a video.

    Every frame is run through ``mp_pose.Pose``. The raw (x, y, z) values of
    the 33 landmarks are extended with 4 derived key-points (33-36), and one
    angle per row of ``keypoint_index`` is computed with ``calculate_angle``.

    Parameters
    ----------
    input_video_path : str
        Path passed to ``cv2.VideoCapture``.
    keypoint_index : sequence of (a, b, c) index triples
        Key-points defining each angle; defaults to the module-level table of
        18 triples. All rows are used.

    Returns
    -------
    numpy.ndarray
        Array of shape (n, len(keypoint_index)) with one row of angles per
        kept frame; empty if no frame yields landmarks.

    Notes
    -----
    * Frames read before the first successful detection are skipped
      (previously they raised ``NameError``).
    * When detection fails on a later frame, the most recent detection is
      reused, so results for videos that already worked are unchanged.
    """
    n_landmarks = 33
    n_angles = len(keypoint_index)

    angles = []
    # most recent successful detection; stays None until a pose is found
    landmarks = None

    cap = cv2.VideoCapture(input_video_path)
    try:
        with mp_pose.Pose(min_detection_confidence=0.3, min_tracking_confidence=0.3) as pose:
            while cap.isOpened():
                # Reading frames from video
                success, frame = cap.read()

                if not success:
                    # read() fails once no further frame can be grabbed
                    print("Ignoring empty camera frame.")
                    break

                # Recolor image to RGB before detection. The frame is not
                # used afterwards, so it is not converted back to BGR.
                image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                image.flags.writeable = False

                # Make detection
                results = pose.process(image)

                if results.pose_landmarks is not None:
                    landmarks = results.pose_landmarks.landmark
                elif landmarks is None:
                    # nothing detected yet and nothing to fall back on
                    continue

                # (37, 3) instead of (33, 3): 4 more key-points are appended
                # for the angle calculation
                frame_coordinates = np.zeros((n_landmarks + 4, 3))
                frame_coordinates[:n_landmarks] = [
                    [landmarks[i].x, landmarks[i].y, landmarks[i].z]
                    for i in range(n_landmarks)
                ]

                # adding 4 new keypoints to the already existing 33 keypoints
                # NOTE(review): 33 is called "left hand" and 35 "right hand"
                # in this notebook, while MediaPipe's landmark list names
                # 18/20 right_pinky/right_index and 17/19 left_pinky/left_index;
                # presumably the notebook labels refer to the image side - confirm.

                # left hand key-point
                frame_coordinates[33] = (frame_coordinates[18] + frame_coordinates[20])/2
                # right hand key-point
                frame_coordinates[35] = (frame_coordinates[17] + frame_coordinates[19])/2
                # neck key-point
                frame_coordinates[34] = (frame_coordinates[11] + frame_coordinates[12])/2
                # middle pelvis key-point
                frame_coordinates[36] = (frame_coordinates[23] + frame_coordinates[24])/2

                # one angle per (a, b, c) triple, measured at key-point b
                angle = np.zeros(n_angles)
                for k in range(n_angles):
                    a_idx, b_idx, c_idx = keypoint_index[k]
                    angle[k] = calculate_angle(
                        frame_coordinates[a_idx],
                        frame_coordinates[b_idx],
                        frame_coordinates[c_idx],
                    )

                angles.append(angle)
    finally:
        # Releasing the video capture, even if processing failed
        cap.release()

    return np.array(angles)


from dtaidistance.dtw_ndim import distance, distance_fast

## to compare 2 videos of dim (n, 18) using DTW

def compare_vid(vid1_arr, vid2_arr, senstivity = 1):
    """Score two joint-angle arrays against each other with DTW.

    Each array is divided by its own norm times ``senstivity`` (higher
    sensitivity => more lenient scoring, lower sensitivity => strict scoring).
    The scaled arrays are then passed to ``distance``.

    Returns ``100 - 100 * d`` for the resulting distance ``d``; the inputs are
    not modified.
    """
    scale_first = np.linalg.norm(vid1_arr) * senstivity
    scale_second = np.linalg.norm(vid2_arr) * senstivity

    # DTW distance between the two scaled arrays
    dtw_distance = distance(vid1_arr/scale_first, vid2_arr/scale_second)

    # we give a score out of 100
    return 100 - (dtw_distance*100)

## helper function for comparing videos by path: just including the pre-processing steps in the previous function

def combined_compare(vid1_path, vid2_path, senstivity=1):
    """Compare two videos given by path via their joint angles.

    Both paths are run through ``extract_angle_arr``; the resulting arrays go
    to ``compare_vid`` with ``senstivity``. The score is printed and returned.
    """
    first_angles, second_angles = [
        extract_angle_arr(path) for path in (vid1_path, vid2_path)
    ]

    score = compare_vid(first_angles, second_angles, senstivity)
    print("Score:", score)

    return score

We now use the videos in **Dataset 1** to test our algorithms for angle invariance.

In [None]:
combined_compare(vidf_path, vidl_path)

INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


Ignoring empty camera frame.
Ignoring empty camera frame.
Score: 62.77854002852074


62.77854002852074

In [None]:
combined_compare(vidl_path, vidr_path)

Ignoring empty camera frame.
Ignoring empty camera frame.
Score: 48.29008589082052


48.29008589082052

In [None]:
combined_compare(vidr_path, vidf_path)

Ignoring empty camera frame.
Ignoring empty camera frame.
Score: 60.443947858380085


60.443947858380085

## Algorithm 8.2 (with Cube Normalization)

- First, we extract the coordinates of 33 keypoints using Mediapipe Blazepose. These are the keypoints and their corresponding labels:

![](https://mediapipe.dev/images/mobile/pose_tracking_full_body_landmarks.png)

- We add 4 more key-points namely:
    - `33`: left-hand
    - `34`: neck
    - `35`: right-hand
    - `36`: middle-pelvis

- We employ cube-normalization by applying the following transformation to each coordinate in each frame:
    - $(x,y,z) \mapsto (\frac{x - x_{\text{min}}}{x_{\text{max}} - x_{\text{min}}}, \frac{y - y_{\text{min}}}{y_{\text{max}} - y_{\text{min}}}, \frac{z - z_{\text{min}}}{z_{\text{max}} - z_{\text{min}}})$
    - This makes $(x_{\text{min}}, y_{\text{min}}, z_{\text{min}})$ the origin of the unit cube coordinate system.

- We then consider 18 relevant joint angles, computed from the key-point coordinates:
    - ![](https://i.postimg.cc/zBSdvndp/image.png)

- We extract these 18 angles for each frame, and compile these into a `(n, 18)` shaped array, where `n` is the no. of frames in the video.

- Then we use DTW to compare the angle arrays of the two videos and obtain a score.

In [None]:
## calculating the joint angles
def calculate_angle(a, b, c):
    """Return the angle ABC (in radians) formed at the vertex ``b``.

    Parameters
    ----------
    a, b, c : numpy.ndarray
        Coordinate vectors of equal length. ``b`` is the vertex of the angle;
        ``a`` and ``c`` are the end-points of its two arms.

    Returns
    -------
    float
        Angle between the arms BA and BC, in ``[0, pi]``. If an arm has zero
        length, dividing by its norm produces NaN and NaN is returned.
    """
    # unit vectors along the two arms BA and BC
    unit_ba, unit_bc = (arm/np.linalg.norm(arm) for arm in (a - b, c - b))

    # clipping keeps round-off from pushing the cosine outside arccos' domain
    cos_abc = np.clip(np.dot(unit_ba, unit_bc), -1.0, 1.0)
    return np.arccos(cos_abc)


## 18 key-point index triples (a, b, c), one per joint angle; each angle is measured at the middle key-point b
## row order matters: row i becomes column i of the array returned by extract_angle_arr
## indices 0-32 are Blazepose landmarks, 33-36 are the derived key-points added in extract_angle_arr
## NOTE(review): the joint names in the row comments assume Mediapipe's landmark numbering
## (odd index = subject's left side, even index = subject's right side) -- confirm against the landmark diagram

keypoint_index = np.array([
    [33, 16, 14],  # at wrist 16: hand midpoint 33 - wrist - elbow
    [16, 14, 12],  # at elbow 14: wrist - elbow - shoulder
    [14, 12, 11],  # at shoulder 12: elbow - shoulder - opposite shoulder
    [14, 12, 24],  # at shoulder 12: elbow - shoulder - hip
    [ 0, 34, 12],  # at neck 34: nose - neck - shoulder 12
    [ 0, 34, 36],  # at neck 34: nose - neck - middle pelvis
    [12, 11, 13],  # at shoulder 11: opposite shoulder - shoulder - elbow
    [13, 11, 23],  # at shoulder 11: elbow - shoulder - hip
    [11, 13, 15],  # at elbow 13: shoulder - elbow - wrist
    [13, 15, 35],  # at wrist 15: elbow - wrist - hand midpoint 35
    [12, 24, 26],  # at hip 24: shoulder - hip - knee
    [26, 24, 23],  # at hip 24: knee - hip - opposite hip
    [24, 23, 25],  # at hip 23: opposite hip - hip - knee
    [11, 23, 25],  # at hip 23: shoulder - hip - knee
    [24, 26, 28],  # at knee 26: hip - knee - ankle
    [23, 25, 27],  # at knee 25: hip - knee - ankle
    [26, 28, 32],  # at ankle 28: knee - ankle - foot index
    [25, 27, 31]   # at ankle 27: knee - ankle - foot index
])

## helper function to extract an (n, 18) dimensional array storing the 18 joint angles computed for each frame

def extract_angle_arr(input_video_path, keypoint_index = keypoint_index):
    """Extract per-frame joint angles from a video using Mediapipe Blazepose.

    For every frame that can be read, the 33 Blazepose landmarks are collected,
    4 derived key-points (indices 33-36) are appended, all coordinates are
    min-max normalised per axis (cube normalisation), and one angle is computed
    with ``calculate_angle`` for each row of ``keypoint_index``.

    Parameters
    ----------
    input_video_path : str
        Path handed to ``cv2.VideoCapture``.
    keypoint_index : array-like of shape (m, 3), optional
        Index triples ``(a, b, c)`` into the 37 key-points; the angle is
        measured at ``b``. Defaults to the module-level ``keypoint_index``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n, m)`` holding the ``m`` angles (radians) of each of
        the ``n`` processed frames; shape ``(0, m)`` when no frame yielded
        angles (e.g. the video could not be opened).

    Notes
    -----
    - If Blazepose reports no pose for a frame, the most recent landmarks are
      reused for that frame. Frames that precede the first successful
      detection are skipped (previously this case raised ``NameError``).
    - An axis on which all 33 landmarks coincide is mapped to 0 instead of
      dividing by zero.
    """
    n_landmarks = 33    # landmarks read from Blazepose
    n_keypoints = 37    # landmarks plus the 4 derived key-points

    angles = []
    landmarks = None    # last successfully detected landmarks

    cap = cv2.VideoCapture(input_video_path)

    try:
        with mp_pose.Pose(min_detection_confidence=0.3, min_tracking_confidence=0.3) as pose:
            while cap.isOpened():
                # Reading frames from video
                success, frame = cap.read()

                if not success:
                    # no frame could be read (typically the end of the video)
                    print("Ignoring empty camera frame.")
                    break

                # Blazepose is fed an RGB image, OpenCV delivers BGR
                image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                # NOTE(review): read-only flag kept from the original; presumably
                # lets Mediapipe use the buffer without copying -- confirm
                image.flags.writeable = False

                # Make detection
                results = pose.process(image)

                # keep the previous landmarks when nothing is detected in this frame
                if results.pose_landmarks is not None:
                    landmarks = results.pose_landmarks.landmark
                if landmarks is None:
                    # no pose has been detected yet, so there is nothing to measure
                    continue

                # (37, 3) instead of (33, 3): 4 extra key-points are needed for the angles
                frame_coordinates = np.zeros((n_keypoints, 3))
                frame_coordinates[:n_landmarks] = [
                    (landmarks[i].x, landmarks[i].y, landmarks[i].z)
                    for i in range(n_landmarks)
                ]

                # per-axis bounds, taken over the 33 detected landmarks only
                mins = frame_coordinates[:n_landmarks].min(axis=0)
                spans = frame_coordinates[:n_landmarks].max(axis=0) - mins
                # a degenerate axis would otherwise divide by zero
                spans[spans == 0] = 1.0

                # adding 4 new keypoints to the already existing 33 keypoints
                # NOTE(review): the notebook labels 33 "left hand" and 35 "right hand";
                # in Mediapipe's numbering 18/20 look like the subject's right
                # pinky/index and 17/19 the left ones -- confirm the intended perspective
                frame_coordinates[33] = (frame_coordinates[18] + frame_coordinates[20])/2
                frame_coordinates[35] = (frame_coordinates[17] + frame_coordinates[19])/2
                # neck key-point: midpoint of landmarks 11 and 12
                frame_coordinates[34] = (frame_coordinates[11] + frame_coordinates[12])/2
                # middle pelvis key-point: midpoint of landmarks 23 and 24
                frame_coordinates[36] = (frame_coordinates[23] + frame_coordinates[24])/2

                # cube normalisation of all 37 key-points
                frame_coordinates = (frame_coordinates - mins)/spans

                # one angle per index triple, in the order given by keypoint_index
                angles.append(np.array([
                    calculate_angle(frame_coordinates[a], frame_coordinates[b], frame_coordinates[c])
                    for a, b, c in keypoint_index
                ]))
    finally:
        # Releasing the video capture even if processing failed
        cap.release()

    if not angles:
        return np.empty((0, len(keypoint_index)))
    return np.array(angles)


from dtaidistance.dtw_ndim import distance, distance_fast

## to compare 2 videos of dim (n, 18) using DTW

def compare_vid(vid1_arr, vid2_arr, senstivity = 1):
    """Score the similarity of two per-frame angle arrays with DTW.

    Each array is divided by ``norm(array) * senstivity`` and the two rescaled
    arrays are passed to ``distance`` (imported from ``dtaidistance.dtw_ndim``).

    Parameters
    ----------
    vid1_arr, vid2_arr : numpy.ndarray
        Angle arrays of the two videos, one row per frame.
    senstivity : number, optional
        Extra divisor applied to both arrays; higher values give more lenient
        scoring, lower values stricter scoring.

    Returns
    -------
    float
        ``100 - 100 * d`` where ``d`` is the DTW distance.
    """
    def _rescale(arr):
        # divide by the array's overall norm, scaled by the sensitivity
        return arr/(np.linalg.norm(arr) * senstivity)

    dtw_distance = distance(_rescale(vid1_arr), _rescale(vid2_arr))

    # turn the distance into a score out of 100
    return 100 - (dtw_distance*100)


## helper function for comparing videos by path: just including the pre-processing steps in the previous function

def combined_compare(vid1_path, vid2_path, senstivity=1):
    """Compare two videos given by path: extract their angle arrays and score them.

    Parameters
    ----------
    vid1_path, vid2_path : str
        Paths of the two videos, each passed to ``extract_angle_arr``.
    senstivity : number, optional
        Forwarded unchanged to ``compare_vid``.

    Returns
    -------
    The score returned by ``compare_vid``; it is also printed.
    """
    # first video is processed before the second, as the paths are listed
    angle_arrays = [extract_angle_arr(path) for path in (vid1_path, vid2_path)]

    score = compare_vid(*angle_arrays, senstivity)
    print("Score:", score)
    return score

We now use the videos in **Dataset 1** to test our algorithms for angle invariance.

In [None]:
# Dataset 1: front-view video vs. left-view video
combined_compare(vidf_path, vidl_path)

Ignoring empty camera frame.
Ignoring empty camera frame.
Score: 76.83928496785356


76.83928496785356

In [None]:
# Dataset 1: left-view video vs. right-view video
combined_compare(vidl_path, vidr_path)

Ignoring empty camera frame.
Ignoring empty camera frame.
Score: 74.22925426625005


74.22925426625005

In [None]:
# Dataset 1: right-view video vs. front-view video
combined_compare(vidr_path, vidf_path)

Ignoring empty camera frame.
Ignoring empty camera frame.
Score: 75.6500236414682


75.6500236414682

In [None]:
%%time