## Variable length sequence LSTM full script

In this script, the whole process of loading data, training and testing the LSTM is written.
The script is as generalized as it can get, since the variables of the model change based on the underlying dataset.
This has been done to allow for the addition of new actions and data without having to change the code.
Since it is a variable length sequence LSTM, ragged tensors are used.
The predicting logic is a fixed length LSTM with 8 frame length. Being it real-time, it is not possible to predict variable length sequence, since one does not now when the sequence is completed. A challenge will be finding the sweet spot in terms of sequence length (for now it is 8 frames).

In [1]:
#importing needed libraries

import numpy as np
import tensorflow as tf
import cv2
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp
from sklearn.model_selection import train_test_split
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, InputLayer
from tensorflow.keras.callbacks import TensorBoard
from scipy.spatial.transform import Rotation as R

In [2]:
#setting up the mediapipe model

mp_drawing = mp.solutions.drawing_utils
mp_holistic = mp.solutions.holistic

#defining a few functions to detect, draw and extract keypoints

def mediapipe_detection(image, model):
    #this function takes in the image and the model and returns the prediction results
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image.flags.writeable = False
    results = model.process(image)
    image.flags.writeable = True
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    return image, results

def draw_landmarks(image, results, drawing_spec_circle, drawing_spec_line):
    #this function takes in the image and results and draws mediapipe landmarks on the picture
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS, landmark_drawing_spec=drawing_spec_circle, connection_drawing_spec=drawing_spec_line)
    mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_TESSELATION, landmark_drawing_spec=drawing_spec_circle, connection_drawing_spec=drawing_spec_line)
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS, landmark_drawing_spec=drawing_spec_circle, connection_drawing_spec=drawing_spec_line)
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS, landmark_drawing_spec=drawing_spec_circle, connection_drawing_spec=drawing_spec_line)

def extract_keypoints(results):
    #this function takes in the prediction results and returns the array with the extracted keypoints to be saved
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
    face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    return np.concatenate([pose, face, lh, rh])

def visualize_probabilities(res, actions, input_frame, colors):
    #this function takes in the action probabilities and actions, and draws the coloured probability rectangles on the picture
    output_frame = input_frame.copy()
    for num, prob in enumerate(res):
        cv2.rectangle(output_frame, (0,60+num*40), (int(prob*200), 90+num*40), colors[num], -1)
        cv2.putText(output_frame, actions[num], (0, 85+num*40), cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2, cv2.LINE_AA)
    return output_frame

In [3]:
#preparing the action categories and paths

actions = []

DATA_PATH = os.path.join(os.getcwd(), "DATA")

for action in os.listdir(DATA_PATH):
    actions.append(action)
    
actions = np.array(actions)

label_map = {label:num for num, label in enumerate(actions)}

labels, temp_points = [], []

In [4]:
#a temporary solution to the initializing ragged tensors problem is to initialize with a standard value and then deleting it at
#the end. I didn't find any better solution since documentation on ragged tensors is not so clean
temporary_ragged = tf.ragged.constant([[[3, 1, 4, 1], [5, 9, 2], [6], []]], tf.double)

#for every action, loops and loads data for training
for action in actions: 
    for num_sequence in os.listdir(os.path.join(DATA_PATH, action)):
        temp_points = []
        for point in os.listdir(os.path.join(DATA_PATH, action, num_sequence)):
            res = np.load(os.path.join(DATA_PATH, action, num_sequence, point), allow_pickle = True)
            temp_points.append(res)       
            
        temporary_ragged = tf.concat([temporary_ragged, tf.expand_dims(np.array(temp_points), axis = 0)], axis = 0)
        labels.append(label_map[action])

#skips the first tensor, that is the one we used to intialize the variable
dataset = temporary_ragged[1:]

In [5]:
#preparing train and test sets (not really the cleanest way, ragged tensors don't seem too friendly to handle).
#In the future I should add data augmentation as well, but it will probably be not clean to do that with ragged tensors

y = to_categorical(labels).astype(int)
train_ind = int(dataset.shape[0]*0.9)
X_train = dataset[:train_ind]
X_test = dataset[train_ind:]
y_train = y[:train_ind]
y_test = y[train_ind:]
print(y.shape)

(989, 7)


In [6]:
#initializing the Keras model. Different models have been tested, for now with the dataset we have this has performed the best
model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=[None, 1662], ragged=True),
    tf.keras.layers.LSTM(64, activation = 'tanh', dropout=0.2, return_sequences = True),
    tf.keras.layers.LSTM(128, activation = 'tanh', dropout=0.2, return_sequences = True),
    tf.keras.layers.LSTM(64, activation = 'tanh', dropout=0.2),
    tf.keras.layers.Dense(64, activation = 'relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(32, activation = 'relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(10, activation = 'relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(len(actions), activation = 'softmax'),
])

In [7]:
#defining compiler and callback to stop training and 97% accuracy
model.compile(optimizer = 'Adam', loss = 'categorical_crossentropy', metrics = ['categorical_accuracy'])

ACCURACY_THRESHOLD = 0.97

class myCallback(tf.keras.callbacks.Callback): 
    def on_epoch_end(self, epoch, logs={}): 
        if(logs.get('categorical_accuracy') > ACCURACY_THRESHOLD):   
            print("\nReached %2.2f%% accuracy, so stopping training!!" %(ACCURACY_THRESHOLD*100))   
            self.model.stop_training = True

In [8]:
callb = myCallback()

#training the model
model.fit(X_train, y_train, epochs = 4000, callbacks = callb)

Epoch 1/4000
Epoch 2/4000
Epoch 3/4000
Epoch 4/4000
Epoch 5/4000
Epoch 6/4000
Epoch 7/4000
Epoch 8/4000
Epoch 9/4000
Epoch 10/4000
Epoch 11/4000
Epoch 12/4000
Epoch 13/4000
Epoch 14/4000
Epoch 15/4000
Epoch 16/4000
Epoch 17/4000
Epoch 18/4000
Epoch 19/4000
Epoch 20/4000
Epoch 21/4000
Epoch 22/4000
Epoch 23/4000
Epoch 24/4000
Epoch 25/4000
Epoch 26/4000
Epoch 27/4000
Epoch 28/4000
Epoch 29/4000
Epoch 30/4000
Epoch 31/4000
Epoch 32/4000
Epoch 33/4000
Epoch 34/4000
Epoch 35/4000
Epoch 36/4000
Epoch 37/4000
Epoch 38/4000
Epoch 39/4000
Epoch 40/4000
Epoch 41/4000
Epoch 42/4000
Epoch 43/4000
Epoch 44/4000
Epoch 45/4000
Epoch 46/4000
Epoch 47/4000
Epoch 48/4000
Epoch 49/4000
Epoch 50/4000
Epoch 51/4000
Epoch 52/4000
Epoch 53/4000
Epoch 54/4000
Epoch 55/4000
Epoch 56/4000
Epoch 57/4000
Epoch 58/4000
Epoch 59/4000
Epoch 60/4000
Epoch 61/4000
Epoch 62/4000
Epoch 63/4000
Epoch 64/4000
Epoch 65/4000
Epoch 66/4000
Epoch 67/4000
Epoch 68/4000
Epoch 69/4000
Epoch 70/4000
Epoch 71/4000


Epoch 72/4000
Epoch 73/4000
Epoch 74/4000
Epoch 75/4000
Epoch 76/4000
Epoch 77/4000
Epoch 78/4000
Epoch 79/4000
Epoch 80/4000
Epoch 81/4000
Epoch 82/4000
Epoch 83/4000
Epoch 84/4000
Epoch 85/4000
Epoch 86/4000
Epoch 87/4000
Epoch 88/4000
Epoch 89/4000
Epoch 90/4000
Epoch 91/4000
Epoch 92/4000
Epoch 93/4000
Epoch 94/4000
Epoch 95/4000
Epoch 96/4000
Epoch 97/4000
Epoch 98/4000
Epoch 99/4000
Epoch 100/4000
Epoch 101/4000
Epoch 102/4000
Epoch 103/4000
Epoch 104/4000
Epoch 105/4000
Epoch 106/4000
Epoch 107/4000
Epoch 108/4000
Epoch 109/4000
Epoch 110/4000
Epoch 111/4000
Epoch 112/4000
Epoch 113/4000
Epoch 114/4000
Epoch 115/4000
Epoch 116/4000
Epoch 117/4000
Epoch 118/4000
Epoch 119/4000
Epoch 120/4000
Epoch 121/4000
Epoch 122/4000
Epoch 123/4000
Epoch 124/4000
Epoch 125/4000
Epoch 126/4000
Epoch 127/4000
Epoch 128/4000
Epoch 129/4000
Epoch 130/4000
Epoch 131/4000
Epoch 132/4000
Epoch 133/4000
Epoch 134/4000
Epoch 135/4000
Epoch 136/4000
Epoch 137/4000
Epoch 138/4000
Epoch 139/4000
Epoch 14

Epoch 142/4000
Epoch 143/4000
Epoch 144/4000
Epoch 145/4000
Epoch 146/4000
Epoch 147/4000
Epoch 148/4000
Epoch 149/4000
Epoch 150/4000
Epoch 151/4000
Epoch 152/4000
Epoch 153/4000
Epoch 154/4000
Epoch 155/4000
Epoch 156/4000
Epoch 157/4000
Epoch 158/4000
Epoch 159/4000
Epoch 160/4000
Epoch 161/4000
Epoch 162/4000
Epoch 163/4000
Epoch 164/4000
Epoch 165/4000
Epoch 166/4000
Epoch 167/4000
Epoch 168/4000
Epoch 169/4000
Epoch 170/4000
Epoch 171/4000
Epoch 172/4000
Epoch 173/4000
Epoch 174/4000
Epoch 175/4000
Epoch 176/4000
Epoch 177/4000
Epoch 178/4000
Epoch 179/4000
Epoch 180/4000
Epoch 181/4000
Epoch 182/4000
Epoch 183/4000
Epoch 184/4000
Epoch 185/4000
Epoch 186/4000
Epoch 187/4000
Epoch 188/4000
Epoch 189/4000
Epoch 190/4000
Epoch 191/4000
Epoch 192/4000
Epoch 193/4000
Epoch 194/4000
Epoch 195/4000
Epoch 196/4000
Epoch 197/4000
Epoch 198/4000
Epoch 199/4000
Epoch 200/4000
Epoch 201/4000
Epoch 202/4000
Epoch 203/4000
Epoch 204/4000
Epoch 205/4000
Epoch 206/4000
Epoch 207/4000
Epoch 208/

Epoch 212/4000
Epoch 213/4000
Epoch 214/4000
Epoch 215/4000
Epoch 216/4000
Epoch 217/4000
Epoch 218/4000
Epoch 219/4000
Epoch 220/4000
Epoch 221/4000
Epoch 222/4000
Epoch 223/4000
Epoch 224/4000
Epoch 225/4000
Epoch 226/4000
Epoch 227/4000
Epoch 228/4000
Epoch 229/4000
Epoch 230/4000
Epoch 231/4000
Epoch 232/4000
Epoch 233/4000
Epoch 234/4000
Epoch 235/4000
Epoch 236/4000
Epoch 237/4000
Epoch 238/4000
Epoch 239/4000
Epoch 240/4000
Epoch 241/4000
Epoch 242/4000
Epoch 243/4000
Epoch 244/4000
Epoch 245/4000
Epoch 246/4000
Epoch 247/4000
Epoch 248/4000
Epoch 249/4000
Epoch 250/4000
Epoch 251/4000
Epoch 252/4000
Epoch 253/4000
Epoch 254/4000
Epoch 255/4000
Epoch 256/4000
Epoch 257/4000
Epoch 258/4000
Epoch 259/4000
Epoch 260/4000
Epoch 261/4000
Epoch 262/4000
Epoch 263/4000
Epoch 264/4000
Epoch 265/4000
Epoch 266/4000
Epoch 267/4000
Epoch 268/4000
Epoch 269/4000
Epoch 270/4000
Epoch 271/4000
Epoch 272/4000
Epoch 273/4000
Epoch 274/4000
Epoch 275/4000
Epoch 276/4000
Epoch 277/4000
Epoch 278/

Epoch 282/4000
Epoch 283/4000
Epoch 284/4000
Epoch 285/4000
Epoch 286/4000
Epoch 287/4000
Epoch 288/4000
Epoch 289/4000
Epoch 290/4000
Epoch 291/4000
Epoch 292/4000
Epoch 293/4000
Epoch 294/4000
Epoch 295/4000
Epoch 296/4000
Epoch 297/4000
Epoch 298/4000
Epoch 299/4000
Epoch 300/4000
Epoch 301/4000
Epoch 302/4000
Epoch 303/4000
Epoch 304/4000
Epoch 305/4000
Epoch 306/4000

Reached 97.00% accuracy, so stopping training!!


<tensorflow.python.keras.callbacks.History at 0x279264d3988>

In [9]:
#using confusion matrix to determine how well the model performs on the test set (false positive and negatives)
yhat = model.predict(X_test)
ytrue = np.argmax(y_test, axis = 1).tolist()
yhat = np.argmax(yhat, axis = 1).tolist()
multilabel_confusion_matrix(ytrue, yhat)

array([[[94,  5],
        [ 0,  0]],

       [[87, 12],
        [ 0,  0]],

       [[89, 10],
        [ 0,  0]],

       [[96,  3],
        [ 0,  0]],

       [[ 0,  0],
        [30, 69]]], dtype=int64)

In [10]:
#printing out the accuracy
accuracy_score(ytrue, yhat)

0.696969696969697

In [11]:
#saving the weights
model.save('model_weights\\5_actions_696(to_test).h5')

In [5]:
#loading the weights of a previously saved model
model.load_weights('model_weights\\5_actions_741(to_test).h5')

In [13]:
#testing in real-time

colors = [(0,0,0), (255,255,255), (0,255,0), (255,0,0), (0,0,255), (125, 125, 125), (125, 0, 200)]
sequence = []
sentence = []
predictions = [0]
threshold = 0.95

#choosing the camera to use
cap = cv2.VideoCapture(0)

#drawing specs for face dots
drawing_spec_circle = mp_drawing.DrawingSpec()
drawing_spec_circle.circle_radius = 1
drawing_spec_circle.thickness = 1
drawing_spec_circle.color = (0,0,255)

prev_frame_time = 0
new_frame_time = 0

#drawing specs for face connections
drawing_spec_line = mp_drawing.DrawingSpec()
drawing_spec_line.thickness = 1

#starting the loop
with mp_holistic.Holistic(min_detection_confidence=0.8) as holistic:
    while cap.isOpened():

        ret,frame = cap.read()

        #make detections
        image, results = mediapipe_detection(frame, holistic)
        
        #count and show fps
        new_frame_time = time.time()
        fps = 1/(new_frame_time-prev_frame_time)
        prev_frame_time = new_frame_time
        fps = int(fps)
        fps = str(fps)
        font = cv2.FONT_HERSHEY_SIMPLEX
        cv2.putText(image, fps, (550, 120), font, 3, (100, 255, 0), 3, cv2.LINE_AA)
        
        #drawing landmarks
        draw_landmarks(image, results, drawing_spec_circle, drawing_spec_line)
        
        #extracting keypoints and feeding them to the LSTM for prediction
        keypoints = extract_keypoints(results)
        sequence.append(keypoints)
        sequence = sequence[-10:]
        
        if len(sequence) == 10:
            res = model.predict(np.expand_dims(sequence, axis=0))[0]
            print(actions[np.argmax(res)])
            predictions.append(np.argmax(res))
            image = visualize_probabilities(res, actions, image, colors)
        
        #this would be the rendering logic if one would want to write the action. The first line implies that the action gets
        #written only if predicted for 10 consecutive frames, to give an higher stability to the model. This logic is not used
        #now, if one would like to implement that, just add 2 lines of code to display a text box with the sentence in
            if np.unique(predictions[-10:])[0] == np.argmax(res):
                if res[np.argmax(res)] > threshold:
                    if len(sentence) > 0:
                        if actions[np.argmax(res)] != sentence[-1]:
                            sentence.append(actions[np.argmax(res)])
                    else:
                        sentence.append(actions[np.argmax(res)])    

            if len(sentence) > 5:
                sentence = sentence[-5:]

        #showing image with predictions
        cv2.imshow('the feed', image)

        #close loop
        if cv2.waitKey(10) & 0XFF == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()

no_action
no_action
no_action
no_action
no_action
no_action
no_action
no_action
no_action
no_action
no_action
no_action
no_action
no_action
no_action
no_action
no_action
no_action
no_action
no_action
no_action
no_action
no_action
no_action
no_action
no_action
no_action
no_action
no_action
no_action
hand_bite
hand_bite
hand_bite
hand_bite
hand_bite
hand_bite
hand_bite
hand_bite
hand_bite
hand_bite
hand_bite
hand_bite
hand_bite
hand_bite
hand_bite
hand_bite
hand_bite
hand_bite
no_action
no_action
no_action
no_action
no_action
no_action
no_action
no_action
no_action
no_action
no_action
no_action
head_hit
head_scratch
head_hit
head_hit
head_scratch
head_scratch
head_scratch
head_scratch
head_scratch
head_scratch
head_scratch
head_scratch
head_hit
head_hit
head_hit
head_scratch
head_scratch
head_scratch
head_scratch
head_scratch
head_scratch
head_scratch
head_scratch
head_scratch
head_scratch
head_scratch
head_scratch
head_scratch
head_scratch
head_scratch
head_scratch
head_scratch
head_scr