In [5]:
import numpy as np
import pandas as pd
import tensorflow as tf
import json
import cv2
import sklearn
from tensorflow.keras import regularizers
from tensorflow.keras import layers
import os

In [2]:
# Number of gesture classes predicted by the models (one per LABEL_DICT entry).
GESTURE_TYPES = 11

# Raw gesture label ids as they appear in the data files, in class-index order.
_RAW_GESTURE_LABELS = (21, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33)

# Raw label id -> contiguous class index (0 .. len - 1).
LABEL_DICT = dict(zip(_RAW_GESTURE_LABELS, range(len(_RAW_GESTURE_LABELS))))

# Runs of keypoint indices in which each keypoint connects to the next one.
_KEYPOINT_CHAINS = (
    (0, 1, 2, 3, 4),
    (5, 6, 7, 8),
    (9, 10, 11, 12),
    (13, 14, 15, 16),
    (17, 18, 19, 20),
)

# Extra connections that link keypoint 0 and the first keypoint of each chain.
_CROSS_LINKS = [(0, 5), (5, 9), (9, 13), (13, 17), (0, 17)]

# (start, end) keypoint index pairs: the connections drawn by
# visualize_keypoint_sequences and used for the connection features.
# 16 chain links + 5 cross links = 21 connections.
CONNECTION_LABELS = [
    pair for chain in _KEYPOINT_CHAINS for pair in zip(chain, chain[1:])
] + _CROSS_LINKS

In [8]:
def load_keypoint_sequences(data_path='gesture_recognition/Fall 2020/data', sequence_length=5, label_dict=None):
    """Load sliding windows of keypoint frames and their class labels from disk.

    The directory layout walked is ``data_path/<subject>/<scene>/<group file>``:
    only sub-directories whose name starts with ``Subject``/``subject`` and,
    inside them, ``Scene``/``scene`` are visited. Every regular file in a scene
    directory is parsed as JSON and is expected to hold a list of gestures,
    each a dict with a ``'label'`` and a ``'keypoints'`` list (one entry per
    frame). Frames with no keypoints (empty/falsy entries) are replaced by a
    21 x 3 block of NaN so that every frame has the same shape.

    Args:
        data_path: Root directory containing the subject folders.
        sequence_length: Number of consecutive frames per window (>= 1).
            Gestures shorter than this contribute no windows.
        label_dict: Mapping from the raw ``'label'`` value to a class index.
            Defaults to the notebook-level ``LABEL_DICT``.

    Returns:
        ``(keypoints, labels)`` as NumPy arrays. ``keypoints`` has one row per
        window, i.e. shape ``(num_windows, sequence_length, 21, 3)`` when every
        non-empty frame holds 21 three-dimensional keypoints; ``labels`` has
        shape ``(num_windows,)``.

    Raises:
        ValueError: If ``sequence_length`` is smaller than 1.
        KeyError: If a gesture that yields at least one window has a label
            missing from ``label_dict``.
    """
    if sequence_length < 1:
        raise ValueError("sequence_length must be at least 1")
    if label_dict is None:
        label_dict = LABEL_DICT

    keypoints = []
    labels = []
    # Directory listings come back in arbitrary order; sorting makes the sample
    # order (and therefore any order-based train/validation split) reproducible.
    for subjectName in sorted(os.listdir(data_path)):
        if not subjectName.startswith(("Subject", "subject")):
            continue
        subject_dir = os.path.join(data_path, subjectName)
        for sceneName in sorted(os.listdir(subject_dir)):
            # Both spellings are tested against the *scene* name.
            if not sceneName.startswith(("Scene", "scene")):
                continue
            with os.scandir(os.path.join(subject_dir, sceneName)) as entries:
                group_files = sorted(
                    (entry for entry in entries if entry.is_file()),
                    key=lambda entry: entry.name,
                )
            for groupEntry in group_files:
                with open(groupEntry, 'r') as f:
                    groupData = json.load(f)
                for gesture in groupData:
                    # Substitute an all-NaN frame wherever no keypoints were recorded.
                    frames = [
                        frame if frame else [[np.nan, np.nan, np.nan] for _ in range(21)]
                        for frame in gesture['keypoints']
                    ]
                    for start_frame in range(len(frames) - sequence_length + 1):
                        keypoints.append(frames[start_frame: start_frame + sequence_length])
                        labels.append(label_dict[gesture['label']])
    return np.array(keypoints), np.array(labels)
# Load every keypoint window from the default data directory and print the
# array shapes as a quick sanity check.
keypoints, labels = load_keypoint_sequences()
print(keypoints.shape, labels.shape)

(23346, 5, 21, 3) (23346,)


In [None]:
def generate_connection_angles_from_sequences(keypoints, keypoints_num=21, keypoints_dimensions=3, connection_labels=None):
    """Compute the angle between every pair of connections in every frame.

    A connection is the vector from keypoint ``start`` to keypoint ``end`` for
    each ``(start, end)`` pair in ``connection_labels``. For every unordered
    pair of distinct connections the angle (in radians, 0..pi) between the two
    vectors is returned.

    Args:
        keypoints: Array of shape ``(..., num_keypoints, dims)``; in this
            notebook ``(num_windows, sequence_length, 21, 3)``.
        keypoints_num: Kept for backward compatibility; no longer used. The
            number of pairs is derived from ``connection_labels``.
        keypoints_dimensions: Kept for backward compatibility; unused.
        connection_labels: Sequence of ``(start, end)`` keypoint index pairs.
            Defaults to the notebook-level ``CONNECTION_LABELS``.

    Returns:
        Array of shape ``(..., C * (C - 1) / 2)`` where ``C`` is the number of
        connections, ordered like ``np.triu_indices(C, k=1)``. Entries are NaN
        where a keypoint is NaN or a connection has zero length.
    """
    if connection_labels is None:
        connection_labels = CONNECTION_LABELS
    keypoints = np.asarray(keypoints, dtype=float)

    starts = [start for start, _ in connection_labels]
    ends = [end for _, end in connection_labels]
    connections = keypoints[..., ends, :] - keypoints[..., starts, :]
    lengths = np.linalg.norm(connections, axis=-1)

    # Index the unique pairs directly instead of materialising the full
    # C x C x dims tensors and discarding more than half of them afterwards.
    first, second = np.triu_indices(len(starts), k=1)
    # Missing frames are NaN and degenerate connections divide by zero; both
    # are expected here and simply yield NaN angles.
    with np.errstate(divide='ignore', invalid='ignore'):
        dots = (connections[..., first, :] * connections[..., second, :]).sum(axis=-1)
        cosines = dots / lengths[..., first] / lengths[..., second]
        # Rounding can push the cosine of (anti)parallel connections slightly
        # outside [-1, 1], which arccos would turn into NaN.
        return np.arccos(np.clip(cosines, -1.0, 1.0))
# Angle features for every frame of every loaded window; the shape is printed
# as a sanity check.
angles = generate_connection_angles_from_sequences(keypoints)
print(angles.shape)

In [None]:
def generate_joint_distances_from_sequences(keypoints, keypoints_num=21, keypoints_dimensions=3):
    """Compute the Euclidean distance between every pair of joints per frame.

    NOTE(review): the previous body was a copy of the connection-angle code and
    measured ``|c_i - c_j|`` between *connection vectors*, which does not match
    the function name or the ``keypoints_num`` parameter. It now measures the
    distance between the keypoints themselves. With 21 keypoints the output
    shape is unchanged (210 pairs), but the feature values differ, so models
    must be retrained.

    Args:
        keypoints: Array of shape ``(..., num_keypoints, dims)``; in this
            notebook ``(num_windows, sequence_length, 21, 3)``.
        keypoints_num: Kept for backward compatibility; the keypoint count is
            read from ``keypoints.shape[-2]``.
        keypoints_dimensions: Kept for backward compatibility; unused.

    Returns:
        Array of shape ``(..., K * (K - 1) / 2)`` where ``K`` is the number of
        keypoints, ordered like ``np.triu_indices(K, k=1)``. Distances that
        involve a NaN keypoint are NaN.
    """
    keypoints = np.asarray(keypoints, dtype=float)
    first, second = np.triu_indices(keypoints.shape[-2], k=1)
    offsets = keypoints[..., first, :] - keypoints[..., second, :]
    return np.linalg.norm(offsets, axis=-1)
# Distance features for every frame of every loaded window; the shape is
# printed as a sanity check.
distances = generate_joint_distances_from_sequences(keypoints)
print(distances.shape)

In [None]:
def visualize_keypoint_sequences(keypoints):
    """Play the keypoint windows back frame by frame in an OpenCV window.

    Each frame is drawn on a blank 480 x 640 canvas: one circle per keypoint
    and one line per ``CONNECTION_LABELS`` pair, using the x and y coordinates
    as pixel positions (z is ignored). Keypoints and connections with a NaN
    coordinate are skipped. Pressing Esc (key code 27) stops the playback.

    Args:
        keypoints: Iterable of sequences of frames, each frame an iterable of
            ``(x, y, z)`` keypoints, e.g. an array of shape
            ``(num_windows, sequence_length, 21, 3)``.
    """
    try:
        for sequence in keypoints:
            for points in sequence:
                # 8-bit canvas so the 0..255 colour values below are literal.
                img = np.zeros((480, 640, 3), dtype=np.uint8)
                for point in points:
                    x, y, z = point
                    if np.isnan(x) or np.isnan(y):
                        continue
                    cv2.circle(img, (int(x), int(y)), 4, (255, 0, 0), 2)
                for start, end in CONNECTION_LABELS:
                    x0, y0, z0 = points[start]
                    x1, y1, z1 = points[end]
                    # Either endpoint may be missing; int(nan) would raise.
                    if np.isnan([x0, y0, x1, y1]).any():
                        continue
                    cv2.line(img, (int(x0), int(y0)), (int(x1), int(y1)), (0, 255, 0), 2)
                cv2.imshow("Key Points", img)
                if cv2.waitKey(1) == 27:
                    return
    finally:
        # Close the window on Esc, on errors, and when the data runs out.
        cv2.destroyAllWindows()
        cv2.waitKey(1) # cannot close window on macOS without this line
# visualize_keypoint_sequences(keypoints)

In [45]:
def process_sequence_features(keypoints, angles, distances):
    """Merge the three feature groups and standardize them column-wise.

    Every input is flattened to one row per frame, the rows are concatenated
    side by side, and each resulting column is shifted to zero mean and scaled
    to unit standard deviation (pandas defaults: NaN entries are skipped and
    the sample standard deviation is used). Values that end up NaN - missing
    inputs or columns without spread - are set to 0.

    Args:
        keypoints: Array whose first two axes are (window, frame).
        angles: Array with the same leading (window, frame) axes.
        distances: Array with the same leading (window, frame) axes.

    Returns:
        Array of shape ``(num_windows, sequence_length, num_features)``.
    """
    n_windows, n_frames = keypoints.shape[0], keypoints.shape[1]
    n_rows = n_windows * n_frames

    # One row per frame, one column per scalar feature.
    per_frame = np.concatenate(
        tuple(group.reshape(n_rows, -1) for group in (keypoints, angles, distances)),
        -1,
    )

    table = pd.DataFrame(per_frame)
    centered = table - table.mean()
    standardized = (centered / table.std()).fillna(0)
    return standardized.to_numpy().reshape(n_windows, n_frames, -1)
# Standardized per-frame feature tensor used as model input in the cells below.
X = process_sequence_features(keypoints, angles, distances)
#normalizer = tf.keras.layers.experimental.preprocessing.Normalization()
#normalizer.adapt(X)
# X_train, X_val, y_train, y_val = train_test_split(processed_keypoints, labels, test_size=0.2, random_state=0)
print(X.shape)

(18250, 5, 483)


## RNN Classification

In [46]:
# Baseline: one LSTM with a unit per gesture class. Its raw outputs
# (activation=None) act as logits for the softmax that follows.
# Frames without keypoints are all-zero rows after process_sequence_features
# (NaN -> 0), which is what Masking() looks for with its default mask value.
model_lstm = tf.keras.Sequential([
    layers.Masking(),
    layers.LSTM(GESTURE_TYPES, activation=None),
    layers.Activation('softmax'),
])
model_lstm.compile(
    optimizer=tf.optimizers.Adam(),
    loss=tf.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)
# The last 20% of the samples are held out for validation.
history = model_lstm.fit(X, labels, epochs=20, validation_split=0.2)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Same layout as the LSTM baseline with the recurrent cell swapped for a GRU:
# one unit per gesture class, raw outputs fed to a softmax.
model_gru = tf.keras.Sequential([
    layers.Masking(),
    layers.GRU(GESTURE_TYPES, activation=None),
    layers.Activation('softmax'),
])
model_gru.compile(
    optimizer=tf.optimizers.Adam(),
    loss=tf.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)
history = model_gru.fit(X, labels, epochs=20, validation_split=0.2)

GRU cells work just as well as LSTM cells and are cheaper in theory.

In [22]:
# Bidirectional variant of the LSTM baseline.
# Bidirectional concatenates the forward and backward outputs by default,
# which would give the softmax 2 * GESTURE_TYPES outputs for GESTURE_TYPES
# classes. Summing the two directions keeps exactly one logit per class.
model_bilstm = tf.keras.Sequential([
    layers.Masking(),
    layers.Bidirectional(
        layers.LSTM(GESTURE_TYPES, activation=None),
        merge_mode='sum',
    ),
    layers.Activation('softmax'),
])
model_bilstm.compile(
    optimizer=tf.optimizers.Adam(),
    loss=tf.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)
history = model_bilstm.fit(X, labels, epochs=20, validation_split=0.2)

Train on 18676 samples, validate on 4670 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


Making the LSTM bidirectional does not improve the accuracy by much.

In [24]:
# Same layout as the LSTM baseline with a plain (ungated) recurrent layer:
# one unit per gesture class, raw outputs fed to a softmax.
model_rnn = tf.keras.Sequential([
    layers.Masking(),
    layers.SimpleRNN(GESTURE_TYPES, activation=None),
    layers.Activation('softmax'),
])
model_rnn.compile(
    optimizer=tf.optimizers.Adam(),
    loss=tf.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)
history = model_rnn.fit(X, labels, epochs=20, validation_split=0.2)

Train on 18676 samples, validate on 4670 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


A simple RNN gives slightly worse performance.

In [25]:
# Wider model: a 128-unit LSTM (default activations) summarises the window and
# a dense layer maps that summary to one logit per gesture class.
model_lstm4 = tf.keras.Sequential([
    layers.Masking(),
    layers.LSTM(128),
    layers.Dense(GESTURE_TYPES),
    layers.Activation('softmax'),
])
model_lstm4.compile(
    optimizer=tf.optimizers.Adam(),
    loss=tf.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)
history = model_lstm4.fit(X, labels, epochs=20, validation_split=0.2)

Train on 18676 samples, validate on 4670 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


Increasing dense layer depth may improve performance.

In [12]:
# Two stacked 128-unit LSTMs: the first returns its output at every timestep
# so the second can consume the whole sequence; a dense layer then produces
# one logit per gesture class.
model_lstm5 = tf.keras.Sequential([
    layers.Masking(),
    layers.LSTM(128, return_sequences=True),
    layers.LSTM(128),
    layers.Dense(GESTURE_TYPES),
    layers.Activation('softmax'),
])
model_lstm5.compile(
    optimizer=tf.optimizers.Adam(),
    loss=tf.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)
history = model_lstm5.fit(X, labels, epochs=20, validation_split=0.2)

Train on 18676 samples, validate on 4670 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


Adding another layer of LSTM does not improve performance.

## Convolutional + Recurrent
[Núñez et al. - Convolutional Neural Networks and Long Short-Term Memory for skeleton-based human activity and hand gesture recognition](https://www.sciencedirect.com/science/article/pii/S0031320317304405)

In [16]:
# Convolutional + recurrent model: two 1-D convolutions (20 filters, kernel
# size 3) over the time axis, then a 128-unit LSTM and a softmax classifier.
# The LSTM returns only its final output so the model emits one prediction per
# window, matching the per-window labels. Returning the full sequence only
# lined up with the labels because the two unpadded kernel-3 convolutions
# shrink a 5-frame window to a single timestep.
# NOTE(review): Conv1D is presumably not mask-aware, so the Masking layer may
# have no effect in this model - confirm against the Keras masking guide.
model_cnnlstm2 = tf.keras.Sequential([
    layers.Masking(),
    layers.Conv1D(20, 3, activation='relu'),
    layers.Conv1D(20, 3, activation='relu'),
    layers.LSTM(128),
    layers.Dense(GESTURE_TYPES, activation='softmax'),
])
model_cnnlstm2.compile(
    optimizer=tf.optimizers.Adam(),
    loss=tf.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)
history = model_cnnlstm2.fit(X, labels, epochs=20, validation_split=0.2)

Train on 18676 samples, validate on 4670 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
