In [4]:
# !wget "https://www.dropbox.com/scl/fi/acun1rm43ge7ljr5qo6p2/wlasl.zip?rlkey=4o90zt8bhip49m7nows9gcsc8&dl=0"
# !unzip -qq wlasl* -d dw-data
# !mv dw-data/data data
# !rm -r dw-data
# !rm wlasl.zip*
# !rm -r sample_data
# !git clone -b feature/mediapipe https://github.com/sceredi/VAR-wlals-recognition.git ./code
# !mv ./code/* ./
# !rm -r code
# !pip install mediapipe

In [5]:
import pandas as pd
import numpy as np

from handcrafted.app.dataset.dataset import Dataset 
from wlasl_mediapipe.app.mp.mp_video import MediapipeVideo

In [6]:
def split_data(word_number: int):
  dataset = Dataset('data/WLASL_v0.3.json')
  glosses = pd.read_csv("data/wlasl_class_list.txt", sep="\t", header=None)[1].tolist()
  glosses = glosses[:word_number]
  train_videos = dataset.get_videos(
    lambda video: (video.split == "train") and video.gloss in glosses
  )
  val_videos = dataset.get_videos(
    lambda video: (video.split == "val") and video.gloss in glosses
  )
  test_videos = dataset.get_videos(
    lambda video: (video.split == "test") and video.gloss in glosses
  )
  train_videos = [MediapipeVideo(video, plot=False) for video in train_videos]
  val_videos = [MediapipeVideo(video, plot=False) for video in val_videos]
  test_videos = [MediapipeVideo(video, plot=False) for video in test_videos]
  return train_videos, val_videos, test_videos, glosses

In [7]:
word_number = 20
train_videos, val_videos, test_videos, glosses = split_data(word_number)

In [8]:
Y_train = [glosses.index(video.get_base_video().gloss) for video in train_videos]
Y_val = [glosses.index(video.get_base_video().gloss) for video in val_videos]
Y_test = [glosses.index(video.get_base_video().gloss) for video in test_videos]

In [9]:
X_lh_train = [video.sign_model.left_hand_list for video in train_videos]
X_rh_train = [video.sign_model.right_hand_list for video in train_videos] 
X_pose_train = [video.pose_model.pose_list for video in train_videos]
X_face_train = [video.face_model.face_list for video in train_videos]

X_lh_val = [video.sign_model.left_hand_list for video in val_videos]
X_rh_val = [video.sign_model.right_hand_list for video in val_videos]
X_pose_val = [video.pose_model.pose_list for video in val_videos]
X_face_val = [video.face_model.face_list for video in val_videos]

X_lh_test = [video.sign_model.left_hand_list for video in test_videos]
X_rh_test = [video.sign_model.right_hand_list for video in test_videos]
X_pose_test = [video.pose_model.pose_list for video in test_videos]
X_face_test = [video.face_model.face_list for video in test_videos]

In [10]:
print(np.array(X_lh_train[0]).shape)

(74, 63)


# Model definition
## Libraries useful for ml

In [21]:

from tensorflow import keras
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator

### Defining the inputs of the neural network

In [12]:
lh_input = Input(shape=(None, X_lh_train[0].shape[1]), name="lh_input")
rh_input = Input(shape=(None, X_rh_train[0].shape[1]), name="rh_input")
pose_input = Input(shape=(None, X_pose_train[0].shape[1]), name="pose_input")
face_input = Input(shape=(None, X_face_train[0].shape[1]), name="face_input")

### Defining the Long Short Term Memory layer

In [13]:
lh_lstm = LSTM(256, return_sequences=True, return_state=True, name="lh_lstm")
rh_lstm = LSTM(256, return_sequences=True, return_state=True, name="rh_lstm")
pose_lstm = LSTM(256, return_sequences=True, return_state=True, name="pose_lstm")
face_lstm = LSTM(256, return_sequences=True, return_state=True, name="face_lstm")

### A layer to concatenate the output of these

In [14]:
concatenated = Concatenate()([lh_lstm(lh_input)[0], rh_lstm(rh_input)[0], pose_lstm(pose_input)[0], face_lstm(face_input)[0]])


### The hidden layers

In [15]:
last_hidden_layer = Dense(64, activation="relu")(concatenated)

### The output layer

In [16]:
output_layer = Dense(word_number, activation="softmax")(last_hidden_layer)

### Model creation

In [17]:
model = Model(inputs=[lh_input, rh_input, pose_input, face_input], outputs=output_layer)
model.compile(optimizer=Adam(learning_rate=0.001), loss="mse", metrics=["accuracy"])
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 lh_input (InputLayer)       [(None, None, 63)]           0         []                            
                                                                                                  
 rh_input (InputLayer)       [(None, None, 63)]           0         []                            
                                                                                                  
 pose_input (InputLayer)     [(None, None, 99)]           0         []                            
                                                                                                  
 face_input (InputLayer)     [(None, None, 1404)]         0         []                            
                                                                                              

In [18]:
keras.utils.plot_model(model, "multi_input_and_output_model.png", show_shapes=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


## Model fitting

In [19]:
print(len(Y_train))
print(X_lh_train[1].shape)
print(len(X_rh_train))
print(len(X_pose_train))
print(len(X_face_train))

166
(29, 63)
166
166
166


In [22]:
X_lh_train_ragged = [tf.RaggedTensor.from_tensor(tf.convert_to_tensor(x)) for x in X_lh_train]
X_rh_train_ragged = [tf.RaggedTensor.from_tensor(tf.convert_to_tensor(x)) for x in X_rh_train]
X_pose_train_ragged = [tf.RaggedTensor.from_tensor(tf.convert_to_tensor(x)) for x in X_pose_train]
X_face_train_ragged = [tf.RaggedTensor.from_tensor(tf.convert_to_tensor(x)) for x in X_face_train]

X_lh_val_ragged = [tf.RaggedTensor.from_tensor(tf.convert_to_tensor(x)) for x in X_lh_val]
X_rh_val_ragged = [tf.RaggedTensor.from_tensor(tf.convert_to_tensor(x)) for x in X_rh_val]
X_pose_val_ragged = [tf.RaggedTensor.from_tensor(tf.convert_to_tensor(x)) for x in X_pose_val]
X_face_val_ragged = [tf.RaggedTensor.from_tensor(tf.convert_to_tensor(x)) for x in X_face_val]

X_lh_test_ragged = [tf.RaggedTensor.from_tensor(tf.convert_to_tensor(x)) for x in X_lh_test]
X_rh_test_ragged = [tf.RaggedTensor.from_tensor(tf.convert_to_tensor(x)) for x in X_rh_test]
X_pose_test_ragged = [tf.RaggedTensor.from_tensor(tf.convert_to_tensor(x)) for x in X_pose_test]
X_face_test_ragged = [tf.RaggedTensor.from_tensor(tf.convert_to_tensor(x)) for x in X_face_test]

In [27]:
n_epochs = 100
patience = 10
batch_size = 32

early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True)
model.fit(
    [X_lh_train_ragged, X_rh_train_ragged, X_pose_train_ragged, X_face_train_ragged],
    Y_train,
    validation_data=([X_lh_val_ragged, X_rh_val_ragged, X_pose_val_ragged, X_face_val_ragged], Y_val),  
    epochs=n_epochs,
    callbacks=[early_stop],
    batch_size=batch_size
)

ValueError: Failed to find data adapter that can handle input: (<class 'list'> containing values of types {'(<class \'list\'> containing values of types {"<class \'tensorflow.python.ops.ragged.ragged_tensor.RaggedTensor\'>"})'}), (<class 'list'> containing values of types {"<class 'int'>"})