In [6]:
# !wget "https://www.dropbox.com/scl/fi/acun1rm43ge7ljr5qo6p2/wlasl.zip?rlkey=4o90zt8bhip49m7nows9gcsc8&dl=0"
# !unzip -qq wlasl* -d dw-data
# !mv dw-data/data data
# !rm -r dw-data
# !rm wlasl.zip*
# !rm -r sample_data
# !git clone -b feature/mediapipe https://github.com/sceredi/VAR-wlals-recognition.git ./code
# !mv ./code/* ./
# !rm -r code
# !pip install mediapipe

In [7]:
import pandas as pd
import numpy as np

from handcrafted.app.dataset.dataset import Dataset 
from wlasl_mediapipe.app.mp.mp_video import MediapipeVideo

In [8]:
def split_data(word_number: int):
  """Load the WLASL metadata and split the selected glosses into train/val/test.

  Args:
    word_number: How many glosses to keep, taken from the top of column 1 of
      the tab-separated ``data/wlasl_class_list.txt``.

  Returns:
    A ``(train_videos, val_videos, test_videos, glosses)`` tuple. The first
    three items are lists of ``MediapipeVideo`` wrappers (created with
    ``plot=False``) for the videos whose ``split`` matches and whose gloss is
    selected; ``glosses`` is the list of selected glosses in class-list order.
  """
  dataset = Dataset('data/WLASL_v0.3.json')
  glosses = pd.read_csv("data/wlasl_class_list.txt", sep="\t", header=None)[1].tolist()
  glosses = glosses[:word_number]
  # The predicate runs once per video and per split; a set keeps each
  # membership test O(1) instead of scanning the gloss list every time.
  selected_glosses = set(glosses)

  def videos_in_split(split_name):
    # Raw dataset videos that belong to `split_name` and to a selected gloss.
    return dataset.get_videos(
      lambda video: (video.split == split_name) and video.gloss in selected_glosses
    )

  # Filter all three splits first, then wrap them, in train/val/test order.
  raw_splits = [videos_in_split(name) for name in ("train", "val", "test")]
  train_videos, val_videos, test_videos = (
    [MediapipeVideo(video, plot=False) for video in videos]
    for videos in raw_splits
  )
  return train_videos, val_videos, test_videos, glosses

In [9]:
# Number of glosses (classes) to keep; the same value is later passed as the
# model's output size.
word_number = 20
# Load the wrapped videos for each split along with the selected gloss list.
train_videos, val_videos, test_videos, glosses = split_data(word_number)

In [10]:
def _gloss_indices(videos):
    """Return, for each video, the position of its gloss in `glosses`."""
    return [glosses.index(item.get_base_video().gloss) for item in videos]


# Integer class labels for each split.
Y_train = _gloss_indices(train_videos)
Y_val = _gloss_indices(val_videos)
Y_test = _gloss_indices(test_videos)

In [11]:
# X_lh_train = [video.sign_model.left_hand_list for video in train_videos]
# X_rh_train = [video.sign_model.right_hand_list for video in train_videos] 
# X_pose_train = [video.pose_model.pose_list for video in train_videos]
# X_face_train = [video.face_model.face_list for video in train_videos]

# X_lh_val = [video.sign_model.left_hand_list for video in val_videos]
# X_rh_val = [video.sign_model.right_hand_list for video in val_videos]
# X_pose_val = [video.pose_model.pose_list for video in val_videos]
# X_face_val = [video.face_model.face_list for video in val_videos]

# X_lh_test = [video.sign_model.left_hand_list for video in test_videos]
# X_rh_test = [video.sign_model.right_hand_list for video in test_videos]
# X_pose_test = [video.pose_model.pose_list for video in test_videos]
# X_face_test = [video.face_model.face_list for video in test_videos]

In [12]:
def concatenate_data(video_list):
    """Build one flat feature vector per frame for every video.

    For each frame index of a video (as many as there are entries in
    ``sign_model.left_hand_list``), the left-hand, right-hand, pose and face
    entries at that index are joined, in that order, with ``np.concatenate``.

    Returns:
        A list with one entry per video; each entry is a list with one
        concatenated array per frame.
    """
    def frame_vector(video, idx):
        # Order matters: left hand, right hand, pose, face.
        parts = (
            video.sign_model.left_hand_list[idx],
            video.sign_model.right_hand_list[idx],
            video.pose_model.pose_list[idx],
            video.face_model.face_list[idx],
        )
        return np.concatenate(parts)

    return [
        [frame_vector(video, idx) for idx in range(len(video.sign_model.left_hand_list))]
        for video in video_list
    ]

# Build the per-frame feature sequences for each split, in train/val/test order.
X_train_concatenated, X_val_concatenated, X_test_concatenated = (
    concatenate_data(split_videos)
    for split_videos in (train_videos, val_videos, test_videos)
)

In [13]:
# Sanity check: shape of the feature vector for the first frame of the first training video.
print(X_train_concatenated[0][0].shape)

(1629,)


# Model definition
## Machine-learning libraries

In [14]:

from tensorflow import keras
import tensorflow as tf

from tensorflow.keras.utils import to_categorical
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam

## Preparing the data

In [15]:
def _to_ragged(sequences):
    """Pack a nested list of per-video frame vectors into a float32 RaggedTensor."""
    return tf.ragged.constant(sequences, dtype=tf.float32)


# A ragged representation lets each video keep its own number of frames.
X_train_ragged = _to_ragged(X_train_concatenated)
X_val_ragged = _to_ragged(X_val_concatenated)
X_test_ragged = _to_ragged(X_test_concatenated)

In [16]:
# Pin the one-hot width to the number of classes. Without `num_classes`,
# `to_categorical` infers the width from the largest label present, so a split
# that happens to lack the highest class index would get a narrower matrix
# than the model's `word_number`-wide output.
Y_train_one_hot = to_categorical(Y_train, num_classes=word_number)
Y_val_one_hot = to_categorical(Y_val, num_classes=word_number)
Y_test_one_hot = to_categorical(Y_test, num_classes=word_number)

### Defining the model

In [17]:
def build_rnn(input_shape, gru_units, output_count, neuron_count_per_hidden_layer=(128, 128), activation='relu'):
  """Build a sequential GRU classifier.

  The network is a single GRU layer, followed by one Dense layer per entry of
  ``neuron_count_per_hidden_layer``, followed by a softmax Dense output layer.

  Args:
    input_shape: Shape forwarded to the GRU layer's ``input_shape`` argument.
    gru_units: Number of units of the GRU layer.
    output_count: Number of units of the softmax output layer.
    neuron_count_per_hidden_layer: Iterable with the width of each hidden
      Dense layer, in order. The default is an immutable tuple so the default
      object can never be altered between calls.
    activation: Activation used by the hidden Dense layers.

  Returns:
    The uncompiled ``keras.Sequential`` model.
  """
  model = keras.Sequential()
  model.add(layers.GRU(units=gru_units, input_shape=input_shape))

  for n in neuron_count_per_hidden_layer:
    model.add(layers.Dense(n, activation=activation))

  model.add(layers.Dense(output_count, activation="softmax"))
  return model

### Model creation

In [18]:
# Optional inspection (kept disabled): print the feature-vector length of every
# frame of one video stored in X_train_ragged.
# video_index = 0  # Index of the video to inspect
# for frame_vector in X_train_ragged[video_index]:
#     print("Feature vector length:", len(frame_vector))
# Length of the feature vector for the first frame of the first training video.
print(len(X_train_ragged[0][0]))

1629


In [19]:
# Time dimension is left as None; the feature dimension is read from the first
# frame of the first training video.
input_shape = (None, len(X_train_ragged[0][0]))
model = build_rnn(
    input_shape=input_shape,
    gru_units=128,
    output_count=word_number,
    neuron_count_per_hidden_layer=[128, 128]
)
# The output layer is a softmax and the targets are one-hot vectors, so the
# matching objective is categorical cross-entropy rather than mean squared error.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
# model.summary()

In [20]:
# Write a diagram of the model (with layer shapes) to a PNG file.
# This needs both pydot and graphviz installed; otherwise Keras only prints an install hint.
keras.utils.plot_model(model, "multi_input_and_output_model.png", show_shapes=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


## Model fitting

In [21]:
# train_datas = tf.data.Dataset.from_tensor_slices(({"lh_input": X_lh_train_ragged, "rh_input": X_rh_train_ragged, "pose_input": X_pose_train_ragged, "face_input": X_face_train_ragged}, Y_train)).shuffle(buffer_size=100)
# val_datas = tf.data.Dataset.from_tensor_slices(({"lh_input": X_lh_val_ragged, "rh_input": X_rh_val_ragged, "pose_input": X_pose_val_ragged, "face_input": X_face_val_ragged}, Y_val)).shuffle(buffer_size=100)

In [22]:
# Training hyper-parameters.
n_epochs = 100
patience = 10
batch_size = 32

# Stop once the validation loss has not improved for `patience` epochs and
# roll the model back to the best weights seen.
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True)
# `batch_size` is passed explicitly so the setting above actually takes effect;
# the returned history is kept for later inspection of the training curves.
history = model.fit(
    X_train_ragged, Y_train_one_hot,
    validation_data=(X_val_ragged, Y_val_one_hot),
    epochs=n_epochs,
    batch_size=batch_size,
    callbacks=[early_stop],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100

KeyboardInterrupt: 