In [1]:
# !wget "https://www.dropbox.com/scl/fi/acun1rm43ge7ljr5qo6p2/wlasl.zip?rlkey=4o90zt8bhip49m7nows9gcsc8&dl=0"
# !pip install gdown
# !conda install -y gdown
# !gdown --id "1e7nvLl2sdQO9VZDxYbWAZL9UBfUGmONv"

In [2]:
# !mv wlasl.zip* /wlasl.zip
# !unzip -qq /wlasl.zip -d dw-data
# !mv dw-data/data data
# !rm -r dw-data
# !rm wlasl.zip*
# !rm -r sample_data
# !git clone -b feature/mediapipe https://github.com/sceredi/VAR-wlals-recognition.git ./code
# !mv ./code/* ./
# !rm -r code

In [3]:
# !pip install -r requirements.txt
# !pip install mediapipe==0.10.9

In [4]:
import pandas as pd
import numpy as np

from handcrafted.app.dataset.dataset import Dataset 
from wlasl_mediapipe.app.mp.mp_video import MediapipeVideo

2024-03-30 15:28:58.687540: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-30 15:28:58.689074: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-30 15:28:58.711441: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-30 15:28:58.711493: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-30 15:28:58.712991: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to

In [5]:
def split_data(word_number: int):
  """Load the WLASL index and return the train/val/test videos for the first glosses.

  Args:
    word_number: how many glosses to keep, taken from the top of
      ``data/wlasl_class_list.txt`` (column 1 of the tab-separated file).

  Returns:
    ``(train_videos, val_videos, test_videos, glosses)`` where each ``*_videos``
    is a list of ``MediapipeVideo`` wrappers (built with ``plot=False``) and
    ``glosses`` is the ordered list of the kept gloss names.
  """
  dataset = Dataset('data/WLASL_v0.3.json')
  glosses = pd.read_csv("data/wlasl_class_list.txt", sep="\t", header=None)[1].tolist()
  glosses = glosses[:word_number]
  # The predicate runs once per video, so test membership against a set
  # instead of scanning the list each time.
  selected_glosses = set(glosses)

  def select(split_name):
    # Videos of the requested split whose gloss is one of the kept glosses.
    return dataset.get_videos(
      lambda video: (video.split == split_name) and video.gloss in selected_glosses
    )

  # All three selections happen before any wrapping, same order as before.
  selections = [select(split_name) for split_name in ("train", "val", "test")]
  train_videos, val_videos, test_videos = (
    [MediapipeVideo(video, plot=False) for video in videos] for videos in selections
  )
  return train_videos, val_videos, test_videos, glosses

In [6]:
# Number of glosses (classes) kept from the top of the class list; the same value
# sets the width of the one-hot labels and of the model's output layer further down.
word_number = 64
train_videos, val_videos, test_videos, glosses = split_data(word_number)

In [7]:
# Integer class label of a video = position of its gloss inside `glosses`.
Y_train, Y_val, Y_test = (
    [glosses.index(clip.get_base_video().gloss) for clip in split_videos]
    for split_videos in (train_videos, val_videos, test_videos)
)

In [8]:
# Index subsets that concatenate_data applies to every frame
# (pose_data[filtered_pose] and face_data[filtered_face]).
# NOTE(review): these look like MediaPipe landmark ids (pose 11-16 would be the
# shoulders, elbows and wrists) - confirm against the landmark map.
# NOTE(review): the recorded per-frame shape (264,) equals 132 face indices +
# 6 pose indices + 126 remaining values, which suggests the per-frame arrays are
# flat vectors. If they hold interleaved coordinates, these indices pick single
# scalars rather than whole landmarks - verify the layout of pose_list/face_list.
filtered_pose = [11, 12, 13, 14, 15, 16]

# 132 face indices.
filtered_face = [0, 4, 7, 8, 10, 13, 14, 17, 21, 33, 37, 39, 40, 46, 52, 53, 54, 55, 58,
                 61, 63, 65, 66, 67, 70, 78, 80, 81, 82, 84, 87, 88, 91, 93, 95, 103, 105,
                 107, 109, 127, 132, 133, 136, 144, 145, 146, 148, 149, 150, 152, 153, 154,
                 155, 157, 158, 159, 160, 161, 162, 163, 172, 173, 176, 178, 181, 185, 191,
                 234, 246, 249, 251, 263, 267, 269, 270, 276, 282, 283, 284, 285, 288, 291,
                 293, 295, 296, 297, 300, 308, 310, 311, 312, 314, 317, 318, 321, 323, 324,
                 332, 334, 336, 338, 356, 361, 362, 365, 373, 374, 375, 377, 378, 379, 380,
                 381, 382, 384, 385, 386, 387, 388, 389, 390, 397, 398, 400, 402, 405, 409,
                 415, 454, 466, 468, 473]

In [9]:
def concatenate_data(video_list, filtered_pose=None, filtered_face=None):
    """Build one feature vector per frame for every video.

    For frame ``i`` of a video the left-hand, right-hand, pose and face arrays
    (``sign_model.left_hand_list[i]``, ``sign_model.right_hand_list[i]``,
    ``pose_model.pose_list[i]``, ``face_model.face_list[i]``) are joined, in
    that order, with ``np.concatenate``.

    Args:
        video_list: iterable of objects exposing the attributes named above.
        filtered_pose: optional index list applied to the pose array of each
            frame (``pose[filtered_pose]``); ``None`` keeps the full array.
        filtered_face: optional index list applied to the face array of each
            frame; ``None`` keeps the full array.

    Returns:
        A list with one entry per video; each entry is a list holding one
        concatenated array per frame. The frame count is taken from the length
        of the video's ``left_hand_list``.
    """
    def frame_vector(video, idx):
        # Fetch the four per-frame arrays in the same order they are joined.
        left = video.sign_model.left_hand_list[idx]
        right = video.sign_model.right_hand_list[idx]
        pose = video.pose_model.pose_list[idx]
        face = video.face_model.face_list[idx]

        if filtered_pose is not None:
            pose = pose[filtered_pose]
        if filtered_face is not None:
            face = face[filtered_face]

        return np.concatenate((left, right, pose, face))

    return [
        [frame_vector(video, idx) for idx in range(len(video.sign_model.left_hand_list))]
        for video in video_list
    ]


In [10]:
# Per-frame feature vectors for each split, using the same pose/face index filters.
X_train_concatenated, X_val_concatenated, X_test_concatenated = (
    concatenate_data(split_videos, filtered_pose, filtered_face)
    for split_videos in (train_videos, val_videos, test_videos)
)


In [11]:
# Sanity check: shape of the feature vector of the first frame of the first training video.
print(X_train_concatenated[0][0].shape)

(264,)


# Model definition
## Libraries useful for ML

In [12]:

from tensorflow import keras
import tensorflow as tf

from tensorflow.keras.utils import to_categorical
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam

## Preparing the data

In [13]:
# Store each split as a float32 RaggedTensor (presumably because videos differ
# in frame count - the model input below is declared with a `None` time axis).
X_train_ragged, X_val_ragged, X_test_ragged = (
    tf.ragged.constant(sequences, dtype=tf.float32)
    for sequences in (X_train_concatenated, X_val_concatenated, X_test_concatenated)
)

In [14]:
# One-hot encode the integer labels; the vector width equals the number of kept glosses.
Y_train_one_hot, Y_val_one_hot, Y_test_one_hot = (
    to_categorical(labels, num_classes=word_number)
    for labels in (Y_train, Y_val, Y_test)
)

In [15]:
batch_size = 32

# Only the training split is shuffled. Shuffling gives no benefit for validation
# or test data, and it is harmful for the test split: the predictions made from
# X_test_dataset are compared position-by-position with the unshuffled Y_test,
# so a shuffled (and re-shuffled on every pass) pipeline would misalign
# predictions and labels.
X_train_dataset = (
    tf.data.Dataset.from_tensor_slices((X_train_ragged, Y_train_one_hot))
    .shuffle(buffer_size=X_train_ragged.shape[0])
    .batch(batch_size)
)
X_val_dataset = tf.data.Dataset.from_tensor_slices((X_val_ragged, Y_val_one_hot)).batch(batch_size)
X_test_dataset = tf.data.Dataset.from_tensor_slices((X_test_ragged, Y_test_one_hot)).batch(batch_size)

In [16]:
# Inspect the one-hot encoded test labels (one row per test video, word_number columns).
print(Y_test_one_hot)

[[0. 1. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]]


### Defining the model

In [17]:
def build_rnn(input_shape, gru_units_per_layer=(256, 256), output_count=2000, neuron_count_per_hidden_layer=(128, 128), activation='relu'):
  """Build a stacked-GRU classifier that accepts ragged input.

  Layout: ragged ``Input`` -> one GRU per entry of ``gru_units_per_layer``
  (all but the last return full sequences, the last returns only its final
  output) -> one ``Dense`` per entry of ``neuron_count_per_hidden_layer`` ->
  softmax ``Dense`` with ``output_count`` units.

  Args:
    input_shape: shape passed to ``layers.Input`` (batch axis excluded).
    gru_units_per_layer: unit count of each GRU layer; must not be empty.
    output_count: number of output classes.
    neuron_count_per_hidden_layer: unit count of each hidden Dense layer
      (may be empty).
    activation: activation used by both the GRU and the hidden Dense layers.
      NOTE(review): per the Keras GRU docs a non-tanh activation rules out
      the cuDNN kernel - confirm 'relu' is intended for the recurrent layers.

  Returns:
    An uncompiled ``keras.Sequential`` model.

  Raises:
    ValueError: if ``gru_units_per_layer`` is empty.
  """
  # Validate before building anything; an empty spec used to fail later with a
  # bare IndexError.
  gru_units_per_layer = list(gru_units_per_layer)
  if not gru_units_per_layer:
    raise ValueError("gru_units_per_layer must contain at least one layer size")
  *stacked_gru_units, last_gru_units = gru_units_per_layer

  model = keras.Sequential()
  model.add(layers.Input(shape=input_shape, ragged=True))

  # Intermediate GRUs must emit the whole sequence so the next GRU can consume it.
  for gru_units in stacked_gru_units:
    model.add(layers.GRU(units=gru_units, return_sequences=True, activation=activation))

  # The last GRU collapses the sequence into a single vector for the Dense head.
  model.add(layers.GRU(units=last_gru_units, activation=activation))

  for n in neuron_count_per_hidden_layer:
    model.add(layers.Dense(n, activation=activation))

  model.add(layers.Dense(output_count, activation="softmax"))
  return model

### Model creation

In [18]:
# Length of the first frame's feature vector in the first training video; the same
# expression sets the model's input width in the model-creation cell.
print(len(X_train_ragged[0][0]))

264


In [19]:
# (time, features): the time axis is left as None, the feature width is read
# from the first frame of the first training video.
input_shape = (None, len(X_train_ragged[0][0]))

# Three stacked GRUs followed by a tapering dense head; one output unit per gloss.
model = build_rnn(
    input_shape,
    output_count=word_number,
    gru_units_per_layer=[256, 256, 256],
    neuron_count_per_hidden_layer=[128, 64, 32],
)

# Labels are one-hot encoded, hence categorical cross-entropy.
model.compile(
    loss="categorical_crossentropy",
    metrics=["accuracy"],
    optimizer=Adam(learning_rate=0.001),
)
model.summary()

[256, 256]
256
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru (GRU)                   (None, None, 256)         400896    
                                                                 
 gru_1 (GRU)                 (None, None, 256)         394752    
                                                                 
 gru_2 (GRU)                 (None, 256)               394752    
                                                                 
 dense (Dense)               (None, 128)               32896     
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 32)                2080      
                                                                 
 dense_3 (Dense)             (None, 64)  

In [20]:
# Write a diagram of the architecture (with tensor shapes) to a PNG file.
# Needs pydot and graphviz installed - the recorded output shows the message
# printed when they are missing.
# NOTE(review): the file name mentions "multi_input_and_output", while `model` is
# the Sequential returned by build_rnn - presumably a leftover name; confirm before renaming.
keras.utils.plot_model(model, "multi_input_and_output_model.png", show_shapes=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


## Model fitting

In [21]:
n_epochs = 100
patience = 10

# Stop once val_loss has not improved for `patience` epochs and restore the
# weights of the best epoch.
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True)

# X_train_dataset / X_val_dataset are tf.data pipelines that are already batched,
# so batch_size is not passed to fit(): Keras asks for it to be omitted when the
# input is a dataset, since the dataset itself produces the batches.
model.fit(
    X_train_dataset,
    validation_data=X_val_dataset,
    epochs=n_epochs,
    callbacks=[early_stop],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100

KeyboardInterrupt: 

## Model predictions

In [None]:
# Evaluate on the held-out test split so the numbers match the "test" label
# printed below (the training split was being evaluated here before).
results = model.evaluate(X_test_dataset)
print("test loss, test acc:", results)

In [None]:
# Predict over an order-preserving pipeline built straight from X_test_ragged:
# row i of Y_pred must belong to the same video as Y_test[i], which a shuffled
# dataset cannot guarantee.
Y_pred = model.predict(tf.data.Dataset.from_tensor_slices(X_test_ragged).batch(batch_size))
# Y_test holds integer class indices, so compare them with the arg-max class.
accuracy = np.mean(np.argmax(Y_pred, axis=1) == np.asarray(Y_test))
print(f"Accuracy: {accuracy}")

In [None]:
# Predicted class index per test video, followed by the ground-truth class indices.
print(np.argmax(Y_pred, axis=1))
print(Y_test)