In [8]:
# !wget "https://www.dropbox.com/scl/fi/acun1rm43ge7ljr5qo6p2/wlasl.zip?rlkey=4o90zt8bhip49m7nows9gcsc8&dl=0"
# !pip install gdown
# !gdown --id "1QbuUJbwrq0D3hU8-sEePb4tJ87t2WA8r"

In [9]:
# !mv wlasl.zip* /wlasl.zip
# !unzip -qq /wlasl.zip -d dw-data
# !mv dw-data/data data
# !rm -r dw-data
# !rm wlasl.zip*
# !rm -r sample_data
# !git clone -b feature/mediapipe https://github.com/sceredi/VAR-wlals-recognition.git ./code
# !mv ./code/* ./
# !rm -r code

In [10]:
# !pip install -r requirements.txt
# !pip install mediapipe==0.10.9
# !pip uninstall -y keras
# !pip install keras==2.15.0

In [11]:
import gc
import numpy as np

from handcrafted.app.dataset.dataset import Dataset 
from wlasl_mediapipe.app.mp.mp_video import MediapipeVideo
from wlasl_mediapipe.app.mp.augmentation import augment

In [12]:
from typing import List


def split_data(dataset: Dataset, glosses: List[str]):
    """Load the train/val/test videos whose gloss is in ``glosses``.

    Args:
        dataset: Source of videos; it is queried through ``dataset.get_videos``
            with a predicate on each video's ``split`` and ``gloss`` attributes.
        glosses: Glosses (sign labels) to keep. Required: it is now a real
            annotation (``glosses: List[str]``) rather than a default value of
            ``List[str]``, which would have been used as the filter itself.

    Returns:
        ``(train_videos, val_videos, test_videos, glosses)``. The first three
        items are lists of ``MediapipeVideo`` wrappers; ``glosses`` is the
        argument, returned unchanged.
    """
    wanted = set(glosses)  # constant-time membership test inside the predicate

    def _load(split_name):
        # Select the videos of one split, then wrap each one in a MediapipeVideo.
        selected = dataset.get_videos(
            lambda video: video.split == split_name and video.gloss in wanted
        )
        return [
            MediapipeVideo(video, plot=False, expand_keypoints=False)
            for video in selected
        ]

    train_videos = _load("train")
    print("Train videos loaded")
    val_videos = _load("val")
    print("Val videos loaded")
    test_videos = _load("test")
    print("Test videos loaded")
    return train_videos, val_videos, test_videos, glosses

In [13]:
from wlasl_mediapipe.app.mp.models.globals import FilteredLabels


# Number of glosses (classes) to keep; -1 means "keep every available gloss".
word_number = 5
dataset = Dataset('data/WLASL_v0.3.json', only_keypoints=True)
# glosses = pd.read_csv("data/wlasl_class_list.txt", sep="\t", header=None)[1].tolist()
glosses = FilteredLabels.get_labels()
# Resolve the -1 sentinel BEFORE slicing: `glosses[:-1]` would silently drop the
# last gloss and `word_number` would then be set to the truncated length.
if word_number == -1:
    word_number = len(glosses)
glosses = glosses[:word_number]
train_videos, val_videos, test_videos, glosses = split_data(dataset, glosses)

Train videos loaded
Val videos loaded
Test videos loaded


In [14]:
# Gloss of the underlying base video for every clip, one list per split.
Y_train, Y_val, Y_test = (
    [clip.get_base_video().gloss for clip in split_videos]
    for split_videos in (train_videos, val_videos, test_videos)
)

In [15]:
# Sanity check: number of distinct glosses present in the training split.
print(f"Train Y: {len(np.unique(Y_train))}")

Train Y: 5


In [16]:
# Read every label array into a plain dict once and close the archive: the lazy
# object returned by np.load keeps the file open and re-reads the archive on
# each `labels_dict[...]` lookup.
# NOTE(security): allow_pickle=True can execute arbitrary code while loading;
# only use it with a trusted labels file.
with np.load('data/labels.npz', allow_pickle=True) as labels_file:
    labels_dict = {gloss: labels_file[gloss] for gloss in labels_file.files}

In [17]:
# Replace every gloss by its entry in `labels_dict`, giving one array per split.
Y_train_labels, Y_val_labels, Y_test_labels = (
    np.array([labels_dict[gloss] for gloss in split_glosses])
    for split_glosses in (Y_train, Y_val, Y_test)
)

In [None]:
# Show the shape of the training label array and keep its second dimension;
# `get_compiled_model` uses `output_count` as the size of the output layer.
print(f"Train Y: {Y_train_labels.shape}")
output_count = Y_train_labels.shape[1]

In [None]:
# Largest frame count (rows of the left-hand matrix) over all three splits.
max_frames = max(
    clip.sign_model.lh_matrix.shape[0]
    for split_videos in (train_videos, val_videos, test_videos)
    for clip in split_videos
)
print(f"Longest video: {max_frames}")

In [None]:
def concatenate_data(video_list):
    """Build one per-frame feature array for every video.

    For each frame index, the left-hand, right-hand and pose entries of the
    video's ``sign_model`` are concatenated in that order. Face data is not
    part of the features, so it is no longer read at all (it used to be fetched
    and immediately discarded on every frame).

    Args:
        video_list: Iterable of objects exposing ``sign_model`` with
            ``left_hand_list``, ``lh_matrix``, ``rh_matrix`` and ``pose_matrix``.

    Returns:
        A list with one ``np.ndarray`` per video whose first axis is the frame
        index. The arrays are not padded, so their lengths may differ.
    """
    concatenated_data = []
    for video in video_list:
        sign_model = video.sign_model
        # The frame count is taken from `left_hand_list`, as before.
        frame_count = len(sign_model.left_hand_list)
        frames_data = [
            np.concatenate(
                (
                    sign_model.lh_matrix[i],
                    sign_model.rh_matrix[i],
                    sign_model.pose_matrix[i],
                )
            )
            for i in range(frame_count)
        ]
        concatenated_data.append(np.array(frames_data))
    return concatenated_data

In [None]:
# Convert one split at a time and drop its list of video objects right after
# the conversion, then ask the garbage collector to run so the memory can be
# reclaimed before the next split is processed.
X_train_concatenated = concatenate_data(train_videos)
del train_videos
gc.collect()
X_val_concatenated = concatenate_data(val_videos)
del val_videos
gc.collect()
X_test_concatenated = concatenate_data(test_videos)
del test_videos
gc.collect()

## Scaling the data using a standard scaler (currently disabled)

In [None]:
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# scaler.fit(np.concatenate(X_train_concatenated))
# X_train_scaled = [scaler.transform(video) for video in X_train_concatenated]
# del X_train_concatenated
# gc.collect()
# X_val_scaled = [scaler.transform(video) for video in X_val_concatenated]
# del X_val_concatenated
# gc.collect()
# X_test_scaled = [scaler.transform(video) for video in X_test_concatenated]
# del X_test_concatenated
# gc.collect()

# Model definition
## Libraries used for machine learning

In [None]:

# from tensorflow import keras
import tensorflow as tf

from tensorflow.keras import layers

## Preparing the data

In [None]:
# # Convert your concatenated data to RaggedTensors
# X_train_ragged = tf.ragged.constant(X_train_scaled, dtype=tf.float32)
# del X_train_scaled
# gc.collect()
# X_val_ragged = tf.ragged.constant(X_val_scaled, dtype=tf.float32)
# del X_val_scaled
# gc.collect()
# X_test_ragged = tf.ragged.constant(X_test_scaled, dtype=tf.float32)
# del X_test_scaled
# gc.collect()

In [None]:
# Label preprocessing with StringLookup.
# label_processor = keras.layers.StringLookup(
#     num_oov_indices=0, vocabulary=np.unique(Y_train), mask_token=None
# )
# print(label_processor.get_vocabulary())
# Y_train_labels = label_processor(Y_train).numpy()
# Y_val_labels = label_processor(Y_val).numpy()
# Y_test_labels = label_processor(Y_test).numpy()
# Y_train_one_hot = to_categorical(Y_train, num_classes=word_number)
# Y_val_one_hot = to_categorical(Y_val, num_classes=word_number)
# Y_test_one_hot = to_categorical(Y_test, num_classes=word_number)

In [None]:
# input_shape = (None, len(X_train_ragged[0][0]))

### Data Augmentation

In [None]:
# Number of training clips before augmentation.
print(len(X_train_concatenated))

In [None]:
# Augment the training clips together with their label vectors. The third
# argument (8) is forwarded to `augment` as-is — presumably the augmentation
# factor; TODO confirm against wlasl_mediapipe.app.mp.augmentation.
# NOTE(review): despite its name, `Y_test_aug` receives the second value
# returned for the *training* inputs passed here; it is later paired with
# `X_train_flattened` when the training dataset is built.
X_train_aug, Y_test_aug = augment(X_train_concatenated, Y_train_labels.tolist(), 8)
# The un-augmented list is not needed any more.
del X_train_concatenated

In [None]:
# Number of training clips after augmentation.
print(len(X_train_aug))

In [None]:
def flatten_and_pad(data, max_frames):
    """Turn variable-length clips into one rectangular array.

    Every clip is reshaped to ``(frames, features)`` by flattening everything
    after the first axis, then brought to exactly ``max_frames`` rows: shorter
    clips are zero-padded at the end, longer clips are cut to their first
    ``max_frames`` rows. Without the cut, a single over-long clip would leave
    the final ``np.array`` call with rows of different lengths.

    Args:
        data: Iterable of array-likes whose first axis is the frame index.
        max_frames: Number of frames every clip has in the result.

    Returns:
        ``np.ndarray`` of shape ``(len(data), max_frames, features)``.
    """
    padded_data = []
    for video in data:
        video = np.array(video)
        video = np.reshape(video, (video.shape[0], -1))
        frame_count = video.shape[0]
        if frame_count < max_frames:
            video = np.pad(
                video,
                ((0, max_frames - frame_count), (0, 0)),
                'constant',
                constant_values=0,
            )
        elif frame_count > max_frames:
            video = video[:max_frames]
        padded_data.append(video)
    return np.array(padded_data)

In [None]:
# Bring every split to a fixed number of frames (`max_frames`) and drop each
# source list as soon as its padded array exists, so both copies are not kept
# in memory longer than necessary.
X_train_flattened= flatten_and_pad(X_train_aug, max_frames)
del X_train_aug
X_val_flattened = flatten_and_pad(X_val_concatenated, max_frames)
del X_val_concatenated
X_test_flattened = flatten_and_pad(X_test_concatenated, max_frames)
del X_test_concatenated
gc.collect()

In [None]:
# Shape of a single training sample: everything after the sample axis.
input_shape = X_train_flattened.shape[1:]
print(input_shape)

# One line per split: train, validation, test.
print(
    X_train_flattened.shape,
    X_val_flattened.shape,
    X_test_flattened.shape,
    sep="\n",
)

In [None]:
batch_size = 128


def _as_batches(features, targets, shuffle=True):
    """Slice (features, targets) pairs into batches of `batch_size`.

    When `shuffle` is true, the pairs are shuffled with a buffer that covers
    the whole array before batching.
    """
    pairs = tf.data.Dataset.from_tensor_slices((features, targets))
    if shuffle:
        pairs = pairs.shuffle(buffer_size=features.shape[0])
    return pairs.batch(batch_size)


# Each array is released as soon as its dataset has been created.
X_train_dataset = _as_batches(X_train_flattened, Y_test_aug)
del X_train_flattened
gc.collect()
X_val_dataset = _as_batches(X_val_flattened, Y_val_labels)
del X_val_flattened
gc.collect()
X_test_dataset = _as_batches(X_test_flattened, Y_test_labels, shuffle=False)
del X_test_flattened
gc.collect()

### Defining the model

In [None]:
def build_rnn_gru(
    input_shape,
    gru_units_per_layer=[256, 256],
    output_count=2000,
    neuron_count_per_hidden_layer=[128, 128],
    activation='relu',
):
    """Assemble an (uncompiled) stacked-GRU classifier.

    Layout: ragged input -> one GRU per entry of ``gru_units_per_layer`` ->
    one Dense layer per entry of ``neuron_count_per_hidden_layer`` -> softmax
    Dense layer with ``output_count`` units. ``activation`` is used by the GRU
    and the hidden Dense layers.
    """
    # Options shared by every GRU layer.
    gru_options = dict(activation=activation, dropout=0.2, recurrent_dropout=0.2)

    network = keras.Sequential()
    network.add(layers.Input(shape=input_shape, ragged=True))

    # All GRUs except the last one pass the whole sequence to the next layer.
    for width in gru_units_per_layer[:-1]:
        network.add(layers.GRU(units=width, return_sequences=True, **gru_options))
    network.add(layers.GRU(units=gru_units_per_layer[-1], **gru_options))

    for width in neuron_count_per_hidden_layer:
        network.add(layers.Dense(width, activation=activation))

    network.add(layers.Dense(output_count, activation="softmax"))
    return network

In [None]:
def build_rnn_lstm(
    input_shape,
    lstm_units_per_layer=[256, 256],
    output_count=2000,
    neuron_count_per_hidden_layer=[128, 128],
    activation='relu',
):
    """Assemble an (uncompiled) stacked-LSTM classifier.

    Layout: ragged input -> one LSTM per entry of ``lstm_units_per_layer`` ->
    one Dense layer per entry of ``neuron_count_per_hidden_layer`` -> softmax
    Dense layer with ``output_count`` units. ``activation`` is used by the LSTM
    and the hidden Dense layers.
    """
    network = keras.Sequential()
    network.add(layers.Input(shape=input_shape, ragged=True))

    # Intermediate LSTMs emit the full sequence for the next recurrent layer;
    # the final one uses the layer's default output.
    for width in lstm_units_per_layer[:-1]:
        network.add(
            layers.LSTM(units=width, return_sequences=True, activation=activation)
        )
    network.add(layers.LSTM(units=lstm_units_per_layer[-1], activation=activation))

    for width in neuron_count_per_hidden_layer:
        network.add(layers.Dense(width, activation=activation))

    network.add(layers.Dense(output_count, activation="softmax"))
    return network


In [None]:
class PositionalEmbedding(layers.Layer):
    """Adds a learned embedding of each frame's position to the inputs."""

    def __init__(self, sequence_length, output_dim, **kwargs):
        """Create the position-embedding table.

        Args:
            sequence_length: Number of positions the embedding table holds.
            output_dim: Size of each position vector.
            **kwargs: Forwarded to ``layers.Layer``.
        """
        super().__init__(**kwargs)
        self.sequence_length = sequence_length
        self.output_dim = output_dim
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length,
            output_dim=output_dim,
        )

    def build(self, input_shape):
        """Build the wrapped embedding layer with the incoming shape."""
        self.position_embeddings.build(input_shape)

    def call(self, inputs):
        """Return ``inputs`` plus the embedding of each position on axis 1."""
        # The inputs are of shape: `(batch_size, frames, num_features)`
        frames = keras.ops.cast(inputs, self.compute_dtype)
        frame_count = keras.ops.shape(frames)[1]
        frame_indices = keras.ops.arange(start=0, stop=frame_count, step=1)
        return frames + self.position_embeddings(frame_indices)

In [None]:
class TransformerEncoder(layers.Layer):
    """Self-attention followed by a dense projection, each with a residual
    connection and layer normalization."""

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        """Create the sub-layers.

        Args:
            embed_dim: Attention key size and output width of the projection.
            dense_dim: Width of the first Dense layer of the projection.
            num_heads: Number of attention heads.
            **kwargs: Forwarded to ``layers.Layer``.
        """
        super().__init__(**kwargs)
        self.embed_dim, self.dense_dim, self.num_heads = embed_dim, dense_dim, num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=embed_dim,
            dropout=0.3,
        )
        # Dense -> BatchNorm -> Dense -> BatchNorm, back to `embed_dim` features.
        projection_layers = [
            layers.Dense(dense_dim, activation=keras.activations.gelu),
            layers.BatchNormalization(),
            layers.Dense(embed_dim),
            layers.BatchNormalization(),
        ]
        self.dense_proj = keras.Sequential(projection_layers)
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        """Run the encoder block; ``mask`` is passed on as the attention mask."""
        attended = self.attention(inputs, inputs, attention_mask=mask)
        normed = self.layernorm_1(inputs + attended)
        projected = self.dense_proj(normed)
        return self.layernorm_2(normed + projected)

In [None]:
def get_compiled_model(shape, dense_dim=1, num_heads=1, classes=None):
    """Build and compile the transformer model for inputs of ``shape``.

    The sequence length and embedding width are taken from ``shape`` itself
    instead of the notebook globals ``max_frames`` and ``input_shape``, so the
    model always matches the shape it is asked to accept.

    Args:
        shape: ``(frames, features)`` shape of one sample.
        dense_dim: Width of the encoder's inner Dense layer. The default of 1
            is kept for compatibility; NOTE(review): this is a very narrow
            bottleneck — confirm it is intended.
        num_heads: Number of attention heads.
        classes: Size of the output layer. When ``None``, the notebook-level
            ``output_count`` is used.

    Returns:
        A compiled ``keras.Model`` with a linear output layer, trained with
        the cosine-similarity loss and the Adam optimizer.
    """
    sequence_length, embed_dim = shape[0], shape[1]
    if classes is None:
        classes = output_count

    inputs = keras.Input(shape=shape)
    x = PositionalEmbedding(
        sequence_length, embed_dim, name="frame_position_embedding"
    )(inputs)
    x = TransformerEncoder(embed_dim, dense_dim, num_heads, name="transformer_layer")(x)
    x = layers.GlobalMaxPooling1D()(x)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(classes, activation="linear")(x)
    model = keras.Model(inputs, outputs)

    model.compile(
        optimizer="adam",
        loss=tf.keras.losses.cosine_similarity,
        metrics=["accuracy"],
    )
    return model


In [None]:
import keras
print(keras.__version__)

### Model creation

In [None]:
# model = build_rnn_gru(
#     input_shape=input_shape,
#     gru_units_per_layer=[128, 64, 32],
#     output_count=word_number,
#     neuron_count_per_hidden_layer=[],
#     activation='tanh'
# )

# model.compile(optimizer=Adam(learning_rate=0.001), loss="categorical_crossentropy", metrics=["accuracy"])

In [None]:
# model = build_rnn_lstm(
#     input_shape=input_shape,
#     lstm_units_per_layer=[256, 256, 256],
#     output_count=word_number,
#     neuron_count_per_hidden_layer=[128, 64, 32]
# )

# model.compile(optimizer=Adam(learning_rate=0.001), loss="categorical_crossentropy", metrics=["accuracy"])

In [None]:
# Build and compile the transformer for the per-sample shape computed above.
model = get_compiled_model(input_shape)

In [None]:
# Print the model's layer summary.
model.summary()

In [None]:
# Render the model graph, with tensor shapes, to the given PNG file.
keras.utils.plot_model(model, "multi_input_and_output_model.png", show_shapes=True)

## Model fitting

In [None]:
n_epochs = 100
patience = 10

# Keep only the weights of the epoch with the best validation accuracy.
filepath = "wlasl_transformer.weights.h5"
checkpoint = keras.callbacks.ModelCheckpoint(
    filepath, save_weights_only=True, save_best_only=True, verbose=1, monitor='val_accuracy', mode='max'
)
# `patience` was defined but never used: stop once the monitored metric has
# not improved for that many epochs.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_accuracy', mode='max', patience=patience, verbose=1
)
# The datasets are already batched, so `batch_size` must not be passed to fit().
model.fit(
    X_train_dataset,
    validation_data=X_val_dataset,
    epochs=n_epochs,
    callbacks=[checkpoint, early_stopping],
)
# Restore the best checkpoint before evaluating.
model.load_weights(filepath)

## Model predictions

In [None]:
# Evaluate on the test batches; `results` holds the loss followed by the
# metrics the model was compiled with.
results = model.evaluate(X_test_dataset)
print("test loss, test acc:", results)

In [None]:
Y_pred = model.predict(X_test_dataset)

# The targets are label vectors (`Y_test_labels` is 2-D), so comparing the
# argmax of a prediction with them is not a valid accuracy. Decode each
# prediction to the gloss whose label vector is closest in cosine similarity
# (the quantity the model is trained on) and compare gloss indices instead.
class_vectors = np.array([labels_dict[gloss] for gloss in glosses], dtype=np.float32)


def _unit_rows(matrix):
    """Scale every row to unit length (all-zero rows are left as zeros)."""
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    return matrix / np.maximum(norms, 1e-12)


similarity = _unit_rows(np.asarray(Y_pred)) @ _unit_rows(class_vectors).T
predicted_classes = np.argmax(similarity, axis=1)
gloss_index = {gloss: index for index, gloss in enumerate(glosses)}
true_classes = np.array([gloss_index[gloss] for gloss in Y_test])
print(predicted_classes, "\n", true_classes)
accuracy = np.mean(predicted_classes == true_classes)
print(f"Accuracy: {accuracy}")

In [None]:
# Raw inspection: index of the largest output component per test clip, then
# the test label array.
print(np.argmax(Y_pred, axis=1))
print(Y_test_labels)