# American Sign Language Fingerspelling recognition

In [None]:
!pip install mediapipe

# Import libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sn
import tensorflow as tf
import tensorflow_addons as tfa

from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split, GroupShuffleSplit
from leven import levenshtein

import glob
import sys
import random
import os
import math
import gc
import sys
import sklearn
import time
import json
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa
import matplotlib 
import matplotlib.pyplot as plt
import seaborn as sn
import os
import math
import gc
import shutil
import pyarrow.parquet as pq
import json
import mediapipe
import random
import json

from tensorflow import keras
from tensorflow.keras import layers
from skimage.transform import resize
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from tqdm.notebook import tqdm
from matplotlib import animation, rc
from mediapipe.framework.formats import landmark_pb2
from tensorflow.keras.callbacks import Callback

# TQDM Progress Bar With Pandas Apply Function
tqdm.pandas()

# Report the library and interpreter versions this run is using
print(f'Tensorflow Version {tf.__version__}')
print(f'Python Version: {sys.version}')

Load the training dataset from 'train.csv' and display the shape of the training dataset.

In [None]:
# Load the competition's train.csv and report its (rows, columns) shape
dataset_df = pd.read_csv('/kaggle/input/asl-fingerspelling/train.csv')
print("Full train dataset shape is {}".format(dataset_df.shape))

In [None]:
# Preview the first rows of the loaded training table
dataset_df.head()

## Character 2 Number Encoding

### Reading Character-to-Number Encoding Mapping

#### Reading JSON File
- The code opens a JSON file located at `/kaggle/input/asl-fingerspelling/character_to_prediction_index.json`.

#### Loading Character-to-Number Mapping
- The content of the JSON file is loaded into a Python dictionary named `char_to_num`. This dictionary represents a character-to-number encoding mapping, where characters are keys and numbers are values.

#### Creating Number-to-Character Mapping
- The code creates a new dictionary named `num_to_char` by reversing the key-value pairs of `char_to_num`. In this dictionary, numbers become keys, and characters become values.

#### Converting Dictionary to DataFrame
- A pandas DataFrame named `char_to_num_df` is created from the `char_to_num` dictionary. This DataFrame has a single column, 'Number Encoding', holding the values of `char_to_num`; the characters (the dictionary keys) form the DataFrame's index.

#### Displaying the First Few Rows
- The `head()` method is used to display the first few rows of the `char_to_num_df` DataFrame.

This part is helpful for working with character-to-number encoding mappings: the two dictionaries let you convert between characters and their numeric representations in either direction, and the pandas DataFrame makes the mapping easy to inspect.


In [None]:
# Load the character -> prediction-index table shipped with the dataset
with open('/kaggle/input/asl-fingerspelling/character_to_prediction_index.json') as json_file:
    char_to_num = json.load(json_file)

# Inverse lookup: prediction index -> character
num_to_char = {index: character for character, index in char_to_num.items()}

# Tabular view of the mapping (characters as the index) for display
char_to_num_df = pd.DataFrame(
    list(char_to_num.values()),
    index=list(char_to_num.keys()),
    columns=['Number Encoding'],
)
char_to_num_df.head()

## Configuration and Settings

#### Environment Check
- `IS_INTERACTIVE` records whether the notebook is running interactively or as a commit on Kaggle. The automatic check (comparing the `KAGGLE_KERNEL_RUN_TYPE` environment variable with `'Interactive'`) is currently commented out, and the variable is set manually to `False` so that the full code runs regardless of the environment.

#### Verbosity Setting
- The `VERBOSE` variable is used to control the level of verbosity during training. If the notebook is run interactively, it sets verbosity to 1 for more detailed output. Otherwise, it sets verbosity to 2.

#### Global Random Seed
- The `SEED` variable sets a global random seed to ensure that random operations are reproducible across runs.

#### Number of Target Frames
- `N_TARGET_FRAMES` specifies the number of frames to which recordings should be resized.

#### Debug Mode
- `DEBUG` is a debug flag. When set to `True`, it enables a debug mode that takes a subset of the training data for debugging purposes.

#### Number of Unique Characters and Special Tokens
- `N_UNIQUE_CHARACTERS0` represents the number of unique characters in your data before adding special tokens.
- `N_UNIQUE_CHARACTERS` represents the total number of unique characters, including padding, start of sentence, and end of sentence tokens.
- `PAD_TOKEN`, `START_TOKEN`, `END_TOKEN` hold token values for padding, start of sentence, and end of sentence, respectively.

#### Validation Data
- `USE_VAL` determines whether 10% of the data should be used for validation during training.

#### Batch Size
- `BATCH_SIZE` specifies the batch size for training.

#### Number of Epochs
- `N_EPOCHS` defines the number of training epochs. It's set to 2 if running interactively; otherwise, it's set to 50.

#### Warmup Epochs
- `N_WARMUP_EPOCHS` specifies the number of warm-up epochs for the learning rate scheduler.

#### Maximum Learning Rate
- `LR_MAX` sets the maximum learning rate for the optimizer.

#### Weight Decay Ratio
- `WD_RATIO` represents the weight decay ratio as a fraction of the learning rate.

#### Maximum Phrase Length
- `MAX_PHRASE_LENGTH` is the maximum length of a phrase plus the end of sentence token.

#### Training and Loading Weights
- `TRAIN_MODEL` indicates whether the model should be trained.
- `LOAD_WEIGHTS` specifies whether pretrained weights should be loaded.

#### Learning Rate Warmup Method
- `WARMUP_METHOD` defines the learning rate warmup method, which can be 'log' or 'exp'.

These settings can be adjusted to customize the behavior of the training and model based on specific requirements.

In [None]:
# Interactive (development) run vs. committed run. The automatic Kaggle check
# is left disabled below; the flag is pinned so the full code always runs.
#IS_INTERACTIVE = os.environ['KAGGLE_KERNEL_RUN_TYPE'] == 'Interactive'
IS_INTERACTIVE = False # to run full code

# Verbosity during training: 1 in interactive mode, 2 otherwise
VERBOSE = 2 if not IS_INTERACTIVE else 1

# Global random seed
SEED = 42

# Every recording is resized to this many frames
N_TARGET_FRAMES = 128

# Global debug flag; when True only a subset of train is used
DEBUG = False

# Vocabulary: the characters from char_to_num followed by three special tokens.
# (The special tokens are 59 / 60 / 61 when char_to_num holds 59 characters.)
N_UNIQUE_CHARACTERS0 = len(char_to_num)
PAD_TOKEN = N_UNIQUE_CHARACTERS0 # Padding
START_TOKEN = PAD_TOKEN + 1 # Start Of Sentence
END_TOKEN = START_TOKEN + 1 # End Of Sentence
# Characters To Predict + Pad Token + SOS Token + EOS Token
N_UNIQUE_CHARACTERS = END_TOKEN + 1

# Whether to hold out 10% of the data for validation
USE_VAL = True

# Training batch size
BATCH_SIZE = 64

# Number of training epochs (shortened for interactive runs)
N_EPOCHS = 50 if not IS_INTERACTIVE else 2

# Warmup epochs used by the learning rate scheduler
N_WARMUP_EPOCHS = 5

# Peak learning rate
LR_MAX = 1e-3

# Weight decay, expressed as a ratio of the learning rate
WD_RATIO = 0.05

# Phrase length + EOS token
MAX_PHRASE_LENGTH = 31 + 1

# Whether to train the model
TRAIN_MODEL = True

# Whether to load pretrained weights
LOAD_WEIGHTS = False

# Learning rate warmup method, one of [log,exp]
WARMUP_METHOD = 'exp'

## Reading the Training DataFrame

#### Debug Mode Data Sampling
- The code checks the value of the `DEBUG` flag. If it is set to `True`, the code reads a subset of the training data by loading the first 5000 rows from the CSV file located at `/kaggle/input/asl-fingerspelling/train.csv`. This subset is useful for debugging and faster execution. If `DEBUG` is set to `False`, the code reads the entire training dataset.

#### Reading the CSV File
- The training dataset is read from the CSV file located at `/kaggle/input/asl-fingerspelling/train.csv`. This file presumably contains information about the training examples, such as sequence IDs, file paths, phrases, and other relevant data.

#### Creating a Sequence ID Index
- The code constructs a DataFrame called `train_sequence_id` by setting the `sequence_id` column of the training dataset as its index. This makes it quick to retrieve the row associated with a specific sequence ID.

#### File Path Construction
- A function named `get_file_path` is defined to construct the complete file path from a given path. This function is applied to the `path` column of the training dataset using the `apply` method. The result is stored in a new column called `file_path`. This step is likely done to make it easier to access the actual data files associated with each training example.

#### Displaying the First Few Rows
- The `head()` method is used to display the first few rows of the training dataset with the newly added `file_path` column.

This code prepares the training dataset by either loading a subset (in debug mode) or the full dataset (in non-debug mode) and constructs additional columns for file paths. The dataset is now ready for further processing and model training.
Since `DEBUG=False`, we load the entire training dataset. We add a new column file_path, containing the complete `file_path` for the parquet files.

In [None]:
# Load the training table once; debug runs keep only its first 5000 rows
train = pd.read_csv('/kaggle/input/asl-fingerspelling/train.csv')
if DEBUG:
    train = train.head(5000)

# Copy of the table keyed by sequence_id; this will be used to construct the TFLite model
train_sequence_id = train.set_index('sequence_id')

# Get complete file path to file
def get_file_path(path):
    """Return ``path`` prefixed with the competition's Kaggle input directory."""
    return '/kaggle/input/asl-fingerspelling/{}'.format(path)

# Store the complete path built from each row's relative `path` value
train['file_path'] = train['path'].map(get_file_path)

train.head()

# Example Parquet

Let's remember the format of a Parquet file that we have saved before preprocessing:

In [None]:
# Unique Parquet Files
# (every entry matched by the glob; glob does not guarantee a sorted order)
INFERENCE_FILE_PATHS = pd.Series(
        glob.glob('/kaggle/input//aslfr-eda-preprocessing-dataset-for-beginners/train_landmark_subsets/*')
    )

# Read a Parquet File
# NOTE: this lookup fails if the glob above matched no files
example_parquet_df = pd.read_parquet(INFERENCE_FILE_PATHS[0])

# Each parquet file contains 1000 recordings
# NOTE(review): the figure of 1000 is not checked here — compare with the printed count
print(f'Number of Unique Recording: {example_parquet_df.index.nunique()}')
# Display DataFrame layout
display(example_parquet_df.head())

# Data Visualization using MediaPipe

## Creating Animation from Images

### Configuration and Initialization

The following Python code defines a function `create_animation` that is responsible for creating an animation from a list of images. Before defining the function, it performs some necessary configurations. The function sets up a figure and axis, initializes an image, defines an animation update function, and returns an animation that cycles through the input images, creating an animated sequence.

```python
# Configuration settings for animation embedding
matplotlib.rcParams['animation.embed_limit'] = 2**128
matplotlib.rcParams['savefig.pad_inches'] = 0
rc('animation', html='jshtml')
```

### Function Definition: `create_animation`

The `create_animation` function takes a list of images as input and returns an animation.

```python
def create_animation(images):
```

### Figure and Axis Initialization

Inside the function, a figure (`fig`) and an axis (`ax`) are created using Matplotlib to set up the canvas for the animation. The figure is defined with dimensions 6x9 inches, and the axis covers the entire figure area.

```python
    fig = plt.figure(figsize=(6, 9))
    ax = plt.Axes(fig, [0., 0., 1., 1.])
```

### Configuring the Axis

The axis's visibility is turned off, and the configured axis is added to the figure.

```python
    ax.set_axis_off()
    fig.add_axes(ax)
```

### Image Initialization

An initial image (`im`) is created from the first image in the input list (`images[0]`). This image is initially displayed using a grayscale colormap.

```python
    im = ax.imshow(images[0], cmap="gray")
```

### Animation Function Definition

A function `animate_func` is defined to update the displayed image in the animation. It takes an index `i` as input, sets the image's array to the `i`-th image in the list, and returns a list containing the updated image.

```python
    def animate_func(i):
        im.set_array(images[i])
        return [im]
```

### Creating the Animation

Finally, the code uses Matplotlib's `animation.FuncAnimation` to create the animation. It associates the animation with the defined figure, specifies the `animate_func` as the update function, sets the number of frames to be equal to the number of images in the list, and specifies the interval between frames to control the animation speed.

```python
    return animation.FuncAnimation(fig, animate_func, frames=len(images), interval=1000/10)
```


In [None]:
# Function to create an animation from images.

# Raise the cap on the size of animations embedded in the notebook
matplotlib.rcParams['animation.embed_limit'] = 2**128
# No padding around saved figures
matplotlib.rcParams['savefig.pad_inches'] = 0
# Represent animations in the notebook with the 'jshtml' writer
rc('animation', html='jshtml')

def create_animation(images, fps=10, figsize=(6, 9)):
    """Build a matplotlib animation that plays ``images`` in order.

    Args:
        images: Non-empty sequence of image arrays; ``images[0]`` is drawn
            first and every later frame replaces its pixel data.
        fps: Playback speed in frames per second. Defaults to 10, the speed
            that used to be hard-coded.
        figsize: Figure size in inches, ``(width, height)``. Defaults to
            ``(6, 9)``.

    Returns:
        A ``matplotlib.animation.FuncAnimation`` with one frame per image.

    Raises:
        ValueError: If ``fps`` is not positive.
    """
    if fps <= 0:
        raise ValueError(f'fps must be positive, got {fps!r}')

    fig = plt.figure(figsize=figsize)
    # A single axes covering the whole figure, with no ticks or frame
    ax = plt.Axes(fig, [0., 0., 1., 1.])
    ax.set_axis_off()
    fig.add_axes(ax)
    im = ax.imshow(images[0], cmap="gray")
    # Close the figure right away; presumably this keeps the notebook from
    # also showing a static copy next to the animation.
    plt.close(fig)

    def animate_func(i):
        im.set_array(images[i])
        return [im]

    # FuncAnimation expects the delay between frames in milliseconds
    return animation.FuncAnimation(fig, animate_func, frames=len(images), interval=1000 / fps)

## Hand Landmark Extraction

### Importing Mediapipe Libraries

The following Python code creates short aliases for the Mediapipe solution modules used by the drawing functions below.

```python
mp_pose = mediapipe.solutions.pose
mp_hands = mediapipe.solutions.hands
mp_face_mesh = mediapipe.solutions.face_mesh
mp_drawing = mediapipe.solutions.drawing_utils 
mp_drawing_styles = mediapipe.solutions.drawing_styles
```

### Function Definition: `get_hands`

The code defines a Python function named `get_hands`. This function is responsible for extracting hand landmark data from a DataFrame and converting it into images using the Mediapipe library.

```python
def get_hands(seq_df):
```

### Initialization

Inside the `get_hands` function, two lists are initialized to store the generated images and detected hand landmarks.

```python
    images = []
    all_hand_landmarks = []
```

### Loop Over DataFrame Rows

The code iterates over the rows of a DataFrame named `seq_df`, which is expected to contain hand landmark data.

### Extracting Hand Landmark Data

Within the loop, the code extracts hand landmark data (x, y, and z coordinates) for both the right and left hands from the DataFrame.

### Creating Blank Images

Two blank images, `right_hand_image` and `left_hand_image`, of size 600x600 pixels are created to visualize the hand landmarks.

### Creating Landmark Lists

Normalized landmark lists for the right and left hands, named `right_hand_landmarks` and `left_hand_landmarks`, are initialized. These objects will store the normalized landmark data.

### Drawing Hand Landmarks

The code uses the `mp_drawing.draw_landmarks` function to draw the hand landmarks on the respective images (`right_hand_image` and `left_hand_image`). This function takes the image, landmark data, hand connections, and drawing styles as arguments.

### Appending Images and Landmarks

The generated images and detected hand landmarks (as `NormalizedLandmarkList` objects) for both hands are appended to the `images` and `all_hand_landmarks` lists, respectively.

### Returning Results

The `get_hands` function returns a list of images and a list of hand landmarks for each frame in the input DataFrame.

```python
    return images, all_hand_landmarks
```


In [None]:
# Extract the landmark data and convert it to an image using the MediaPipe library.
# This function extracts the data for both hands.

# Short aliases for the MediaPipe solution modules used by the drawing code
mp_pose = mediapipe.solutions.pose
mp_hands = mediapipe.solutions.hands
mp_face_mesh = mediapipe.solutions.face_mesh
mp_drawing = mediapipe.solutions.drawing_utils
mp_drawing_styles = mediapipe.solutions.drawing_styles


def _draw_hand(frame_row, side):
    """Render one hand of a single frame.

    Args:
        frame_row: pandas Series holding one frame's landmark columns.
        side: ``"right"`` or ``"left"``; selects the columns matching
            ``x_<side>_hand.*``, ``y_<side>_hand.*`` and ``z_<side>_hand.*``.

    Returns:
        ``(image, landmarks)``: a 600x600x3 ``uint8`` image and the
        ``NormalizedLandmarkList`` that was drawn onto it.
    """
    x_hand = frame_row.filter(regex=f"x_{side}_hand.*").values
    y_hand = frame_row.filter(regex=f"y_{side}_hand.*").values
    z_hand = frame_row.filter(regex=f"z_{side}_hand.*").values

    hand_image = np.zeros((600, 600, 3))

    hand_landmarks = landmark_pb2.NormalizedLandmarkList()
    for x, y, z in zip(x_hand, y_hand, z_hand):
        hand_landmarks.landmark.add(x=x, y=y, z=z)

    mp_drawing.draw_landmarks(
            hand_image,
            hand_landmarks,
            mp_hands.HAND_CONNECTIONS,
            landmark_drawing_spec=mp_drawing_styles.get_default_hand_landmarks_style())

    return hand_image.astype(np.uint8), hand_landmarks


def get_hands(seq_df):
    """Draw both hands for every frame (row) of ``seq_df``.

    Args:
        seq_df: DataFrame with one row per frame and hand landmark columns
            named ``{x,y,z}_right_hand*`` / ``{x,y,z}_left_hand*``.

    Returns:
        ``(images, all_hand_landmarks)``. Both lists have one entry per frame;
        each entry is a ``[right, left]`` pair of images or of
        ``NormalizedLandmarkList`` objects respectively.
    """
    images = []
    all_hand_landmarks = []
    for seq_idx in range(len(seq_df)):
        # Look the row up once and reuse it for both hands
        frame_row = seq_df.iloc[seq_idx]
        right_hand_image, right_hand_landmarks = _draw_hand(frame_row, "right")
        left_hand_image, left_hand_landmarks = _draw_hand(frame_row, "left")

        images.append([right_hand_image, left_hand_image])
        all_hand_landmarks.append([right_hand_landmarks, left_hand_landmarks])
    return images, all_hand_landmarks

## Extracting Face Landmark Data and Converting to Images

### Function Parameters

The function `get_face` takes a DataFrame `seq_df` as input. This DataFrame is assumed to contain the facial landmark data. This function processes facial landmark data, creates annotated images with facial landmarks, and stores both images and landmark data in lists for further analysis or visualization.

```python
def get_face(seq_df):
```

### Image Storage Initialization

Inside the function, two empty lists (`images` and `all_face_landmarks`) are created to store images and face landmark data, respectively. These lists will be populated as the function processes each sequence.

```python
    images = []
    all_face_landmarks = []
```

### Loop Through Sequences

The function iterates through each sequence in the input DataFrame `seq_df`.

```python
    for seq_idx in range(len(seq_df)):
```

### Extracting Facial Landmark Data

The x, y, and z coordinates of facial landmarks are extracted from the DataFrame using regular expressions to filter columns with names like "x_face.*", "y_face.*", and "z_face.*".

```python
        x_face = seq_df.iloc[seq_idx].filter(regex="x_face.*").values
        y_face = seq_df.iloc[seq_idx].filter(regex="y_face.*").values
        z_face = seq_df.iloc[seq_idx].filter(regex="z_face.*").values
```

### Initializing an Annotated Image

An annotated image is initialized as a black canvas with dimensions 900x600 pixels to draw the facial landmarks on.

```python
        annotated_image = np.zeros((900, 600, 3))
```

### Creating a `NormalizedLandmarkList`

A `NormalizedLandmarkList` object, `face_landmarks`, is created to store the facial landmark data.

```python
        face_landmarks = landmark_pb2.NormalizedLandmarkList()
```

### Drawing Facial Landmarks

Mediapipe's `mp_drawing.draw_landmarks` function is used to draw the facial landmarks on the annotated image. Two types of facial landmarks are drawn: mesh tessellation and contour landmarks.

```python
        mp_drawing.draw_landmarks(
          image=annotated_image,
          landmark_list=face_landmarks,
          connections=mp_face_mesh.FACEMESH_TESSELATION,
          landmark_drawing_spec=None,
          connection_drawing_spec=mp_drawing_styles.get_default_face_mesh_tesselation_style())
          
        mp_drawing.draw_landmarks(
          image=annotated_image,
          landmark_list=face_landmarks,
          connections=mp_face_mesh.FACEMESH_CONTOURS,
          landmark_drawing_spec=None,
          connection_drawing_spec=mp_drawing_styles.get_default_face_mesh_contours_style())
```

### Appending Data to Lists

The annotated image, represented as a NumPy array, is appended to the `images` list. The `face_landmarks` object is appended to the `all_face_landmarks` list.

```python
        images.append(annotated_image.astype(np.uint8))
        all_face_landmarks.append(face_landmarks)
```

### Return Values

The function returns two lists: `images`, which contains the annotated face images, and `all_face_landmarks`, which contains the facial landmark data.

```python
    return images, all_face_landmarks
```


In [None]:
# Extract the landmark data and convert it to an image using the MediaPipe library.
# This function extracts the data for the face.

def get_face(seq_df):
    """Draw the face mesh for every frame (row) of ``seq_df``.

    The columns matching ``x_face.*``, ``y_face.*`` and ``z_face.*`` supply
    the landmark coordinates of each frame.

    Returns:
        ``(images, all_face_landmarks)``: per-frame 900x600x3 ``uint8`` images
        and the ``NormalizedLandmarkList`` drawn on each of them.
    """
    # Two passes per frame: the tessellation first, then the contours on top
    mesh_layers = (
        (mp_face_mesh.FACEMESH_TESSELATION,
         mp_drawing_styles.get_default_face_mesh_tesselation_style),
        (mp_face_mesh.FACEMESH_CONTOURS,
         mp_drawing_styles.get_default_face_mesh_contours_style),
    )

    images, all_face_landmarks = [], []
    for frame_idx in range(len(seq_df)):
        frame_row = seq_df.iloc[frame_idx]
        coords = [
            frame_row.filter(regex=pattern).values
            for pattern in ("x_face.*", "y_face.*", "z_face.*")
        ]

        canvas = np.zeros((900, 600, 3))

        landmarks = landmark_pb2.NormalizedLandmarkList()
        for x, y, z in zip(*coords):
            landmarks.landmark.add(x=x, y=y, z=z)

        for connections, style_factory in mesh_layers:
            mp_drawing.draw_landmarks(
                image=canvas,
                landmark_list=landmarks,
                connections=connections,
                landmark_drawing_spec=None,
                connection_drawing_spec=style_factory())

        images.append(canvas.astype(np.uint8))
        all_face_landmarks.append(landmarks)
    return images, all_face_landmarks

# Pick one random training sequence to visualise.
# randrange(n) yields 0..n-1, so the result is always a valid positional index
# for .iloc (random.randint's upper bound is inclusive and could overshoot by one).
sequence_id, file_id, phrase = dataset_df.iloc[random.randrange(dataset_df.shape[0])][['sequence_id', 'file_id', 'phrase']]
# Read only the rows of that sequence from its landmark parquet file
sample_sequence_df = pq.read_table(f"/kaggle/input/asl-fingerspelling/train_landmarks/{str(file_id)}.parquet",
    filters=[[('sequence_id', '=', sequence_id)],]).to_pandas()
face_images, face_landmarks = get_face(sample_sequence_df)

## Extracting Pose Landmark Data and Converting to Images

### Function Definition: `get_pose`

The function `get_pose` processes pose landmark data stored in a DataFrame (`seq_df`) and converts it into images. The code relies on the Mediapipe library for this task.

1. **Data Extraction**: The code iterates through sequences in `seq_df` and extracts x, y, and z coordinates of pose landmarks using regular expressions.

2. **Image Initialization**: An empty image (`annotated_image`) is initialized as a black canvas with dimensions 900x600 pixels.

3. **Data Processing**: The x, y, and z coordinates are collected into `data_points`, which are stored as NumPy arrays.

4. **Landmark List Creation**: A `NormalizedLandmarkList` object (`pose_landmarks`) is created to store the pose landmark data.

5. **Adding Landmarks**: The code adds the pose landmarks to `pose_landmarks` based on the data points.

6. **Drawing Landmarks**: Pose landmarks are drawn on the `annotated_image` using Mediapipe's `mp_drawing.draw_landmarks` function.

7. **Data Storage**: The annotated image (converted to a NumPy array) and the `pose_landmarks` object are appended to the `images` and `all_pose_landmarks` lists, respectively.

### Return Values

The function returns two lists: `images` containing annotated pose images and `all_pose_landmarks` containing pose landmark data.

`get_pose` processes pose landmark data, creates annotated pose images, and stores them along with landmark data for further use or visualization.


In [None]:
def get_pose(seq_df):
    """Draw the pose skeleton for every frame (row) of ``seq_df``.

    Args:
        seq_df: DataFrame with one row per frame; the columns matching
            ``x_pose.*``, ``y_pose.*`` and ``z_pose.*`` supply the landmark
            coordinates.

    Returns:
        ``(images, all_pose_landmarks)``: per-frame 900x600x3 ``uint8`` images
        and the ``NormalizedLandmarkList`` drawn on each of them.
    """
    images = []
    all_pose_landmarks = []
    for seq_idx in range(len(seq_df)):
        # Look the row up once instead of once per coordinate axis
        frame_row = seq_df.iloc[seq_idx]
        x_pose = frame_row.filter(regex="x_pose.*").values
        y_pose = frame_row.filter(regex="y_pose.*").values
        z_pose = frame_row.filter(regex="z_pose.*").values

        annotated_image = np.zeros((900, 600, 3))

        # Feed the coordinates straight into the landmark list, the same way
        # get_face and get_hands do.
        pose_landmarks = landmark_pb2.NormalizedLandmarkList()
        for x, y, z in zip(x_pose, y_pose, z_pose):
            pose_landmarks.landmark.add(x=x, y=y, z=z)

        mp_drawing.draw_landmarks(
                annotated_image,
                pose_landmarks,
                mp_pose.POSE_CONNECTIONS,
                landmark_drawing_spec=mp_drawing_styles.get_default_pose_landmarks_style())
        images.append(annotated_image.astype(np.uint8))
        all_pose_landmarks.append(pose_landmarks)
    return images, all_pose_landmarks

# Draw the pose landmarks of the sampled sequence, one image per frame
pose_images, pose_landmarks = get_pose(sample_sequence_df)

## Converting Landmark Data to Images and Normalizing

### Function Definition: `get_all_images`

`get_all_images` function processes landmark data, normalizes it, and converts it into images using the Mediapipe library. The function handles pose, hand, and face landmarks separately.

1. **Data Extraction**: The code retrieves pose, hand, and face landmarks for each sequence from the input DataFrame (`seq_df`).

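2. **Normalization and Scaling**: The landmarks are normalized and scaled to fit within a [0, 1] range. Scaling is done based on the larger of the maximum x and y values among all landmarks, so that the aspect ratio is preserved. Additionally, the landmarks are shifted horizontally so that the midpoint of pose landmarks 23 and 24 (the hips in MediaPipe's pose model) sits at the center of the image.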

3. **Image Creation**: Images are initialized as black canvases with dimensions 900x600 pixels.

4. **Drawing Landmarks**: Mediapipe functions are used to draw the normalized landmarks on the images for pose, right hand, left hand, and face.

5. **Data Storage**: The resulting images (as NumPy arrays), landmark data (as NumPy arrays), and landmark objects are appended to various lists.

6. **Return Values**: The function returns three lists: `all_images` containing annotated images, `all_landmarks_data` containing normalized landmark data, and `all_landmarks` containing landmark objects.

`get_all_images` processes and normalizes landmark data for multiple body parts and generates corresponding annotated images for visualization or further analysis.


In [None]:
def convert_landmark_to_npy(landmarklist):
    """Convert a landmark list into an ``(N, 3)`` float array of x, y, z.

    Args:
        landmarklist: Object whose ``landmark`` attribute iterates over items
            exposing ``x``, ``y`` and ``z`` (e.g. a ``NormalizedLandmarkList``).

    Returns:
        ``numpy.ndarray`` of shape ``(N, 3)`` and dtype ``float64``. An empty
        list yields shape ``(0, 3)`` so the result can still be stacked with
        other landmark arrays.
    """
    coords = [(landmark.x, landmark.y, landmark.z) for landmark in landmarklist.landmark]
    # reshape keeps the column count at 3 even when there are no landmarks
    return np.asarray(coords, dtype=np.float64).reshape(-1, 3)

def _to_landmark_list(points):
    """Pack rows of ``[x, y, z]`` values into a mediapipe ``LandmarkList``."""
    landmark_list = landmark_pb2.LandmarkList()
    for row in points:
        landmark_list.landmark.add(x=row[0], y=row[1], z=row[2])
    return landmark_list


def get_all_images(seq_df):
    """Render pose, both hands and face of every frame onto one canvas.

    For each frame all landmarks are rescaled with one common factor (so the
    proportions are kept), moved so that their per-axis minimum is at the
    origin, and shifted horizontally so that the midpoint of pose landmarks
    23 and 24 lies at x = 0.5. The result is drawn with the mediapipe drawing
    utilities on a black 900x600 canvas.

    Args:
        seq_df: Sequence data, passed unchanged to ``get_pose``,
            ``get_hands`` and ``get_face``.

    Returns:
        Tuple ``(all_images, all_landmarks_data, all_landmarks)``; each list
        has one entry per frame:

        * ``all_images``: ``uint8`` image array of shape ``(900, 600, 3)``.
        * ``all_landmarks_data``: ``[pose, right_hand, left_hand, face]``
          normalised coordinates, each as a NumPy array.
        * ``all_landmarks``: the same four groups as ``LandmarkList`` objects.
    """
    # Only the landmark objects are needed; the per-part renders are discarded
    _, pose_landmarks = get_pose(seq_df)
    _, hand_landmarks = get_hands(seq_df)
    _, face_landmarks = get_face(seq_df)

    all_images = []
    all_landmarks_data = []
    all_landmarks = []
    for seq_idx, frame_pose_landmarks in enumerate(tqdm(pose_landmarks)):
        pose_landmark_np = convert_landmark_to_npy(frame_pose_landmarks)
        right_hand_landmark_np = convert_landmark_to_npy(hand_landmarks[seq_idx][0])
        left_hand_landmark_np = convert_landmark_to_npy(hand_landmarks[seq_idx][1])
        face_landmark_np = convert_landmark_to_npy(face_landmarks[seq_idx])

        # Pool all landmarks together so every body part shares one scale/offset
        pooled_landmarks = np.vstack((pose_landmark_np, right_hand_landmark_np, left_hand_landmark_np, face_landmark_np))
        pooled_max = np.nanmax(pooled_landmarks, axis=0)

        # Use the max of x and y scaling to proportionally scale the image. We don't need to scale z for 2D image
        scaling_factor = np.nanmax(pooled_max[:2])
        pooled_scaled_min = np.nanmin(pooled_landmarks / scaling_factor, axis=0)

        pose_landmark_np_normed = (pose_landmark_np / scaling_factor) - pooled_scaled_min

        # Centre horizontally on the midpoint of pose landmarks 23 and 24
        # (left/right hip in mediapipe's pose numbering). Makes for a better visualization
        x_shift = (1 - (pose_landmark_np_normed[23, 0] + pose_landmark_np_normed[24, 0])) / 2
        axis_shift = np.array([x_shift, 0, 0])

        pose_landmark_np_normed = pose_landmark_np_normed + axis_shift
        right_hand_landmark_np_normed = (right_hand_landmark_np / scaling_factor) - pooled_scaled_min + axis_shift
        left_hand_landmark_np_normed = (left_hand_landmark_np / scaling_factor) - pooled_scaled_min + axis_shift
        face_landmark_np_normed = (face_landmark_np / scaling_factor) - pooled_scaled_min + axis_shift

        # Now that the landmarks are scaled and shifted, wrap them in the
        # LandmarkList objects expected by the mediapipe drawing APIs
        pose_landmark_np_normed_z = _to_landmark_list(pose_landmark_np_normed)
        right_hand_landmark_np_normed_z = _to_landmark_list(right_hand_landmark_np_normed)
        left_hand_landmark_np_normed_z = _to_landmark_list(left_hand_landmark_np_normed)
        face_landmark_np_normed_z = _to_landmark_list(face_landmark_np_normed)

        # BG image with zeros
        image = np.zeros((900, 600, 3))

        # Pose
        mp_drawing.draw_landmarks(
            image,
            pose_landmark_np_normed_z,
            mp_pose.POSE_CONNECTIONS,
            landmark_drawing_spec=mp_drawing_styles.get_default_pose_landmarks_style())

        # Right hand, then left hand
        for hand_landmark_list in (right_hand_landmark_np_normed_z, left_hand_landmark_np_normed_z):
            mp_drawing.draw_landmarks(
                image,
                hand_landmark_list,
                mp_hands.HAND_CONNECTIONS,
                landmark_drawing_spec=mp_drawing_styles.get_default_hand_landmarks_style())

        # Face: tesselation first, contours on top
        mp_drawing.draw_landmarks(
            image=image,
            landmark_list=face_landmark_np_normed_z,
            connections=mp_face_mesh.FACEMESH_TESSELATION,
            landmark_drawing_spec=None,
            connection_drawing_spec=mp_drawing_styles
            .get_default_face_mesh_tesselation_style())

        mp_drawing.draw_landmarks(
            image=image,
            landmark_list=face_landmark_np_normed_z,
            connections=mp_face_mesh.FACEMESH_CONTOURS,
            landmark_drawing_spec=None,
            connection_drawing_spec=mp_drawing_styles
            .get_default_face_mesh_contours_style())

        # Iris data not available. So ignoring the iris visualization.

        all_images.append(image.astype(np.uint8))
        # Numeric data: NumPy arrays for all four groups (not LandmarkList objects)
        all_landmarks_data.append([pose_landmark_np_normed, right_hand_landmark_np_normed, left_hand_landmark_np_normed, face_landmark_np_normed])
        all_landmarks.append([pose_landmark_np_normed_z, right_hand_landmark_np_normed_z, left_hand_landmark_np_normed_z, face_landmark_np_normed_z])
    return all_images, all_landmarks_data, all_landmarks

## Generating Images and Landmark Data

### Image and Landmark Data Generation

Two sets of images and associated landmark data are generated using the `get_all_images` function:

1. **`all_images, all_landmarks_data, all_landmarks`**:
   - This line of code calls the `get_all_images` function on the `sample_sequence_df` DataFrame. It generates images (`all_images`) and associated landmark data (`all_landmarks_data`) for the provided sequence data (`sample_sequence_df`). The `all_landmarks` list contains landmark objects.
   
2. **`all_images1, _, _`**:
   - A second set of images (`all_images1`) is generated using the same `get_all_images` function. However, this time, the `sample_sequence_df` DataFrame is filled using the forward-fill method (`fillna(method='ffill')`) to handle missing values.

These generated images and landmark data can be used for various purposes, such as visualization, analysis, or as input data for machine learning models. The filled DataFrame with forward-fill can be useful for scenarios where continuity of data is important.


In [None]:
# Render the sample sequence twice: once from the raw landmarks and once after
# forward-filling missing values, so gaps reuse the last observed coordinates.
all_images, all_landmarks_data, all_landmarks = get_all_images(sample_sequence_df)
# `.ffill()` is the non-deprecated equivalent of `fillna(method='ffill')`
all_images1, _, _ = get_all_images(sample_sequence_df.ffill())

The create_animation(all_images1) function generates an animation from a list of images (all_images1) for visualization. It configures settings, creates a figure, displays images frame by frame, and returns the animation object.

In [None]:
# Build the animation from the frames rendered on the forward-filled data
create_animation(all_images1)

# Pre-Processing

## Load X/y

### Loading Training and Validation Data

#### Using Validation Data (USE_VAL=True)
- If `USE_VAL` is set to `True`, the code loads both training and validation datasets.

#### Training Data
- `X_train` is loaded from the file `/kaggle/input/aslfr-eda-preprocessing-dataset-for-beginners/X_train.npy`.
- `y_train` is loaded from the file `/kaggle/input/aslfr-eda-preprocessing-dataset-for-beginners/y_train.npy` and limited to a maximum phrase length of `MAX_PHRASE_LENGTH`.
- The variable `N_TRAIN_SAMPLES` is assigned the length of `X_train`.

#### Validation Data
- `X_val` is loaded from the file `/kaggle/input/aslfr-eda-preprocessing-dataset-for-beginners/X_val.npy`.
- `y_val` is loaded from the file `/kaggle/input/aslfr-eda-preprocessing-dataset-for-beginners/y_val.npy` and limited to a maximum phrase length of `MAX_PHRASE_LENGTH`.
- The variable `N_VAL_SAMPLES` is assigned the length of `X_val`.

#### Displaying Shapes
- The shapes of `X_train`, `X_val`, `y_train`, and `y_val` are displayed to provide information about the dimensions of the loaded data.

#### Using All Data (USE_VAL=False)
- If `USE_VAL` is set to `False`, the code loads only the training dataset.

#### Training Data
- `X_train` is loaded from the file `/kaggle/input/aslfr-eda-preprocessing-dataset-for-beginners/X.npy`.
- `y_train` is loaded from the file `/kaggle/input/aslfr-eda-preprocessing-dataset-for-beginners/y.npy` and limited to a maximum phrase length of `MAX_PHRASE_LENGTH`.
- The variable `N_TRAIN_SAMPLES` is assigned the length of `X_train`.

#### Displaying Shapes
- The shapes of `X_train` and `y_train` are displayed to provide information about the dimensions of the loaded data.

We load and prepare the training and validation datasets, depending on whether we want to use validation during training or not.


In [None]:
# Train/Validation
if USE_VAL:
    # TRAIN (labels are cut to at most MAX_PHRASE_LENGTH entries per phrase)
    X_train = np.load('/kaggle/input/aslfr-eda-preprocessing-dataset-for-beginners/X_train.npy')
    y_train = np.load('/kaggle/input/aslfr-eda-preprocessing-dataset-for-beginners/y_train.npy')[:,:MAX_PHRASE_LENGTH]
    N_TRAIN_SAMPLES = len(X_train)
    # VAL
    X_val = np.load('/kaggle/input/aslfr-eda-preprocessing-dataset-for-beginners/X_val.npy')
    y_val = np.load('/kaggle/input/aslfr-eda-preprocessing-dataset-for-beginners/y_val.npy')[:,:MAX_PHRASE_LENGTH]
    N_VAL_SAMPLES = len(X_val)
    # Shapes (the y line reports the label arrays, not the inputs)
    print(f'X_train shape: {X_train.shape}, X_val shape: {X_val.shape}')
    print(f'y_train shape: {y_train.shape}, y_val shape: {y_val.shape}')

# Train On All Data
else:
    # TRAIN
    X_train = np.load('/kaggle/input/aslfr-eda-preprocessing-dataset-for-beginners/X.npy')
    y_train = np.load('/kaggle/input/aslfr-eda-preprocessing-dataset-for-beginners/y.npy')[:,:MAX_PHRASE_LENGTH]
    N_TRAIN_SAMPLES = len(X_train)
    print(f'X_train shape: {X_train.shape}')
    print(f'y_train shape: {y_train.shape}')

## Example Batch

### Creating Example Batches for Debugging

#### Large Example Batch
- `N_EXAMPLE_BATCH_SAMPLES` is set to 1024, determining the number of samples in the large example batch.
- The variable `X_batch` is a dictionary with two keys: `'frames'` and `'phrase'`.
  - `'frames'` contains a copy of the first `N_EXAMPLE_BATCH_SAMPLES` samples from `X_train`.
  - `'phrase'` contains a copy of the first `N_EXAMPLE_BATCH_SAMPLES` samples from `y_train`.
- The variable `y_batch` contains a copy of the first `N_EXAMPLE_BATCH_SAMPLES` samples from `y_train`.

#### Small Example Batch
- `N_EXAMPLE_BATCH_SAMPLES_SMALL` is set to 32, determining the number of samples in the small example batch.
- The variable `X_batch_small` is a dictionary with two keys: `'frames'` and `'phrase'`.
  - `'frames'` contains a copy of the first `N_EXAMPLE_BATCH_SAMPLES_SMALL` samples from `X_train`.
  - `'phrase'` contains a copy of the first `N_EXAMPLE_BATCH_SAMPLES_SMALL` samples from `y_train`.
- The variable `y_batch_small` contains a copy of the first `N_EXAMPLE_BATCH_SAMPLES_SMALL` samples from `y_train`.

These example batches are useful for debugging and testing the machine learning model during development.


In [None]:
# Sizes of the two debugging batches
N_EXAMPLE_BATCH_SAMPLES = 1024
N_EXAMPLE_BATCH_SAMPLES_SMALL = 32

# Large example batch: copies of the leading training samples, so debugging
# code can modify them without touching X_train / y_train
X_batch = dict(
    frames=np.copy(X_train[:N_EXAMPLE_BATCH_SAMPLES]),
    phrase=np.copy(y_train[:N_EXAMPLE_BATCH_SAMPLES]),
)
y_batch = np.copy(y_train[:N_EXAMPLE_BATCH_SAMPLES])

# Small example batch, built the same way
X_batch_small = dict(
    frames=np.copy(X_train[:N_EXAMPLE_BATCH_SAMPLES_SMALL]),
    phrase=np.copy(y_train[:N_EXAMPLE_BATCH_SAMPLES_SMALL]),
)
y_batch_small = np.copy(y_train[:N_EXAMPLE_BATCH_SAMPLES_SMALL])

## Landmark Indices

Similarly to the preprocessing notebook, we define the `get_idxs` function to obtain the indices of the left hand, right hand, and lips.

### Get Indices in Original DataFrame

#### Function Parameters
- `df`: This parameter represents the input DataFrame from which you want to extract column indices and names.
- `words_pos`: A list of words that you want to search for in column names.
- `words_neg`: An optional list of words that you want to exclude from column names.
- `ret_names`: A boolean parameter that determines whether to return both column indices and names (`True`) or only column indices (`False`).
- `idxs_pos`: An optional collection of landmark indices, i.e. the number at the end of each column name (for example `61` for a column named like `x_face_61`). If provided, only columns whose landmark index is in this collection are kept.

#### Function Behavior
- The function `get_idxs` iterates over the `words_pos` list and, for each word, searches for matching column names in the input DataFrame `df`.
- It excludes columns with names like "frame."
- If the `idxs_pos` parameter is provided, it restricts the search to columns with indices specified in `idxs_pos`.
- If the `words_neg` list is provided, it excludes columns that contain any of the words in `words_neg`.
- The function returns either both the column indices and names (if `ret_names` is `True`) or only the column indices (if `ret_names` is `False`).

#### Return Values
- If `ret_names` is `True`, the function returns two NumPy arrays:
  - `idxs`: An array with the landmark index (the trailing number of the column name) of every column that matches the search criteria.
  - `names`: An array of column names that match the search criteria.
- If `ret_names` is `False`, the function returns a single NumPy array containing the column indices that match the search criteria.

This function can be useful for selecting specific columns from a DataFrame based on certain criteria defined by the `words_pos`, `words_neg`, and `idxs_pos` parameters.


In [None]:
# Get indices in original dataframe
def get_idxs(df, words_pos, words_neg=(), ret_names=True, idxs_pos=None):
    """Select landmark columns of ``df`` by substring match on their names.

    Every column except ``'frame'`` must end in ``_<integer>``; that trailing
    integer is the index that is reported and that ``idxs_pos`` filters on.

    Args:
        df: DataFrame whose ``columns`` are searched.
        words_pos: Substrings to look for. Columns are collected word by word,
            so the result is grouped by word in the given order.
        words_neg: Substrings that disqualify a column when present in its name.
        ret_names: If true return ``(idxs, names)``, otherwise only ``idxs``.
        idxs_pos: Optional container of allowed trailing indices; ``None``
            accepts every index.

    Returns:
        ``idxs`` (NumPy array of the trailing integers of the matching
        columns) and, if ``ret_names`` is true, ``names`` (NumPy array of the
        matching column names).

    Raises:
        ValueError: If a column other than ``'frame'`` does not end in an
            integer after its last underscore.
    """
    idxs = []
    names = []
    for word in words_pos:
        for col in df.columns:
            # Exclude Non Landmark Columns
            if col == 'frame':
                continue

            # The landmark index is the number after the last underscore
            landmark_idx = int(col.split('_')[-1])
            if word not in col:
                continue
            if idxs_pos is not None and landmark_idx not in idxs_pos:
                continue
            if any(neg in col for neg in words_neg):
                continue
            idxs.append(landmark_idx)
            names.append(col)
    # Convert to Numpy arrays
    idxs = np.array(idxs)
    names = np.array(names)
    # Returns either both indices and names, or only the indices
    if ret_names:
        return idxs, names
    return idxs

## Lips Landmark Face Ids

#### Lips Landmark Indices
- `LIPS_LANDMARK_IDXS`: This NumPy array contains a list of landmark indices associated with the lips and facial features. These indices represent specific points on the face related to lip movement and expression.

#### Landmark Indices for Left/Right Hand (X and Y Axes Only)
- `LEFT_HAND_IDXS0` and `RIGHT_HAND_IDXS0`: These arrays store the landmark indices for the left and right hands, respectively.
- `LEFT_HAND_NAMES0` and `RIGHT_HAND_NAMES0`: Corresponding arrays that store the names of the left and right hand landmarks.
- These landmarks are filtered to include only the X and Y axes; the Z axis information is excluded.
- These landmarks represent points in the hand regions.

#### Lips Landmark Indices (X and Y Axes Only)
- `LIPS_IDXS0` and `LIPS_NAMES0`: These arrays contain the landmark indices and names for lip and facial landmarks.
- Similar to the hand landmarks, they include only the X and Y axes information.
- These landmarks are associated with facial features and lip movements.

#### Column Organization
- `COLUMNS0`: This array consolidates all the landmark names for the left hand, right hand, and lips. It represents the names of columns in the dataset that correspond to these landmarks.
- `N_COLS0`: The total number of columns in the dataset that correspond to these landmarks.

#### Dimension Information
- `N_DIMS0`: This variable represents the number of dimensions used for these landmarks. In this case, only the X and Y axes are considered (2 dimensions).

#### Output
- Provides information about the number of columns (`N_COLS0`) and the dimensions (`N_DIMS0`) used for these landmark data.


In [None]:
# Face landmark ids used for the lips, ten per row
LIPS_LANDMARK_IDXS = np.array(
    [61, 185, 40, 39, 37, 0, 267, 269, 270, 409]
    + [291, 146, 91, 181, 84, 17, 314, 405, 321, 375]
    + [78, 191, 80, 81, 82, 13, 312, 311, 310, 415]
    + [95, 88, 178, 87, 14, 317, 402, 318, 324, 308]
)

# Raw-data landmark indices and column names per group; words_neg=['z'] drops
# every column whose name contains 'z', leaving the x/y columns
LEFT_HAND_IDXS0, LEFT_HAND_NAMES0 = get_idxs(
    example_parquet_df, words_pos=['left_hand'], words_neg=['z'])
RIGHT_HAND_IDXS0, RIGHT_HAND_NAMES0 = get_idxs(
    example_parquet_df, words_pos=['right_hand'], words_neg=['z'])
# The face columns are additionally restricted to the lips landmark ids
LIPS_IDXS0, LIPS_NAMES0 = get_idxs(
    example_parquet_df, words_pos=['face'], words_neg=['z'], idxs_pos=LIPS_LANDMARK_IDXS)

# Selected columns in fixed order: left hand, right hand, lips
COLUMNS0 = np.concatenate([LEFT_HAND_NAMES0, RIGHT_HAND_NAMES0, LIPS_NAMES0])
N_COLS0 = len(COLUMNS0)
# Only X/Y axes are used
N_DIMS0 = 2

print(f'N_COLS0: {N_COLS0}')

### Landmark Indices in Subset of DataFrame with Only Selected Columns

#### Left Hand Landmark Indices
- `LEFT_HAND_IDXS`: This array contains the landmark indices for the left hand based on the selected columns (`LEFT_HAND_NAMES0`). These indices correspond to specific points on the left hand.

#### Right Hand Landmark Indices
- `RIGHT_HAND_IDXS`: Similar to the left hand, this array contains the landmark indices for the right hand based on the selected columns (`RIGHT_HAND_NAMES0`).

#### Lips Landmark Indices
- `LIPS_IDXS`: This array stores the landmark indices for the lips and facial features based on the selected columns (`LIPS_NAMES0`).

#### Hand Landmark Indices
- `HAND_IDXS`: This array concatenates the landmark indices for both the left and right hands. It represents all hand-related landmarks.

#### Total Number of Columns
- `N_COLS`: This variable represents the total number of columns in the dataset that correspond to the selected landmarks. It's equal to `N_COLS0` from the previous code block.

#### Dimension Information
- `N_DIMS`: Similar to before, this variable represents the number of dimensions used for these landmarks. In this case, only the X and Y axes are considered (2 dimensions).

#### Output
- Provides information about the number of columns (`N_COLS`) and dimensions (`N_DIMS`) used for these selected landmark data subsets.


In [None]:
# Positions of each landmark group inside COLUMNS0: np.isin flags the columns
# that belong to the group, argwhere lists their positions and squeeze drops
# the length-1 axis that argwhere adds
LEFT_HAND_IDXS, RIGHT_HAND_IDXS, LIPS_IDXS = (
    np.argwhere(np.isin(COLUMNS0, group_names)).squeeze()
    for group_names in (LEFT_HAND_NAMES0, RIGHT_HAND_NAMES0, LIPS_NAMES0)
)
# Both hands, left hand first
HAND_IDXS = np.concatenate([LEFT_HAND_IDXS, RIGHT_HAND_IDXS], axis=0)
N_COLS = N_COLS0
# Only X/Y axes are used
N_DIMS = 2

print(f'N_COLS: {N_COLS}')

### Indices in Processed Data by Axes (Dominant Hand)

#### X-Axis Landmark Indices
- `HAND_X_IDXS`: This array contains the indices of landmarks related to the dominant hand in the X-axis. It is generated by filtering the `LEFT_HAND_NAMES0` array to select only the landmarks with 'x' in their names.

#### Y-Axis Landmark Indices
- `HAND_Y_IDXS`: Similar to the X-axis, this array contains the indices of landmarks related to the dominant hand in the Y-axis. It is generated by filtering the `LEFT_HAND_NAMES0` array to select only the landmarks with 'y' in their names.

#### Names in Processed Data by Axes
- `HAND_X_NAMES`: This array stores the names of landmarks related to the dominant hand in the X-axis. It corresponds to the names of landmarks selected based on the `HAND_X_IDXS` indices.

- `HAND_Y_NAMES`: Similar to `HAND_X_NAMES`, this array contains the names of landmarks related to the dominant hand in the Y-axis. It corresponds to the names of landmarks selected based on the `HAND_Y_IDXS` indices.

#### Output
- The code provides information about the selected landmark indices and their corresponding names for the X and Y axes of the dominant hand. These subsets of data can be used for further processing or analysis.


In [None]:
# Positions within LEFT_HAND_NAMES0 of the columns whose name contains 'x'
# (respectively 'y'); these index a single hand's block of columns
HAND_X_IDXS, HAND_Y_IDXS = (
    np.array([
        position
        for position, column_name in enumerate(LEFT_HAND_NAMES0)
        if axis in column_name
    ]).squeeze()
    for axis in ('x', 'y')
)
# Column names that correspond to those positions
HAND_X_NAMES = LEFT_HAND_NAMES0[HAND_X_IDXS]
HAND_Y_NAMES = LEFT_HAND_NAMES0[HAND_Y_IDXS]

print(f'HAND_X_NAMES: {HAND_X_NAMES}')

# Mean/STD Loading




In [None]:
# Per-column mean and standard deviation used for normalizing, each loaded
# from disk and flattened to a 1-D array
MEANS, STDS = (
    np.load(stats_path).reshape(-1)
    for stats_path in (
        '/kaggle/input/aslfr-eda-preprocessing-dataset-for-beginners/MEANS.npy',
        '/kaggle/input/aslfr-eda-preprocessing-dataset-for-beginners/STDS.npy',
    )
)

print(f'First 5 values of MEANS: {MEANS[:5]}')
print(f'Shape of MEANS: {MEANS.shape}') # 164 values, 1 for every column

Let's add the preprocessing layer again here, as we will use it in the `TFLite` model.

In [None]:
class PreprocessLayer(tf.keras.layers.Layer):
    """Turn one raw sequence of frames into a fixed-length sequence.

    ``call`` replaces NaNs with zeros, drops frames whose first 84 columns are
    all zero, pads short sequences with zero frames and resizes the frame axis
    to ``N_TARGET_FRAMES``.
    """

    def __init__(self):
        super(PreprocessLayer, self).__init__()
        self.normalisation_correction = tf.constant(
                    # 0.50 for every column name containing 'x', 0.00 otherwise.
                    # NOTE(review): the original note says this is added to the x coordinates of one hand and
                    # subtracted for the other; the tensor is not referenced in call() below - confirm it is still needed.
                     [0.50 if 'x' in name else 0.00 for name in LEFT_HAND_NAMES0],
                dtype=tf.float32,
            )
    
    # Fixed signature: one float32 tensor of shape [n_frames, N_COLS0] with a variable number of frames
    @tf.function(
        input_signature=(tf.TensorSpec(shape=[None,N_COLS0], dtype=tf.float32),),
    )
    def call(self, data0, resize=True):
        """Preprocess a single sequence.

        Args:
            data0: float32 tensor of shape ``[n_frames, N_COLS0]``; may contain NaNs.
            resize: Not referenced in the body; the resize step always runs.

        Returns:
            The processed tensor after the leading axis is squeezed away
            (presumably ``[N_TARGET_FRAMES, N_COLS]`` - confirm against
            ``tf.image.resize`` semantics for 3-D input).
        """
        # Fill NaN Values With 0
        data = tf.where(tf.math.is_nan(data0), 0.0, data0)
        
        # Add a leading axis of size 1 so the slicing/masking below can work on [1, n_frames, n_cols]
        data = data[None]
        
        # Empty Hand Frame Filtering: keep only frames where the first 84 columns are not all zero.
        # NOTE(review): 84 is presumably the number of left+right hand x/y columns, which come first in COLUMNS0 - confirm.
        hands = tf.slice(data, [0,0,0], [-1, -1, 84])
        hands = tf.abs(hands)
        mask = tf.reduce_sum(hands, axis=2)
        mask = tf.not_equal(mask, 0)
        # Boolean masking removes the leading axis, so it is added back with [None]
        data = data[mask][None]
        
        # Pad Zeros: append all-zero frames until there are N_TARGET_FRAMES frames
        N_FRAMES = len(data[0])
        if N_FRAMES < N_TARGET_FRAMES:
            data = tf.concat((
                data,
                tf.zeros([1,N_TARGET_FRAMES-N_FRAMES,N_COLS], dtype=tf.float32)
            ), axis=1)
        # Downsample: bilinear resize to [1, N_TARGET_FRAMES]; runs unconditionally, so padded
        # sequences (already N_TARGET_FRAMES long) also pass through it
        data = tf.image.resize(
            data,
            [1, N_TARGET_FRAMES],
            method=tf.image.ResizeMethod.BILINEAR,
        )
        
        # Squeeze the leading size-1 axis added above
        data = tf.squeeze(data, axis=[0])
        
        return data
    
# Single shared instance of the layer
preprocess_layer = PreprocessLayer()

# Train Dataset

1. `get_train_dataset` is a Python function that creates a generator for the training dataset.

2. `sample_idxs` is an array of indices ranging from 0 to the length of the training dataset `X`. These indices represent the samples in the dataset.

3. The function enters an infinite loop using `while True`, allowing it to continually provide batches of training samples.

4. In each iteration of the loop:
   - `random_sample_idxs` is generated by randomly selecting `batch_size` indices from `sample_idxs`. This step simulates random sampling of training samples with replacement.
   - Two dictionaries, `inputs` and `outputs`, are created. `inputs` contains two key-value pairs:
     - `'frames'`: This key corresponds to the input data, represented by `X` (frames or sequences of numerical data).
     - `'phrase'`: This key corresponds to the output data (labels), represented by `y` (phrases or sequences of integer indices).
   - `outputs` is a batch of output data represented by `y` based on the random indices selected.

5. The `yield` statement returns the `inputs` dictionary and the `outputs` array as a pair of values in each iteration of the generator. This allows the generator to yield a batch of training samples in each iteration.

6. The generator, when called, creates an iterator that can be used in a `for` loop to iterate over training batches. Each batch is represented by `inputs` and `outputs`.

7. After defining the generator, it is used to create the `train_dataset`. This dataset will be used during model training.

8. Finally, the variable `TRAIN_STEPS_PER_EPOCH` is calculated. It represents the number of training steps required per epoch based on the batch size and the total number of training samples.

The generator and dataset iterator provide an efficient way to iterate over the training dataset in batches, which is necessary for training machine learning models on large datasets. In each training step, the model can process a batch of samples, updating its weights and gradually improving its performance.

In [None]:
# Train Dataset Iterator
def get_train_dataset(X, y, batch_size=BATCH_SIZE):
    """Endlessly yield random training batches.

    Each batch is drawn independently with ``np.random.choice`` over all
    sample positions, so a sample can appear more than once in a batch.

    Args:
        X: Indexable array of input samples.
        y: Indexable array of targets, aligned with ``X``.
        batch_size: Number of samples drawn per batch.

    Yields:
        ``(inputs, outputs)`` where ``inputs`` is a dict with the keys
        ``'frames'`` and ``'phrase'`` and ``outputs`` holds the targets of the
        same drawn samples.
    """
    candidate_idxs = np.arange(len(X))
    while True:
        # Draw the sample positions for this batch
        picked = np.random.choice(candidate_idxs, batch_size)
        batch_inputs = dict(frames=X[picked], phrase=y[picked])
        yield batch_inputs, y[picked]


# Generator over the training data
train_dataset = get_train_dataset(X=X_train, y=y_train)

# Number of batches that make up one epoch (rounded up)
TRAIN_STEPS_PER_EPOCH = math.ceil(N_TRAIN_SAMPLES / BATCH_SIZE)
print(f'TRAIN_STEPS_PER_EPOCH: {TRAIN_STEPS_PER_EPOCH}')

# Validation Dataset

1. `get_val_dataset` is a Python function that creates a generator for the validation dataset. This generator will be used to provide batches of validation samples for evaluating the model during training.

2. `offsets` is an array of offsets representing the starting indices of each batch within the validation dataset. These offsets are generated using `np.arange` and ensure that the entire validation dataset is covered.

3. The function enters an infinite loop using `while True`, allowing it to continually provide batches of validation samples.

4. In each iteration of the outer loop, the generator iterates over the entire validation set by iterating through the `offsets` array. This inner loop covers all batches of the validation dataset.

5. For each batch, two dictionaries, `inputs` and `outputs`, are created. `inputs` contains two key-value pairs:
   - `'frames'`: This key corresponds to the input data, represented by `X` (frames or sequences of numerical data).
   - `'phrase'`: This key corresponds to the output data (labels), represented by `y` (phrases or sequences of integer indices).

6. `outputs` is a batch of output data represented by `y` for the current batch.

7. The `yield` statement returns the `inputs` dictionary and the `outputs` array as a pair of values in each iteration of the generator. This allows the generator to yield a batch of validation samples in each iteration.

8. After defining the generator, it can be used to create the `val_dataset`, which will be used for validation during model training.

9. If the variable `USE_VAL` is `True`, indicating that a validation dataset should be used, the `val_dataset` is created.

10. Finally, if a validation dataset is used (`USE_VAL` is `True`), the variable `N_VAL_STEPS_PER_EPOCH` is calculated. It represents the number of validation steps required per epoch based on the batch size and the total number of validation samples.

The generator and validation dataset iterator provide an efficient way to iterate over the validation dataset in batches, which is necessary for evaluating the model's performance during training. In each validation step, the model can process a batch of validation samples and compute metrics such as loss and accuracy to monitor its performance.

In [None]:
# Validation Set
def get_val_dataset(X, y, batch_size=BATCH_SIZE):
    """Endlessly yield the validation data in order, one batch at a time.

    After the last batch the sweep starts again from the beginning. The final
    batch of a sweep is shorter when ``len(X)`` is not a multiple of
    ``batch_size``.

    Args:
        X: Sliceable array of input samples.
        y: Sliceable array of targets, aligned with ``X``.
        batch_size: Maximum number of samples per batch.

    Yields:
        ``(inputs, outputs)`` where ``inputs`` is a dict with the keys
        ``'frames'`` and ``'phrase'`` and ``outputs`` is the matching slice of
        ``y``.
    """
    batch_starts = np.arange(0, len(X), batch_size)
    while True:
        # One full sweep over the validation set, then start over
        for start in batch_starts:
            stop = start + batch_size
            batch_inputs = {
                'frames': X[start:stop],
                'phrase': y[start:stop],
            }
            yield batch_inputs, y[start:stop]


# The validation generator and its steps per epoch only exist when validating
if USE_VAL:
    val_dataset = get_val_dataset(X_val, y_val)
    N_VAL_STEPS_PER_EPOCH = math.ceil(N_VAL_SAMPLES / BATCH_SIZE)
    print(f'N_VAL_STEPS_PER_EPOCH: {N_VAL_STEPS_PER_EPOCH}')

# Model Config

Here we define several important variables related to the configuration and architecture of the Transformer model:

1. `LAYER_NORM_EPS`: This variable represents the epsilon value used in the layer normalization of the Transformer model. Layer normalization is a technique to normalize the activations of a layer, and `LAYER_NORM_EPS` is the small constant added to the denominator to avoid division by zero.

2. `UNITS_ENCODER` and `UNITS_DECODER`: These variables indicate the size (dimensionality) of the final output and the embeddings of the encoder and decoder, respectively. The encoder and decoder layers will have output and embedding dimensions defined by these variables.

3. `NUM_BLOCKS_ENCODER` and `NUM_BLOCKS_DECODER`: These variables represent the number of blocks (layers) in the encoder and decoder of the Transformer model. Each block consists of multiple sub-layers, including multi-head attention and feed-forward layers.

4. `NUM_HEADS`: Indicates the number of attention heads in the multi-head attention mechanism of the Transformer. Multi-head attention allows the model to attend to different parts of the input simultaneously, improving its ability to capture complex patterns.

5. `MLP_RATIO`: This variable is the multiplication factor used to calculate the size of the feed-forward layer in the Transformer block. The feed-forward layer follows the attention sub-layer within each block, and its size is determined by multiplying `UNITS_ENCODER` or `UNITS_DECODER` by `MLP_RATIO`.

6. `EMBEDDING_DROPOUT`, `MLP_DROPOUT_RATIO`, `MHA_DROPOUT_RATIO`, and `CLASSIFIER_DROPOUT_RATIO`: These variables define the dropout rates used in different parts of the model. Dropout is a regularization technique that helps prevent overfitting by randomly setting a fraction of input units to zero during training. These variables control the dropout rates for the embedding layer, the multi-layer perceptron (MLP) layer, the multi-head attention (MHA) mechanism, and the classifier layer, respectively.

7. `INIT_HE_UNIFORM`, `INIT_GLOROT_UNIFORM`, and `INIT_ZEROS`: These variables specify the initializers used to initialize the weights of the model's layers. Initialization is crucial for the training process, and different initializers may affect how well the model converges during training. `INIT_HE_UNIFORM` uses He Uniform initialization, `INIT_GLOROT_UNIFORM` uses Glorot Uniform initialization, and `INIT_ZEROS` initializes the weights with zeros.

8. `GELU`: `GELU` is an activation function called Gaussian Error Linear Unit. It is used as the activation function in the model. GELU is known to perform well in deep neural networks and is often used in Transformer models.

These variables collectively define the hyperparameters, architecture, and regularization techniques used in the Transformer model. They play a crucial role in shaping the model's behavior, capacity, and training dynamics. Properly tuning these hyperparameters is essential for achieving good model performance on the target task.

In [None]:
# Epsilon value for layer normalisation
LAYER_NORM_EPS = 1e-6

# Embedding sizes of the encoder and the decoder
UNITS_ENCODER = 384
UNITS_DECODER = 256

# Transformer: number of blocks per stack, attention heads, and the
# multiplier that sizes the feed-forward layer relative to the embedding size
NUM_BLOCKS_ENCODER = 4
NUM_BLOCKS_DECODER = 2
NUM_HEADS = 4
MLP_RATIO = 2

# Dropout rates for the embedding, MLP, multi-head attention and classifier
EMBEDDING_DROPOUT = 0.05
MLP_DROPOUT_RATIO = 0.30
MHA_DROPOUT_RATIO = 0.15
CLASSIFIER_DROPOUT_RATIO = 0.10

# Initializers: the first two are assigned without being called, while
# INIT_ZEROS is the result of calling `constant(0.0)`
INIT_HE_UNIFORM = tf.keras.initializers.he_uniform
INIT_GLOROT_UNIFORM = tf.keras.initializers.glorot_uniform
INIT_ZEROS = tf.keras.initializers.constant(0.0)
# Activations
GELU = tf.keras.activations.gelu

## Landmark Embedding

The `LandmarkEmbedding` class is a custom layer in TensorFlow/Keras that is designed to embed landmarks using fully connected layers. This custom layer takes a sequence of landmark data as input and returns embedded representations for each landmark.

1. **Initialization**: The `__init__` method initializes the `LandmarkEmbedding` layer. It takes two arguments, `units` and `name`. `units` specifies the dimensionality of the embedded representation, and `name` sets the name of the layer. It also sets `supports_masking` to `True`, indicating that the layer supports masking.

2. **Build Method**: The `build` method is called to create the layer's weights. In this method, two important components are defined:
   - `empty_embedding`: This component is a trainable weight representing the embedding for missing landmarks in a frame. It's initialized with zeros using the `INIT_ZEROS` initializer.
   - `dense`: This component is a sequential model consisting of two dense (fully connected) layers. The first dense layer uses the GELU activation function and Glorot Uniform initialization (`INIT_GLOROT_UNIFORM`). The second dense layer uses He Uniform initialization (`INIT_HE_UNIFORM`). These layers are used to embed the landmark data.

3. **Call Method**: The `call` method is where the actual embedding process takes place. It accepts the input `x`, which is expected to be a tensor representing the landmark data. The following steps are performed in the `call` method:
   - It checks whether the landmark data is missing in each frame by summing along the third axis (axis=2) and comparing the result with 0, which yields a boolean mask (`True` if missing, `False` if present).
   - If the landmark data is missing (the sum is 0), it uses the trainable `empty_embedding` for that frame; since this weight is initialized with zeros, it returns zeros until training updates it.
   - If a landmark is present, it passes the landmark data through the `dense` layers to obtain the embedded representation.

This custom layer allows you to handle missing landmarks by providing an empty embedding for them and embedding the available landmarks using a neural network. It's a useful component in a model that processes sequences of landmarks, such as the one you're building.

In [None]:
# Embeds a landmark using fully connected layers
class LandmarkEmbedding(tf.keras.Model):
    """Embed landmark frames with two bias-free dense layers.

    Frames whose features sum to exactly zero along axis 2 are treated as
    missing and receive a learned "empty" embedding instead of the dense
    embedding.

    Args:
        units: Width of the embedding produced for every frame.
        name: Prefix of the model name; the model is called ``{name}_embedding``.
    """

    def __init__(self, units, name):
        super().__init__(name=f'{name}_embedding')
        self.units = units
        self.supports_masking = True

    def build(self, input_shape):
        """Create the empty-frame embedding and the dense embedding stack."""
        # Learned replacement vector for missing frames; it starts at zero
        self.empty_embedding = self.add_weight(
            name=f'{self.name}_empty_embedding',
            shape=[self.units],
            initializer=INIT_ZEROS,
        )
        # Two projections without bias; only the first one is followed by GELU
        first_projection = tf.keras.layers.Dense(
            self.units,
            name=f'{self.name}_dense_1',
            use_bias=False,
            kernel_initializer=INIT_GLOROT_UNIFORM,
            activation=GELU,
        )
        second_projection = tf.keras.layers.Dense(
            self.units,
            name=f'{self.name}_dense_2',
            use_bias=False,
            kernel_initializer=INIT_HE_UNIFORM,
        )
        self.dense = tf.keras.Sequential(
            [first_projection, second_projection],
            name=f'{self.name}_dense',
        )

    def call(self, x):
        """Return the empty embedding for missing frames, the dense embedding otherwise."""
        # A frame counts as missing when its features add up to exactly zero
        frame_is_missing = tf.reduce_sum(x, axis=2, keepdims=True) == 0
        dense_embedding = self.dense(x)
        # The condition keeps a trailing axis of size 1, so it broadcasts over the units
        return tf.where(frame_is_missing, self.empty_embedding, dense_embedding)

## Embedding

The `Embedding` class is another custom layer in TensorFlow/Keras that is used to create embeddings for each frame in a sequence. It incorporates both positional embeddings and landmark embeddings.

1. **Initialization**: The `__init__` method initializes the `Embedding` layer. It sets `supports_masking` to `True`, indicating that the layer supports masking. The masking feature can be useful when dealing with sequences of varying lengths.

2. **Build Method**: The `build` method is responsible for creating the layer's weights. In this method, two important components are defined:

   - `positional_embedding`: This component represents the positional embeddings for each frame index. It is a trainable variable that is initialized with zeros, so the positional information is learned entirely during training. The shape of this variable is `[N_TARGET_FRAMES, UNITS_ENCODER]`, where `N_TARGET_FRAMES` is the number of target frames (presumably the number of frames in the input sequence after resizing), and `UNITS_ENCODER` is the dimensionality of the embedding.

   - `dominant_hand_embedding`: This component is an instance of the `LandmarkEmbedding` layer that you defined earlier. It is used to embed the landmark data for the dominant hand. The embedding dimension is set to `UNITS_ENCODER`.

3. **Call Method**: The `call` method is where the embedding process occurs. It accepts the input `x`, which is expected to be a tensor representing the frames of data. The following steps are performed in the `call` method:

   - Data Normalization: The input data `x` is normalized. Specifically, it checks whether each element is equal to 0.0 (indicating missing data) and replaces those elements with 0.0. For non-missing data, it performs a standardization operation using the `MEANS` and `STDS` constants. This normalization step scales the data to have zero mean and unit variance.

   - Dominant Hand Embedding: The normalized data is passed through the `dominant_hand_embedding` layer, which applies landmark embeddings. This step extracts information from the landmarks.

   - Positional Encoding: The positional embeddings are added to the result of the dominant hand embedding. This adds positional information to each frame, which can be crucial for capturing temporal dependencies in the sequence.

   - The final embedded representation is returned.

This custom layer combines both landmark and positional embeddings to create meaningful representations for each frame in the input sequence. These embeddings can then be used as input to subsequent layers in your model.

In [None]:
# Creates embedding for each frame
class Embedding(tf.keras.Model):
    """Standardise landmark frames, embed them and add a learned positional term."""

    def __init__(self):
        super().__init__()
        self.supports_masking = True

    def build(self, input_shape):
        """Create the positional table and the landmark embedding sub-model."""
        # One learned row per frame index, all zeros at the start of training
        zero_table = tf.zeros([N_TARGET_FRAMES, UNITS_ENCODER], dtype=tf.float32)
        self.positional_embedding = tf.Variable(
            initial_value=zero_table,
            trainable=True,
            name='embedding_positional_encoder',
        )
        # Sub-model that turns the landmark features of a frame into an embedding
        self.dominant_hand_embedding = LandmarkEmbedding(UNITS_ENCODER, 'dominant_hand')

    def call(self, x, training=False):
        """Return the embedded frames with the positional embedding added."""
        # Entries that are exactly 0.0 stay 0.0; every other entry is
        # standardised with the global MEANS and STDS
        is_zero = tf.math.equal(x, 0.0)
        standardised = (x - MEANS) / STDS
        x = tf.where(is_zero, 0.0, standardised)
        # Embed the normalised landmarks
        embedded = self.dominant_hand_embedding(x)
        # The positional table is broadcast over the leading (batch) axis
        return embedded + self.positional_embedding

The `PositionalEmbedding` class is designed to perform positional embedding, which adds positional information to the input data. This positional information can be critical for capturing the sequential order of elements in a sequence. Let's break down this class:

1. **Initialization**: The `__init__` method initializes the `PositionalEmbedding` layer. It doesn't have any specific configuration parameters in this case.

2. **Build Method**: The `build` method is responsible for creating the layer's weights. In this case, it defines a `positional_embedding` variable. This variable is trainable and initialized with zeros. Its shape is `[1, 164]`: a single row of 164 learnable values that is broadcast when it is added to the input. The size 164 is hard-coded, so the input must be broadcast-compatible with it (in practice, an input whose last dimension is 164).

3. **Call Method**: The `call` method is where the positional embedding process occurs. It accepts an input tensor `x`, which typically represents the data to which positional embeddings need to be added. The following steps are performed in the `call` method:

   - The input tensor `x` is added to the `positional_embedding` variable. This operation effectively adds the positional information to each element of the input tensor.

   - The resulting tensor, which now includes positional embeddings, is returned.

Positional embeddings are a key component in models like Transformers because they provide information about the order of elements in a sequence. These embeddings are often added to the input data at the beginning of the model to ensure that the model can capture the sequential dependencies effectively.

To use this `PositionalEmbedding` layer in your model, you would typically include it as one of the initial layers in your model architecture to embed the input sequence with positional information before further processing.

In [None]:
# positional Embedding class
class PositionalEmbedding(tf.keras.Model):
    """Add one learned, zero-initialised offset of shape [1, 164] to the input."""

    def __init__(self):
        super().__init__()

    def build(self, input_shape):
        """Create the trainable offset; its size is fixed and ignores `input_shape`."""
        initial_offset = tf.zeros([1, 164], dtype=tf.float32)
        self.positional_embedding = tf.Variable(
            initial_value=initial_offset,
            trainable=True,
            name='positional_embedding',
        )

    def call(self, x):
        """Return `x` plus the learned offset (added with broadcasting)."""
        shifted = x + self.positional_embedding
        return shifted

# Transformer

The `MultiHeadAttention` layer is an implementation of the multi-head self-attention mechanism used in Transformer models. This mechanism is crucial for capturing relationships between different elements in a sequence. Let's break down the key components of this layer:

1. **Initialization**: In the `__init__` method, you specify the parameters for the multi-head attention layer:
   - `d_model`: The total dimensionality of the model or the input data.
   - `n_heads`: The number of attention heads.
   - `dropout`: The dropout rate applied to the output of the attention layer.
   - `d_out`: The dimensionality of the output after the attention layer (if different from `d_model`).

2. **Learnable Projections**: Three learnable projections are defined for queries (`wq`), keys (`wk`), and values (`wv`). Each of these projections uses a fused multi-head approach, meaning that a single dense layer is applied to all attention heads, and then the results are reshaped and permuted to match the multi-head attention format.

3. **Output Projection**: The output of the multi-head attention mechanism is projected to the specified output dimensionality (`d_model` or `d_out`) using a dense layer (`wo`). This projection allows the model to adapt the output of the attention mechanism to the desired dimensionality.

4. **Softmax Activation with Masking**: The attention scores are computed using a dot product between queries and keys (`tf.matmul(Q, K, transpose_b=True) * self.scale`) and then passed through a softmax activation (`self.softmax`) with masking support. This masking is essential for models that work with sequences of varying lengths, as it ensures that padded elements do not contribute to the attention scores.

5. **Reshaping**: The attention scores are reshaped to flatten the attention heads before applying the output projection.

6. **Dropout**: A dropout layer (`self.do`) is applied to the output of the attention mechanism for regularization.

7. **Support for Masking**: The layer supports masking through the `mask` parameter when applying the softmax activation.

8. **`call` Method**: The `call` method defines the forward pass of the layer. It takes queries (`q`), keys (`k`), and values (`v`) as inputs, along with an optional `attention_mask`. The method computes the attention scores, applies the softmax activation with masking, and produces the output after projection and dropout.

This implementation follows the typical structure of multi-head self-attention layers found in Transformer models. It's important for capturing dependencies between different parts of the input sequence. This layer can be used as one of the building blocks when constructing a Transformer-based model for various natural language processing tasks, sequence-to-sequence tasks, and more.

In [None]:
# based on: https://stackoverflow.com/questions/67342988/verifying-the-implementation-of-multihead-attention-in-transformer
# replaced softmax with softmax layer to support masked softmax
class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head attention with a softmax layer that accepts a mask.

    Queries, keys and values are each projected by one dense layer to
    ``d_model // 2`` units, which are then split evenly over ``n_heads``.
    The sequence length is fixed to ``N_TARGET_FRAMES`` by the reshape layers.

    Args:
        d_model: Number of units of the model.
        n_heads: Number of attention heads.
        dropout: Dropout rate applied to the projected output.
        d_out: Number of output units; falls back to ``d_model`` when ``None``.
    """

    def __init__(self, d_model, n_heads, dropout, d_out=None):
        super().__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        # Total projection width shared by all heads
        self.depth = d_model // 2
        # Attention scores are multiplied by 1 / sqrt(depth)
        self.scale = 1.0 / tf.math.sqrt(tf.cast(self.depth, tf.float32))
        # One fused projection each for queries, keys and values
        self.wq = self.fused_mha(self.depth)
        self.wk = self.fused_mha(self.depth)
        self.wv = self.fused_mha(self.depth)
        # Final projection back to the requested output width
        out_units = d_out if d_out is not None else d_model
        self.wo = tf.keras.layers.Dense(out_units, use_bias=False)
        # Softmax layer (rather than the plain op) because it takes a mask
        self.softmax = tf.keras.layers.Softmax()
        # Merges the heads again; shapes below exclude the batch axis
        merge_heads = [
            # [heads, frames, depth per head] -> [frames, heads, depth per head]
            tf.keras.layers.Permute([2, 1, 3]),
            # [frames, heads, depth per head] -> [frames, depth]
            tf.keras.layers.Reshape([N_TARGET_FRAMES, self.depth]),
        ]
        self.reshape = tf.keras.Sequential(merge_heads)
        # Dropout on the projected output
        self.do = tf.keras.layers.Dropout(dropout)
        self.supports_masking = True

    def fused_mha(self, dim):
        """Build one projection that serves all heads at once.

        Args:
            dim: Total number of units, split evenly over the heads.

        Returns:
            A `tf.keras.Sequential` mapping [frames, features] to
            [heads, frames, dim // n_heads] (batch axis excluded).
        """
        per_head = dim // self.n_heads
        stages = [
            # Single dense layer covering every head
            tf.keras.layers.Dense(dim, use_bias=False),
            # [frames, dim] -> [frames, heads, depth per head]
            tf.keras.layers.Reshape([N_TARGET_FRAMES, self.n_heads, per_head]),
            # [frames, heads, depth per head] -> [heads, frames, depth per head]
            tf.keras.layers.Permute([2, 1, 3]),
        ]
        return tf.keras.Sequential(stages)

    def call(self, q, k, v, attention_mask=None, training=False):
        """Attend from `q` over `k`/`v` and return the projected result.

        Args:
            q, k, v: Query, key and value inputs.
            attention_mask: Optional mask forwarded to the softmax layer.
            training: Whether dropout is active.
        """
        queries = self.wq(q)
        keys = self.wk(k)
        values = self.wv(v)
        # Scaled dot-product scores between every query and every key
        scores = tf.matmul(queries, keys, transpose_b=True) * self.scale
        # Masked softmax over the keys, then weighted sum of the values
        weights = self.softmax(scores, mask=attention_mask)
        attended = weights @ values
        # Concatenate the heads and project to the output width
        merged = self.reshape(attended)
        projected = self.wo(merged)
        return self.do(projected, training=training)

## Encoder

The `Encoder` class represents the encoder part of a Transformer-based model. The encoder processes the input data, captures relevant information, and prepares it for further processing by the decoder. Here's an explanation of the key components of this `Encoder` class:

1. **Initialization**: In the `__init__` method, you specify the number of transformer blocks (`num_blocks`) that make up the encoder. Each block consists of multi-head self-attention and feed-forward layers. You also set `supports_masking` to `True` to indicate that the encoder supports masking.

2. **Building Blocks**: In the `build` method, you create the layers that make up each transformer block (`num_blocks`). For each block, you define the following components:
   - Layer Normalization (`ln_1` and `ln_2`): Layer normalization is applied after each residual connection in a block: `ln_1` normalizes the sum of the block input and the multi-head self-attention output, and `ln_2` normalizes the sum of that result and the feed-forward output. Layer normalization helps stabilize the training process.
   - Multi-Head Self-Attention (`mha`): The multi-head self-attention layer processes the input data, capturing dependencies between different elements in the sequence.
   - Multi-Layer Perceptron (`mlp`): The feed-forward layer applies a series of dense layers with activation functions (GELU) and dropout to transform the data.

3. **Optional Dimensionality Projection**: You provide an option to project the output of the encoder to a different dimensionality (`UNITS_DECODER`) if it's different from the encoder's dimensionality (`UNITS_ENCODER`). This projection can be useful when the encoder and decoder have different dimensionalities.

4. **Attention Mask**: You define the `get_attention_mask` method to create an attention mask that helps the model ignore missing frames in the input data. This mask is applied to the multi-head self-attention layer.

5. **Forward Pass**: In the `call` method, you perform the forward pass through the encoder. Here are the steps:
   - Calculate the attention mask using the `get_attention_mask` method.
   - Iterate through the transformer blocks, applying layer normalization, multi-head self-attention, and feed-forward layers in sequence.
   - Optionally project the output to the decoder's dimensionality (`UNITS_DECODER`) if specified.

The `Encoder` class can be used as part of a larger Transformer-based model. It takes input data (`x`) and information about the input frames (`x_inp`) to process and capture relationships within the sequence. The encoder's output is typically passed to the decoder for further processing in tasks like sequence-to-sequence modeling or language translation.

This implementation follows the standard structure of the encoder in a Transformer model and can be customized as needed for specific applications.

In [None]:
# Encoder based on multiple transformer blocks
class Encoder(tf.keras.Model):
    """Stack of transformer blocks applied to the embedded frames.

    Each block adds the self-attention output to its input and layer-normalises
    the sum, then does the same with a two-layer MLP. If the encoder and decoder
    widths differ, a final dense layer projects to ``UNITS_DECODER``.

    Args:
        num_blocks: Number of transformer blocks.
    """

    def __init__(self, num_blocks):
        super(Encoder, self).__init__(name='encoder')
        self.num_blocks = num_blocks
        self.supports_masking = True

    def build(self, input_shape):
        """Create the layers of every block and the optional output projection."""
        self.ln_1s = []
        self.mhas = []
        self.ln_2s = []
        self.mlps = []
        # Make Transformer Blocks
        for _ in range(self.num_blocks):
            # First Layer Normalisation
            self.ln_1s.append(tf.keras.layers.LayerNormalization(epsilon=LAYER_NORM_EPS))
            # Multi Head Attention
            self.mhas.append(MultiHeadAttention(UNITS_ENCODER, NUM_HEADS, MHA_DROPOUT_RATIO))
            # Second Layer Normalisation
            self.ln_2s.append(tf.keras.layers.LayerNormalization(epsilon=LAYER_NORM_EPS))
            # Multi Layer Perceptron
            self.mlps.append(tf.keras.Sequential([
                tf.keras.layers.Dense(UNITS_ENCODER * MLP_RATIO, activation=GELU, kernel_initializer=INIT_GLOROT_UNIFORM, use_bias=False),
                tf.keras.layers.Dropout(MLP_DROPOUT_RATIO),
                tf.keras.layers.Dense(UNITS_ENCODER, kernel_initializer=INIT_HE_UNIFORM, use_bias=False),
            ]))
        # Optional Projection to Decoder Dimension.
        # Configured once, after the loop: the flag is then defined even when
        # num_blocks == 0 and no throw-away Dense layer is created per block.
        self.apply_dense_out = UNITS_ENCODER != UNITS_DECODER
        if self.apply_dense_out:
            self.dense_out = tf.keras.layers.Dense(UNITS_DECODER, kernel_initializer=INIT_GLOROT_UNIFORM, use_bias=False)

    def get_attention_mask(self, x_inp):
        """Build a mask that is non-zero for frames containing any non-zero value.

        Non-zero entries are counted along axis 2, the count is reduced to a
        0/1 indicator per frame, and two singleton axes are inserted at
        position 1.
        """
        # assumes x_inp is rank 3 (batch, frames, features), giving a
        # (batch, 1, 1, frames) mask -- TODO confirm against the caller
        attention_mask = tf.math.count_nonzero(x_inp, axis=[2], keepdims=True, dtype=tf.int32)
        attention_mask = tf.math.count_nonzero(attention_mask, axis=[2], keepdims=False)
        attention_mask = tf.expand_dims(attention_mask, axis=1)
        attention_mask = tf.expand_dims(attention_mask, axis=1)
        return attention_mask

    def call(self, x, x_inp, training=False):
        """Run `x` through all blocks and the optional projection.

        Args:
            x: Embedded frames fed to the first block.
            x_inp: Input from which the attention mask is derived; it is not
                used for anything else here.
            training: Accepted for the Keras call signature.
        """
        # NOTE(review): sub-layers are called without an explicit `training`
        # flag; presumably the Keras call context supplies it -- confirm.
        # Attention mask to ignore missing frames
        attention_mask = self.get_attention_mask(x_inp)
        # Iterate input over transformer blocks
        for ln_1, mha, ln_2, mlp in zip(self.ln_1s, self.mhas, self.ln_2s, self.mlps):
            # Residual connection followed by layer normalisation
            x = ln_1(x + mha(x, x, x, attention_mask=attention_mask))
            x = ln_2(x + mlp(x))

        # Optional Projection to Decoder Dimension
        if self.apply_dense_out:
            x = self.dense_out(x)

        return x

# Decoder

The `Decoder` class represents the decoder part of a Transformer-based model. The decoder takes the encoder's output, previously generated tokens, and positional information to generate the next token in a sequence. Here's an explanation of the key components of this `Decoder` class:

1. **Initialization**: In the `__init__` method, you specify the number of transformer blocks (`num_blocks`) that make up the decoder. Each block consists of multi-head self-attention and feed-forward layers. You also set `supports_masking` to `True` to indicate that the decoder supports masking.

2. **Causal Attention Mask**: You create a causal attention mask using the `get_causal_attention_mask` method. This mask ensures that each position in the sequence can only attend to previous positions, preventing information flow from future tokens.

3. **Positional Embedding**: You initialize a positional embedding variable, which represents the positional information of tokens in the decoder. This embedding is added to the character embeddings.

4. **Character Embedding**: You apply an embedding layer to the input character sequence (`phrase`). This layer converts character indices into dense vectors.

5. **Positional Encoder MHA**: You use a multi-head attention mechanism (`pos_emb_mha`) to capture relationships between tokens in the decoder, considering both the character embeddings and positional embeddings. Layer normalization is applied after the multi-head attention.

6. **Building Blocks**: Similar to the encoder, you create the layers that make up each transformer block (`num_blocks`). For each block, you define the following components:
   - Layer Normalization (`ln_1` and `ln_2`): Layer normalization is applied after each residual connection in a block: `ln_1` normalizes the sum of the block input and the attention output, and `ln_2` normalizes the sum of that result and the feed-forward output.
   - Multi-Head Attention (`mha`): Inside the decoder blocks this layer performs cross-attention: the queries come from the decoder input, while the keys and values are the encoder outputs. (Causal self-attention over the characters is applied once, before the blocks, by `pos_emb_mha`.)
   - Multi-Layer Perceptron (`mlp`): The feed-forward layer applies a series of dense layers with activation functions (GELU) and dropout to transform the data.

7. **Input Preparation**: You prepare the input data for the decoder, including adding a start-of-sequence (SOS) token at the beginning and padding with pad tokens. This is done to align the input with the expected output sequence.

8. **Attention Mask**: You define the `get_attention_mask` method to create an attention mask that helps the model ignore missing frames in the input data. This mask is applied to the multi-head self-attention layer.

9. **Forward Pass**: In the `call` method, you perform the forward pass through the decoder. Here are the steps:
   - Prepend an SOS token to the input character sequence.
   - Pad the sequence with pad tokens to ensure a fixed length.
   - Add positional embeddings to the character embeddings.
   - Apply causal attention using the causal attention mask.
   - Calculate the attention mask for the input frames using the `get_attention_mask` method.
   - Iterate through the transformer blocks, applying layer normalization, multi-head self-attention, and feed-forward layers in sequence.
   - Slice the output to retain only the first 31 characters, corresponding to the output phrase.

The `Decoder` class can be used as part of a larger Transformer-based model for sequence-to-sequence tasks where the goal is to generate an output sequence (e.g., translation or text generation) based on an input sequence. It integrates both character and positional information and generates output tokens one step at a time while considering the causal dependencies between tokens.

In [None]:
# Decoder based on multiple transformer blocks
class Decoder(tf.keras.Model):
    """Transformer decoder that attends from the phrase tokens to the encoder outputs.

    The phrase is embedded, combined with a learned positional embedding and
    passed through one causally masked self-attention step. Each of the
    following blocks then applies attention over the encoder outputs and a
    two-layer MLP, both with a residual connection and layer normalisation.

    Args:
        num_blocks: Number of transformer blocks.
    """

    def __init__(self, num_blocks):
        super(Decoder, self).__init__(name='decoder')
        self.num_blocks = num_blocks
        self.supports_masking = True

    def build(self, input_shape):
        """Create the mask, the embeddings and the layers of every block."""
        # Causal Mask Batch Size 1
        self.causal_mask = self.get_causal_attention_mask()
        # Positional Embedding, initialized with zeros
        self.positional_embedding = tf.Variable(
            initial_value=tf.zeros([N_TARGET_FRAMES, UNITS_DECODER], dtype=tf.float32),
            trainable=True,
            name='embedding_positional_encoder',
        )
        # Character Embedding
        self.char_emb = tf.keras.layers.Embedding(N_UNIQUE_CHARACTERS, UNITS_DECODER, embeddings_initializer=INIT_ZEROS)
        # Positional Encoder MHA
        self.pos_emb_mha = MultiHeadAttention(UNITS_DECODER, NUM_HEADS, MHA_DROPOUT_RATIO)
        self.pos_emb_ln = tf.keras.layers.LayerNormalization(epsilon=LAYER_NORM_EPS)
        # Per-block layers
        self.ln_1s = []
        self.mhas = []
        self.ln_2s = []
        self.mlps = []
        # Make Transformer Blocks
        for _ in range(self.num_blocks):
            # First Layer Normalisation
            self.ln_1s.append(tf.keras.layers.LayerNormalization(epsilon=LAYER_NORM_EPS))
            # Multi Head Attention
            self.mhas.append(MultiHeadAttention(UNITS_DECODER, NUM_HEADS, MHA_DROPOUT_RATIO))
            # Second Layer Normalisation
            self.ln_2s.append(tf.keras.layers.LayerNormalization(epsilon=LAYER_NORM_EPS))
            # Multi Layer Perceptron
            self.mlps.append(tf.keras.Sequential([
                tf.keras.layers.Dense(UNITS_DECODER * MLP_RATIO, activation=GELU, kernel_initializer=INIT_GLOROT_UNIFORM, use_bias=False),
                tf.keras.layers.Dropout(MLP_DROPOUT_RATIO),
                tf.keras.layers.Dense(UNITS_DECODER, kernel_initializer=INIT_HE_UNIFORM, use_bias=False),
            ]))

    def get_causal_attention_mask(self):
        """Return a float32 lower-triangular mask of shape (1, N_TARGET_FRAMES, N_TARGET_FRAMES).

        Entry (row, col) is 1.0 when row >= col and 0.0 otherwise. The leading
        axis has size 1; no tiling is needed because the previous multiples
        were always [1, 1, 1].
        """
        i = tf.range(N_TARGET_FRAMES)[:, tf.newaxis]
        j = tf.range(N_TARGET_FRAMES)
        mask = tf.cast(i >= j, dtype=tf.int32)
        mask = tf.reshape(mask, (1, N_TARGET_FRAMES, N_TARGET_FRAMES))
        mask = tf.cast(mask, tf.float32)
        return mask

    def get_attention_mask(self, x_inp):
        """Build a mask that is non-zero for frames containing any non-zero value.

        Non-zero entries are counted along axis 2, the count is reduced to a
        0/1 indicator per frame, and two singleton axes are inserted at
        position 1.
        """
        # assumes x_inp is rank 3 (batch, frames, features), giving a
        # (batch, 1, 1, frames) mask -- TODO confirm against the caller
        attention_mask = tf.math.count_nonzero(x_inp, axis=[2], keepdims=True, dtype=tf.int32)
        attention_mask = tf.math.count_nonzero(attention_mask, axis=[2], keepdims=False)
        attention_mask = tf.expand_dims(attention_mask, axis=1)
        attention_mask = tf.expand_dims(attention_mask, axis=1)
        return attention_mask

    def call(self, encoder_outputs, phrase, x_inp, training=False):
        """Decode the phrase against the encoder outputs.

        Args:
            encoder_outputs: Output of the encoder, used as keys and values in
                every block.
            phrase: Token ids of the phrase; cast to int32 here.
            x_inp: Input from which the frame attention mask is derived.
            training: Accepted for the Keras call signature.

        Returns:
            The decoder output, sliced to the first MAX_PHRASE_LENGTH positions
            along axis 1.
        """
        # NOTE(review): sub-layers are called without an explicit `training`
        # flag; presumably the Keras call context supplies it -- confirm.
        # Cast to INT32
        phrase = tf.cast(phrase, tf.int32)
        # Prepend SOS Token
        phrase = tf.pad(phrase, [[0,0], [1,0]], constant_values=START_TOKEN, name='prepend_sos_token')
        # Pad With PAD Token: a phrase of MAX_PHRASE_LENGTH tokens ends up
        # exactly N_TARGET_FRAMES long after both paddings
        phrase = tf.pad(phrase, [[0,0], [0,N_TARGET_FRAMES-MAX_PHRASE_LENGTH-1]], constant_values=PAD_TOKEN, name='append_pad_token')
        # Positional Embedding
        x = self.positional_embedding + self.char_emb(phrase)
        # Causal Attention
        x = self.pos_emb_ln(x + self.pos_emb_mha(x, x, x, attention_mask=self.causal_mask))
        # Attention mask to ignore missing frames
        attention_mask = self.get_attention_mask(x_inp)
        # Iterate input over transformer blocks
        for ln_1, mha, ln_2, mlp in zip(self.ln_1s, self.mhas, self.ln_2s, self.mlps):
            # Queries come from the decoder, keys and values from the encoder
            x = ln_1(x + mha(x, encoder_outputs, encoder_outputs, attention_mask=attention_mask))
            x = ln_2(x + mlp(x))
        # Keep only the first MAX_PHRASE_LENGTH positions
        x = tf.slice(x, [0, 0, 0], [-1, MAX_PHRASE_LENGTH, -1])

        return x

This function is used to generate a causal attention mask for the decoder, ensuring that each position in the sequence can only attend to previous positions.

1. **Inputs**: The function takes a single argument `B`, which represents the batch size. This parameter is used to determine the batch dimension when creating the mask.

2. **Generating the Mask**: The mask is generated using two TensorFlow tensors `i` and `j`, where `i` represents the row indices and `j` represents the column indices. These tensors are created using `tf.range` to produce sequences of integers.

3. **Causal Mask**: The `mask` tensor is created by comparing `i` and `j` using the expression `i >= j`. This results in a tensor where each element is `True` if the row index is greater than or equal to the column index (causal mask condition), and `False` otherwise.

4. **Reshaping the Mask**: The mask is reshaped to have shape `(1, N_TARGET_FRAMES, N_TARGET_FRAMES)`, adding a singleton dimension at the beginning. This reshaping is necessary for broadcasting the mask during subsequent operations.

5. **Replicating Along the Batch Dimension**: The mask is repeated `B` times along its first axis (batch dimension) to match the batch size. This is done by concatenating `tf.expand_dims(B, -1)` with `[1, 1]` to build the repetition counts `[B, 1, 1]` and passing them to `tf.tile` (this is replication, not element-wise multiplication). The result is a mask with shape `(B, N_TARGET_FRAMES, N_TARGET_FRAMES)`.

6. **Data Type Conversion**: The final mask is cast to a float32 data type to ensure compatibility with other TensorFlow operations.

7. **Returning the Mask**: The function returns the generated causal attention mask, which can be used in the decoder to enforce causality.

This function can be called with the batch size as an argument to generate a causal attention mask tailored to the batch size used during decoding in your model.

In [None]:
# Causal attention mask: keeps the decoder from attending to the future characters it has to predict
def get_causal_attention_mask(B):
    """Return a float32 causal mask of shape (B, N_TARGET_FRAMES, N_TARGET_FRAMES).

    Entry (row, col) of every batch slice is 1.0 when row >= col and 0.0
    otherwise.

    Args:
        B: Batch size, i.e. how often the mask is repeated along the first axis.
    """
    row_idx = tf.range(N_TARGET_FRAMES)[:, tf.newaxis]
    col_idx = tf.range(N_TARGET_FRAMES)
    # Lower triangle (diagonal included) with a leading axis of size 1
    lower_triangle = tf.reshape(
        tf.cast(row_idx >= col_idx, dtype=tf.int32),
        (1, N_TARGET_FRAMES, N_TARGET_FRAMES),
    )
    # Repetition counts per axis: B for the batch axis, 1 for the other two
    repeats = tf.concat(
        [tf.expand_dims(B, -1), tf.constant([1, 1], dtype=tf.int32)],
        axis=0,
    )
    return tf.cast(tf.tile(lower_triangle, repeats), tf.float32)

Here we see how to create a causal attention mask for a specific batch size `B`, and it correctly generates the mask. 

1. `B = 1`: This sets the batch size to 1, indicating that you are generating a causal attention mask for a single example.

2. `i = tf.range(N_TARGET_FRAMES)[:, tf.newaxis]`: This creates a tensor `i` containing a range of integers from 0 to `N_TARGET_FRAMES - 1`. The `[:, tf.newaxis]` operation adds a new axis to `i`, converting it into a column vector.

3. `j = tf.range(N_TARGET_FRAMES)`: This creates a tensor `j` similar to `i`, representing another range of integers.

4. `mask = tf.cast(i >= j, dtype=tf.int32)`: Here, you compare each element of `i` with each element of `j` using the `>=` operator. This results in a boolean tensor where each element is `True` if the corresponding element in `i` is greater than or equal to the corresponding element in `j`, and `False` otherwise. The `tf.cast` function is then used to cast this boolean tensor to `tf.int32`, effectively converting `True` to 1 and `False` to 0.

5. `tf.expand_dims(B, -1)`: This creates a tensor with the value of `B` and adds a new axis at the end of the tensor. The `-1` argument indicates that the new axis should be added at the last dimension.

6. `mask = tf.reshape(mask, (1, N_TARGET_FRAMES, N_TARGET_FRAMES))`: The `mask` tensor is reshaped to have the shape `(1, N_TARGET_FRAMES, N_TARGET_FRAMES)`. This reshaping adds a singleton dimension at the beginning, effectively creating a batch dimension with size 1.

7. `mult = tf.concat([tf.expand_dims(B, -1), tf.constant([1, 1], dtype=tf.int32)], axis=0)`: This code concatenates two tensors along the first dimension (axis 0). The first tensor is the result of `tf.expand_dims(B, -1)`, which has a shape of `(1,)` because `B` is a scalar. The second tensor is created using `tf.constant` and has the shape `(2,)`. The `mult` tensor is the result of this concatenation: it has a shape of `(3,)` and holds the values `[B, 1, 1]`, one repetition count per axis of `mask`.

8. `mask = tf.tile(mask, mult)`: The `tf.tile` function is used to replicate the `mask` tensor along dimensions specified by the `mult` tensor. In this case, it replicates the mask along the batch dimension (axis 0) by a factor of `B`, which is 1 in this case.

9. `mask = tf.cast(mask, tf.float32)`: Finally, the mask is cast to the `tf.float32` data type to ensure that it contains floating-point values.

The resulting `mask` tensor is a causal attention mask suitable for a batch size of 1 and can be used in your model for causal self-attention.

In [None]:
# Step-by-step construction of the causal attention mask for a batch size of 1
B = 1
# Column vector of row indices and flat vector of column indices
i = tf.range(N_TARGET_FRAMES)[:, tf.newaxis]
j = tf.range(N_TARGET_FRAMES)

# 1 where the row index is >= the column index, 0 elsewhere
mask = tf.cast(i >= j, dtype=tf.int32)
# NOTE: the result of this call is discarded; the same expression is evaluated again when `mult` is built
tf.expand_dims(B, -1)
mask = tf.reshape(mask, (1, N_TARGET_FRAMES, N_TARGET_FRAMES)) # reshape [N_TARGET_FRAMES, N_TARGET_FRAMES] to [1, N_TARGET_FRAMES, N_TARGET_FRAMES]
# Repetition counts per axis: B along the first axis, 1 along the other two
mult = tf.concat(
        [tf.expand_dims(B, -1), tf.constant([1, 1], dtype=tf.int32)],
        axis=0,)
# Repeat the mask B times along the first axis
mask = tf.tile(mask, mult)
mask = tf.cast(mask, tf.float32)

# Bare expression at the end of the cell so that the notebook shows the mask
mask

# TopKAccuracy

The `TopKAccuracy` class is a custom metric for measuring top-k accuracy for multi-dimensional output. Here's an explanation of how this metric works:

1. **Initialization**: The `__init__` method is used to initialize the metric. It takes two arguments: `k` and `**kwargs`. The `k` argument specifies the value of k for the top-k accuracy calculation. The `**kwargs` argument allows additional keyword arguments to be passed when initializing the metric. The metric is named based on the value of `k`.

2. **Metric Initialization**: Inside the `__init__` method, a `SparseTopKCategoricalAccuracy` metric is created and stored in `self.top_k_acc`. This sub-metric is part of TensorFlow and is designed to compute top-k categorical accuracy for sparse targets. It calculates the accuracy of the top-k predictions against the true labels.

3. **`update_state` Method**: The `update_state` method is used to update the metric's state based on the true labels (`y_true`) and the predicted probabilities (`y_pred`). It also accepts an optional `sample_weight` argument, which is not used in this implementation.

    - `y_true` and `y_pred` are reshaped to have the same shape before further processing.
    
    - `character_idxs` is computed using `tf.where` to find the indices where `y_true` is less than `N_UNIQUE_CHARACTERS0`. This step is necessary to filter out any special tokens or padding tokens from the calculation, ensuring that only valid character indices are considered for accuracy.
    
    - The `tf.gather` function is then used to extract the relevant rows from `y_true` and `y_pred` based on the indices in `character_idxs`. This step effectively filters out any unwanted tokens from both the true labels and predicted probabilities.
    
    - Finally, the `update_state` method of the `SparseTopKCategoricalAccuracy` sub-metric (`self.top_k_acc`) is called with the filtered `y_true` and `y_pred` to update the top-k accuracy.

4. **`result` Method**: The `result` method is implemented to return the top-k accuracy as computed by the `SparseTopKCategoricalAccuracy` sub-metric (`self.top_k_acc`). It provides the final top-k accuracy value.

5. **`reset_state` Method**: The `reset_state` method is used to reset the state of the `SparseTopKCategoricalAccuracy` sub-metric (`self.top_k_acc`). It ensures that the metric can be used for a new batch of data without carrying over state information from previous batches.

The `TopKAccuracy` metric is a custom wrapper around the `SparseTopKCategoricalAccuracy` metric from TensorFlow. It calculates top-k accuracy for multi-dimensional output by filtering out unwanted tokens and then using the sub-metric to compute the accuracy based on the filtered data.

In [None]:
# Top-k accuracy for multi dimensional (per-position) output
class TopKAccuracy(tf.keras.metrics.Metric):
    """Top-k accuracy computed only over selected target positions.

    Targets and predictions are flattened, the positions whose target index
    is below ``N_UNIQUE_CHARACTERS0`` are kept, and the bookkeeping is
    delegated to a ``SparseTopKCategoricalAccuracy`` sub-metric.
    """

    def __init__(self, k, **kwargs):
        """Create the metric; it is reported under the name ``top{k}acc``."""
        super().__init__(name=f'top{k}acc', **kwargs)
        self.top_k_acc = tf.keras.metrics.SparseTopKCategoricalAccuracy(k=k)

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Accumulate one batch; ``sample_weight`` is accepted but not used."""
        flat_true = tf.reshape(y_true, [-1])
        flat_pred = tf.reshape(y_pred, [-1, N_UNIQUE_CHARACTERS])
        # Indices of the positions whose target is below N_UNIQUE_CHARACTERS0
        # (presumably this excludes the special/padding tokens -- confirm).
        keep = tf.where(flat_true < N_UNIQUE_CHARACTERS0)
        self.top_k_acc.update_state(
            tf.gather(flat_true, keep, axis=0),
            tf.gather(flat_pred, keep, axis=0),
        )

    def result(self):
        """Return the accuracy accumulated by the wrapped sub-metric."""
        return self.top_k_acc.result()

    def reset_state(self):
        """Clear the accumulated state of the wrapped sub-metric."""
        self.top_k_acc.reset_state()

## Loss Weights



Here, we are creating an array `loss_weights` to specify the loss weights for different tokens in your model's output.

1. **Initialization**: You create an array `loss_weights` with a length of `N_UNIQUE_CHARACTERS`, which appears to be the number of unique tokens or characters in your model's output.

2. **Set Initial Weights to 1**: Initially, you set all the elements of the `loss_weights` array to 1. This means that all tokens in the model's output will have the same weight when calculating the loss during training.

3. **Set Pad Token Weight to 0**: You specifically set the weight of the "PAD_TOKEN" to 0. This indicates that you want to assign zero weight to the padding tokens in your output. Padding tokens are typically used to pad sequences to a uniform length during batch processing but do not contribute to the actual loss calculation. By setting their weight to 0, you effectively ignore them when computing the loss.

The idea behind setting different loss weights for tokens is to control the importance of each token when computing the loss during training. Tokens that are more critical or meaningful in your task can be assigned higher weights, while tokens that are less important (e.g., padding tokens) can be assigned lower or zero weights.

Keep in mind that these loss weights are often used in sequence-to-sequence models, especially when dealing with padded sequences, to ensure that the model focuses on learning from the meaningful tokens in the sequence and not the padding.

In [None]:
# Loss weights: one float32 entry per token, all starting at 1 ...
loss_weights = np.full(N_UNIQUE_CHARACTERS, 1.0, dtype=np.float32)
# ... except the pad token, whose weight is set to 0
loss_weights[PAD_TOKEN] = 0.0

## Sparse Categorical Crossentropy With Label Smoothing


The `scce_with_ls` function is a custom loss function for training a neural network, specifically designed for tasks involving sparse categorical cross-entropy with label smoothing.

1. **Filter Pad Tokens**: The first step is to filter out padding tokens from both `y_true` and `y_pred`. Padding tokens are usually used to make sequences equal in length, and they are not relevant for loss calculation. Filtering them out ensures that only actual tokens contribute to the loss.

2. **One-Hot Encoding**: Next, `y_true` is one-hot encoded. In a sparse categorical cross-entropy loss, `y_true` typically contains integer labels indicating the true class indices. One-hot encoding converts these integer labels into one-hot vectors. Each one-hot vector has a length equal to the number of unique classes (`N_UNIQUE_CHARACTERS` in this case), and it has a 1 in the position corresponding to the true class and 0s elsewhere.

3. **Categorical Cross-Entropy with Label Smoothing**: With one-hot encoded `y_true`, the function computes the categorical cross-entropy loss (`tf.keras.losses.categorical_crossentropy`) between the one-hot encoded true labels (`y_true`) and the model's raw, un-normalized outputs (`y_pred`); because the call passes `from_logits=True`, the softmax is applied inside the loss rather than by the model. The `label_smoothing` parameter is set to 0.25, which means that label smoothing is applied during the loss calculation. Label smoothing is a regularization technique that prevents the model from becoming too confident about its predictions and encourages it to have softer, more uniform probability distributions over classes.

4. **Reduce Mean**: Finally, the computed loss values are averaged across all non-padding tokens to obtain the final loss value. This loss value can be used to update the neural network's weights during training.

In summary, `scce_with_ls` is a custom loss function designed for tasks where you have sparse categorical cross-entropy as the loss function and want to apply label smoothing to encourage more robust and generalized predictions from your model. It filters out padding tokens, one-hot encodes the true labels, applies label smoothing during the loss calculation, and computes the mean loss across non-padding tokens.

In [None]:
# source:: https://stackoverflow.com/questions/60689185/label-smoothing-for-sparse-categorical-crossentropy
def scce_with_ls(y_true, y_pred):
    """Sparse categorical cross-entropy with label smoothing, skipping pad positions.

    Positions whose target equals ``PAD_TOKEN`` are dropped from both tensors,
    the remaining integer targets are one-hot encoded over
    ``N_UNIQUE_CHARACTERS`` classes, and the mean categorical cross-entropy
    (label smoothing 0.25, ``y_pred`` treated as logits) is returned.
    """
    # Indices of every target position that is not the pad token
    non_pad = tf.where(y_true != PAD_TOKEN)
    labels = tf.cast(tf.gather_nd(y_true, non_pad), tf.int32)
    logits = tf.gather_nd(y_pred, non_pad)
    # One-hot encode the sparse targets so Keras' native label smoothing can be used
    one_hot_labels = tf.one_hot(labels, N_UNIQUE_CHARACTERS, axis=1)
    per_token_loss = tf.keras.losses.categorical_crossentropy(
        one_hot_labels,
        logits,
        label_smoothing=0.25,
        from_logits=True,
    )
    return tf.math.reduce_mean(per_token_loss)

# Model


We are defining the input layers and processing the inputs through the Encoder component of a Transformer-based model.

1. **Input Layers**: You define two input layers:
   - `frames_inp`: This input layer represents the input frames, where `N_TARGET_FRAMES` is the number of target frames, and `N_COLS` is the number of columns in the input data.
   - `phrase_inp`: This input layer represents the input phrase, where `MAX_PHRASE_LENGTH` is the maximum length of the phrase.

2. **Frames Input Processing**:
   - `x = frames_inp`: You initialize `x` with the `frames_inp` input tensor.

3. **Masking Layer**:
   - `x = tf.keras.layers.Masking(mask_value=0.0, input_shape=(N_TARGET_FRAMES, N_COLS))(x)`: Here, you apply a masking layer to `x`. This layer masks every frame (timestep) whose features are all equal to `0.0`, the value conventionally used for missing or padded frames, so that downstream mask-aware layers can skip those frames. The `input_shape` parameter specifies the expected shape of the input data.

4. **Embedding Layer**:
   - `x = Embedding()(x)`: You pass the masked input `x` through an `Embedding` layer. This layer applies embeddings to the input data, as explained in a previous code section. It includes positional embeddings and processes the dominant hand landmarks.

5. **Encoder Transformer Blocks**:
   - `x = Encoder(NUM_BLOCKS_ENCODER)(x, frames_inp)`: This step involves processing the embedded input data through an Encoder component composed of multiple Transformer blocks. You pass `x` as the input data and `frames_inp` as additional information. The Encoder applies self-attention mechanisms and feed-forward layers to capture contextual information from the input frames. The resulting `x` represents the encoded information after passing through the Encoder.

6. **Output Shape**:
   - After processing through the Encoder, you print the shape of `x`, which represents the output shape post-encoding, and the value of `UNITS_DECODER`, which indicates the size of the final output.

This part is initial processing of input data and the application of the Encoder component in a Transformer-based model. This is a common setup for sequence-to-sequence tasks, where the input frames are encoded into a meaningful representation before being used for generating an output sequence (e.g., a translation or prediction task).

In [None]:
# Inputs: the frames tensor (float32) and the phrase tensor (int32)
frames_inp = tf.keras.layers.Input([N_TARGET_FRAMES, N_COLS], dtype=tf.float32, name='frames')
phrase_inp = tf.keras.layers.Input([MAX_PHRASE_LENGTH], dtype=tf.int32, name='phrase')
# Frames
x = frames_inp

# Masking: Keras masks every timestep whose features are all equal to mask_value (0.0).
# NOTE(review): the layer does not detect NaN values itself; presumably NaNs were
# replaced by 0.0 during preprocessing -- confirm.
x = tf.keras.layers.Masking(mask_value=0.0, input_shape=(N_TARGET_FRAMES, N_COLS))(x)
print(f' Input shape: {x.shape}')
# Embedding
x = Embedding()(x)

print(f' Shape post Embedding: {x.shape}')
    
# Encoder Transformer Blocks (the unmasked frames input is passed as a second argument)
x = Encoder(NUM_BLOCKS_ENCODER)(x, frames_inp)

print(f' Shape post encoder: {x.shape}')
print(f' Units Decoder: {UNITS_DECODER}')

Here we define the Decoder component of a Transformer-based model.

1. **Decoder Blocks**:
   - `x = Decoder(NUM_BLOCKS_DECODER)(x, phrase_inp, frames_inp)`: Here, we are using the Decoder component, which consists of multiple Transformer blocks. We pass the following inputs:
     - `x`: The output from the Encoder, representing the encoded information of the input frames.
     - `phrase_inp`: The input phrase, which serves as the target sequence that the model aims to generate.
     - `frames_inp`: This input is also provided to the Decoder, but its purpose is not explicitly mentioned in this code snippet.
   - The Decoder processes the encoded input `x` and generates an output sequence. The Decoder blocks include self-attention mechanisms and feed-forward layers designed to generate sequences based on the provided inputs.

2. **Output Shape**:
   - `print(f'Shape post Decoder: {x.shape}')`: After processing through the Decoder, we print the shape of `x`. This shape represents the output of the Decoder, which is expected to have a shape related to the target sequence length and the number of unique characters in the output vocabulary.

3. **Number of Unique Characters**:
   - `print(f'No. of unique characters: {N_UNIQUE_CHARACTERS}')`: We print the number of unique characters in the output vocabulary. This is an important value as it indicates the size of the output space, which typically corresponds to the number of different tokens or characters that the model can generate.

4. **Classifier**:
   - `x = tf.keras.Sequential([...], name='classifier')(x)`: After the Decoder, you apply a sequence of operations to the output `x`. These operations include dropout and a dense layer. The dense layer serves as a classifier, with the number of output neurons equal to the number of unique characters (`N_UNIQUE_CHARACTERS`). This layer essentially performs a mapping from the Decoder's output to the space of possible output characters.

5. **Outputs**:
   - `outputs = x`: The final output of the model is assigned to `outputs`. This output represents the model's predictions for the target sequence.

The Decoder in a sequence-to-sequence model like this one plays a crucial role in generating the output sequence based on the information encoded from the input frames and the provided target phrase. The output is typically a sequence of tokens or characters, and the classifier helps map the intermediate representation to this output sequence.

In [None]:
# Decoder: receives the encoder output, the phrase input and the frames input
x = Decoder(NUM_BLOCKS_DECODER)(x, phrase_inp, frames_inp)
print(f'Shape post Decoder: {x.shape}')    
print(f'No. of unique characters: {N_UNIQUE_CHARACTERS}')
# Classifier head: dropout followed by a bias-free dense layer with linear
# activation, so the model emits logits rather than probabilities
x = tf.keras.Sequential([
    # Dropout
    tf.keras.layers.Dropout(CLASSIFIER_DROPOUT_RATIO), # CLASSIFIER_DROPOUT_RATIO = 0.1
    # Output Neurons: one per token class, i.e. N_UNIQUE_CHARACTERS (62 per the original note)
    tf.keras.layers.Dense(N_UNIQUE_CHARACTERS, activation=tf.keras.activations.linear,
                          kernel_initializer=INIT_HE_UNIFORM, use_bias=False),
    ], name='classifier')(x)
    
# The classifier output is the model output
outputs = x

A deep learning model is defined for a sequence-to-sequence task.

1. **Model Definition**: A TensorFlow/Keras model is created to handle two types of input: `frames_inp` for frames data and `phrase_inp` for phrase data. The model produces an output called `outputs`. The objective is to map sequences of frames and phrases to sequences of characters.

2. **Loss Function**: A custom loss function named `scce_with_ls` is defined. This loss function combines sparse categorical cross-entropy with label smoothing. It's employed to compute the loss during the model training process. Label smoothing is a regularization technique that smooths the target labels to improve the model's generalization.

3. **Optimizer Configuration**: The optimizer for training the model is set up. Specifically, the Rectified Adam optimizer (`tfa.optimizers.RectifiedAdam`) is used with a smoothness threshold of 4. Rectified Adam is an adaptive learning rate optimizer that adjusts learning rates based on the gradient history. Additionally, the Lookahead optimizer (`tfa.optimizers.Lookahead`) is applied to the Rectified Adam optimizer. Lookahead is a technique that can enhance training convergence.

4. **Evaluation Metrics**: A list of evaluation metrics is specified to monitor the model's performance during training. These metrics assess how effectively the model is learning. Two metrics are included:
   - `TopKAccuracy(1)`: This metric calculates the top-1 accuracy, which measures the frequency of the correct token being the model's top prediction for a given sequence.
   - `TopKAccuracy(5)`: This metric computes the top-5 accuracy, which measures the frequency of the correct token being among the top 5 predictions made by the model.

5. **Model Compilation**: The model is compiled by defining its loss function, optimizer, evaluation metrics, and loss weights. The loss weights assign varying importance to different classes, with a weight of 0 assigned to the `PAD_TOKEN`. This ensures that the model does not focus on padding tokens during training.

6. **Model Summary**: A summary of the model's architecture is displayed. This summary provides comprehensive information about each layer in the model, including layer names, output shapes, and the number of trainable parameters. It serves as a useful way to inspect the model's structure and complexity.

Once the model is compiled, it is ready for training. During training, it optimizes its weights to minimize the defined loss function while monitoring the specified evaluation metrics. The ultimate goal is to create a model that accurately maps input frames and phrases to character sequences.

In [None]:
# Create Tensorflow Model from the two inputs and the classifier output
model = tf.keras.models.Model(inputs=[frames_inp, phrase_inp], outputs=outputs)
    
# Categorical Crossentropy Loss With Label Smoothing
loss = scce_with_ls
    
# Rectified Adam optimizer, wrapped in Lookahead (sync_period=5)
optimizer = tfa.optimizers.RectifiedAdam(sma_threshold=4)
optimizer = tfa.optimizers.Lookahead(optimizer, sync_period=5)

# TopK Metrics: top-1 and top-5 accuracy
metrics = [
        TopKAccuracy(1),
        TopKAccuracy(5),]
    
# NOTE(review): Keras documents `loss_weights` in `compile` as weights for the
# loss contributions of different model outputs; confirm that passing a
# per-token vector has the intended effect for this single-output model.
model.compile(
    loss=loss,
    optimizer=optimizer,
    metrics=metrics,
    loss_weights=loss_weights,
    )
model.summary()

The `get_model` function is a convenient way to create and configure a deep learning model for a specific sequence-to-sequence task.

1. **Input Layers**: The function starts by defining two input layers:
   - `frames_inp`: This layer is used for frames data and expects input sequences with a shape of `[N_TARGET_FRAMES, N_COLS]`, where `N_TARGET_FRAMES` represents the number of target frames and `N_COLS` is the number of columns in each frame.
   - `phrase_inp`: This layer is designed for phrase data and expects input sequences with a maximum length of `MAX_PHRASE_LENGTH`.

2. **Frames Data Preprocessing**:
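   - Masking Layer: A masking layer is applied to the frames input. The `mask_value` parameter is set to 0.0, so frames whose features are all 0.0 are treated as masked. Note that the layer does not itself detect NaN (Not-a-Number) values; any NaNs are presumably replaced by 0.0 earlier in the preprocessing pipeline.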

3. **Embedding Layer**: The frames data is passed through an embedding layer created using the `Embedding` class. This layer performs data normalization and landmark embedding to capture information from the frames.

4. **Encoder Transformer Blocks**: The normalized and embedded frames data is then passed through an encoder, which consists of multiple transformer blocks. These blocks apply multi-head self-attention to capture contextual information from the frames. The number of blocks is determined by the `NUM_BLOCKS_ENCODER` parameter.

5. **Decoder**: After encoding the frames, the function proceeds to the decoder, which is also composed of multiple transformer blocks. These blocks handle the generation of character sequences based on the encoded frames and input phrases. The number of decoder blocks is determined by the `NUM_BLOCKS_DECODER` parameter.

6. **Classifier Layer**: Following the decoder, a classifier layer is added. This layer includes dropout regularization with a dropout rate specified by `CLASSIFIER_DROPOUT_RATIO`. It's responsible for producing the final output, which is a sequence of characters. The number of output neurons in this layer matches the number of unique characters (`N_UNIQUE_CHARACTERS`) in the dataset.

7. **Model Compilation**: The function compiles the model with the following configurations:
   - Loss Function: It uses the custom loss function `scce_with_ls`, which combines sparse categorical cross-entropy with label smoothing. Label smoothing discourages over-confident predictions, which tends to improve generalization.
   - Optimizer: The Rectified Adam optimizer (`tfa.optimizers.RectifiedAdam`) is employed with a smoothness threshold of 4. Additionally, the Lookahead optimizer (`tfa.optimizers.Lookahead`) is applied to enhance training convergence.
   - Metrics: Two evaluation metrics are specified:
     - `TopKAccuracy(1)`: Measures the top-1 accuracy, indicating how often the correct token is the model's top prediction.
     - `TopKAccuracy(5)`: Calculates the top-5 accuracy, indicating the frequency of the correct token appearing among the top 5 model predictions.
   - Loss Weights: The function assigns different loss weights to different classes, with a weight of 0 assigned to the `PAD_TOKEN` class. This ensures that padding tokens are not emphasized during training.

8. **Model Creation**: Finally, the function creates a TensorFlow/Keras model with the specified inputs, outputs, loss, optimizer, metrics, and loss weights. The configured model is returned by the function.

One can call this `get_model` function to obtain a pre-configured model for the sequence-to-sequence task.

In [None]:
def get_model():
    """Build and compile the frames + phrase -> token-logits model.

    The frames input is masked, embedded and encoded; the decoder combines the
    encoder output with the phrase input, and a dropout + linear dense head
    produces ``N_UNIQUE_CHARACTERS`` logits. The model is compiled with
    ``scce_with_ls``, a Lookahead-wrapped RectifiedAdam optimizer, top-1/top-5
    accuracy metrics and the module-level ``loss_weights``.

    Returns:
        The compiled ``tf.keras`` model.
    """
    # Model inputs
    frames_in = tf.keras.layers.Input([N_TARGET_FRAMES, N_COLS], dtype=tf.float32, name='frames')
    phrase_in = tf.keras.layers.Input([MAX_PHRASE_LENGTH], dtype=tf.int32, name='phrase')

    # Frames branch: masking -> embedding -> encoder transformer blocks
    masked = tf.keras.layers.Masking(mask_value=0.0, input_shape=(N_TARGET_FRAMES, N_COLS))(frames_in)
    embedded = Embedding()(masked)
    encoded = Encoder(NUM_BLOCKS_ENCODER)(embedded, frames_in)

    # Decoder transformer blocks
    decoded = Decoder(NUM_BLOCKS_DECODER)(encoded, phrase_in, frames_in)

    # Classifier head: dropout, then one linear output neuron per token
    classifier = tf.keras.Sequential(
        [
            tf.keras.layers.Dropout(CLASSIFIER_DROPOUT_RATIO),
            tf.keras.layers.Dense(
                N_UNIQUE_CHARACTERS,
                activation=tf.keras.activations.linear,
                kernel_initializer=INIT_HE_UNIFORM,
                use_bias=False,
            ),
        ],
        name='classifier',
    )
    logits = classifier(decoded)

    compiled = tf.keras.models.Model(inputs=[frames_in, phrase_in], outputs=logits)

    # Rectified Adam wrapped in Lookahead
    lookahead = tfa.optimizers.Lookahead(
        tfa.optimizers.RectifiedAdam(sma_threshold=4),
        sync_period=5,
    )

    compiled.compile(
        loss=scce_with_ls,
        optimizer=lookahead,
        metrics=[TopKAccuracy(1), TopKAccuracy(5)],
        loss_weights=loss_weights,
    )

    return compiled

In [None]:
# Reset the Keras global state, then build a freshly compiled model
tf.keras.backend.clear_session()
model = get_model()

In [None]:
# Print the model summary, expanding nested sub-models
model.summary(expand_nested=True)

In [None]:
# Plot Model Architecture, including shapes, dtypes, layer names, nested models and activations
tf.keras.utils.plot_model(model, show_shapes=True, show_dtype=True, show_layer_names=True, expand_nested=True, show_layer_activations=True)

## Learning Rate Scheduler

The `lrfn` (Learning Rate Function) is a function used for scheduling learning rates during training in deep learning models, particularly for training neural networks with cyclical learning rates. Let's break down this function step by step:

- **Inputs**:
  - `current_step`: The current training step or iteration.
  - `num_warmup_steps`: The number of warm-up steps during which the learning rate gradually increases.
  - `lr_max`: The maximum learning rate to be used during training.
  - `num_cycles`: The number of cycles of learning rate adjustments. Default value is 0.50.
  - `num_training_steps`: The total number of training steps or iterations, typically representing the number of epochs multiplied by the number of steps per epoch.

- **Warm-up Phase**:
  - In the beginning of training, for the first `num_warmup_steps` steps, the learning rate is increased gradually to help the model stabilize and converge more quickly. This is useful to prevent large initial weight updates that might destabilize the training process.
  - There are two methods for warm-up:
    - `'log'` Method: The learning rate is computed as `lr_max * 0.10 ** (num_warmup_steps - current_step)`, so it starts far below `lr_max` and grows by a factor of 10 at each step, climbing towards `lr_max` as the warm-up ends.
    - Default Method: If the method is not `'log'`, the learning rate is computed as `lr_max * 2 ** -(num_warmup_steps - current_step)`, i.e. it doubles at each step until `current_step` reaches `num_warmup_steps`.

- **Main Training Phase**:
  - After the warm-up phase, the learning rate scheduling depends on the number of cycles and the progress through the training.
  - `progress` is a value between 0 and 1 that represents the progress through the training. It's calculated as the fraction of steps completed after the warm-up phase.
  - The learning rate during the main training phase follows a cyclical pattern. It oscillates between a lower bound and `lr_max` based on the cosine of a fraction of cycles completed.

- **Cosine Annealing**:
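  - The learning rate during the main training phase is adjusted using a cosine annealing schedule, `0.5 * (1 + cos(2π · num_cycles · progress)) * lr_max`. With the default `num_cycles=0.50` this is a single half cosine wave: the learning rate decreases smoothly from `lr_max` towards 0 by the end of training. Larger values of `num_cycles` make the learning rate rise back towards `lr_max` and repeat the pattern in a cyclical manner.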
  - The cosine annealing is controlled by the `math.cos` function, and `num_cycles` determines how many times this cycle occurs during training.

- **Final Learning Rate**:
  - The final learning rate at any step is determined by combining the warm-up phase (if applicable) and the cosine annealing phase.

The `lrfn` function is used to schedule learning rates during training. It starts with a warm-up phase to gradually increase the learning rate and then enters a main training phase with cyclical adjustments. This scheduling strategy helps improve training stability and convergence in deep learning models. The specific scheduling method (logarithmic or exponential warm-up, cosine annealing) can be customized based on the requirements of the training task.

In [None]:
def lrfn(current_step, num_warmup_steps, lr_max, num_cycles=0.50, num_training_steps=N_EPOCHS):
    """Return the learning rate for ``current_step``.

    During warm-up (``current_step < num_warmup_steps``) the rate is
    ``lr_max`` scaled down by a power of the number of remaining warm-up
    steps: base 0.10 when ``WARMUP_METHOD == 'log'``, base 2 otherwise.
    Afterwards it follows ``0.5 * (1 + cos(2 * pi * num_cycles * progress))``
    times ``lr_max``, clipped at 0, where ``progress`` is the fraction of
    post-warm-up steps already completed.
    """
    # Warm-up phase
    if current_step < num_warmup_steps:
        steps_left = num_warmup_steps - current_step
        if WARMUP_METHOD == 'log':
            return lr_max * 0.10 ** steps_left
        return lr_max * 2 ** -steps_left

    # Cosine phase
    steps_done = float(current_step - num_warmup_steps)
    steps_total = float(max(1, num_training_steps - num_warmup_steps))
    progress = steps_done / steps_total
    cosine = math.cos(math.pi * float(num_cycles) * 2.0 * progress)

    return max(0.0, 0.5 * (1.0 + cosine)) * lr_max

The `plot_lr_schedule` function is used to visualize the learning rate schedule during training.

- **Inputs**:
  - `lr_schedule`: A list of learning rates scheduled for each training step or epoch.
  - `epochs`: The total number of training epochs.

- **Plotting Learning Rate Schedule**:
  - The function starts by creating a figure (plot) with a specified size (20 units in width and 10 units in height).
  - It plots the learning rate schedule as a line plot using `plt.plot`. The `lr_schedule` list is padded with a `None` value at the beginning and at the end, which shifts the data by one position so that the first learning rate is drawn at x = 1 (epoch 1) rather than at x = 0.
  - It sets the x-axis labels using `np.arange` and `plt.xticks` to label every epoch. If there are many epochs, it only labels every 5 epochs for better readability.
  - The y-axis limit is increased by 10% of the maximum learning rate for better visualization.
  - A title is added to the plot, displaying information about the learning rate schedule, including the initial, maximum, and final learning rates.
  - The learning rates at each epoch are plotted as points on the graph, and the values are annotated next to the points for clarity.

- **X and Y Labels**:
  - The x-axis is labeled as "Epoch" with a specified font size.
  - The y-axis is labeled as "Learning Rate" with a specified font size.
  - A grid is added to the plot.

- **Displaying the Plot**:
  - Finally, the plot is displayed using `plt.show()`.

The purpose of this function is to provide a visual representation of how the learning rate changes during training. This is important for monitoring and debugging the learning rate schedule, ensuring that it behaves as expected throughout training. The plotted graph helps practitioners understand the dynamics of learning rate adjustments and their impact on model training.

In [None]:
def plot_lr_schedule(lr_schedule, epochs):
    """Plot the learning rate schedule and annotate the individual values.

    Args:
        lr_schedule: Non-empty list of learning rates; entry ``i`` is drawn
            at x position ``i + 1``.
        epochs: Number of epochs shown on the x-axis. With more than 40
            epochs only every fifth tick/point (plus the first tick and the
            last point) is labelled.

    Raises:
        ValueError: If ``lr_schedule`` is empty.
    """
    if len(lr_schedule) == 0:
        raise ValueError('lr_schedule must contain at least one learning rate')

    lr_max = max(lr_schedule)
    last_idx = len(lr_schedule) - 1
    # Vertical gap between a marker and its annotation (loop invariant)
    offset_y = (lr_max - min(lr_schedule)) * 0.02

    plt.figure(figsize=(20, 10))
    # Pad with None on both sides so that entry i is drawn at x = i + 1
    plt.plot([None] + lr_schedule + [None])
    # X Labels
    ticks = np.arange(1, epochs + 1)
    tick_labels = [i if epochs <= 40 or i % 5 == 0 or i == 1 else None for i in range(1, epochs + 1)]
    plt.xlim([1, epochs])
    plt.xticks(ticks, tick_labels)  # set tick step to 1 and let x axis start at 1

    # Increase y-limit for better readability
    plt.ylim([0, lr_max * 1.1])

    # Title
    schedule_info = f'start: {lr_schedule[0]:.1E}, max: {lr_max:.1E}, final: {lr_schedule[-1]:.1E}'
    plt.title(f'Step Learning Rate Schedule, {schedule_info}', size=18, pad=12)

    # Plot Learning Rates
    for idx, val in enumerate(lr_schedule):
        # Compare by value (==); identity (`is`) is not reliable for integers
        if not (epochs <= 40 or idx % 5 == 0 or idx == epochs - 1):
            continue
        if idx == 0:
            # First point has no predecessor: do not compare against lr_schedule[-1]
            ha = 'right'
        elif idx < last_idx:
            # Rising segment -> label to the left of the marker, otherwise to the right
            ha = 'right' if lr_schedule[idx - 1] < val else 'left'
        else:
            ha = 'left'
        plt.plot(idx + 1, val, 'o', color='black')
        plt.annotate(f'{val:.1E}', xy=(idx + 1, val + offset_y), size=12, ha=ha)

    plt.xlabel('Epoch', size=16, labelpad=5)
    plt.ylabel('Learning Rate', size=16, labelpad=5)
    plt.grid()
    plt.show()

# Learning rate for every epoch of the training run
LR_SCHEDULE = [
    lrfn(epoch, num_warmup_steps=N_WARMUP_EPOCHS, lr_max=LR_MAX, num_cycles=0.50)
    for epoch in range(N_EPOCHS)
]
# Plot Learning Rate Schedule
plot_lr_schedule(LR_SCHEDULE, epochs=N_EPOCHS)
# Learning Rate Callback: looks up the precomputed rate for the given epoch index
lr_callback = tf.keras.callbacks.LearningRateScheduler(
    lambda epoch: LR_SCHEDULE[epoch],
    verbose=0,
)

## Weight Decay Callback

The `WeightDecayCallback` is a custom callback in TensorFlow that is used to update the weight decay (L2 regularization term) based on the learning rate during training.

- **Initialization**:
  - The constructor (`__init__`) accepts an optional argument `wd_ratio`, which represents the weight decay ratio. It is initialized with a default value (`WD_RATIO`).

- **`on_epoch_begin` Method**:
  - This method is executed at the beginning of each training epoch.
  - It calculates the weight decay value based on the current learning rate and the `wd_ratio`.
  - The weight decay is computed as the product of the learning rate and the `wd_ratio`.
  - It then prints the current learning rate and weight decay to the console for monitoring.

- **Usage**:
  - This callback should be used during training by passing it to the `callbacks` parameter when calling the `fit` method on a Keras model.

This callback allows you to dynamically adjust the weight decay based on the learning rate, which can be beneficial in certain optimization scenarios, especially when using learning rate schedules.

In [None]:
# Custom callback to update weight decay with learning rate
class WeightDecayCallback(tf.keras.callbacks.Callback):
    """Set the optimizer's weight decay to ``learning_rate * wd_ratio`` each epoch.

    Args:
        wd_ratio: Factor applied to the current learning rate to obtain the
            weight decay. Defaults to ``WD_RATIO``.
    """

    def __init__(self, wd_ratio=WD_RATIO):
        # Let the Keras base class set up its own state first
        super().__init__()
        # Not read by this callback; kept so existing attribute access keeps working
        self.step_counter = 0
        self.wd_ratio = wd_ratio

    def on_epoch_begin(self, epoch, logs=None):
        """Update the weight decay from the current learning rate and print both."""
        # Use the model Keras attached to this callback rather than a global
        # variable that merely happens to be called `model`.
        optimizer = self.model.optimizer
        # NOTE(review): with a wrapping optimizer (e.g. Lookahead) confirm that
        # assigning `weight_decay` on the wrapper reaches the inner optimizer.
        optimizer.weight_decay = optimizer.learning_rate * self.wd_ratio
        print(f'learning rate: {optimizer.learning_rate.numpy():.2e}, weight decay: {optimizer.weight_decay.numpy():.2e}')

# Training

In [None]:
# We manually call `gc.collect()` to release unused memory
# before the training cell below builds a new model.
gc.collect()

The next cell runs the core training process.

- **Conditional Training** (`if TRAIN_MODEL`):
  - This code block is wrapped in a conditional statement that checks whether the `TRAIN_MODEL` variable is set to `True`. If it is, the model training process will be executed; otherwise, training will be skipped.

- **Clearing GPU Memory** (`tf.keras.backend.clear_session()`):
  - Before training a new model, it's a good practice to clear the GPU memory to release any resources consumed by previously trained models. The `tf.keras.backend.clear_session()` function is used for this purpose.

- **Getting a Fresh Model** (`model = get_model()`):
  - A new fresh model is obtained by calling the `get_model()` function. This function likely returns a pre-defined model architecture.

- **Model Summary and Sanity Check**:
  - The `model.summary()` function is called to print a summary of the model's architecture to the console. This summary provides information about the layers, their shapes, and the total number of parameters in the model.
  - Additionally, there is a `print('\n\n')` statement that adds some separation between the model summary and the training output for readability.

- **Actual Training** (`model.fit()`):
  - The core training process is executed using the `model.fit()` method. This method trains the model on the training dataset (`train_dataset`) for a specified number of epochs (`N_EPOCHS`).
  - The `steps_per_epoch` argument specifies the number of batches to process in each epoch.
  - If a validation dataset is available (`val_dataset`), it can be used to evaluate the model's performance during training. The `validation_data` and `validation_steps` arguments are used for this purpose.
  - Callbacks, such as the learning rate scheduler (`lr_callback`) and the weight decay callback, can be specified to customize the training process.
  - The `verbose` argument controls the verbosity of the training output. Depending on the value of `VERBOSE`, it can print training progress information to the console.

- **Impact on the Remaining Code**:
  - This code block represents the core training process of the model. It initializes a new model, trains it, and monitors its progress. Removing this part of the code would mean that model training does not occur.

Overall, whether the model is trained depends on the value of `TRAIN_MODEL`. If `TRAIN_MODEL` is `True`, the training process is executed as described above; if it is `False`, training is skipped.

In [None]:
# Train a fresh model; `model` and `history` are only (re)bound here when
# TRAIN_MODEL is truthy.
if TRAIN_MODEL:
    # Reset Keras' global state so previously built models do not linger
    tf.keras.backend.clear_session()

    # Get new fresh model
    model = get_model()

    # Sanity Check: print the architecture, then two blank lines as a separator
    model.summary()
    print('\n\n')
    # Actual Training
    history = model.fit(
            x=train_dataset,
            steps_per_epoch=TRAIN_STEPS_PER_EPOCH,
            epochs=N_EPOCHS,
            # Validation inputs are only passed when USE_VAL is truthy;
            # otherwise both arguments are None and `val_dataset` is never evaluated.
            # (Original note: training data is a generator.)
            validation_data=val_dataset if USE_VAL else None,
            validation_steps=N_VAL_STEPS_PER_EPOCH if USE_VAL else None,
            # lr_callback is listed first, ahead of the weight-decay callback
            callbacks=[
                lr_callback,
                WeightDecayCallback(),
            ],
            verbose = VERBOSE,
        )

In [None]:
# Load Weights
# Optionally replace the current model's weights with a pretrained checkpoint.
if LOAD_WEIGHTS: # LOAD_WEIGHTS is set to False
    model.load_weights('/kaggle/input/aslfr-training-python37/model.h5')
    print(f'Successfully Loaded Pretrained Weights')

In [None]:
# Save Model Weights to 'model.h5' (a relative path, resolved against the working directory)
model.save_weights('model.h5')

Now, we evaluate the loaded model on the specified dataset and calculate the loss and metrics of the model on that dataset. This step helps us verify that the model is loaded correctly and ready for use. Since `train_dataset` is a generator object, each time we access it, we obtain a new batch of different data. Therefore, we are not evaluating with the same data that we used for training.

The `evaluate()` function prints three values to the screen in the form of a list:

- Loss (Sparse Categorical Cross Entropy with Label Smoothing in our case): This measures how well the model's predictions fit the true labels during training. A lower loss indicates a better fit of the model to the training data.

- Top-1 Accuracy (TopKAccuracy(1)): This is the top-1 accuracy, also known as the accuracy in the ranking of the highest probability. It represents the fraction of samples in which the model correctly predicts the true class as the class with the highest probability. In other words, it is the accuracy for the case where only the most probable class is taken as the prediction.

- Top-5 Accuracy (TopKAccuracy(5)): This is the top-5 accuracy, which represents the fraction of samples in which the model correctly predicts the true class as one of the five classes with the highest probabilities. In other words, it considers the top five most probable classes and checks if the true class is present in those five classes.

In [None]:
# Verify Model is Loaded Correctly
# Evaluate on the validation data when USE_VAL is truthy, otherwise on the training data.
# `batch_size` is deliberately not passed: the input already yields batches, and
# Keras documents that `batch_size` should not be specified for dataset/generator
# inputs. The number of evaluated batches is bounded by `steps`.
model.evaluate(
    val_dataset if USE_VAL else train_dataset,
    steps=N_VAL_STEPS_PER_EPOCH if USE_VAL else TRAIN_STEPS_PER_EPOCH,
    verbose=VERBOSE,
)

# Performance

## Levenshtein Distance


Levenshtein Distance, also known as the Edit Distance, is a metric used to measure the similarity or difference between two strings. Specifically, it measures the minimum number of single-character edits (insertions, deletions, or substitutions) required to transform one string into another.

For example, let's say we have two strings: "kitten" and "sitting." The Levenshtein Distance between these two strings is 3 because you can transform "kitten" into "sitting" with the following three edits:

1. Substitute 'k' with 's': "sitten"
2. Substitute 'e' with 'i': "sittin"
3. Add 'g' at the end: "sitting"

Levenshtein Distance is commonly used in various natural language processing tasks, including spell checking, DNA sequence analysis, and text similarity measurement. In the context of text generation or translation, it can be used to evaluate how similar a generated text is to a reference text. Smaller Levenshtein Distance values indicate greater similarity between two strings.

Here we use Levenshtein Distance to compare the model's predicted phrase (output) to the ground truth or reference phrase. This can help you quantify the accuracy of the generated text by measuring the number of edits needed to make the generated text match the reference text.


The `outputs2phrase` function takes model outputs, which are typically one-hot encoded or logits, and converts them into a human-readable string. Here's an explanation of how it works:

- If the `outputs` array has two dimensions, each row is treated as the scores (one-hot values or logits) for one character position. In this case, the function uses `np.argmax(outputs, axis=1)` to find the index of the highest value in each row; this index corresponds to the predicted character/token. Input with any other number of dimensions (for example a 1-D array of indices) is used as is.

- Next, the function iterates through the predicted indices and uses the `num_to_char` dictionary to map each index to its corresponding character/token. It appends these characters/tokens together to form a string.

In essence, `outputs2phrase` decodes the model's output into a sequence of characters or tokens that represent a predicted phrase or sequence.


In [None]:
# Decode model outputs (or token indices) into a string
def outputs2phrase(outputs):
    """Turn per-position scores or token indices into a phrase string.

    A 2-D ``outputs`` is reduced to one index per row with ``np.argmax``; any
    other input is iterated as is. Every index is looked up in
    ``num_to_char`` and indices without an entry contribute nothing.
    """
    token_ids = np.argmax(outputs, axis=1) if outputs.ndim == 2 else outputs
    characters = (num_to_char.get(token, '') for token in token_ids)
    return ''.join(characters)

The `predict_phrase` function is a TensorFlow function decorated with `@tf.function()`, which is used to create a graph from the Python code for optimized execution. Here's an explanation of how this function works:

1. **Input Preparation**:
   - The function takes a single argument, `frames`, which is a tensor containing input frames.
   - It adds a batch dimension to the frames tensor using `tf.expand_dims(frames, axis=0)` to make it compatible with the model's input shape.

2. **Initialization**:
   - It initializes an empty tensor `phrase` of shape `[1, MAX_PHRASE_LENGTH]` filled with PAD_TOKEN values. This tensor will be used to build the predicted phrase.

3. **Prediction Loop**:
   - The function enters a loop that iterates for a maximum of `MAX_PHRASE_LENGTH` times. This loop is used to predict each token in the phrase.
   - Inside the loop, it casts the `phrase` tensor to `int8` data type.

   - It passes the `frames` and `phrase` tensors to the model as inputs using `model(...)`.
   
   - It uses `tf.argmax(outputs, axis=2, output_type=tf.int32)` to find the index (token) with the highest probability in the model's output (`outputs`). This predicted token is added to the `phrase` tensor at the appropriate position, effectively extending the predicted phrase.
   
   - A mask is created with `tf.range(MAX_PHRASE_LENGTH) < idx + 1`, which evaluates to `True` for positions in the phrase that have been predicted so far (up to the current index).
   
   - The `tf.where` function updates the `phrase` tensor: it keeps the original PAD_TOKEN values where the mask is `False` (indicating positions that haven't been predicted yet) and replaces positions where the mask is `True` with the predicted token.

4. **Output Preparation**:
   - After the loop, the function squeezes the batch dimension from the `phrase` tensor using `tf.squeeze(phrase, axis=0)` to get a tensor of shape `[MAX_PHRASE_LENGTH]`.
   
   - It then performs one-hot encoding on this tensor using `tf.one_hot(outputs, N_UNIQUE_CHARACTERS)` to obtain a one-hot encoded representation of the predicted phrase.

5. **Return**:
   - The function returns the one-hot encoded tensor containing the predicted phrase.

This function is designed to generate a phrase by iteratively predicting tokens one by one based on the input frames and the model's predictions. It uses a loop to build the phrase, updating the predicted token at each step. The final output is a one-hot encoded representation of the predicted phrase.

In [None]:
@tf.function()
def predict_phrase(frames):
    """Greedily decode a phrase for a single sample of frames.

    The phrase starts as a row of PAD_TOKEN values. The global ``model`` is then
    called MAX_PHRASE_LENGTH times (there is no early exit); after call ``idx``
    the positions ``0..idx`` of the phrase are replaced by the argmax of the
    model output at those positions, while later positions keep their value.

    Args:
        frames: Frames of one sample, without a batch dimension (one is added here).

    Returns:
        One-hot encoding of the decoded token ids, of shape
        [MAX_PHRASE_LENGTH, N_UNIQUE_CHARACTERS].
    """
    # Add Batch Dimension
    frames = tf.expand_dims(frames, axis=0)
    # Start Phrase: one row with every position set to PAD_TOKEN
    phrase = tf.fill([1,MAX_PHRASE_LENGTH], PAD_TOKEN)

    for idx in tf.range(MAX_PHRASE_LENGTH):
        # Cast phrase to int8 before feeding it to the model
        # (presumably the dtype of the model's 'phrase' input - TODO confirm)
        phrase = tf.cast(phrase, tf.int8)
        # Predict Next Token
        outputs = model({
            'frames': frames,
            'phrase': phrase,
        })

        # Add predicted token to input phrase
        # Cast back to int32 so the phrase matches the int32 argmax result below
        phrase = tf.cast(phrase, tf.int32)
        phrase = tf.where( # take the argmax where the mask is True, keep the current value where it is False
            tf.range(MAX_PHRASE_LENGTH) < idx + 1, # mask: True for positions 0..idx
            tf.argmax(outputs, axis=2, output_type=tf.int32), # most probable class per position (assumes outputs is [1, MAX_PHRASE_LENGTH, classes] - TODO confirm)
            phrase,
        )

    # Squeeze outputs
    outputs = tf.squeeze(phrase, axis=0) # drop the batch dimension
    outputs = tf.one_hot(outputs, N_UNIQUE_CHARACTERS) # one-hot encoding of the token ids

    # Return the one-hot encoded tensor (a tensor, not a dictionary)
    return outputs

## Levenshtein's distance evaluation on training data

Here we compute Levenshtein distances between predicted and true phrases and organize the results into a DataFrame.

1. **Initialization**:
   - The variable `N` is set to 100 if `IS_INTERACTIVE` is true (or 1000 otherwise). This determines how many samples will be processed.
   - An empty list `LD_TRAIN` is initialized to store dictionaries containing information about each sample's true phrase, predicted phrase, and Levenshtein distance.

2. **Loop Over Data**:
   - The code iterates over the training data using a `for` loop and the `zip` function, combining input frames (`frames`) and true phrases (`phrase_true`) for each sample.
   - For each sample, it does the following:
      - Calls the `predict_phrase` function to generate a predicted phrase based on the input frames. This prediction is converted to a string.
      - Converts the true phrase from its ordinal representation to a string.
      - Computes the Levenshtein distance between the predicted and true phrases using the `levenshtein` function (imported from the `leven` package).

3. **DataFrame Creation**:
   - For each sample, a dictionary is created containing the following:
     - `'phrase_true'`: The true phrase as a string.
     - `'phrase_pred'`: The predicted phrase as a string.
     - `'levenshtein_distance'`: The Levenshtein distance between the predicted and true phrases.
   - Each dictionary is appended to the `LD_TRAIN` list.

4. **Subset Selection**:
   - If the current index (`idx`) reaches the value of `N`, the loop stops, creating a subset of `N` samples. This is useful in interactive mode to limit the number of processed samples.

5. **DataFrame Conversion**:
   - The `LD_TRAIN` list of dictionaries is converted into a Pandas DataFrame named `LD_TRAIN_DF`.

6. **Adding Length Column**:
   - A new column `'len_char'` is added to the DataFrame using the `apply` function. This column stores the length (number of characters) of the true phrases.

7. **Displaying Errors**:
   - The code displays the first 25 rows of the `LD_TRAIN_DF` DataFrame, showing information about true phrases, predicted phrases, Levenshtein distances, and phrase lengths.

We process a subset of the training data (controlled by `N`), calculate Levenshtein distances between predicted and true phrases, and present the results in a DataFrame for further analysis and inspection of errors in phrase predictions.

In [None]:
# Compute Levenshtein Distances
def get_ld_train():
    """Score phrase predictions on the first N training samples.

    N is 100 when ``IS_INTERACTIVE`` is truthy and 1000 otherwise. Each sample
    of ``X_train``/``y_train`` is decoded with ``predict_phrase`` and compared
    with its true phrase using ``levenshtein``. Fewer than N rows are produced
    only if the data runs out first.

    Returns:
        pandas.DataFrame with one row per evaluated sample and the columns
        ``phrase_true``, ``phrase_pred`` and ``levenshtein_distance``.
    """
    # Function-scope import keeps this cell independent of the notebook's import cell
    from itertools import islice

    N = 100 if IS_INTERACTIVE else 1000
    LD_TRAIN = []
    # islice stops after exactly N samples. The previous `if idx == N: break`
    # check ran after the append and therefore scored N + 1 samples while the
    # progress bar was sized for N.
    samples = islice(zip(X_train, y_train), N)
    for frames, phrase_true in tqdm(samples, total=N):
        # Predict Phrase and Convert to String
        phrase_pred = predict_phrase(frames).numpy()
        phrase_pred = outputs2phrase(phrase_pred)
        # True Phrase Ordinal to String
        phrase_true = outputs2phrase(phrase_true)
        # Add Levenshtein Distance
        LD_TRAIN.append({
            'phrase_true': phrase_true,
            'phrase_pred': phrase_pred,
            'levenshtein_distance': levenshtein(phrase_pred, phrase_true),
        })

    # Convert to DataFrame
    LD_TRAIN_DF = pd.DataFrame(LD_TRAIN)

    return LD_TRAIN_DF

In [None]:
# Score the training subset
LD_TRAIN_DF = get_ld_train()

# Character count of every true phrase
LD_TRAIN_DF['len_char'] = LD_TRAIN_DF['phrase_true'].apply(len)

# Show the first 25 rows to inspect the errors
display(LD_TRAIN_DF.head(25))

Now, we calculate the distribution of Levenshtein distances in the training set and create a bar chart to visualize how these distances are distributed in the dataset.

In [None]:
# Value Counts: one zero-initialised bin per integer distance from 0 to the observed maximum
LD_TRAIN_VC = dict.fromkeys(range(LD_TRAIN_DF['levenshtein_distance'].max() + 1), 0)
for ld in LD_TRAIN_DF['levenshtein_distance']:
    LD_TRAIN_VC[ld] = LD_TRAIN_VC[ld] + 1

# Bar chart of the distribution, with the mean distance in the title
plt.figure(figsize=(15, 8))
pd.Series(LD_TRAIN_VC).plot(kind='bar', width=1)
plt.xlabel('Levenstein Distance')
plt.ylabel('Sample Count')
plt.title(f'Train Levenstein Distance Distribution | Mean: {LD_TRAIN_DF.levenshtein_distance.mean():.4f}')
plt.xlim(-0.50, LD_TRAIN_DF.levenshtein_distance.max() + 0.50)
plt.grid(axis='y')
# Write the figure to disk before showing it
plt.savefig('temp.png')
plt.show()

## Levenshtein's distance evaluation on Validation data

This part is similar to the previous evaluation on training data that computes Levenshtein distances between predicted and true phrases. However, it is applied to the validation dataset (X_val and y_val).

In [None]:
# Compute Levenshtein Distances
def get_ld_val():
    """Score phrase predictions on the first N validation samples.

    N is 100 when ``IS_INTERACTIVE`` is truthy and 1000 otherwise. Each sample
    of ``X_val``/``y_val`` is decoded with ``predict_phrase`` and compared with
    its true phrase using ``levenshtein``. Fewer than N rows are produced only
    if the data runs out first.

    Returns:
        pandas.DataFrame with one row per evaluated sample and the columns
        ``phrase_true``, ``phrase_pred`` and ``levenshtein_distance``.
    """
    # Function-scope import keeps this cell independent of the notebook's import cell
    from itertools import islice

    N = 100 if IS_INTERACTIVE else 1000
    LD_VAL = []
    # islice stops after exactly N samples. The previous `if idx == N: break`
    # check ran after the append and therefore scored N + 1 samples while the
    # progress bar was sized for N.
    samples = islice(zip(X_val, y_val), N)
    for frames, phrase_true in tqdm(samples, total=N):
        # Predict Phrase and Convert to String
        phrase_pred = predict_phrase(frames).numpy()
        phrase_pred = outputs2phrase(phrase_pred)
        # True Phrase Ordinal to String
        phrase_true = outputs2phrase(phrase_true)
        # Add Levenshtein Distance
        LD_VAL.append({
            'phrase_true': phrase_true,
            'phrase_pred': phrase_pred,
            'levenshtein_distance': levenshtein(phrase_pred, phrase_true),
        })

    # Convert to DataFrame
    LD_VAL_DF = pd.DataFrame(LD_VAL)

    return LD_VAL_DF

In [None]:
# Score the validation subset; skipped entirely (LD_VAL_DF is not bound) unless USE_VAL is truthy
if USE_VAL:
    LD_VAL_DF = get_ld_val()

    # Display Errors: first 25 rows of true phrase, predicted phrase and distance
    display(LD_VAL_DF.head(25))

In [None]:
# Value Counts: one zero-initialised bin per integer distance from 0 to the observed maximum
if USE_VAL:
    LD_VAL_VC = dict.fromkeys(range(LD_VAL_DF['levenshtein_distance'].max() + 1), 0)
    for ld in LD_VAL_DF['levenshtein_distance']:
        LD_VAL_VC[ld] = LD_VAL_VC[ld] + 1

    # Bar chart of the distribution, with the mean distance in the title
    plt.figure(figsize=(15, 8))
    pd.Series(LD_VAL_VC).plot(kind='bar', width=1)
    plt.xlabel('Levenstein Distance')
    plt.ylabel('Sample Count')
    plt.title(f'Validation Levenstein Distance Distribution | Mean: {LD_VAL_DF.levenshtein_distance.mean():.4f}')
    plt.xlim(-0.50, LD_VAL_DF.levenshtein_distance.max() + 0.50)
    plt.grid(axis='y')
    # Write the figure to disk before showing it
    plt.savefig('temp.png')
    plt.show()

## Training History

Once the model is trained and predictions are created for our data, a function called `plot_history_metric` is defined to plot the evolution of different metrics used in the training.

In [None]:
def plot_history_metric(metric, f_best=np.argmax, ylim=None, yscale=None, yticks=None):
    """Plot the per-epoch values of one metric from the global ``history``.

    Does nothing unless ``TRAIN_MODEL`` is truthy. The training curve is always
    drawn; the validation curve is drawn only when ``history.history`` contains
    the key ``f'val_{metric}'``. The best epoch of each curve, as selected by
    ``f_best``, is highlighted with a marker.

    Args:
        metric: Key of the metric in ``history.history`` (e.g. ``'loss'``).
        f_best: Function returning the index of the best value in a sequence
            (``np.argmax`` by default; pass ``np.argmin`` for losses).
        ylim: Optional y-axis limits passed to ``plt.ylim``.
        yscale: Optional y-axis scale passed to ``plt.yscale``.
        yticks: Optional y-axis tick positions passed to ``plt.yticks``.
    """
    # Only plot when training
    if not TRAIN_MODEL:
        return

    plt.figure(figsize=(20, 10))

    values = history.history[metric]
    # Local name chosen so it does not shadow the notebook-level N_EPOCHS constant
    n_epochs = len(values)
    # Look for this metric's own validation key. A substring search over all
    # key names can match unrelated metrics and then fail on the lookup below.
    val_key = f'val_{metric}'
    has_val = val_key in history.history

    # Epoch Ticks: every epoch up to 20 epochs, otherwise 1, 5, 10, 15, ...
    if n_epochs <= 20:
        tick_positions = np.arange(1, n_epochs + 1)
    else:
        tick_positions = [1, 5] + [10 + 5 * idx for idx in range((n_epochs - 10) // 5 + 1)]

    # Epochs are shown 1-based on the x axis
    epochs = np.arange(1, n_epochs + 1)

    # Validation (drawn first, so the line order and colours match the previous behaviour)
    if has_val:
        val_values = history.history[val_key]
        val_best = f_best(val_values)
        plt.plot(epochs, val_values, label='val')

    # Training curve and its best epoch
    plt.plot(epochs, values, label='train')
    train_best = f_best(values)
    plt.scatter(train_best + 1, values[train_best], color='red', s=75, marker='o', label='train_best')
    if has_val:
        plt.scatter(val_best + 1, val_values[val_best], color='purple', s=75, marker='o', label='val_best')

    plt.title(f'Model {metric}', fontsize=24, pad=10)
    plt.ylabel(metric, fontsize=20, labelpad=10)

    # `is not None` (as for yscale/yticks) so array-like limits are accepted too
    if ylim is not None:
        plt.ylim(ylim)

    if yscale is not None:
        plt.yscale(yscale)

    if yticks is not None:
        plt.yticks(yticks, fontsize=16)

    plt.xlabel('epoch', fontsize=20, labelpad=10)
    plt.tick_params(axis='x', labelsize=8)
    plt.xticks(tick_positions, fontsize=16) # x axis starts at epoch 1
    plt.yticks(fontsize=16)

    plt.legend(prop={'size': 10})
    plt.grid()
    plt.show()

In [None]:
# List the names of the metrics recorded during training.
history.history.keys() # original note: there is no 'val' entry among them

In [None]:
# Plot the 'loss' history; np.argmin marks the lowest value as the best epoch
plot_history_metric('loss', f_best=np.argmin)

In [None]:
# Plot the 'top1acc' history on a fixed [0, 1] y axis with ticks every 0.1
plot_history_metric('top1acc', ylim=[0,1], yticks=np.arange(0.0, 1.1, 0.1))

In [None]:
# Plot the 'top5acc' history on a fixed [0, 1] y axis with ticks every 0.1
plot_history_metric('top5acc', ylim=[0,1], yticks=np.arange(0.0, 1.1, 0.1))