## Imports

In [None]:
!pip install -q tensorflow_addons
!pip install -q pyarrow
!pip install -q wandb

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
tf.get_logger().setLevel('INFO')
import tensorflow_addons as tfa
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sn

from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split, GroupShuffleSplit

import glob
import sys
import os
import math
import gc
import sys
import sklearn
import scipy
import io
import json
import datetime
import wandb

## Utils

In [None]:
# Prints Shape and Dtype For List Of Variables
def print_shape_dtype(l, names):
    """
    Print one ``<name> shape: <shape>, dtype: <dtype>`` line per array.
    Args:
        l: Sequence of objects exposing ``.shape`` and ``.dtype``.
        names: Display names, paired positionally with the entries of ``l``.
    """
    for label, arr in zip(names, l):
        shape, dtype = arr.shape, arr.dtype
        print(f'{label} shape: {shape}, dtype: {dtype}')

def pd_read_s3_parquet(key, bucket, s3_client=None, **args):
    """
    Read a parquet object from S3 into a pandas DataFrame.
    Args:
        key: Key of the parquet object inside the bucket.
        bucket: Name of the AWS S3 bucket to fetch the object from.
        s3_client: Boto3 S3 client to use. When None, a new client is
            created for this call.
        **args: Extra keyword arguments forwarded to ``pd.read_parquet``
            (for example ``columns``).
    Returns:
        The DataFrame produced by ``pd.read_parquet``.
    """
    if s3_client is None:
        # Imported locally: this helper is defined before the notebook's
        # `import boto3` cell, so the default path must not rely on a name
        # that only exists after a later cell has run.
        import boto3
        s3_client = boto3.client('s3')
    obj = s3_client.get_object(Bucket=bucket, Key=key)
    # The object body is read fully into memory and wrapped in BytesIO
    # before being handed to pandas.
    return pd.read_parquet(io.BytesIO(obj['Body'].read()), **args)

def load_relevant_data_subset(pq_path):
    """
    Load one landmark sample from S3 as a float32 array.
    Args:
        pq_path: Path to the parquet object in the form produced by
            ``get_file_path`` (``'<bucket>/<key>'``).
    Returns:
        np.float32 array of shape (n_frames, ROWS_PER_FRAME, 2) holding the
        'x' and 'y' columns of the sample.
    """
    data_columns = ['x', 'y']
    # Strip the leading '<bucket>/' to recover the object key. The prefix is
    # derived from the bucket name instead of a hard-coded character count,
    # so renaming the bucket cannot silently corrupt the key.
    bucket_prefix = f'{AWS_S3_BUCKET}/'
    key = pq_path[len(bucket_prefix):] if pq_path.startswith(bucket_prefix) else pq_path
    data = pd_read_s3_parquet(key, AWS_S3_BUCKET, columns=data_columns)
    # Rows are grouped into frames of ROWS_PER_FRAME landmarks each; the
    # reshape below raises if the row count is not an exact multiple.
    n_frames = len(data) // ROWS_PER_FRAME
    data = data.values.reshape(n_frames, ROWS_PER_FRAME, len(data_columns))
    return data.astype(np.float32)

def get_file_path(path):
    """
    Build the bucket-qualified location of a sample's parquet file.
    Args:
        path: Parquet path relative to the dataset root (the ``path`` column
            of the training metadata).
    Returns:
        The string ``'<AWS_S3_BUCKET>/raw-data/<path>'``.
    """
    segments = (f'{AWS_S3_BUCKET}', 'raw-data', f'{path}')
    return '/'.join(segments)

## Config

In [None]:
# Training parameters.
# Single dict of hyper-parameters; the same object is passed to wandb.init as
# the run configuration, and the module-level constants below are derived from it.
config = {
    "TRANSFORMER" : 'V2',
    "PREPROCESS" : 'V1',
    "N_COLS0": 543,
    "N_COLS": 227,  # Landmarks kept after filtering: 160 face + 21 left hand + 25 pose + 21 right hand.
    "N_ROWS":543,  # Landmark rows per frame in the raw parquet files (see ROWS_PER_FRAME).
    "N_DIMS": 2,  # x and y.
    "N_EPOCHS": 150,
    "TRAIN_BATCH_SIZE": 512, #128,
    "INPUT_SIZE":38,  # Number of frames every sample is padded / downsampled to.
    "NUM_CLASSES": 250,
    "VAL_BATCH_SIZE": 512, #128,
    "WD_RATIO":0.05,  # Weight decay is set to learning_rate * WD_RATIO by WeightDecayCallback.
    "LEARNING_RATE": 0.0001, #0.00001,
    "WEIGHT_DECAY": 0.0001, #0.00001,
    "N_WARMUP_EPOCHS": 2,
    'LAYER_NORM_EPS' : 1e-6,
    'LANDMARK_UNITS' : 384,
    'TOTAL_UNITS' : 512,
    'NUM_BLOCKS' : 2,
    'MLP_RATIO' : 2,
    'NUM_HEADS' : 4,
    'EMBEDDING_DROPOUT' : 0.00,
    'MLP_DROPOUT_RATIO' : 0.10,
    'CLASSIFIER_DROPOUT_RATIO' : 0.10
}

# Verbose output.
VERBOSE = True

DIM_NAMES = ['x', 'y']

# Seed used for the train/test split.
# NOTE(review): within this notebook SEED is only passed to train_test_split;
# the TensorFlow / NumPy global seeds are not set, so training itself is not
# seeded — confirm whether that is intended.
SEED = 42

# Epsilon value for layer normalisation.
LAYER_NORM_EPS = config['LAYER_NORM_EPS']
# Dense layer units for landmarks (face, hands and pose all share LANDMARK_UNITS).
FACE_UNITS = config['LANDMARK_UNITS']
HANDS_UNITS = config['LANDMARK_UNITS']
POSE_UNITS = config['LANDMARK_UNITS']
# Final embedding and transformer embedding size.
UNITS = config['TOTAL_UNITS']
# Transformer.
NUM_BLOCKS = config['NUM_BLOCKS']
MLP_RATIO = config['MLP_RATIO']
NUM_HEADS = config['NUM_HEADS']
# Dropout.
EMBEDDING_DROPOUT = config['EMBEDDING_DROPOUT']
MLP_DROPOUT_RATIO = config['MLP_DROPOUT_RATIO']
CLASSIFIER_DROPOUT_RATIO = config['CLASSIFIER_DROPOUT_RATIO']
# Number of landmarks per frame.
ROWS_PER_FRAME = config['N_ROWS'] 

# Initializers.
INIT_GLOROT_UNIFORM = tf.keras.initializers.glorot_uniform
# Activations.
ACTIVATION = tf.keras.activations.gelu

In [None]:
# Setup Weights and Biases
# Authenticate this session with Weights & Biases.
wandb.login()

# Directory for TensorBoard logs; wandb is patched to pick up logs written
# under this root, and the TensorBoard callback writes to the same place.
LOG_DIR = "./logs/fit/"
wandb.tensorboard.patch(root_logdir=LOG_DIR)
# Start the run and record the hyper-parameter dict as its configuration.
wandb.init(project='w251-GISLR-Submission', 
           config=config)

## Data Processing

### Read Samples Metadata

In [None]:
import boto3

# Create client to connect to s3.
# NOTE(review): this client is not passed to pd_read_s3_parquet anywhere in
# this notebook, so each sample load creates its own client — confirm.
s3_client = boto3.client("s3")

# Name of the S3 bucket holding the raw parquet files.
AWS_S3_BUCKET = "w251-asl-data"

# Load metadata that contains file paths.
train_metadata = pd.read_csv("../resources/train.csv")

# train_metadata = train_metadata.head(100)

# Total number of samples in the dataset.
N_SAMPLES = len(train_metadata)
print(f'N_SAMPLES: {N_SAMPLES}')

# Add full AWS S3 file path to each sample.
train_metadata['file_path'] = train_metadata['path'].apply(get_file_path)

# Add ordinally encoded sign (assign a number to each sign name via the
# pandas categorical codes of the 'sign' column).
train_metadata['sign_ord'] = train_metadata['sign'].astype('category').cat.codes

# Dictionaries to translate sign <-> ordinal encoded sign
SIGN2ORD = train_metadata[['sign', 'sign_ord']].set_index('sign').squeeze().to_dict()
ORD2SIGN = train_metadata[['sign_ord', 'sign']].set_index('sign_ord').squeeze().to_dict()

### Preprocess Data

#### Preprocess Layer

In [None]:
class PreprocessLayer(tf.keras.layers.Layer):
    """
    Converts one raw sample of shape (n_frames, 543, 2) into a fixed-length input.

    The layer keeps a subset of landmarks (face, left hand, pose, right hand),
    normalises each group separately, replaces NaN values with 0 and returns
    exactly ``INPUT_SIZE`` frames: shorter samples are padded, longer ones are
    downsampled with nearest-neighbour interpolation.

    NOTE: the gathered landmark order is face, left hand, pose, right hand so
    that it agrees with the ``*_START`` offsets. Arrays cached with a different
    landmark order must be regenerated.

    Args:
        INPUT_SIZE: Number of frames every sample is padded / downsampled to.
        **kwargs: Standard Keras layer arguments (name, dtype, ...), accepted
            so that the output of ``get_config`` can rebuild the layer.
    """
    def __init__(self, INPUT_SIZE, **kwargs):
        super(PreprocessLayer, self).__init__(**kwargs)
        self.INPUT_SIZE = INPUT_SIZE
        # Indices in original data.
        self.FACE_IDXS = tf.constant([0, 6, 7, 11, 12, 13, 14, 15, 17, 22, 23, 24, 25, 26, 30, 31, 
                     33, 37, 38, 39, 40, 41, 42, 56, 61, 62, 72, 73, 74, 76, 77, 
                     78, 80, 81, 82, 84, 86, 87, 88, 89, 90, 91, 95, 96, 110, 112, 
                     113, 122, 128, 130, 133, 144, 145, 146, 153, 154, 155, 157, 158, 
                     159, 160, 161, 163, 168, 173, 178, 179, 180, 181, 183, 184, 185, 
                     188, 189, 190, 191, 193, 196, 197, 232, 233, 243, 244, 245, 246, 
                     247, 249, 252, 253, 254, 255, 256, 259, 260, 263, 267, 268, 269, 
                     270, 271, 272, 286, 291, 292, 302, 303, 304, 306, 307, 308, 310, 
                     311, 312, 314, 316, 317, 318, 319, 320, 321, 324, 325, 339, 341, 
                     351, 357, 359, 362, 373, 374, 375, 380, 381, 382, 384, 385, 386, 
                     387, 388, 390, 398, 402, 403, 404, 405, 407, 408, 409, 412, 413, 
                     414, 415, 417, 419, 453, 463, 464, 465, 466, 467], dtype=tf.int32)
        self.POSE_IDXS = tf.constant(tf.range(489, 514, delta=1, dtype=tf.int32))
        self.LEFT_HAND_IDXS = tf.constant(tf.range(468, 489, delta=1, dtype=tf.int32))
        self.RIGHT_HAND_IDXS = tf.constant(tf.range(522, 543, delta=1, dtype=tf.int32))

        # All landmarks that are used for modeling. The concatenation order
        # MUST match the offsets below (face, left hand, pose, right hand);
        # otherwise the per-group slices in call() pick up the wrong landmarks.
        self.LANDMARK_IDXS = tf.constant(tf.concat([self.FACE_IDXS, self.LEFT_HAND_IDXS, self.POSE_IDXS, self.RIGHT_HAND_IDXS], 0), dtype=tf.int32)
        # Total number of landmarks kept per frame.
        self.N_LANDMARKS = tf.constant(len(self.LANDMARK_IDXS), dtype=tf.int32)

        # Offsets of each group after the landmarks have been filtered.
        self.FACE_START = tf.constant(0, dtype=tf.int32)
        self.LEFT_HAND_START = tf.constant(len(self.FACE_IDXS), dtype=tf.int32)
        self.POSE_START = tf.constant(self.LEFT_HAND_START + len(self.LEFT_HAND_IDXS), dtype=tf.int32)
        self.RIGHT_HAND_START = tf.constant(self.POSE_START + len(self.POSE_IDXS), dtype=tf.int32)

    def get_config(self):
        """Return the layer config, extended with ``INPUT_SIZE``."""
        config = super().get_config().copy()
        config.update({
            'INPUT_SIZE': self.INPUT_SIZE,
        })
        return config

    @tf.function(input_signature=(tf.TensorSpec(shape=[None, 543, 2], dtype=tf.float32),),)
    def call(self, data):
        """
        Preprocess a single sample.
        Args:
            data: float32 tensor of shape (n_frames, 543, 2).
        Returns:
            Tuple ``(data, non_empty_frames_idxs)``: ``data`` is reshaped to
            (INPUT_SIZE, n_landmarks * 2); ``non_empty_frames_idxs`` has
            INPUT_SIZE entries holding the frame index, or -1 for padded frames.
        """
        N_FRAMES = tf.shape(data)[0]
        data = tf.gather(data, self.LANDMARK_IDXS, axis=1)

        # Slice out face landmarks, L2-normalise along the landmark axis.
        face = tf.slice(data, [0, self.FACE_START, 0], [N_FRAMES, self.LEFT_HAND_START, 2])
        face = tf.keras.utils.normalize(face, axis=1, order=2)

        # Slice out left hand landmarks, L2-normalise along the landmark axis.
        left_hand = tf.slice(data, [0, self.LEFT_HAND_START, 0], [N_FRAMES, self.POSE_START-self.LEFT_HAND_START, 2])
        left_hand = tf.keras.utils.normalize(left_hand, axis=1, order=2)

        # Slice out pose landmarks, L2-normalise along the landmark axis.
        pose = tf.slice(data, [0, self.POSE_START, 0], [N_FRAMES, self.RIGHT_HAND_START-self.POSE_START, 2])
        pose = tf.keras.utils.normalize(pose, axis=1, order=2)

        # Slice out right hand landmarks. The slice size is measured along the
        # landmark axis (axis 1); using the coordinate axis (size 2) here
        # would yield a negative, invalid slice size.
        right_hand = tf.slice(data, [0, self.RIGHT_HAND_START, 0], [N_FRAMES, self.N_LANDMARKS - self.RIGHT_HAND_START, 2])
        right_hand = tf.keras.utils.normalize(right_hand, axis=1, order=2)

        # Concat landmarks back into same frame.
        data = tf.concat([face, left_hand, pose, right_hand], 1)

        # Video fits in self.INPUT_SIZE
        if N_FRAMES < self.INPUT_SIZE: # Number of frames we want
            # Frame indices for frames that contain data; padded frames get -1.
            non_empty_frames_idxs = tf.pad(tf.range(0, N_FRAMES, 1), [[0, self.INPUT_SIZE-N_FRAMES]], constant_values=-1)
            data = tf.pad(data, [[0, self.INPUT_SIZE-N_FRAMES], [0,0], [0,0]], constant_values=-1)
            # Fill NaN Values With 0
            data = tf.where(tf.math.is_nan(data), 0.0, data)
            # Reshape into (Number of desired frames, (Number of landmarks * 2))
            data = tf.reshape(data, [self.INPUT_SIZE, tf.shape(data)[1] * 2])
            return data, non_empty_frames_idxs
        # Video needs to be downsampled to INPUT_SIZE
        else:
            # Downsample video using nearest interpolation method. 
            data = tf.image.resize(data, size=(self.INPUT_SIZE, data.shape[1]), method='nearest')
            # Fill NaN Values With 0
            data = tf.where(tf.math.is_nan(data), 0.0, data)
            # Reshape into (Number of desired frames, (Number of landmarks * 2)).
            data = tf.reshape(data, [self.INPUT_SIZE, tf.shape(data)[1] * 2])
            # Every one of the INPUT_SIZE frames contains data.
            non_empty_frames_idxs = tf.range(0, self.INPUT_SIZE, 1)
            return data, non_empty_frames_idxs

#### Train/Test Set Creation

In [None]:
def get_data(file_path):
    """
    Load one raw data sample and pass it through the preprocess layer.
    Args:
        file_path: Path to the raw file.
    Returns:
        Whatever the preprocess layer returns for the sample: a tuple of the
        processed frames and the non-empty frame indices.
    """
    global preprocess_layer
    try:
        layer = preprocess_layer
    except NameError:
        # No cell in this notebook instantiates the layer, so build it on
        # first use and cache it globally; later calls (and later cells that
        # read `preprocess_layer`) reuse the same instance.
        layer = preprocess_layer = PreprocessLayer(config["INPUT_SIZE"])

    # Load raw data.
    data = load_relevant_data_subset(file_path)
    # Process data using Tensorflow.
    return layer(data)

def preprocess_dataset():
    """
    Preprocess the full dataset and save the resulting arrays to disk.
    Returns:
        Tuple ``(X, y, NON_EMPTY_FRAME_IDXS)``:
            X: float32, (N_SAMPLES, INPUT_SIZE, N_COLS * N_DIMS) processed frames.
            y: int32, (N_SAMPLES,) ordinal sign labels.
            NON_EMPTY_FRAME_IDXS: float32, (N_SAMPLES, INPUT_SIZE) frame
                indices, -1 for padded frames.
    Raises:
        ValueError: If a preprocessed sample still contains NaN values.
    """
    # Create arrays to save data
    X = np.zeros([N_SAMPLES, config["INPUT_SIZE"], config["N_COLS"] * config["N_DIMS"]], dtype=np.float32)
    y = np.zeros([N_SAMPLES], dtype=np.int32)
    NON_EMPTY_FRAME_IDXS = np.full([N_SAMPLES, config["INPUT_SIZE"]], -1, dtype=np.float32)

    for row_idx, (file_path, sign_ord) in enumerate(tqdm(train_metadata[['file_path', 'sign_ord']].values)):
        if row_idx % 5000 == 0:
            print(f'Generated {row_idx}/{N_SAMPLES}')

        data, non_empty_frame_idxs = get_data(file_path)
        # Fail loudly instead of returning a single sample: an early return
        # would break the (X, y, NON_EMPTY_FRAME_IDXS) contract and skip the
        # np.save calls without any error.
        if np.isnan(data).any():
            raise ValueError(f'NaN values found after preprocessing sample {row_idx} ({file_path})')
        X[row_idx] = data
        y[row_idx] = sign_ord
        NON_EMPTY_FRAME_IDXS[row_idx] = non_empty_frame_idxs

    # Save X/y
    np.save('X.npy', X)
    np.save('y.npy', y)
    np.save('NON_EMPTY_FRAME_IDXS.npy', NON_EMPTY_FRAME_IDXS)

    return X, y, NON_EMPTY_FRAME_IDXS

In [None]:
# Load the cached arrays from the working directory; these are the file names
# that preprocess_dataset() writes, so that function must have been run once
# before this cell can succeed.
X = np.load("X.npy")
y = np.load("y.npy")
NON_EMPTY_FRAME_IDXS = np.load("NON_EMPTY_FRAME_IDXS.npy")

In [None]:
# # Preprocess the entire dataset and get input for training. 
# X, y, NON_EMPTY_FRAME_IDXS = preprocess_dataset()

# Split into train and test sets. All three arrays go through a single call so
# they are guaranteed to share the same shuffle, rather than relying on two
# separate calls happening to use the same seed and sample count.
(X_train, X_test,
 y_train, y_test,
 NON_EMPTY_FRAME_IDXS_train, NON_EMPTY_FRAME_IDXS_test) = train_test_split(
    X, y, NON_EMPTY_FRAME_IDXS, test_size=0.1, random_state=SEED)

print_shape_dtype([X, y, NON_EMPTY_FRAME_IDXS], ['X', 'y', 'NON_EMPTY_FRAME_IDXS'])
print(f'# NaN Values X: {np.isnan(X).sum()}')

## Model Definition

In [None]:
# Indices (into the 543 raw landmarks of a frame) of the landmarks kept for modeling.
FACE_IDXS = [0, 6, 7, 11, 12, 13, 14, 15, 17, 22, 23, 24, 25, 26, 30, 31, 
                     33, 37, 38, 39, 40, 41, 42, 56, 61, 62, 72, 73, 74, 76, 77, 
                     78, 80, 81, 82, 84, 86, 87, 88, 89, 90, 91, 95, 96, 110, 112, 
                     113, 122, 128, 130, 133, 144, 145, 146, 153, 154, 155, 157, 158, 
                     159, 160, 161, 163, 168, 173, 178, 179, 180, 181, 183, 184, 185, 
                     188, 189, 190, 191, 193, 196, 197, 232, 233, 243, 244, 245, 246, 
                     247, 249, 252, 253, 254, 255, 256, 259, 260, 263, 267, 268, 269, 
                     270, 271, 272, 286, 291, 292, 302, 303, 304, 306, 307, 308, 310, 
                     311, 312, 314, 316, 317, 318, 319, 320, 321, 324, 325, 339, 341, 
                     351, 357, 359, 362, 373, 374, 375, 380, 381, 382, 384, 385, 386, 
                     387, 388, 390, 398, 402, 403, 404, 405, 407, 408, 409, 412, 413, 
                     414, 415, 417, 419, 453, 463, 464, 465, 466, 467]
POSE_IDXS = np.arange(489, 514)
LEFT_HAND_IDXS = np.arange(468, 489)
RIGHT_HAND_IDXS = np.arange(522, 543)

# All landmarks that are used for modeling. The order (face, left hand, pose,
# right hand) must agree with the *_START offsets below; otherwise
# LANDMARK_IDXS[LEFT_HAND_START] would point at a pose landmark.
LANDMARK_IDXS = np.concatenate((FACE_IDXS, LEFT_HAND_IDXS, POSE_IDXS, RIGHT_HAND_IDXS))

# Offsets of each group after the landmarks have been filtered.
FACE_START = 0
LEFT_HAND_START = len(FACE_IDXS)
POSE_START = LEFT_HAND_START + len(LEFT_HAND_IDXS)
RIGHT_HAND_START = POSE_START + len(POSE_IDXS)

# Number of landmarks in each group.
FACE_LEN = len(FACE_IDXS)
POSE_LEN = POSE_IDXS.size
LEFT_HAND_LEN = LEFT_HAND_IDXS.size
RIGHT_HAND_LEN = RIGHT_HAND_IDXS.size

### Embeddings

#### Landmark Embedding

In [None]:
class LandmarkEmbedding(tf.keras.layers.Layer):
    """
    Embeds the coordinates of one landmark group with a two-layer dense stack.

    Positions whose features sum to exactly 0 along axis 2 are treated as
    "landmark missing" and receive a learned embedding (initialised to zeros)
    instead of the dense output.

    Args:
        units: Width of both dense layers and of the empty embedding.
        name: Group name; the layer is named ``'<name>_embedding'``.
        activation: Activation applied by the first dense layer.
    """
    def __init__(self, units, name, activation):
        super().__init__(name=f'{name}_embedding')
        self.UNITS = units
        self.ACTIVATION = activation

    def get_config(self):
        """Return the base layer config extended with UNITS and ACTIVATION."""
        layer_config = super().get_config().copy()
        layer_config['UNITS'] = self.UNITS
        layer_config['ACTIVATION'] = self.ACTIVATION
        return layer_config

    def build(self, input_shape):
        # Learned stand-in for frames where this landmark group is missing.
        self.empty_embedding = self.add_weight(
            name=f'{self.name}_empty_embedding',
            shape=[self.UNITS],
            initializer=tf.keras.initializers.constant(0.0),
        )
        # Two bias-free dense layers; only the first one has an activation.
        hidden_layer = tf.keras.layers.Dense(
            self.UNITS,
            name=f'{self.name}_dense_1',
            use_bias=False,
            kernel_initializer=tf.keras.initializers.glorot_uniform,
            activation=self.ACTIVATION,
        )
        output_layer = tf.keras.layers.Dense(
            self.UNITS,
            name=f'{self.name}_dense_2',
            use_bias=False,
            kernel_initializer=tf.keras.initializers.he_uniform,
        )
        self.dense = tf.keras.Sequential([hidden_layer, output_layer], name=f'{self.name}_dense')

    def call(self, x):
        # True where every feature of the frame sums to zero (landmark missing).
        is_missing = tf.reduce_sum(x, axis=2, keepdims=True) == 0
        embedded = self.dense(x)
        # Missing frames use the empty embedding, all others the dense output.
        return tf.where(is_missing, self.empty_embedding, embedded)

#### Embedding

In [None]:
class Embedding(tf.keras.layers.Layer):
    """
    Combines the per-group landmark embeddings and adds a positional embedding.

    Each landmark group (face, left hand, right hand, pose) is embedded by its
    own ``LandmarkEmbedding``; the four results are merged with trainable
    softmax weights, passed through two dense layers and summed with a
    positional embedding looked up from the normalised frame index.

    Args:
        input_size: Number of frames per sample.
        face_units: Units of the face landmark embedding.
        hands_units: Units of the left / right hand landmark embeddings.
        pose_units: Units of the pose landmark embedding.
        units: Units of the final embedding and of the positional embedding.
        activation: Activation used inside the dense stacks.
    """
    def __init__(self, input_size, face_units, hands_units, pose_units, units, activation):
        super(Embedding, self).__init__()
        self.INPUT_SIZE = input_size
        self.FACE_UNITS = face_units
        self.HANDS_UNITS = hands_units
        self.POSE_UNITS = pose_units
        self.UNITS = units
        self.ACTIVATION = activation

    def get_config(self):
        """Return the base layer config extended with the constructor values."""
        config = super().get_config().copy()
        config.update({
            'INPUT_SIZE': self.INPUT_SIZE,
            'FACE_UNITS': self.FACE_UNITS,
            'HANDS_UNITS': self.HANDS_UNITS,
            'POSE_UNITS': self.POSE_UNITS,
            'UNITS': self.UNITS,
            'ACTIVATION': self.ACTIVATION,
        })
        return config

    def build(self, input_shape):
        # Positional Embedding, initialized with zeros. INPUT_SIZE + 1 rows:
        # index INPUT_SIZE is the one used for padded frames.
        self.positional_embedding = tf.keras.layers.Embedding(self.INPUT_SIZE+1, self.UNITS, embeddings_initializer=tf.keras.initializers.constant(0.0))
        # Embedding layer for Landmarks
        self.face_embedding = LandmarkEmbedding(self.FACE_UNITS, 'face', self.ACTIVATION)
        self.left_hand_embedding = LandmarkEmbedding(self.HANDS_UNITS, 'left_hand', self.ACTIVATION)
        self.right_hand_embedding = LandmarkEmbedding(self.HANDS_UNITS, 'right_hand', self.ACTIVATION)
        self.pose_embedding = LandmarkEmbedding(self.POSE_UNITS, 'pose', self.ACTIVATION)
        # Landmark Weights (one logit per landmark group, softmaxed in call()).
        self.landmark_weights = tf.Variable(tf.zeros([4], dtype=tf.float32), name='landmark_weights')
        # Fully Connected Layers for combined landmarks
        self.fc = tf.keras.Sequential([
            tf.keras.layers.Dense(self.UNITS, name='fully_connected_1', use_bias=False, 
                                  kernel_initializer=tf.keras.initializers.glorot_uniform, activation=self.ACTIVATION),
            tf.keras.layers.Dense(self.UNITS, name='fully_connected_2', use_bias=False, kernel_initializer=tf.keras.initializers.he_uniform),
        ], name='fc')

    def call(self, face0, left_hand0, right_hand0, pose0, non_empty_frame_idxs,training=False):
        """
        Embed one batch.
        Args:
            face0, left_hand0, right_hand0, pose0: Per-group landmark features;
                presumably (batch, frames, features) — the group embeddings
                reduce over axis 2 and are stacked on axis 3 here.
            non_empty_frame_idxs: Float frame indices per sample, with -1
                marking padded frames.
            training: Accepted for Keras compatibility; not used in this method.
        Returns:
            The combined embedding with the positional embedding added.
        """
        # Face
        face_embedding = self.face_embedding(face0)
        # Left Hand
        left_hand_embedding = self.left_hand_embedding(left_hand0)
        # Right Hand
        right_hand_embedding = self.right_hand_embedding(right_hand0)
        # Pose
        pose_embedding = self.pose_embedding(pose0)
        # Stack the four group embeddings on a new trailing axis.
        x = tf.stack((face_embedding, left_hand_embedding, right_hand_embedding, pose_embedding), axis=3)
        # Merge landmark groups with trainable softmax weights (weighted sum).
        x = x * tf.nn.softmax(self.landmark_weights)
        x = tf.reduce_sum(x, axis=3)
        # Fully Connected Layers
        x = self.fc(x)
        # Add Positional Embedding
        max_frame_idx = tf.reduce_max(non_empty_frame_idxs, axis=1, keepdims=True)
        # A single-frame sample has max index 0; dividing 0 by 0 would give
        # NaN, which is then cast to int32 and used as an embedding index.
        # Clamping the denominator to >= 1 leaves every sample with two or
        # more frames untouched and maps the single frame to position 0.
        safe_max_frame_idx = tf.maximum(max_frame_idx, 1.0)
        normalised_non_empty_frame_idxs = tf.where(
            tf.math.equal(non_empty_frame_idxs, -1.0),
            self.INPUT_SIZE,
            tf.cast(
                non_empty_frame_idxs / safe_max_frame_idx * self.INPUT_SIZE,
                tf.int32,
            ),
        )
        x = x + self.positional_embedding(normalised_non_empty_frame_idxs)

        return x

### Transformers

#### Multi Head Attention

In [None]:
# based on: https://stackoverflow.com/questions/67342988/verifying-the-implementation-of-multihead-attention-in-transformer
# replaced softmax with softmax layer to support masked softmax
def scaled_dot_product(q,k,v, softmax, attention_mask):
    """
    Scaled dot-product attention with a maskable softmax.
    Args:
        q, k, v: Query, key and value tensors.
        softmax: Callable applied as ``softmax(scores, mask=attention_mask)``.
        attention_mask: Mask forwarded to ``softmax``.
    Returns:
        The attention-weighted values, ``softmax(q @ k^T / sqrt(d)) @ v``,
        where ``d`` is the static size of the last axis of ``q``.
    """
    # Raw similarity scores: Q . K^T
    scores = tf.matmul(q, k, transpose_b=True)
    # Scale by the square root of the query depth.
    scale = tf.math.sqrt(tf.cast(q.shape[-1], dtype=tf.float32))
    attention_weights = softmax(scores / scale, mask=attention_mask)
    # Weighted sum of the values; same shape as q, k, v.
    return tf.matmul(attention_weights, v)

class MultiHeadAttention(tf.keras.layers.Layer):
    """
    Multi-head self-attention built from one set of dense projections per head.

    Args:
        d_model: Output width of the layer.
        num_of_heads: Number of attention heads; each head projects to
            ``d_model // num_of_heads`` features.
    """
    def __init__(self,d_model,num_of_heads):
        super().__init__()
        self.d_model = d_model
        self.num_of_heads = num_of_heads
        self.depth = d_model//num_of_heads
        # Separate query / key / value projections for every head.
        head_ids = range(num_of_heads)
        self.wq = [tf.keras.layers.Dense(self.depth) for _ in head_ids]
        self.wk = [tf.keras.layers.Dense(self.depth) for _ in head_ids]
        self.wv = [tf.keras.layers.Dense(self.depth) for _ in head_ids]
        # Output projection applied to the concatenated heads.
        self.wo = tf.keras.layers.Dense(d_model)
        self.softmax = tf.keras.layers.Softmax()

    def get_config(self):
        """Return the base layer config extended with d_model and num_of_heads."""
        layer_config = super().get_config().copy()
        layer_config['d_model'] = self.d_model
        layer_config['num_of_heads'] = self.num_of_heads
        return layer_config

    def call(self,x, attention_mask):
        head_outputs = []
        for w_query, w_key, w_value in zip(self.wq, self.wk, self.wv):
            head_outputs.append(
                scaled_dot_product(w_query(x), w_key(x), w_value(x), self.softmax, attention_mask)
            )
        # Concatenate the heads on the feature axis and project back to d_model.
        return self.wo(tf.concat(head_outputs, axis=-1))

#### Transformer

In [None]:
class Transformer(tf.keras.layers.Layer):
    """
    Stack of post-norm transformer encoder blocks.

    Each block applies multi-head self-attention, dropout, a residual
    connection with layer normalisation, then a two-layer feed-forward network
    with dropout, followed by a second residual connection and layer
    normalisation.

    Args:
        t_input_shape: Shape of the input tensor; its last entry is used as
            the attention key dimension and the feed-forward output width.
        num_heads: Number of attention heads per block.
        units: Hidden width of the feed-forward network.
        dropout_rate: Dropout rate used after attention and inside the MLP.
        layer_norm_eps: Epsilon of the layer normalisation layers.
        num_blocks: Number of encoder blocks.
    """
    def __init__(self, t_input_shape, num_heads, units, dropout_rate, layer_norm_eps, num_blocks):
        super(Transformer, self).__init__()
        self.t_input_shape = t_input_shape
        self.num_heads = num_heads
        self.dropout_rate = dropout_rate
        self.layer_norm_eps = layer_norm_eps
        self.units = units
        self.num_blocks = num_blocks

    def get_config(self):
        """Return the base layer config extended with the constructor values."""
        config = super().get_config().copy()
        config.update({
            't_input_shape': self.t_input_shape,
            'num_heads': self.num_heads,
            'dropout_rate': self.dropout_rate,
            'layer_norm_eps': self.layer_norm_eps,
            'units': self.units,
            'num_blocks': self.num_blocks
        })
        return config

    def build(self, input_shape):
        # One entry per block in every list; call() zips them together.
        self.mhsa = []
        self.mha_dropouts = []
        self.ln_1s = []
        self.dense_1s = []
        self.mlp_droupouts = []
        self.dense_2s = []
        self.ln_2s = []
        for _ in range(self.num_blocks):
            self.mhsa.append(tf.keras.layers.MultiHeadAttention(num_heads=self.num_heads, key_dim=self.t_input_shape[-1]))
            self.mha_dropouts.append(tf.keras.layers.Dropout(self.dropout_rate))
            self.ln_1s.append(tf.keras.layers.LayerNormalization(epsilon=self.layer_norm_eps))
            self.dense_1s.append(tf.keras.layers.Dense(self.units, activation='relu'))
            self.mlp_droupouts.append(tf.keras.layers.Dropout(self.dropout_rate))
            self.dense_2s.append(tf.keras.layers.Dense(self.t_input_shape[-1]))
            # The second layer norm goes into ln_2s. Appending it to ln_1s
            # leaves ln_2s empty, which makes the zip in call() yield nothing
            # and turns the whole layer into an identity function.
            self.ln_2s.append(tf.keras.layers.LayerNormalization(epsilon=self.layer_norm_eps))

    def call(self, x):
        for mha, dropout1, ln_1, dense1, dropout2, dense2, ln_2 in zip(self.mhsa, self.mha_dropouts, 
                                                                       self.ln_1s, self.dense_1s, 
                                                                       self.mlp_droupouts, self.dense_2s, self.ln_2s):
            # Self-attention: query and value are both the block input.
            attn_output = mha(x, x)
            attn_output = dropout1(attn_output)
            # Residual connection around the attention sub-layer.
            out1 = ln_1(x + attn_output)
            # Position-wise feed-forward network.
            ff_output = dense1(out1)
            ff_output = dropout2(ff_output)
            ff_output = dense2(ff_output)
            # Residual connection around the feed-forward sub-layer.
            x = ln_2(out1 + ff_output)

        return x

### Full Model

In [None]:
def get_model():
    """
    Build and compile the training model with the Keras Functional API.

    The model takes the preprocessed frames and the non-empty frame indices,
    embeds the four landmark groups, runs the transformer blocks, mean-pools
    over the non-padded frames and classifies into NUM_CLASSES signs.

    Returns:
        A compiled ``tf.keras.Model`` with inputs 'FRAMES' and
        'NON_EMPTY_FRAME_IDXS'.
    """
    n_frames = config["INPUT_SIZE"]
    n_features = config["N_COLS"] * config["N_DIMS"]

    # Inputs
    frames = tf.keras.layers.Input([n_frames, n_features], dtype=tf.float32, name='FRAMES')
    non_empty_frame_idxs = tf.keras.layers.Input([n_frames], dtype=tf.float32, name='NON_EMPTY_FRAME_IDXS')

    # Mask: 1.0 for frames that contain data, 0.0 for padded frames (index -1).
    mask = tf.expand_dims(
        tf.cast(tf.math.not_equal(non_empty_frame_idxs, -1), tf.float32),
        axis=2,
    )

    def _landmark_slice(first_landmark, n_landmarks):
        # Every landmark occupies two consecutive features, hence the factor 2.
        return tf.slice(frames, [0, 0, first_landmark * 2], [-1, n_frames, n_landmarks * 2])

    # Slice the flat feature axis back into the four landmark groups.
    face = _landmark_slice(FACE_START, FACE_LEN)
    left_hand = _landmark_slice(LEFT_HAND_START, LEFT_HAND_LEN)
    pose = _landmark_slice(POSE_START, POSE_LEN)
    right_hand = _landmark_slice(RIGHT_HAND_START, RIGHT_HAND_LEN)

    # Landmark + positional embedding
    embedding_layer = Embedding(n_frames, FACE_UNITS, HANDS_UNITS, POSE_UNITS, UNITS, ACTIVATION)
    x = embedding_layer(face, left_hand, right_hand, pose, non_empty_frame_idxs)
    transformer_input_shape = x.shape

    # Encoder Transformer Blocks
    encoder = Transformer(transformer_input_shape, NUM_HEADS, UNITS, MLP_DROPOUT_RATIO, LAYER_NORM_EPS, NUM_BLOCKS)
    x = encoder(x)

    # Masked mean pooling over the frame axis.
    x = tf.reduce_sum(x * mask, axis=1) / tf.reduce_sum(mask, axis=1)
    # Classifier Dropout
    x = tf.keras.layers.Dropout(CLASSIFIER_DROPOUT_RATIO)(x)
    # Classification Layer
    classifier = tf.keras.layers.Dense(
        config["NUM_CLASSES"],
        activation=tf.keras.activations.softmax,
        kernel_initializer=INIT_GLOROT_UNIFORM,
    )
    outputs = classifier(x)

    # Create Tensorflow Model
    model = tf.keras.models.Model(inputs=[frames, non_empty_frame_idxs], outputs=outputs)

    # Sparse categorical crossentropy on the softmax outputs.
    loss = tf.keras.losses.SparseCategoricalCrossentropy()

    # Adam Optimizer with weight decay
    optimizer = tfa.optimizers.AdamW(learning_rate=config["LEARNING_RATE"], weight_decay=config["WEIGHT_DECAY"])

    # Top-1 / Top-5 / Top-10 accuracy
    metrics = [
        tf.keras.metrics.SparseCategoricalAccuracy(name='acc'),
        tf.keras.metrics.SparseTopKCategoricalAccuracy(k=5, name='top_5_acc'),
        tf.keras.metrics.SparseTopKCategoricalAccuracy(k=10, name='top_10_acc'),
    ]

    model.compile(loss=loss, optimizer=optimizer, metrics=metrics, run_eagerly=True)

    return model

## Training

### Callbacks

#### Learning Rate Scheduler

In [None]:
def lrfn(current_step, num_warmup_steps, lr_max, num_cycles=0.50, num_training_steps=config["N_EPOCHS"], warm_method='log'):
    """
    Learning rate for one step of a warmup + cosine decay schedule.
    Args:
        current_step: The step the schedule is evaluated at.
        num_warmup_steps: Number of warmup steps before the cosine decay starts.
        lr_max: Maximum learning rate, reached when warmup ends.
        num_cycles: Number of cosine cycles over the decay phase.
        num_training_steps: Total number of steps, warmup included.
        warm_method: 'log' multiplies by 0.10 per remaining warmup step; any
            other value halves the rate per remaining warmup step.
    Returns:
        The learning rate for ``current_step``.
    """
    in_warmup = current_step < num_warmup_steps

    if not in_warmup:
        # Fraction of the decay phase completed so far.
        decay_steps = float(max(1, num_training_steps - num_warmup_steps))
        progress = float(current_step - num_warmup_steps) / decay_steps
        cosine = math.cos(math.pi * float(num_cycles) * 2.0 * progress)
        return max(0.0, 0.5 * (1.0 + cosine)) * lr_max

    # Warmup: the further from the end of warmup, the smaller the rate.
    steps_left = num_warmup_steps - current_step
    if warm_method == 'log':
        return lr_max * 0.10 ** steps_left
    return lr_max * 2 ** -steps_left
    
# Learning rate schedule: one precomputed value per epoch (N_EPOCHS entries),
# using N_WARMUP_EPOCHS warmup steps and LEARNING_RATE as the peak.
LR_SCHEDULE = [lrfn(step, num_warmup_steps=config["N_WARMUP_EPOCHS"], 
                    lr_max=config["LEARNING_RATE"], num_cycles=0.50) for step in range(config["N_EPOCHS"])]

# Pass learning rate schedule into Keras callback; the callback's argument is
# used directly as the index into LR_SCHEDULE.
lr_callback = tf.keras.callbacks.LearningRateScheduler(lambda step: LR_SCHEDULE[step], verbose=1)

#### Weight Decay Callback

In [None]:
# Custom callback to update weight decay with learning rate
class WeightDecayCallback(tf.keras.callbacks.Callback):
    """
    Keeps the optimizer's weight decay proportional to its learning rate.

    At the start of every epoch the weight decay is set to
    ``learning_rate * wd_ratio`` and both values are printed.

    Args:
        wd_ratio: Ratio between weight decay and learning rate.
    """
    def __init__(self, wd_ratio=config['WD_RATIO']):
        # Initialise the Keras base class so its bookkeeping attributes exist.
        super().__init__()
        self.step_counter = 0
        self.wd_ratio = wd_ratio

    def on_epoch_begin(self, epoch, logs=None):
        # Use the model Keras attached to this callback rather than a
        # notebook-level global, so the callback works with any model it is
        # passed to.
        optimizer = self.model.optimizer
        optimizer.weight_decay = optimizer.learning_rate * self.wd_ratio
        print(f'learning rate: {optimizer.learning_rate.numpy():.2e}, weight decay: {optimizer.weight_decay.numpy():.2e}')

wd_callback = WeightDecayCallback(config["WD_RATIO"])

#### Tensorboard Callback

In [None]:
# Write TensorBoard logs to LOG_DIR (the same root that was patched into
# wandb); histogram_freq=1 requests histograms every epoch.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=LOG_DIR, histogram_freq=1)

#### Early Stopping Callback

In [None]:
# Stop training once 'val_acc' has not improved for 5 consecutive epochs.
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_acc', patience=5)

### Model Fit

In [None]:
# Clear the Keras global state left over from previously built models.
tf.keras.backend.clear_session()

# Get a brand new model for training.
model = get_model()

# Sanity check.
model.summary()

# Fit model. 
# Inputs are passed as a dict keyed by the model's Input names; 10% of the
# training arrays are held out for validation via validation_split.
history = model.fit(
    x=({'FRAMES': X_train, 'NON_EMPTY_FRAME_IDXS': NON_EMPTY_FRAME_IDXS_train}),
    y=y_train,
    batch_size=config['TRAIN_BATCH_SIZE'],
    epochs=config['N_EPOCHS'],
    verbose=VERBOSE,
    callbacks=[
            lr_callback,
            wd_callback,
            stop_early,
            tensorboard_callback
          ],
    validation_split=0.1,
    shuffle=True,
    validation_batch_size=config['VAL_BATCH_SIZE']
)

# Finish the WandB logging. 
wandb.finish()

In [None]:
# Save the trained model weights to 'model.h5' in the working directory.
model.save_weights(f'model.h5')

In [None]:
# Load weights into model for classification report.
# model = get_model()
# model.load_weights('tf_models/model.h5')

## Model Performance

### Classification Report

In [None]:
def print_classification_report(y_test, y_test_pred, labels):
    """
    Display a per-sign classification report sorted by F1-score.
    Args:
        y_test: Ground-truth ordinal labels.
        y_test_pred: Predicted ordinal labels.
        labels: Display name of every class, passed to sklearn as ``target_names``.
    """
    # Per-class precision / recall / F1 / support plus the summary rows.
    raw_report = sklearn.metrics.classification_report(
        y_test,
        y_test_pred,
        target_names=labels,
        output_dict=True,
    )
    # One row per class, rounded for readability, support as integer.
    report = pd.DataFrame(raw_report).T.round(2).astype({'support': np.uint16})

    # Attach the sign name (or -1 for rows that are not a known sign) and its
    # ordinal encoding.
    sign_column = []
    for row_name in report.index:
        sign_column.append(row_name if row_name in SIGN2ORD else -1)
    report['sign'] = sign_column
    report['sign_ord'] = report['sign'].apply(SIGN2ORD.get).fillna(-1).astype(np.int16)

    # Sort the class rows on F1-score; keep the last 3 (summary) rows at the bottom.
    per_class_rows = report.head(config["NUM_CLASSES"]).sort_values('f1-score', ascending=False)
    summary_rows = report.tail(3)
    report = pd.concat((per_class_rows, summary_rows))

    pd.options.display.max_rows = 999
    display(report)

In [None]:
# # Get predicted and ground truth labels.
# y_test_pred = []  # Store predicted labels.
# y_test = []  # Store true labels.

# # Iterate over the dataset.
# for image_batch, label_batch in X_test:
#     y_test.append(label_batch)
#     # Compute predictions.
#     preds = model.predict(image_batch)
#     # Get predicted label.
#     y_test_pred.append(np.argmax(preds, axis = - 1))
    
# y_test = np.concatenate(y_test)
# y_test_pred = np.concatenate(y_test_pred)

# Predict the held-out test set and take the most probable class per sample.
y_test_pred = model.predict({'FRAMES': X_test, 'NON_EMPTY_FRAME_IDXS': NON_EMPTY_FRAME_IDXS_test}).argmax(axis=1)
# Class display names in ordinal order, with spaces replaced by underscores.
labels = [ORD2SIGN.get(i).replace(' ', '_') for i in range(config["NUM_CLASSES"])]

print_classification_report(y_test, y_test_pred, labels)

### Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Create confusion matrix, normalised over the true labels.
cm = confusion_matrix(y_test, y_test_pred, normalize='true')

# Change figure size and increase DPI for better resolution.
plt.figure(figsize=(10,8), dpi=300)
# font_scale = 0 scales seaborn's text down to zero size rather than up.
# NOTE(review): presumably intended to hide the tick labels — confirm.
sns.set(font_scale = 0)

# Plot Confusion Matrix using Seaborn heatmap and save it as a PNG.
ax = sns.heatmap(cm, annot=False,cmap="icefire" )
plt.savefig('confusion_matrix.png')

In [None]:
# Table of every misclassified test sample. The columns are built by name so
# that 'prediction' really holds the model output and 'label' the ground
# truth; building the frame positionally (data, index) and renaming after
# reset_index() swaps the two.
incorrect_preds = pd.DataFrame({
    'prediction': [ORD2SIGN[i] for i in y_test_pred[y_test != y_test_pred]],
    'label': [ORD2SIGN[i] for i in y_test[y_test != y_test_pred]],
})

print(len(incorrect_preds))
incorrect_preds.head(100)

## TFLite Conversion

In [None]:
# TFLite model for submission
class TFLiteModel(tf.keras.Model):
    """
    Wraps preprocessing and the trained model into a single-sample predictor.

    Args:
        model: Trained Keras model taking 'FRAMES' and 'NON_EMPTY_FRAME_IDXS'.
    """
    def __init__(self, model):
        super(TFLiteModel, self).__init__()

        # Reuse the notebook-level preprocess layer when one exists; no cell
        # in this notebook is guaranteed to create it, so fall back to a
        # fresh instance instead of failing with a NameError.
        layer = globals().get('preprocess_layer')
        if layer is None:
            layer = PreprocessLayer(config["INPUT_SIZE"])
        self.preprocess_layer = layer
        self.model = model

    @tf.function(input_signature=[tf.TensorSpec(shape=[None, config["N_ROWS"], config["N_DIMS"]], dtype=tf.float32, name='inputs')])
    def call(self, inputs):
        """
        Predict class probabilities for one raw sample.
        Args:
            inputs: float32 tensor of shape (n_frames, N_ROWS, N_DIMS).
        Returns:
            Dict with key 'outputs' holding the model output for the sample
            with the batch dimension removed.
        """
        # Preprocess Data
        x, non_empty_frame_idxs = self.preprocess_layer(inputs)
        # Add Batch Dimension
        x = tf.expand_dims(x, axis=0)
        non_empty_frame_idxs = tf.expand_dims(non_empty_frame_idxs, axis=0)
        # Make Prediction
        outputs = self.model({'FRAMES': x, 'NON_EMPTY_FRAME_IDXS': non_empty_frame_idxs })
        # Squeeze Output 1x250 -> 250
        outputs = tf.squeeze(outputs, axis=0)

        # Return a dictionary with the output tensor
        return {'outputs': outputs}

# Define TF Lite Model
tflite_keras_model = TFLiteModel(model)


In [None]:
# Create Model Converter from the Keras wrapper model.
keras_model_converter = tf.lite.TFLiteConverter.from_keras_model(tflite_keras_model)
keras_model_converter.experimental_new_converter = True

# Convert Model
tflite_model = keras_model_converter.convert()
# Write the converted model to 'model.tflite' in the working directory.
with open(f'model.tflite', 'wb') as f:
    f.write(tflite_model)
    
# Zip Model
# !zip submission.zip /kaggle/working/model.tflite

In [None]:
# Sanity Check
# Row of train_metadata used for the demo. The same index is used for loading
# the sample and for looking up its label, so the printed "correct" value
# belongs to the sample that was actually predicted.
DEMO_IDX = 2
demo_raw_data = load_relevant_data_subset(train_metadata['file_path'].values[DEMO_IDX])
print(f'demo_raw_data shape: {demo_raw_data.shape}, dtype: {demo_raw_data.dtype}')
demo_output = tflite_keras_model(demo_raw_data)["outputs"]
print(f'demo_output shape: {demo_output.shape}, dtype: {demo_output.dtype}')
demo_prediction = demo_output.numpy().argmax()
print(f'demo_prediction: {demo_prediction}, correct: {train_metadata.iloc[DEMO_IDX]["sign_ord"]}')

In [None]:
# Verify TFLite model can be loaded and used for prediction
# !pip install tflite-runtime
# import tf.lite.interpreter as tflite

# Load the converted model and fetch its default serving signature.
interpreter = tf.lite.Interpreter("model.tflite")
# interpreter = tf.lite.Interpreter("/kaggle/working/model.tflite")
found_signatures = list(interpreter.get_signature_list().keys())
prediction_fn = interpreter.get_signature_runner("serving_default")

output = prediction_fn(inputs=demo_raw_data)
sign = output['outputs'].argmax()

# demo_raw_data was loaded from row 2 of train_metadata in the sanity-check
# cell, so the ground truth must be read from that same row.
demo_row = 2
print("PRED : ", ORD2SIGN.get(sign), f'[{sign}]')
print("TRUE : ", train_metadata.sign.values[demo_row], f'[{train_metadata.sign_ord.values[demo_row]}]')