# This notebook makes predictions with the first model of my solution

><p><a href="https://colab.research.google.com/github/shlomoron/Google---American-Sign-Language-Fingerspelling-Recognition-12th-place-solution/blob/main/ASLFR_base_model_predict.ipynb"><img align="left" src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open in Colab" title="Open in Google Colaboratory"></a>

In [None]:
import tensorflow as tf
import os
import numpy as np
import random

In [None]:
SEED = 42


def set_seeds(seed=SEED):
    """Seed Python hashing, `random`, TensorFlow and NumPy with the same value.

    Args:
        seed: integer seed; defaults to the notebook-wide SEED.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Same order as before: Python's RNG, then TensorFlow, then NumPy.
    for seed_fn in (random.seed, tf.random.set_seed, np.random.seed):
        seed_fn(seed)


set_seeds()

In [None]:
import json
import math
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import shutil
from shutil import copyfile

!pip install tensorflow-addons
import tensorflow_addons as tfa

!pip install cached-property
from cached_property import cached_property

!pip install fastparquet
import fastparquet

!pip install Levenshtein
import Levenshtein as lev

Collecting tensorflow-addons
  Downloading tensorflow_addons-0.21.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (612 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/612.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/612.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━[0m [32m368.6/612.1 kB[0m [31m5.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m612.1/612.1 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Collecting typeguard<3.0.0,>=2.7 (from tensorflow-addons)
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, tensorflow-addons
Successfully installed tensorflow-addons-0.21.0 typeguard-2.13.3



TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



Collecting cached-property
  Downloading cached_property-1.5.2-py2.py3-none-any.whl (7.6 kB)
Installing collected packages: cached-property
Successfully installed cached-property-1.5.2
Collecting fastparquet
  Downloading fastparquet-2023.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
Collecting cramjam>=2.3 (from fastparquet)
  Downloading cramjam-2.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m71.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: cramjam, fastparquet
Successfully installed cramjam-2.7.0 fastparquet-2023.7.0
Collecting Levenshtein
  Downloading Levenshtein-0.21.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (172 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.5/172.5 kB

### Data paths and imports
[ctc_tpu](https://www.kaggle.com/datasets/shlomoron/ctc-tpu) contains an implementation of CTC for TensorFlow.  
[aslfr_light](https://www.kaggle.com/datasets/shlomoron/aslfr-light) contains two needed files from the much larger [competition dataset](https://www.kaggle.com/competitions/asl-fingerspelling/data).  
The [TFRecord dataset](https://www.kaggle.com/datasets/shlomoron/aslfr-tfrecords) contains the training data as TFRecords and is imported straight to the TPU.


In [None]:
input_folder='/content/input'
working_folder='/content'

try:
  os.mkdir(input_folder)
except:
  pass

ctc_tpu_path = 'gs://kds-56377fe3e755931e5dec0abf56fc91724117fbe7bab3e27b74e37f3e'
!gsutil -m cp -r  $ctc_tpu_path $input_folder/ctc-tpu

aslfr_light_folder_name = 'asl-fingerspelling'
try:
  os.mkdir(os.path.join(input_folder, aslfr_light_folder_name))
except:
  pass
aslfr_light_bucket = 'kds-0edb6d00755af6434e157bf0f17ca3102a6722fdba4325c7c11b0e8c'
aslfr_light_path = f'gs://{aslfr_light_bucket}'
!gsutil -m cp -r $aslfr_light_path $input_folder/asl-fingerspelling
shutil.copytree(f'{input_folder}/{aslfr_light_folder_name}/{aslfr_light_bucket}',
                f'{input_folder}/{aslfr_light_folder_name}', dirs_exist_ok=True)

tfrecords_path = 'gs://kds-4104ecb783277dae764d8a7e543344b63bd14d4a90cc61b85b9c2307'

aslfr_MEANs_STDs_folder_name = 'aslfr_MEANs_STDs'
try:
  os.mkdir(os.path.join(input_folder, aslfr_MEANs_STDs_folder_name))
except:
  pass
aslfr_MEANs_STDs_bucket = 'kds-6aa19d9b7a0929191862354c0121926a98e0c2a5828fe9d275ca75d3'
aslfr_MEANs_STDs_path = f'gs://{aslfr_MEANs_STDs_bucket}'
!gsutil -m cp -r $aslfr_MEANs_STDs_path $input_folder/aslfr_MEANs_STDs
shutil.copytree(f'{input_folder}/{aslfr_MEANs_STDs_folder_name}/{aslfr_MEANs_STDs_bucket}',
                f'{input_folder}/{aslfr_MEANs_STDs_folder_name}', dirs_exist_ok=True)

aslfr_base_model_path = 'gs://kds-10c4676ea5d76407cd16c1c709456ceaad133ffd1e93cb64350c0c03'
!gsutil -m cp -r  $aslfr_base_model_path $input_folder/aslfr-base-model

Copying gs://kds-56377fe3e755931e5dec0abf56fc91724117fbe7bab3e27b74e37f3e/CTC_TPU.py...
/ [1/1 files][ 56.1 KiB/ 56.1 KiB] 100% Done                                    
Operation completed over 1 objects/56.1 KiB.                                     
Copying gs://kds-0edb6d00755af6434e157bf0f17ca3102a6722fdba4325c7c11b0e8c/character_to_prediction_index.json...
Copying gs://kds-0edb6d00755af6434e157bf0f17ca3102a6722fdba4325c7c11b0e8c/train.csv...
/ [2/2 files][  5.0 MiB/  5.0 MiB] 100% Done                                    
Operation completed over 2 objects/5.0 MiB.                                      
Copying gs://kds-6aa19d9b7a0929191862354c0121926a98e0c2a5828fe9d275ca75d3/MEANs.p...
Copying gs://kds-6aa19d9b7a0929191862354c0121926a98e0c2a5828fe9d275ca75d3/STDs.p...
/ [2/2 files][  5.1 KiB/  5.1 KiB] 100% Done                                    
Operation completed over 2 objects/5.1 KiB.                                      
Copying gs://kds-10c4676ea5d76407cd16c1c709456ceaad133f

### Define where the output is saved

It is best to save Colab output to a mounted Google Drive, but granting a public notebook access to your Drive can be risky, so the default here is to save to the local disk of the Colab session. Once you have familiarized yourself with the code and feel comfortable granting such access, you can uncomment the second part of this cell and set the path to a folder in your Drive. The folder you point to must already exist (in this example, content/drive/MyDrive/kaggle/ASLFR already exists on my Drive).

In [None]:
# Default: keep the outputs on the local disk of the Colab session.
save_folder = '/content/save'
# exist_ok=True makes re-running the cell harmless while still surfacing real
# errors (e.g. permissions) that a bare `except: pass` would swallow.
os.makedirs(save_folder, exist_ok=True)

# Optional Google Drive variant, disabled by being wrapped in a string literal.
# Remove the surrounding triple quotes to enable it.
'''
from google.colab import drive
drive.mount('/content/drive')
save_folder_name = 'base_model_predict'
ASLFR_folder = '/content/drive/MyDrive/kaggle/ASLFR'
save_folder = os.path.join(ASLFR_folder, save_folder_name)
try:
  os.mkdir(save_folder)
except:
  pass
'''

Mounted at /content/drive


### Import the ctc loss function

In [None]:
# Copy the CTC implementation into the working directory so that it can be
# imported as a regular module (the file must keep its .py suffix).
# This copy has to run before the import below.
copyfile(src = f"{input_folder}/ctc-tpu/CTC_TPU.py", dst = f"{working_folder}//CTC_TPU.py")

# Import the CTC loss implementation that CTCLoss wraps later in the notebook.
from CTC_TPU import classic_ctc_loss

### TPU boilerplate code

In [None]:
# Configure the distribution strategy: try to connect to a TPU first and fall
# back to TensorFlow's default strategy when that is not possible.
tpu = None
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect(tpu=None)
    strategy = tf.distribute.TPUStrategy(tpu)
    print("on TPU")
    print("REPLICAS: ", strategy.num_replicas_in_sync)
except Exception as tpu_error:
    # `except Exception` instead of a bare `except:` so that KeyboardInterrupt
    # and SystemExit still propagate; the reason for the fallback is reported
    # instead of being silently discarded.
    print(f"TPU not available ({type(tpu_error).__name__}); using the default strategy")
    strategy = tf.distribute.get_strategy()

print(strategy)

<tensorflow.python.distribute.distribute_lib._DefaultDistributionStrategy object at 0x7d4a7d0bc1c0>


### Landmarks indices

In [None]:
# Landmark ids as they appear in the raw column names ("*_old"); the un-suffixed
# lists built further down hold positions inside SEL_COLS_x instead.
NOSE_old = [1, 2, 98, 327]
LNOSE_old = [98]
RNOSE_old = [327]
LIP_old = [
    0,
    61, 185, 40, 39, 37, 267, 269, 270, 409,
    291, 146, 91, 181, 84, 17, 314, 405, 321, 375,
    78, 191, 80, 81, 82, 13, 312, 311, 310, 415,
    95, 88, 178, 87, 14, 317, 402, 318, 324, 308,
]
LLIP_old = [84, 181, 91, 146, 61, 185, 40, 39, 37, 87, 178, 88, 95, 78, 191, 80, 81, 82]
RLIP_old = [314, 405, 321, 375, 291, 409, 270, 269, 267, 317, 402, 318, 324, 308, 415, 310, 311, 312]

FACE_old = sorted(LIP_old + NOSE_old)

LPOSE_old = [11, 13, 15, 17, 19, 21, 23]
RPOSE_old = [12, 14, 16, 18, 20, 22, 24]
POSE_old = LPOSE_old + RPOSE_old

# Selected columns: all x columns first, then all y columns, each ordered as
# right hand, left hand, face, pose.
X = (
    [f'x_right_hand_{i}' for i in range(21)]
    + [f'x_left_hand_{i}' for i in range(21)]
    + [f'x_face_{i}' for i in FACE_old]
    + [f'x_pose_{i}' for i in POSE_old]
)
Y = (
    [f'y_right_hand_{i}' for i in range(21)]
    + [f'y_left_hand_{i}' for i in range(21)]
    + [f'y_face_{i}' for i in FACE_old]
    + [f'y_pose_{i}' for i in POSE_old]
)

SEL_COLS = X + Y

print('SEL_COLS size:' + str(len(SEL_COLS)))

SEL_COLS_x = [name for name in SEL_COLS if 'x' in name]


def _landmark_id(column):
    """Return the numeric landmark id at the end of a column name."""
    return int(column.split('_')[-1])


def _positions(part, ids=None):
    """Positions in SEL_COLS_x of the columns of `part`, optionally limited to `ids`."""
    return [
        pos for pos, column in enumerate(SEL_COLS_x)
        if part in column and (ids is None or _landmark_id(column) in ids)
    ]


NOSE = _positions('face', NOSE_old)
LNOSE = _positions('face', LNOSE_old)
RNOSE = _positions('face', RNOSE_old)
LIP = _positions('face', LIP_old)
LLIP = _positions('face', LLIP_old)
RLIP = _positions('face', RLIP_old)
FACE = _positions('face', FACE_old)

LPOSE = _positions('pose', LPOSE_old)
RPOSE = _positions('pose', RPOSE_old)
POSE = _positions('pose', POSE_old)

LHAND = _positions('left_hand')
RHAND = _positions('right_hand')

POINT_LANDMARKS = FACE+RHAND+LHAND+RPOSE+LPOSE

# Position of face landmark 17; preprocess_continue uses it as the centring point.
norm_point = [
    pos for pos, column in enumerate(SEL_COLS_x)
    if 'face' in column and column.split('_')[-1] == '17'
][0]

SEL_COLS size:200


### Config and stuff

In [None]:
# Set DEBUG to True for a quick smoke run; it shrinks the settings below.
DEBUG = False

# SEL_COLS holds the x columns followed by the y columns, so half of it is the
# number of landmarks stored per frame.
ROWS_PER_FRAME = int(len(SEL_COLS)/2)
PAD = 0.  # padding value for the feature tensor (see padded_batch)
NUM_NODES = len(POINT_LANDMARKS)
CHANNELS = 6*NUM_NODES  # per landmark: (x, y) for position, 1-step diff and 2-step diff
print("Number of landmarks: " + str(NUM_NODES))
print("Number of features: " + str(CHANNELS))
# Extra token used both to pad the label sequences and as the CTC blank index.
pad_token = 'P'
pad_token_idx = 59


with open (f"{input_folder}/asl-fingerspelling/character_to_prediction_index.json", "r") as f:
    char_to_num = json.load(f)

# The pad token is added before the reverse map is built, so num_to_char
# contains it as well.
char_to_num[pad_token] = pad_token_idx
num_to_char = {j:i for i,j in char_to_num.items()}

inpdir = f"{input_folder}/asl-fingerspelling"
df = pd.read_csv(f'{inpdir}/train.csv')

# One TFRecord file per unique file_id in train.csv.
tffiles = df.file_id.map(lambda x: f'{tfrecords_path}/tfds/{x}.tfrecord').unique()

# Character -> index lookup table; unknown keys map to -1.
# NOTE(review): `table` is not referenced elsewhere in the visible notebook.
table = tf.lookup.StaticHashTable(
    initializer=tf.lookup.KeyValueTensorInitializer(
        keys=list(char_to_num.keys()),
        values=list(char_to_num.values()),
    ),
    default_value=tf.constant(-1),
    name="class_weight"
)

MAX_LEN = 380          # frames per sample after resizing/padding
batch_size = 64*2
dim = 384              # model width
val_len = int(0.05 * len(tffiles))  # number of files held out for validation
cache = False
SHUFFLE = -1           # -1 makes get_tfrec_dataset shuffle with a full-size buffer

# Smaller settings for debugging runs.
if DEBUG:
  MAX_LEN = 64
  batch_size = 16
  dim=48
  val_len = 1
  cache = False
  SHUFFLE = 50

print("Val len: " + str(val_len))

Number of landmarks: 100
Number of features: 600
Val len: 3


### Augmentations functions

In [None]:
def interp1d_(x, target_len, method='random'):
    """Resize `x` along its first axis to `target_len` using tf.image.resize.

    Args:
        x: tensor passed to tf.image.resize as an image; its second axis keeps
            its size. Callers in this notebook pass (frames, landmarks, channels).
        target_len: desired size of the first axis; values below 1 are raised to 1.
        method: a tf.image.resize method name, or 'random' to pick one at random:
            bilinear with probability 0.33, otherwise bicubic or nearest with
            equal probability.

    Returns:
        The resized tensor.
    """
    length = tf.shape(x)[1]  # NOTE(review): unused
    target_len = tf.maximum(1,target_len)
    if method == 'random':
        if tf.random.uniform(()) < 0.33:
            x = tf.image.resize(x, (target_len,tf.shape(x)[1]),'bilinear')
        else:
            if tf.random.uniform(()) < 0.5:
                x = tf.image.resize(x, (target_len,tf.shape(x)[1]),'bicubic')
            else:
                x = tf.image.resize(x, (target_len,tf.shape(x)[1]),'nearest')
    else:
        x = tf.image.resize(x, (target_len,tf.shape(x)[1]),method)
    return x

def flip_lr(x):
    """Mirror a sample horizontally and swap the left/right landmark groups.

    Args:
        x: tensor whose last axis unstacks into three coordinate channels; the
            first two axes are transposed so that landmarks can be gathered on
            axis 0 (i.e. input is laid out as frames, landmarks, 3).

    Returns:
        Tensor with the same layout as the input, with the first channel
        replaced by 1 - x and the hand, lip, pose and nose groups exchanged
        between their left and right index lists.
    """
    x,y,z = tf.unstack(x, axis=-1)
    x = 1-x  # mirror the first coordinate around 0.5
    new_x = tf.stack([x,y,z], -1)
    # Bring the landmark axis to the front so groups can be gathered/scattered on axis 0.
    new_x = tf.transpose(new_x, [1,0,2])
    # For each group both sides are gathered BEFORE either is overwritten, so
    # the two scatter updates perform a true swap.
    lhand = tf.gather(new_x, LHAND, axis=0)
    rhand = tf.gather(new_x, RHAND, axis=0)
    new_x = tf.tensor_scatter_nd_update(new_x, tf.constant(LHAND)[...,None], rhand)
    new_x = tf.tensor_scatter_nd_update(new_x, tf.constant(RHAND)[...,None], lhand)
    llip = tf.gather(new_x, LLIP, axis=0)
    rlip = tf.gather(new_x, RLIP, axis=0)
    new_x = tf.tensor_scatter_nd_update(new_x, tf.constant(LLIP)[...,None], rlip)
    new_x = tf.tensor_scatter_nd_update(new_x, tf.constant(RLIP)[...,None], llip)
    lpose = tf.gather(new_x, LPOSE, axis=0)
    rpose = tf.gather(new_x, RPOSE, axis=0)
    new_x = tf.tensor_scatter_nd_update(new_x, tf.constant(LPOSE)[...,None], rpose)
    new_x = tf.tensor_scatter_nd_update(new_x, tf.constant(RPOSE)[...,None], lpose)
    lnose = tf.gather(new_x, LNOSE, axis=0)
    rnose = tf.gather(new_x, RNOSE, axis=0)
    new_x = tf.tensor_scatter_nd_update(new_x, tf.constant(LNOSE)[...,None], rnose)
    new_x = tf.tensor_scatter_nd_update(new_x, tf.constant(RNOSE)[...,None], lnose)
    # Restore the original axis order.
    new_x = tf.transpose(new_x, [1,0,2])
    return new_x

def resample(x, rate=(0.8,1.2)):
    """Randomly stretch or squeeze `x` along its first axis.

    Args:
        x: tensor whose first axis is resized.
        rate: (low, high) bounds of the uniformly drawn length multiplier.

    Returns:
        `x` resized by interp1d_ (random interpolation method) to
        int(rate * original length) entries on the first axis.
    """
    rate = tf.random.uniform((), rate[0], rate[1])
    length = tf.shape(x)[0]
    new_size = tf.cast(rate*tf.cast(length,tf.float32), tf.int32)
    new_x = interp1d_(x, new_size)
    return new_x

def spatial_random_affine(xyz,
    scale  = (0.8,1.2),
    shear = (-0.15,0.15),
    shift  = (-0.1,0.1),
    degree = (-30,30),
):
    """Apply a random scale, shear, rotation and shift to the coordinates.

    Each argument is a (low, high) range for a uniform draw, or None to skip
    that step. Steps run in the order scale -> shear -> rotate -> shift.

    Args:
        xyz: tensor whose last axis holds the coordinates; the first two
            channels are treated as the 2-D point, the rest is passed through
            shear/rotation unchanged.
        scale: range of the scalar multiplied into every channel.
        shear: range of the shear coefficient (applied to one axis only).
        shift: range of the scalar added to every channel.
        degree: range of the rotation angle in degrees.

    Returns:
        The transformed tensor.
    """
    center = tf.constant([0.5,0.5])
    if scale is not None:
        scale = tf.random.uniform((),*scale)
        xyz = scale*xyz

    if shear is not None:
        xy = xyz[...,:2]
        z = xyz[...,2:]
        shear_x = shear_y = tf.random.uniform((),*shear)
        # Shear along exactly one axis: zero out the other coefficient at random.
        if tf.random.uniform(()) < 0.5:
            shear_x = 0.
        else:
            shear_y = 0.
        shear_mat = tf.identity([
            [1.,shear_x],
            [shear_y,1.]
        ])
        xy = xy @ shear_mat
        # The rotation centre used below is moved by the shear coefficients.
        center = center + [shear_y, shear_x]
        xyz = tf.concat([xy,z], axis=-1)

    if degree is not None:
        xy = xyz[...,:2]
        z = xyz[...,2:]
        # Rotate around `center`: translate, rotate, translate back.
        xy -= center
        degree = tf.random.uniform((),*degree)
        radian = degree/180*np.pi
        c = tf.math.cos(radian)
        s = tf.math.sin(radian)
        rotate_mat = tf.identity([
            [c,s],
            [-s, c],
        ])
        xy = xy @ rotate_mat
        xy = xy + center
        xyz = tf.concat([xy,z], axis=-1)

    if shift is not None:
        # A single scalar is drawn and added to every channel.
        shift = tf.random.uniform((),*shift)
        xyz = xyz + shift

    return xyz

def temporal_mask(x, size=(0.2,0.4), mask_value=float('nan')):
    """Overwrite a random contiguous run of frames with `mask_value`.

    Args:
        x: tensor indexed by frame on axis 0; the fill shape used below fixes
            the remaining axes to (ROWS_PER_FRAME, 3).
        size: (low, high) range of the masked fraction of the sequence length.
        mask_value: value written into the masked frames (NaN by default).

    Returns:
        `x` with the selected frames replaced. If the run would extend past
        the end of the sequence, the overflow wraps around to the beginning.
    """
    l = tf.shape(x)[0]
    mask_size = tf.random.uniform((), *size)
    mask_size = tf.cast(tf.cast(l, tf.float32) * mask_size, tf.int32)
    mask_offset = tf.random.uniform((), 0, l, dtype=tf.int32)
    mask_start = mask_offset
    mask_end = tf.clip_by_value(mask_offset+mask_size,0,l)
    x = tf.tensor_scatter_nd_update(x,tf.range(mask_start, mask_end)[...,None],tf.fill([mask_end - mask_start,ROWS_PER_FRAME,3],mask_value))
    # Wrap-around: mask the part of the run that fell off the end, starting at frame 0.
    if mask_offset+mask_size>l:
      mask_start = 0
      mask_end = mask_offset+mask_size - l
      x = tf.tensor_scatter_nd_update(x,tf.range(mask_start, mask_end)[...,None],tf.fill([mask_end - mask_start,ROWS_PER_FRAME,3],mask_value))
    return x

def spatial_mask(x, size=(0.2,0.4), mask_value=float('nan')):
    """Replace every point that falls inside a random rectangle with `mask_value`.

    Args:
        x: tensor whose last axis holds the coordinates; channels 0 and 1 are
            compared against the rectangle.
        size: (low, high) range for the rectangle's width and height, drawn
            independently.
        mask_value: value written into the masked points (NaN by default).

    Returns:
        `x` with all channels of the selected points replaced.
    """
    mask_size_x = tf.random.uniform((), *size)
    mask_size_y = tf.random.uniform((), *size)
    mask_offset_x = tf.random.uniform(())
    mask_offset_y = tf.random.uniform(())
    # Second clause of each mask: the part of the rectangle that extends past 1
    # wraps around and continues from 0.
    mask_x = ((mask_offset_x<x[...,0]) & (x[...,0] < mask_offset_x + mask_size_x)) | ((0<=x[...,0]) & (x[...,0] < mask_offset_x + mask_size_x -1))
    mask_y = ((mask_offset_y<x[...,1]) & (x[...,1] < mask_offset_y + mask_size_y)) | ((0<=x[...,1]) & (x[...,1] < mask_offset_y + mask_size_y -1))
    mask = mask_x & mask_y
    x = tf.where(mask[...,None], mask_value, x)
    return x

def augment_fn(x, always=False, max_len=None):
    """Apply the augmentations, each one independently with its own probability.

    Order and probabilities: resample (0.8, rate 0.5-1.5), flip_lr (0.5),
    spatial_random_affine (0.75), temporal_mask (0.5), spatial_mask (0.5).

    Args:
        x: sample tensor handed to the individual augmentation functions.
        always: when True every augmentation is applied regardless of the draw.
        max_len: accepted but not used by this function.

    Returns:
        The augmented tensor.
    """
    if tf.random.uniform(())<0.8 or always:
        x = resample(x, (0.5,1.5))
    if tf.random.uniform(())<0.5 or always:
        x = flip_lr(x)
    if tf.random.uniform(())<0.75 or always:
        x = spatial_random_affine(x)
    if tf.random.uniform(())<0.5 or always:
        x = temporal_mask(x)
    if tf.random.uniform(())<0.5 or always:
        x = spatial_mask(x)
    return x

### Helper functions

In [None]:
def tf_nan_mean(x, axis=0, keepdims=False):
    """Mean of `x` over `axis`, computed from the non-NaN entries only.

    The result is the NaN-free sum divided by the count of non-NaN entries.
    """
    total = tf_nan_sum(x, axis=axis, keepdims=keepdims)
    valid = tf_nan_count(x, axis=axis, keepdims=keepdims)
    return total / valid

def tf_nan_std(x, center=None, axis=0, keepdims=False):
    """NaN-ignoring standard deviation of `x` around `center`.

    Args:
        x: input tensor.
        center: value to measure deviations from; when None the NaN-ignoring
            mean over `axis` is used.
        axis: axis or axes to reduce.
        keepdims: whether reduced axes are kept with size 1.
    """
    if center is None:
        center = tf_nan_mean(x, axis=axis, keepdims=True)
    deviation = x - center
    mean_sq_deviation = tf_nan_mean(deviation * deviation, axis=axis, keepdims=keepdims)
    return tf.math.sqrt(mean_sq_deviation)

def tf_nan_sum(x, axis=0, keepdims=False):
    """Sum of `x` over `axis` with NaN entries counted as zero."""
    nan_mask = tf.math.is_nan(x)
    zero_filled = tf.where(nan_mask, tf.zeros_like(x), x)
    return tf.reduce_sum(zero_filled, axis=axis, keepdims=keepdims)

def tf_nan_count(x, axis=0, keepdims=False):
    """Number of non-NaN entries of `x` over `axis`, in the dtype of `x`."""
    nan_mask = tf.math.is_nan(x)
    is_valid = tf.where(nan_mask, tf.zeros_like(x), tf.ones_like(x))
    return tf.reduce_sum(is_valid, axis=axis, keepdims=keepdims)

### Decode

In [None]:
def decode_tfrec(record_bytes):
    """Parse one serialized example into coordinates and phrase tensors.

    Args:
        record_bytes: a serialized example with float features "frames" and "phrase".

    Returns:
        dict with
          'coordinates': the flat frame values reshaped to (-1, 2, n) and
              transposed to (-1, n, 2), where n = len(SEL_COLS) / 2;
          'phrase': the phrase values cast to int32.
    """
    feature_spec = {
        "frames": tf.io.VarLenFeature(dtype=tf.float32),
        "phrase": tf.io.VarLenFeature(dtype=tf.float32),
    }
    example = tf.io.parse_single_example(record_bytes, feature_spec)

    flat_frames = tf.sparse.to_dense(example["frames"])
    # Each frame is stored as two consecutive blocks of n values; move that
    # block axis to the end so every landmark gets its pair of values.
    blocks_first = tf.reshape(flat_frames, (-1, 2, int(len(SEL_COLS)/2)))
    coordinates = tf.transpose(blocks_first, [0, 2, 1])
    phrase_ids = tf.cast(tf.sparse.to_dense(example["phrase"]), tf.int32)

    return {'coordinates': coordinates, 'phrase': phrase_ids}

### Preprocess

In [None]:
def preprocess_continue(inputs, point_landmarks, max_len):
    """Normalise the landmarks and build the position/motion feature tensor.

    Args:
        inputs: coordinates, either rank 3 (a batch axis is then added) or
            already rank 4 (batch, frames, landmarks, channels).
        point_landmarks: landmark positions to keep, in output order.
        max_len: maximum number of frames; longer sequences are resized down
            to it. None disables the resize.

    Returns:
        float32 tensor of shape (batch, frames, 6 * len(point_landmarks)).
    """
    if tf.rank(inputs) == 3:
        x = inputs[None,...]
    else:
        x = inputs

    # Centre on the NaN-ignoring mean of the reference landmark (norm_point);
    # fall back to 0.5 when that landmark is NaN in every frame.
    mean = tf_nan_mean(tf.gather(x, [norm_point], axis=2), axis=[1,2], keepdims=True)
    mean = tf.where(tf.math.is_nan(mean), tf.constant(0.5,x.dtype), mean)
    x = tf.gather(x, point_landmarks, axis=2)
    # Scale by the spread of the selected landmarks around that centre.
    std = tf_nan_std(x, center=mean, axis=[1,2], keepdims=True)
    x = (x - mean)/std

    length = tf.shape(x)[1]

    # Keep only the first two channels, then add frame-to-frame differences
    # with lag 1 and lag 2 (zero-padded at the end; all zeros when too short).
    x = x[...,:2]
    dx = tf.cond(tf.shape(x)[1]>1,lambda:tf.pad(x[:,1:] - x[:,:-1], [[0,0],[0,1],[0,0],[0,0]]),lambda:tf.zeros_like(x))
    dx2 = tf.cond(tf.shape(x)[1]>2,lambda:tf.pad(x[:,2:] - x[:,:-2], [[0,0],[0,2],[0,0],[0,0]]),lambda:tf.zeros_like(x))

    x = tf.concat([
        tf.reshape(x, (-1,length,len(point_landmarks), 2)),
        tf.reshape(dx, (-1,length,len(point_landmarks), 2)),
        tf.reshape(dx2, (-1,length,len(point_landmarks), 2)),
    ], axis = -1)

    # Sequences longer than max_len are resized down to max_len frames.
    # Only element 0 of the batch axis is used here.
    if max_len is not None and tf.shape(x)[1] > max_len:
        x = tf.image.resize(x[0], (max_len, tf.shape(x)[2]))
        x = x[None]

    # Flatten (landmarks, 6) into a single feature axis.
    x = tf.concat([
        tf.reshape(x, (-1,tf.shape(x)[1],6*len(point_landmarks))),
    ], axis = -1)
    return tf.cast(x, tf.float32)

def normalize(x, MEANs, STDs):
  """Standardise `x` element-wise: subtract `MEANs`, then divide by `STDs`."""
  return (x - MEANs) / STDs

def remove_nans(x):
  """Return `x` with every NaN replaced by 0 (same dtype as `x`)."""
  nan_mask = tf.math.is_nan(x)
  return tf.where(nan_mask, tf.constant(0., x.dtype), x)

def preprocess(x, point_landmarks, max_len, MEANs, STDs, augment=False):
    """Turn one decoded record into a (features, phrase) pair for the model.

    Args:
        x: dict with 'coordinates' and 'phrase' (as produced by decode_tfrec).
        point_landmarks: landmark positions forwarded to preprocess_continue.
        max_len: maximum number of frames, forwarded to preprocess_continue.
        MEANs, STDs: statistics used to standardise the final features.
        augment: when True, augment_fn is applied before feature extraction.

    Returns:
        (features, phrase): the standardised features with NaNs replaced by 0,
        and the untouched phrase.
    """
    coord = x['coordinates']
    # Append an all-zero third channel; the augmentation functions expect 3 channels.
    coord = tf.concat([coord, tf.zeros(( tf.shape(coord)[0],  tf.shape(coord)[1], 1))], axis = -1)
    if augment:
        coord = augment_fn(coord, max_len=max_len)
    coord = tf.ensure_shape(coord, (None,ROWS_PER_FRAME,3))
    # preprocess_continue adds a batch axis; [0] removes it again.
    coord = preprocess_continue(coord, point_landmarks, max_len)[0]
    coord = normalize(coord, MEANs, STDs)
    # NaNs become 0 only after standardisation.
    coord = remove_nans(coord)
    return coord, x['phrase']

### Filter
The first model (this) uses filter_by_length. The second model would use filter_by_lev.

In [None]:
# Positions of the right-hand / left-hand columns inside SEL_COLS_x.
RHAND_IDX = [pos for pos, column in enumerate(SEL_COLS_x) if 'right' in column]
LHAND_IDX = [pos for pos, column in enumerate(SEL_COLS_x) if 'left' in column]


def f1():
    """Constant True branch handed to tf.cond by the filter functions."""
    return True


def f2():
    """Constant False branch handed to tf.cond by the filter functions."""
    return False
def filter_by_length(x):
    """Keep a sample only if it has enough hand frames for its phrase.

    For each hand, count the frames in which entry [0, 0] of that hand's
    landmarks is not NaN. The sample passes when the larger of the two counts
    is greater than twice the phrase length.

    Args:
        x: dict with 'coordinates' and 'phrase'.

    Returns:
        Boolean result of tf.cond (True keeps the sample).
    """
    coords = x['coordinates']
    target = x['phrase']
    right_hand = tf.gather(coords, RHAND_IDX, axis=1)
    left_hand = tf.gather(coords, LHAND_IDX, axis=1)

    def _visible_frames(hand):
        # Count the frames whose [0, 0] entry for this hand is not NaN.
        seen = ~tf.math.is_nan(hand[:, 0, 0])
        return tf.math.reduce_sum(tf.cast(seen, tf.int64))

    right_count = _visible_frames(right_hand)
    left_count = _visible_frames(left_hand)
    best_count = tf.math.maximum(right_count, left_count)
    has_enough = 2*tf.shape(target)[0] < tf.cast(best_count, tf.int32)
    return tf.cond(has_enough, true_fn=f1, false_fn=f2)

def filter_by_lev(x, treshold):
    """Keep a sample only if its stored 'lev' value is above `treshold`.

    Args:
        x: dict that must contain a 'lev' entry.
        treshold: lower bound (exclusive) for x['lev'].
    """
    passes = x['lev'] > treshold
    return tf.cond(passes, true_fn=f1, false_fn=f2)

### Define sets
**val_files_filtered** set checks the loss and Levenshtein distance on the filtered validation set (same distribution as the train set).  
**val_files_unfiltered** checks the loss and Levenshtein distance on the validation set, unfiltered (different distribution than the train set, similar distribution to the leaderboard set).  
**sub_train_files** checks the Levenshtein distance on a small part of the train set. The Levenshtein distance is the metric, so it is helpful to know how much the model overfits the train set compared to the validation set. However, it takes a long time to calculate, so we only do it for a small part of the train set.

In [None]:
# Both validation views use the same files; whether they are filtered is
# decided later, when the dataset is built.
val_files_filtered = tffiles[:val_len]
val_files_unfiltered = tffiles[:val_len]

# DEBUG keeps a single training file; otherwise train on everything after the
# validation files and evaluate on the first three of those.
train_files = tffiles[val_len:val_len+1] if DEBUG else tffiles[val_len:]
sub_train_files = tffiles[val_len:val_len+1] if DEBUG else tffiles[val_len:val_len+3]

### Load MEANs and STDs

In [None]:
# Normalisation statistics applied to the features by normalize().
# The files are opened with `with` so the handles are closed deterministically.
# NOTE: pickle.load can execute arbitrary code; only load files from a source you trust.
with open(f"{input_folder}/aslfr_MEANs_STDs/MEANs.p", "rb") as means_file:
    MEANs = pickle.load(means_file)
with open(f"{input_folder}/aslfr_MEANs_STDs/STDs.p", "rb") as stds_file:
    STDs = pickle.load(stds_file)

### Get dataset function

In [None]:
def get_tfrec_dataset(tfrecords, MEANs = MEANs, STDs = STDs, point_landmarks = POINT_LANDMARKS, batch_size=64, max_len=64, drop_remainder=False,
                      augment=False, shuffle=False,to_filter = False, cache = False):
    """Build the tf.data pipeline: read -> decode -> filter -> shuffle -> preprocess -> batch.

    Args:
        tfrecords: GZIP-compressed TFRecord file name(s).
        MEANs, STDs: normalisation statistics (defaults are bound when this
            function is defined).
        point_landmarks: landmark positions forwarded to preprocess.
        batch_size: batch size; a falsy value disables batching.
        max_len: number of frames every sample is padded to (longer samples
            are resized down inside preprocess).
        drop_remainder: drop the final incomplete batch.
        augment: apply the random augmentations.
        shuffle: False for no shuffling, a positive buffer size, or -1 to
            count the samples and use a buffer that holds all of them.
        to_filter: apply filter_by_length.
        cache: cache the decoded (and filtered) records.

    Returns:
        The dataset, or the tuple (dataset, samples_num) when shuffle == -1.
    """
    # Initialize dataset with TFRecords
    ds = tf.data.TFRecordDataset(tfrecords, num_parallel_reads=tf.data.AUTOTUNE, compression_type = 'GZIP').prefetch(tf.data.AUTOTUNE)
    ds = ds.map(decode_tfrec, tf.data.AUTOTUNE)

    if to_filter:
        ds = ds.filter(filter_by_length)
        #ds = ds.filter(lambda x: filter_by_lev(x, 0.22))

    if DEBUG:
        ds = ds.take(64)

    # Caching happens before shuffling and augmentation, so those stay random per epoch.
    if cache:
        ds = ds.cache()
    if shuffle:
        if shuffle == -1:
            # Counting requires one full pass over the dataset.
            samples_num = ds.reduce(0, lambda x,_: x+1).numpy()
            ds = ds.shuffle(samples_num, reshuffle_each_iteration = True)
        else:
            ds = ds.shuffle(shuffle, reshuffle_each_iteration = True)

    ds = ds.map(lambda x: preprocess(x, point_landmarks, max_len, MEANs, STDs, augment=augment), tf.data.AUTOTUNE)

    if batch_size:
        # Features are padded with PAD to (max_len, CHANNELS); phrases are
        # padded with pad_token_idx to a fixed length of 64.
        ds = ds.padded_batch(batch_size, padding_values=(PAD, pad_token_idx), padded_shapes=([max_len,CHANNELS],[64]), drop_remainder=drop_remainder)

    ds = ds.prefetch(tf.data.AUTOTUNE)

    # NOTE: the return type depends on `shuffle`; callers passing -1 get a tuple.
    if shuffle == -1:
      return ds, samples_num
    return ds

### Get datasets

In [None]:
# Filtered validation set, unshuffled, with incomplete batches dropped.
val_dataset_filtered = get_tfrec_dataset(val_files_filtered, batch_size=batch_size, max_len=MAX_LEN, drop_remainder=True,
                                        shuffle=False, to_filter = True, cache = cache)

# Input shape of the model (frames, features).
INPUT_SHAPE = [MAX_LEN, CHANNELS]
# Pull one batch; it is reused later to call the model once after loading the weights.
batch = next(iter(val_dataset_filtered))
batch[0].shape, batch[1].shape

(TensorShape([128, 380, 600]), TensorShape([128, 64]))

### Model layers

In [None]:
#Copied from previous comp 1st place model: https://www.kaggle.com/code/hoyso48/1st-place-solution-training
class ECA(tf.keras.layers.Layer):
    """Channel attention: rescales each channel by a learned gate in (0, 1).

    The gate is computed from the (masked) average of the input over axis 1,
    passed through a bias-free 1-D convolution across the channel axis and a
    sigmoid.

    Args:
        kernel_size: width of the convolution applied across channels.
    """
    def __init__(self, kernel_size=5, **kwargs):
        super().__init__(**kwargs)
        self.supports_masking = True
        self.kernel_size = kernel_size
        self.conv = tf.keras.layers.Conv1D(1, kernel_size=kernel_size, strides=1, padding="same", use_bias=False)

    def call(self, inputs, mask=None):
        """Return `inputs` multiplied by the per-channel gate."""
        # Mask-aware average over axis 1 -> (batch, channels).
        nn = tf.keras.layers.GlobalAveragePooling1D()(inputs, mask=mask)
        # Treat the channel axis as the sequence axis of the convolution.
        nn = tf.expand_dims(nn, -1)
        nn = self.conv(nn)
        nn = tf.squeeze(nn, -1)
        nn = tf.nn.sigmoid(nn)
        # Broadcast the gate over axis 1.
        nn = nn[:,None,:]
        return inputs * nn

class CausalDWConv1D(tf.keras.layers.Layer):
    """Depthwise 1-D convolution with left-only zero padding.

    The input is padded with dilation_rate * (kernel_size - 1) zeros at the
    start of axis 1 and then convolved with 'valid' padding, so the output has
    the same length as the input and no position sees later positions.

    Args:
        kernel_size: convolution kernel width.
        dilation_rate: dilation of the depthwise convolution.
        use_bias: whether the convolution has a bias term.
        depthwise_initializer: initializer for the depthwise kernel.
        name: layer name; also used as prefix for the two sub-layers.
    """
    def __init__(self,
        kernel_size=17,
        dilation_rate=1,
        use_bias=False,
        depthwise_initializer='glorot_uniform',
        name='', **kwargs):
        super().__init__(name=name,**kwargs)
        self.causal_pad = tf.keras.layers.ZeroPadding1D((dilation_rate*(kernel_size-1),0),name=name + '_pad')
        self.dw_conv = tf.keras.layers.DepthwiseConv1D(
                            kernel_size,
                            strides=1,
                            dilation_rate=dilation_rate,
                            padding='valid',
                            use_bias=use_bias,
                            depthwise_initializer=depthwise_initializer,
                            name=name + '_dwconv')
        self.supports_masking = True

    def call(self, inputs):
        """Pad on the left, then apply the depthwise convolution."""
        x = self.causal_pad(inputs)
        x = self.dw_conv(x)
        return x

def Conv1DBlock(channel_size,
          kernel_size,
          dilation_rate=1,
          drop_rate=0.0,
          expand_ratio=2,
          se_ratio=0.25,
          activation='swish',
          name=None):
    '''
    efficient conv1d block, @hoyso48

    Builds: Dense expansion -> causal depthwise conv -> BatchNorm -> ECA ->
    Dense projection -> optional Dropout -> residual add (only when the input
    already has `channel_size` channels).

    Args:
        channel_size: number of output channels.
        kernel_size: kernel width of the depthwise convolution.
        dilation_rate: dilation of the depthwise convolution.
        drop_rate: dropout rate after the projection; 0 disables the layer.
        expand_ratio: channel multiplier of the expansion Dense layer.
        se_ratio: accepted but not used in this function.
        activation: activation of the expansion Dense layer.
        name: prefix for the layer names; a unique id is generated when None.

    Returns:
        A function that applies the block to a Keras tensor.
    '''
    if name is None:
        name = str(tf.keras.backend.get_uid("mbblock"))
    # Expansion phase
    def apply(inputs):
        channels_in = tf.keras.backend.int_shape(inputs)[-1]
        channels_expand = channels_in * expand_ratio

        skip = inputs

        x = tf.keras.layers.Dense(
            channels_expand,
            use_bias=True,
            activation=activation,
            name=name + '_expand_conv')(inputs)

        # Depthwise Convolution
        x = CausalDWConv1D(kernel_size,
            dilation_rate=dilation_rate,
            use_bias=False,
            name=name + '_dwconv')(x)

        x = tf.keras.layers.BatchNormalization(momentum=0.95, name=name + '_bn')(x)

        x  = ECA()(x)

        # Projection back to channel_size (no activation).
        x = tf.keras.layers.Dense(
            channel_size,
            use_bias=True,
            name=name + '_project_conv')(x)

        # noise_shape=(None,1,1): one keep/drop decision per sample.
        if drop_rate > 0:
            x = tf.keras.layers.Dropout(drop_rate, noise_shape=(None,1,1), name=name + '_drop')(x)

        # The residual connection needs matching channel counts.
        if (channels_in == channel_size):
            x = tf.keras.layers.add([x, skip], name=name + '_add')
        return x

    return apply

class MultiHeadSelfAttention(tf.keras.layers.Layer):
    """Multi-head self-attention with a fused, bias-free QKV projection.

    Args:
        dim: model width; queries, keys and values share it across the heads.
        num_heads: number of attention heads.
        dropout: dropout rate applied to the attention weights.
    """
    def __init__(self, dim=256, num_heads=4, dropout=0, **kwargs):
        super().__init__(**kwargs)
        self.dim = dim
        # NOTE: the scale is derived from the full `dim`, not the per-head size.
        self.scale = self.dim ** -0.5
        self.num_heads = num_heads
        self.qkv = tf.keras.layers.Dense(3 * dim, use_bias=False)
        self.drop1 = tf.keras.layers.Dropout(dropout)
        self.proj = tf.keras.layers.Dense(dim, use_bias=False)
        self.supports_masking = True

    def call(self, inputs, mask=None):
        """Apply self-attention over axis 1 of `inputs`, honouring `mask` if given."""
        qkv = self.qkv(inputs)
        # (batch, steps, 3*dim) -> (batch, heads, steps, 3*dim/heads)
        qkv = tf.keras.layers.Permute((2, 1, 3))(tf.keras.layers.Reshape((-1, self.num_heads, self.dim * 3 // self.num_heads))(qkv))
        q, k, v = tf.split(qkv, [self.dim // self.num_heads] * 3, axis=-1)

        attn = tf.matmul(q, k, transpose_b=True) * self.scale

        # Broadcast the step mask over heads and query positions so that
        # masked key positions are excluded from the softmax.
        if mask is not None:
            mask = mask[:, None, None, :]

        attn = tf.keras.layers.Softmax(axis=-1)(attn, mask=mask)
        attn = self.drop1(attn)

        x = attn @ v
        # (batch, heads, steps, dim/heads) -> (batch, steps, dim)
        x = tf.keras.layers.Reshape((-1, self.dim))(tf.keras.layers.Permute((2, 1, 3))(x))
        x = self.proj(x)
        return x


def TransformerBlock(dim=256, num_heads=6, expand=4, attn_dropout=0.2, drop_rate=0.2, activation='swish'):
    """Pre-norm transformer block: self-attention and feed-forward, each with a residual.

    Args:
        dim: model width.
        num_heads: number of attention heads.
        expand: width multiplier of the hidden feed-forward layer.
        attn_dropout: dropout rate on the attention weights.
        drop_rate: dropout rate after each sub-block (one decision per sample).
        activation: activation of the hidden feed-forward layer.

    Returns:
        A function that applies the block to a Keras tensor.
    """
    def apply(inputs):
        # Attention sub-block.
        x = inputs
        x = tf.keras.layers.LayerNormalization(epsilon=1e-6)(x)
        x = MultiHeadSelfAttention(dim=dim,num_heads=num_heads,dropout=attn_dropout)(x)
        x = tf.keras.layers.Dropout(drop_rate, noise_shape=(None,1,1))(x)
        x = tf.keras.layers.Add()([inputs, x])
        attn_out = x

        # Feed-forward sub-block (bias-free Dense layers).
        x = tf.keras.layers.LayerNormalization(epsilon=1e-6)(x)
        x = tf.keras.layers.Dense(dim*expand, use_bias=False, activation=activation)(x)
        x = tf.keras.layers.Dense(dim, use_bias=False)(x)
        x = tf.keras.layers.Dropout(drop_rate, noise_shape=(None,1,1))(x)
        x = tf.keras.layers.Add()([attn_out, x])
        return x
    return apply

def positional_encoding(maxlen, num_hid):
    """Sinusoidal position table.

    Args:
        maxlen: number of positions.
        num_hid: encoding width; half of the columns are sines, half cosines.

    Returns:
        float32 tensor with one row per position: all sine columns first,
        followed by all cosine columns.
    """
    half_width = num_hid/2
    position_column = tf.range(maxlen, dtype = tf.float32)[..., tf.newaxis]
    exponents = tf.range(half_width, dtype = tf.float32)[np.newaxis, :]/half_width
    # 1 / 10000^(k / half_width) for k = 0 .. half_width - 1
    inverse_freq = tf.math.divide(1, tf.math.pow(tf.cast(10000, tf.float32), exponents))
    angles = tf.linalg.matmul(position_column, inverse_freq)
    return tf.concat([tf.math.sin(angles), tf.math.cos(angles)], axis=-1)

class LateDropout(tf.keras.layers.Layer):
    """Dropout that stays inactive until `start_step` training calls have happened.

    Args:
        rate: dropout rate.
        noise_shape: forwarded to the wrapped Dropout layer.
        start_step: number of training calls to pass inputs through unchanged.
    """
    def __init__(self, rate, noise_shape=None, start_step=0, **kwargs):
        super().__init__(**kwargs)
        self.supports_masking = True
        self.rate = rate
        self.start_step = start_step
        self.dropout = tf.keras.layers.Dropout(rate, noise_shape=noise_shape)

    def build(self, input_shape):
        super().build(input_shape)
        # Non-trainable call counter; under a distribution strategy only the
        # first replica's update is kept.
        agg = tf.VariableAggregation.ONLY_FIRST_REPLICA
        self._train_counter = tf.Variable(0, dtype="int64", aggregation=agg, trainable=False)

    def call(self, inputs, training=False):
        """Return `inputs` unchanged before start_step, the dropout output afterwards."""
        x = tf.cond(self._train_counter < self.start_step, lambda:inputs, lambda:self.dropout(inputs, training=training))
        # The counter only advances on training calls.
        if training:
            self._train_counter.assign_add(1)
        return x

In [None]:
def CTCLoss(labels, logits):
    """Batch-mean CTC loss with the pad token acting as the blank symbol.

    Args:
        labels: integer label ids, padded with pad_token_idx.
        logits: model outputs; axis 0 is the batch, axis 1 the time steps.

    Returns:
        Scalar: the mean of the per-sample losses from classic_ctc_loss.
    """
    # A label's length is its number of non-pad entries.
    is_real_token = tf.cast(labels != pad_token_idx, tf.int32)
    label_length = tf.reduce_sum(is_real_token, axis=-1)
    # Every sample uses the full time axis of the logits.
    logit_length = tf.ones(tf.shape(logits)[0], dtype=tf.int32) * tf.shape(logits)[1]

    per_sample_loss = classic_ctc_loss(
        labels=labels,
        logits=logits,
        label_length=label_length,
        logit_length=logit_length,
        blank_index=pad_token_idx,
    )
    return tf.reduce_mean(per_sample_loss)

In [None]:
def get_model(dim = 384, dropout_step=0):
    """Build and compile the model inside the distribution strategy scope.

    Architecture: masking -> Dense stem + BatchNorm -> three stages of
    (3 x Conv1DBlock with kernels 11/7/3, then one TransformerBlock), with the
    positional encoding added after the first group of conv blocks -> Dense
    head with LateDropout -> per-step logits over len(char_to_num) classes.

    Args:
        dim: model width.
        dropout_step: training step from which the head dropout becomes active.

    Returns:
        The compiled tf.keras.Model (CTC loss, Lookahead(RectifiedAdam)).
    """
    with strategy.scope():
        inp = tf.keras.Input(INPUT_SHAPE)
        x = inp

        # Time steps whose features are all 0.0 (the padding value) are masked.
        x = tf.keras.layers.Masking(mask_value=0.0)(x)
        x = tf.keras.layers.Dense(dim, use_bias=False,name='stem_conv')(x)
        x = tf.keras.layers.BatchNormalization(momentum=0.95,name='stem_bn')(x)

        x = Conv1DBlock(dim,11,drop_rate=0.2)(x)
        x = Conv1DBlock(dim,7,drop_rate=0.2)(x)
        x = Conv1DBlock(dim,3,drop_rate=0.2)(x)

        # Positional information is added once, before the first transformer block.
        x = x + positional_encoding(INPUT_SHAPE[0], dim)

        x = TransformerBlock(dim,expand=2)(x)

        x = Conv1DBlock(dim,11,drop_rate=0.2)(x)
        x = Conv1DBlock(dim,7,drop_rate=0.2)(x)
        x = Conv1DBlock(dim,3,drop_rate=0.2)(x)
        x = TransformerBlock(dim,expand=2)(x)

        x = Conv1DBlock(dim,11,drop_rate=0.2)(x)
        x = Conv1DBlock(dim,7,drop_rate=0.2)(x)
        x = Conv1DBlock(dim,3,drop_rate=0.2)(x)
        x = TransformerBlock(dim,expand=2)(x)

        x = tf.keras.layers.Dense(dim*2,activation='relu',name='top_conv')(x)
        x = LateDropout(0.4, start_step=dropout_step)(x)
        # One logit per entry of char_to_num (which includes the pad/blank token).
        x = tf.keras.layers.Dense(len(char_to_num))(x)

        model = tf.keras.Model(inp, x)

        loss = CTCLoss

        # RectifiedAdam wrapped in Lookahead
        optimizer = tfa.optimizers.RectifiedAdam(sma_threshold=4.0)
        optimizer = tfa.optimizers.Lookahead(optimizer, sync_period=5)

        model.compile(loss=loss, optimizer=optimizer)

        return model

# Reset Keras' global state before building the model.
tf.keras.backend.clear_session()

print('dim: ' + str(dim))

# Build the architecture, then restore the trained weights from the .h5 file.
model = get_model(dim = dim)
model.load_weights(f"{input_folder}/aslfr-base-model/model_epoch_299.h5")
# Run one forward pass on the sample batch before printing the summary.
model(batch[0])
model.summary()

dim: 384
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 380, 600)]   0           []                               
                                                                                                  
 masking (Masking)              (None, 380, 600)     0           ['input_1[0][0]']                
                                                                                                  
 stem_conv (Dense)              (None, 380, 384)     230400      ['masking[0][0]']                
                                                                                                  
 stem_bn (BatchNormalization)   (None, 380, 384)     1536        ['stem_conv[0][0]']              
                                                                                     

In [None]:
def num_to_char_fn(y):
    """Translate an iterable of token indices into a list of characters.

    Each index is looked up in the module-level ``num_to_char`` mapping;
    indices that are not present map to the empty string.
    """
    chars = []
    for token in y:
        chars.append(num_to_char.get(token, ""))
    return chars

@tf.function()
def decode_phrase(pred):
    """Greedy decode of one sequence of per-step class scores.

    Takes the argmax over axis 1 for every step, collapses each run of
    identical consecutive classes into a single token, and finally removes
    every token equal to ``pad_token_idx``.

    Args:
        pred: 2-D array/tensor of scores; assumed to be laid out as
            (steps, classes) since the argmax is taken over axis 1.

    Returns:
        1-D tensor of the decoded token indices (argmax output).
    """
    x = tf.argmax(pred, axis=1)
    # Keep the FIRST step of every run of equal classes. Gathering at the
    # positions where x[i] != x[i + 1] (the previous approach) can never emit
    # the final run, so a sequence that ends on a non-pad token lost its last
    # character and a constant sequence decoded to nothing.
    # `x[:1]` is used for the leading True so an empty input yields an empty
    # mask instead of a length mismatch.
    run_start = tf.concat(
        [tf.ones_like(x[:1], dtype=tf.bool), tf.not_equal(x[1:], x[:-1])],
        axis=0,
    )
    x = tf.boolean_mask(x, run_start, axis=0)
    # Drop the pad tokens that separated (or padded) the runs.
    mask = x != pad_token_idx
    x = tf.boolean_mask(x, mask, axis=0)
    return x

# Helper that turns a batch of network outputs into text
def decode_batch_predictions(pred):
    """Decode every element of ``pred`` into a string.

    Each element is passed through ``decode_phrase``, the resulting token
    indices are mapped to characters with ``num_to_char_fn`` and joined.

    Returns:
        A list with one decoded string per element of ``pred``.
    """
    return [
        "".join(num_to_char_fn(decode_phrase(sample).numpy()))
        for sample in pred
    ]

### Make predictions and save

In [None]:
# Load the character -> index map and invert it so that indices (predicted or
# target) can be turned back into characters.
with open(f"{input_folder}/asl-fingerspelling/character_to_prediction_index.json", "r") as f:
    character_map = json.load(f)
rev_character_map = {j: i for i, j in character_map.items()}

preds = []
targets = []
lev_dist = []  # one Levenshtein distance per sample, reused for the final metric

for i in range(len(tffiles)):
    scores = []

    tffile = tffiles[i]
    tffile_name = tffile.split('/')[-1].split('.')[0]
    print(tffile_name)
    dataset_tf = get_tfrec_dataset(tffile, batch_size=batch_size, max_len=MAX_LEN, drop_remainder=False,
                                   shuffle=False, to_filter = False, cache = cache)
    # Stream the batches instead of materialising the whole file in a list.
    for data_batch in dataset_tf:
        preds_batch = model.predict(data_batch[0], verbose = 0)
        targets_batch = data_batch[1]
        for pred_idx in range(len(preds_batch)):
            pred = "".join([rev_character_map.get(s, "") for s in decode_phrase(preds_batch[pred_idx]).numpy()])
            target = "".join([rev_character_map.get(s, "") for s in targets_batch[pred_idx].numpy()])
            preds.append(pred)
            targets.append(target)
            # Compute the edit distance once and reuse it for both the
            # per-sample score and the overall metric below.
            dist = lev.distance(pred, target)
            lev_dist.append(dist)
            # NOTE(review): this divides by len(target), so an empty decoded
            # target would raise ZeroDivisionError — confirm targets are never empty.
            scores.append((len(target) - dist)/len(target))
    # Use a context manager so the score file is flushed and closed for every
    # input file instead of leaking an open handle per iteration.
    with open(f'{save_folder}/{tffile_name}.p', "wb") as score_file:
        pickle.dump(scores, score_file)

# Overall metric: (total target characters - total edit distance) / total target characters.
N = [len(phrase) for phrase in targets]
metric_result = (np.sum(N) - np.sum(lev_dist))/np.sum(N)
print(metric_result)

5414471
105143404
128822441
149822653
152029243
169560558
175396851
234418913
296317215
349393104
388576474
425182931
433948159
450474571
474255203
495378749
522550314
527708222
532011803
546816846
566963657
568753759
614661748
638508439
649779897
654436541
683666742
871280215
882979387
933868835
939623093
1019715464
1021040628
1098899348
1099408314
1133664520
1134756332
1255240050
1320204318
1341528257
1358493307
1365275733
1365772051
1405046009
1448136004
1497621680
1552432300
1557244878
1562234637
1643479812
1647220008
1662742697
1664666588
1726141437
1785039512
1865557033
1880177496
1905462118
1906357076
1920330615
1967755728
1969985709
1997878546
2026717426
2036580525
2072296290
2072876091
2118949241
0.8178067446979561


In [None]:
# Final cell: hand the Colab runtime back once everything above has run.
# NOTE(review): presumably this terminates the session and discards all
# in-memory state — confirm before adding cells after this one.
from google.colab import runtime
runtime.unassign()