# Google - American Sign Language Fingerspelling Recognition

## TPU setup

In [None]:
import tensorflow as tf

# Resolve the TPU cluster. tpu='' presumably lets the runtime auto-detect the
# attached TPU — TODO confirm for non-Colab environments.
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
# Connect this process to the resolved cluster.
tf.config.experimental_connect_to_cluster(resolver)
# This is the TPU initialization code that has to be at the beginning.
tf.tpu.experimental.initialize_tpu_system(resolver)
# Show the logical TPU devices that are visible after initialization.
print("All devices: ", tf.config.list_logical_devices('TPU'))



All devices:  [LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:0', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:1', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:2', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:3', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:4', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:5', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:6', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:7', device_type='TPU')]


In [None]:
# Distribution strategy built on the resolver above; later cells create the model inside strategy.scope().
strategy = tf.distribute.TPUStrategy(resolver)

## library

In [None]:
!pip install transformers
!pip install leven



In [None]:
import glob
import pandas as pd
from tqdm import tqdm
import time
import json
import random
import os

import torch
import torch.nn as nn
from torch.nn.utils import clip_grad_norm_, weight_norm

import numpy as np
import gc
from leven import levenshtein

from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

from transformers import BartConfig, BartModel, BartForConditionalGeneration,  GPT2Model, GPT2Config, RobertaPreLayerNormConfig, RobertaPreLayerNormModel
from transformers.optimization import get_cosine_schedule_with_warmup

from matplotlib import pyplot as plt

import tensorflow as tf
from keras import layers
from transformers import BartConfig, TFBartModel
from tensorflow.keras.optimizers import AdamW
from tensorflow.keras.optimizers.schedules import LearningRateSchedule

from keras import backend as K
from tensorflow.keras.optimizers import AdamW, Adam
from keras.losses import CategoricalCrossentropy
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import Sequence
import math
import keras

## config

In [None]:
# Landmark columns that are kept from each raw frame.

# Face landmark indices that are retained (named LIP; presumably the lip
# contour of the face mesh — TODO confirm).
LIP = [
    61, 185, 40, 39, 37, 0, 267, 269, 270, 409,
    291, 146, 91, 181, 84, 17, 314, 405, 321, 375,
    78, 191, 80, 81, 82, 13, 312, 311, 310, 415,
    95, 88, 178, 87, 14, 317, 402, 318, 324, 308
    ]


def _xyz_columns(part, indices):
    """Return all x_ names, then all y_ names, then all z_ names for one body part."""
    return [f'{axis}_{part}_{idx}' for axis in 'xyz' for idx in indices]


FRAME = ['frame']
LHAND = _xyz_columns('left_hand', range(21))
RHAND = _xyz_columns('right_hand', range(21))
POSE = _xyz_columns('pose', range(33))
FACE = _xyz_columns('face', LIP)

# Final column order: frame index, left hand, right hand, pose, selected face points.
cols = FRAME + LHAND + RHAND + POSE + FACE
len(cols)

346

In [None]:
# Configuration for the CTC experiment.

class CustomConfig():
  """Paths, data layout, token ids and training hyper-parameters used by the notebook cells."""

  # --- data location / splitting ---
  path = '/content/drive/MyDrive/Kaggle/Google - American Sign Language Fingerspelling Recognition/'
  seed = 42
  n_folds = 5
  use_suppl = True  # append the supplemental metadata to every training fold

  # --- input layout ---
  columns = cols[1:]  # every selected column except the leading 'frame' column
  max_frame = 384  # frames per sample after truncation / padding
  max_label = 384  # label length after padding
  concat_max_label = 43
  n_hand_landmarks = 21
  n_pose_landmarks = 33
  n_face_landmarks = 40
  n_landmarks = 115  # 2 hands * 21 + 33 pose + 40 face
  drop_z = True
  drop_nan = False

  # --- token ids ---
  encoder_pad_token = 0
  decoder_bos_token = 59
  decoder_eos_token = 60
  decoder_pad_token = 61
  blank_token = 62
  blank_string = 'ϵ'
  vocab_size = 63

  # --- custom model config ---
  input_dim = n_landmarks * (2 if drop_z else 3)  # coordinates per frame
  hidden_dim = 256
  drop_rate = 0.2
  n_beams = 1

  # --- train config ---
  ignore_index = -100
  smoothing = 0.0
  n_epochs = 200
  early_stop = 200
  score_start_mode = 'epoch'
  score_start_epoch = 1
  score_start_train_acc = 0.8
  n_workers = 8
  lr = 1e-3
  batch_size = 128
  device = 'cuda'
  wd = 0.01
  warmup_ratio = 0.1
  clip_grad_norm = True
  max_norm = 5.0

  # --- inference config ---
  pytorch_model_path = path + 'Model/result-ctc6/best-acc-epoch200.bin'



if __name__ == "__main__":
  # Shared configuration instance; the functions below read it as the global `args`.
  args = CustomConfig()

## preprocess

In [None]:
def preprocess(args):
  """Load the metadata CSVs and build participant-level K-fold splits.

  Args:
    args: config object providing `path`, `n_folds` and `use_suppl`.

  Returns:
    (folds, train, supplemental_metadata) where `folds` is a list of
    [train_df, val_df] pairs. Validation rows always come from train.csv;
    when `args.use_suppl` is set, the supplemental rows are appended to
    every training frame.
  """
  data_dir = args.path + 'Data Sources/'
  train = pd.read_csv(data_dir + 'train.csv')
  supplemental_metadata = pd.read_csv(data_dir + 'supplemental_metadata.csv')

  # Remove the supplemental row at position 35589.
  # NOTE(review): the reason for excluding this row is not recorded here.
  rows_to_drop = supplemental_metadata.index[[35589]]
  supplemental_metadata = supplemental_metadata.drop(rows_to_drop).reset_index(drop=True)

  print('num_of_train : ', len(train))
  print('num_of_supplemental_metadata : ', len(supplemental_metadata))

  # Split on unique participant ids so one participant never appears in both sides of a fold.
  participant_ids = np.array(sorted(set(train['participant_id'])))
  splitter = KFold(n_splits=args.n_folds, shuffle=False)

  folds = []
  for train_index, val_index in splitter.split(participant_ids):
    in_train = train['participant_id'].isin(participant_ids[train_index])
    in_val = train['participant_id'].isin(participant_ids[val_index])

    train_df = train[in_train].reset_index(drop=True)
    if args.use_suppl:
      train_df = pd.concat([train_df, supplemental_metadata]).reset_index(drop=True)
    val_df = train[in_val].reset_index(drop=True)

    folds.append([train_df, val_df])
  return folds, train, supplemental_metadata

if __name__ == "__main__":
  # Build the fold list and keep the full train / supplemental frames for later cells.
  folds, train, supplemental_metadata = preprocess(args)

num_of_train :  67208
num_of_supplemental_metadata :  52957


## utils

In [None]:
def levenshtein_score(preds, trues, reduce='mean'):
  """Per-sample normalized edit-distance score, (len(true) - distance) / len(true).

  Args:
    preds: sequence of predicted strings.
    trues: sequence of target strings, same length as `preds`.
    reduce: 'mean' to return the average score, 'none' to return the list of scores.

  Returns:
    A float (reduce='mean') or a list of floats (reduce='none').

  Raises:
    ValueError: if `reduce` is not 'mean' or 'none', or the lengths differ.
  """
  if reduce not in ('mean', 'none'):
    raise ValueError(f"reduce must be 'mean' or 'none', got {reduce!r}")
  if len(preds) != len(trues):
    raise ValueError(f"preds and trues differ in length: {len(preds)} vs {len(trues)}")

  scores = []
  for pred, true in zip(preds, trues):
    if len(true) == 0:
      # The normalized score is undefined for an empty target; treat an empty
      # prediction as a perfect match and anything else as a complete miss.
      score = 1.0 if len(pred) == 0 else 0.0
    else:
      score = (len(true) - levenshtein(pred, true)) / len(true)
    scores.append(score)

  if reduce == 'mean':
    return np.mean(scores)
  return scores

def lr_warmup_cosine_decay(global_step,
                           warmup_steps,
                           hold = 0,
                           total_steps=0,
                           start_lr=0.0,
                           target_lr=1e-3):
    """Learning rate for a linear-warmup / optional-hold / cosine-decay schedule.

    Args:
        global_step: current optimizer step (0-based).
        warmup_steps: number of steps over which the rate rises from `start_lr`
            to `target_lr`; 0 disables warmup.
        hold: number of steps to stay at `target_lr` after warmup.
        total_steps: step at which the cosine decay reaches zero.
        start_lr: learning rate at step 0 of the warmup.
        target_lr: peak learning rate.

    Returns:
        The learning rate as a 0-d numpy array.
    """
    decay_start = warmup_steps + hold
    # Guard against a zero/negative decay span, and clamp the elapsed steps so
    # the rate stays at zero after `total_steps` instead of rising again.
    decay_steps = max(float(total_steps - decay_start), 1.0)
    elapsed = np.clip(global_step - decay_start, 0, decay_steps)

    # Cosine decay
    learning_rate = 0.5 * target_lr * (1 + np.cos(np.pi * elapsed / decay_steps))

    # Hold at the peak until the decay phase starts.
    if hold > 0:
        learning_rate = np.where(global_step > decay_start,
                                 learning_rate, target_lr)

    # Linear warmup from start_lr to target_lr (skipped when warmup_steps == 0,
    # which previously divided by zero).
    if warmup_steps > 0:
        warmup_lr = start_lr + (target_lr - start_lr) * (global_step / warmup_steps)
    else:
        warmup_lr = target_lr

    learning_rate = np.where(global_step < warmup_steps, warmup_lr, learning_rate)
    return learning_rate

class WarmupCosineDecay(keras.callbacks.Callback):
    """Per-batch learning-rate scheduler driven by `lr_warmup_cosine_decay`.

    Before each batch the optimizer's learning rate is overwritten with the
    scheduled value; after each batch the step counter advances and the rate
    that was in effect is recorded in `self.lrs`.
    """
    def __init__(self, total_steps=0, warmup_steps=0, start_lr=0.0, target_lr=1e-3, hold=0):

        super(WarmupCosineDecay, self).__init__()
        # Schedule parameters.
        self.total_steps = total_steps
        self.warmup_steps = warmup_steps
        self.hold = hold
        self.start_lr = start_lr
        self.target_lr = target_lr
        # Running state.
        self.global_step = 0
        self.lrs = []

    def on_batch_end(self, batch, logs=None):
        self.global_step += 1
        self.lrs.append(self.model.optimizer.lr.numpy())

    def on_batch_begin(self, batch, logs=None):
        scheduled_lr = lr_warmup_cosine_decay(
            global_step=self.global_step,
            total_steps=self.total_steps,
            warmup_steps=self.warmup_steps,
            start_lr=self.start_lr,
            target_lr=self.target_lr,
            hold=self.hold,
        )
        K.set_value(self.model.optimizer.lr, scheduled_lr)

class TrainLoggerCallback(tf.keras.callbacks.Callback):
    """Append one line of training progress per epoch to a text log file.

    Args:
        log_file: path of the text file to append to.
        model: model whose optimizer learning rate is logged.
        dataset: stored on the instance; not read by this callback.
    """
    def __init__(self, log_file, model, dataset):
        super().__init__()
        self.log_file = log_file
        self.model = model
        self.dataset = dataset

    def on_train_begin(self, logs=None):
        with open(self.log_file, 'a+') as f:
            f.write("train start! \n")

    def on_epoch_end(self, epoch, logs=None):
        # `logs` defaults to None and has no validation keys when fit() runs
        # without validation data; fall back to None values instead of crashing.
        logs = logs or {}
        loss = logs.get('classifier_1_loss')
        val_loss = logs.get('val_classifier_1_loss')
        with open(self.log_file, 'a') as f:
            f.write(f"epoch : {epoch+1}, lr : {self.model.optimizer.lr.numpy()}, loss : {loss}, val_loss : {val_loss}\n")

## seed

In [None]:
def seed_everything(seed: int = 1):
    """Seed Python, NumPy, TensorFlow/Keras and PyTorch and request deterministic kernels.

    Args:
        seed: value used for every random number generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    tf.random.set_seed(seed)
    keras.utils.set_random_seed(seed)

    os.environ['TF_DETERMINISTIC_OPS'] = '1'
    os.environ['TF_CUDNN_DETERMINISTIC'] = '1'

    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # cuDNN benchmarking auto-selects algorithms per run and can pick different
    # ones between runs, so it must be off for reproducible results.
    torch.backends.cudnn.benchmark = False

## gcs

In [None]:
# Authenticate the Colab user; the google.cloud.storage client used below
# presumably relies on these credentials — TODO confirm.
from google.colab import auth
auth.authenticate_user()

In [None]:
from typing import List

from google.cloud import storage

def set_bucket_public_iam(
    bucket_name: str = "your-bucket-name",
    members: List[str] = ["allUsers"],
):
    """Grant `roles/storage.objectViewer` on a bucket to `members`.

    With the default `members` (["allUsers"]) this makes every object in the
    bucket readable by anyone.

    Args:
        bucket_name: name of the Cloud Storage bucket to modify.
        members: IAM member identifiers that receive the viewer role.
    """
    bucket = storage.Client().bucket(bucket_name)

    viewer_binding = {"role": "roles/storage.objectViewer", "members": members}

    # Read the current policy, add the binding, and write the policy back.
    iam_policy = bucket.get_iam_policy(requested_policy_version=3)
    iam_policy.bindings.append(viewer_binding)
    bucket.set_iam_policy(iam_policy)

    print(f"Bucket {bucket.name} is now publicly readable")

def pre_process0(x):
    """Turn flat per-frame columns into (frames, landmarks, 2) x/y coordinates.

    The input is truncated to `args.max_frame` rows, NaNs are replaced with
    zeros, each body part is reshaped from its [x..., y..., z...] column layout
    to (frames, points, 3), the parts are stacked along the landmark axis and
    the z coordinate is dropped.

    Args:
        x: 2-D float tensor; assumes the columns follow `cols[1:]`
           (left hand, right hand, pose, face) — TODO confirm against the caller.
    """
    x = x[:args.max_frame]
    x = tf.where(tf.math.is_nan(x), tf.zeros_like(x), x)
    n_frames = tf.shape(x)[0]

    def _part(start, stop, n_points):
        # Columns are grouped by axis, so reshape to (frames, 3, points) and
        # move the axis dimension last.
        block = tf.reshape(x[:, start:stop], (n_frames, 3, n_points))
        return tf.transpose(block, (0, 2, 1))

    parts = [
        _part(0, 63, args.n_hand_landmarks),     # left hand
        _part(63, 126, args.n_hand_landmarks),   # right hand
        _part(126, 225, args.n_pose_landmarks),  # pose
        _part(225, 345, args.n_face_landmarks),  # face
    ]
    stacked = tf.concat(parts, axis=1)

    # Keep only x and y.
    return stacked[:, :, :2]
# NOTE(review): with the default members (["allUsers"]) this grants object-viewer access on the bucket to everyone.
set_bucket_public_iam('gaslfr')

Bucket gaslfr is now publicly readable


## Augmentation and sample concatenation

In [None]:
def reverse_frames(x, y):
    """Return both sequences reversed along their first axis."""
    return x[::-1], y[::-1]

def flip_hand(video):
    """Mirror both hands horizontally and swap the left and right hand slots.

    Each hand's x coordinates are reflected around that hand's per-frame mean
    x; the remaining landmarks are passed through unchanged.

    Args:
        video: tensor reshaped here to (frames, args.n_landmarks, 2); the first
               2 * args.n_hand_landmarks landmarks are treated as left then right hand.

    Returns:
        Tensor of shape (frames, args.input_dim).
    """
    n_hand = args.n_hand_landmarks
    frames = tf.reshape(video, shape=(-1, args.n_landmarks, 2))

    def _mirror_x(hand):
        # Reflect x around the per-frame mean: x' = -x + 2 * mean(x).
        xs = hand[:, :, 0]
        mirrored = tf.negative(xs) + 2 * tf.reduce_mean(xs, axis=1, keepdims=True)
        return tf.concat([tf.expand_dims(mirrored, axis=-1), hand[:, :, 1:]], axis=-1)

    lhand = _mirror_x(frames[:, :n_hand])
    rhand = _mirror_x(frames[:, n_hand:int(2 * n_hand)])
    other = frames[:, int(2 * n_hand):]

    # The right hand goes into the left slot and vice versa.
    swapped = tf.concat([rhand, lhand, other], axis=1)
    return tf.reshape(swapped, shape=(-1, args.input_dim))

def augment_fn(x, y):
    """Randomly augment one (coordinates, phrase) pair.

    Two independent coin flips (probability 0.5 each) decide whether to
    reverse both the frames and the label, and whether to mirror/swap the hands.

    Args:
        x: coordinate tensor for one sample.
        y: label tensor for the same sample.

    Returns:
        The (possibly augmented) pair (x, y).
    """
    def reverse_fn():
        # Reverses the frame order and the label order together.
        return reverse_frames(x, y)

    def flip_fn():
        return flip_hand(x)

    # Draw both decisions before applying either transform.
    should_reverse = tf.random.uniform(()) < 0.5
    should_flip = tf.random.uniform(()) < 0.5

    x, y = tf.cond(should_reverse, reverse_fn, lambda: (x, y))
    # flip_fn closes over `x`, so it sees the result of the reverse step above.
    x = tf.cond(should_flip, flip_fn, lambda: x)

    return x, y

def cat_augment(inputs, _inputs):
    """Append a second sample to the first when the combined clip stays short enough.

    Args:
        inputs: (x, y) pair of the primary sample.
        _inputs: (x, y) pair of the sample to append.

    Returns:
        The concatenated (frames, label) pair if the total number of frames is
        below `args.max_frame`, otherwise the primary sample unchanged.
    """
    first_x, first_y = inputs
    second_x, second_y = _inputs

    total_frames = tf.shape(first_x)[0] + tf.shape(second_x)[0]
    fits = total_frames < args.max_frame

    def _joined():
        return (tf.concat([first_x, second_x], axis=0),
                tf.concat([first_y, second_y], axis=0))

    def _unchanged():
        return first_x, first_y

    x, y = tf.cond(fits, _joined, _unchanged)
    return x, y


def decode_fn(record_bytes, augment=False):
    """Parse one serialized example into (features, phrase).

    Args:
        record_bytes: serialized tf.train.Example with a float 'coordinates'
            feature and an int64 'phrase' feature.
        augment: when True, apply `augment_fn` before the delta features are computed.

    Returns:
        (features, phrase) where features has 2 * args.input_dim columns: the
        coordinates followed by their frame-to-frame differences (the last
        frame's difference is zero-padded).
    """
    parsed = tf.io.parse_single_example(record_bytes, {
        'coordinates': tf.io.VarLenFeature(tf.float32),
        'phrase': tf.io.VarLenFeature(tf.int64),
    })

    coordinates = tf.reshape(tf.sparse.to_dense(parsed["coordinates"]), (-1, args.input_dim))
    phrase = tf.sparse.to_dense(parsed["phrase"])

    if augment:
        coordinates, phrase = augment_fn(coordinates, phrase)

    def _frame_deltas():
        # next frame minus current frame, padded with one zero row at the end
        return tf.pad(coordinates[1:] - coordinates[:-1], [[0, 1], [0, 0]])

    def _zero_deltas():
        return tf.zeros_like(coordinates)

    has_multiple_frames = tf.shape(coordinates)[0] > 1
    dx = tf.cond(has_multiple_frames, _frame_deltas, _zero_deltas)

    features = tf.concat([coordinates, dx], axis=-1)
    return features, phrase

In [None]:
# Build the tf.data input pipelines for fold 0.
folds, train, supplemental_metadata = preprocess(args)
train_df, val_df = folds[0]

train_tffiles = tf.io.gfile.glob('gs://gaslfr/TfrecordsV2/train_tfds/*')
val_tffiles = tf.io.gfile.glob('gs://gaslfr/TfrecordsV2/val_tfds/*')
# Files under train_tfds/ that share a basename with a validation file.
weight_tffiles = ['gs://gaslfr/TfrecordsV2/train_tfds/' + x.split('/')[-1] for x in val_tffiles]

# When True, the validation records are also trained on.
# NOTE(review): val_loader below then overlaps the training data, so its loss is not a held-out estimate.
trainall = True
if trainall:
  train_df = pd.concat([train_df, val_df]).reset_index(drop=True)
  train_tffiles = train_tffiles + val_tffiles
  weight_tffiles = weight_tffiles + val_tffiles

# Two independently shuffled (different seeds) augmented views of the same files ...
train_loader1 = tf.data.TFRecordDataset(train_tffiles).map(lambda x: decode_fn(x, augment=True), tf.data.AUTOTUNE).shuffle(buffer_size=len(train_df), seed=args.seed, reshuffle_each_iteration=True)
train_loader2 = tf.data.TFRecordDataset(train_tffiles).map(lambda x: decode_fn(x, augment=True), tf.data.AUTOTUNE).shuffle(buffer_size=len(train_df), seed=args.seed+1, reshuffle_each_iteration=True)

# ... zipped so that cat_augment can append a sample from the second view to one from the first.
train_loader = tf.data.Dataset.zip((train_loader1, train_loader2))
train_loader = train_loader.map(cat_augment)

# Extra single (non-concatenated) augmented samples appended to the training stream.
weight_loader = tf.data.TFRecordDataset(weight_tffiles).map(lambda x: decode_fn(x, augment=True), tf.data.AUTOTUNE)

train_loader = train_loader.concatenate(weight_loader)
train_loader = train_loader.shuffle(buffer_size=len(train_df) + len(train), seed=args.seed+2, reshuffle_each_iteration=True)
# Pad features with 0 and labels with decoder_pad_token to fixed shapes; incomplete batches are dropped.
train_loader = train_loader.padded_batch(args.batch_size,
                                         padding_values=(tf.constant(0, dtype=tf.float32), tf.constant(args.decoder_pad_token, dtype=tf.int64)),
                                         padded_shapes=([args.max_frame,2*args.input_dim],[args.max_label]),
                                         drop_remainder=True).prefetch(tf.data.AUTOTUNE)

# Validation pipeline: no augmentation (decode_fn default), same padding.
# NOTE(review): drop_remainder=True also discards the last partial validation batch.
val_loader = tf.data.TFRecordDataset(val_tffiles).map(decode_fn)
val_loader = val_loader.padded_batch(args.batch_size,
                                     padding_values=(tf.constant(0, dtype=tf.float32), tf.constant(args.decoder_pad_token, dtype=tf.int64)),
                                     padded_shapes=([args.max_frame,2*args.input_dim],[args.max_label]),
                                     drop_remainder=True).prefetch(tf.data.AUTOTUNE)

num_of_train :  67208
num_of_supplemental_metadata :  52957


In [None]:
# Number of TFRecord files in the training and validation lists.
len(train_tffiles), len(val_tffiles)

(189, 68)

## lossV2

In [None]:
# Copy the CTC helper module from Drive into the working directory so it can be imported.
from shutil import copyfile
copyfile(src = "/content/drive/MyDrive/Kaggle/Google - American Sign Language Fingerspelling Recognition/CTC_TPU.py", dst = "/content/CTC_TPU.py")
from CTC_TPU import classic_ctc_loss

@tf.function
def ctc_loss(y_true, y_pred):
    """CTC loss between padded integer labels and per-frame logits.

    Args:
        y_true: labels padded with `args.decoder_pad_token` to `args.max_label`.
        y_pred: logits with `args.max_frame` time steps and `args.vocab_size` classes.

    Returns:
        Scalar loss: the sum of per-sample CTC losses divided by `args.batch_size`.
        Non-finite per-sample losses are replaced by zero before averaging.
    """
    batch = tf.cast(tf.shape(y_true)[0], dtype="int64")

    # The logits have one step per (padded) input frame, so the time dimension
    # is max_frame, not max_label. All padded frames are counted as valid steps.
    input_length = args.max_frame * tf.ones(shape=(batch), dtype="int64")
    # Label length = number of non-pad tokens per row.
    label_length = tf.math.not_equal(y_true, args.decoder_pad_token)
    label_length = tf.reduce_sum(tf.cast(label_length, dtype="int64"), axis=1)

    # Explicit reshapes to fixed trailing dimensions.
    y_pred = tf.reshape(y_pred, (batch, args.max_frame, args.vocab_size))
    y_true = tf.reshape(y_true, (batch, args.max_label))
    input_length = tf.squeeze(tf.reshape(input_length, (batch, 1)), axis=1)
    label_length = tf.squeeze(tf.reshape(label_length, (batch, 1)), axis=1)

    # classic_ctc_loss comes from the local CTC_TPU module and is used here
    # in place of tf.nn.ctc_loss.
    loss = classic_ctc_loss(
            labels=tf.cast(y_true, tf.int32),
            logits=tf.cast(y_pred, tf.float32),
            label_length=tf.cast(label_length, tf.int32),
            logit_length=tf.cast(input_length, tf.int32),
            blank_index=args.blank_token,
        )
    # Zero out inf/nan per-sample losses so one sample cannot poison the batch.
    loss = tf.where(tf.math.is_finite(loss), loss, tf.zeros_like(loss))
    loss = tf.nn.compute_average_loss(loss, global_batch_size=args.batch_size)

    return loss

## model

In [None]:
# nomask

class ECA(tf.keras.layers.Layer):
    """Channel gating: rescales every channel by a sigmoid weight.

    The weights are computed from the time-averaged input by a 1-D convolution
    that runs across the channel axis.
    """
    def __init__(self, kernel_size=5, **kwargs):
        super().__init__(**kwargs)
        self.supports_masking = True
        self.kernel_size = kernel_size
        self.conv = tf.keras.layers.Conv1D(1, kernel_size=kernel_size, strides=1, padding="same", use_bias=False)

    def call(self, inputs, mask=None):
        # Average over time, honoring the mask if one is given.
        pooled = tf.keras.layers.GlobalAveragePooling1D()(inputs, mask=mask)
        # Treat the channel axis as the convolution's length axis.
        gate = self.conv(tf.expand_dims(pooled, -1))
        gate = tf.nn.sigmoid(tf.squeeze(gate, -1))
        # Broadcast the per-channel gate over the time axis.
        return inputs * gate[:, None, :]

class LateDropout(tf.keras.layers.Layer):
    """Dropout that is bypassed until `start_step` training calls have been counted."""
    def __init__(self, rate, noise_shape=None, start_step=0, **kwargs):
        super().__init__(**kwargs)
        self.rate = rate
        self.start_step = start_step
        self.supports_masking = True
        self.dropout = tf.keras.layers.Dropout(rate, noise_shape=noise_shape)

    def build(self, input_shape):
        super().build(input_shape)
        # Non-trainable counter of training calls.
        self._train_counter = tf.Variable(
            0,
            dtype="int64",
            aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA,
            trainable=False,
        )

    def call(self, inputs, training=False):
        def _passthrough():
            return inputs

        def _with_dropout():
            return self.dropout(inputs, training=training)

        # The counter is read before it is incremented for this call.
        before_start = self._train_counter < self.start_step
        outputs = tf.cond(before_start, _passthrough, _with_dropout)
        if training:
            self._train_counter.assign_add(1)
        return outputs

class CausalDWConv1D(tf.keras.layers.Layer):
    """Depthwise 1-D convolution wrapper.

    Despite the name, the convolution uses padding='same' and no causal
    padding is applied, so each output step sees both earlier and later steps.
    """
    def __init__(self,
        kernel_size=17,
        dilation_rate=1,
        use_bias=False,
        depthwise_initializer='glorot_uniform',
        name='', **kwargs):
        super().__init__(name=name,**kwargs)
        self.supports_masking = True
        self.dw_conv = tf.keras.layers.DepthwiseConv1D(
            kernel_size,
            strides=1,
            dilation_rate=dilation_rate,
            padding='same',
            use_bias=use_bias,
            depthwise_initializer=depthwise_initializer,
            name=name + '_dwconv',
        )

    def call(self, inputs):
        return self.dw_conv(inputs)

def Conv1DBlock(channel_size,
          kernel_size,
          dilation_rate=1,
          drop_rate=0.0,
          expand_ratio=2,
          se_ratio=0.25,
          activation='swish',
          name=None):
    '''
    Efficient conv1d block, @hoyso48.

    Returns a function that applies, in order: a Dense expansion to
    channels_in * expand_ratio, a depthwise convolution, batch normalization,
    ECA channel gating, a Dense projection to `channel_size`, optional dropout,
    and a residual add when the input already has `channel_size` channels.

    `se_ratio` is accepted but not used in this function body.
    '''
    if name is None:
        # Auto-generate a unique layer-name prefix.
        name = str(tf.keras.backend.get_uid("mbblock"))
    # Expansion phase
    def apply(inputs):
        channels_in = tf.keras.backend.int_shape(inputs)[-1]
        channels_expand = channels_in * expand_ratio

        skip = inputs

        x = tf.keras.layers.Dense(
            channels_expand,
            use_bias=True,
            activation=activation,
            name=name + '_expand_conv')(inputs)

        # Depthwise Convolution
        x = CausalDWConv1D(kernel_size,
            dilation_rate=dilation_rate,
            use_bias=False,
            name=name + '_dwconv')(x)

        x = tf.keras.layers.BatchNormalization(momentum=0.95, name=name + '_bn')(x)

        # Channel gating.
        x  = ECA()(x)

        # Projection back to the block's output width.
        x = tf.keras.layers.Dense(
            channel_size,
            use_bias=True,
            name=name + '_project_conv')(x)

        if drop_rate > 0:
            # noise_shape (None, 1, 1): one dropout decision per sample, shared across time and channels.
            x = tf.keras.layers.Dropout(drop_rate, noise_shape=(None,1,1), name=name + '_drop')(x)

        # The residual connection is only possible when the channel counts match.
        if (channels_in == channel_size):
            x = tf.keras.layers.add([x, skip], name=name + '_add')
        return x

    return apply
class MultiHeadSelfAttention(tf.keras.layers.Layer):
    """Multi-head self-attention with a fused QKV projection.

    The attention scores are scaled by dim ** -0.5 (the full model width) and
    the `mask` argument of `call` is accepted but not applied.
    """
    def __init__(self, dim=256, num_heads=4, dropout=0, **kwargs):
        super().__init__(**kwargs)
        self.supports_masking = True
        self.dim = dim
        self.num_heads = num_heads
        self.scale = self.dim ** -0.5
        self.qkv = tf.keras.layers.Dense(3 * dim, use_bias=False)
        self.drop1 = tf.keras.layers.Dropout(dropout)
        self.proj = tf.keras.layers.Dense(dim, use_bias=False)

    def call(self, inputs, mask=None):
        # Project once, then split into heads: (batch, heads, time, 3 * head_dim).
        fused = self.qkv(inputs)
        per_head = tf.keras.layers.Reshape((-1, self.num_heads, self.dim * 3 // self.num_heads))(fused)
        per_head = tf.keras.layers.Permute((2, 1, 3))(per_head)
        q, k, v = tf.split(per_head, [self.dim // self.num_heads] * 3, axis=-1)

        scores = tf.matmul(q, k, transpose_b=True) * self.scale

        # No mask is passed to the softmax: every position attends to every position.
        weights = tf.keras.layers.Softmax(axis=-1)(scores, mask=None)
        weights = self.drop1(weights)

        context = weights @ v
        # Merge the heads back: (batch, time, dim).
        context = tf.keras.layers.Permute((2, 1, 3))(context)
        merged = tf.keras.layers.Reshape((-1, self.dim))(context)
        return self.proj(merged)


def TransformerBlock(dim=256, num_heads=4, expand=4, attn_dropout=0.2, drop_rate=0.2, activation='swish'):
    """Return a function applying a pre-norm attention block followed by a feed-forward block.

    Both sub-blocks use BatchNormalization as the normalization, dropout with
    noise_shape (None, 1, 1), and a residual connection.

    Args:
        dim: model width; also the output width of the feed-forward block.
        num_heads: number of attention heads.
        expand: feed-forward hidden width multiplier (hidden = dim * expand).
        attn_dropout: dropout rate on the attention weights.
        drop_rate: dropout rate on each sub-block's output.
        activation: activation of the feed-forward hidden layer.
    """
    def apply(inputs):
        # Attention sub-block.
        x = inputs
        x = tf.keras.layers.BatchNormalization(momentum=0.95)(x)
        x = MultiHeadSelfAttention(dim=dim,num_heads=num_heads,dropout=attn_dropout)(x)
        x = tf.keras.layers.Dropout(drop_rate, noise_shape=(None,1,1))(x)
        x = tf.keras.layers.Add()([inputs, x])
        attn_out = x

        # Feed-forward sub-block.
        x = tf.keras.layers.BatchNormalization(momentum=0.95)(x)
        x = tf.keras.layers.Dense(dim*expand, use_bias=False, activation=activation)(x)
        x = tf.keras.layers.Dense(dim, use_bias=False)(x)
        x = tf.keras.layers.Dropout(drop_rate, noise_shape=(None,1,1))(x)
        x = tf.keras.layers.Add()([attn_out, x])
        return x
    return apply

In [None]:
# interCTC

# Use full float32 precision globally (no mixed precision).
policy = tf.keras.mixed_precision.Policy('float32')
tf.keras.mixed_precision.set_global_policy(policy)

def get_model(max_len=384,
              dropout_step=0,
              dim=160,
              CHANNELS=230,
              NUM_CLASSES=63,
              PAD=0,
              ksize=31,
              drop_rate=0.1,
              num_layers=16):
    """Build the two-output (intermediate + final) CTC model.

    The network is a Dense/BatchNorm stem followed by `num_layers` pairs of
    TransformerBlock + Conv1DBlock. One shared Dense classifier is applied to
    the output of the 8th-from-last pair and to the output of the last pair.

    Args:
        max_len: not used in this function body.
        dropout_step: `start_step` passed to both LateDropout layers.
        dim: model width.
        CHANNELS: the input has 2 * CHANNELS features per frame.
        NUM_CLASSES: number of output classes of the classifier.
        PAD: not used in this function body.
        ksize: kernel size of every Conv1DBlock.
        drop_rate: dropout rate of every Conv1DBlock.
        num_layers: number of Transformer+Conv pairs; must be at least 8
            because `xs[-8]` is read.

    Returns:
        A tf.keras.Model mapping the input to [intermediate_logits, final_logits].
    """

    inp = tf.keras.Input((None,2*CHANNELS))
    x = inp
    x = tf.keras.layers.Dense(dim, use_bias=False,name='stem_conv')(x)
    x = tf.keras.layers.BatchNormalization(momentum=0.95,name='stem_bn')(x)

    # Keep every pair's output so an intermediate one can feed the auxiliary head.
    xs = []
    for i in range(num_layers):
        x = TransformerBlock(dim,expand=2)(x)
        x = Conv1DBlock(dim,ksize,drop_rate=drop_rate)(x)
        xs.append(x)

    # One classifier layer shared by both heads.
    classifier = tf.keras.layers.Dense(NUM_CLASSES,name='classifier')

    # Intermediate head.
    x1 = LateDropout(0.2, start_step=dropout_step)(xs[-8])
    x1 = classifier(x1)

    # Final head.
    x2 = LateDropout(0.2, start_step=dropout_step)(xs[-1])
    x2 = classifier(x2)
    return tf.keras.Model(inp, [x1, x2])

# Build the model once outside the TPU strategy and call it on a symbolic
# input to display the two output shapes.
model = get_model()
input_shape = (args.max_label, 2*args.input_dim)
inputs = tf.keras.layers.Input(shape=input_shape, name='input')
model(inputs)

[<KerasTensor: shape=(None, 384, 63) dtype=float32 (created by layer 'model')>,
 <KerasTensor: shape=(None, 384, 63) dtype=float32 (created by layer 'model')>]

## run

In [None]:
tf.config.run_functions_eagerly(False)

# Output directory for the log file and the per-epoch weight files.
path = '/content/drive/MyDrive/Kaggle/Google - American Sign Language Fingerspelling Recognition/Model/keras/simple(0824)-deep-wide/'

policy = tf.keras.mixed_precision.Policy('float32')
tf.keras.mixed_precision.set_global_policy(policy)

seed_everything(args.seed)
print('number of train data : ', len(train_df))
print('number of val data : ', len(val_df))
print('seed : ', args.seed)

log_file = path + "./log.txt"

os.makedirs(path, exist_ok=True)

weights_name = "weights/epoch_{epoch:02d}-val_loss_{val_classifier_1_loss:.4f}.h5"

# save_best_only is left at its default, so weights are written after every
# epoch. The monitored quantity is a loss, so lower is better (mode='min').
checkpoint = ModelCheckpoint(path + weights_name,
                             monitor='val_classifier_1_loss',
                             verbose=1,
                             save_weights_only=True,
                             mode='min')


with strategy.scope():
  model = get_model()
  input_shape = (args.max_label, 2*args.input_dim)
  inputs = keras.layers.Input(shape=input_shape, name='input')
  model(inputs)

  optimizer = AdamW(learning_rate=args.lr, weight_decay=args.wd, global_clipnorm=args.max_norm)
  loss = ctc_loss
  inter_loss = ctc_loss

  # Compile exactly once. Each compile() call replaces the whole training
  # configuration, so the previous pair of calls (optimizer first, losses
  # second) silently discarded AdamW and trained with the default optimizer.
  # jit_compile is left unset: it was likewise discarded by the second call.
  model.compile(
      optimizer=optimizer,
      loss=[inter_loss, loss],   # [intermediate head, final head]
      loss_weights=[0.3, 0.7]
  )

# The schedule length must match the number of batches that fit() will run.
total_steps = ((len(train_df) + len(train)) // args.batch_size) * args.n_epochs
warmup_steps = int(args.warmup_ratio*total_steps)
print('total_steps: ', total_steps)
print('warmup_steps: ', warmup_steps)


callback = WarmupCosineDecay(total_steps=total_steps,
                             warmup_steps=warmup_steps,
                             hold=0,
                             start_lr=0.0,
                             target_lr=args.lr)

logger = TrainLoggerCallback(log_file, model, val_loader)

model.fit(
    train_loader ,
    validation_data = val_loader,
    epochs = args.n_epochs,
    verbose = 1,
    callbacks = [callback, logger, checkpoint]
  )

number of train data :  120165
number of val data :  13339
seed :  42
total_steps:  292600
warmup_steps:  29260
Epoch 1/200
   1463/Unknown - 478s 151ms/step - loss: 49.3608 - classifier_loss: 45.8915 - classifier_1_loss: 50.8475
Epoch 1: saving model to /content/drive/MyDrive/Kaggle/Google - American Sign Language Fingerspelling Recognition/Model/keras/simple(0824)-deep-wide/weights/epoch_01-val_loss_8.1849.h5
Epoch 2/200
Epoch 2: saving model to /content/drive/MyDrive/Kaggle/Google - American Sign Language Fingerspelling Recognition/Model/keras/simple(0824)-deep-wide/weights/epoch_02-val_loss_7.3855.h5
Epoch 3/200
Epoch 3: saving model to /content/drive/MyDrive/Kaggle/Google - American Sign Language Fingerspelling Recognition/Model/keras/simple(0824)-deep-wide/weights/epoch_03-val_loss_4.9714.h5
Epoch 4/200
Epoch 4: saving model to /content/drive/MyDrive/Kaggle/Google - American Sign Language Fingerspelling Recognition/Model/keras/simple(0824)-deep-wide/weights/epoch_04-val_loss_4.41

<keras.callbacks.History at 0x7f58ac06b8e0>

## evaluation

In [None]:
class CustomTokenizer(nn.Module):
  """Character-level tokenizer backed by `character_to_prediction_index.json`.

  In addition to the characters from the JSON file, the reverse mapping knows
  the bos/eos/pad tokens and the blank token. Only bos/eos/pad are treated as
  "special" and skipped by `decode`; the blank token is rendered as 'ϵ'.
  """
  def __init__(self, args):
    super(CustomTokenizer, self).__init__()
    self.args = args
    with open(args.path + 'Data Sources/character_to_prediction_index.json', 'r') as f:
      self.character_to_prediction_index = json.load(f)

    # Reverse mapping, extended with the non-character tokens.
    index_to_char = {index: char for char, index in self.character_to_prediction_index.items()}
    index_to_char.update({
        args.decoder_bos_token: '<s>',
        args.decoder_eos_token: '</s>',
        args.decoder_pad_token: '<pad>',
        args.blank_token: 'ϵ',
    })
    self.prediction_index_to_character = index_to_char
    self.special_tokens = [
        args.decoder_bos_token,
        args.decoder_eos_token,
        args.decoder_pad_token,
    ]


  def tokenize(self, x):
    """Split a string into a list of its characters."""
    return list(x)

  def encode(self, x):
    """Map each character of `x` to its prediction index."""
    return [self.character_to_prediction_index[char] for char in x]

  def decode(self, x, skip_speical_tokens=True):
    """Turn a sequence of indices (list or tensor) into a string."""
    token_ids = x.tolist() if torch.is_tensor(x) else x
    if skip_speical_tokens:
      token_ids = [token for token in token_ids if token not in self.special_tokens]
    return ''.join(self.prediction_index_to_character[token] for token in token_ids)

  def batch_decode(self, batch_x, skip_speical_tokens=True):
    """Decode every row of a batch (list of lists or 2-D tensor)."""
    rows = batch_x.tolist() if torch.is_tensor(batch_x) else batch_x
    return [self.decode(row, skip_speical_tokens=skip_speical_tokens) for row in rows]

# Shared tokenizer instance used by the evaluation loop below.
tokenizer = CustomTokenizer(args)

In [None]:
with strategy.scope():
  # Rebuild the architecture, build its variables with a symbolic input, then load the trained weights.
  # NOTE(review): the weights come from the 'simple(0818)-deep' run; confirm they match get_model()'s defaults.
  model = get_model()
  inputs = keras.layers.Input(shape = (args.max_label, args.input_dim*2))
  model(inputs)
  model.load_weights('/content/drive/MyDrive/Kaggle/Google - American Sign Language Fingerspelling Recognition/Model/keras/simple(0818)-deep/weights/epoch_185-val_loss_0.4702.h5')

  # Compile once with the CTC loss only. The earlier optimizer-only compile
  # was overwritten by the second call, and evaluation needs no optimizer.
  # Using ctc_loss directly also avoids relying on variables left over from the
  # training cell. A single loss is applied to both model outputs.
  model.compile(
    loss=ctc_loss,
  )

In [None]:
with strategy.scope():
  # Report the compiled loss on the validation batches.
  model.evaluate(val_loader)

In [None]:
def decode_phrase(pred):
    """Greedy CTC decoding of one sample.

    Takes the arg-max class per frame, collapses each run of identical
    consecutive predictions to a single token, and drops every token that is
    not a character (index >= 59, i.e. bos/eos/pad/blank).

    Args:
        pred: 2-D tensor of per-frame class scores, frames on axis 0.

    Returns:
        1-D int64 tensor with the decoded character indices.
    """
    x = tf.argmax(pred, axis=1)
    # A frame ends a run when the next frame differs. The final frame always
    # ends a run; the previous version never emitted it, so a phrase whose last
    # character ran up to the final frame lost that character.
    ends_run = tf.concat(
        [tf.not_equal(x[:-1], x[1:]), tf.ones_like(x[-1:], dtype=tf.bool)],
        axis=0)
    x = tf.boolean_mask(x, ends_run, axis=0)
    # Keep character tokens only.
    mask = x < 59
    x = tf.boolean_mask(x, mask, axis=0)
    return x

# Rebuild the model outside the TPU strategy and load the trained weights for eager, per-sample decoding.
model = get_model()
inputs = keras.layers.Input(shape = (args.max_label, args.input_dim*2))
model(inputs)
model.load_weights('/content/drive/MyDrive/Kaggle/Google - American Sign Language Fingerspelling Recognition/Model/keras/simple(0818)-deep/weights/epoch_185-val_loss_0.4702.h5')

preds, trues = [], []
for batch in tqdm(val_loader):
    X, y = batch
    # Only the final head's output is decoded; the intermediate head is ignored.
    _, batch_pred = model(X)
    y = y.numpy()
    for i in range(X.shape[0]):
        pred = decode_phrase(batch_pred[i]).numpy()
        pred = tokenizer.decode(pred)
        # Pad tokens in the label are special tokens and are skipped by decode().
        true = tokenizer.decode(y[i])
        preds.append(pred)
        trues.append(true)

# Mean per-sample normalized Levenshtein score over the decoded validation batches.
levenshtein_score(preds, trues)