# APPROACH:2 IDEA:2 

In [None]:
#------------------------------------
# globals
#------------------------------------
# Number of epochs requested from model.fit (early stopping may end the run sooner).
EPOCHS           = 100
# Target vocabulary to train against; allowed values are "unicode" and "grapheme".
VOCAB            = "unicode"# @["unicode","grapheme"]
class ModelConfig:
    """Static hyper-parameters shared by the data pipeline and the model.

    The vocabulary sizes (``inp_voclen`` / ``tgt_voclen``) are not declared
    here; they are attached to the class at runtime once the vocabularies
    have been loaded.
    """
    num_layers       = 4               # number of encoder / decoder layers
    num_heads        = 8               # number of attention heads in MHA
    d_model          = 128             # embedding dimension
    dff              = 512             # feed-forward network dimension
    rate             = 0.1             # dropout rate
    pe_max           = 50              # maximum position covered by the positional encoding
    d_len            = 30              # data (sequence) length
    pad_value        = 0               # token id used for padding
    start_value      = 1               # token id marking sequence start
    end_value        = 2               # token id marking sequence end
    # Derived from d_len so the Keras input shapes can never drift from the
    # record length used when parsing the data.
    inp_shape        = (d_len,)
    tar_shape        = (d_len,)
#-------------------------------
# imports
#-------------------------------
import os 
import numpy as np
import json
import tensorflow as tf
import random
import pandas as pd
from tqdm.auto import tqdm
from kaggle_datasets import KaggleDatasets
from glob import glob
tqdm.pandas()
    
config_json="../input/dakshina-bnt-to-bn/config.json"
# Decode explicitly as UTF-8 (the JSON interchange encoding) instead of the
# locale default, so non-ASCII vocabulary entries load the same everywhere.
with open(config_json, encoding="utf-8") as f:
    config = json.load(f)
inp_vocab=config["source_vocab"]
# VOCAB[0] is the first letter of the vocabulary name, so the key read here is
# "u_target_vocab" or "g_target_vocab".
tgt_vocab=config[f"{VOCAB[0]}_target_vocab"]
#-------------------------------
# config update
#-------------------------------
# Vocabulary sizes are only known after loading, so they are attached here.
ModelConfig.inp_voclen=len(inp_vocab)
ModelConfig.tgt_voclen=len(tgt_vocab)

#--------------
# GCS
#--------------
def get_tfrecs(_path):
    '''
        lists the tfrecord shards of a directory in random order
        args:
            _path : directory (local or GCS) that holds '*.tfrecord' files
        returns:
            shuffled list of matching file paths
    '''
    shards = tf.io.gfile.glob(os.path.join(_path, '*.tfrecord'))
    random.shuffle(shards)
    return shards
    

# Bucket location of the dataset's tfrecords and its train / eval sub-folders.
GCS_PATH   = f"{KaggleDatasets().get_gcs_path('dakshina-bnt-to-bn')}/tfrecords/"
gp_train   = f"{GCS_PATH}train/"
gp_eval    = f"{GCS_PATH}eval/"
train_recs = get_tfrecs(gp_train)
eval_recs  = get_tfrecs(gp_eval)
# numbers
# One shard is left out of each count and 20480 examples are assumed per
# remaining shard -- TODO confirm against the code that wrote the tfrecords.
nb_train   = (len(train_recs) - 1) * 20480
nb_eval    = (len(eval_recs) - 1) * 20480
print("Train Data:", nb_train, len(train_recs))
print("Eval Data:", nb_eval, len(eval_recs))

#----------------------------------------------------------
# Detect hardware, return appropriate distribution strategy
#----------------------------------------------------------
# TPU detection. No parameters necessary if TPU_NAME environment variable is set. On Kaggle this is always the case.
# Try to resolve a TPU cluster; the resolver raises ValueError when none is
# available, in which case `tpu` is set to None and the fallback below is used.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    # Connect to the cluster and initialise the TPU system before creating the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # NOTE(review): newer TensorFlow releases expose this class as
    # tf.distribute.TPUStrategy -- confirm before upgrading TensorFlow.
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    strategy = tf.distribute.get_strategy() 
    # default distribution strategy in Tensorflow. Works on CPU and single GPU.

# The replica count is used further down to size the global batch.
print("REPLICAS: ", strategy.num_replicas_in_sync)
#-------------------------------------
# batching , strategy and steps
#-------------------------------------
# Single replica: fixed batch of 32. Otherwise: 16 examples per replica.
BATCH_SIZE = 32 if strategy.num_replicas_in_sync == 1 else 16 * strategy.num_replicas_in_sync

# Whole batches per pass over the (estimated) train / eval example counts.
STEPS_PER_EPOCH = nb_train // BATCH_SIZE
EVAL_STEPS      = nb_eval // BATCH_SIZE
print("Steps:", STEPS_PER_EPOCH)
print("Eval Steps:", EVAL_STEPS)
print("Batch Size:", BATCH_SIZE)
#------------------------------
# parsing tfrecords basic
#------------------------------
def create_padding_mask(seq):
    '''
        marks padded positions (token id 0) of a batch of sequences
        args:
            seq : encoded sequences, indexed as (batch, seq_len)
        returns:
            float32 mask, 1.0 where seq == 0, with two singleton axes inserted
            after the batch axis so it broadcasts over attention logits
    '''
    is_pad   = tf.math.equal(seq, 0)
    pad_mask = tf.cast(is_pad, tf.float32)
    return pad_mask[:, tf.newaxis, tf.newaxis, :]

def create_look_ahead_mask(size):
    '''
        builds a (size, size) mask that is 1 strictly above the diagonal and 0 elsewhere,
        i.e. position i may not attend to any position j > i
    '''
    # band_part(..., -1, 0) keeps the lower triangle including the diagonal
    visible = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return 1 - visible
    

def create_masks(inp,tar):
    '''
        builds the two attention masks needed by the transformer
        args:
            inp : encoded source batch
            tar : encoded target batch
        returns:
            mask     : padding mask of the source
                      * meant for attention over the encoder output
            comb_mask: element-wise maximum of the target padding mask and the look-ahead mask
                      * meant for the decoder self-attention, hiding pads and future tokens
    '''
    enc_pad_mask = create_padding_mask(inp)
    tar_pad_mask = create_padding_mask(tar)
    # the look-ahead mask is square over the target length
    causal_mask  = create_look_ahead_mask(tf.shape(tar)[1])
    return enc_pad_mask, tf.maximum(tar_pad_mask, causal_mask)

    
    
    

def data_input_fn(recs): 
    '''
      Builds an endlessly repeating, shuffled and batched dataset from tfrecord files.
      args:
        recs : list of tfrecord file paths
      returns:
        tf.data.Dataset yielding (inp, tar) int64 batches with BATCH_SIZE rows
        of ModelConfig.d_len tokens each
    '''
    # The target column depends only on the global VOCAB, so pick it once
    # instead of inside the per-example parser.
    tgt = "gtgt" if VOCAB=="grapheme" else "utgt"
    feature = {
        'src'  : tf.io.FixedLenFeature([ModelConfig.d_len],tf.int64),
        'gtgt' : tf.io.FixedLenFeature([ModelConfig.d_len],tf.int64),
        'utgt' : tf.io.FixedLenFeature([ModelConfig.d_len],tf.int64),
    }

    def _parser(example):
        parsed_example = tf.io.parse_single_example(example,feature)
        # FixedLenFeature(..., tf.int64) already yields int64 tensors, so no cast is needed
        return parsed_example['src'], parsed_example[tgt]

    dataset = tf.data.TFRecordDataset(recs)
    # Parse several records concurrently; element order is still deterministic by default.
    dataset = dataset.map(_parser, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.shuffle(2048,reshuffle_each_iteration=True)
    dataset = dataset.repeat()
    # drop_remainder keeps every batch at exactly BATCH_SIZE rows
    dataset = dataset.batch(BATCH_SIZE,drop_remainder=True)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    return dataset


# Build the training and evaluation pipelines from their tfrecord lists.
train_ds  =   data_input_fn(train_recs)
eval_ds   =   data_input_fn(eval_recs)
# Sanity check: pull one batch and print its first example and the batch shapes.
for x,y in train_ds.take(1):
    print("---------------------------------------------------------------")
    print("source:",x[0])
    print("---------------------------------------------------------------")
    print("target:",y[0])
    print("---------------------------------------------------------------")
    print('Source Batch Shape:',x.shape)
    print('Target Batch Shape:',y.shape)
    
    

# Modeling
Base Layers:
* Multihead Attention
* Positional Encoding

Composite Layers
* Encoder Base Layer
* Decoder Base Layer

Base Blocks:
* Encoder
* Decoder

### Base Layers

In [None]:
class Masking(tf.keras.layers.Layer):
    '''
    layer that turns the encoded source / target batches into attention masks
    args:
      pad_value :   token id treated as padding
      size      :   side length of the (square) look-ahead mask
    '''
    def __init__(self,pad_value,size,**kwargs,):
        super().__init__(**kwargs)
        self.pad_value = pad_value
        self.size      = size

    def create_padding_mask(self,seq):
        '''
            float32 mask that is 1.0 wherever seq equals self.pad_value,
            with two singleton axes inserted after the batch axis
        '''
        is_pad = tf.math.equal(seq, self.pad_value)
        return tf.cast(is_pad, tf.float32)[:, tf.newaxis, tf.newaxis, :]

    def create_look_ahead_mask(self,size):
        '''
            (size, size) mask: 1 strictly above the diagonal, 0 elsewhere
        '''
        visible = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
        return 1 - visible

    def call(self,inp,tar):
        '''
            args:
                inp : source encoded
                tar : target encoded
            returns:
                mask     : padding mask of the source
                          * meant for attention over the encoder output
                comb_mask: maximum of the target padding mask and the look-ahead mask
                          * meant for the decoder self-attention (pads + future tokens)
        '''
        enc_pad_mask = self.create_padding_mask(inp)
        tar_pad_mask = self.create_padding_mask(tar)
        # the look-ahead mask uses the configured size, not the runtime target length
        causal_mask  = self.create_look_ahead_mask(self.size)
        return enc_pad_mask, tf.maximum(tar_pad_mask, causal_mask)

    def get_config(self):
        # base config extended with this layer's constructor arguments
        return {**super().get_config(),
                'pad_value'   : self.pad_value,
                'size'        : self.size}


In [None]:
import numpy as np
class PositionalEncoding(tf.keras.layers.Layer):
    '''
    adds a fixed sinusoidal positional encoding to an embedded sequence
    args:
      position  :   number of positions the encoding table covers
      d_model   :   embedding dimension
      use_scale :   when truthy, the input is multiplied by sqrt(d_model) before the encoding is added
    '''
    def __init__(self,position,d_model,use_scale,**kwargs,):
        super().__init__(**kwargs)
        self.use_scale = use_scale
        self.position  = position
        self.d_model   = d_model

    def call(self,x):
        # table of shape (1, position, d_model)
        table = self.positional_encoding()
        steps = tf.shape(x)[1]
        if self.use_scale:
            x = x * tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        # only the first `steps` rows of the table are used
        return x + table[:, :steps, :]

    def get_angles(self,pos, i):
        # consecutive (even, odd) dimensions share one frequency
        exponent = (2 * (i//2)) / np.float32(self.d_model)
        return pos * (1 / np.power(10000, exponent))

    def positional_encoding(self):
        positions = np.arange(self.position)[:, np.newaxis]
        dims      = np.arange(self.d_model)[np.newaxis, :]
        angles    = self.get_angles(positions, dims)
        # sine on even dimensions, cosine on odd dimensions
        angles[:, 0::2] = np.sin(angles[:, 0::2])
        angles[:, 1::2] = np.cos(angles[:, 1::2])
        return tf.cast(angles[np.newaxis, ...], dtype=tf.float32)

    def get_config(self):
        # base config extended with this layer's constructor arguments
        return {**super().get_config(),
                'position'   : self.position,
                'd_model'    : self.d_model,
                'use_scale'  : self.use_scale}

    
class MultiHeadAttention(tf.keras.layers.Layer):
    '''
    multi-head scaled dot-product attention
    args:
      d_model   :   width of the query/key/value/output projections
      num_heads :   number of heads; must divide d_model
    '''
    def __init__(self, d_model, num_heads,**kwargs,):
        super().__init__(**kwargs)
        assert d_model % num_heads == 0,"Model Dimension Must be divideable by number of head provided"
        # attrs
        self.num_heads   = num_heads
        self.d_model     = d_model
        self.depth       = d_model // num_heads
        # ops: query, key, value projections and the output projection
        self.wq    = tf.keras.layers.Dense(d_model)
        self.wk    = tf.keras.layers.Dense(d_model)
        self.wv    = tf.keras.layers.Dense(d_model)
        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        '''
            (batch_size, seq_len, d_model) -> (batch_size, num_heads, seq_len, depth)
        '''
        per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        '''
            args:
                v, k, q : value, key and query tensors, (batch_size, seq_len, d_model)
                mask    : mask broadcastable to the attention logits, or None
            returns:
                output            : (batch_size, seq_len_q, d_model)
                attention_weights : (batch_size, num_heads, seq_len_q, seq_len_k)
        '''
        n_batch = tf.shape(q)[0]

        # project each input, then fan it out into heads
        q_heads = self.split_heads(self.wq(q), n_batch)
        k_heads = self.split_heads(self.wk(k), n_batch)
        v_heads = self.split_heads(self.wv(v), n_batch)

        attended, attention_weights = self.scaled_dot_product_attention(q_heads, k_heads, v_heads, mask)

        # bring heads next to depth again and merge them back into d_model
        attended = tf.transpose(attended, perm=[0, 2, 1, 3])
        merged   = tf.reshape(attended, (n_batch, -1, self.d_model))
        return self.dense(merged), attention_weights

    def scaled_dot_product_attention(self,q, k, v, mask):
        '''
            softmax(q k^T / sqrt(depth) + mask * -1e9) v

            args:
                q   : query, (..., seq_len_q, depth)
                k   : key,   (..., seq_len_k, depth)
                v   : value, (..., seq_len_v, depth_v) with seq_len_v == seq_len_k
                mask: float tensor broadcastable to (..., seq_len_q, seq_len_k), or None
            returns:
                output, attention_weights
        '''
        depth  = tf.cast(tf.shape(k)[-1], tf.float32)
        logits = tf.matmul(q, k, transpose_b=True) / tf.math.sqrt(depth)
        if mask is not None:
            # masked positions get a large negative logit so softmax sends them to ~0
            logits = logits + (mask * -1e9)
        # normalised over the key axis
        attention_weights = tf.nn.softmax(logits, axis=-1)
        return tf.matmul(attention_weights, v), attention_weights

    def get_config(self):
        # base config extended with this layer's constructor arguments
        return {**super().get_config(),
                'd_model'  : self.d_model,
                'num_heads': self.num_heads}


### Composite Layers

In [None]:
def point_wise_feed_forward_network(d_model, dff):
    '''
        two stacked Dense layers: a ReLU layer of width dff followed by a linear layer of width d_model
    '''
    expand  = tf.keras.layers.Dense(dff, activation='relu')
    project = tf.keras.layers.Dense(d_model)
    return tf.keras.Sequential([expand, project])

class EncoderBaseLayer(tf.keras.layers.Layer):
    '''
    one encoder layer: self-attention and a feed-forward network, each followed by
    dropout, a residual connection and layer normalisation
    args:
      d_model   :   model / embedding dimension
      num_heads :   number of attention heads
      dff       :   hidden width of the feed-forward network
      rate      :   dropout rate
    '''
    def __init__(self, d_model, num_heads, dff, rate=0.1,**kwargs,):
        super(EncoderBaseLayer, self).__init__(**kwargs)
        # attrs
        self.num_heads = num_heads
        self.d_model   = d_model
        self.dff       = dff
        self.rate      = rate
        # ops
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)
        
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        
    def call(self, x,mask,training=None):
        '''
            args:
                x        : input sequence, (batch_size, input_seq_len, d_model)
                mask     : attention mask passed to the self-attention
                training : dropout switch; None (the default) lets Keras resolve it from
                           the surrounding call instead of forcing dropout on
            returns:
                tensor of shape (batch_size, input_seq_len, d_model)
        '''
        # A default of True would keep dropout active whenever the layer is run
        # without an explicit or inherited training flag, e.g. at inference.
        attn_output, _ = self.mha(x, x, x, mask)  
        attn_output    = self.dropout1(attn_output, training=training)
        out1           = self.layernorm1(x + attn_output)  
        
        ffn_output     = self.ffn(out1)  
        ffn_output     = self.dropout2(ffn_output, training=training)
        out2           = self.layernorm2(out1 + ffn_output) 
        return out2
    
    def get_config(self):
        # base config extended with this layer's constructor arguments
        config = super().get_config().copy()
        config.update({
            'd_model'  : self.d_model,
            'num_heads': self.num_heads,
            'dff'      : self.dff,
            'rate'     : self.rate
        })
        return config


class DecoderBaseLayer(tf.keras.layers.Layer):
    '''
    one decoder layer: masked self-attention, attention over the encoder output and a
    feed-forward network, each followed by dropout, a residual connection and layer normalisation
    args:
      d_model   :   model / embedding dimension
      num_heads :   number of attention heads
      dff       :   hidden width of the feed-forward network
      rate      :   dropout rate
    '''
    def __init__(self, d_model, num_heads, dff, rate=0.1,**kwargs,):
        super(DecoderBaseLayer, self).__init__(**kwargs,)
        # attrs
        self.num_heads = num_heads
        self.d_model   = d_model
        self.dff       = dff
        self.rate      = rate
        # ops
        self.mha1 = MultiHeadAttention(self.d_model, self.num_heads)
        self.mha2 = MultiHeadAttention(self.d_model, self.num_heads)
        
        self.ffn  = point_wise_feed_forward_network(self.d_model, self.dff)
        
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        
        self.dropout1 = tf.keras.layers.Dropout(self.rate)
        self.dropout2 = tf.keras.layers.Dropout(self.rate)
        self.dropout3 = tf.keras.layers.Dropout(self.rate)

    def call(self, x, enc,comb_mask,mask,training=None):
        '''
            args:
                x         : decoder input, (batch_size, target_seq_len, d_model)
                enc       : encoder output, (batch_size, input_seq_len, d_model)
                comb_mask : mask for the self-attention (padding + look-ahead)
                mask      : mask for the attention over the encoder output
                training  : dropout switch; None (the default) lets Keras resolve it
                            from the surrounding call, which is what happened before
                            this argument existed
            returns:
                out3, attn_weights_block1, attn_weights_block2
                out3 has shape (batch_size, target_seq_len, d_model)
        '''
        # block 1: self-attention over the target
        attn1, attn_weights_block1 = self.mha1(x, x, x, comb_mask)  
        attn1                      = self.dropout1(attn1, training=training)
        out1                       = self.layernorm1(attn1 + x)

        # block 2: queries come from the decoder, keys/values from the encoder output
        attn2, attn_weights_block2 = self.mha2(enc, enc, out1,mask)  
        attn2                      = self.dropout2(attn2, training=training)
        out2                       = self.layernorm2(attn2 + out1)  

        # block 3: position-wise feed-forward
        ffn_output                 = self.ffn(out2)  
        ffn_output                 = self.dropout3(ffn_output, training=training)
        out3                       = self.layernorm3(ffn_output + out2)  

        return out3, attn_weights_block1, attn_weights_block2
    
    def get_config(self):
        # base config extended with this layer's constructor arguments
        config = super().get_config().copy()
        config.update({
            'd_model'  : self.d_model,
            'num_heads': self.num_heads,
            'dff'      : self.dff,
            'rate'     : self.rate
        })
        return config


### Base Blocks

In [None]:
def Encoder(inp,mask,cfg):
    '''
        encoder stack: embedding -> scaled positional encoding -> cfg.num_layers encoder layers
        args:
            inp  : source token ids
            mask : source padding mask
            cfg  : configuration object (inp_voclen, d_model, pe_max, num_layers, num_heads, dff, rate)
        returns:
            output of the last encoder layer
    '''
    embed = tf.keras.layers.Embedding(cfg.inp_voclen,cfg.d_model,name="EncoderInputEncoding")
    x     = embed(inp)
    x     = PositionalEncoding(cfg.pe_max,cfg.d_model,True,name="EncoderPositionalEncoding")(x)
    for idx in range(cfg.num_layers):
        layer = EncoderBaseLayer(cfg.d_model, cfg.num_heads, cfg.dff, cfg.rate,name=f"EncoderLayer_{idx}")
        x     = layer(x,mask)
    return x

def Decoder(tar,enc,mask,comb_mask,cfg):
    '''
        decoder stack: embedding -> scaled positional encoding -> dropout ->
        cfg.num_layers decoder layers -> Dense projection to the target vocabulary
        args:
            tar       : target token ids
            enc       : encoder output
            mask      : source padding mask (attention over the encoder output)
            comb_mask : combined padding / look-ahead mask (self-attention)
            cfg       : configuration object
        returns:
            logits : output of the final Dense layer (cfg.tgt_voclen units)
            w_attn : dict of attention weights keyed 'decoder_layer{n}_block1' / '..._block2'
    '''
    embed = tf.keras.layers.Embedding(cfg.tgt_voclen,cfg.d_model,name="DecoderInputEncoding")
    x     = embed(tar)
    x     = PositionalEncoding(cfg.pe_max,cfg.d_model,True,name="DecoderPositionalEncoding")(x)
    x     = tf.keras.layers.Dropout(cfg.rate)(x)
    w_attn = {}
    for idx in range(cfg.num_layers):
        block = DecoderBaseLayer(cfg.d_model, cfg.num_heads, cfg.dff, cfg.rate,name=f"DecoderLayer_{idx}")
        x, self_attn, cross_attn = block(x,enc,comb_mask,mask)
        w_attn.update({
            f'decoder_layer{idx+1}_block1': self_attn,
            f'decoder_layer{idx+1}_block2': cross_attn,
        })
    logits = tf.keras.layers.Dense(cfg.tgt_voclen,name="logits")(x)
    return logits, w_attn


def net(cfg):
    '''
        assembles the functional transformer model
        args:
            cfg : configuration object (see ModelConfig)
        returns:
            tf.keras.Model named "TransformerBaseNet" with inputs [input, target]
            and outputs [logits, attention-weights dict]
    '''
    inp = tf.keras.layers.Input(shape=cfg.inp_shape,name="input")
    tar = tf.keras.layers.Input(shape=cfg.tar_shape,name="target")
    # masks are derived inside the model from the raw token ids
    masking         = Masking(pad_value=cfg.pad_value,size=cfg.d_len,name="masks")
    mask, comb_mask = masking(inp,tar)
    enc             = Encoder(inp,mask,cfg)
    logits, w_attn  = Decoder(tar,enc,mask,comb_mask,cfg)
    return tf.keras.Model(inputs=[inp,tar],outputs=[logits,w_attn],name="TransformerBaseNet")

# Build one instance of the network and print its layer summary.
# (`transformer` is not referenced again in the visible code.)
transformer=net(ModelConfig)
transformer.summary()

# Model Params and callbacks

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline 
d_model=ModelConfig.d_model
with strategy.scope():
    class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
        '''
            learning-rate schedule with linear warm-up followed by inverse-square-root decay:
                lr(step) = d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5)
            args:
                d_model      : model dimension used to scale the rate
                warmup_steps : step at which the schedule switches from warm-up to decay
        '''
        def __init__(self, d_model, warmup_steps=4000):
            super(CustomSchedule, self).__init__()
            # raw value is kept for get_config; the tensor form is used in the maths
            self._d_model_value = d_model
            self.d_model        = tf.cast(d_model, tf.float32)

            self.warmup_steps = warmup_steps

        def __call__(self, step):
            # the optimizer may hand over an integer step; rsqrt needs a float
            step = tf.cast(step, tf.float32)
            arg1 = tf.math.rsqrt(step)
            arg2 = step * (self.warmup_steps ** -1.5)

            return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

        def get_config(self):
            # lets the schedule (and an optimizer using it) be serialised
            return {"d_model"     : self._d_model_value,
                    "warmup_steps": self.warmup_steps}
    
    learning_rate = CustomSchedule(d_model)

    optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,epsilon=1e-9)

    temp_learning_rate_schedule = CustomSchedule(d_model)

    # per-token loss; masking and averaging are done in loss_function
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
    def loss_function(real, pred):
        '''
            mean cross-entropy over the non-padding (non-zero) positions of `real`
            args:
                real : target token ids
                pred : logits
            returns:
                scalar loss; 0 when every position is padding
        '''
        mask = tf.math.logical_not(tf.math.equal(real, 0))
        loss_ = loss_object(real, pred)

        mask = tf.cast(mask, dtype=loss_.dtype)
        loss_ *= mask

        # divide_no_nan avoids a NaN loss on a batch that contains only padding
        return tf.math.divide_no_nan(tf.reduce_sum(loss_), tf.reduce_sum(mask))


    def accuracy_function(real, pred):
        '''
            fraction of non-padding positions of `real` whose argmax prediction is correct
            args:
                real : target token ids
                pred : logits, class axis last (axis 2)
            returns:
                scalar accuracy; 0 when every position is padding
        '''
        accuracies = tf.equal(real, tf.argmax(pred, axis=2))

        mask = tf.math.logical_not(tf.math.equal(real, 0))
        accuracies = tf.math.logical_and(mask, accuracies)

        accuracies = tf.cast(accuracies, dtype=tf.float32)
        mask = tf.cast(mask, dtype=tf.float32)
        return tf.math.divide_no_nan(tf.reduce_sum(accuracies), tf.reduce_sum(mask))


In [None]:
class Transformer(tf.keras.Model):
    '''
    training wrapper around the functional network returned by net(cfg)
    * train_step : teacher forcing, the full target is fed to the decoder
    * test_step  : greedy step-by-step decoding starting from cfg.start_value
    args:
      cfg : configuration object (see ModelConfig)
    '''
    def __init__(self,cfg):
        super(Transformer, self).__init__()
        self.net    = net(cfg)
        self.cfg    = cfg
        
    def compile(self,optimizer,loss_fn,acc):
        '''
            args:
                optimizer : optimizer applied to self.net's trainable variables
                loss_fn   : callable (real, pred) -> scalar loss
                acc       : callable (real, pred) -> scalar accuracy
        '''
        super(Transformer, self).compile()
        self.optimizer = optimizer
        self.loss_fn   = loss_fn
        self.acc       = acc
        
    def train_step(self, batch_data):
        inp,tar=batch_data
        with tf.GradientTape() as tape:
            pred,_= self.net({"input":inp,"target":tar},training=True)
            # logits at position i are scored against target token i+1
            loss  = self.loss_fn(tar[:, 1:],pred[:,:-1,:])

        gradients = tape.gradient(loss, self.net.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.net.trainable_variables))
        return {"loss": loss,
                "acc" : self.acc(tar[:, 1:],pred[:,:-1,:])}
    
    def test_step(self, batch_data):
        inp,tar=batch_data
        # decoder input: every position starts as the start token and is
        # overwritten by the model's own predictions, one position per step
        label=tf.ones_like(tar,dtype=tf.int64)*self.cfg.start_value
        preds=[]
        # Logits at position i predict token i+1, so only positions
        # 0 .. d_len-2 are ever scored; the final position is not decoded.
        for i in range(self.cfg.d_len - 1):
            pred,_ = self.net({"input":inp,"target":label},training=False)
            pred   = pred[:,i, :]
            preds.append(pred)
            # argmax of the logits equals argmax of their softmax
            max_idx=tf.math.argmax(pred,axis=-1)

            # write the greedy choice into position i+1 of the decoder input
            label     = tf.unstack(label,axis=-1)
            label[i+1]= tf.cast(max_idx,tf.int64)
            label     = tf.stack(label,axis=-1)
                
        pred=tf.stack(preds,axis=1)
        loss = self.loss_fn(tar[:, 1:],pred)

        return {"loss": loss,
                "acc" : self.acc(tar[:, 1:],pred)}

In [None]:
# Create and compile the trainable wrapper inside the distribution strategy scope.
with strategy.scope():
    model =  Transformer(ModelConfig)
    # Transformer.compile takes the optimizer plus the custom loss / accuracy callables.
    model.compile(optimizer = optimizer,
                  loss_fn   = loss_function,
                  acc       = accuracy_function)

In [None]:
# early stopping
# No `monitor` is passed, so the Keras default metric is watched
# (val_loss per the Keras docs -- confirm for the installed version).
early_stopping = tf.keras.callbacks.EarlyStopping(patience=30, 
                                                  verbose=1, 
                                                  mode = 'auto') 

class SaveBestModel(tf.keras.callbacks.Callback):
    '''
    saves the weights of model.net whenever the validation loss reaches a new minimum
    * weights are written to f"model_{VOCAB}.h5", overwriting the previous best
    '''
    def __init__(self):
        # initialise the base callback so its own attributes exist
        super().__init__()
        self.best = float('inf')

    def on_epoch_end(self, epoch, logs=None):
        # logs may be None, or may lack val_loss when no validation was run
        metric_value = (logs or {}).get('val_loss')
        if metric_value is None:
            print(f"val_loss not available at epoch:{epoch}, skipping checkpoint")
            return
        if metric_value < self.best:
            print(f"Loss Improved epoch:{epoch} from {self.best} to {metric_value}")
            self.best = metric_value
            self.model.net.save_weights(f"model_{VOCAB}.h5")
            print("Saved Best Weights")

    def set_model(self, model):
        # delegate to the base class so the model is stored the way it expects
        super().set_model(model)
            
# Instantiate the checkpoint callback and attach the model to it explicitly.
model_save=SaveBestModel()
model_save.set_model(model)
# Callbacks handed to model.fit: best-weights checkpointing, then early stopping.
callbacks= [model_save,early_stopping]

In [None]:
# Train for up to EPOCHS epochs. Both datasets repeat endlessly, so the number
# of steps per epoch and per validation run is given explicitly.
history=model.fit(train_ds,
                  epochs=EPOCHS,
                  steps_per_epoch=STEPS_PER_EPOCH,
                  verbose=1,
                  validation_data=eval_ds,
                  validation_steps=EVAL_STEPS,
                  callbacks=callbacks)

In [None]:
import pandas as pd
# One column per logged metric, one row per epoch; written next to the weights file.
curves = pd.DataFrame({metric: values for metric, values in history.history.items()})
curves.to_csv(f"history_{VOCAB}.csv", index=False)