In [None]:
# -*- coding: utf-8 -*-
# Input locations (relative paths into the notebook's input directory).
# JSON file holding the source vocabulary and both target vocabularies.
vocab_json_path  ="../input/dakshina-bnt-to-bn/config.json"
# Directory holding the model_<vocab_type>.h5 weight files; the trailing slash is
# required because ModelConfig appends the file name by plain string formatting.
model_weights_dir="../input/dakshina-inspection/"
# TrueType font loaded below into `prop` and used for the target tick labels.
font_path        ="../input/dakshina-inspection/Bangla.ttf"
#----------------------------------------
# imports
#----------------------------------------
import pandas as pd
import string
import os 
import numpy as np
import json
import tensorflow as tf
import random
from tqdm.auto import tqdm
from scipy.special import softmax
import math
import matplotlib.pyplot as plt 
import matplotlib.font_manager as fm
# hook tqdm into pandas
tqdm.pandas()
# font used by plot_attn_head for the y-axis (target) tick labels
prop = fm.FontProperties(fname=font_path,size=20)
# default font size for every other piece of text in the figures
plt.rcParams.update({'font.size': 20})
# --------------------------------
# plot utils
#--------------------------------

def plot_attn_head(inp,tgt,attn,max_idx):
    '''
        draw a single attention head on the current matplotlib axes
        args:
            inp     : tick labels for the x axis (source tokens)
            tgt     : tick labels for the y axis (target tokens)
            attn    : 2D attention matrix of one head
            max_idx : number of leading rows and columns of attn to show
    '''
    axes = plt.gca()
    # only the top-left max_idx x max_idx corner is displayed
    axes.matshow(attn[:max_idx, :max_idx])

    # x axis: source tokens, default font
    axes.set_xticks(range(len(inp)))
    axes.set_xticklabels(inp)

    # y axis: target tokens, rendered with the module-level font `prop`
    axes.set_yticks(range(len(tgt)))
    axes.set_yticklabels(tgt, fontproperties=prop)
    
def plot_attn_weights(inp,tgt,attn_heads):
    '''
        plot every attention head of one attention block for a single word
        args:
            inp        : sequence of source tokens for the word; "START"/"END" entries are blanked
            tgt        : sequence of target tokens for the word; "START"/"END" entries are blanked
            attn_heads : sized iterable of 2D attention matrices, one per head
        notes:
            * the caller's inp/tgt sequences are left untouched (copies are cleaned instead)
            * the first token of each sequence is dropped and the labels are cut at the
              first blank entry; when no blank entry exists the whole sequence is used
            * heads are laid out 4 per row, with at least 2 rows
    '''
    fig=plt.figure(figsize=(30,15))

    # blank the marker tokens on copies, then drop the leading token
    markers=("START","END")
    inp=['' if tok in markers else tok for tok in inp][1:]
    tgt=['' if tok in markers else tok for tok in tgt][1:]

    # first blank entry marks where the word stops; fall back to the full length
    # instead of raising ValueError when no marker token is present
    inp_idx=inp.index('') if '' in inp else len(inp)
    tgt_idx=tgt.index('') if '' in tgt else len(tgt)
    max_idx=max(inp_idx,tgt_idx)

    # NOTE(review): labels are shifted by the dropped leading token while the attention
    # matrix is only cropped from the end in plot_attn_head — confirm the alignment is intended
    inp=inp[:max_idx]
    tgt=tgt[:max_idx]

    # grid grows with the number of heads; 8 heads or fewer keep the 2x4 layout
    n_cols=4
    n_rows=max(2,(len(attn_heads)+n_cols-1)//n_cols)
    for h,head in enumerate(attn_heads):
        ax=fig.add_subplot(n_rows,n_cols,h+1)
        plot_attn_head(inp,tgt,head,max_idx)
        ax.set_xlabel(f"Head:{h+1}")
    plt.tight_layout()
    plt.show()
#----------------------------------------
# Model Config
#----------------------------------------
class ModelConfig(object):
    def __init__(self,
                vocab_type,
                vocab_json_path,
                model_weights_dir,
                num_layers       = 4,
                num_heads        = 8,
                d_model          = 128,
                dff              = 512,
                rate             = 0.1,
                pe_max           = 50,
                d_len            = 30,
                pad_value        = 0,
                start_value      = 1,
                end_value        = 2,
                inp_shape        = (30,),
                tar_shape        = (30,)):
        '''
            initialize a model config
            args:
                vocab_type       : unicode or grapheme; its first letter selects the
                                   "<letter>_target_vocab" entry of the vocab json
                vocab_json_path  : path of the json file that contains vocabulary data
                model_weights_dir: directory that contains model_<vocab_type>.h5
                                   (with or without a trailing path separator)
            OPTIONAL:
                num_layers :# number of encoder decoder layers
                num_heads  :# number of attention heads in MHA
                d_model    :# Embedding Dimension
                dff        :# Feed Forward network Dimension
                rate       :# dropout rate
                pe_max     :# max positional encoding
                d_len      :# data length
                pad_value  :# padding value in data encoding
                start_value:# START token index
                end_value  :# END token index
                inp_shape  :# shape of model input
                tar_shape  :# shape of model target
            raises:
                AssertionError : when the vocab json or the weight file is missing, or
                                 when the json lacks one of the three vocab entries
        '''
        # os.path.join tolerates a directory given without the trailing separator
        weight_path = os.path.join(model_weights_dir,f"model_{vocab_type}.h5")

        assert os.path.exists(vocab_json_path),"vocab json missing/wrong file path"
        assert os.path.exists(weight_path),"model weights missing"

        # the vocab holds non-ASCII characters: do not depend on the platform default encoding
        with open(vocab_json_path,encoding="utf-8") as f:
            vocab_json_data = json.load(f)
        assert "source_vocab" in vocab_json_data,"source vocab missing"
        assert "g_target_vocab" in vocab_json_data,"target grapheme vocab missing"
        assert "u_target_vocab" in vocab_json_data,"target unicode vocab missing"

        self.vocab_type       = vocab_type
        self.vocab_json_data  = vocab_json_data
        self.num_layers       = num_layers
        self.num_heads        = num_heads
        self.d_model          = d_model
        self.dff              = dff
        self.rate             = rate
        self.pe_max           = pe_max
        self.d_len            = d_len
        self.pad_value        = pad_value
        self.start_value      = start_value
        self.end_value        = end_value
        self.inp_shape        = inp_shape
        self.tar_shape        = tar_shape
        self.inp_vocab        = vocab_json_data["source_vocab"]
        # "grapheme" -> g_target_vocab, "unicode" -> u_target_vocab
        self.tgt_vocab        = vocab_json_data[f"{vocab_type[0]}_target_vocab"]
        self.inp_voclen       = len(self.inp_vocab)
        self.tgt_voclen       = len(self.tgt_vocab)
        self.weight_path      = weight_path

# one config per target vocabulary; both read the same vocab json and weights directory
cfg_g=ModelConfig("grapheme",vocab_json_path,model_weights_dir)
cfg_u=ModelConfig("unicode",vocab_json_path,model_weights_dir)
# sample sentences; later cells read column 0 as the Bangla text and column 1 as the sentence to transliterate
df= pd.read_csv("../input/dakshina-inspection/data.csv")
df

# Modeling

In [None]:
class Masking(tf.keras.layers.Layer):
    '''
        builds the attention masks from the encoded source and target
        args:
            pad_value : token value that marks padding
            size      : side length of the square look ahead mask
    '''
    def __init__(self,pad_value,size,**kwargs,):
        super().__init__(**kwargs)
        self.size      = size
        self.pad_value = pad_value

    def create_padding_mask(self,seq):
        '''
            1.0 wherever seq equals self.pad_value, 0.0 elsewhere,
            with two singleton axes inserted after the first axis
        '''
        is_pad   = tf.math.equal(seq,self.pad_value)
        pad_mask = tf.cast(is_pad, tf.float32)
        # extra axes let the mask broadcast against the attention logits
        return pad_mask[:,tf.newaxis, tf.newaxis, :]

    def create_look_ahead_mask(self,size):
        '''
            (size, size) matrix with 1.0 strictly above the diagonal and 0.0 elsewhere
        '''
        lower_triangle = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
        return 1 - lower_triangle

    def call(self,inp,tar):
        '''
            create relevant masks:
            args:
                inp : source encoded
                tar : target encoded
            returns:
                mask     : padding mask of the source
                comb_mask: element-wise maximum of the target padding mask
                           and the look ahead mask
        '''
        enc_padding = self.create_padding_mask(inp)
        future_mask = self.create_look_ahead_mask(self.size)
        tar_padding = self.create_padding_mask(tar)
        # a target position is blocked when it is padding OR lies in the future
        return enc_padding,tf.maximum(tar_padding, future_mask)

    def get_config(self):
        '''
            base layer config extended with the constructor arguments
        '''
        settings = super().get_config().copy()
        settings['pad_value'] = self.pad_value
        settings['size']      = self.size
        return settings

class PositionalEncoding(tf.keras.layers.Layer):
    '''
    layer that adds a sinusoidal positional encoding to its input
    args:
      position  :   number of positions in the encoding table
      d_model   :   embedding dim of the table (and of the incoming tensor)
      use_scale :   when truthy the input is multiplied by sqrt(d_model) first
    '''
    def __init__(self,position,d_model,use_scale,**kwargs,):
        super().__init__(**kwargs)
        self.use_scale = use_scale
        self.position  = position
        self.d_model   = d_model

    def call(self,x):
        # table of shape (1, position, d_model)
        table  = self.positional_encoding()
        length = tf.shape(x)[1]
        if self.use_scale:
            x = x * tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        # only the first `length` positions of the table are needed
        return x + table[:, :length, :]

    def get_angles(self,pos, i):
        # consecutive (even, odd) dimension pairs share one rate
        exponent = (2 * (i//2)) / np.float32(self.d_model)
        return pos * (1 / np.power(10000, exponent))

    def positional_encoding(self):
        positions = np.arange(self.position)[:, np.newaxis]
        dims      = np.arange(self.d_model)[np.newaxis, :]
        table     = self.get_angles(positions,dims)
        # even columns -> sin, odd columns -> cos
        table[:, 0::2] = np.sin(table[:, 0::2])
        table[:, 1::2] = np.cos(table[:, 1::2])
        # leading axis of size 1 added in front
        return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

    def get_config(self):
        settings = super().get_config().copy()
        settings['position']  = self.position
        settings['d_model']   = self.d_model
        settings['use_scale'] = self.use_scale
        return settings

    
class MultiHeadAttention(tf.keras.layers.Layer):
    '''
        multi-head scaled dot-product attention
        args:
            d_model   : width of the q/k/v projections and of the output projection
            num_heads : number of heads; d_model must be divisible by it
    '''
    def __init__(self, d_model, num_heads,**kwargs,):
        super(MultiHeadAttention, self).__init__(**kwargs)
        assert d_model % num_heads == 0,"Model Dimension Must be divideable by number of head provided"
        # attrs
        self.num_heads   = num_heads
        self.d_model     = d_model
        # feature size handled by each head
        self.depth       = self.d_model // self.num_heads
        # ops
        # NOTE(review): presumably the creation order of these sub-layers matters when
        # pretrained weights are loaded positionally — confirm before reordering
        self.wq    = tf.keras.layers.Dense(d_model)
        self.wk    = tf.keras.layers.Dense(d_model)
        self.wv    = tf.keras.layers.Dense(d_model)
        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        '''
            * Split the last dimension into (num_heads, depth).
            * Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth)
        '''
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        '''
            project v, k and q, attend per head and merge the heads again
            args:
                v, k, q : value, key and query tensors (note the argument order)
                mask    : mask forwarded to scaled_dot_product_attention, or None
            returns:
                output            : (batch_size, seq_len_q, d_model)
                attention_weights : (batch_size, num_heads, seq_len_q, seq_len_k)
        '''
        # batch size is taken from the query tensor
        batch_size = tf.shape(q)[0]

        q = self.wq(q)  ## (batch_size, seq_len, d_model)
        k = self.wk(k)  ## (batch_size, seq_len, d_model)
        v = self.wv(v)  ## (batch_size, seq_len, d_model)

        q = self.split_heads(q, batch_size)  ## (batch_size, num_heads, seq_len_q, depth)
        k = self.split_heads(k, batch_size)  ## (batch_size, num_heads, seq_len_k, depth)
        v = self.split_heads(v, batch_size)  ## (batch_size, num_heads, seq_len_v, depth)

        # scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
        # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
        scaled_attention, attention_weights = self.scaled_dot_product_attention(q, k, v, mask)
        # (batch_size, seq_len_q, num_heads, depth)
        scaled_attention                    = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  
        # (batch_size, seq_len_q, d_model)
        concat_attention                    = tf.reshape(scaled_attention,(batch_size, -1, self.d_model))  
        # (batch_size, seq_len_q, d_model)
        output                              = self.dense(concat_attention)  
        return output, attention_weights
    
    def scaled_dot_product_attention(self,q, k, v, mask):
        '''
            Calculate the attention weights.

            args:
                q   : query shape == (..., seq_len_q, depth)
                k   : key shape == (..., seq_len_k, depth)
                v   : value shape == (..., seq_len_v, depth_v)
                mask: Float tensor with shape broadcastable to (..., seq_len_q, seq_len_k), or None to skip masking.
            returns:
                output, attention_weights
            NOTES:
            * q, k, v must have matching leading dimensions.
            * k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
            * The mask has different shapes depending on its type(padding or look ahead) but it must be broadcastable for addition.
            * Entries where mask is 1 receive a -1e9 offset on their logit before the softmax.

        '''
        ## (..., seq_len_q, seq_len_k)
        matmul_qk = tf.matmul(q, k, transpose_b=True)  
        # scale matmul_qk by the square root of the key depth
        dk = tf.cast(tf.shape(k)[-1], tf.float32)
        scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
        # add the mask to the scaled tensor.
        if mask is not None:
            scaled_attention_logits += (mask * -1e9)
        # softmax is normalized on the last axis (seq_len_k) so that the scores add up to 1.
        ## (..., seq_len_q, seq_len_k)
        attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  
        ## (..., seq_len_q, depth_v)
        output = tf.matmul(attention_weights, v)  
        return output, attention_weights
    
    def get_config(self):
        '''
            base layer config extended with d_model and num_heads
        '''
        config = super().get_config().copy()
        config.update({
            'd_model'  : self.d_model,
            'num_heads': self.num_heads,
        })
        return config

def point_wise_feed_forward_network(d_model, dff):
    '''
        two stacked Dense layers: a relu layer of width dff followed by a
        linear layer of width d_model
    '''
    expand  = tf.keras.layers.Dense(dff, activation='relu')  ## (batch_size, seq_len, dff)
    project = tf.keras.layers.Dense(d_model)                 ## (batch_size, seq_len, d_model)
    return tf.keras.Sequential([expand, project])

class EncoderBaseLayer(tf.keras.layers.Layer):
    '''
        one encoder block: self attention and a feed forward network, each followed
        by dropout, a residual connection and layer normalization
        args:
            d_model   : model width
            num_heads : number of attention heads
            dff       : hidden width of the feed forward network
            rate      : dropout rate
    '''
    def __init__(self, d_model, num_heads, dff, rate=0.1,**kwargs,):
        super(EncoderBaseLayer, self).__init__(**kwargs)
        # attrs
        self.num_heads = num_heads
        self.d_model   = d_model
        self.dff       = dff
        self.rate      = rate
        # ops
        # NOTE(review): presumably the creation order of these sub-layers matters when
        # pretrained weights are loaded positionally — confirm before reordering
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)
        
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        
    # NOTE(review): `training` defaults to True here, whereas DecoderBaseLayer.call takes no
    # training argument at all — confirm dropout is really disabled at inference time
    def call(self, x,mask,training=True):
        '''
            args:
                x        : input tensor, used as value, key and query
                mask     : mask forwarded to the attention layer
                training : forwarded to both dropout layers
            returns:
                output of the second layer normalization
        '''
        ## op outs:(batch_size, input_seq_len, d_model)
        # self attention; the attention weights are discarded
        attn_output, _ = self.mha(x, x, x, mask)  
        attn_output    = self.dropout1(attn_output, training=training)
        # residual connection + normalization
        out1           = self.layernorm1(x + attn_output)  
        
        ffn_output     = self.ffn(out1)  
        ffn_output     = self.dropout2(ffn_output, training=training)
        # residual connection + normalization
        out2           = self.layernorm2(out1 + ffn_output) 
        return out2
    
    def get_config(self):
        '''
            base layer config extended with the constructor arguments
        '''

        config = super().get_config().copy()
        config.update({
            'd_model'  : self.d_model,
            'num_heads': self.num_heads,
            'dff'      : self.dff,
            'rate'     : self.rate
        })
        return config


class DecoderBaseLayer(tf.keras.layers.Layer):
    '''
        one decoder block: self attention, attention over the encoder output and a
        feed forward network, each followed by dropout, a residual connection and
        layer normalization
        args:
            d_model   : model width
            num_heads : number of attention heads
            dff       : hidden width of the feed forward network
            rate      : dropout rate
    '''
    def __init__(self, d_model, num_heads, dff, rate=0.1,**kwargs,):
        super(DecoderBaseLayer, self).__init__(**kwargs,)
        # attrs
        self.num_heads = num_heads
        self.d_model   = d_model
        self.dff       = dff
        self.rate      = rate
        # ops
        # NOTE(review): presumably the creation order of these sub-layers matters when
        # pretrained weights are loaded positionally — confirm before reordering
        self.mha1 = MultiHeadAttention(self.d_model, self.num_heads)
        self.mha2 = MultiHeadAttention(self.d_model, self.num_heads)
        
        self.ffn  = point_wise_feed_forward_network(self.d_model, self.dff)
        
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        
        self.dropout1 = tf.keras.layers.Dropout(self.rate)
        self.dropout2 = tf.keras.layers.Dropout(self.rate)
        self.dropout3 = tf.keras.layers.Dropout(self.rate)

    def call(self, x, enc,comb_mask,mask):
        '''
            args:
                x         : decoder input tensor
                enc       : encoder output, used as value and key of the second attention
                comb_mask : mask for the first (self) attention
                mask      : mask for the second attention
            returns:
                out3, attn_weights_block1, attn_weights_block2
                (block output plus the weights of the first and second attention)
        '''
        # enc_output.shape == (batch_size, input_seq_len, d_model)

        ##op outs:(batch_size, target_seq_len, d_model)
        # block 1: self attention over the decoder input
        attn1, attn_weights_block1 = self.mha1(x, x, x, comb_mask)  
        attn1                      = self.dropout1(attn1)
        out1                       = self.layernorm1(attn1 + x)

        # block 2: queries come from out1, keys and values from the encoder output
        attn2, attn_weights_block2 = self.mha2(enc, enc, out1,mask)  
        attn2                      = self.dropout2(attn2)
        out2                       = self.layernorm2(attn2 + out1)  

        # feed forward network
        ffn_output                 = self.ffn(out2)  
        ffn_output                 = self.dropout3(ffn_output)
        out3                       = self.layernorm3(ffn_output + out2)  

        return out3, attn_weights_block1, attn_weights_block2
    
    def get_config(self):
        '''
            base layer config extended with the constructor arguments
        '''
        config = super().get_config().copy()
        config.update({
            'd_model'  : self.d_model,
            'num_heads': self.num_heads,
            'dff'      : self.dff,
            'rate'     : self.rate
        })
        return config
def Encoder(inp,mask,cfg):
    '''
        encoder stack: embedding, positional encoding, then cfg.num_layers encoder layers
        args:
            inp  : encoded source tensor
            mask : mask handed to every encoder layer
            cfg  : config providing inp_voclen, d_model, pe_max, num_layers, num_heads, dff, rate
        returns:
            output of the last encoder layer
    '''
    embedding = tf.keras.layers.Embedding(cfg.inp_voclen,cfg.d_model,name="EncoderInputEncoding")
    position  = PositionalEncoding(cfg.pe_max,cfg.d_model,True,name="EncoderPositionalEncoding")
    hidden    = position(embedding(inp))
    for i in range(cfg.num_layers):
        layer  = EncoderBaseLayer(cfg.d_model, cfg.num_heads, cfg.dff, cfg.rate,name=f"EncoderLayer_{i}")
        hidden = layer(hidden,mask)
    return hidden

def Decoder(tar,enc,mask,comb_mask,cfg):
    '''
        decoder stack: embedding, positional encoding, dropout, cfg.num_layers decoder
        layers and a final Dense layer of width cfg.tgt_voclen
        args:
            tar       : encoded target tensor
            enc       : encoder output
            mask      : mask for the attention over the encoder output
            comb_mask : mask for the decoder self attention
            cfg       : config providing tgt_voclen, d_model, pe_max, rate, num_layers, num_heads, dff
        returns:
            logits, dict of attention weights keyed 'decoder_layer{n}_block{1|2}'
    '''
    embedding = tf.keras.layers.Embedding(cfg.tgt_voclen,cfg.d_model,name="DecoderInputEncoding")
    hidden    = embedding(tar)
    hidden    = PositionalEncoding(cfg.pe_max,cfg.d_model,True,name="DecoderPositionalEncoding")(hidden)
    hidden    = tf.keras.layers.Dropout(cfg.rate)(hidden)

    attention = {}
    for i in range(cfg.num_layers):
        layer = DecoderBaseLayer(cfg.d_model, cfg.num_heads, cfg.dff, cfg.rate,name=f"DecoderLayer_{i}")
        hidden,first_block,second_block = layer(hidden,enc,comb_mask,mask)
        # keys are 1-based while the layer names are 0-based
        attention[f'decoder_layer{i+1}_block1'] = first_block
        attention[f'decoder_layer{i+1}_block2'] = second_block

    logits = tf.keras.layers.Dense(cfg.tgt_voclen,name="logits")(hidden)
    return logits,attention


def net(cfg):
    '''
        assemble the transformer as a keras functional model
        args:
            cfg : config providing the shapes, mask settings and stack hyper parameters
        returns:
            model named "TransformerBaseNet" with inputs "input" and "target";
            its outputs are [logits, attention weights dict]
    '''
    source  = tf.keras.layers.Input(shape=cfg.inp_shape,name="input")
    target  = tf.keras.layers.Input(shape=cfg.tar_shape,name="target")

    masking = Masking(pad_value=cfg.pad_value,size=cfg.d_len,name="masks")
    mask,comb_mask = masking(source,target)

    encoded          = Encoder(source,mask,cfg)
    logits,attention = Decoder(target,encoded,mask,comb_mask,cfg)

    return tf.keras.Model(inputs=[source,target],outputs=[logits,attention],name="TransformerBaseNet")


# Transliterator

In [None]:
class Tralsliterator(object):
    '''
        word level greedy transliterator around the transformer built by net(cfg)
    '''
    def __init__(self,cfg):
        '''
            build the network and load its weights
            args:
                cfg : config object; cfg.weight_path is loaded into net(cfg)
        '''
        self.cfg=cfg
        self.transformer=net(cfg)
        self.transformer.load_weights(cfg.weight_path)
        print(f"Loaded Transliterator Weights:{cfg.vocab_type}")

    def process_sentence(self,sentence,return_processed_sentence=False):
        '''
            process a sentence with model tokens
            args:
                sentence                  : raw input string
                return_processed_sentence : when True only the cleaned sentence is returned
            returns:
                * the cleaned sentence (str) when return_processed_sentence is True
                * otherwise an int64 array of shape (number of words, cfg.d_len); each row is
                  START + character indices + END, right padded with cfg.pad_value
            notes:
                * cleaning = lower casing, removing punctuation, removing characters that
                  are not in cfg.inp_vocab and dropping words that end up empty
                * words longer than cfg.d_len-2 characters are truncated so that every row
                  fits the fixed length
                * a sentence without any usable word yields an array of shape (0, cfg.d_len)
        '''
        # the translation table does not depend on the word: build it once
        punct_table=str.maketrans('', '', string.punctuation)
        words=[]
        # clean the sentence
        for word in sentence.lower().split():
            word=word.translate(punct_table)
            word="".join(ch for ch in word if ch in self.cfg.inp_vocab)
            if word.strip():
                words.append(word)
        if return_processed_sentence:
            return " ".join(words)

        # encode; two slots of every row are reserved for START and END
        max_chars=max(self.cfg.d_len-2,0)
        tokens=[]
        for word in words:
            token =[self.cfg.start_value]
            token+=[self.cfg.inp_vocab.index(ch) for ch in word[:max_chars]]
            token.append(self.cfg.end_value)
            token+=[self.cfg.pad_value]*(self.cfg.d_len-len(token))
            tokens.append(token)
        # reshape keeps the result 2D even when there is no word at all
        return np.array(tokens,dtype="int64").reshape(-1,self.cfg.d_len)

    def _decode_labels(self,label):
        # one string per row: skip the first slot and stop at the first END token
        words=[]
        for w_label in label:
            chars=[]
            for v in w_label[1:]:
                if v==self.cfg.end_value:
                    break
                chars.append(self.cfg.tgt_vocab[v])
            words.append("".join(chars))
        return words

    def transliterate(self,sentence,process_attention_visualization=False):
        '''
            transliterate a sentence word by word with greedy decoding
            args:
                sentence                        : raw input string
                process_attention_visualization : also return per word attention data
            returns:
                * the transliterated sentence (words joined by a single space), or
                * (viz_data, sentence) when process_attention_visualization is True, where
                  viz_data holds one dict per word with keys "inp", "tgt" and "attn"
                * a sentence without any usable word gives "" (and an empty viz_data list)
        '''
        inp=self.process_sentence(sentence)
        # nothing to decode: avoid calling the model on an empty batch
        if inp.shape[0]==0:
            return ([],"") if process_attention_visualization else ""

        # every target slot starts as START; slot i+1 is filled from the prediction at step i
        label=np.ones_like(inp,dtype="int64")*self.cfg.start_value
        # sequential decoding; the prediction for the last slot has nowhere to go,
        # so d_len-1 steps are enough
        for i in tqdm(range(self.cfg.d_len-1)):
            pred,_=self.transformer.predict({"input":inp,"target":label})
            # arg-max of the logits equals the arg-max of their softmax
            label[:,i+1]=np.argmax(pred[:,i,:],axis=-1)

        # decode once, after the loop has filled every slot
        words=self._decode_labels(label)

        # prediction
        if not process_attention_visualization:
            return " ".join(words)

        # attention visualization data
        _,w_attn=self.transformer.predict({"input":inp,"target":label})
        viz_data=[]
        for idx,word_inp in enumerate(inp):
            word_tgt=label[idx]
            word_inp=[self.cfg.inp_vocab[i] for i in word_inp]
            word_tgt=[self.cfg.tgt_vocab[i] for i in word_tgt]
            # slice every attention block down to this word
            word_attn={k:v[idx] for k,v in w_attn.items()}
            viz_data.append({"inp":word_inp,"tgt":word_tgt,"attn":word_attn})
        return viz_data," ".join(words)
        


# Prediction

In [None]:
# each constructor builds the network and loads the weights referenced by its config
model_g=Tralsliterator(cfg_g)# grapheme based transliterator
model_u=Tralsliterator(cfg_u)# unicode based transliterator

In [None]:
def get_random_prediction():
    '''
        pick a random row of df and print the input sentence, the ground truth and
        the transliterations produced by the grapheme and the unicode model
    '''
    # randrange excludes len(df); randint(0,len(df)) could select a row past the end
    idx=random.randrange(len(df))
    bangla  =df.iloc[idx,0]
    sentence=df.iloc[idx,1]
    g_pred=model_g.transliterate(sentence)
    u_pred=model_u.transliterate(sentence)
    print("=======================TAKLA=========================")
    print()
    print(sentence)
    print()
    print("=======================GROUND TRUTH==================")
    print()
    print(bangla)
    print()
    print("================GRAPHEME BASED TRANSLITERATION======")
    print()
    print(g_pred)
    print()
    print("================UNICODE BASED TRANSLITERATION=======")
    print(u_pred)
    print()
    
# print one randomly chosen sample
get_random_prediction()

# Attention Scores

In [None]:
# pick a random row; randrange excludes len(df), whereas randint(0,len(df))
# could select a row past the end of the dataframe
idx=random.randrange(len(df))
bangla  =df.iloc[idx,0]
sentence=df.iloc[idx,1]
# viz_data holds one entry per word of the sentence
viz_data,g_pred=model_g.transliterate(sentence,process_attention_visualization=True)
print("=======================TAKLA=========================")
print()
print(sentence)
print()
print("=======================BANGLA=======================")
print()
print(bangla)
print()
print("================GRAPHEME BASED TRANSLITERATION======")
print()
print(g_pred)
print()
print("Word Present:",len(viz_data))

In [None]:
# choose one word of the sentence decoded in the previous cell
word_index=random.randint(0,len(viz_data)-1)
# second attention block (attention over the encoder output) of decoder layer 4
block_name='decoder_layer4_block2'
data=viz_data[word_index]
inp=data["inp"]
tgt=data["tgt"]
# attention weights of the chosen block for this word, one matrix per head
attn=data["attn"][block_name]
plot_attn_weights(inp,tgt,attn)