In [None]:
import numpy as np
import io
import time
import tensorflow as tf
from tensorflow import keras
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from matplotlib.font_manager import FontProperties

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import RNN, SimpleRNN
from tensorflow.keras.layers import GRU
from tensorflow.keras.layers import Dense, Flatten
from keras.utils.vis_utils import plot_model

In [None]:
# getting dataset for running on google colab
# Show the GPU (if any) attached to this runtime.
!nvidia-smi
# Download the Dakshina dataset archive into the current working directory.
!wget https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar
# Extract the archive. The absolute path assumes the download landed in /content
# (presumably the Colab working directory -- confirm when running elsewhere).
!tar -xf /content/dakshina_dataset_v1.0.tar
# Install the Lohit Devanagari font; plot_heatmap loads it for the Hindi tick labels.
!apt-get install -y fonts-lohit-deva
# List the installed font families that cover Hindi.
!fc-list :lang=hi family

In [None]:
# NOTE(review): batch_size and latent_dim are not read by any code visible in this
# notebook (make_model hard-codes its own batch size of 64) -- confirm before relying on them.
batch_size = 64  # Batch size for training.
latent_dim = 256  # Latent dimensionality of the encoding space.
# Paths to the Hindi transliteration lexicon TSV files (train / dev / test splits) on disk.
train_path = "/content/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv"
val_path = "/content/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv"
test_path = "/content/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv"

In [None]:
# dataset preprocessing
def preprocess_sentence(w):
  """Wrap a word in the start ('\\t') and end ('\\n') marker characters."""
  return '\t' + w + '\n'

def create_dataset(path):
  """Read a tab-separated lexicon file and return its columns.

  Every field of every line is wrapped with the start/end markers by
  ``preprocess_sentence``.

  Args:
    path: path of a UTF-8 encoded TSV file.

  Returns:
    A ``zip`` iterator yielding one tuple per column of the file
    (i.e. the rows transposed).
  """
  # Use a context manager so the file handle is closed deterministically
  # instead of being left for the garbage collector.
  with io.open(path, encoding='UTF-8') as handle:
    lines = handle.read().strip().split('\n')

  word_pairs = [[preprocess_sentence(w) for w in line.split('\t')]
                for line in lines]

  # Transpose rows -> columns.
  return zip(*word_pairs)

def tokenize(lang, lang_tokenizer=None):
  """Convert texts to post-padded integer sequences at character level.

  Args:
    lang: iterable of strings to encode.
    lang_tokenizer: an already-fitted tokenizer to reuse; when ``None`` a new
      character-level Keras ``Tokenizer`` is created and fitted on ``lang``.

  Returns:
    ``(padded_sequences, tokenizer)``. No ``maxlen`` is given to
    ``pad_sequences``, so the padded width is determined per call.
  """
  needs_fit = lang_tokenizer is None
  if needs_fit:
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=True)
    lang_tokenizer.fit_on_texts(lang)

  sequences = lang_tokenizer.texts_to_sequences(lang)
  padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post')
  return padded, lang_tokenizer

def load_dataset(path, inp_lang_tokenizer=None, targ_lang_tokenizer=None):
  """Build a shuffled ``tf.data.Dataset`` of (input ids, target ids, target string).

  Args:
    path: TSV file with three columns; the first column is used as the target
      language, the second as the input language and the third is ignored.
    inp_lang_tokenizer: optional fitted tokenizer for the input column.
    targ_lang_tokenizer: optional fitted tokenizer for the target column.

  Returns:
    ``(dataset, inp_lang_tokenizer, targ_lang_tokenizer)``; the tokenizers are
    the ones passed in, or newly fitted ones when ``None`` was given.
  """
  # Column order in the file: target, input, (unused third column).
  targ_lang, inp_lang, _ = create_dataset(path)

  input_tensor, inp_lang_tokenizer = tokenize(inp_lang, inp_lang_tokenizer)
  target_tensor, targ_lang_tokenizer = tokenize(targ_lang, targ_lang_tokenizer)

  # Keep the raw target strings alongside the encoded tensors.
  components = (input_tensor, target_tensor, tf.convert_to_tensor(targ_lang))
  dataset = tf.data.Dataset.from_tensor_slices(components)
  # Shuffle buffer spans the whole dataset.
  dataset = dataset.shuffle(len(dataset))

  return dataset, inp_lang_tokenizer, targ_lang_tokenizer

In [None]:
#loading dataset
# The tokenizers are fitted on the training split only ...
train_dataset, inp_tokenizer, targ_tokenizer = load_dataset(train_path)
# ... and reused unchanged for the validation and test splits so the
# character-to-index mapping is shared by all three datasets.
val_dataset, _, _ = load_dataset(val_path, inp_tokenizer, targ_tokenizer)
test_dataset, _, _ = load_dataset(test_path, inp_tokenizer, targ_tokenizer)

In [None]:
def get_layer(layer_name, num_cells, dropout, return_sequences, return_state):
  """Create a recurrent Keras layer selected by name.

  Args:
    layer_name: one of ``"RNN"`` (SimpleRNN), ``"LSTM"`` or ``"GRU"``.
    num_cells: number of units of the layer.
    dropout: value forwarded as the layer's ``dropout`` argument.
    return_sequences: forwarded to the layer.
    return_state: forwarded to the layer.

  Returns:
    The newly constructed layer.

  Raises:
    ValueError: if ``layer_name`` is not one of the supported names. The
      previous behaviour was to return ``None`` silently, which only failed
      later when the "layer" was called.
  """
  if layer_name == "RNN":
    layer_cls = SimpleRNN
  elif layer_name == "LSTM":
    layer_cls = LSTM
  elif layer_name == "GRU":
    layer_cls = GRU
  else:
    raise ValueError(
        "Unknown layer_name {!r}; expected 'RNN', 'LSTM' or 'GRU'".format(layer_name))
  return layer_cls(num_cells, return_sequences=return_sequences,
                   return_state=return_state, dropout=dropout)

# encoder of our model
class Encoder(tf.keras.Model):
  """Embedding layer followed by a stack of recurrent layers.

  Every recurrent layer is built with ``return_sequences=True`` and
  ``return_state=True``. From the second layer on, each layer is called with
  the complete return value (output plus states) of the layer below it.
  NOTE(review): presumably Keras treats such a list input as
  ``[inputs, *initial_state]`` -- confirm for the installed Keras version.
  """

  def __init__(self, layer_name, evsize, embedd_size, nlayers, num_encoder, bsize, dropout=0.0):
    """
    Args:
      layer_name: recurrent cell type, forwarded to ``get_layer``.
      evsize: input vocabulary size (embedding input dimension).
      embedd_size: embedding output dimension.
      nlayers: number of stacked recurrent layers.
      num_encoder: units per recurrent layer.
      bsize: default batch size used by ``initialize_hidden_state``.
      dropout: dropout value forwarded to every recurrent layer.
    """
    super().__init__()
    self.bsize = bsize
    self.nlayers = nlayers
    self.layer_name = layer_name
    self.num_encoder = num_encoder
    self.vocab_size = evsize
    self.embedding = Embedding(evsize, embedd_size)
    self.layer_list = [
        get_layer(layer_name, num_encoder, dropout, True, True)
        for _ in range(self.nlayers)
    ]

  def call(self, x, hidden=None):
    """Embed the token ids ``x`` and run them through the recurrent stack.

    Returns:
      ``(output, states)`` exactly as produced by ``from_embedd``.
    """
    return self.from_embedd(self.embedding(x), hidden)

  def initialize_hidden_state(self, bsize=-1):
    """Return an all-zero initial state list.

    Args:
      bsize: batch size of the state; ``-1`` selects the size given at
        construction time.

    Returns:
      A list with two entries for ``"LSTM"`` and one entry otherwise, each of
      shape ``(bsize, num_encoder)``.
    """
    if bsize == -1:
      bsize = self.bsize
    state_count = 2 if self.layer_name == "LSTM" else 1
    return [tf.zeros((bsize, self.num_encoder))] * state_count

  def from_embedd(self, x, hidden=None):
    """Run already-embedded inputs through the recurrent stack.

    Args:
      x: embedded input sequence.
      hidden: initial state for the first recurrent layer.

    Returns:
      ``(output, states)`` where ``output`` is the first element returned by
      the last layer and ``states`` holds its remaining elements.
    """
    result = self.layer_list[0](x, initial_state=hidden)
    for idx in range(1, len(self.layer_list)):
      result = self.layer_list[idx](result)
    return result[0], result[1:]
    

In [None]:
# decoder of our model
class Decoder(tf.keras.Model):
  """Single-step decoder: embedding, optional attention, recurrent stack, softmax.

  All recurrent layers but the last return full sequences; the last one
  returns only its final output (plus state), which is flattened and fed to a
  softmax ``Dense`` layer over the target vocabulary.

  NOTE(review): ``BahdanauAttention`` is not defined in the visible part of
  this notebook; it must be in scope before a decoder with ``attention=True``
  is constructed.
  """

  def __init__(self, layer_name, dvsize, embedd_size, nlayers, num_units, bsize, dropout=0.0, attention=False):
    """
    Args:
      layer_name: recurrent cell type, forwarded to ``get_layer``.
      dvsize: target vocabulary size.
      embedd_size: embedding output dimension.
      nlayers: number of stacked recurrent layers.
      num_units: units per recurrent layer (and of the attention layer).
      bsize: batch size, stored as ``batch_sz``.
      dropout: dropout value forwarded to every recurrent layer.
      attention: whether to build and use the attention layer.
    """
    super().__init__()
    self.batch_sz = bsize
    self.layer_name = layer_name
    self.vocab_size = dvsize
    self.dec_units = num_units
    self.dropout = dropout
    self.nlayers = nlayers
    self.attention = attention
    self.embedding = tf.keras.layers.Embedding(input_dim=dvsize, output_dim=embedd_size)
    # Lower layers emit sequences, the top layer only its last output.
    lower_layers = [get_layer(layer_name, num_units, dropout, True, True)
                    for _ in range(nlayers - 1)]
    top_layer = get_layer(layer_name, num_units, dropout, False, True)
    self.layer_list = lower_layers + [top_layer]
    self.dense = Dense(self.vocab_size, activation="softmax")
    self.flatten = Flatten()
    if self.attention:
      self.attention_layer = BahdanauAttention(num_units)

  def call(self, x, hidden, enc_output=None):
    """Run one decoding step and return vocabulary probabilities.

    Returns:
      ``(probabilities, decoder_state, attention_weights)``;
      ``attention_weights`` is ``None`` when attention is disabled.
    """
    rnn_out, dec_h, att_wts = self.rnn_output(x, hidden, enc_output)
    probabilities = self.dense(self.flatten(rnn_out))
    return probabilities, dec_h, att_wts

  def rnn_output(self, x, hidden, enc_output=None):
    """Run one decoding step up to (and excluding) the softmax layer.

    Args:
      x: token ids of the current decoder input.
      hidden: state passed to the attention layer (if any) and used as the
        initial state of the first recurrent layer.
      enc_output: encoder outputs, used only when attention is enabled.

    Returns:
      ``(rnn_output, decoder_state, attention_weights)``.
    """
    embedded = self.embedding(x)

    att_wts = None
    if self.attention:
      context, att_wts = self.attention_layer(hidden, enc_output)
      # Prepend the context vector to the embedded input along the feature axis.
      embedded = tf.concat([tf.expand_dims(context, 1), embedded], axis=-1)

    result = self.layer_list[0](embedded, initial_state=hidden)
    for idx in range(1, len(self.layer_list)):
      result = self.layer_list[idx](result)
    return result[0], result[1:], att_wts

# encoder-decoder model class
class make_model():
  """Encoder-decoder transliteration model with a hand-written training loop.

  The class wires an ``Encoder`` and a ``Decoder`` together and provides
  training (``fit``), character-level evaluation (``predict``), greedy
  decoding of single words (``transliterate``), word-level evaluation
  (``word_level``), attention heatmaps and input-connectivity gradients.

  The constructor reads the module-level ``inp_tokenizer`` and
  ``targ_tokenizer``; ``word_level`` reads the module-level ``test_path``;
  wandb logging uses the module-level ``wandb`` import.
  ``max_inp_len`` / ``max_targ_len`` are only set by ``fit``, so the
  decoding helpers require ``fit`` to have been called first.
  """

  def __init__(self, embedded_dim, enc_layers, dec_layers, layer_name, num_units, dropout, attention=False, wandbcallback=False):
    """
    Args:
      embedded_dim: embedding size of both encoder and decoder.
      enc_layers: number of recurrent layers in the encoder.
      dec_layers: number of recurrent layers in the decoder.
      layer_name: ``"RNN"``, ``"LSTM"`` or ``"GRU"``.
      num_units: units per recurrent layer.
      dropout: dropout value forwarded to the recurrent layers.
      attention: whether the decoder uses attention.
      wandbcallback: when ``True``, ``fit`` logs epoch metrics to wandb.
    """
    # +1 leaves room for index 0, which get_next_char treats as padding.
    evsize = len(inp_tokenizer.word_index) + 1
    dvsize = len(targ_tokenizer.word_index) + 1
    self.callback = wandbcallback
    self.batch_size = 64
    self.inp_tokenizer = inp_tokenizer
    self.targ_tokenizer = targ_tokenizer
    self.encoder = Encoder(layer_name, evsize, embedded_dim, enc_layers, num_units, self.batch_size, dropout)
    self.decoder = Decoder(layer_name, dvsize, embedded_dim, dec_layers, num_units, self.batch_size, dropout, attention)

  def get_vsize(self):
    """Return ``(input_vocab_size, target_vocab_size)`` of this model's tokenizers."""
    evsize = len(self.inp_tokenizer.word_index) + 1
    dvsize = len(self.targ_tokenizer.word_index) + 1
    return evsize, dvsize

  # DO NOT USE
  def create(self):
    """Unfinished functional-API variant, kept for reference only.

    Known not to work as written: ``Decoder.call`` returns three values but
    only two are unpacked below.
    """
    enc_inputs = Input(shape=(None,))
    enc_output, state_h = self.encoder(enc_inputs)
    enc_states = [state_h]

    dec_inputs = Input(shape=(None,))
    dec_state = state_h
    dec_output, att_wts = self.decoder(dec_inputs, dec_state, enc_output)

    self.model = Model([enc_inputs, dec_inputs], dec_output)
    self.model.summary()

  def build(self, loss, optimizer, metric):
    """Attach the loss function, optimizer and metric used by the train/test steps."""
    self.loss_function = loss
    self.optimizer = optimizer
    self.metric = metric

  def decoder_input(self, ii=None):
    """Return a column of start tokens (``"\\t"``) for ``ii`` sequences.

    Args:
      ii: number of sequences; defaults to the model batch size.
    """
    count = self.batch_size if ii is None else ii
    return tf.expand_dims([self.targ_tokenizer.word_index["\t"]] * count, 1)

  @tf.function
  def train_step(self,input,target,encoder_hidden):
    """Run one optimisation step on a batch.

    The decoder is fed its own greedy prediction from the previous time step.

    Args:
      input: batch of encoded input sequences.
      target: batch of encoded target sequences.
      encoder_hidden: initial encoder state.

    Returns:
      ``(batch_loss, metric_value)`` where ``batch_loss`` is the summed loss
      divided by the target length and ``metric_value`` is the metric's
      running result since its last reset.
    """
    loss = 0
    with tf.GradientTape() as tape:
      # training=True is required for the recurrent layers' dropout to be
      # applied; without it Keras runs the layers in inference mode and the
      # dropout hyper-parameter has no effect.
      encoder_output, encoder_hidden = self.encoder(input, encoder_hidden, training=True)
      decoder_hidden = encoder_hidden
      decoder_input = self.decoder_input()
      for t in range(1, target.shape[1]):
        predictions, decoder_hidden, _ = self.decoder(
            decoder_input, decoder_hidden, encoder_output, training=True)
        loss += self.loss_function(target[:, t], predictions)
        self.metric.update_state(target[:, t], predictions)
        # Greedy choice, reshaped to (batch, 1) to match the decoder input.
        decoder_input = tf.expand_dims(tf.argmax(predictions, 1), 1)
    batch_loss = loss / int(target.shape[1])
    # Gradients are taken after leaving the tape context so the backward pass
    # itself is not recorded.
    variables = self.encoder.trainable_variables + self.decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    self.optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss, self.metric.result()

  def _run_epoch(self, step_fn, batched_dataset, steps):
    """Run ``step_fn`` over ``steps`` batches; return (mean loss, accuracy).

    The accuracy is the metric's final result after the last batch, i.e. the
    accuracy over everything seen since the reset. Averaging the per-step
    running results instead would over-weight the early batches.
    """
    if steps <= 0:
      raise ValueError("dataset holds fewer examples than one batch of {}".format(self.batch_size))
    encoder_hidden = self.encoder.initialize_hidden_state()
    self.metric.reset_states()
    total_loss = 0.0
    for inputs, targets, _ in batched_dataset.take(steps):
      batch_loss, _ = step_fn(inputs, targets, encoder_hidden)
      total_loss += float(batch_loss)
    return total_loss / steps, float(self.metric.result())

  def fit(self, dataset, val_dataset, epochs, iswandb=False, teacher_ratio = 1.0):
    """Train for ``epochs`` epochs, printing train/validation loss and accuracy.

    Args:
      dataset: unbatched training dataset of (input, target, target_str).
      val_dataset: unbatched validation dataset with the same structure.
      epochs: number of epochs.
      iswandb: when true, log epoch metrics to wandb (in addition to the
        ``wandbcallback`` flag given to the constructor).
      teacher_ratio: stored on the instance; not read anywhere in this class.
    """
    self.epochs = epochs
    steps_per_epoch = len(dataset) // self.batch_size
    steps_per_epoch_val = len(val_dataset) // self.batch_size
    dataset = dataset.batch(self.batch_size, drop_remainder=True)
    val_dataset = val_dataset.batch(self.batch_size, drop_remainder=True)
    # Remember the padded sequence lengths for the single-word helpers.
    sample_inp, sample_targ, _ = next(iter(dataset))
    self.max_inp_len = sample_inp.shape[1]
    self.max_targ_len = sample_targ.shape[1]
    self.teacher_ratio = teacher_ratio
    log_to_wandb = (self.callback == True) or bool(iswandb)

    for epoch in range(self.epochs):
      start = time.time()
      loss, acc = self._run_epoch(self.train_step, dataset, steps_per_epoch)
      val_loss, val_acc = self._run_epoch(self.test_step, val_dataset, steps_per_epoch_val)
      print("Epoch: "+str(epoch+1)+"/"+str(self.epochs)+" trained,\t Time taken : "+str(int(time.time()-start))+"s")
      print("loss : {0:.4f}  acc : {1:.4f},   val_loss : {2:.4f}  val_acc : {3:.4f}".format(loss, acc, val_loss, val_acc))
      if log_to_wandb:
        wandb.log({'val_loss': val_loss, 'val_accuracy': val_acc,
                    'loss': loss, 'accuracy': acc, 'epoch': epoch+1})
    print("Training done.")

  @tf.function
  def test_step(self, input, target, encoder_hidden):
    """Evaluate one batch with greedy decoding (no weight update).

    Returns:
      ``(batch_loss, metric_value)`` with the same meaning as in ``train_step``.
    """
    loss = 0
    encoder_output, encoder_hidden = self.encoder(input, encoder_hidden)
    decoder_hidden = encoder_hidden
    decoder_input = self.decoder_input()
    for t in range(1, target.shape[1]):
      predictions, decoder_hidden, _ = self.decoder(decoder_input, decoder_hidden, encoder_output)
      loss += self.loss_function(target[:, t], predictions)
      self.metric.update_state(target[:, t], predictions)
      # Greedy choice, reshaped to (batch, 1) to match the decoder input.
      decoder_input = tf.expand_dims(tf.argmax(predictions, 1), 1)
    batch_loss = loss / int(target.shape[1])
    return batch_loss, self.metric.result()

  # get prediction score on dataset (char-level)
  def predict(self, test_dataset):
    """Print the character-level loss and accuracy on an unbatched dataset."""
    steps_per_epoch = len(test_dataset) // self.batch_size
    dataset = test_dataset.batch(self.batch_size, drop_remainder=True)
    avg_loss, avg_acc = self._run_epoch(self.test_step, dataset, steps_per_epoch)
    print("Character level -  loss : {0:.4f}   acc : {1:.4f}".format(avg_loss, avg_acc))

  def get_input(self, word):
    """Encode one word as a padded id sequence of length ``max_inp_len``.

    The word is wrapped in the start/end markers before encoding, so position
    0 of the result is the ``"\\t"`` start marker.
    """
    word = '\t'+word+'\n'
    sequence = self.inp_tokenizer.texts_to_sequences([word])
    return tf.keras.preprocessing.sequence.pad_sequences(
        sequence, maxlen=self.max_inp_len, padding="post")

  def get_next_char(self, predictions):
    """Map a single predicted index to its target character.

    Returns:
      ``(char, flag)``. Index 0 has no entry in ``index_word``; in that case
      the character with index 5 is returned instead and ``flag`` is 1,
      otherwise ``flag`` is 0.
      NOTE(review): the choice of index 5 as the stand-in is not explained
      anywhere in this file.
    """
    index = predictions.numpy().item()
    flag = 0
    if index == 0:
      index = 5
      flag = 1
    return self.targ_tokenizer.index_word[index], flag

  # transliterate one word
  def transliterate(self,word,val=False):
    """Greedily decode one word.

    Args:
      word: romanized input word (without markers).
      val: when true and the decoder returns attention weights, record them.

    Returns:
      ``(new_word, boolflag, attnweightplot)`` where ``boolflag`` counts the
      steps on which index 0 was predicted and ``attnweightplot`` has shape
      ``(max_targ_len, max_inp_len)``; row ``i`` holds the attention weights
      of the step that produced output character ``i`` (all zeros when
      nothing was recorded).
    """
    encoded = self.get_input(word)
    new_word = ''
    attnweightplot = np.zeros((self.max_targ_len, self.max_inp_len))
    encoder_hidden = self.encoder.initialize_hidden_state(1)
    encoder_output, encoder_hidden = self.encoder(encoded, encoder_hidden)
    decoder_hidden = encoder_hidden
    decoder_input = self.decoder_input(1)
    boolflag = 0
    for t in range(1, self.max_targ_len):
      predictions, decoder_hidden, attnweights = self.decoder(decoder_input, decoder_hidden, encoder_output)
      predictions = tf.argmax(predictions, 1)
      # attnweights is None when the decoder was built without attention.
      if val and attnweights is not None:
        # Step t produces output character t-1, so store it in row t-1;
        # storing at row t left row 0 empty and shifted every row by one.
        attnweightplot[t-1] = tf.reshape(attnweights, (-1,)).numpy()
      next_char, flag = self.get_next_char(predictions)
      boolflag += flag
      if next_char == "\n":
        break
      new_word += next_char
      decoder_input = tf.expand_dims(predictions, 1)
    return new_word, boolflag, attnweightplot

  def plot_heatmap(self,word, wandbcallback=False):
    """Plot the attention weights of a transliteration as a heatmap.

    Rows (y axis) are the predicted characters, columns (x axis) are the
    characters of the input word.

    Args:
      word: romanized input word.
      wandbcallback: when ``True`` log the figure to wandb instead of showing it.
    """
    new_word, _, attnweightplot = self.transliterate(word, True)
    xx = list(word)
    yy = list(new_word)
    # Column 0 belongs to the "\t" start marker that get_input prepends, so
    # the characters of `word` occupy columns 1 .. len(word).
    attention = attnweightplot[:len(yy), 1:len(xx)+1]

    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(attention, cmap='viridis')
    fontdict = {'fontsize': 14}
    hindi_font = FontProperties(fname="/usr/share/fonts/truetype/lohit-devanagari/Lohit-Devanagari.ttf")
    # Labels follow the matrix orientation: input characters on x, predicted
    # (Devanagari) characters on y. The leading '' matches the first tick.
    ax.set_xticklabels([''] + xx, fontdict=fontdict, rotation=90)
    ax.set_yticklabels([''] + yy, fontproperties=hindi_font, fontdict=fontdict)
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
    if wandbcallback == True:
      wandb.log({"att_hm": plt})
    else:
      plt.show()

  def word_level(self,name):
    """Evaluate exact-match word accuracy on the test file and save predictions.

    Prints the accuracy over all words and ``acc2``, the accuracy over the
    words for which index 0 was never predicted. Writes ``<name>.csv`` with
    inputs/targets/outputs and ``<name>-correct.csv`` with the correct
    predictions.

    Args:
      name: file name prefix of the two CSV files.
    """
    # keep_default_na=False stops pandas from turning words such as "null"
    # or "nan" into missing values.
    test_tsv = pd.read_csv(test_path, sep="\t", header=None, keep_default_na=False)
    inputs = test_tsv[1].astype(str).tolist()
    targets = test_tsv[0].astype(str).tolist()

    outputs = []
    pad_flags = []
    for word in inputs:
      # Use this instance, not whatever global happens to be named `model`.
      predicted, flag, _ = self.transliterate(word)
      outputs.append(predicted)
      pad_flags.append(flag)

    matches = [out == targ for out, targ in zip(outputs, targets)]
    acc1 = sum(matches) / len(outputs) if outputs else 0.0
    cor = [out for out, ok in zip(outputs, matches) if ok]

    # Accuracy restricted to predictions that never hit the padding index.
    clean_matches = [ok for ok, flag in zip(matches, pad_flags) if flag == 0]
    acc2 = sum(clean_matches) / len(clean_matches) if clean_matches else 0.0
    print("Word level acc : " + str(acc1))
    print("acc2 : " +str(acc2))

    df = pd.DataFrame()
    df["inputs"] = inputs
    df["targets"] = targets
    df["outputs"] = outputs
    df.to_csv(name+'.csv')
    df2 = pd.DataFrame()
    df2["correct-preds"] = cor
    df2.to_csv(name+'-correct.csv')

  def get_connectivity(self, word):
    """Greedily decode ``word`` and collect input-embedding gradients.

    Returns:
      ``(new_word, gradlist)`` where ``gradlist`` has one entry per character
      of ``new_word``: the gradient of that step's decoder RNN output with
      respect to the embedded input sequence (batch dimension removed).
      Position 0 of each gradient corresponds to the ``"\\t"`` start marker.
    """
    encoded = self.get_input(word)
    new_word = ''
    step_outputs = []
    enc_hidden = self.encoder.initialize_hidden_state(1)
    embedd_in = self.encoder.embedding(encoded)
    with tf.GradientTape(persistent=True, watch_accessed_variables=False) as tape:
      # Only the embedded input is tracked; model variables are not.
      tape.watch(embedd_in)
      enc_out, enc_hidden = self.encoder.from_embedd(embedd_in, enc_hidden)
      dec_hidden = enc_hidden
      dec_input = self.decoder_input(1)
      for t in range(1, self.max_targ_len):
        rnn_out, dec_hidden, _ = self.decoder.rnn_output(dec_input, dec_hidden, enc_out)
        predictions = self.decoder.dense(self.decoder.flatten(rnn_out))
        predictions = tf.argmax(predictions, 1)
        next_char = self.get_next_char(predictions)[0]
        if next_char == "\n":
          break
        new_word += next_char
        step_outputs.append(rnn_out)
        dec_input = tf.expand_dims(predictions, 1)
    # The tape is persistent, so gradients can be taken after leaving the
    # context; this avoids recording the backward passes on the tape.
    gradlist = [tape.gradient(out, embedd_in)[0] for out in step_outputs]
    del tape  # release the resources held by the persistent tape
    return new_word, gradlist

In [None]:
# for running on custom model
# Positional arguments follow make_model.__init__: embedded_dim=128, enc_layers=2,
# dec_layers=2, layer_name='GRU', num_units=128, dropout=0.3, attention=True.
model = make_model(128, 2, 2, 'GRU', 128, 0.3, True)
# Attach the loss, optimizer and metric used by the custom train/test steps.
model.build(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                optimizer = tf.keras.optimizers.Adam(),
                metric = tf.keras.metrics.SparseCategoricalAccuracy())



In [None]:
# Train for 7 epochs, then print the character-level loss/accuracy on the test split.
model.fit(train_dataset, val_dataset,7)
model.predict(test_dataset)

# uncomment to find prediction at word level
#model.word_level('predictions')

In [None]:
# plotting attention heatmaps
# `ll` was only defined in a commented-out line, so this cell raised NameError
# on a fresh kernel; the example words are restored here.
ll = ['ankit', 'ank']
for x in ll:
  model.plot_heatmap(x)

In [None]:
# wandb sweeps in colab
# Install wandb quietly into the runtime, then import it. make_model.fit and
# make_model.plot_heatmap reference the module-level name `wandb` when logging is enabled.
!pip install wandb -q
import wandb
# NOTE(review): WandbCallback is not used by any code visible in this notebook.
from wandb.keras import WandbCallback

# Authenticate this runtime with Weights & Biases.
wandb.login()

In [None]:
# sweep config for logging on wandb
# Random search that maximises 'val_accuracy', the key logged by make_model.fit.
# The parameter names match the attributes read from wandb.config in train().
sweep_config = {
    'method': 'random', #grid, bayes
    'metric': {
      'name': 'val_accuracy',
      'goal': 'maximize'   
    },
    'parameters': {
        'embedded_dim' : {
            'values' : [16,32,64,256]
        },
        'num_units' : {
            'values' : [16,32,64,256]
        },
        'enc_layers' : {
            'values' : [1,2,3]
        },
        'dec_layers' : {
            'values' : [1,2,3]
        },
        'dropout' : {
            'values' : [0.0,0.2,0.3]
        },
        'layer_name' : {
            'values' : ['RNN', 'GRU', 'LSTM']
        },
        # Attention is switched off for every run of this sweep.
        'attention':{
            'values': [False]
        },
        'epochs':{
            'values': [10,15]
        }
    }
}

# Register the sweep under the project and keep its id for wandb.agent.
sweep_id = wandb.sweep(sweep_config,project = "cs6910-a3")


In [None]:
# wandb train function
def train():
    """Run one wandb sweep trial: build a model from ``wandb.config`` and train it.

    The run is named after its hyper-parameters, the model is built with
    wandb logging enabled and trained on the module-level ``train_dataset`` /
    ``val_dataset`` for ``config.epochs`` epochs.
    """
    # Defaults merged into the run configuration.
    run_defaults = {'wandbcallback': True}
    wandb.init(project="cs6910-a3", config=run_defaults)
    config = wandb.config

    # Run name: "<tag>-<value>" pairs joined with underscores.
    name_parts = [
        ("ln", config.layer_name),
        ("nu", config.num_units),
        ("ed", config.embedded_dim),
        ("el", config.enc_layers),
        ("dl", config.dec_layers),
        ("dr", config.dropout),
    ]
    wandb.run.name = "_".join(tag + "-" + str(value) for tag, value in name_parts)

    model = make_model(
        config.embedded_dim,
        config.enc_layers,
        config.dec_layers,
        config.layer_name,
        config.num_units,
        config.dropout,
        config.attention,
        True,
    )
    model.build(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        optimizer=tf.keras.optimizers.Adam(),
        metric=tf.keras.metrics.SparseCategoricalAccuracy(),
    )
    model.fit(train_dataset, val_dataset, epochs=config.epochs)
    

In [None]:
# Start a sweep agent that calls train() for 5 runs of the registered sweep.
wandb.agent(sweep_id, function = train, count = 5)

In [None]:
# visualising connectivity
from sklearn.preprocessing import MinMaxScaler
from keras.utils import np_utils
import matplotlib
from IPython.display import HTML as html_print
from IPython.display import display
import tensorflow.keras.backend as K

# get html element
def cstr(s, color='black'):
    """Return an HTML ``<text>`` fragment showing ``s`` on a coloured background.

    A single space is rendered as a padded blank so it stays visible; any
    other string is followed by one trailing space.
    """
    if s != ' ':
        return "<text style=color:#000;background-color:{}>{} </text>".format(color, s)
    return "<text style=color:#000;padding-left:10px;background-color:{}> </text>".format(color)
	
# print html
def print_color(t):
    """Display a sequence of ``(text, colour)`` pairs as one line of coloured HTML."""
    fragments = [cstr(ti, color=ci) for ti, ci in t]
    display(html_print(''.join(fragments)))

# get appropriate color for value
def get_clr(value):
    """Map a value in [0, 1] to one of 20 hex colours (blue = low, red = high).

    Args:
      value: a number, or a one-element array, expected to lie in [0, 1].
        Values outside that range are clamped to the nearest end of the
        palette.

    Returns:
      A ``'#rrggbb'`` colour string.
    """
    # A missing comma after '#a1d0e8' used to fuse it with the next entry,
    # producing an invalid colour and a palette of only 19 entries.
    colors = ['#85c2e1', '#89c4e2', '#95cae5', '#99cce6', '#a1d0e8',
      '#b2d9ec', '#baddee', '#c2e1f0', '#eff7fb', '#f9e8e8',
      '#f9e8e8', '#f9d4d4', '#f9bdbd', '#f8a8a8', '#f68f8f',
      '#f47676', '#f45f5f', '#f34343', '#f33b3b', '#f42e2e']
    last = len(colors) - 1
    # .item() also accepts the one-element rows produced by get_activation.
    index = int(np.asarray(value).item() * last)
    # Clamp so slight overshoots of [0, 1] cannot index out of range.
    return colors[min(max(index, 0), last)]

# sigmoid function
def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))`` of a number or array."""
    return 1 / (1 + np.exp(-x))

def visualize(grads, word, new_word):
    """Print, for every output character, the input word coloured by connectivity.

    Args:
      grads: one sequence of scaled values per character of ``new_word``;
        entry ``j`` of each sequence colours ``word[j]``.
      word: the romanized input word.
      new_word: the predicted word.
    """
    print("Romanized:", word)
    print("Devanagari:", new_word)
    for i, out_char in enumerate(new_word):
        print("Connectivity visualization for", out_char, ":")
        char_scores = grads[i]
        text_colours = [(word[j], get_clr(char_scores[j]))
                        for j in range(len(char_scores))]
        print_color(text_colours)

def get_activation(grads, word):
    """Turn per-step embedding gradients into min-max scaled per-character scores.

    Args:
      grads: list of 2-D gradients (sequence position x embedding dimension),
        as returned by ``make_model.get_connectivity``. That method encodes
        ``'\\t' + word + '\\n'``, so position 0 is the start marker and the
        characters of ``word`` occupy positions ``1 .. len(word)``.
      word: the romanized input word.

    Returns:
      A list with one ``(n, 1)`` array per gradient, holding the L2 norm of
      each character's gradient scaled to [0, 1]. ``n`` is ``len(word)``
      unless the gradient is shorter. A gradient whose norms are all equal
      yields zeros.
    """
    act_grad = []
    for tensor in grads:
        norms = np.linalg.norm(np.asarray(tensor), axis=1)
        # Skip the start marker; slicing [:len(word)] was shifted by one
        # position and dropped the last character.
        norms = norms[1:len(word) + 1]
        if norms.size == 0:
            act_grad.append(norms.reshape(-1, 1))
            continue
        lowest = norms.min()
        span = norms.max() - lowest
        if span == 0:
            scaled = np.zeros_like(norms)
        else:
            scaled = (norms - lowest) / span
        act_grad.append(scaled.reshape(-1, 1))
    return act_grad


def connectivity_vis(model, word):
    """Decode ``word`` with ``model`` and display its connectivity visualisation."""
    new_word, grads = model.get_connectivity(word)
    visualize(get_activation(grads, word), word, new_word)

In [None]:
# Show, for each predicted character, how strongly it depends on every input character of 'bhagwan'.
connectivity_vis(model, 'bhagwan')