In [None]:
import numpy as np
import io
import time
import tensorflow as tf
from tensorflow import keras
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from matplotlib.font_manager import FontProperties

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import RNN, SimpleRNN
from tensorflow.keras.layers import GRU
from tensorflow.keras.layers import Dense, Flatten
from keras.utils.vis_utils import plot_model

In [None]:
# Environment setup cell (shell escapes, run once per fresh runtime).
# Query the NVIDIA driver tool, presumably to confirm a GPU is attached.
!nvidia-smi
# Download the Dakshina dataset v1.0 archive into the current working directory.
!wget https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar
# Extract the archive. NOTE(review): the absolute /content path assumes the
# download above landed in /content (e.g. a Colab runtime) - confirm.
!tar -xf /content/dakshina_dataset_v1.0.tar
# Install the Lohit Devanagari font package, presumably so that Hindi text can
# be rendered in matplotlib figures - TODO confirm against the plotting cells.
!apt-get install -y fonts-lohit-deva
# List the font families that fontconfig reports for the Hindi (hi) language.
!fc-list :lang=hi family

In [None]:
# Notebook-wide configuration constants.
batch_size = 64  # Batch size for training.
latent_dim = 256  # Latent dimensionality of the encoding space.
# Paths to the Hindi ("hi") transliteration lexicon TSV files inside the
# extracted Dakshina archive: the sampled train, dev and test splits.
# The dev split is the one bound to `val_path`.
train_path = "/content/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv"
val_path = "/content/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv"
test_path = "/content/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv"

In [None]:
def preprocess_sentence(w):
  """Wrap a string in the start and end markers used for sequence decoding.

  A tab character is prepended as the start token and a newline is appended
  as the end token, so that the model knows when to start and when to stop
  predicting.

  Args:
    w: The string to wrap.

  Returns:
    `w` with a leading tab and a trailing newline.
  """
  return "".join(("\t", w, "\n"))

def create_dataset(path):
  """Read a tab-separated text file and return its columns.

  The file is read as UTF-8, stripped of leading/trailing whitespace and split
  into lines. Every tab-separated field of every line is wrapped with the
  start/end tokens by `preprocess_sentence`.

  Args:
    path: Path of the tab-separated file to read.

  Returns:
    A `zip` iterator yielding one tuple per column, each tuple holding that
    column's wrapped fields in file order. Because `zip` stops at the shortest
    row, a line with fewer fields than the others truncates the number of
    columns returned.
  """
  # Use a context manager so the file handle is closed as soon as the content
  # has been read, instead of being left open until garbage collection.
  with io.open(path, encoding='UTF-8') as f:
    lines = f.read().strip().split('\n')

  word_pairs = [[preprocess_sentence(w) for w in line.split('\t')]
                for line in lines]

  # Transpose the list of rows into per-column tuples.
  return zip(*word_pairs)

def tokenize(lang, lang_tokenizer=None):
  """Encode texts as post-padded sequences of character ids.

  Args:
    lang: Collection of strings to encode.
    lang_tokenizer: Optional Keras `Tokenizer` to reuse. When it is None, a new
      character-level tokenizer is created and fitted on `lang`; a tokenizer
      that is passed in is used as-is and is not fitted again.

  Returns:
    A `(tensor, tokenizer)` pair: the result of `pad_sequences` (padding added
    after each sequence) and the tokenizer that produced the ids.
  """
  tokenizer = lang_tokenizer
  if tokenizer is None:
    # No tokenizer supplied: build the vocabulary from these texts.
    tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=True)
    tokenizer.fit_on_texts(lang)

  sequences = tokenizer.texts_to_sequences(lang)
  padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post')
  return padded, tokenizer

def load_dataset(path, inp_lang_tokenizer=None, targ_lang_tokenizer=None):
  """Build a shuffled `tf.data.Dataset` from a three-column lexicon file.

  The first column of the file is used as the target text and the second as
  the input text; the third column is discarded. Unpacking fails if
  `create_dataset` does not yield exactly three columns.

  Args:
    path: Path of the tab-separated file, forwarded to `create_dataset`.
    inp_lang_tokenizer: Optional tokenizer for the input column, forwarded to
      `tokenize` (which creates and fits a new one when this is None).
    targ_lang_tokenizer: Optional tokenizer for the target column, handled the
      same way.

  Returns:
    `(dataset, inp_lang_tokenizer, targ_lang_tokenizer)`. Each dataset element
    is `(input ids, target ids, target string)`, and the dataset is shuffled
    with a buffer as large as the dataset itself.
  """
  target_words, input_words, _ = create_dataset(path)

  input_ids, inp_lang_tokenizer = tokenize(input_words, inp_lang_tokenizer)
  target_ids, targ_lang_tokenizer = tokenize(target_words, targ_lang_tokenizer)

  # Keep the raw target strings next to the encoded tensors.
  components = (input_ids, target_ids, tf.convert_to_tensor(target_words))
  examples = tf.data.Dataset.from_tensor_slices(components)
  examples = examples.shuffle(len(examples))

  return examples, inp_lang_tokenizer, targ_lang_tokenizer

In [None]:
# No tokenizers are passed for the training split, so load_dataset creates the
# input and target character tokenizers and fits them on the training data.
train_dataset, inp_tokenizer, targ_tokenizer = load_dataset(train_path)
# The validation and test splits reuse the training tokenizers so that all
# three splits share the same character-to-id mapping.
# NOTE(review): pad_sequences is called without maxlen, so each split is
# presumably padded to its own longest sequence - confirm downstream code does
# not assume a common sequence length across splits.
val_dataset, _, _ = load_dataset(val_path, inp_tokenizer, targ_tokenizer)
test_dataset, _, _ = load_dataset(test_path, inp_tokenizer, targ_tokenizer)

In [None]:
def get_layer(layer_name, num_cells, dropout, return_sequences, return_state):
  """Build the recurrent Keras layer selected by `layer_name`.

  Args:
    layer_name: "RNN" (builds a `SimpleRNN`), "LSTM" or "GRU". The comparison
      is case-sensitive.
    num_cells: Passed as the first positional argument of the layer.
    dropout: Passed as the layer's `dropout` argument.
    return_sequences: Passed as the layer's `return_sequences` argument.
    return_state: Passed as the layer's `return_state` argument.

  Returns:
    A new layer instance of the requested type.

  Raises:
    ValueError: If `layer_name` is not one of the supported names. Without
      this check the function would return None and the mistake would only
      surface later, when the missing layer is called.
  """
  if layer_name == "RNN":
    layer_cls = SimpleRNN
  elif layer_name == "LSTM":
    layer_cls = LSTM
  elif layer_name == "GRU":
    layer_cls = GRU
  else:
    raise ValueError(
        f"Unknown layer_name {layer_name!r}; expected 'RNN', 'LSTM' or 'GRU'.")

  return layer_cls(num_cells, return_sequences=return_sequences,
                   return_state=return_state, dropout=dropout)

class Encoder(tf.keras.Model):
  """Embedding layer followed by a stack of recurrent layers.

  Every recurrent layer is created through `get_layer` with
  `return_sequences=True` and `return_state=True`, so each one yields its
  per-step output together with its final state(s). The final state(s) of one
  layer are used as the initial state of the next layer in the stack.
  """

  def __init__(self, layer_name, evsize, embedd_size, nlayers, num_encoder, bsize, dropout=0.0):
    """Create the embedding and the recurrent stack.

    Args:
      layer_name: Recurrent layer type understood by `get_layer`.
      evsize: First argument of the `Embedding` layer (stored as
        `vocab_size`).
      embedd_size: Second argument of the `Embedding` layer.
      nlayers: Number of stacked recurrent layers; must be at least 1.
      num_encoder: Number of units of every recurrent layer.
      bsize: Default batch size used by `initialize_hidden_state`.
      dropout: Dropout value forwarded to every recurrent layer.

    Raises:
      ValueError: If `nlayers` is smaller than 1, since the forward pass needs
        at least one recurrent layer.
    """
    if nlayers < 1:
      raise ValueError(f"nlayers must be at least 1, got {nlayers!r}.")
    super().__init__()
    self.bsize = bsize
    self.nlayers = nlayers
    self.layer_name = layer_name
    self.num_encoder = num_encoder
    self.vocab_size = evsize
    self.embedding = Embedding(evsize, embedd_size)
    self.layer_list = [
        get_layer(layer_name, num_encoder, dropout, True, True)
        for _ in range(nlayers)
    ]

  def call(self, x, hidden=None):
    """Embed `x` and run it through the recurrent stack.

    Args:
      x: Input passed to the embedding layer (presumably integer token ids of
        shape (batch, time) - TODO confirm against the caller).
      hidden: Optional initial state for the first recurrent layer.

    Returns:
      `(output, state)` as returned by `from_embedd`.
    """
    return self.from_embedd(self.embedding(x), hidden)

  def initialize_hidden_state(self, bsize=-1):
    """Return an all-zero initial state for the first recurrent layer.

    Args:
      bsize: Batch size of the state; -1 selects the batch size given to the
        constructor.

    Returns:
      A list of zero tensors of shape `(bsize, num_encoder)`: two entries when
      `layer_name` is "LSTM", one entry otherwise.
    """
    if bsize == -1:
      bsize = self.bsize
    zeros = tf.zeros((bsize, self.num_encoder))
    return [zeros, zeros] if self.layer_name == "LSTM" else [zeros]

  def from_embedd(self, x, hidden=None):
    """Run already-embedded input through the recurrent stack.

    Args:
      x: Input fed directly to the first recurrent layer, bypassing the
        embedding layer.
      hidden: Optional initial state for the first recurrent layer.

    Returns:
      `(output, state)`: the sequence output of the last layer and the list of
      its final state tensor(s).
    """
    output, state = x, hidden
    for layer in self.layer_list:
      # Pass the previous layer's final state explicitly as `initial_state`
      # instead of handing the whole `[output, *states]` list to the layer and
      # relying on Keras to split it implicitly.
      output, *state = layer(output, initial_state=state)
    return output, state
    