# TO DO
1. Figure out dataset
2. Break down into individual characters
3. Form corpus from dataset for input and output characters
4. Assign a number to each character
5. The model starts with an embedding layer, so it consumes the integer-encoded characters directly
6. The model output is a sequence of integers, so it has to be decoded back into characters

In [1]:
!pip install wandb
# !pip install tensorflow
# !pip install keras

Collecting wandb
[?25l  Downloading https://files.pythonhosted.org/packages/5c/ee/d755f9e5466df64c8416a2c6a860fb3aaa43ed6ea8e8e8e81460fda5788b/wandb-0.10.28-py2.py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 8.7MB/s 
Collecting shortuuid>=0.5.0
  Downloading https://files.pythonhosted.org/packages/25/a6/2ecc1daa6a304e7f1b216f0896b26156b78e7c38e1211e9b798b4716c53d/shortuuid-1.0.1-py3-none-any.whl
Collecting sentry-sdk>=0.4.0
[?25l  Downloading https://files.pythonhosted.org/packages/f3/92/5a33be64990ba815364a8f2dd9e6f51de60d23dfddafb4f1fc5577d4dc64/sentry_sdk-1.0.0-py2.py3-none-any.whl (131kB)
[K     |████████████████████████████████| 133kB 49.7MB/s 
Collecting GitPython>=1.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/a6/99/98019716955ba243657daedd1de8f3a88ca1f5b75057c38e959db22fb87b/GitPython-3.1.14-py3-none-any.whl (159kB)
[K     |████████████████████████████████| 163kB 54.5MB/s 
Collecting configparser>=3.8.1
  Downloading https://fil

In [8]:
import tarfile
import os
import pandas as pd
import keras
import numpy as np
import wandb
from keras import layers
from keras.layers import LSTM, Dense, Embedding, Input
from keras.models import Model
from keras.utils.vis_utils import plot_model
from tqdm.auto import tqdm

In [3]:
!wget -nc https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar

# Extract the downloaded archive only when the dataset folder is missing.
# The context manager closes the tar file handle even if extraction fails.
if not os.path.isdir('/content/dakshina_dataset_v1.0'):
  with tarfile.open("/content/dakshina_dataset_v1.0.tar") as archive:
    archive.extractall()

--2021-04-30 10:11:55--  https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.20.128, 74.125.199.128, 74.125.142.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.20.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2008340480 (1.9G) [application/x-tar]
Saving to: ‘dakshina_dataset_v1.0.tar’


2021-04-30 10:12:34 (49.4 MB/s) - ‘dakshina_dataset_v1.0.tar’ saved [2008340480/2008340480]



In [4]:
# Do not hard-code the W&B API key in the notebook: read it from the
# WANDB_API_KEY environment variable instead.
# NOTE(review): the key that used to be written here is exposed in the
# notebook history and should be revoked.
wandb.login(key=os.environ.get('WANDB_API_KEY'))

[34m[1mwandb[0m: W&B API key is configured (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

# Loading Data

In [9]:
class data_loader():
  """Helpers that load the Dakshina lexicons and turn them into model batches."""

  @staticmethod
  def _load_raw_df(languages=None):
    """Read the train/dev/test lexicon files of every requested language.

    Args:
      languages: iterable of language codes used to build the file paths;
        defaults to ["hi", "mr"].

    Returns:
      dict with the keys 'train', 'val' and 'test'. Each value is one
      DataFrame with the columns 'input', 'output' and 'count', concatenated
      over all languages.
    """
    if languages is None:
      languages = ["hi", "mr"]

    column_names = ['input', 'output', 'count']
    # key in the returned dict -> split name used in the lexicon file names
    splits = {'train': 'train', 'val': 'dev', 'test': 'test'}

    lex = dict()
    for div, split in splits.items():
      frames = []
      for la in languages:
        path = '/content/dakshina_dataset_v1.0/'+la+'/lexicons/'+la+'.translit.sampled.'+split+'.tsv'
        # keep_default_na=False: words spelled like "nan" or "null" must stay
        # strings instead of being parsed as missing values.
        frames.append(pd.read_csv(path, sep='\t', header=None, names=column_names, keep_default_na=False))
      lex[div] = pd.concat(frames)

    return lex

  @staticmethod
  def _make_final_df(lex):
    """Keep a single transliteration per input word in every split.

    For each input word the rows with the highest 'count' are kept; among
    those, the rows whose output length is closest to the input length; and
    finally only the first remaining row. The frames stored in `lex` are
    replaced and only the 'input' and 'output' columns are retained.

    Args:
      lex: dict with the keys 'train', 'val' and 'test' holding DataFrames
        with the columns 'input', 'output' and 'count'.

    Returns:
      The same `lex` dict with the filtered DataFrames.
    """
    for div in ['train', 'val', 'test']:
      df = lex[div]

      # removing non max transliterations
      is_most_frequent = df.groupby('input')['count'].transform('max') == df['count']
      df = df[is_most_frequent].reset_index(drop=True)

      # calculating the difference in length between input and output
      input_len = df['input'].astype(str).str.len()
      output_len = df['output'].astype(str).str.len()
      df['mod_dif'] = (input_len - output_len).abs()

      # removing transliterations that vary by a lot in length
      is_closest = df.groupby('input')['mod_dif'].transform('min') == df['mod_dif']
      df = df[is_closest].reset_index(drop=True)

      # removing duplicates if any remain
      df = df.drop_duplicates(subset='input', keep='first')

      # removing redundant columns
      lex[div] = df.drop(columns=['count', 'mod_dif'])

    return lex

  @staticmethod
  def _generate_batch(X, y, data_dict, num_decoder_tokens, batch_size = 1):
    ''' Generate batches of data forever.

    The samples are shuffled once; afterwards the generator cycles over them
    in that order. The last batch of a cycle is zero padded when the number of
    samples is not a multiple of batch_size.

    Args:
      X: list of integer sequences fed to the encoder.
      y: list of integer sequences for the decoder (start token first, stop
        token last), same length as X.
      data_dict: dict providing 'max_source_length' and 'max_target_length',
        the widths of the zero padded arrays.
      num_decoder_tokens: size of the one hot axis of the decoder target.
      batch_size: number of rows per batch.

    Yields:
      ([encoder_input_data, decoder_input_data], decoder_target_data) where
      the decoder input drops the last token of each target and the one hot
      decoder target drops the first one (offset by one timestep).

    Raises:
      ValueError: if X is empty (the generator could never yield a batch).
    '''
    assert len(X)==len(y)
    if len(X) == 0:
      # without this guard the loop below would spin forever without yielding
      raise ValueError('cannot generate batches from an empty dataset')

    ind = np.random.permutation(len(X))
    X,y = [X[j] for j in ind],[y[j] for j in ind]
    max_source_length = data_dict['max_source_length']
    max_target_length = data_dict['max_target_length']

    while True:
      for j in range(0, len(X), batch_size):
        encoder_input_data = np.zeros((batch_size, max_source_length),dtype='float32')
        decoder_input_data = np.zeros((batch_size, max_target_length),dtype='float32')
        decoder_target_data = np.zeros((batch_size, max_target_length, num_decoder_tokens),dtype='float32')
        for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
          for t, word in enumerate(input_text):
            encoder_input_data[i, t] = word
          last = len(target_text)-1
          for t, word in enumerate(target_text):
            if t < last:
              # decoder input sequence: every token except the last one
              decoder_input_data[i, t] = word
            if t > 0:
              # decoder target sequence (one hot encoded): does not include
              # the first token and is offset by one timestep
              decoder_target_data[i, t - 1, word] = 1.

        yield([encoder_input_data, decoder_input_data], decoder_target_data)

In [10]:
class Tokenizer:
  """Character-level tokenizer with separate input and output vocabularies.

  Each vocabulary starts with the start, stop and unknown tokens, followed by
  the characters in order of first appearance. Ids are the position in the
  vocabulary plus one, so id 0 is never assigned to a token.
  """

  def __init__(self, df):
    """Build both vocabularies from the 'input' and 'output' columns of df."""

    self.start_token = '<STR>'
    self.stop_token = '<STP>'
    self.unknown_token = '<UNK>'

    special_tokens = [self.start_token, self.stop_token, self.unknown_token]
    self.input_corpus = self._build_corpus(special_tokens, df.input.tolist())
    self.output_corpus = self._build_corpus(special_tokens, df.output.tolist())

    self.encode_dict_input = {token: i+1 for i, token in enumerate(self.input_corpus)}
    self.decode_dict_input = {i: token for token, i in self.encode_dict_input.items()}

    self.encode_dict_output = {token: i+1 for i, token in enumerate(self.output_corpus)}
    self.decode_dict_output = {i: token for token, i in self.encode_dict_output.items()}

  @staticmethod
  def _build_corpus(special_tokens, words):
    """Return the special tokens followed by every character of `words`, in order of first appearance."""
    # a dict keeps insertion order and gives constant time membership tests
    vocab = dict.fromkeys(special_tokens)
    for word in words:
      for token in str(word):
        vocab.setdefault(token)
    return list(vocab)

  # takes in lists of words and returns lists of integers
  def encode(self, X, mode='input'):
    """Turn words into integer arrays.

    Args:
      X: iterable of words (each is converted with str()).
      mode: 'input' encodes the characters only; 'output' additionally wraps
        them in the start and stop token ids.

    Returns:
      list with one numpy array of ids per word; characters missing from the
      vocabulary get the id of the unknown token.

    Raises:
      ValueError: if mode is neither 'input' nor 'output'.
    """
    if mode == 'input':
      table = self.encode_dict_input
      unknown = table[self.unknown_token]
      return [np.array([table.get(token, unknown) for token in str(word)]) for word in X]

    if mode == 'output':
      table = self.encode_dict_output
      unknown = table[self.unknown_token]
      start, stop = table[self.start_token], table[self.stop_token]
      return [
          np.array([start] + [table.get(token, unknown) for token in str(word)] + [stop])
          for word in X
      ]

    raise ValueError("mode must be 'input' or 'output', got %r" % (mode,))

  # takes in lists of integers and returns lists of words
  def decode(self, X, mode='input'):
    """Turn integer sequences back into words.

    Args:
      X: iterable of integer sequences.
      mode: 'input' decodes every id; 'output' first drops the first and the
        last element of each sequence (the positions of the start and stop
        tokens produced by encode).

    Returns:
      list of strings; ids without a token (such as 0) decode to ''.

    Raises:
      ValueError: if mode is neither 'input' nor 'output'.
    """
    if mode == 'input':
      table = self.decode_dict_input
      return [''.join(table.get(integer, '') for integer in integers) for integers in X]

    if mode == 'output':
      table = self.decode_dict_output
      return [''.join(table.get(integer, '') for integer in integers[1:-1]) for integers in X]

    raise ValueError("mode must be 'input' or 'output', got %r" % (mode,))

In [11]:
def return_data_dict(languages=None, batch_size=32):
  """Load the lexicons and prepare everything the model needs.

  Args:
    languages: language codes passed to data_loader._load_raw_df; defaults to
      ['hi', 'mr'].
    batch_size: batch size of the three batch generators.

  Returns:
    dict with the keys
      'in_size' / 'out_size': vocabulary sizes plus one,
      'train' / 'val' / 'test': dicts holding 'df' (the filtered DataFrame),
        'max_source_length', 'max_target_length' (longest encoded sequence of
        that split) and 'batch' (an endless batch generator),
      'tokenizer': the Tokenizer fitted on the training split.
  """
  if languages is None:
    languages = ['hi', 'mr']

  lex = data_loader._load_raw_df(languages)
  lex = data_loader._make_final_df(lex)

  # the vocabulary is built from the training split only
  tk = Tokenizer(lex['train'])

  data_dict = dict()
  data_dict['in_size'] = len(tk.input_corpus) + 1
  data_dict['out_size'] = len(tk.output_corpus) + 1

  for div in ['train', 'val', 'test']:
    df = lex[div]
    X = tk.encode(df.input.tolist(), mode='input')
    Y = tk.encode(df.output.tolist(), mode='output')

    split = dict()
    split['df'] = df
    # every split is padded to the longest sequence of that same split
    split['max_source_length'] = np.max(np.array([len(x) for x in X]))
    split['max_target_length'] = np.max(np.array([len(x) for x in Y]))
    split['batch'] = data_loader._generate_batch(X, Y, split, data_dict['out_size'], batch_size)
    data_dict[div] = split

  data_dict['tokenizer'] = tk

  return data_dict

In [None]:
# Build the datasets, batch generators and tokenizer once with the default
# languages and batch size; the parameter dicts further down reference it.
data_dict = return_data_dict()

In [None]:
# data = dict()
# data['train'] = dict()
# data['train']['X'] = X_train
# data['train']['Y'] = Y_train
# data['in_size'] = len(tk.input_corpus) + 1
# data['out_size'] = len(tk.output_corpus) + 1
# num_decoder_tokens = data['out_size'] 
# max_source_length = np.max(np.array([len(x) for x in X_train]))
# max_target_length = np.max(np.array([len(x) for x in Y_train]))

# Question 1


In [12]:
class BeamSearchCallBack(keras.callbacks.Callback):
  """Keras callback that prints the exact-match beam search accuracy after every epoch.

  Args:
    details: dict describing the training model; the keys 'encoder_inputs',
      'encoder_states', 'decoder_inputs', 'decoder_embedding',
      'decoder_layers', 'decoder_dense' and 'params' (with 'rep_size') are read.
    test_data: dict of the evaluated split with the keys 'df',
      'max_source_length' and 'max_target_length'.
    tokenizer: object with encode/decode used to convert words and ids.
    out_size: size of the one hot axis handed to the batch generator.
    num_val: number of samples decoded per epoch.
    beam: beam width.
  """

  def __init__(self,details,test_data,tokenizer,out_size,num_val=200,beam=3) :
    super().__init__()
    self.details = details
    self.test_data = test_data
    self.tokenizer = tokenizer
    self.out_size = out_size
    self.num_val = num_val
    self.beam = beam
    # (encoder_model, decoder_model), built on first use
    self._inference_models = None

  def _build_inference_models(self) :
    """Build the step-wise encoder and decoder models from the layers of the training model."""
    encoder_model = Model(self.details['encoder_inputs'], self.details['encoder_states'])

    rep_size = self.details['params']['rep_size']
    # one state input per encoder state tensor
    decoder_state_input = [Input(shape=(rep_size,)) for _ in self.details['encoder_states']]

    decoder_inputs = self.details['decoder_inputs']
    x = self.details['decoder_embedding'](decoder_inputs)

    # NOTE(review): every decoder layer receives the same state inputs and only
    # the states of the last layer are returned; with more than one decoder
    # layer the per-layer states are therefore not carried between steps.
    for layer in self.details['decoder_layers'] :
      x, *decoder_states = layer(x,initial_state=decoder_state_input)

    x = self.details['decoder_dense'](x)
    decoder_model = Model(
        [decoder_inputs] + decoder_state_input,
        [x] + decoder_states )
    return encoder_model, decoder_model

  def on_epoch_end(self,epoch,logs=None) :
    """Decode num_val samples with beam search and print the share of exact matches."""
    # the inference models reuse the training layers, so building them once is
    # enough: they always see the current weights
    if self._inference_models is None :
      self._inference_models = self._build_inference_models()
    encoder_model, decoder_model = self._inference_models

    inp = self.tokenizer.encode(self.test_data['df'].input.tolist())
    out = self.tokenizer.encode(self.test_data['df'].output.tolist(),mode='output')
    # default batch size of 1: one word per step
    val_gen = data_loader._generate_batch(inp,out,self.test_data,self.out_size)

    correct = 0
    for _step in tqdm(range(self.num_val)) :
      (input_seq,ans) , _ = next(val_gen)
      _,best = decode_sequence_beam(input_seq,self.beam,encoder_model,decoder_model,self.tokenizer,self.test_data['max_target_length'])
      w1 = self.tokenizer.decode(best,mode='output')
      w2 = self.tokenizer.decode(ans,mode='output')
      if w1==w2 :
        correct += 1

    # divide by the number of decoded samples, not by the size of the split
    acc = correct / self.num_val if self.num_val else 0.0
    if logs is not None :
      # same name as the metric of the sweep configuration
      logs['Val_acc'] = acc
    print("Val Accuracy : "+str(acc))
    

In [13]:
def decode_sequence_beam(input_seq,k,encoder_model,decoder_model,tk,max_target_length=20,getall=False):
    """Decode one input sequence with a beam search of width k.

    Args:
      input_seq: encoder input for a single sample.
      k: beam width (number of hypotheses kept after every step).
      encoder_model: object whose predict() returns the initial decoder state(s).
      decoder_model: object whose predict([token] + states) returns the token
        probabilities followed by the new state(s).
      tk: tokenizer; only used (tk.decode) when getall is True.
      max_target_length: a hypothesis is no longer extended once it holds more
        than this many ids (start token included).
      getall: when True the hypotheses are returned as decoded strings.

    Returns:
      (final_words, best_word). final_words holds the k surviving hypotheses
      as lists of token ids (or as strings when getall is True). best_word is a
      list with the single most probable hypothesis as token ids; it is empty
      when every hypothesis has probability 0.
    """
    # ids given to the start and stop tokens by the Tokenizer of this notebook
    START_ID, STOP_ID = 1, 2

    def _is_open(seq):
        # a hypothesis keeps growing until it emits the stop token or gets too long
        return seq[-1] != STOP_ID and len(seq) <= max_target_length

    # Encode the input as state vectors; cells with a single state return a
    # bare array, so normalise to a flat list of states.
    states_value = encoder_model.predict(input_seq, batch_size=1)
    if isinstance(states_value, (list, tuple)):
        states_value = list(states_value)
    else:
        states_value = [states_value]

    # Target sequence of length 1 holding the token fed to the decoder.
    target_seq = np.zeros((1,1))
    target_seq[0, 0] = START_ID

    # First step: expand the start token into the k most probable tokens.
    results, *states = decoder_model.predict([target_seq] + states_value)
    probs = np.array(results[0, -1, :])
    ind = np.argpartition(probs, -k)[-k:]
    bestk_prob = probs[ind]
    bestk_tot = [[START_ID, int(idx)] for idx in ind]
    states_values_k = [states for _ in range(k)]
    # a hypothesis that already ends with the stop token must not be extended
    run_condition = [_is_open(seq) for seq in bestk_tot]

    while any(run_condition):
        cand_tot = []
        cand_prob = []
        cand_states = []
        for i in range(k):
            a = bestk_tot[i]
            b = bestk_prob[i]
            if not run_condition[i]:
                # finished hypotheses compete unchanged with the new candidates
                cand_tot.append(a)
                cand_prob.append(b)
                cand_states.append(states_values_k[i])
                continue

            target_seq[0, 0] = a[-1]
            results, *states = decoder_model.predict([target_seq] + states_values_k[i], batch_size=1)
            probs = np.array(results[0, -1, :])
            ind = np.argpartition(probs, -k)[-k:]
            for idx in ind:
                cand_tot.append(a + [int(idx)])
                # score of a hypothesis = product of its token probabilities
                cand_prob.append(b * probs[idx])
                cand_states.append(states)

        # keep the k most probable candidates
        cand_prob = np.array(cand_prob)
        keep = np.argpartition(cand_prob, -k)[-k:]
        bestk_tot = [cand_tot[i] for i in keep]
        states_values_k = [cand_states[i] for i in keep]
        bestk_prob = cand_prob[keep]
        run_condition = [_is_open(seq) for seq in bestk_tot]

    final_words = list(bestk_tot)
    best_word = []
    best = 0.0
    for a, b in zip(bestk_tot, bestk_prob):
        if b > best:
            best = b
            best_word = [a]

    if getall :
      return (tk.decode(final_words,'output'),best_word)
    else :
      return final_words,best_word

In [14]:
class rnn():
  """Encoder-decoder character model built from Keras recurrent layers."""

  def __init__(self, params):
    """Build the training model and keep references to its parts.

    Args:
      params: dict with the keys 'num_encode_layers', 'num_decode_layers',
        'cell_type' (name of a layer class in keras.layers, e.g. 'LSTM'),
        'dropout', 'embed_size', 'rep_size' and 'data_dict' (which provides
        'in_size' and 'out_size').
    """
    num_encode_layers = params['num_encode_layers']
    num_decode_layers = params['num_decode_layers']
    in_size = params['data_dict']['in_size']
    out_size = params['data_dict']['out_size']
    cell_type = params['cell_type']
    dropout = params['dropout']
    embed_size = params['embed_size']
    rep_size = params['rep_size']

    # recurrent layer class selected by name
    recurrent = getattr(layers, cell_type)

    ###################### ENCODER NETWORK ######################

    encoder_inputs = Input(shape=(None,))
    x = Embedding(in_size, embed_size ,mask_zero=True)(encoder_inputs)

    encoder_layers = []

    # all encoder layers but the last pass their full sequence on
    for j in range(num_encode_layers-1) :
      curr_layer = recurrent(rep_size, dropout=dropout, return_sequences=True)
      encoder_layers.append(curr_layer)
      x = curr_layer(x)

    # the last encoder layer provides the state(s) that initialise the decoder
    curr_layer = recurrent(rep_size, dropout=dropout, return_state=True)
    encoder_layers.append(curr_layer)
    x, *encoder_states = curr_layer(x)

    ###################### DECODER NETWORK ######################

    decoder_inputs = Input(shape=(None,))

    decoder_embedding =  Embedding(out_size, embed_size, mask_zero=True)
    x = decoder_embedding(decoder_inputs)

    decoder_layers = []

    # every decoder layer starts from the encoder states
    for j in range(num_decode_layers) :
      curr_layer = recurrent(rep_size,dropout=dropout,return_state=True, return_sequences=True)
      decoder_layers.append(curr_layer)
      x, *decoder_states = curr_layer(x, initial_state=encoder_states)

    decoder_dense = Dense(units=out_size, activation='softmax')
    decoder_outputs = decoder_dense(x)

    # Define the model that will turn
    # `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

    self.model = model
    self.encoder_inputs = encoder_inputs
    self.encoder_layers = encoder_layers
    self.decoder_inputs = decoder_inputs
    self.decoder_embedding = decoder_embedding
    self.decoder_layers = decoder_layers
    self.decoder_dense = decoder_dense
    self.encoder_states = encoder_states
    self.params = params
    # everything the beam search callback needs to build inference models
    self.details = {
        'model' : self.model,
        'encoder_inputs' : self.encoder_inputs,
        'encoder_layers' :self.encoder_layers ,
        'decoder_inputs' :self.decoder_inputs ,
        'decoder_embedding' : self.decoder_embedding,
        'decoder_layers' : self.decoder_layers,
        'decoder_dense' : self.decoder_dense,
        'encoder_states' : self.encoder_states ,
        'params' :self.params
    }

  def compile_and_fit(self, data_dict, params):
    """Compile the model, train it on the training generator and return the history.

    Args:
      data_dict: dict with 'train' and 'val' entries (each providing 'df';
        'train' also 'batch'), 'tokenizer' and 'out_size'.
      params: dict with 'batch_size', 'num_epochs', 'beam_width' and
        'val_test_samples'.

    Returns:
      dict with the key 'run_details' holding the object returned by fit().
    """
    self.model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

    self.model.summary()
    plot_model(self.model, show_shapes=True)

    train_samples = len(data_dict['train']['df']) # Total Training samples
    batch_size = params['batch_size']
    num_epochs = params['num_epochs']
    beam = params['beam_width']
    num_val_samples = params['val_test_samples']

    beam_callback = BeamSearchCallBack(self.details,data_dict['val'],data_dict['tokenizer'],data_dict['out_size'],num_val=num_val_samples,beam=beam)

    # Model.fit accepts generators directly; fit_generator is deprecated.
    # The generator is endless, so steps_per_epoch defines the epoch length.
    run_details = self.model.fit(data_dict['train']['batch'],
                                 steps_per_epoch = train_samples//batch_size,
                                 epochs=num_epochs,
                                 callbacks=[beam_callback
                                            #,wandb.keras.WandbCallback()
                                            ]
                                 )

    return {
        'run_details' : run_details
    }



In [30]:
class tools:
  """Namespace for helpers used by the sweep."""

  @staticmethod
  def init_params(config,data_dict):
    """Translate a sweep configuration into the parameter dict used by rnn.

    Args:
      config: object exposing the sweep options as attributes
        (num_encode_layers, num_decode_layers, cell_type, rep_size,
        embed_size, dropout, num_epochs, batch_size, beam, num_val_samples).
      data_dict: stored unchanged under the key 'data_dict'.

    Returns:
      dict of parameters; `beam` is stored as 'beam_width' and
      `num_val_samples` as 'val_test_samples'.
    """
    # returning parameters
    params = {
        'num_encode_layers' : config.num_encode_layers,
        'num_decode_layers' : config.num_decode_layers,
        'cell_type' : config.cell_type,
        'rep_size' : config.rep_size,
        'embed_size' : config.embed_size,
        'dropout' : config.dropout,
        'num_epochs' : config.num_epochs,
        'data_dict' : data_dict,
        'batch_size' : config.batch_size,
        'beam_width' : config.beam,
        'val_test_samples' : config.num_val_samples
    }
    return params

In [29]:
# sweep configuration
# search space: option name -> candidate values
_sweep_values = {
    'cell_type': ['LSTM', 'GRU', 'SimpleRNN'],
    'beam': [2, 3, 4, 5],
    'num_val_samples': [200],
    'embed_size': [16, 32, 64],
    'rep_size': [32, 64, 128, 256],
    'dropout': [0, 0.2, 0.4, 0.5],
    'batch_size': [32],
    'num_epochs': [25],
    'num_encode_layers': [1, 2, 3, 4, 5],
    'num_decode_layers': [1, 2, 3, 4, 5],
}

# bayesian search that maximizes the metric named 'Val_acc'
sweep_config = {
    'method': 'bayes',
    'metric': {'name': 'Val_acc', 'goal': 'maximize'},
    'parameters': {
        option: {'values': candidates}
        for option, candidates in _sweep_values.items()
    },
}

In [31]:
sweep_id = wandb.sweep(sweep_config, project='dakshina_v1')

Create sweep with ID: 5to80pdt
Sweep URL: https://wandb.ai/ramkamal/dakshina_v1/sweeps/5to80pdt


In [34]:
class sweep_module:
  """Entry points executed by the W&B sweep agent."""

  @staticmethod
  def train(config=None):
    """Build, train and evaluate one model with the configuration of the current run.

    Args:
      config: optional configuration handed to wandb.init; the values actually
        used are read back from wandb.config.
    """
    # imported here because the notebook's import cell does not provide shutil
    import shutil

    # config has to be passed by keyword: the first positional parameter of
    # wandb.init is not the configuration
    with wandb.init(config=config):

      config = wandb.config

      #wandb.run.name = 'fil:'+str(config['num_filters_'])+'_type:'+config['type_of_filters'][0]+'_aug:'+str(config['augmentation'])[0]+'_dro:'+str(config['dropout'])[0]
      data_dict = return_data_dict(batch_size=config.batch_size)
      params = tools.init_params(config,data_dict)
      network = rnn(params)
      network.compile_and_fit(data_dict, params)

      # NOTE(review): this removes the local wandb directory while the run is
      # still open — confirm that nothing unsynced is lost.
      if os.path.isdir('/content/wandb'):
        shutil.rmtree('/content/wandb')

In [None]:
sweep_id = '7g0porer'

In [None]:
# performing the sweep
wandb.agent(sweep_id, sweep_module.train)

[34m[1mwandb[0m: Agent Starting Run: mrcgtrjy with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam: 2
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	embed_size: 32
[34m[1mwandb[0m: 	num_decode_layers: 4
[34m[1mwandb[0m: 	num_encode_layers: 3
[34m[1mwandb[0m: 	num_epochs: 25
[34m[1mwandb[0m: 	num_val_samples: 200
[34m[1mwandb[0m: 	rep_size: 128


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 32)     2304        input_1[0][0]                    
__________________________________________________________________________________________________
gru (GRU)                       (None, None, 128)    62208       embedding[0][0]                  
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
______________________________________________________________________________________________



Epoch 1/25


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))

# Run a Single Model Separately

In [None]:
# Small single-epoch configuration with the same keys as `params` below.
# NOTE(review): no later cell of this notebook references `params1`.
params1 = {
    'num_encode_layers' : 2,
    'num_decode_layers' : 1,
    'cell_type' : 'LSTM', # SimpleRNN, LSTM
    'rep_size' : 20,
    'embed_size' : 20,
    'dropout' : 0,
    'num_epochs' : 1,
    'data_dict' : data_dict,
    'batch_size' : 32,
    'beam_width' : 3,
    'val_test_samples' : 200
}

In [None]:
# Train one model outside of the sweep with hand-picked hyper-parameters.
params = {
    'num_encode_layers' : 2,
    'num_decode_layers' : 1,
    'cell_type' : 'LSTM', # name of a keras.layers class, e.g. SimpleRNN, LSTM
    'rep_size' : 20,     # units of every recurrent layer
    'embed_size' : 20,   # embedding dimension
    'dropout' : 0,
    'num_epochs' : 10,
    'data_dict' : data_dict,
    # only used to compute steps_per_epoch; the generators in data_dict were
    # already created with their own batch size
    'batch_size' : 32,
    'beam_width' : 3,
    'val_test_samples' : 200  # samples decoded by the callback after each epoch
}
network = rnn(params)
plot_model(network.model, show_shapes=True)
network.compile_and_fit(data_dict, params)

Model: "model_23"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_17 (InputLayer)           [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, None, 20)     1440        input_17[0][0]                   
__________________________________________________________________________________________________
input_18 (InputLayer)           [(None, None)]       0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     (None, None, 20)     3280        embedding_6[0][0]                
___________________________________________________________________________________________





100%|██████████| 200/200 [03:01<00:00,  1.10it/s]

Val Accuracy : 0.0
Epoch 2/10
   8/1420 [..............................] - ETA: 23s - loss: 0.8358 - acc: 0.3905






100%|██████████| 200/200 [03:15<00:00,  1.02it/s]

Val Accuracy : 0.0
Epoch 3/10
  10/1420 [..............................] - ETA: 24s - loss: 0.6853 - acc: 0.4663






100%|██████████| 200/200 [03:15<00:00,  1.02it/s]

Val Accuracy : 0.0
Epoch 4/10
  10/1420 [..............................] - ETA: 25s - loss: 0.5823 - acc: 0.5497






100%|██████████| 200/200 [03:13<00:00,  1.03it/s]

Val Accuracy : 0.0
Epoch 5/10
   9/1420 [..............................] - ETA: 22s - loss: 0.5297 - acc: 0.5939






100%|██████████| 200/200 [03:06<00:00,  1.07it/s]

Val Accuracy : 0.0
Epoch 6/10
   8/1420 [..............................] - ETA: 22s - loss: 0.4980 - acc: 0.6172






100%|██████████| 200/200 [03:06<00:00,  1.07it/s]

Val Accuracy : 0.0
Epoch 7/10
  10/1420 [..............................] - ETA: 23s - loss: 0.4813 - acc: 0.6356






100%|██████████| 200/200 [03:10<00:00,  1.05it/s]

Val Accuracy : 0.0
Epoch 8/10
  11/1420 [..............................] - ETA: 23s - loss: 0.4144 - acc: 0.6783






100%|██████████| 200/200 [03:11<00:00,  1.04it/s]

Val Accuracy : 0.0
Epoch 9/10
   9/1420 [..............................] - ETA: 22s - loss: 0.3947 - acc: 0.6927






100%|██████████| 200/200 [03:16<00:00,  1.02it/s]

Val Accuracy : 0.0
Epoch 10/10
   8/1420 [..............................] - ETA: 23s - loss: 0.3487 - acc: 0.7154






100%|██████████| 200/200 [03:17<00:00,  1.01it/s]

Val Accuracy : 0.0





{'run_details': <tensorflow.python.keras.callbacks.History at 0x7f595b7b7510>}

In [None]:
# Stand-alone version of the encoder-decoder graph with fixed sizes
# (embedding 64, two LSTM encoder layers and one LSTM decoder layer of 256 units).
# NOTE(review): `model` is neither compiled nor trained in the visible cells.
encoder_inputs = Input(shape=(None,))
x = Embedding(data_dict['in_size'], 64,mask_zero=True)(encoder_inputs)
x = LSTM(units=256,return_sequences=True)(x)
# the last encoder layer returns its states, which initialise the decoder
x, *encoder_states = LSTM(units=256,
                           return_state=True)(x)

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))

decoder_embedding =  Embedding(data_dict['out_size'], 64,mask_zero=True)
x = decoder_embedding(decoder_inputs)

decoder_LSTM = LSTM(units=256, return_sequences=True, return_state=True)
x, *decoder_states = decoder_LSTM(x, initial_state=encoder_states)

# softmax over the output vocabulary at every timestep
decoder_dense = Dense(units=data_dict['out_size'], activation='softmax')
decoder_outputs = decoder_dense(x)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [None]:
# Inference models that reuse the layers defined in the previous cell.
# Encoder: input sequence -> final encoder states
encoder_model = Model(encoder_inputs, encoder_states)
# Decoder setup
# Below tensors will hold the states of the previous time step
# (256 matches the number of units of decoder_LSTM)
decoder_state_input_h = Input(shape=(256,))
decoder_state_input_c = Input(shape=(256,))
decoder_state_input = [decoder_state_input_h, decoder_state_input_c]
# Get the embeddings of the decoder sequence
dec_emb2 = decoder_embedding(decoder_inputs)
# To predict the next token in the sequence, set the initial states to the states from the previous time step
decoder_outputs_2, *decoder_states_2 = decoder_LSTM(dec_emb2
                                                    ,initial_state=decoder_state_input
                                                    )
# A dense softmax layer to generate prob dist. over the target vocabulary
decoder_outputs_2 = decoder_dense(decoder_outputs_2)
# Final decoder model: (token, previous states) -> (probabilities, new states)
decoder_model = Model(
    [decoder_inputs] + decoder_state_input,
    [decoder_outputs_2] + decoder_states_2)

In [None]:
def decode_sequence(input_seq, max_target_length=100):
    """Greedily decode one input sequence with the module-level inference models.

    Uses the globals `encoder_model`, `decoder_model` and `tk`.
    NOTE(review): `tk` is not assigned at module level in the visible cells —
    confirm it exists (e.g. data_dict['tokenizer']) before calling.

    Args:
      input_seq: encoder input for a single sample (batch of size 1).
      max_target_length: upper bound on the number of generated ids, so that
        decoding terminates even if the stop token is never predicted.

    Returns:
      The decoded output word as a string.
    """
    # ids given to the start and stop tokens by the Tokenizer of this notebook
    START_ID, STOP_ID = 1, 2

    # Encode the input as state vectors; cells with a single state return a
    # bare array, so normalise to a list of states.
    states_value = encoder_model.predict(input_seq)
    if not isinstance(states_value, list):
        states_value = [states_value]

    # Target sequence of length 1, populated with the start token.
    target_seq = np.zeros((1,1))
    target_seq[0, 0] = START_ID

    chars = [START_ID]
    while True:
        output_tokens, *states_value = decoder_model.predict([target_seq] + states_value)

        # Greedy choice: most probable next token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        chars.append(sampled_token_index)
        if sampled_token_index == STOP_ID or len(chars) > max_target_length:
            break

        # Feed the chosen token back in; the states were updated above
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

    if chars[-1] != STOP_ID:
        # tk.decode drops the last id of an output sequence, so close a
        # truncated sequence with the stop token to keep its last character
        chars.append(STOP_ID)
    return tk.decode([chars],'output')[0]

In [None]:
plot_model(decoder_model,show_shapes=True)

NameError: ignored

In [None]:
# NOTE(review): `generate_batch`, `X_test` and `Y_test` are not defined at
# module level in the visible cells; the batch generator defined in this
# notebook is data_loader._generate_batch, which takes additional arguments.
test_gen = generate_batch(X_test, Y_test, batch_size = 1)
# index of the test sample shown by the cells below; starts at -1 and is
# incremented before a sample is decoded
k=-1

In [None]:
x1 = tk.encode(['टोप्पर​'])
x1

In [None]:
decode_sequence(x1)

In [None]:
lex['test'].input.tolist()[k]

In [None]:
k += 1
(input_seq, actual_output), _ = next(test_gen)
# print(input_seq)
decoded_sentence = decode_sequence(input_seq)
print('Input Source sentence:', tk.decode([X_test[k]])[0] )
print('Actual Target Translation:', tk.decode([Y_test[k]],mode='output')[0])
print('Predicted Target Translation:', decoded_sentence)

# Romanized

In [None]:
# Tamil romanized files: dictionary key -> path of the tab separated file
_ta_rom_paths = {
    'rejoined': '/content/dakshina_dataset_v1.0/ta/romanized/ta.romanized.rejoined.tsv',
    'rejoined_aligned_cased': '/content/dakshina_dataset_v1.0/ta/romanized/ta.romanized.rejoined.aligned.cased_nopunct.tsv',
    'rejoined_aligned': '/content/dakshina_dataset_v1.0/ta/romanized/ta.romanized.rejoined.aligned.tsv',
    'split': '/content/dakshina_dataset_v1.0/ta/romanized/ta.romanized.split.tsv',
}

# read every file without a header row, tolerating malformed lines
ta_rom = dict()
for _name, _path in _ta_rom_paths.items():
  ta_rom[_name] = pd.read_csv(_path, sep='\t', header=None, error_bad_lines=False)

In [None]:
list(ta_rom['rejoined'].iloc[0, 0])[:10]

In [None]:
ta_rom['rejoined_aligned_cased']

In [None]:
ta_rom['rejoined_aligned']

In [None]:
ta_rom['split']

In [None]:
l1 = [1,4,2,3]
l2 = [1,4,2,5]
print(np.array(l1[1:-1])==np.array(l2[1:-1]))

[ True  True]
