<a href="https://colab.research.google.com/github/ramkamal2000/cs6910_assignment3/blob/main/rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mount Drive

In [12]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Installing Required Packages

In [13]:
!pip install editdistance
!pip install wandb



# Importing Required Libraries

In [14]:
import tarfile
import os
import pandas as pd
import editdistance
import keras
import numpy as np
import wandb
import tensorflow as tf
import shutil
import pickle
from keras import layers
from keras.layers import LSTM, Dense, Embedding, Input, TimeDistributed, Dropout
from keras.models import Model, save_model, load_model
from keras.utils.vis_utils import plot_model
from tqdm.auto import tqdm
from keras.layers import Lambda
from keras import backend as K
from math import ceil
from pprint import pprint

# Setting Current Directory

In [15]:
dir = '/content'
# dir = '/kaggle/working'

# Downloading Dataset

In [16]:
!wget -nc https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar

if not os.path.isdir(dir + '/dakshina_dataset_v1.0'):
  tarfile.open(dir + '/dakshina_dataset_v1.0.tar').extractall()

File ‘dakshina_dataset_v1.0.tar’ already there; not retrieving.



# Logging Onto wandb

In [17]:
wandb.login(key='14394907543f59ea21931529e34b4d80d2ca8c9c', force=True)

[34m[1mwandb[0m: Currently logged in as: [33mramkamal[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

# Loading Data

In [22]:
class data_loader():

  @staticmethod
  def _load_raw_df(languages = ['ta']):
    lex = dict()
    lex['train'], lex['val'], lex['test'] = [], [], [] 
    column_names = ['output', 'input', 'count']
    
    for la in languages:
      lex['train'].append(pd.read_csv(dir + '/dakshina_dataset_v1.0/'+la+'/lexicons/'+la+'.translit.sampled.train.tsv', sep='\t', header=None, names=column_names))
      lex['val'].append(pd.read_csv(dir + '/dakshina_dataset_v1.0/'+la+'/lexicons/'+la+'.translit.sampled.dev.tsv', sep='\t', header=None, names=column_names))
      lex['test'].append(pd.read_csv(dir + '/dakshina_dataset_v1.0/'+la+'/lexicons/'+la+'.translit.sampled.test.tsv', sep='\t', header=None, names=column_names))

    lex['train'] = pd.concat(lex['train'])
    lex['val'] = pd.concat(lex['val'])
    lex['test'] = pd.concat(lex['test'])

    return lex    

  @staticmethod
  def _make_final_df(lex):
    
    for div in ['train', 'val']:
    
      # removing non max transliterations
      idx = lex[div].groupby(['input'])['count'].transform(max) == lex[div]['count']
      lex[div] = lex[div][idx].reset_index(drop=True)

      # calclulating difference in lengths of various transliterations
      lex[div]['input_len'] = lex[div].apply(lambda x: len(str(x['input'])), axis=1)
      lex[div]['output_len'] = lex[div].apply(lambda y: len(str(y['output'])), axis=1)
      lex[div]['mod_dif'] = lex[div].apply(lambda z: abs(z['input_len'] - z['output_len']), axis=1) 

      # removing transliterations that vary by a lot in length
      idx = lex[div].groupby(['input'])['mod_dif'].transform(min) == lex[div]['mod_dif']
      lex[div] = lex[div][idx].reset_index(drop=True)

      # removing duplicates if any remain
      lex[div].drop_duplicates(subset='input', keep='first', inplace=True)

      # removing redundant columns
      lex[div].drop(labels=['count', 'input_len', 'output_len', 'mod_dif'], inplace=True, axis=1)

      # shuffling the dataset i.e. rows of the dataset
      lex[div] = lex[div].sample(frac=1, random_state=6910)
      lex[div] = lex[div][idx].reset_index(drop=True)

    lex['test'] = lex['test'].sample(frac=1, random_state=6910)
    lex['test'].drop(labels=['count'], axis=1, inplace=True)
    lex['test'] = lex['test'].reset_index(drop=True)
    return lex

  @staticmethod
  def _generate_batch(X, y, data_dict, num_decoder_tokens, batch_size = 1):

    while True:
        for j in range(0, len(X), batch_size):
            
            # placeholder data structures
            encoder_input_data = np.zeros((batch_size, data_dict['max_source_length']),dtype='float32')
            decoder_input_data = np.zeros((batch_size, data_dict['max_target_length']),dtype='float32')
            decoder_target_data = np.zeros((batch_size, data_dict['max_target_length'], num_decoder_tokens),dtype='float32')

            # assessing one batch at a time
            for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):

                for t, word in enumerate(input_text):
                  encoder_input_data[i, t] = word
                for t, word in enumerate(target_text):
                    if t<len(target_text)-1:
                        # decoder input sequence
                        # does not include the <EOW> token
                        decoder_input_data[i, t] = word 
                    if t>0:
                        # decoder target sequence (one hot encoded)
                        # does not include the <SOW> token
                        decoder_target_data[i, t - 1, word] = 1.
                    
            yield([encoder_input_data, decoder_input_data], decoder_target_data)

  @staticmethod
  def _generate_batch_greedy(X, y, data_dict, num_decoder_tokens, batch_size = 1):

    while True:
        for j in range(0, len(X), batch_size):

            # placeholder data structures
            encoder_input_data = np.zeros((batch_size, data_dict['max_source_length']),dtype='float32')
            decoder_input_data = np.zeros((batch_size, 1),dtype='float32')
            decoder_target_data = np.zeros((batch_size, data_dict['max_target_length'], num_decoder_tokens),dtype='float32')
            
            # assessing one batch at a time
            for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
                for t, word in enumerate(input_text):
                  encoder_input_data[i, t] = word
                for t, word in enumerate(target_text):
                    if t==0 :
                        decoder_input_data[i, t] = 1 # decoder input seq
                    if t>0:
                        # decoder target sequence (one hot encoded)
                        # does not include the START_ token
                        decoder_target_data[i, t - 1, word] = 1.
                    
            yield([encoder_input_data, decoder_input_data], decoder_target_data)

In [23]:
class Tokenizer:

  def __init__(self, df):

    self.start_token = '<SOW>'
    self.stop_token = '<EOW>'
    self.unknown_token = '<UNK>'

    self.input_corpus = [self.start_token, self.stop_token, self.unknown_token]
    self.output_corpus = [self.start_token, self.stop_token, self.unknown_token]

    input_words = df.input.tolist()
    output_words = df.output.tolist()

    for word in input_words:
      tokens = str(word)
      for token in tokens:
        if token not in self.input_corpus:
          self.input_corpus.append(token)

    for word in output_words:
      tokens = str(word)
      for token in tokens:
        if token not in self.output_corpus:
          self.output_corpus.append(token)
    
    self.encode_dict_input = {self.input_corpus[i] : i+1 for i in range(len(self.input_corpus))}
    self.decode_dict_input = {k:v for v,k in self.encode_dict_input.items()}
    
    
    self.encode_dict_output = {self.output_corpus[i] : i+1 for i in range(len(self.output_corpus))}
    self.decode_dict_output = {k:v for v,k in self.encode_dict_output.items()}
    self.decode_dict_output.update({2:''})

  # takes in lists of words and returns lists of integers
  def encode(self, X, mode='input'):

    if (mode=='input'):
      input_list = []
      for word in X:
        word = str(word)
        integer_list =np.array([self.encode_dict_input.get(token, self.encode_dict_input[self.unknown_token]) for token in word])
        input_list.append(integer_list)
      
      return input_list
    
    if (mode=='output'):
      output_list = []
      for word in X:
        word = str(word)
        integer_list = np.array([self.encode_dict_output[self.start_token]] + [self.encode_dict_output.get(token, self.encode_dict_output[self.unknown_token]) for token in word] + [self.encode_dict_output[self.stop_token]])
        output_list.append(integer_list)
      
      return output_list
    
  # takes in lists of integers and returns lists of words
  def decode(self, X, mode='input'):

    if (mode=='input'):
      input_list = []
      for integers in X:
        token_list=[]
        for integer in integers :
          if integer == 2 :
            break
          token_list.append(self.decode_dict_input.get(integer, '')) 
        input_list.append(''.join(token_list))
      
      return input_list

    if (mode=='output'):
      output_list = []
      for integers in X:
        token_list=[]
        for integer in integers :
          if integer == 2 :
            break
          token_list.append(self.decode_dict_output.get(integer, '')) 
        output_list.append(''.join(token_list))
      
      return output_list

In [24]:
def return_data_dict(languages=['ta'], batch_size=32):

  lex = data_loader._load_raw_df(languages)
  lex = data_loader._make_final_df(lex)

  data_dict = dict()

  df_train = lex['train']
  df_val = lex['val']
  df_test = lex['test']

  tk = Tokenizer(df_train)

  data_dict['in_size'] = len(tk.input_corpus) + 1
  data_dict['out_size'] = len(tk.output_corpus) + 1

  X_train = tk.encode(df_train.input.tolist(), mode='input')
  Y_train = tk.encode(df_train.output.tolist(), mode='output')
  
  X_val = tk.encode(df_val.input.tolist(), mode='input')
  Y_val = tk.encode(df_val.output.tolist(), mode='output')

  X_test = tk.encode(df_val.input.tolist(), mode='input')
  Y_test = tk.encode(df_val.output.tolist(), mode='output')


  data_dict['train'], data_dict['val'], data_dict['test']= dict(), dict(), dict()


  data_dict['train']['df'] = df_train
  data_dict['val']['df'] = df_val
  data_dict['test']['df'] = df_test


  data_dict['train']['max_source_length'] = np.max(np.array([len(x) for x in X_train]))
  data_dict['train']['max_target_length'] = np.max(np.array([len(x) for x in Y_train]))
  
  data_dict['val']['max_source_length'] = np.max(np.array([len(x) for x in X_val]))
  data_dict['val']['max_target_length'] = np.max(np.array([len(x) for x in Y_val]))

  data_dict['test']['max_source_length'] = np.max(np.array([len(x) for x in X_test]))
  data_dict['test']['max_target_length'] = np.max(np.array([len(x) for x in Y_test]))

  data_dict['max_source_length'] = max(data_dict['train']['max_source_length'], data_dict['val']['max_source_length'])
  data_dict['max_target_length'] = max(data_dict['train']['max_target_length'], data_dict['val']['max_target_length'])

  data_dict['train']['batch'] = data_loader._generate_batch(X_train, Y_train, data_dict, data_dict['out_size'], batch_size)
  data_dict['train']['batch_greedy'] = data_loader._generate_batch_greedy(X_train, Y_train, data_dict, data_dict['out_size'], batch_size)
  data_dict['train']['batch_greedy_big'] = data_loader._generate_batch_greedy(X_train, Y_train, data_dict, data_dict['out_size'], 1024)
  
  data_dict['val']['batch'] = data_loader._generate_batch(X_val, Y_val, data_dict, data_dict['out_size'], batch_size)
  data_dict['val']['batch_greedy'] = data_loader._generate_batch_greedy(X_val, Y_val, data_dict, data_dict['out_size'], batch_size)
  data_dict['val']['batch_greedy_big'] = data_loader._generate_batch_greedy(X_val, Y_val, data_dict, data_dict['out_size'], 1024)

  data_dict['test']['batch'] = data_loader._generate_batch(X_test, Y_test, data_dict, data_dict['out_size'], batch_size)
  data_dict['test']['batch_greedy'] = data_loader._generate_batch_greedy(X_test, Y_test, data_dict, data_dict['out_size'], batch_size)
  data_dict['test']['batch_greedy_big'] = data_loader._generate_batch_greedy(X_test, Y_test, data_dict, data_dict['out_size'], len(X_test))

  data_dict['tokenizer'] = tk

  return data_dict

In [25]:
dict_data_dict = dict()

for batch_size in [32]:
  dict_data_dict.update({batch_size: return_data_dict(batch_size=batch_size)})

data_dict = list(dict_data_dict.values())[0]



# Question 1


In [None]:
class rnn():

  def __init__(self, params):
    
    num_encode_layers = params['num_encode_layers']
    num_decode_layers = params['num_decode_layers']
    data_dict = params['data_dict']
    in_size = params['data_dict']['in_size']
    out_size = params['data_dict']['out_size']
    cell_type = params['cell_type']
    dropout = params['dropout']
    embed_size = params['embed_size']
    rep_size = params['rep_size']
        
    ###################### ENCODER NETWORK ######################
    
    encoder_inputs = Input(shape=(None,))
    x = Embedding(in_size, embed_size ,mask_zero=True)(encoder_inputs)

    encoder_layers = []
    
    for j in range(num_encode_layers-1) :   
      curr_layer = getattr(layers, cell_type)(rep_size, dropout=dropout, return_sequences=True)
      encoder_layers.append(curr_layer)
      x = curr_layer(x)

    curr_layer = getattr(layers, cell_type)(rep_size, dropout=dropout, return_state=True)
    encoder_layers.append(curr_layer)
    x, *encoder_states = curr_layer(x)

    ###################### DECODER NETWORK ######################

    decoder_inputs = Input(shape=(None,))

    decoder_embedding =  Embedding(out_size, embed_size, mask_zero=True)
    x = decoder_embedding(decoder_inputs)

    decoder_layers = []    
    
    for j in range(num_decode_layers) :
      curr_layer = getattr(layers, cell_type)(rep_size,dropout=dropout,return_state=True, return_sequences=True)
      decoder_layers.append(curr_layer)
      x, *decoder_states = curr_layer(x, initial_state=encoder_states)

    x = Dropout(dropout)(x)
    decoder_dense = TimeDistributed(Dense(units=out_size, activation='softmax'))
    decoder_outputs = decoder_dense(x)

    # define the model that will turn `encoder_inputs` & `decoder_inputs` into `decoder_outputs`
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

    self.model = model
    self.encoder_inputs = encoder_inputs
    # self.encoder_layers = encoder_layers
    self.decoder_inputs = decoder_inputs
    self.decoder_embedding = decoder_embedding
    self.decoder_layers = decoder_layers
    self.decoder_dense = decoder_dense
    self.encoder_states = encoder_states
    self.params = params
    self.details = {
        'model' : self.model,
        'encoder_inputs' : self.encoder_inputs,
        # 'encoder_layers' :self.encoder_layers ,
        'decoder_inputs' :self.decoder_inputs ,
        'decoder_embedding' : self.decoder_embedding,
        'decoder_layers' : self.decoder_layers,
        'decoder_dense' : self.decoder_dense,
        'encoder_states' : self.encoder_states ,
        'params' :self.params,
    }

  def compile_and_fit(self, data_dict, params):

    # compiling the model
    self.model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
    
    # printing the summary of the model
    summary = self.model.summary()

    # plotting the model figure
    plot = plot_model(self.model, show_shapes=True)
    
    # total training samples
    train_samples = len(data_dict['train']['df'])

    # total validation samples
    val_samples = len(data_dict['val']['df'])    
    
    # batch size
    batch_size = params['batch_size']

    # number of epochs
    num_epochs = params['num_epochs']

    # training the model
    run_details = self.model.fit_generator(generator = data_dict['train']['batch'],
                                           steps_per_epoch = train_samples//batch_size,
                                           epochs=num_epochs,
                                           callbacks=[
                                                      wandb.keras.WandbCallback()
                                                      ],
                                           validation_data = data_dict['val']['batch'], 
                                           validation_steps = val_samples//batch_size
                                          )

    return {
        'run_details' : run_details
    }

    

In [None]:
class rnn_second() :
  def __init__(self, details=None) :

    if details is not None:
      # copying required details
      self.details = details

      # copying decoder state input
      decoder_state_input = self.details['encoder_states']

      decoder_inputs = Input(shape=(1,))

      # copying hidden representation size
      rep_size = self.details['params']['rep_size']

      # copying decoder inputs
      # decoder_inputs = self.details['decoder_inputs']

      # the decoder model
      x = self.details['decoder_embedding'](decoder_inputs)
    
      all_outputs = []
      for _ in range(self.details['params']['data_dict']['max_target_length']) :
          for layer in self.details['decoder_layers'] :
              x, *decoder_states = layer(x, initial_state=decoder_state_input)

          x = self.details['decoder_dense'](x)

          # appending the softmax output
          all_outputs.append(x)

          # taking the argmax to feed into the next time step
          # print("Hello ",tf.math.argmax(x, 2))
          # if int(tf.math.argmax(x, 2))==2:
          #     x = 0
          # else:
          #     x = tf.math.argmax(x, 2)
          x = tf.math.argmax(x, 2) 
          x = self.details['decoder_embedding'](x)
          
          # decoder state input for the next time step
          decoder_state_input = decoder_states

      ##### ????? ????? ????? ????? ????? ????? ????? ????? ????? ?????
      # where do we evaluate stop condition?

      decoder_outputs = Lambda(lambda x: K.concatenate(x, axis=1))(all_outputs)
      model = Model([self.details['encoder_inputs'], decoder_inputs], decoder_outputs)
      self.model = model
    
    else:
      self.details = None 
      self.model = None

  def compile_and_fit(self, data_dict, params) :

    # compiling the model
    self.model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

     # printing the summary of the model   
    summary = self.model.summary()

    # plotting the model figure
    plot = plot_model(self.model, show_shapes=True)
    
    # total training samples
    train_samples = len(data_dict['train']['df'])

    # total validation samples 
    val_samples = len(data_dict['val']['df'])

    # batch size   
    batch_size = params['batch_size']

    # number of epochs
    num_epochs = params['num_epochs_2']

    # training the model
    run_details = self.model.fit_generator(generator = data_dict['train']['batch_greedy'],
                                            steps_per_epoch = train_samples//batch_size,
                                            epochs=num_epochs,
                                            callbacks=[
                                                      wandb.keras.WandbCallback()
                                                      ],
                                            validation_data = data_dict['val']['batch_greedy'],
                                            validation_steps = val_samples//batch_size)
   
    return {
        'run_details' : run_details
    }

  def evaluate(self, data_dict, train=False) :

    if train :
      test_gen = data_dict['train']['batch_greedy_big']
    
      # number of test samples
      test_samples = len(data_dict['train']['df'])
      # test_samples=100
    
      batch_size=1024

      num_hits = 0
      num_edits = 0

      for i in tqdm(range(test_samples//batch_size)) :

        (a,b),c = next(test_gen)
        c = np.argmax(c, axis=2)
        # print(c)
        l1 = data_dict['tokenizer'].decode(c, mode='output')
        out = self.model.predict([a,b])
        out = np.argmax(out,axis=2) 
        l2 = data_dict['tokenizer'].decode(out, mode='output')
        num_hits += np.sum(np.array(l1)==np.array(l2))
        num_edits += get_score(l1,l2)

      print("Final Train Acc ", num_hits/test_samples)
      print("Editdistance Train Avg ",num_edits/test_samples)
      wandb.log({"final_train_acc":  num_hits/test_samples, 
               "editdistance_train_acc":  num_edits/test_samples})

    # test batch generator
    test_gen = data_dict['val']['batch_greedy_big']
     
    # number of test samples
    test_samples = len(data_dict['val']['df'])
    # test_samples=100
    batch_size=1024
    
    num_hits = 0
    num_edits = 0
    for _ in tqdm(range(test_samples//batch_size)) :
      (a,b),c = next(test_gen)
      c = np.argmax(c, axis=2)
      # print(c)
      l1 = data_dict['tokenizer'].decode(c, mode='output')
      out = self.model.predict([a,b])
      out = np.argmax(out,axis=2) 
      l2 = data_dict['tokenizer'].decode(out, mode='output')
      num_hits += np.sum(np.array(l1)==np.array(l2))
      num_edits += get_score(l1,l2)
      # print(l1, l2)
      # print(out)

    print("Final Val Acc ", num_hits/test_samples)
    print("Editdistance Val Avg ",num_edits/test_samples)
    wandb.log({"final_val_acc":  num_hits/test_samples, 
               "editdistance_val_acc":  num_edits/test_samples})
    
  def evaluate_test(self, data_dict,filename):
    test_gen = data_dict['test']['batch_greedy_big']
     
    # number of test samples
    test_samples = len(data_dict['test']['df'])
    # test_samples=100
    batch_size=test_samples
    
    num_hits = 0
    num_edits = 0

    X = []
    Y_true = []
    Y_pred = []
    outputs = []
    
    for _ in tqdm(range(test_samples//batch_size)) :
      (a,b),c = next(test_gen)
      c = np.argmax(c, axis=2)
      # print(c)
      l1 = data_dict['tokenizer'].decode(c, mode='output')
      out = self.model.predict([a,b])
      out = np.argmax(out,axis=2) 
      l2 = data_dict['tokenizer'].decode(out, mode='output')
      
      ###############################################################
      <X.append(<INPUT STRING!>)>
      Y_true.append(l1)
      Y_pred.append(l2)
      
      num_hits += np.sum(np.array(l1)==np.array(l2))
      num_edits += get_score(l1,l2)
      # print(l1, l2)
      # print(out)

    df = pd.DataFrame({
      'X': X,
      'Y_true': Y_true,
      'Y_pred': Y_pred
    })

    df.to_csv(filename)
    try:
      wandb.log({'rnn_greedy_csv': df})
    except:
      pass

    '''
    with open(filename, 'wb') as f:
       pickle.dump([inputs,outputs], f)
    '''
    
    print("Final Test Acc ", num_hits/test_samples)
    print("Editdistance Test Avg ",num_edits/test_samples)
    wandb.log({"final_test_acc":  num_hits/test_samples, 
               "editdistance_test_acc":  num_edits/test_samples})

# Helper Function For Editdistance

In [None]:
def get_score(A,B) :
  fin = 0
  for a,b in zip(A,B) :
    if len(a)>0 or len(b)>0:   
      j = editdistance.eval(a,b)
      fin += 1 - j/max(len(a),len(b))
  return fin

# Helper Function For Assigning Sweep Parameters

In [None]:
class tools:
  def init_params(config,data_dict):
  
    # returning parameters
    params = {
        'num_encode_layers' : config.num_encode_layers,
        'num_decode_layers' : config.num_decode_layers,
        'cell_type' : config.cell_type,
        'rep_size' : config.rep_size,
        'embed_size' : config.embed_size,
        'dropout' : config.dropout,
        'num_epochs' : config.num_epochs,
        'data_dict' : data_dict,
        'batch_size' : config.batch_size
    }
    return params

In [None]:
sweep configuration
sweep_config = {
    
    'method' : 'bayes',
    'metric' : {
        'name' : 'val_acc',
        'goal' : 'maximize'
    },
    
    'parameters': {
        'cell_type' : {
            'values': ['LSTM', 'GRU', 'SimpleRNN']  
        },
        'embed_size': {
            'values': [2, 4, 8, 16]
        },
        'rep_size': {
            'values': [32, 64, 128, 256]
        },
        'dropout': {
            'values': [0, 0.1, 0.2, 0.3, 0.4, 0.5]
        },
        'batch_size': {
            'values': [32]
        },
        'num_epochs': {
            'values': [5, 15, 25]
        },
        'num_encode_layers': {
            'values': [1, 2, 3]
        },
        'num_decode_layers': {
            'values': [1, 2, 3]
        }
    }
}

In [None]:
# sweep_id = wandb.sweep(sweep_config, project='dakshina_v6')

In [None]:
class sweep_module:
  @staticmethod
  def train(config=None):

    with wandb.init(config):
      
      # copying the config 
      config = wandb.config
 
      # naming the run
      wandb.run.name = 'typ:'+config['cell_type'][:4]+ '_' + 'dro:'+str(config['dropout'])+ '_' + 'enc:' + str(config['num_encode_layers'])+ '_' + 'dec:'+str(config['num_decode_layers'])
      
      # returning the data dictionairy
      data_dict = dict_data_dict[config.batch_size]

      # copying the parameters
      params = tools.init_params(config,data_dict)

      # creating and training the first model
      network = rnn(params)
      run_details = network.compile_and_fit(data_dict, params)

      rnn_2 = rnn_second(network.details)
      rnn_2.evaluate(data_dict,train=True)

In [None]:
# sweep_id = '1gv485mq'

In [None]:
# performing the sweep
# wandb.agent(sweep_id, sweep_module.train)

# Best Model

In [None]:
params = {
    'num_encode_layers' : 3,
    'num_decode_layers' : 1,
    'cell_type' : 'LSTM', 
    'rep_size' : 128,
    'embed_size' : 16,
    'dropout' : 0.5,
    'num_epochs' : 25,
    'data_dict' : data_dict,
    'batch_size' : 32
}

In [None]:
wandb.init(project='final_rnn')

[34m[1mwandb[0m: Currently logged in as: [33mramkamal[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.10.30 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [None]:
network = rnn(params)
network.compile_and_fit(data_dict, params)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 16)     480         input_1[0][0]                    
__________________________________________________________________________________________________
lstm (LSTM)                     (None, None, 128)    74240       embedding[0][0]                  
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
______________________________________________________________________________________________



Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


{'run_details': <tensorflow.python.keras.callbacks.History at 0x7f690046b890>}

In [None]:
rnn_2 = rnn_second(network.details)
rnn_2.evaluate(data_dict, train=True)

  0%|          | 0/64 [00:00<?, ?it/s]

Final Train Acc  0.6351523806630002
Editdistance Train Avg  0.8995018425035165


  0%|          | 0/6 [00:00<?, ?it/s]

Final Val Acc  0.42540055857709835
Editdistance Val Avg  0.7645827045064213


In [None]:
rnn_2.evaluate_test(data_dict,'/kaggle/working/rnn_greedy.csv')

  0%|          | 0/1 [00:00<?, ?it/s]

Final Test Acc  0.46813186813186813
Editdistance Test Avg  0.8429676980387052


In [None]:
def decode_sequence_beam(input_seq, k, encoder_model, decoder_model, tk, max_target_length=20, alpha=0.7,getall=False):
    # encode the input as state vectors
    states_value = encoder_model.predict(input_seq)
    # generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))
    # populate the first character of target sequence with the start character.
    target_seq[0, 0] = 1 
    run_condition = [True for i in range(k)]
    # print(len(states_value))
    # print([target_seq] + [states_value])
    results, *states_values_temp = decoder_model.predict([target_seq] + [states_value])
    output_tokens = results

    states_values_k = [states_values_temp for i in range(k)]
    #get topk indices
    ind = np.argpartition(np.array(output_tokens[0, -1, :]), -k)[-k:]
    bestk_ind = ind
    output_tokens = np.array(output_tokens[0, -1, :])
    bestk_prob = np.log(output_tokens[ind])
    bestk_tot = [[bestk_ind[i]] for i in range(k)]
    # print(bestk_tot)

    
    while any(run_condition):
        bestk_tot_new = []
        bestk_prob_new = []
        states_values_k_new = []
        for i in range(k) :
            if run_condition[i] :
                a = bestk_tot[i]
                b = bestk_prob[i]
                target_seq[0,0] = a[-1]
                results,*states_values_temp = decoder_model.predict([target_seq] + states_values_k[i],batch_size=1)
                output_tokens = results

                states_values_k_temp = [states_values_temp for m in range(k)]

                states_values_k_new += states_values_k_temp
                ind = np.argpartition(np.array(output_tokens[0, -1, :]), -k)[-k:]
                bestk_ind = ind
                output_tokens = np.array(output_tokens[0, -1, :])
                bestk_prob_temp = output_tokens[ind]
                bestk_tot_temp = [a+[bestk_ind[j]] for j in range(k)]
                bestk_prob_temp2 = [(b*(np.power(len(bestk_tot_temp[j])-1,alpha)) + np.log(bestk_prob_temp[j]))/(np.power(len(bestk_tot_temp[j]),alpha)) for j in range(k)]
                bestk_prob_new += bestk_prob_temp2
                bestk_tot_new += bestk_tot_temp
            
            else :
                a = bestk_tot[i]
                b = bestk_prob[i]
                bestk_tot_new += [bestk_tot[i]]
                bestk_prob_new += [b]
                states_values_k_new += [states_values_k[i]]

        bestk_prob_new = np.array(bestk_prob_new)
        # print(len(bestk_prob_new),len(bestk_tot_new),len(states_values_k_new))
        ind = np.argpartition(bestk_prob_new,-k)[-k:]
        bestk_tot = [bestk_tot_new[i] for i in ind]
        states_values_k = [states_values_k_new[i] for i in ind]
        bestk_prob = bestk_prob_new[ind]
        run_condition = []
        for i in range(k) :
            a = bestk_tot[i]
            b = bestk_prob[i]
            if a[-1]!= 2 and len(a)<=max_target_length :
              run_condition.append(True)
            else :
              run_condition.append(False)

    final_words = []
    best_word = []
    best = -5.0
    for i in range(k) :
      a = bestk_tot[i]
      b = bestk_prob[i]
      final_words += [a]
      if b > best :
        best_word = [a]
        best = b

    if getall :
      return (tk.decode(final_words,'output'),best_word)
    else :
      return final_words,best_word

In [None]:
def beam_search(details,tokenizer,test_data,out_size,beam,data_dict) :
  encoder_model = Model(details['encoder_inputs'], details['encoder_states'])
  rep_size = details['params']['rep_size']
  decoder_state_input = []
  for i in range(len(details['encoder_states'])) :
      new_state = Input(shape=(rep_size,))
      decoder_state_input.append(new_state)
  decoder_inputs = details['decoder_inputs']
  x = details['decoder_embedding'](decoder_inputs)

  for layer in details['decoder_layers'] :
    x, *decoder_states = layer(x,initial_state=decoder_state_input)

  x = details['decoder_dense'](x)
  decoder_model = Model(
      [decoder_inputs] + decoder_state_input,
      [x] + decoder_states )
  inp = tokenizer.encode(test_data['df'].input.tolist())
  out = tokenizer.encode(test_data['df'].output.tolist(),mode='output')
  val_gen = data_loader._generate_batch(inp,out,data_dict,out_size)
  acc = 0
  acc_k = 0
  num_val = len(inp)
  
  X = []
  Y_true = []
  Y_pred = []

  for i in tqdm(range(num_val)) :
    (input_seq,ans) , _ = next(val_gen)
    K,best = decode_sequence_beam(input_seq,beam,encoder_model,decoder_model,tokenizer,data_dict['max_target_length'],getall=True)
    w1 = tokenizer.decode(best,mode='output')
    w2 = tokenizer.decode([ans[0][1:]],mode='output')
    
    ###############################################################
    <X.append(<INPUT STRING!>)>
    Y_true.append(w2[0])
    Y_pred.append(w1[0])

    comp = (w1[0]==w2[0])
    if comp :
      acc += 1    
    if w2[0] in K :
      acc_k += 1

  acc /= num_val
  acc_k /= num_val

  filename = '/kaggle/working/rnn_beam.csv'

  df = pd.DataFrame({
      'X': X,
      'Y_true': Y_true,
      'Y_pred': Y_pred
  })

  df.to_csv(filename)
    try:
      wandb.log({'rnn_beam_csv': df})
    except:
      pass
  '''
  with open(filename, 'wb') as f:
    pickle.dump([inputs,outputs], f)
  '''

  print("Val Accuracy : "+str(acc))
  print("Val Accuracy K : "+str(acc_k))
  
  wandb.log({"val_acc_rnn_beam": acc,
             "val_acc_rnn_beam_K:", acc_k})

In [None]:
beam_search(network.details,data_dict['tokenizer'], data_dict['test'], data_dict['out_size'], 10, data_dict)