<a href="https://colab.research.google.com/github/rashmi05pathak/CS6910_Assignment3/blob/main/Assignment3_Seq2Seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import cv2
import pathlib

1. Downloading the Dakshina dataset

In [None]:
# Download the Dakshina dataset v1.0 archive into the current working directory.
!yes | wget "https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar"
# Extract the downloaded tar archive (it is a .tar file, not a zip).
!yes | tar xopf dakshina_dataset_v1.0.tar

In [None]:
# Install the Weights & Biases client (imported as `wandb` in the training cells); no version is pinned.
!pip install wandb

2. Processing the **Dakshina** dataset
Reference used: https://colab.research.google.com/drive/1rqHhdPbOeqlP_X6AW__4P37fXoyCnl9x#scrollTo=m5luH6y4Mvgi

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import cv2
import pathlib


class DataProcessing():
    """Load a Dakshina transliteration lexicon and one-hot encode it for seq2seq training.

    The train, dev and test TSV files of ``target_lang`` are read from
    ``<DATAPATH>/<target_lang>/lexicons``. The character vocabularies are built
    from the training split only and reused for the dev and test splits.

    Every target word is wrapped as ``"\\t" + word + "\\n"`` (start / end markers)
    and all sequences are padded with the space character, which is appended to
    both vocabularies.

    Attributes set by the constructor:
        train / val / test: the raw DataFrames (columns ``tgt``, ``src``, ``count``).
        {train,val,test}_encoder_input, _decoder_input, _decoder_target:
            float32 one-hot arrays of shape (samples, timesteps, vocab size).
        source_char2int / source_int2char, target_char2int / target_int2char:
            vocabulary lookups built from the training split.
    """

    def __init__(self, DATAPATH, source_lang = 'en', target_lang = "hi"):
        """Read the three lexicon splits and encode them.

        Args:
            DATAPATH: root directory of the extracted Dakshina dataset.
            source_lang: label of the source language (stored only).
            target_lang: language sub-directory / file prefix to load.
        """
        self.source_lang = source_lang
        self.target_lang = target_lang

        lexicon_dir = os.path.join(DATAPATH, target_lang, "lexicons")
        file_stem = target_lang + ".translit.sampled."
        self.trainpath = os.path.join(lexicon_dir, file_stem + "train.tsv")
        self.valpath = os.path.join(lexicon_dir, file_stem + "dev.tsv")
        self.testpath = os.path.join(lexicon_dir, file_stem + "test.tsv")

        self.train = self._read_lexicon(self.trainpath)
        self.val = self._read_lexicon(self.valpath)
        self.test = self._read_lexicon(self.testpath)

        # Training split: builds the vocabularies and the one-hot tensors.
        self.train_data = self.preprocess(self.train["src"].to_list(), self.train["tgt"].to_list())
        (
            self.train_encoder_input,
            self.train_decoder_input,
            self.train_decoder_target,
            self.source_vocab,
            self.target_vocab,
        ) = self.train_data
        self.source_char2int, self.source_int2char = self.source_vocab
        self.target_char2int, self.target_int2char = self.target_vocab

        # Validation / test splits: encoded with the training vocabulary so the
        # character -> index mapping stays identical across splits.
        self.val_data = self._encode_split(self.val)
        self.val_encoder_input, self.val_decoder_input, self.val_decoder_target = self.val_data

        self.test_data = self._encode_split(self.test)
        self.test_encoder_input, self.test_decoder_input, self.test_decoder_target = self.test_data

    @staticmethod
    def _read_lexicon(path):
        """Read one lexicon TSV file into a DataFrame with columns tgt, src, count."""
        # na_filter=False keeps words such as "nan", "null" or "NA" as text
        # instead of letting pandas convert them to float NaN.
        return pd.read_csv(
            path,
            sep="\t",
            names=["tgt", "src", "count"],
            na_filter=False,
        )

    def _encode_split(self, frame):
        """Encode a dev/test DataFrame with the vocabulary learnt from the training split."""
        source_words = [str(word) for word in frame["src"].to_list()]
        # Targets need the same start ("\t") and end ("\n") markers as the
        # training targets; otherwise decoder inputs and targets are misaligned.
        target_words = ["\t" + str(word) + "\n" for word in frame["tgt"].to_list()]
        return self.encode(
            source_words,
            target_words,
            list(self.source_char2int.keys()),
            list(self.target_char2int.keys()),
            source_char2int=self.source_char2int,
            target_char2int=self.target_char2int,
        )

    def dictionary_lookup(self, vocab):
        """Return (char -> index, index -> char) dictionaries for an ordered vocabulary."""
        char2int = dict([(char, i) for i, char in enumerate(vocab)])
        int2char = dict((i, char) for char, i in char2int.items())
        return char2int, int2char


    def encode(self, source, target, source_chars, target_chars, source_char2int=None, target_char2int=None):
        """One-hot encode aligned source / target strings.

        Args:
            source: list of source strings.
            target: list of target strings (already wrapped with start/end markers).
            source_chars: ordered source vocabulary; must contain " " (padding).
            target_chars: ordered target vocabulary; must contain " " (padding).
            source_char2int: optional existing source lookup.
            target_char2int: optional existing target lookup.

        Returns:
            ``(encoder_input, decoder_input, decoder_target)``. When both lookups
            are omitted they are generated here and the tuple is extended with
            ``source_vocab`` and ``target_vocab``, each a ``(char2int, int2char)`` pair.

        Raises:
            KeyError: if a string contains a character missing from the lookup.
        """
        num_encoder_tokens = len(source_chars)
        num_decoder_tokens = len(target_chars)
        max_source_length = max((len(txt) for txt in source), default=0)
        max_target_length = max((len(txt) for txt in target), default=0)

        source_vocab, target_vocab = None, None
        if source_char2int is None and target_char2int is None:
            print("Generating the dictionary lookups for character to integer mapping and back")
            source_char2int, source_int2char = self.dictionary_lookup(source_chars)
            target_char2int, target_int2char = self.dictionary_lookup(target_chars)

            source_vocab = (source_char2int, source_int2char)
            target_vocab = (target_char2int, target_int2char)

        encoder_input_data = np.zeros(
            (len(source), max_source_length, num_encoder_tokens), dtype="float32"
        )
        decoder_input_data = np.zeros(
            (len(source), max_target_length, num_decoder_tokens), dtype="float32"
        )
        decoder_target_data = np.zeros(
            (len(source), max_target_length, num_decoder_tokens), dtype="float32"
        )

        for i, (input_text, target_text) in enumerate(zip(source, target)):
            for t, char in enumerate(input_text):
                encoder_input_data[i, t, source_char2int[char]] = 1.0
            # Pad from the end of the word; using len() instead of the leaked
            # loop index keeps empty strings from reusing a stale/undefined index.
            encoder_input_data[i, len(input_text):, source_char2int[" "]] = 1.0

            for t, char in enumerate(target_text):
                char_index = target_char2int[char]
                decoder_input_data[i, t, char_index] = 1.0
                if t > 0:
                    # decoder_target_data is ahead of decoder_input_data by one
                    # timestep and does not include the start character.
                    decoder_target_data[i, t - 1, char_index] = 1.0
            decoder_input_data[i, len(target_text):, target_char2int[" "]] = 1.0
            decoder_target_data[i, max(len(target_text) - 1, 0):, target_char2int[" "]] = 1.0

        if source_vocab is not None and target_vocab is not None:
            return (
                encoder_input_data,
                decoder_input_data,
                decoder_target_data,
                source_vocab,
                target_vocab,
            )
        return encoder_input_data, decoder_input_data, decoder_target_data


    def preprocess(self, source , target):
        """Build the vocabularies from raw word lists and one-hot encode them.

        Targets are wrapped as ``"\\t" + word + "\\n"``; the space character is
        appended to both sorted vocabularies to serve as padding symbol.

        Returns:
            The 5-tuple produced by :meth:`encode` when it generates the lookups.
        """
        source_words = [str(word) for word in source]
        target_words = ["\t" + str(word) + "\n" for word in target]

        source_chars = set()
        target_chars = set()
        for word in source_words:
            source_chars.update(word)
        for word in target_words:
            target_chars.update(word)

        source_chars = sorted(source_chars)
        target_chars = sorted(target_chars)

        # The space needs to be appended so that the encode function can pad with it.
        source_chars.append(" ")
        target_chars.append(" ")

        num_encoder_tokens = len(source_chars)
        num_decoder_tokens = len(target_chars)
        max_source_length = max((len(txt) for txt in source_words), default=0)
        max_target_length = max((len(txt) for txt in target_words), default=0)

        print("Number of samples:", len(source_words))
        print("Source Vocab length:", num_encoder_tokens)
        print("Target Vocab length:", num_decoder_tokens)
        print("Max sequence length for inputs:", max_source_length)
        print("Max sequence length for outputs:", max_target_length)

        return self.encode(source_words, target_words, source_chars, target_chars)

### 2.2 Processing the data

By default, the source (input) language is English (`en`) and the target (output) language is Hindi (`hi`).

In [None]:

# Root folder of the extracted Dakshina dataset, relative to the working directory.
DATAPATH = "./dakshina_dataset_v1.0"

# Load and encode the lexicons with the constructor defaults
# (source_lang='en', target_lang="hi"); later cells read this global `dataBase`.
dataBase = DataProcessing(DATAPATH) 


In [None]:
import tensorflow as tf
import os
from tensorflow.python.keras.layers import Layer
from tensorflow.python.keras import backend as K


class AttentionLayer(Layer):
    """
    Additive (Bahdanau-style) attention layer (https://arxiv.org/pdf/1409.0473.pdf).

    The layer is called with ``[encoder_output_sequence, decoder_output_sequence]``
    and owns three trainable weights, W_a, U_a and V_a, created in ``build``.
    It returns the context vectors and the attention weights computed for every
    decoder timestep.

    Credits to Tensorflow.org and https://github.com/thushv89/attention_keras/blob/master/src/layers/attention.py
    """

    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        # input_shape is expected to be [encoder_seq_shape, decoder_seq_shape];
        # index 2 of each shape is used as that sequence's feature size.
        assert isinstance(input_shape, list)
        # Create a trainable weight variable for this layer.

        # W_a: (encoder features, encoder features), applied to the encoder sequence.
        self.W_a = self.add_weight(name='W_a',
                                   shape=tf.TensorShape((input_shape[0][2], input_shape[0][2])),
                                   initializer='uniform',
                                   trainable=True)
        # U_a: (decoder features, encoder features), applied to one decoder step.
        self.U_a = self.add_weight(name='U_a',
                                   shape=tf.TensorShape((input_shape[1][2], input_shape[0][2])),
                                   initializer='uniform',
                                   trainable=True)
        # V_a: (encoder features, 1), reduces the combined features to one score.
        self.V_a = self.add_weight(name='V_a',
                                   shape=tf.TensorShape((input_shape[0][2], 1)),
                                   initializer='uniform',
                                   trainable=True)

        super(AttentionLayer, self).build(input_shape)  # Be sure to call this at the end

    def call(self, inputs, verbose=False):
        """
        Compute context vectors and attention weights.

        inputs: [encoder_output_sequence, decoder_output_sequence] (must be a list)
        verbose: when True, print the shapes of the intermediate tensors.

        Returns (c_outputs, e_outputs): the context vectors and the attention
        weights, one entry per decoder timestep.
        """
        assert type(inputs) == list
        encoder_out_seq, decoder_out_seq = inputs
        if verbose:
            print('encoder_out_seq>', encoder_out_seq.shape)
            print('decoder_out_seq>', decoder_out_seq.shape)

        def energy_step(inputs, states):
            """ Step function computing the attention weights for one decoder step.
            inputs: the decoder output at the current step (batch_size, de_hidden)
            states: required by K.rnn; only checked to be a list/tuple, never read
            """

            assert_msg = "States must be an iterable. Got {} of type {}".format(states, type(states))
            assert isinstance(states, list) or isinstance(states, tuple), assert_msg

            """ Some parameters required for shaping tensors"""
            # NOTE(review): these three locals are not used below.
            en_seq_len, en_hidden = encoder_out_seq.shape[1], encoder_out_seq.shape[2]
            de_hidden = inputs.shape[-1]

            """ Computing S.Wa where S=[s0, s1, ..., si]"""
            # <= batch_size, en_seq_len, en_hidden
            W_a_dot_s = K.dot(encoder_out_seq, self.W_a)

            """ Computing hj.Ua """
            U_a_dot_h = K.expand_dims(K.dot(inputs, self.U_a), 1)  # <= batch_size, 1, en_hidden
            if verbose:
                print('Ua.h>', U_a_dot_h.shape)

            """ tanh(S.Wa + hj.Ua) """
            # <= batch_size, en_seq_len, en_hidden (U_a_dot_h broadcasts over en_seq_len)
            Ws_plus_Uh = K.tanh(W_a_dot_s + U_a_dot_h)
            if verbose:
                print('Ws+Uh>', Ws_plus_Uh.shape)

            """ softmax(va.tanh(S.Wa + hj.Ua)) """
            # <= batch_size, en_seq_len (unnormalised scores)
            e_i = K.squeeze(K.dot(Ws_plus_Uh, self.V_a), axis=-1)
            # <= batch_size, en_seq_len (normalised over the last axis)
            e_i = K.softmax(e_i)

            if verbose:
                print('ei>', e_i.shape)

            return e_i, [e_i]

        def context_step(inputs, states):
            """ Step function computing the context vector c_i from the weights e_i.
            inputs: attention weights of one decoder step (batch_size, en_seq_len)
            states: required by K.rnn; only checked to be a list/tuple, never read
            """

            assert_msg = "States must be an iterable. Got {} of type {}".format(states, type(states))
            assert isinstance(states, list) or isinstance(states, tuple), assert_msg

            # Weighted sum of the encoder outputs over the encoder time axis.
            # <= batch_size, en_hidden
            c_i = K.sum(encoder_out_seq * K.expand_dims(inputs, -1), axis=1)
            if verbose:
                print('ci>', c_i.shape)
            return c_i, [c_i]

        # Placeholder initial states for K.rnn; the step functions never read their
        # values, they only need tensors shaped like the states they return.
        fake_state_c = K.sum(encoder_out_seq, axis=1)  # <= (batch_size, en_hidden)
        fake_state_e = K.sum(encoder_out_seq, axis=2)  # <= (batch_size, en_seq_len)

        """ Computing energy outputs """
        # e_outputs => (batch_size, de_seq_len, en_seq_len)
        last_out, e_outputs, _ = K.rnn(
            energy_step, decoder_out_seq, [fake_state_e],
        )

        """ Computing context vectors """
        # c_outputs => (batch_size, de_seq_len, en_hidden)
        last_out, c_outputs, _ = K.rnn(
            context_step, e_outputs, [fake_state_c],
        )

        return c_outputs, e_outputs

    def compute_output_shape(self, input_shape):
        """ Shapes of the two outputs: context vectors, then attention weights.

        NOTE(review): the first shape uses the decoder feature size
        (input_shape[1][2]) although the context vectors are sums of encoder
        outputs; the two agree only when both sizes are equal -- confirm.
        """
        return [
            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[1][2])),
            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[0][1]))
        ]

## 3. Recurrent neural networks based model for sequence to sequence machine translation 
### 3.1 Seq2Seq **Translation** Model class

In [None]:
import os

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

from tensorflow.keras import layers
 

#from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Dense, Input, InputLayer, Flatten, Activation, LSTM, SimpleRNN, GRU, TimeDistributed
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import load_model, Sequential,  Model
from tensorflow.keras.callbacks import EarlyStopping



class S2STranslation():
    """Factory for character-level encoder-decoder Keras models.

    The configuration object must support item access for the keys
    ``numEncoders``, ``numDecoders``, ``cell_type`` ("RNN", "LSTM" or "GRU"),
    ``latentDim``, ``dropout`` and ``hidden``.

    Both model builders return an uncompiled Keras ``Model`` with inputs
    ``[encoder_inputs, decoder_inputs]`` (one-hot sequences) and a softmax
    output over the target vocabulary for every decoder timestep.
    """

    # Values accepted for the ``cell_type`` configuration key.
    _SUPPORTED_CELLS = ("RNN", "LSTM", "GRU")

    def __init__(self, modelConfigDict, srcChar2Int, tgtChar2Int, using_pretrained_model = False):
        """Store the hyper-parameters and the vocabulary lookups.

        Args:
            modelConfigDict: mapping with the keys listed in the class docstring.
            srcChar2Int: source char -> index mapping (its length is the encoder input size).
            tgtChar2Int: target char -> index mapping (its length is the decoder input/output size).
            using_pretrained_model: accepted for backward compatibility; not used.
        """
        self.numEncoders = modelConfigDict["numEncoders"]
        self.cell_type = modelConfigDict["cell_type"]
        self.latentDim = modelConfigDict["latentDim"]
        self.dropout = modelConfigDict["dropout"]
        self.numDecoders = modelConfigDict["numDecoders"]
        self.hidden = modelConfigDict["hidden"]
        self.tgtChar2Int = tgtChar2Int
        self.srcChar2Int = srcChar2Int

    def _resolve_cell(self):
        """Validate the configuration and return the Keras recurrent layer class.

        Raises:
            ValueError: for an unsupported ``cell_type`` or a layer count below 1.
        """
        if self.cell_type not in self._SUPPORTED_CELLS:
            raise ValueError(
                "Unsupported cell_type {!r}; expected one of {}".format(
                    self.cell_type, self._SUPPORTED_CELLS
                )
            )
        if self.numEncoders < 1 or self.numDecoders < 1:
            raise ValueError("numEncoders and numDecoders must both be at least 1")
        return {"RNN": SimpleRNN, "LSTM": LSTM, "GRU": GRU}[self.cell_type]

    def _stack(self, cell_cls, inputs, depth, initial_state=None, name_prefix=None):
        """Chain ``depth`` recurrent layers, each consuming the previous layer's sequence.

        Returns:
            (per_layer_outputs, final_states): the output sequence of every
            layer in order, and the state tensors returned by the last layer.
        """
        outputs = inputs
        per_layer_outputs = []
        states = []
        for i in range(1, depth + 1):
            layer_kwargs = dict(
                return_state=True,
                return_sequences=True,
                dropout=self.dropout,
            )
            if name_prefix is not None:
                # Layer names must be unique inside a model, so include the index.
                layer_kwargs["name"] = "{}_{}".format(name_prefix, i)
            layer = cell_cls(self.latentDim, **layer_kwargs)
            # Feed the previous layer's outputs (not the raw inputs) so the
            # layers are stacked for every cell type.
            outputs, *states = layer(outputs, initial_state=initial_state)
            per_layer_outputs.append(outputs)
        return per_layer_outputs, list(states)

    def _output_head(self, features):
        """Apply the ReLU hidden layer and the softmax layer to every timestep."""
        hidden_outputs = Dense(self.hidden, activation="relu")(features)
        return Dense(len(self.tgtChar2Int), activation="softmax")(hidden_outputs)

    def build_configurable_model(self):
        """Build the plain (attention-free) encoder-decoder model.

        Every decoder layer is initialised with the final state(s) of the last
        encoder layer.

        Raises:
            ValueError: for an unsupported ``cell_type`` or a layer count below 1.
        """
        cell_cls = self._resolve_cell()
        # The SimpleRNN variant keeps its explicit layer names ("encoder_1",
        # "decoder_1", ...); LSTM/GRU layers keep the Keras default names.
        explicit_names = self.cell_type == "RNN"

        # encoder
        encoder_inputs = Input(shape=(None, len(self.srcChar2Int)))
        _, encoder_states = self._stack(
            cell_cls,
            encoder_inputs,
            self.numEncoders,
            name_prefix="encoder" if explicit_names else None,
        )

        # decoder
        decoder_inputs = Input(shape=(None, len(self.tgtChar2Int)))
        decoder_layers, _ = self._stack(
            cell_cls,
            decoder_inputs,
            self.numDecoders,
            initial_state=encoder_states,
            name_prefix="decoder" if explicit_names else None,
        )

        # dense
        decoder_outputs = self._output_head(decoder_layers[-1])
        return Model([encoder_inputs, decoder_inputs], decoder_outputs)

    def build_attention_model(self):
        """Build the encoder-decoder model with an ``AttentionLayer``.

        The attention layer receives the output sequence of the FIRST encoder
        layer and of the LAST decoder layer; its context vectors are
        concatenated with the decoder outputs before the dense layers.

        Raises:
            ValueError: for an unsupported ``cell_type`` or a layer count below 1.
        """
        cell_cls = self._resolve_cell()
        # Imported here because the notebook cell defining this class does not
        # import Concatenate at module level.
        from tensorflow.keras.layers import Concatenate

        # encoder
        encoder_inputs = Input(shape=(None, len(self.srcChar2Int)))
        encoder_layers, encoder_states = self._stack(
            cell_cls, encoder_inputs, self.numEncoders
        )

        # decoder
        decoder_inputs = Input(shape=(None, len(self.tgtChar2Int)))
        decoder_layers, _ = self._stack(
            cell_cls, decoder_inputs, self.numDecoders, initial_state=encoder_states
        )
        decoder_outputs = decoder_layers[-1]

        # NOTE(review): attention is computed over the first encoder layer's
        # outputs, as before; confirm that the last layer was not intended.
        attention_layer = AttentionLayer(name='attention_layer')
        attention_out, attention_states = attention_layer([encoder_layers[0], decoder_outputs])

        decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([decoder_outputs, attention_out])

        # dense
        decoder_outputs = self._output_head(decoder_concat_input)
        return Model([encoder_inputs, decoder_inputs], decoder_outputs)

### 3.2 Model Training


In [None]:
import numpy as np
import pandas as pd
import os

#from tensorflow.keras import Input, Model
from tensorflow.keras.layers import RNN, LSTM, GRU, Dense
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping

import wandb
from wandb.keras import WandbCallback


import tensorflow as tf


def train():
    """Train one seq2seq model inside a Weights & Biases run and return it.

    The run is initialised with ``config_best`` as its default configuration.
    A model is built with ``S2STranslation.build_configurable_model``, compiled
    with categorical cross-entropy, fitted on the training tensors of the global
    ``dataBase`` (validated on its validation tensors, with early stopping on
    ``val_accuracy``) and saved under ``./TrainedModels/<run name>``.

    Returns:
        The fitted Keras model.
    """
    # Baseline configuration, kept for reference; not used by the run below.
    config_defaults = dict(
        cell_type="RNN",
        latentDim=256,
        hidden=128,
        optimiser="rmsprop",
        numEncoders=1,
        numDecoders=1,
        dropout=0.2,
        epochs=1,
        batch_size=64,
    )
    # Default configuration handed to wandb.init.
    config_best = dict(
        cell_type="LSTM",
        latentDim=256,
        hidden=64,
        optimiser="adam",
        numEncoders=2,
        numDecoders=1,
        dropout=0.1,
        epochs=20,
        batch_size=32,
    )

    wandb.init(config=config_best, project="CS6910-Assignment-3", entity="rashmi05pathak")
    config = wandb.config

    # Run name layout:
    # <cell><src><numEncoders>_<tgt>_<numDecoders>_<optimiser>_<epochs>_<dropout>_<batch>_<latentDim>
    name_head = str(config.cell_type) + dataBase.source_lang + str(config.numEncoders)
    name_fields = [
        name_head,
        dataBase.target_lang,
        str(config.numDecoders),
        config.optimiser,
        str(config.epochs),
        str(config.dropout),
        str(config.batch_size),
        str(config.latentDim),
    ]
    wandb.run.name = "_".join(name_fields)
    wandb.run.save()

    builder = S2STranslation(
        config,
        srcChar2Int=dataBase.source_char2int,
        tgtChar2Int=dataBase.target_char2int,
    )
    model = builder.build_configurable_model()
    model.summary()

    model.compile(
        optimizer=config.optimiser,
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    )

    # Stop when val_accuracy has not improved by at least 0.01 for 5 epochs.
    stop_early = EarlyStopping(
        monitor="val_accuracy", min_delta=0.01, patience=5, verbose=2, mode="auto"
    )

    train_inputs = [dataBase.train_encoder_input, dataBase.train_decoder_input]
    val_inputs = [dataBase.val_encoder_input, dataBase.val_decoder_input]
    model.fit(
        train_inputs,
        dataBase.train_decoder_target,
        batch_size=config.batch_size,
        epochs=config.epochs,
        validation_data=(val_inputs, dataBase.val_decoder_target),
        callbacks=[stop_early, WandbCallback()],
    )

    save_path = os.path.join("./TrainedModels", wandb.run.name)
    model.save(save_path)
    wandb.finish()

    return model

Running the train function without sweep: 

In [None]:
# The triple-quoted string below is a disabled sweep setup (a no-op string literal);
# only the final `model = train()` line of this cell executes.
'''    
sweep_config = {
    "name": "Bayesian Sweep without attention",
    "method": "bayes",
    "metric": {"name": "val_accuracy", "goal": "maximize"},
    "parameters": {
        
        "cell_type": {"values": ["LSTM"]},
        
        "latentDim": {"values": [256]},
        
        "hidden": {"values": [128, 64]},
        
        "optimiser": {"values": ["rmsprop", "adam"]},
        
        "numEncoders": {"values": [1, 2, 3]},
        
        "numDecoders": {"values": [1, 2, 3]},
        
        "dropout": {"values": [0.1, 0.2, 0.3]},
        
        "epochs": {"values": [5,10,15]},
        
        "batch_size": {"values": [32, 64]},
    },
}

sweep_id = wandb.sweep(sweep_config, project="CS6910-Assignment-3", entity="rashmi05pathak")

wandb.agent(sweep_id, train)

'''
# Single training run using the default configuration passed to wandb.init inside train().
model = train()

In [None]:
# Show the layer summary of the model returned by train() in the previous cell.
model.summary()

In [None]:
#plot_model(model,to_file='model.png',show_shapes = True)

In [None]:
# Inspect the last layer of the trained model (presumably the softmax output Dense layer -- confirm).
model.layers[-1]

In [None]:
# Inspect the layer at index 4; which layer this is depends on the numbers of
# encoder/decoder layers in the trained configuration -- TODO confirm.
model.layers[4]

Running the wandb sweep: 

In [None]:
  
# Bayesian hyper-parameter sweep that maximises val_accuracy; every listed
# parameter is sampled from its "values" list.
# NOTE(review): the sweep is named "with attention", but train() as defined in
# this notebook builds the model with build_configurable_model() -- confirm that
# the attention model is really what this sweep trains.
sweep_config = {
    "name": "Bayesian Sweep with attention",
    "method": "bayes",
    "metric": {"name": "val_accuracy", "goal": "maximize"},
    "parameters": {
        
        "cell_type": {"values": ["RNN", "GRU", "LSTM"]},
        
        "latentDim": {"values": [256]},
        
        "hidden": {"values": [128, 64]},
        
        "optimiser": {"values": ["rmsprop", "adam"]},
        
        "numEncoders": {"values": [1, 2, 3]},
        
        "numDecoders": {"values": [1, 2, 3]},
        
        "dropout": {"values": [0.1, 0.2, 0.3]},
        
        "epochs": {"values": [5,10,15, 20]},
        
        "batch_size": {"values": [32, 64]},
    },
}

# Register the sweep with Weights & Biases and get its id.
sweep_id = wandb.sweep(sweep_config, project="CS6910-Assignment-3", entity="rashmi05pathak")

# Start an agent that calls train() for the runs of this sweep.
wandb.agent(sweep_id, train)

#train()

In [None]:
import numpy as np
import pandas as pd
import os

from tensorflow.keras import Input, Model
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Input, InputLayer, Flatten, Activation, LSTM, SimpleRNN, GRU, TimeDistributed, Concatenate


import wandb
from wandb.keras import WandbCallback


import tensorflow as tf

#By default source language is english and target lang is Hindi
# Re-create the dataset object (same call as in the data-processing cell above);
# by default the source language is English and the target language is Hindi.
dataBase = DataProcessing(DATAPATH) 

# Configuration used by test_model() for the model without attention
# (same values as config_best inside train()).
config_best = {
        "cell_type": "LSTM",
        "latentDim": 256,
        "hidden": 64,
        "optimiser": "adam",
        "numEncoders": 2,
        "numDecoders": 1,
        "dropout": 0.1,
        "epochs": 20,
        "batch_size": 32,
    }

# Baseline configuration (same values as config_defaults inside train()).
config_defaults = {
        "cell_type": "RNN",
        "latentDim": 256,
        "hidden": 128,
        "optimiser": "rmsprop",
        "numEncoders": 1,
        "numDecoders": 1,
        "dropout": 0.2,
        "epochs": 1,
        "batch_size": 64,
    }
    
# Configuration for the attention model -- presumably the best one found by the
# attention sweep; verify against the sweep results.
config_best_attention = {
        "cell_type": "RNN",
        "latentDim": 256,
        "hidden": 16,
        "optimiser": "rmsprop",
        "numEncoders": 1,
        "numDecoders": 1,
        "dropout": 0.1,
        "epochs": 10,
        "batch_size": 32,
    }
    
def test_model(
    model,
    attention = False
):
    """Evaluate a trained seq2seq model without attention on the test split.

    Separate encoder and decoder inference models are rebuilt from the layers
    of ``model``. Every row of ``dataBase.test`` is then decoded greedily, one
    character at a time, and the share of exactly matching words is logged to
    Weights & Biases under ``test_accuracy``.

    Args:
        model: Trained Keras model whose inputs are ``[encoder_input,
            decoder_input]``. Encoder layers are looked up by their
            auto-generated Keras names ("lstm", "gru_1", ...) and the last
            three layers are used as decoder RNN, dense and output layer.
            NOTE(review): this layout comes from the training code, which is
            not visible here -- confirm before changing the model builder.
        attention: When true the call is forwarded to
            ``test_model_with_attention`` so the flag selects the evaluation
            path instead of silently producing no result.

    Returns:
        Tuple ``(accuracy, sourcelang, original, predictions)``: the fraction
        of test words decoded exactly, followed by the source words, the
        reference words and the raw decoded strings (possibly ending in the
        "\\n" stop character).

    Raises:
        ValueError: If ``config.cell_type`` is not "LSTM", "GRU" or "RNN".
    """
    if attention:
        return test_model_with_attention(model, attention=True)

    wandb.init(config=config_best,  project="CS6910-Assignment-3", entity="rashmi05pathak")
    config = wandb.config
    wandb.run.name = (
        "Inference_" 
        + str(config.cell_type)
        + dataBase.source_lang
        + str(config.numEncoders)
        + "_"
        + dataBase.target_lang
        + "_"
        + str(config.numDecoders)
        + "_"
        + config.optimiser
        + "_"
        + str(config.epochs)
        + "_"
        + str(config.dropout) 
        + "_"
        + str(config.batch_size)
        + "_"
        + str(config.latentDim)
    )
    wandb.run.save()

    # ---- locate the last encoder layer; its final state(s) seed the decoder ----
    if config.cell_type == "LSTM":
        if config.numEncoders == 1:
            last_encoder = model.get_layer(name = "lstm")
        else:
            last_encoder = model.get_layer(name = "lstm_"+ str(config.numEncoders-1))
    elif config.cell_type == "GRU":
        if config.numEncoders == 1:
            last_encoder = model.get_layer(name = "gru")
        else:
            last_encoder = model.get_layer(name = "gru_"+ str(config.numEncoders-1))
    elif config.cell_type == "RNN":
        if config.numEncoders == 1:
            # NOTE(review): positional lookup kept from the original code;
            # presumably layers[2] is the single SimpleRNN encoder -- confirm.
            last_encoder = model.layers[2]
        else:
            last_encoder = model.get_layer(name = "simple_rnn_"+ str(config.numEncoders-1))
    else:
        # Close the run that was just opened before reporting the bad config.
        wandb.finish(exit_code=1)
        raise ValueError(f"Unsupported cell_type: {config.cell_type!r}")

    # The layer output is [sequence, state...]: two states for LSTM, one otherwise.
    _, *encoder_states = last_encoder.output
    encoder_model = Model(model.input[0], encoder_states)

    # ---- decoder: reuse the trained layers, but feed the state explicitly ----
    decoder_inputs = model.input[1]
    # Explicit names that cannot clash with Keras' auto-generated "input_N"
    # names of the layers taken over from `model`.
    decoder_states_inputs = [
        Input(shape=(config.latentDim,), name="inference_state_" + str(k))
        for k in range(len(encoder_states))
    ]
    decoder_rnn = model.layers[-3]
    decoder_outputs, *decoder_states = decoder_rnn(
        decoder_inputs, initial_state=decoder_states_inputs
    )
    decoder_outputs = model.layers[-2](decoder_outputs)
    decoder_outputs = model.layers[-1](decoder_outputs)
    decoder_model = Model(
        [decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states
    )

    num_target_tokens = len(dataBase.target_char2int)
    max_decoded_chars = 25  # decoding stops once the output is longer than this

    def decode_sequence(input_seq):
        """Greedily decode one encoded source word (batch of size 1)."""
        # Encode the input as state vectors. A single-output model may hand
        # back a bare array, so normalise to a list of state arrays.
        states_value = encoder_model.predict(input_seq, verbose=0)
        if isinstance(states_value, np.ndarray):
            states_value = [states_value]
        else:
            states_value = list(states_value)

        # One-hot target sequence of length 1 holding the start character;
        # "\n" is used both as start marker here and as stop condition below.
        target_seq = np.zeros((1, 1, num_target_tokens))
        target_seq[0, 0, dataBase.target_char2int["\n"]] = 1.0

        decoded_sentence = ""
        while True:
            # The decoder returns the token distribution followed by its new state(s).
            output_tokens, *states_value = decoder_model.predict(
                [target_seq] + states_value, verbose=0
            )

            # Pick the most likely next character.
            sampled_token_index = np.argmax(output_tokens[0, -1, :])
            sampled_char = dataBase.target_int2char[sampled_token_index]
            decoded_sentence += sampled_char

            if sampled_char == "\n" or len(decoded_sentence) > max_decoded_chars:
                return decoded_sentence

            # Feed the sampled character back in as the next decoder input.
            target_seq = np.zeros((1, 1, num_target_tokens))
            target_seq[0, 0, sampled_token_index] = 1.0

    acc = 0
    sourcelang = []
    predictions = []
    original = []
    for i, row in dataBase.test.iterrows():
        input_seq = dataBase.test_encoder_input[i : i + 1]
        decoded_sentence = decode_sequence(input_seq)
        og_tokens = [dataBase.target_char2int[x] for x in row["tgt"]]
        predicted_tokens = [dataBase.target_char2int[x] for x in decoded_sentence.rstrip("\n")]
        sourcelang.append(row['src'])
        original.append(row['tgt'])
        predictions.append(decoded_sentence)

        if og_tokens == predicted_tokens:
            acc += 1

        if i % 100 == 0:
            print(f"Finished {i} examples")
            print(f"Source: {row['src']}")
            print(f"Original: {row['tgt']}")
            print(f"Predicted: {decoded_sentence}")
            print(f"Accuracy: {acc / (i+1)}")
            print(og_tokens)
            print(predicted_tokens)

    # Report the ratio (not the raw count of correct words); an empty test set
    # yields 0.0 instead of a division by zero.
    num_examples = len(dataBase.test)
    test_accuracy = acc / num_examples if num_examples else 0.0
    print(f'Test Accuracy: {test_accuracy}')
    wandb.log({'test_accuracy': test_accuracy})
    wandb.finish()
    return test_accuracy, sourcelang, original, predictions

In [None]:
# Evaluate the model without attention. `model` is not created in this cell;
# it has to be the trained model left in the kernel by an earlier cell.
acc,sourcelang, original, predictions = test_model(model,attention = False)

In [None]:
#dict2 = [{"input":sourcelang[i], "true": original[i], "predicted": predictions[i]} for i in range(len(sourcelang))] 

In [None]:
#dict2

In [None]:
# Build the per-word records in this cell so it also runs on a fresh kernel;
# the only other definition of dict2 is commented out.
dict2 = [
    {"input": src_word, "true": true_word, "predicted": predicted_word}
    for src_word, true_word, predicted_word in zip(sourcelang, original, predictions)
]
# One row per test word: source word, reference word and decoded word.
test_predictions = pd.DataFrame(dict2)

In [None]:
#test_predictions

Move the trained models to Google Drive

In [None]:
# Configuration handed to wandb.init by test_model_with_attention.
config_best_attention2 = dict(
    cell_type="GRU",
    latentDim=256,
    hidden=128,
    optimiser="rmsprop",
    numEncoders=1,
    numDecoders=1,
    dropout=0.2,
    epochs=10,
    batch_size=32,
)

#testing the model with attention
def test_model_with_attention(model,attention = True):
    """Evaluate a trained attention seq2seq model on the test split.

    Encoder and decoder inference models are rebuilt from the layers of
    ``model``. The encoder model returns the sequence output of the first
    encoder layer (the input of the attention layer) followed by the final
    state(s) of the last encoder layer. Every row of ``dataBase.test`` is
    decoded greedily and the share of exactly matching words is logged to
    Weights & Biases under ``test_accuracy``.

    Args:
        model: Trained Keras attention model whose inputs are
            ``[encoder_input, decoder_input]``. Recurrent layers are looked
            up by their auto-generated Keras names ("gru", "gru_1", ...), the
            attention layer by the name "attention_layer", and the last two
            layers are used as dense and output layer.
            NOTE(review): this layout comes from the training code, which is
            not visible here -- confirm before changing the model builder.
        attention: When false the call is forwarded to ``test_model``.

    Returns:
        Tuple ``(accuracy, sourcelang, original, predictions)``: the fraction
        of test words decoded exactly, followed by the source words, the
        reference words and the raw decoded strings (possibly ending in the
        "\\n" stop character).

    Raises:
        ValueError: If ``config.cell_type`` is not "LSTM", "GRU" or "RNN".
    """
    if not attention:
        return test_model(model, attention=False)

    wandb.init(config=config_best_attention2,  project="CS6910-Assignment-3", entity="rashmi05pathak")
    config = wandb.config
    wandb.run.name = (
        "Inference_WithAttn_" 
        + str(config.cell_type)
        + dataBase.source_lang
        + str(config.numEncoders)
        + "_"
        + dataBase.target_lang
        + "_"
        + str(config.numDecoders)
        + "_"
        + config.optimiser
        + "_"
        + str(config.epochs)
        + "_"
        + str(config.dropout) 
        + "_"
        + str(config.batch_size)
        + "_"
        + str(config.latentDim)
    )
    wandb.run.save()

    # Name prefix under which the recurrent layers of each cell type are looked up.
    layer_prefixes = {"LSTM": "lstm", "GRU": "gru", "RNN": "simple_rnn"}
    if config.cell_type not in layer_prefixes:
        # Close the run that was just opened before reporting the bad config.
        wandb.finish(exit_code=1)
        raise ValueError(f"Unsupported cell_type: {config.cell_type!r}")
    prefix = layer_prefixes[config.cell_type]

    def recurrent_layer(index):
        """Return the index-th recurrent layer of `model` (0 is the first encoder)."""
        layer_name = prefix if index == 0 else prefix + "_" + str(index)
        return model.get_layer(name = layer_name)

    # ---- encoder: attention memory plus the final state(s) ----
    # Each recurrent layer outputs [sequence, state...]: two states for LSTM,
    # one for GRU / SimpleRNN.
    encoder_first_outputs, *_ = recurrent_layer(0).output
    _, *encoder_states = recurrent_layer(config.numEncoders - 1).output
    encoder_model = Model(model.input[0], [encoder_first_outputs] + encoder_states)

    # ---- decoder: reuse the trained layers, feed state and memory explicitly ----
    decoder_inputs = model.input[1]
    # Explicit names that cannot clash with Keras' auto-generated "input_N"
    # names of the layers taken over from `model`.
    decoder_states_inputs = [
        Input(shape=(config.latentDim,), name="inference_state_" + str(k))
        for k in range(len(encoder_states))
    ]
    decoder_hidden_state = Input(shape=(None, config.latentDim), name="inference_encoder_outputs")

    decoder_rnn = recurrent_layer(config.numEncoders + config.numDecoders - 1)
    decoder_outputs, *decoder_states = decoder_rnn(
        decoder_inputs, initial_state=decoder_states_inputs
    )

    # Reuse the attention layer stored in the trained model (a newly created
    # layer would start from untrained weights) and attend over the encoder
    # outputs supplied through the decoder_hidden_state input, so the decoder
    # graph stays connected to its own inputs only.
    attention_layer = model.get_layer(name = "attention_layer")
    attention_out, _ = attention_layer([decoder_hidden_state, decoder_outputs])

    decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([decoder_outputs, attention_out])

    # NOTE(review): model.layers[-2] is wrapped in TimeDistributed exactly as
    # before; confirm it is a plain Dense layer in the training model.
    hidden_outputs = TimeDistributed(model.layers[-2])(decoder_concat_input)
    decoder_outputs = model.layers[-1](hidden_outputs)

    decoder_model = Model(
        inputs = [decoder_inputs, decoder_hidden_state] + decoder_states_inputs,
        outputs = [decoder_outputs] + decoder_states,
    )

    num_target_tokens = len(dataBase.target_char2int)
    max_decoded_chars = 25  # decoding stops once the output is longer than this

    def decode_sequence(input_seq):
        """Greedily decode one encoded source word (batch of size 1)."""
        # Encoder outputs for attention, followed by the initial decoder state(s).
        encoder_memory, *states_value = encoder_model.predict(input_seq, verbose=0)

        # One-hot target sequence of length 1 holding the start character;
        # "\n" is used both as start marker here and as stop condition below.
        target_seq = np.zeros((1, 1, num_target_tokens))
        target_seq[0, 0, dataBase.target_char2int["\n"]] = 1.0

        decoded_sentence = ""
        while True:
            # The decoder returns the token distribution followed by its new state(s).
            output_tokens, *states_value = decoder_model.predict(
                [target_seq, encoder_memory] + states_value, verbose=0
            )

            # Pick the most likely next character.
            sampled_token_index = np.argmax(output_tokens[0, -1, :])
            sampled_char = dataBase.target_int2char[sampled_token_index]
            decoded_sentence += sampled_char

            # Exit condition: either hit max length or find stop character.
            if sampled_char == "\n" or len(decoded_sentence) > max_decoded_chars:
                return decoded_sentence

            # Feed the sampled character back in as the next decoder input.
            target_seq = np.zeros((1, 1, num_target_tokens))
            target_seq[0, 0, sampled_token_index] = 1.0

    acc = 0
    sourcelang = []
    predictions = []
    original = []
    for i, row in dataBase.test.iterrows():
        input_seq = dataBase.test_encoder_input[i : i + 1]
        decoded_sentence = decode_sequence(input_seq)
        og_tokens = [dataBase.target_char2int[x] for x in row["tgt"]]
        predicted_tokens = [dataBase.target_char2int[x] for x in decoded_sentence.rstrip("\n")]
        sourcelang.append(row['src'])
        original.append(row['tgt'])
        predictions.append(decoded_sentence)

        if og_tokens == predicted_tokens:
            acc += 1

        if i % 100 == 0:
            print(f"Finished {i} examples")
            print(f"Source: {row['src']}")
            print(f"Original: {row['tgt']}")
            print(f"Predicted: {decoded_sentence}")
            print(f"Accuracy: {acc / (i+1)}")
            print(og_tokens)
            print(predicted_tokens)

    # Report the ratio (not the raw count of correct words); an empty test set
    # yields 0.0 instead of a division by zero.
    num_examples = len(dataBase.test)
    test_accuracy = acc / num_examples if num_examples else 0.0
    print(f'Test Accuracy: {test_accuracy}')
    wandb.log({'test_accuracy': test_accuracy})
    wandb.finish()
    return test_accuracy, sourcelang, original, predictions


In [None]:
# Evaluate the attention model with the dedicated attention evaluation routine.
# `model` is not created in this cell; it has to be the trained attention model
# left in the kernel by an earlier cell.
acc,sourcelang, original, predictions = test_model_with_attention(model, attention = True)