Fetch data from Google Drive

In [None]:
# Mount Google Drive inside the Colab VM at /content/gdrive; the dataset
# paths used by the data-loading cell below live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Import libraries

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import cv2
import pathlib
import keras
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow.keras.backend as K
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.layers import SimpleRNN,LSTM,GRU,Embedding,Dense,Dropout,Input,Concatenate
from tensorflow.keras.optimizers import Adam,Nadam
from keras import Model

Data pre-processing

In [3]:
# token mapping
def tok_map(data):
    """Build character vocabularies and length limits for both languages.

    Parameters
    ----------
    data : DataFrame with an 'en' column (source words) and a 'hi' column
        (target words).

    Returns
    -------
    tuple
        (source_tokens, s_tok_map, s_max_len, tar_tokens, t_tok_map, t_max_len)
        where the ``*_tokens`` lists hold the sorted distinct characters, the
        ``*_tok_map`` dicts map each character to an index starting at 1, and
        the ``*_max_len`` values are the longest word lengths.  Target words
        are wrapped in a leading tab and a trailing newline before anything
        is measured, so both markers are part of the target vocabulary.
    """
    source = data['en'].values
    target = '\t' + data['hi'].values + '\n'

    # longest word on each side (target length includes the two markers)
    s_max_len = max(len(word) for word in source)
    t_max_len = max(len(word) for word in target)

    # source vocabulary: distinct characters, sorted, indexed from 1
    source_tokens = sorted({symbol for word in source for symbol in word})
    s_tok_map = {symbol: idx for idx, symbol in enumerate(source_tokens, start=1)}
    s_tok_map[" "] = 0  # index 0 is reserved for the padding symbol

    # target vocabulary, built the same way
    tar_tokens = sorted({symbol for word in target for symbol in word})
    t_tok_map = {symbol: idx for idx, symbol in enumerate(tar_tokens, start=1)}
    t_tok_map[" "] = 0  # index 0 is reserved for the padding symbol

    return source_tokens, s_tok_map, s_max_len, tar_tokens, t_tok_map, t_max_len


# load the data given the path
def dataLoad(path):
    """Read a tab-separated lexicon file into a two-column DataFrame.

    The file is expected to have no header and up to three tab-separated
    fields per line; the first two are named 'hi' and 'en' and the third is
    discarded.

    Parameters
    ----------
    path : path of the .tsv file to read.

    Returns
    -------
    DataFrame with columns ['hi', 'en'], containing only the rows in which
    both fields are non-empty.  The original row index is kept.
    """
    # The file contains Devanagari text, so decode it as UTF-8 explicitly
    # instead of relying on the platform's default encoding.
    with open(path, encoding="utf-8") as dataFile:
        dataset = pd.read_csv(
            dataFile,
            sep='\t',
            header=None,
            names=["hi", "en", ""],
            skip_blank_lines=True,
            index_col=None,
            # pandas' default NA tokens include strings such as "nan", "null"
            # and "NA", which are legitimate romanized words here.  Treat only
            # a truly empty field as missing so those rows are not dropped.
            keep_default_na=False,
            na_values=[""],
        )
    has_both = dataset['hi'].notna() & dataset['en'].notna()
    return dataset.loc[has_both, ['hi', 'en']]

# create inputs for encoder & decoder and target for decoder
def dataProcess(data):
    """Turn a word-pair DataFrame into padded index / one-hot arrays.

    Reads the module-level vocabulary objects produced by ``tok_map``
    (``s_tok_map``, ``t_tok_map``, ``tar_tokens``, ``s_max_len``,
    ``t_max_len``), so that cell must have been run first.

    Parameters
    ----------
    data : DataFrame with an 'en' column (source words) and a 'hi' column
        (target words).

    Returns
    -------
    enc_inp : float32 array (n_words, s_max_len) of source character indices.
    dec_inp : float32 array (n_words, t_max_len) of target character indices
        for the tab-prefixed, newline-terminated target word.
    dec_tar : int array (n_words, t_max_len, len(tar_tokens) + 1); one-hot
        encoding of ``dec_inp`` shifted left by one step, padded with the
        padding class.

    Raises
    ------
    KeyError   if a character is missing from the vocabulary maps.
    IndexError if a word is longer than the corresponding maximum length.
    """
    src, tar = data['en'].values, data['hi'].values
    tar = "\t" + tar + "\n"

    s_pad = s_tok_map[" "]
    t_pad = t_tok_map[" "]

    enc_inp = np.zeros((len(src), s_max_len), dtype="float32")
    dec_inp = np.zeros((len(tar), t_max_len), dtype="float32")
    dec_tar = np.zeros((len(tar), t_max_len, len(tar_tokens) + 1), dtype="int")

    for i, (sw, tw) in enumerate(zip(src, tar)):
        # encoder input
        for j, ch in enumerate(sw):
            enc_inp[i, j] = s_tok_map[ch]
        # Pad from the end of the word using its length, not the loop
        # variable: an empty word never enters the loop above, so `j` would
        # be undefined (or stale from the previous row).
        enc_inp[i, len(sw):] = s_pad

        # decoder input, and decoder target shifted one step to the left
        for j, ch in enumerate(tw):
            tok = t_tok_map[ch]
            dec_inp[i, j] = tok
            if j > 0:
                dec_tar[i, j - 1, tok] = 1

        # padding: the target is one step shorter than the decoder input
        dec_inp[i, len(tw):] = t_pad
        dec_tar[i, len(tw) - 1:, t_pad] = 1

    return enc_inp, dec_inp, dec_tar

In [4]:
# Load the train / dev / test lexicon splits from the mounted Drive.
# The vocabularies and maximum word lengths are derived from the training
# split only and are stored as module-level globals that dataProcess and
# seq2seqModel read.
# NOTE(review): the name `train` is rebound by the later `def train()` cell,
# so cells that need this DataFrame must run before that definition.
train = dataLoad("/content/gdrive/MyDrive/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv")
source_tokens, s_tok_map, s_max_len, tar_tokens, t_tok_map, t_max_len = tok_map(train)
dev = dataLoad("/content/gdrive/MyDrive/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv")
test = dataLoad("/content/gdrive/MyDrive/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv")   

In [5]:
# Encode the training split into encoder input, decoder input and one-hot
# decoder target arrays (uses the vocabulary globals built from `train`)
train_encoder_input, train_decoder_input, train_decoder_target = dataProcess(train)

# Encode the validation split with the same training-derived vocabulary;
# dataProcess indexes the maps directly, so an unseen character raises KeyError
val_encoder_input, val_decoder_input, val_decoder_target = dataProcess(dev)

In [6]:
# Plain-text dump of the training pairs (pandas truncates the middle rows)
print(train)

              hi          en
0             अं          an
1        अंकगणित    ankganit
2           अंकल       uncle
3          अंकुर       ankur
4         अंकुरण     ankuran
...          ...         ...
44199  ह्वेनसांग  hiuentsang
44200  ह्वेनसांग  hsuantsang
44201  ह्वेनसांग    hyensang
44202  ह्वेनसांग    xuanzang
44203          ॐ          om

[44202 rows x 2 columns]


In [7]:
# Preview the first five training pairs
train.head()

Unnamed: 0,hi,en
0,अं,an
1,अंकगणित,ankganit
2,अंकल,uncle
3,अंकुर,ankur
4,अंकुरण,ankuran


In [8]:
# Attention mechanism
class Attention(tf.keras.layers.Layer):
    """Additive attention: score = V(tanh(W1(query) + W2(values))).

    Assumes `query` is (batch, units) and `values` is (batch, steps, units),
    with axis 1 of `values` being the axis attended over — TODO confirm
    against callers.  No masking is applied: the softmax runs over every
    position along axis 1.
    """

    def __init__(self, nunits):
        """Create the three projection layers; `nunits` is the width of W1/W2."""
        super().__init__()
        self.V = Dense(1)
        self.W1 = Dense(nunits)
        self.W2 = Dense(nunits)

    def call(self, query, values):
        """Return (context vector, attention weights) for one query.

        The query gets an extra axis at position 1 so that its projection
        broadcasts against the projection of `values`.
        """
        # project query and values into the shared scoring space
        query_proj = self.W1(tf.expand_dims(query, 1))
        value_proj = self.W2(values)

        # one unnormalised score per position along axis 1
        score = self.V(tf.nn.tanh(query_proj + value_proj))

        # normalise the scores along axis 1
        weights = tf.nn.softmax(score, axis=1)

        # context vector: weighted sum of the values along axis 1
        con_vec = tf.reduce_sum(weights * values, axis=1)

        return con_vec, weights

In [15]:
# seq2seq model with attention
def seq2seqModel(Layer = "LSTM", nunits = 32, encl = 2, decl = 2,embds = 32,dense_size=32,dropout=None):
    """Build the character-level encoder/decoder training model with attention.

    Reads the module-level globals ``source_tokens``, ``tar_tokens`` and
    ``t_max_len`` (from ``tok_map``) and the ``Attention`` layer defined in
    this notebook.  The decoder is unrolled for ``t_max_len`` steps, so the
    decoder input must have exactly that many time steps.

    Parameters
    ----------
    Layer : "LSTM" or "GRU" — recurrent cell type for encoder and decoder.
    nunits : hidden size of every recurrent layer (and of the attention
        projections).
    encl : number of encoder layers.
    decl : number of decoder layers.
    embds : embedding dimension for both character embeddings.
    dense_size : width of the ReLU layer applied before the softmax.
    dropout : dropout rate applied after each non-final encoder layer, or
        None to disable it.

    Returns
    -------
    keras Model mapping [encoder_input, decoder_input] to per-step softmax
    distributions over ``len(tar_tokens) + 1`` classes.

    Raises
    ------
    ValueError if ``Layer`` is neither "LSTM" nor "GRU".
    """
    keras.backend.clear_session()

    cell_types = {"LSTM": LSTM, "GRU": GRU}
    if Layer not in cell_types:
        raise ValueError(f'Layer must be "LSTM" or "GRU", got {Layer!r}')
    rnn = cell_types[Layer]

    # ---------------------------------------------------------------- encoder
    enc_inps = Input(shape=(None,))
    enc_seq = Embedding(input_dim=len(source_tokens)+1, output_dim=embds, mask_zero=True)(enc_inps)

    for _ in range(encl - 1):
        enc_seq = rnn(nunits, return_sequences=True)(enc_seq)
        if dropout is not None:
            enc_seq = Dropout(dropout)(enc_seq)

    # The last encoder layer must return the whole sequence: attention needs
    # one vector per source position.  With only the final output (2-D) the
    # additive score broadcasts across the batch axis instead of over time.
    enc_outputs, *enc_state = rnn(nunits, return_sequences=True, return_state=True)(enc_seq)

    # ---------------------------------------------------------------- decoder
    dec_inps = Input(shape=(None,))
    dec_seq = Embedding(input_dim=len(tar_tokens)+1, output_dim=embds, mask_zero=True)(dec_inps)

    att = Attention(nunits)
    step_concat = Concatenate(axis=-1)

    dec_layers = [rnn(nunits, return_sequences=True, return_state=True) for _ in range(decl)]
    # every decoder layer starts from the final encoder state
    # (LSTM: [h, c]; GRU: [h])
    dec_states = [list(enc_state) for _ in range(decl)]

    step_outputs = []
    for step in range(t_max_len):
        # Query with the hidden state of the first decoder layer and always
        # attend over the encoder outputs (kept in their own variable so the
        # decoder outputs below cannot overwrite them).
        # NOTE(review): Attention.call applies no mask, so padded source
        # positions also receive attention weight.
        context, _ = att(dec_states[0][0], enc_outputs)
        layer_in = step_concat([tf.expand_dims(context, 1), dec_seq[:, step:step+1, :]])

        # Stack the decoder layers: each layer consumes the previous layer's
        # output and carries its own state to the next time step.
        for depth, dec_layer in enumerate(dec_layers):
            layer_in, *new_state = dec_layer(layer_in, initial_state=dec_states[depth])
            dec_states[depth] = list(new_state)

        # collect the top decoder layer's output for this step
        step_outputs.append(layer_in)

    # ------------------------------------------------------------ output head
    dec_out = Concatenate(axis=1)(step_outputs)
    preact = Dense(dense_size, activation='relu')(dec_out)
    act_op = Dense(len(tar_tokens)+1, activation = 'softmax')(preact)

    train_model = Model([enc_inps,dec_inps],act_op)

    return train_model

In [None]:
# to calculate accuracy
def accuracy(true,pred):
    """Word-level accuracy: fraction of sequences predicted entirely correctly.

    Assumes `true` is one-hot and `pred` holds class scores, both shaped
    (batch, steps, classes), and that class 0 is the padding class (as set
    up by tok_map / dataProcess).  A sequence counts as correct when the
    predicted class equals the true class at every non-padding step.

    Returns a scalar float32 tensor.
    """
    true_ids = tf.math.argmax(true, axis=2)
    pred_ids = tf.math.argmax(pred, axis=2)

    # The mask must have the same (batch, steps) shape as the per-step match
    # tensor, so derive it from the class ids rather than from the one-hot
    # tensor itself (which has an extra class axis).
    mask = tf.cast(tf.math.not_equal(true_ids, 0), dtype='int32')
    hits = tf.cast(tf.math.equal(true_ids, pred_ids), dtype='int32')

    # a word is right when the number of matching real steps equals the
    # number of real steps
    word_ok = tf.math.equal(tf.reduce_sum(hits * mask, axis=1), tf.reduce_sum(mask, axis=1))
    return tf.reduce_mean(tf.cast(word_ok, dtype='float32'))

In [10]:
# W&B sweep definition: random search maximising validation accuracy.
# Each hyper-parameter is listed with the discrete values it may take.
sweep_config = {
    'method': 'random', #grid, random
    'metric': {'name': 'val_accuracy', 'goal': 'maximize'},
    'parameters': {
        param: {'values': choices}
        for param, choices in {
            'epochs': [10, 15, 20],
            'lr': [1e-2, 1e-3],
            'dense_size': [64, 128, 512],
            'beam_width': [3.0, 5.0, 7.0],
            'dropout': [0.0, 0.2, 0.3],
            'teacher_forcing': [0.2, 0.5, 0.9],
            'nunits': [128, 256, 512],
            'type_of_layer': ["LSTM", "GRU"],
            'embed_dim': [64, 128, 256],
            'enc_dec_layers': [1, 2, 3],
        }.items()
    },
}

In [None]:
!pip install wandb

In [11]:
# Log this kernel in to Weights & Biases before any run or sweep is started
import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mpratyushdash[0m ([33mpandp[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [17]:
import wandb
from wandb.keras import WandbCallback
from tensorflow.keras.callbacks import EarlyStopping

def train():
    """Run one W&B-tracked training job; used as the sweep agent's function.

    Builds the attention seq2seq model from the run's configuration, trains
    it on the module-level training arrays and validates on the validation
    arrays, logging to W&B through ``WandbCallback``.  ``beam_width`` and
    ``teacher_forcing`` are part of the configuration but are only used in
    the run name here.
    """
    # Fallback hyper-parameter values for the keys the sweep varies
    config_defaults = dict(
        epochs=15,
        lr=1e-2,
        dense_size=128,
        beam_width=5.0,
        dropout=0.,
        teacher_forcing=0.2,
        nunits=128,
        type_of_layer='GRU',
        embed_dim=64,
        enc_dec_layers=1,
    )

    # Start the run, then rename it after its hyper-parameters
    wandb.init(config=config_defaults,name="cs6910-ass3-attn")
    cfg = wandb.config

    wandb.run.name = f'epochs_{cfg.epochs}_nunits_{cfg.nunits}_layer_{cfg.type_of_layer}_edl_{cfg.enc_dec_layers}_emb_{cfg.embed_dim}_ds_{cfg.dense_size}_beam_width_{cfg.beam_width}_do_{cfg.dropout}_lr_{cfg.lr}_teacher_forcing_{cfg.teacher_forcing}'
    wandb.run.save()

    # Build and compile the model; encoder and decoder share one depth setting
    model = seq2seqModel(
        Layer=cfg.type_of_layer,
        nunits=cfg.nunits,
        embds=cfg.embed_dim,
        encl=cfg.enc_dec_layers,
        decl=cfg.enc_dec_layers,
        dense_size=cfg.dense_size,
        dropout=cfg.dropout,
    )
    model.compile(
        optimizer=Adam(learning_rate=cfg.lr),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    print("Build Sucessfully")

    # Stop early once validation accuracy has not improved by at least 0.01
    # for 5 consecutive epochs
    earlystopping = EarlyStopping(
        monitor="val_accuracy",
        min_delta=0.01,
        patience=5,
        verbose=2,
        mode="auto",
    )
    run_callbacks = [earlystopping, WandbCallback(monitor='val_accuracy',mode='max')]

    # Train on the encoded training split, validating on the encoded dev split
    fit_inputs = [train_encoder_input,train_decoder_input]
    val_inputs = [val_encoder_input,val_decoder_input]
    model.fit(
        fit_inputs,
        train_decoder_target,
        batch_size=64,
        validation_data=(val_inputs, val_decoder_target),
        epochs=cfg.epochs,
        callbacks=run_callbacks,
    )
    print("Completed ! ")

In [None]:
# Register the sweep with W&B and keep its id for the agent cell below.
# This must actually run: with the line commented out, `sweep_id` is undefined
# after a kernel restart and the agent call fails with NameError.
sweep_id = wandb.sweep(sweep_config,entity="pandp",project = 'CS6910-AS3')

In [None]:
# Start a sweep agent that calls `train` for the sweep identified by
# `sweep_id`, limited to 10 runs (count=10).  `sweep_id` must already be
# defined in the kernel; it is the value returned by `wandb.sweep(...)`.
wandb.agent(sweep_id, train, entity = "pandp" , project = "CS6910-AS3", count=10)