<a href="https://colab.research.google.com/github/prashantmishra311/Attentive-Seq2Seq-with-Copying-Mechanism-for-Table-to-text/blob/master/NLGv1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import tensorflow as tf
import numpy as np
import regex as re

In [None]:
# Load the training CSV from Google Drive. The absolute path assumes Drive is
# already mounted at /content/drive; no mounting step appears in the visible cells.
df = pd.read_csv('/content/drive/MyDrive/Wikitable_data_splits_Batch_model/train.csv')
# Preview the first rows (the 'attributes', 'cells' and 'captions' columns are used below).
df.head()

Unnamed: 0,attributes,cells,captions
0,subj_title subj_subtitle date winning_$$_team ...,1978_$$_federation_$$_cup_$$_(_$$_tennis_$$_) ...,philippines won thailand with 3–0 during 1978 ...
1,subj_title subj_subtitle playoff_$$_round date...,1985_$$_new_$$_england_$$_patriots_$$_season s...,afc_$$_championship was played on january_$$_1...
2,subj_title subj_subtitle game date opponent re...,1961_$$_minnesota_$$_vikings_$$_season preseas...,minnesota vikings season was in the memorial_$...
3,subj_title subj_subtitle position name term_$$...,"st_$$_._$$_augustine_$$_beach_$$_,_$$_florida ...",undine_$$_pawlowski_$$_george was vice-mayor o...
4,subj_title subj_subtitle time name height weig...,40-yard_$$_dash records 4.30 darrius_$$_heywar...,darrius_$$_heyward-bey was 6_$$_ft_$$_2_$$_in_...


In [None]:
class Encoder(tf.keras.Model):
    '''
    GRU encoder over aligned (attribute, cell) token-id sequences.

    For every position the cell-token embedding and the attribute-token
    embedding are concatenated, projected to `embedding_size` with a
    tanh-activated dense layer, and the projected sequence is fed to a GRU.

    Arguments:-
        attr_vocab_size: (int) Size of attribute vocabulary
        attr_embedding_size: (int) Attribute embedding size
        vocab_size: (int) Size of source (cell) vocabulary
        embedding_size: (int) Cell embedding size, also the size of the fused projection
        encoder_size: (int) Dimensions of encoder hidden state
        batch_size: (int) Batch size, used only by init_hidden_state
    '''

    def __init__(self, attr_vocab_size, attr_embedding_size, vocab_size, embedding_size, encoder_size, batch_size):
        super().__init__()

        # Plain hyper-parameters, kept for later inspection.
        self.vocab_size, self.embedding_size = vocab_size, embedding_size
        self.encoder_size, self.batch_size = encoder_size, batch_size
        self.attr_vocab_size, self.attr_embedding_size = attr_vocab_size, attr_embedding_size

        # (batch_size, seq_length) ids --> (batch_size, seq_length, attr_embedding_size)
        self.Attr_Embedding = tf.keras.layers.Embedding(attr_vocab_size, attr_embedding_size)

        # (batch_size, seq_length) ids --> (batch_size, seq_length, embedding_size)
        self.Embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)

        # Projects the concatenated embeddings back down to embedding_size.
        self.W_e = tf.keras.layers.Dense(embedding_size)

        # (batch_size, seq_length, embedding_size) -->
        #     per-step outputs (batch_size, seq_length, encoder_size)
        #     final state      (batch_size, encoder_size)
        self.GRU = tf.keras.layers.GRU(encoder_size, return_sequences=True, return_state=True)

    def call(self, attr_input, input, prev_hidden_state):
        '''
        Encode one batch.

        Arguments:-
            attr_input: attribute ids, (batch_size, seq_length)
            input: cell ids, (batch_size, seq_length)
            prev_hidden_state: initial GRU state, (batch_size, encoder_size)
        Returns:-
            (output, hidden_state) with shapes
            (batch_size, seq_length, encoder_size) and (batch_size, encoder_size)
        '''
        attr_vectors = self.Attr_Embedding(attr_input)
        cell_vectors = self.Embedding(input)

        # (batch_size, seq_length, embedding_size + attr_embedding_size)
        fused = tf.concat((cell_vectors, attr_vectors), axis=-1)
        projected = tf.math.tanh(self.W_e(fused))

        return self.GRU(projected, initial_state=prev_hidden_state)

    def init_hidden_state(self):
        '''Return an all-zero initial state of shape (batch_size, encoder_size).'''
        return tf.zeros((self.batch_size, self.encoder_size))

* **Luong Score:** $$ \text{score}(h_t,\bar{h_s}) = h_t^\top \mathbf{W} \bar{h_s}$$
* **Bahdanau Score:** $$ \text{score}(h_t,\bar{h_s}) = \nu^\top \text{tanh}(\mathbf{W_1}h_t +\mathbf{W_2} \bar{h_s}) $$
* **Attention Weights:** $$\alpha_{ts} = \frac{\exp\{\text{score}(h_t,\bar{h_s})\}}{\sum _{i=1}^S \exp\{\text{score}(h_t,\bar{h_i})\}} $$
* **Context Vector:** $$ \mathbf{c_t} = \sum _{s} \alpha_{ts}\bar{h_s} $$
* **Attention Vector:** $$ \mathbf{a_t} = \text{tanh}(\mathbf{W_c}[\mathbf{c_t};h_t]) $$

In [None]:
class Attention(tf.keras.layers.Layer):

    '''
    Attention over the encoder outputs, in 'bahdanau' (additive) or 'luong'
    (multiplicative) form.

    Only the score for the selected style is evaluated, so the dense layers of
    the other style are never built and contribute no trainable variables.

    Arguments:-
        attention_size: (int) Width of the score projections. For 'luong' it
            must equal decoder_size, because the projected encoder outputs are
            multiplied directly with the decoder state.
        style: attention mechanism, 'bahdanau' or 'luong'
    Raises:-
        ValueError: if style is neither 'bahdanau' nor 'luong' (case-insensitive)
    '''

    def __init__(self, attention_size, style='bahdanau'):
        super(Attention, self).__init__()

        if style.lower() not in ['bahdanau', 'luong']:
            raise ValueError(f'Attention style {style} unrecognized, try "bahdanau" or "luong"')

        self.attention_size = attention_size
        self.style = style.lower()

        # for 'Bahdanau' style attention
        self.W1 = tf.keras.layers.Dense(self.attention_size) # --> (, attention_size)
        self.W2 = tf.keras.layers.Dense(self.attention_size) # --> (, attention_size)
        self.v = tf.keras.layers.Dense(1) # --> (, 1)

        # for 'Luong' style attention
        self.W = tf.keras.layers.Dense(self.attention_size)

    def call(self, decoder_current_hidden, encoder_output):
        '''
        Arguments:-
            decoder_current_hidden: (batch_size, decoder_size)
            encoder_output: (batch_size, seq_length, encoder_size)
        Returns:-
            context_vector: (batch_size, encoder_size)
            attention_weights: (batch_size, seq_length, 1), softmax over seq_length
        '''
        decoder_current_hidden = tf.expand_dims(decoder_current_hidden, axis=1)
        # decoder_current_hidden --> (batch_size, 1, decoder_size)

        if self.style == 'bahdanau':
            # (batch_size, 1, attention_size) + (batch_size, seq_length, attention_size)
            # broadcasts over axis 1; v then maps attention_size --> 1.
            score = self.v(tf.nn.tanh(self.W1(decoder_current_hidden) + self.W2(encoder_output)))
        else:
            # (batch_size, seq_length, attention_size) x (batch_size, decoder_size, 1)
            # [attention_size must be equal to decoder_size]
            score = tf.matmul(self.W(encoder_output), decoder_current_hidden, transpose_b=True)
        # score --> (batch_size, seq_length, 1)

        attention_weights = tf.nn.softmax(score, axis=1)
        # attention_weights --> (batch_size, seq_length, 1) [floats between 0 and 1]

        context_vector = tf.reduce_sum(attention_weights*encoder_output, axis=1)
        # attention_weights*encoder_output --> (batch_size, seq_length, encoder_size)
        # context_vector --> (batch_size, encoder_size)

        return context_vector, attention_weights

In [None]:
class Decoder(tf.keras.Model):
    '''
    Single-step attentive GRU decoder that also emits a generation gate p_gen.

    Arguments:-
        vocab_size: (int) Size of target vocabulary
        embedding_size: (int) Embedding size
        decoder_size: (int) Dimensions of decoder hidden state
        batch_size: (int) Batch size (stored only; not used in call)
        attention_style: attention mechanism, 'bahdanau' or 'luong'
    '''
    def __init__(self, vocab_size, embedding_size, decoder_size, batch_size, attention_style='bahdanau'):
        super(Decoder, self).__init__()

        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.decoder_size = decoder_size
        self.batch_size = batch_size
        self.style = attention_style

        # embedding_input_shape = (batch_size, 1) [seq_length = 1 for decoder]
        self.Embedding = tf.keras.layers.Embedding(self.vocab_size, self.embedding_size)
        # embedding_output_shape = (batch_size, 1, embedding_size)

        # gru_input_shape = (batch_size, 1, encoder_size+embedding_size)
        self.GRU = tf.keras.layers.GRU(self.decoder_size,
                                       return_sequences=True,
                                       return_state=True)
        # output_shape = (batch_size, 1, decoder_size)
        # hidden_state_shape = (batch_size, decoder_size)

        # Maps the GRU output to one score per vocabulary entry.
        self.Linear = tf.keras.layers.Dense(self.vocab_size)

        self.attention = Attention(self.decoder_size, style=self.style)
        # --------------with Pgen--------------------------
        # Scalar projections of context, decoder state and input embedding,
        # combined by Pgen into the generation gate.
        self.wh = tf.keras.layers.Dense(1)
        self.ws = tf.keras.layers.Dense(1)
        self.wx = tf.keras.layers.Dense(1)
        self.Pgen = tf.keras.layers.Dense(1, use_bias=True)

    def call(self, input, decoder_prev_hidden, encoder_output):
        '''
        Run one decoding step.

        Arguments:-
            input: current token ids, (batch_size, 1)
            decoder_prev_hidden: previous decoder state, (batch_size, decoder_size)
            encoder_output: (batch_size, seq_length, encoder_size)
        Returns:-
            outputs: log-probabilities over the vocabulary, (batch_size, vocab_size)
            hidden_state: new decoder state, (batch_size, decoder_size)
            attention_weights: (batch_size, seq_length, 1)
            p_gen: sigmoid gate, (batch_size, 1)
        '''
        embed_output = self.Embedding(input)
        # embed_output --> (batch_size, 1, embedding_size)

        context_vector, attention_weights = self.attention(decoder_prev_hidden, encoder_output)
        # context_vector --> (batch_size, encoder_size)
        # attention_weights --> (batch_size, seq_length, 1)

        # Drop the length-1 time axis. tf.squeeze does not need a statically
        # known batch dimension, unlike reshaping with embed_output.shape[0].
        decoder_input = tf.squeeze(embed_output, axis=1)
        # decoder_input --> (batch_size, embedding_size)

        gate_state = self.ws(decoder_prev_hidden)
        gate_input = self.wx(decoder_input)
        gate_context = self.wh(context_vector)
        concat = tf.concat([gate_context, gate_state, gate_input], axis=1)
        # concat --> (batch_size, 3)
        p_gen = tf.nn.sigmoid(self.Pgen(concat))
        # p_gen --> (batch_size, 1)

        context_vector = tf.expand_dims(context_vector, axis=1)
        # context_vector --> (batch_size, 1, encoder_size)

        gru_input = tf.concat([context_vector, embed_output], axis=2)
        # gru_input --> (batch_size, 1, encoder_size+embedding_size)

        output, hidden_state = self.GRU(gru_input, initial_state=decoder_prev_hidden)
        # output --> (batch_size, 1, decoder_size)
        # hidden_state --> (batch_size, decoder_size)

        output = tf.squeeze(output, axis=1)
        # output --> (batch_size, decoder_size)
        outputs = tf.nn.log_softmax(self.Linear(output))
        # outputs --> (batch_size, vocab_size)
        return outputs, hidden_state, attention_weights, p_gen

In [None]:
def final_dist(vocab_dist, attn_dist, max_enc_seq_len):
    """Unfinished helper for the pointer-generator final distribution.

    As written, the function only builds intermediate tensors and then falls
    through to `pass`, so it returns None.

    NOTE(review): `p_gen` is neither a parameter nor a local. It is looked up
    as a global at call time, and no module-level `p_gen` is defined in the
    visible cells, so calling this would raise NameError. Completing it will
    likely also need the source-token ids to scatter the attention mass onto
    (TODO confirm the intended design).

    Args:
      vocab_dist: The vocabulary distributions. (batch_size, vsize) arrays
      attn_dist: The attention distributions. (batch_size, attn_len) arrays
      max_enc_seq_len: number of extra slots appended after the vocabulary
        axis (used as the width of the zero padding below).
    Returns:
      None at present. Intended: the final distributions,
      (batch_size, extended_vsize) arrays.
    """
    # Weight the two distributions by the generation gate and its complement.
    vocab_dist_ = p_gen*vocab_dist
    attn_dist_ = (1-p_gen)*attn_dist

    # Shapes are read statically, so vocab_dist must have a known batch size.
    batchSize = vocab_dist.shape[0]
    decVocabSize = vocab_dist.shape[1]
    ExtdecVocabSize = decVocabSize + max_enc_seq_len
    # Pad the weighted vocabulary distribution with zeros for the extra slots.
    extra_zeros = tf.zeros((batchSize, max_enc_seq_len))
    vocab_dist_extended = tf.concat((vocab_dist_,extra_zeros), axis=1)

    batch_nums = tf.range(0, limit=batchSize) # shape (batch_size)
    batch_nums = tf.expand_dims(batch_nums, 1) # shape (batch_size, 1)
    # Nothing is combined or returned yet.
    pass

In [None]:
def preprocess(text, lower=True, match_sub=((r'[^A-Za-z0-9]', ' '),
                                            (r'\s+', ' '),
                                            (r'\s+$', ''))):
    """Normalise a text string with an ordered list of regex substitutions.

    Args:
        text: (str) input text.
        lower: if truthy, lower-case `text` before substituting.
        match_sub: iterable of (pattern, replacement) pairs, applied in
            iteration order. The defaults replace every non-alphanumeric
            character with a space, collapse whitespace runs to a single
            space, then strip trailing whitespace.

    Returns:
        The transformed string.

    The default is an ordered tuple, not a set: the three steps depend on each
    other, and set iteration order for strings varies between interpreter
    runs, which could leave a trailing space in the result.
    """
    if lower:
        text = text.lower()
    for pattern, sub in match_sub:
        text = re.sub(pattern, sub, text)
    return text

In [None]:
# Normalise every caption with preprocess() using its default arguments.
# NOTE(review): this overwrites df.captions in place, so the raw captions are
# only recoverable by reloading the CSV.
df.captions = df.captions.apply(preprocess)

In [None]:
# Wrap each caption in '<start>' / '<end>' marker tokens. This runs after
# preprocess(), whose default patterns would turn '<' and '>' into spaces.
# NOTE(review): not idempotent - re-running the cell adds the markers again.
df.captions = df.captions.apply(lambda text: '<start> ' + text + ' <end>')

In [None]:
# Shared tokenizer for attributes, cells and captions. filters='' disables
# Keras' default punctuation stripping, so tokens such as '<start>' and the
# '_$$_'-joined cell values stay intact.
tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')

In [None]:
# Build one joint vocabulary over all three text columns.
# NOTE(review): this is fitted on the full frame, before the train/test split
# made further down, so the vocabulary also covers the held-out rows.
tokenizer.fit_on_texts(df['attributes'])
tokenizer.fit_on_texts(df['cells'])
tokenizer.fit_on_texts(df['captions'])

In [None]:
# Dump the joint vocabulary as "<word> <count>" lines. The encoding is pinned
# to UTF-8 because the data contains non-ASCII tokens (e.g. the en dash in
# '3–0'), which would fail to encode under a non-UTF-8 platform default.
with open('vocab.txt', 'w', encoding='utf-8') as f:
    for (word, count) in tokenizer.word_counts.items():
        f.write(f'{word} {count}\n')

In [None]:
# Random ~80/20 row split into train and held-out frames. A dedicated seeded
# generator makes the split reproducible across kernel restarts without
# touching NumPy's global random state.
SPLIT_SEED = 42
msk = np.random.default_rng(SPLIT_SEED).random(len(df)) < 0.8
df_tr = df[msk]
df_te = df[~msk]

In [None]:
# Attribute-only tokenizer, fitted on the training rows.
# NOTE(review): train_step / val_step / predict below use the shared
# `tokenizer` instead; this one only feeds the per-column encoding cells.
att_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
att_tokenizer.fit_on_texts(df_tr['attributes'])

In [None]:
# Source (cell-value) tokenizer, fitted on the training rows only.
src_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
src_tokenizer.fit_on_texts(df_tr['cells'])

In [None]:
# Target (caption) tokenizer, fitted on the training rows only.
trg_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
trg_tokenizer.fit_on_texts(df_tr['captions'])

In [None]:
# Encode the training rows with the per-column tokenizers and right-pad each
# array to its own longest sequence (padding id 0).
# NOTE(review): these arrays are reassigned by the later cell that re-encodes
# everything with the shared `tokenizer`.
att_inp_tr = att_tokenizer.texts_to_sequences(df_tr['attributes'])
att_inp_tr = tf.keras.preprocessing.sequence.pad_sequences(att_inp_tr, padding='post')

src_inp_tr = src_tokenizer.texts_to_sequences(df_tr['cells'])
src_inp_tr = tf.keras.preprocessing.sequence.pad_sequences(src_inp_tr, padding='post')

trg_out_tr = trg_tokenizer.texts_to_sequences(df_tr['captions'])
trg_out_tr = tf.keras.preprocessing.sequence.pad_sequences(trg_out_tr, padding='post')

In [None]:
# Encode the held-out rows with the per-column tokenizers, padded (or
# truncated, via maxlen) to the matching training array's width.
# NOTE(review): like the training arrays above, these are reassigned by the
# later shared-tokenizer cell.
att_inp_te = att_tokenizer.texts_to_sequences(df_te['attributes'])
att_inp_te = tf.keras.preprocessing.sequence.pad_sequences(att_inp_te, padding='post', maxlen=att_inp_tr.shape[1])

src_inp_te = src_tokenizer.texts_to_sequences(df_te['cells'])
src_inp_te = tf.keras.preprocessing.sequence.pad_sequences(src_inp_te, padding='post', maxlen=src_inp_tr.shape[1])

trg_out_te = trg_tokenizer.texts_to_sequences(df_te['captions'])
trg_out_te = tf.keras.preprocessing.sequence.pad_sequences(trg_out_te, padding='post', maxlen=trg_out_tr.shape[1])

In [None]:
def _encode_padded(texts, maxlen=None):
    """Tokenise `texts` with the shared tokenizer and right-pad with zeros.

    With maxlen=None the array is as wide as the longest sequence; otherwise
    sequences are padded or truncated to `maxlen`.
    """
    token_ids = tokenizer.texts_to_sequences(texts)
    return tf.keras.preprocessing.sequence.pad_sequences(token_ids, padding='post', maxlen=maxlen)


# Training arrays: width is set by the longest training sequence per column.
att_inp_tr = _encode_padded(df_tr['attributes'])
src_inp_tr = _encode_padded(df_tr['cells'])
trg_out_tr = _encode_padded(df_tr['captions'])

# Held-out arrays: forced to the same width as the matching training array.
att_inp_te = _encode_padded(df_te['attributes'], maxlen=att_inp_tr.shape[1])
src_inp_te = _encode_padded(df_te['cells'], maxlen=src_inp_tr.shape[1])
trg_out_te = _encode_padded(df_te['captions'], maxlen=trg_out_tr.shape[1])

In [None]:
# Adam with Keras' default hyper-parameters.
optimizer = tf.keras.optimizers.Adam()
# Per-example (unreduced) cross-entropy so padding can be masked afterwards.
# The decoder already returns log-softmax outputs; from_logits=True applies
# softmax to them again, which leaves normalised log-probabilities unchanged.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
  """Masked cross-entropy for one decoding step.

  Args:
    real: target token ids, (batch_size,); id 0 is treated as padding.
    pred: decoder scores, (batch_size, vocab_size).

  Returns:
    Scalar loss averaged over the non-padding targets only. If every target
    in the batch is padding the result is 0 rather than NaN.

  The loss is divided by the number of real tokens, not by batch_size, so
  its scale no longer shrinks as more of the batch becomes padding.
  """
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.math.divide_no_nan(tf.reduce_sum(loss_), tf.reduce_sum(mask))

In [None]:
@tf.function
def train_step(att_inp, inp, targ, enc_hidden):
  """Run one teacher-forced optimisation step on a batch.

  Uses the module-level `encoder`, `decoder`, `optimizer`, `tokenizer` and
  BATCH_SIZE. Returns the summed per-step loss divided by the target length.
  """
  start_id = tokenizer.word_index['<start>']  # <- trg_tokenizer
  n_steps = targ.shape[1]
  total = 0

  with tf.GradientTape() as tape:
    enc_output, enc_state = encoder(att_inp, inp, enc_hidden)

    # The decoder starts from the encoder's final state and a column of <start> ids.
    dec_state = enc_state
    step_input = tf.expand_dims([start_id] * BATCH_SIZE, 1)

    for t in range(1, n_steps):
      log_probs, dec_state, _, _ = decoder(step_input, dec_state, enc_output)
      total += loss_function(targ[:, t], log_probs)

      # Teacher forcing: the ground-truth token becomes the next input.
      step_input = tf.expand_dims(targ[:, t], 1)

  trainable = encoder.trainable_variables + decoder.trainable_variables
  grads = tape.gradient(total, trainable)
  optimizer.apply_gradients(zip(grads, trainable))

  return total / int(n_steps)

In [None]:
def val_step(att_inp, inp, targ, enc_hidden):
    """Compute the loss on a batch while decoding greedily (no teacher forcing).

    Uses the module-level `encoder`, `decoder`, `tokenizer` and BATCH_SIZE.
    Returns the summed per-step loss divided by the target length. No
    gradients are taken and no weights are updated.
    """
    enc_output, enc_state = encoder(att_inp, inp, enc_hidden)
    dec_state = enc_state

    start_id = tokenizer.word_index['<start>']  # <- trg_tokenizer
    step_input = tf.expand_dims([start_id] * BATCH_SIZE, 1)

    n_steps = targ.shape[1]
    total = 0
    for t in range(1, n_steps):
        log_probs, dec_state, _, _ = decoder(step_input, dec_state, enc_output)
        total += loss_function(targ[:, t], log_probs)

        # The model's own most likely token is fed back as the next input.
        step_input = tf.expand_dims(tf.argmax(log_probs, axis=1), 1)

    return total / int(n_steps)

In [None]:
# BUFFER_SIZE is the shuffle buffer length. With a buffer of 8 the shuffle is
# only local: each element is drawn from a sliding window of 8 rows.
BUFFER_SIZE = 8
BATCH_SIZE = 8

# (attributes, cells, captions) id triples. drop_remainder=True keeps every
# batch at exactly BATCH_SIZE rows, which train_step / val_step rely on.
dataset_tr = tf.data.Dataset.from_tensor_slices((att_inp_tr, src_inp_tr, trg_out_tr))
dataset_tr = dataset_tr.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

In [None]:
# Shuffle buffer length for the held-out dataset.
BUFFER_SIZE_ = 8
# NOTE(review): re-assigns the global BATCH_SIZE (same value as the training cell).
BATCH_SIZE = 8

# Held-out (attributes, cells, captions) triples, batched like the training set.
dataset_te = tf.data.Dataset.from_tensor_slices((att_inp_te, src_inp_te, trg_out_te))
dataset_te = dataset_te.shuffle(BUFFER_SIZE_).batch(BATCH_SIZE, drop_remainder=True)

In [None]:
# All three vocabulary sizes come from the shared tokenizer; +1 reserves id 0,
# which the padded arrays use for padding.
vocab_att_size = len(tokenizer.word_index)+1 # <- att_
vocab_inp_size = len(tokenizer.word_index)+1
vocab_tar_size = len(tokenizer.word_index)+1

# Embedding widths for the encoder (both of its embeddings) and the decoder.
EMBEDDING_SIZE_en = 16
EMBEDDING_SIZE_de = 32

# Hidden sizes. They must match, because the encoder's final state is passed
# directly as the decoder's initial state.
ENC_SIZE = 64
DEC_SIZE = 64

In [None]:
# Instantiate the models. EMBEDDING_SIZE_en is passed for both the attribute
# and the cell embedding; the decoder uses the default 'bahdanau' attention.
encoder = Encoder(vocab_att_size, EMBEDDING_SIZE_en, vocab_inp_size, EMBEDDING_SIZE_en, ENC_SIZE, BATCH_SIZE)
decoder = Decoder(vocab_tar_size, EMBEDDING_SIZE_de, DEC_SIZE, BATCH_SIZE)

In [None]:
# Raw held-out strings as plain lists.
# NOTE(review): in the visible cells these are referenced only inside the
# disabled (string-literal) code in predict().
attr_te = df_te.attributes.to_list()
cell_te = df_te.cells.to_list()
capt_te = df_te.captions.to_list()

In [None]:
def predict(text=None):
    """Print source, target and a greedy decode for one random held-out example.

    Draws one batch from `dataset_te`, picks a random row, encodes it and
    decodes greedily until the output (including '<start>') holds 12 tokens.
    Decoding does not stop at '<end>'.

    Args:
        text: currently ignored; the example always comes from `dataset_te`.

    Uses the module-level `encoder`, `decoder`, `tokenizer`, `dataset_te`
    and BATCH_SIZE. Returns None; all results are printed.
    """
    max_words = 12  # total printed tokens, including '<start>'

    att, cell, capt = next(iter(dataset_te.take(1)))
    x = np.random.randint(low=0, high=BATCH_SIZE)
    # Id 0 is padding and has no entry in index_word, so it is skipped.
    cell_vals = [tokenizer.index_word[idx] for idx in cell[x].numpy() if idx != 0] # <- src_tokenizer
    capt_vals = [tokenizer.index_word[idx] for idx in capt[x].numpy() if idx != 0] # <- trg_tokenizer
    print('Source: ',' '.join(cell_vals))
    print('Target: ',' '.join(capt_vals))

    # Batch of one: a zero initial state and the selected row with a leading axis.
    en_initial_states = tf.zeros([1, encoder.encoder_size])
    enc_output, enc_hidden = encoder(tf.expand_dims(att[x],0), tf.expand_dims(cell[x],0), en_initial_states)

    dec_input = tf.constant([[tokenizer.word_index['<start>']]]) # <- trg_tokenizer
    dec_hidden = enc_hidden
    out_words = ['<start>']

    while len(out_words) < max_words:
        predictions, dec_hidden, _, _ = decoder(dec_input, dec_hidden, enc_output)
        dec_input = tf.expand_dims(tf.argmax(predictions, -1), 0)
        next_id = int(dec_input.numpy()[0][0])
        # The output layer has a slot for id 0 (padding), which index_word
        # lacks; show a placeholder for it rather than raising KeyError.
        out_words.append(tokenizer.index_word.get(next_id, '<pad>')) # <- trg_tokenizer

    print('Predic: ',' '.join(out_words))

In [None]:
def run(epochs, print_per_epoch, steps_per_epoch, steps_per_epoch_):
    """Train for `epochs` epochs, validating and periodically printing a sample.

    Args:
        epochs: number of epochs.
        print_per_epoch: print losses and a sample prediction whenever
            epoch % print_per_epoch == 0 (epoch counted from 0).
        steps_per_epoch: maximum training batches drawn per epoch.
        steps_per_epoch_: maximum validation batches drawn per epoch.

    Losses are averaged over the batches actually drawn, so the average stays
    correct when a dataset yields fewer batches than requested, and a step
    count of 0 gives a loss of 0 instead of a ZeroDivisionError.

    Uses the module-level `encoder`, `dataset_tr`, `dataset_te`, `train_step`,
    `val_step` and `predict`.
    """
    for epoch in range(epochs):

        enc_hidden = encoder.init_hidden_state()

        total_loss = 0
        train_batches = 0
        for att, inp, targ in dataset_tr.take(steps_per_epoch):
            total_loss += train_step(att, inp, targ, enc_hidden)
            train_batches += 1
        train_loss = total_loss / max(train_batches, 1)

        val_loss = 0
        val_batches = 0
        for att, inp, targ in dataset_te.take(steps_per_epoch_):
            val_loss += val_step(att, inp, targ, enc_hidden)
            val_batches += 1
        valid_loss = val_loss / max(val_batches, 1)

        # saving (checkpoint) the model every 2 epochs
        # if (epoch + 1) % 2 == 0:
        #  checkpoint.save(file_prefix = checkpoint_prefix)

        if epoch % print_per_epoch == 0:
            print('----------------------<><><><><>-----------------------')
            print('Epoch {} | Train Loss {:.4f} | Val Loss {:.4f}'.format(epoch + 1, train_loss, valid_loss))
            predict()

In [None]:
EPOCHS = 400
# NOTE(review): the step counts are derived from the shuffle-buffer sizes, not
# from the dataset lengths. With the values set above both are 8 // 8 = 1, so
# each epoch trains and validates on a single batch.
STEPS_PER_EPOCH = BUFFER_SIZE//BATCH_SIZE
STEPS_PER_EPOCH_ = BUFFER_SIZE_//BATCH_SIZE
# Print losses and a sample prediction every 5th epoch.
PRINT_PER_EPOCH = 5

In [None]:
# Start training; progress and sample predictions are printed below.
run(EPOCHS, PRINT_PER_EPOCH, STEPS_PER_EPOCH, STEPS_PER_EPOCH_)

----------------------<><><><><>-----------------------
Epoch 1 | Train Loss 1.9665 | Val Loss 3.1529
Source:  jayden female 62 70 75
Target:  <start> jayden s performance was decent but she needs to work on her math <end>
Predic:  <start> william was was was and <end> <end> <end> <end> <end> <end>
----------------------<><><><><>-----------------------
Epoch 6 | Train Loss 1.9822 | Val Loss 2.6120
Source:  ryan male 65 54 57
Target:  <start> ryan scores were average and he should work harder in reading and writing <end>
Predic:  <start> william was was was and and <end> <end> <end> <end> <end>
----------------------<><><><><>-----------------------
Epoch 11 | Train Loss 2.1949 | Val Loss 2.8263
Source:  christopher female 58 73 68
Target:  <start> christopher scored average marks and she can work more on her math <end>
Predic:  <start> oliver was was was and and <end> <end> <end> <end> <end>
----------------------<><><><><>-----------------------
Epoch 16 | Train Loss 2.2518 | Val Los